1 Background

Based on @jani's notebook on Kaggle.

2 Testing the base model

The model is trained with xgboost's default hyperparameter settings.

2.1 Data

The data need to be a numeric matrix of predictors, without the outcome column; the outcome is supplied as a separate vector.
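
A minimal sketch (assumed, not shown in the original) of how the training_x/training_y and testing_x/testing_y objects used below could be built; the file paths and the outcome column name are taken from the full-dataset code in section 3.3:

library(tidyverse)

training <- read_rds("data/training.Rds")
testing <- read_rds("data/testing.Rds")

# predictor matrix and outcome vector for each split
training_x <- as.matrix(select(training, -outcome))
training_y <- training$outcome
testing_x <- as.matrix(select(testing, -outcome))
testing_y <- testing$outcome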

2.2 Training

# xgboost's default hyperparameter values (nrounds has no default; 100 is used here)
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE  
)
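
allowParallel = TRUE only has an effect if a parallel backend has been registered; one hypothetical setup (not shown in the original) is:

library(doParallel)

cl <- makeCluster(parallel::detectCores() - 1) # leave one core free
registerDoParallel(cl)
# ... run the caret::train() calls ...
stopCluster(cl) # release the workers when done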

m_xgb <- caret::train(
  x = training_x,
  y = training_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE,
  verbosity = 0
)
summary(m_xgb$finalModel)
              Length Class              Mode       
handle             1 xgb.Booster.handle externalptr
raw           167853 -none-             raw        
niter              1 -none-             numeric    
call               7 -none-             call       
params             9 -none-             list       
callbacks          1 -none-             list       
feature_names    120 -none-             character  
nfeatures          1 -none-             numeric    
xNames           120 -none-             character  
problemType        1 -none-             character  
tuneValue          7 data.frame         list       
obsLevels          1 -none-             logical    
param              2 -none-             list       

Feature importance:

importance <- xgb.importance(model = m_xgb$finalModel)
xgb.plot.importance(importance, top_n = 15)

2.3 Prediction

p_xgb <- predict(m_xgb$finalModel, testing_x)

Correlation:

cor(p_xgb, testing_y)
[1] 0.8855791

MAE:

ModelMetrics::mae(testing_y, p_xgb)
[1] 24.13134

RMSE:

ModelMetrics::rmse(testing_y, p_xgb)
[1] 31.45925

Distribution of predicted (red) and actual (green) values:
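
The chunk that drew this figure is not shown; a minimal sketch that could produce a similar plot (assumed, not the author's original code):

library(ggplot2)

# density of predictions in red, actual outcomes in green
ggplot() +
  geom_density(aes(p_xgb), colour = "red") +
  geom_density(aes(testing_y), colour = "green")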

3 Tuning

3.1 Training

The parameters that caret can tune for xgbTree:

modelLookup("xgbTree")
    model        parameter                          label forReg forClass probModel
1 xgbTree          nrounds          # Boosting Iterations   TRUE     TRUE      TRUE
2 xgbTree        max_depth                 Max Tree Depth   TRUE     TRUE      TRUE
3 xgbTree              eta                      Shrinkage   TRUE     TRUE      TRUE
4 xgbTree            gamma         Minimum Loss Reduction   TRUE     TRUE      TRUE
5 xgbTree colsample_bytree     Subsample Ratio of Columns   TRUE     TRUE      TRUE
6 xgbTree min_child_weight Minimum Sum of Instance Weight   TRUE     TRUE      TRUE
7 xgbTree        subsample           Subsample Percentage   TRUE     TRUE      TRUE

3.1.1 Number of Iterations and the Learning Rate

Start by tuning the number of boosting iterations together with the learning rate (eta), holding the remaining parameters fixed:

nrounds <- 1000

tune_grid <- expand.grid(
  nrounds = seq(from = 200, to = nrounds, by = 50),
  eta = c(0.025, 0.05, 0.1, 0.3),
  max_depth = c(2, 3, 4, 5),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

tune_control <- caret::trainControl(
  method = "cv",
  number = 3,
  # index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE 
)

set.seed(12345)

xgb_tune <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
17    1000         2 0.025     0                1                1         1

round(min(xgb_tune$results$RMSE), digits = 5)
[1] 34.82689
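
caret provides a ggplot method for train objects, so a small helper function (hypothetical, not part of the original) can show how CV RMSE develops across the grid; the probs argument clips extreme RMSE values so the region around the minimum stays readable:

library(ggplot2)

tuneplot <- function(x, probs = .90) {
  ggplot(x) +
    coord_cartesian(ylim = c(quantile(x$results$RMSE, probs = probs),
                             min(x$results$RMSE))) +
    theme_bw()
}

tuneplot(xgb_tune)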

3.1.2 Maximum Depth and Minimum Child Weight

Holding eta at its best value from the previous step, search a window of tree depths while tuning the minimum child weight:

tune_grid2 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  # ifelse() would return only the first element of a vector result, so use
  # a proper if/else to get a window of depths around the current best
  max_depth = if (xgb_tune$bestTune$max_depth == 2) {
    c(xgb_tune$bestTune$max_depth:4)
  } else {
    c((xgb_tune$bestTune$max_depth - 1):(xgb_tune$bestTune$max_depth + 1))
  },
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4),
  subsample = 1
)

set.seed(12345)

xgb_tune2 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid2,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune2$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
52     600         2 0.025     0                1                3         1

round(min(xgb_tune2$results$RMSE), digits = 5)
[1] 34.79478

3.1.3 Column and Row Sampling

With tree depth and minimum child weight fixed, tune the column and row sampling ratios:

tune_grid3 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = 0,
  colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = c(0.5, 0.75, 1.0)
)

xgb_tune3 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid3,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune3$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
72     600         2 0.025     0              0.6                3       0.5

round(min(xgb_tune3$results$RMSE), digits = 5)
[1] 34.52475

3.1.4 Gamma

Next, tune gamma, the minimum loss reduction required to make a further split:

tune_grid4 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune4 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid4,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune4$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
92     600         2 0.025   0.7              0.6                3       0.5

round(min(xgb_tune4$results$RMSE), digits = 5)
[1] 34.28292

3.1.5 Reducing the Learning Rate

Finally, lower the learning rate and extend the search over boosting iterations to compensate:

tune_grid5 <- expand.grid(
  nrounds = seq(from = 100, to = 10000, by = 100),
  eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = xgb_tune4$bestTune$gamma,
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune5 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid5,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune5$bestTune
    nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
106     600         2 0.015   0.7              0.6                3       0.5

round(min(xgb_tune5$results$RMSE), digits = 5)
[1] 36.52597

Note that the minimum RMSE here is higher than in the previous step: the cross-validation folds are regenerated on each run (the index argument of trainControl is commented out), so results are not strictly comparable between steps.

3.1.6 Final model

Refit on the training set with method = "none", using the best hyperparameter combination found above:

train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE  
)

(final_grid <- expand.grid(
  nrounds = xgb_tune5$bestTune$nrounds,
  eta = xgb_tune5$bestTune$eta,
  max_depth = xgb_tune5$bestTune$max_depth,
  gamma = xgb_tune5$bestTune$gamma,
  colsample_bytree = xgb_tune5$bestTune$colsample_bytree,
  min_child_weight = xgb_tune5$bestTune$min_child_weight,
  subsample = xgb_tune5$bestTune$subsample
))
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5
m_xgb_tu <- caret::train(
  x = training_x,
  y = training_y,
  trControl = train_control,
  tuneGrid = final_grid,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
m_xgb_tu$bestTune
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5

Feature importance:

importance <- xgb.importance(model = m_xgb_tu$finalModel)
xgb.plot.importance(importance, top_n = 15)

3.2 Prediction on the testing dataset

p_xgb_tu <- predict(m_xgb_tu, testing_x)

Correlation:

cor(p_xgb_tu, testing_y)
[1] 0.9205904

MAE:

ModelMetrics::mae(testing_y, p_xgb_tu)
[1] 18.47843

RMSE:

ModelMetrics::rmse(testing_y, p_xgb_tu)
[1] 25.32764

Compared with the untuned model (MAE 24.13, RMSE 31.46), tuning reduced both error measures substantially.

Distribution of predicted (red) and actual (green) values:

3.3 Prediction with models trained on the full dataset

Both models are refit on the combined training and testing data, then evaluated on the training portion.

data <- bind_rows(read_rds("data/training.Rds"),
                  read_rds("data/testing.Rds"))

data_x <- as.matrix(select(data, -outcome))
data_y <- data$outcome

3.3.1 Default model

m_xgb_def <- caret::train(
  x = data_x,
  y = data_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = FALSE)
m_xgb_def$finalModel
##### xgb.Booster
raw: 181.3 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
# of features: 120 
niter: 100
nfeatures : 120 
xNames : akita alaskan_malamute american_staffordshire_terrier australian_cattle_dog australian_kelpie australian_koolie australian_shepherd australian_silky_terrier australian_terrier beagle belgian_shepherd bichon_frise border_collie boston_terrier boxer british_bulldog bull_arab bull_terrier bullmastiff cairn_terrier cavalier_king_charles_spaniel chihuahua cocker_spaniel curly_coated_retriever dachshund dalmatian dobermann dogue_de_bordeaux fox_terrier french_bulldog german_shepherd golden_retriever great_dane hungarian_vizsla irish_wolfhound jack_russell_terrier japanese_spitz keeshond king_charles_spaniel labrador_retriever lhasa_apso maltese maremma_sheepdog mastiff papillon pomeranian poodle poodle_toy pug rhodesian_ridgeback rottweiler samoyed schnauzer shar_pei shiba_inu shih_tzu siberian_husky staffordshire_bull_terrier tenterfield_terrier tibetan_spaniel tibetan_terrier welsh_corgi west_highland_white_terrier whippet yorkshire_terrier airedale_terrier bearded_collie english_springer_spaniel english_toy_terrier german_shorthaired_pointer greyhound irish_setter italian_greyhound old_english_sheepdog pekingese rough_collie shetland_sheepdog welsh_terrier basenji basset_hound bedlington_terrier bernese_mountain_dog border_terrier brittany chinese_crested_dog chow_chow finnish_lapphund german_spitz griffon_bruxellois havanese irish_terrier lowchen neapolitan_mastiff newfoundland poodle_miniature scottish_terrier soft_coated_wheaten_terrier weimaraner poodle_standard portuguese_water_dog chihuahua_smooth_coat fox_terrier_smooth pointer welsh_springer_spaniel lagotto_romagnolo smooth_collie st_bernard stag_hound flat_coated_retriever schipperke schnauzer_miniature english_setter manchester_terrier miniature_pinscher kerry_blue_terrier lakeland_terrier field_spaniel chihuahua_long_coat nova_scotia_duck_tolling_retriever anatolian_shepherd_dog 
problemType : Regression 
tuneValue :
  nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
1     100         6 0.3     0                1                1         1
obsLevels : NA 
param :
    $verbose
[1] FALSE

Feature importance:

importance <- xgb.importance(model = m_xgb_def$finalModel)
xgb.plot.importance(importance, top_n = 15)

p_xgb_def <- predict(m_xgb_def, training_x)

Correlation:

cor(p_xgb_def, training_y)
[1] 1

MAE:

ModelMetrics::mae(training_y, p_xgb_def)
[1] 0.0002829812

RMSE:

ModelMetrics::rmse(training_y, p_xgb_def)
[1] 0.0003897477

These in-sample metrics are near-perfect because the model was trained on data that includes every row being predicted; they say nothing about out-of-sample performance.

Distribution of predicted (red) and actual (green) values:

3.3.2 Tuned model

m_xgb_tu <- caret::train(
  x = data_x,
  y = data_y,
  trControl = train_control,
  tuneGrid = final_grid,
  method = "xgbTree",
  verbose = FALSE)
m_xgb_tu$finalModel
##### xgb.Booster
raw: 505.2 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.015", max_depth = "2", gamma = "0.7", colsample_bytree = "0.6", min_child_weight = "3", subsample = "0.5", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
# of features: 120 
niter: 600
nfeatures : 120 
xNames : akita alaskan_malamute american_staffordshire_terrier australian_cattle_dog australian_kelpie australian_koolie australian_shepherd australian_silky_terrier australian_terrier beagle belgian_shepherd bichon_frise border_collie boston_terrier boxer british_bulldog bull_arab bull_terrier bullmastiff cairn_terrier cavalier_king_charles_spaniel chihuahua cocker_spaniel curly_coated_retriever dachshund dalmatian dobermann dogue_de_bordeaux fox_terrier french_bulldog german_shepherd golden_retriever great_dane hungarian_vizsla irish_wolfhound jack_russell_terrier japanese_spitz keeshond king_charles_spaniel labrador_retriever lhasa_apso maltese maremma_sheepdog mastiff papillon pomeranian poodle poodle_toy pug rhodesian_ridgeback rottweiler samoyed schnauzer shar_pei shiba_inu shih_tzu siberian_husky staffordshire_bull_terrier tenterfield_terrier tibetan_spaniel tibetan_terrier welsh_corgi west_highland_white_terrier whippet yorkshire_terrier airedale_terrier bearded_collie english_springer_spaniel english_toy_terrier german_shorthaired_pointer greyhound irish_setter italian_greyhound old_english_sheepdog pekingese rough_collie shetland_sheepdog welsh_terrier basenji basset_hound bedlington_terrier bernese_mountain_dog border_terrier brittany chinese_crested_dog chow_chow finnish_lapphund german_spitz griffon_bruxellois havanese irish_terrier lowchen neapolitan_mastiff newfoundland poodle_miniature scottish_terrier soft_coated_wheaten_terrier weimaraner poodle_standard portuguese_water_dog chihuahua_smooth_coat fox_terrier_smooth pointer welsh_springer_spaniel lagotto_romagnolo smooth_collie st_bernard stag_hound flat_coated_retriever schipperke schnauzer_miniature english_setter manchester_terrier miniature_pinscher kerry_blue_terrier lakeland_terrier field_spaniel chihuahua_long_coat nova_scotia_duck_tolling_retriever anatolian_shepherd_dog 
problemType : Regression 
tuneValue :
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5
obsLevels : NA 
param :
    $verbose
[1] FALSE

Feature importance:

importance <- xgb.importance(model = m_xgb_tu$finalModel)
xgb.plot.importance(importance, top_n = 15)

p_xgb_tu <- predict(m_xgb_tu, training_x)

Correlation:

cor(p_xgb_tu, training_y)
[1] 0.983128

MAE:

ModelMetrics::mae(training_y, p_xgb_tu)
[1] 10.49284

RMSE:

ModelMetrics::rmse(training_y, p_xgb_tu)
[1] 13.42157

These are again in-sample metrics; the tuned model's stronger regularisation (shallower trees, row/column subsampling, nonzero gamma) keeps it from fitting the training data as tightly as the default model.

Distribution of predicted (red) and actual (green) values: