1 Background

Based on @jani's notebook on Kaggle.

2 Testing the base model

The model is trained with xgboost's default hyperparameter settings.

2.1 Data

The data need to be a numeric matrix of predictors, without the outcome column; the outcome is supplied as a separate vector.
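
A minimal sketch (assumed, not shown in the original) of how the training_x/training_y and testing_x/testing_y objects used below could be built; the file paths and the outcome column name are taken from the full-dataset code in section 3.3:

library(tidyverse)

training <- read_rds("data/training.Rds")
testing <- read_rds("data/testing.Rds")

# predictor matrix and outcome vector for each split
training_x <- as.matrix(select(training, -outcome))
training_y <- training$outcome
testing_x <- as.matrix(select(testing, -outcome))
testing_y <- testing$outcome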

2.2 Training

# xgboost's default hyperparameter values (nrounds has no default; 100 is used here)
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE  
)
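
allowParallel = TRUE only has an effect if a parallel backend has been registered; one hypothetical setup (not shown in the original) is:

library(doParallel)

cl <- makeCluster(parallel::detectCores() - 1) # leave one core free
registerDoParallel(cl)
# ... run the caret::train() calls ...
stopCluster(cl) # release the workers when done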

m_xgb <- caret::train(
  x = training_x,
  y = training_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = TRUE,
  verbosity = 0
)
summary(m_xgb$finalModel)
              Length Class              Mode       
handle             1 xgb.Booster.handle externalptr
raw           167853 -none-             raw        
niter              1 -none-             numeric    
call               7 -none-             call       
params             9 -none-             list       
callbacks          1 -none-             list       
feature_names    120 -none-             character  
nfeatures          1 -none-             numeric    
xNames           120 -none-             character  
problemType        1 -none-             character  
tuneValue          7 data.frame         list       
obsLevels          1 -none-             logical    
param              2 -none-             list       

Feature importance:

importance <- xgb.importance(model = m_xgb$finalModel)
xgb.plot.importance(importance, top_n = 15)

2.3 Prediction

p_xgb <- predict(m_xgb$finalModel, testing_x)

Correlation:

cor(p_xgb, testing_y)
[1] 0.8855791

MAE:

ModelMetrics::mae(testing_y, p_xgb)
[1] 24.13134

RMSE:

ModelMetrics::rmse(testing_y, p_xgb)
[1] 31.45925

Distribution of predicted (red) and actual (green) values:
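
The chunk that drew this figure is not shown; a minimal sketch that could produce a similar plot (assumed, not the author's original code):

library(ggplot2)

# density of predictions in red, actual outcomes in green
ggplot() +
  geom_density(aes(p_xgb), colour = "red") +
  geom_density(aes(testing_y), colour = "green")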

3 Tuning

3.1 Training

The parameters that caret can tune for xgbTree:

modelLookup("xgbTree")
    model        parameter                          label forReg forClass probModel
1 xgbTree          nrounds          # Boosting Iterations   TRUE     TRUE      TRUE
2 xgbTree        max_depth                 Max Tree Depth   TRUE     TRUE      TRUE
3 xgbTree              eta                      Shrinkage   TRUE     TRUE      TRUE
4 xgbTree            gamma         Minimum Loss Reduction   TRUE     TRUE      TRUE
5 xgbTree colsample_bytree     Subsample Ratio of Columns   TRUE     TRUE      TRUE
6 xgbTree min_child_weight Minimum Sum of Instance Weight   TRUE     TRUE      TRUE
7 xgbTree        subsample           Subsample Percentage   TRUE     TRUE      TRUE

3.1.1 Number of Iterations and the Learning Rate

Start by tuning the number of boosting iterations together with the learning rate (eta), holding the remaining parameters fixed:

nrounds <- 1000

tune_grid <- expand.grid(
  nrounds = seq(from = 200, to = nrounds, by = 50),
  eta = c(0.025, 0.05, 0.1, 0.3),
  max_depth = c(2, 3, 4, 5),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

tune_control <- caret::trainControl(
  method = "cv",
  number = 3,
  # index = createFolds(tr_treated$Id_clean), # fix the folds
  verboseIter = FALSE, # no training log
  allowParallel = TRUE 
)

set.seed(12345)

xgb_tune <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
17    1000         2 0.025     0                1                1         1

round(min(xgb_tune$results$RMSE), digits = 5)
[1] 34.82689
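
caret provides a ggplot method for train objects, so a small helper function (hypothetical, not part of the original) can show how CV RMSE develops across the grid; the probs argument clips extreme RMSE values so the region around the minimum stays readable:

library(ggplot2)

tuneplot <- function(x, probs = .90) {
  ggplot(x) +
    coord_cartesian(ylim = c(quantile(x$results$RMSE, probs = probs),
                             min(x$results$RMSE))) +
    theme_bw()
}

tuneplot(xgb_tune)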

3.1.2 Maximum Depth and Minimum Child Weight

Holding eta at its best value from the previous step, search a window of tree depths while tuning the minimum child weight:

tune_grid2 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  # ifelse() would return only the first element of a vector result, so use
  # a proper if/else to get a window of depths around the current best
  max_depth = if (xgb_tune$bestTune$max_depth == 2) {
    c(xgb_tune$bestTune$max_depth:4)
  } else {
    c((xgb_tune$bestTune$max_depth - 1):(xgb_tune$bestTune$max_depth + 1))
  },
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4),
  subsample = 1
)

set.seed(12345)

xgb_tune2 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid2,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune2$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
52     600         2 0.025     0                1                3         1

round(min(xgb_tune2$results$RMSE), digits = 5)
[1] 34.79478

3.1.3 Column and Row Sampling

With tree depth and minimum child weight fixed, tune the column and row sampling ratios:

tune_grid3 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = 0,
  colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = c(0.5, 0.75, 1.0)
)

xgb_tune3 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid3,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune3$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
72     600         2 0.025     0              0.6                3       0.5

round(min(xgb_tune3$results$RMSE), digits = 5)
[1] 34.52475

3.1.4 Gamma

Next, tune gamma, the minimum loss reduction required to make a further split:

tune_grid4 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = xgb_tune$bestTune$eta,
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune4 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid4,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune4$bestTune
   nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
92     600         2 0.025   0.7              0.6                3       0.5

round(min(xgb_tune4$results$RMSE), digits = 5)
[1] 34.28292

3.1.5 Reducing the Learning Rate

Finally, lower the learning rate and extend the search over boosting iterations to compensate:

tune_grid5 <- expand.grid(
  nrounds = seq(from = 100, to = 10000, by = 100),
  eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
  max_depth = xgb_tune2$bestTune$max_depth,
  gamma = xgb_tune4$bestTune$gamma,
  colsample_bytree = xgb_tune3$bestTune$colsample_bytree,
  min_child_weight = xgb_tune2$bestTune$min_child_weight,
  subsample = xgb_tune3$bestTune$subsample
)

xgb_tune5 <- caret::train(
  x = training_x,
  y = training_y,
  trControl = tune_control,
  tuneGrid = tune_grid5,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
xgb_tune5$bestTune
    nrounds max_depth   eta gamma colsample_bytree min_child_weight subsample
106     600         2 0.015   0.7              0.6                3       0.5

round(min(xgb_tune5$results$RMSE), digits = 5)
[1] 36.52597

Note that the minimum RMSE here is higher than in the previous step: the cross-validation folds are regenerated on each run (the index argument of trainControl is commented out), so results are not strictly comparable between steps.

3.1.6 Final model

Refit on the training set with method = "none", using the best hyperparameter combination found above:

train_control <- caret::trainControl(
  method = "none",
  verboseIter = FALSE, # no training log
  allowParallel = TRUE  
)

(final_grid <- expand.grid(
  nrounds = xgb_tune5$bestTune$nrounds,
  eta = xgb_tune5$bestTune$eta,
  max_depth = xgb_tune5$bestTune$max_depth,
  gamma = xgb_tune5$bestTune$gamma,
  colsample_bytree = xgb_tune5$bestTune$colsample_bytree,
  min_child_weight = xgb_tune5$bestTune$min_child_weight,
  subsample = xgb_tune5$bestTune$subsample
))
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5
m_xgb_tu <- caret::train(
  x = training_x,
  y = training_y,
  trControl = train_control,
  tuneGrid = final_grid,
  method = "xgbTree",
  verbose = FALSE,
  verbosity = 0
)
m_xgb_tu$bestTune
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5

Feature importance:

importance <- xgb.importance(model = m_xgb_tu$finalModel)
xgb.plot.importance(importance, top_n = 15)

3.2 Prediction on the testing dataset

p_xgb_tu <- predict(m_xgb_tu, testing_x)

Correlation:

cor(p_xgb_tu, testing_y)
[1] 0.9205904

MAE:

ModelMetrics::mae(testing_y, p_xgb_tu)
[1] 18.47843

RMSE:

ModelMetrics::rmse(testing_y, p_xgb_tu)
[1] 25.32764

Compared with the untuned model (MAE 24.13, RMSE 31.46), tuning reduced both error measures substantially.

Distribution of predicted (red) and actual (green) values:

3.3 Prediction with models trained on the full dataset

Both models are refit on the combined training and testing data, then evaluated on the training portion.

data <- bind_rows(read_rds("data/training.Rds"),
                  read_rds("data/testing.Rds"))

data_x <- as.matrix(select(data, -outcome))
data_y <- data$outcome

3.3.1 Default model

m_xgb_def <- caret::train(
  x = data_x,
  y = data_y,
  trControl = train_control,
  tuneGrid = grid_default,
  method = "xgbTree",
  verbose = FALSE)
m_xgb_def$finalModel
##### xgb.Booster
raw: 181.3 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
# of features: 120 
niter: 100
nfeatures : 120 
xNames : akita alaskan_malamute american_staffordshire_terrier australian_cattle_dog australian_kelpie australian_koolie australian_shepherd australian_silky_terrier australian_terrier beagle belgian_shepherd bichon_frise border_collie boston_terrier boxer british_bulldog bull_arab bull_terrier bullmastiff cairn_terrier cavalier_king_charles_spaniel chihuahua cocker_spaniel curly_coated_retriever dachshund dalmatian dobermann dogue_de_bordeaux fox_terrier french_bulldog german_shepherd golden_retriever great_dane hungarian_vizsla irish_wolfhound jack_russell_terrier japanese_spitz keeshond king_charles_spaniel labrador_retriever lhasa_apso maltese maremma_sheepdog mastiff papillon pomeranian poodle poodle_toy pug rhodesian_ridgeback rottweiler samoyed schnauzer shar_pei shiba_inu shih_tzu siberian_husky staffordshire_bull_terrier tenterfield_terrier tibetan_spaniel tibetan_terrier welsh_corgi west_highland_white_terrier whippet yorkshire_terrier airedale_terrier bearded_collie english_springer_spaniel english_toy_terrier german_shorthaired_pointer greyhound irish_setter italian_greyhound old_english_sheepdog pekingese rough_collie shetland_sheepdog welsh_terrier basenji basset_hound bedlington_terrier bernese_mountain_dog border_terrier brittany chinese_crested_dog chow_chow finnish_lapphund german_spitz griffon_bruxellois havanese irish_terrier lowchen neapolitan_mastiff newfoundland poodle_miniature scottish_terrier soft_coated_wheaten_terrier weimaraner poodle_standard portuguese_water_dog chihuahua_smooth_coat fox_terrier_smooth pointer welsh_springer_spaniel lagotto_romagnolo smooth_collie st_bernard stag_hound flat_coated_retriever schipperke schnauzer_miniature english_setter manchester_terrier miniature_pinscher kerry_blue_terrier lakeland_terrier field_spaniel chihuahua_long_coat nova_scotia_duck_tolling_retriever anatolian_shepherd_dog 
problemType : Regression 
tuneValue :
  nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
1     100         6 0.3     0                1                1         1
obsLevels : NA 
param :
    $verbose
[1] FALSE

Feature importance:

importance <- xgb.importance(model = m_xgb_def$finalModel)
xgb.plot.importance(importance, top_n = 15)

p_xgb_def <- predict(m_xgb_def, training_x)

Correlation:

cor(p_xgb_def, training_y)
[1] 1

MAE:

ModelMetrics::mae(training_y, p_xgb_def)
[1] 0.0002829812

RMSE:

ModelMetrics::rmse(training_y, p_xgb_def)
[1] 0.0003897477

These in-sample metrics are near-perfect because the model was trained on data that includes every row being predicted; they say nothing about out-of-sample performance.

Distribution of predicted (red) and actual (green) values:

3.3.2 Tuned model

m_xgb_tu <- caret::train(
  x = data_x,
  y = data_y,
  trControl = train_control,
  tuneGrid = final_grid,
  method = "xgbTree",
  verbose = FALSE)
m_xgb_tu$finalModel
##### xgb.Booster
raw: 505.2 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.015", max_depth = "2", gamma = "0.7", colsample_bytree = "0.6", min_child_weight = "3", subsample = "0.5", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
# of features: 120 
niter: 600
nfeatures : 120 
xNames : akita alaskan_malamute american_staffordshire_terrier australian_cattle_dog australian_kelpie australian_koolie australian_shepherd australian_silky_terrier australian_terrier beagle belgian_shepherd bichon_frise border_collie boston_terrier boxer british_bulldog bull_arab bull_terrier bullmastiff cairn_terrier cavalier_king_charles_spaniel chihuahua cocker_spaniel curly_coated_retriever dachshund dalmatian dobermann dogue_de_bordeaux fox_terrier french_bulldog german_shepherd golden_retriever great_dane hungarian_vizsla irish_wolfhound jack_russell_terrier japanese_spitz keeshond king_charles_spaniel labrador_retriever lhasa_apso maltese maremma_sheepdog mastiff papillon pomeranian poodle poodle_toy pug rhodesian_ridgeback rottweiler samoyed schnauzer shar_pei shiba_inu shih_tzu siberian_husky staffordshire_bull_terrier tenterfield_terrier tibetan_spaniel tibetan_terrier welsh_corgi west_highland_white_terrier whippet yorkshire_terrier airedale_terrier bearded_collie english_springer_spaniel english_toy_terrier german_shorthaired_pointer greyhound irish_setter italian_greyhound old_english_sheepdog pekingese rough_collie shetland_sheepdog welsh_terrier basenji basset_hound bedlington_terrier bernese_mountain_dog border_terrier brittany chinese_crested_dog chow_chow finnish_lapphund german_spitz griffon_bruxellois havanese irish_terrier lowchen neapolitan_mastiff newfoundland poodle_miniature scottish_terrier soft_coated_wheaten_terrier weimaraner poodle_standard portuguese_water_dog chihuahua_smooth_coat fox_terrier_smooth pointer welsh_springer_spaniel lagotto_romagnolo smooth_collie st_bernard stag_hound flat_coated_retriever schipperke schnauzer_miniature english_setter manchester_terrier miniature_pinscher kerry_blue_terrier lakeland_terrier field_spaniel chihuahua_long_coat nova_scotia_duck_tolling_retriever anatolian_shepherd_dog 
problemType : Regression 
tuneValue :
  nrounds   eta max_depth gamma colsample_bytree min_child_weight subsample
1     600 0.015         2   0.7              0.6                3       0.5
obsLevels : NA 
param :
    $verbose
[1] FALSE

Feature importance:

importance <- xgb.importance(model = m_xgb_tu$finalModel)
xgb.plot.importance(importance, top_n = 15)

p_xgb_tu <- predict(m_xgb_tu, training_x)

Correlation:

cor(p_xgb_tu, training_y)
[1] 0.983128

MAE:

ModelMetrics::mae(training_y, p_xgb_tu)
[1] 10.49284

RMSE:

ModelMetrics::rmse(training_y, p_xgb_tu)
[1] 13.42157

These are again in-sample metrics; the tuned model's stronger regularisation (shallower trees, row/column subsampling, nonzero gamma) keeps it from fitting the training data as tightly as the default model.

Distribution of predicted (red) and actual (green) values: