1 Background

Based on chapter 7 of the Machine Learning with R book, the caret package manual, and the randomForest package documentation.

2 CV setup

3-fold cross-validation, reused by the caret train() calls in the tuning sections below:

fit_control <- trainControl(method = "cv",
                            number = 3)

3 Analysis with randomForest

3.1 Data
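
The data-preparation step is not shown in this report. As a minimal sketch, assuming a source data frame df whose first column is outcome (the prediction calls below use testing[-1] to drop it), the split could have been made with caret's createDataPartition():

# Hypothetical split; df, the 75/25 proportion, and the seed are assumptions.
p_load(caret)

set.seed(12345)
in_train <- createDataPartition(df$outcome, p = 0.75, list = FALSE)
training <- df[in_train, ]
testing  <- df[-in_train, ]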

3.2 Training

set.seed(12345)
m_rf <- randomForest(outcome ~ ., data = training)

summary(m_rf)
                Length Class  Mode     
call              3    -none- call     
type              1    -none- character
predicted       143    -none- numeric  
mse             500    -none- numeric  
rsq             500    -none- numeric  
oob.times       143    -none- numeric  
importance      120    -none- numeric  
importanceSD      0    -none- NULL     
localImportance   0    -none- NULL     
proximity         0    -none- NULL     
ntree             1    -none- numeric  
mtry              1    -none- numeric  
forest           11    -none- list     
coefs             0    -none- NULL     
y               143    -none- numeric  
test              0    -none- NULL     
inbag             0    -none- NULL     
terms             3    terms  call     
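
The summary above lists the components of the fitted object; the defaults that were actually used can be pulled from them directly, for example:

# Defaults used by randomForest() (component names as listed above).
m_rf$ntree         # 500 trees by default
m_rf$mtry          # floor(p / 3) for regression by default
tail(m_rf$mse, 1)  # OOB MSE after the final tree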

3.3 Prediction

p_rf <- predict(m_rf, testing[-1])  # drop column 1, which holds the outcome

Correlation:

cor(p_rf, testing$outcome)
[1] 0.9109422

MAE between predicted and actual values:

MAE(p_rf, testing$outcome)
[1] 21.61364
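
caret's MAE() is simply the mean absolute difference, so the same number can be verified by hand:

# Equivalent computation without the helper function.
mean(abs(p_rf - testing$outcome))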

Distribution of predicted (red) and actual (green) values:
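
A minimal sketch of how such a density comparison could be drawn in base R (an assumption; the original plotting code is not shown):

# Sketch of the predicted-vs-actual comparison, colours as in the caption.
plot(density(p_rf), col = "red", main = "Predicted vs. actual")
lines(density(testing$outcome), col = "green")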

3.4 Tuning

modelLookup("rf")
  model parameter                         label forReg forClass probModel
1    rf      mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE

Using a grid of parameters to explore:

grid_tu <- expand.grid(mtry = seq(5, ncol(training) - 1, 1))

nrow(grid_tu)
[1] 116

p_load(doParallel)  # p_load() comes from the pacman package

cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)

set.seed(12345)
m_rf_tu <- train(outcome ~ .,
                 method = "rf",
                 data = training,
                 # weights = training_weights, 
                 metric = "RMSE",
                 # metric = "Rsquared",
                 tuneGrid = grid_tu,
                 trControl = fit_control)

stopCluster(cl)
p_unload(doParallel)

m_rf_tu$finalModel

Call:
 randomForest(x = x, y = y, mtry = param$mtry) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 99

          Mean of squared residuals: 1367.154
                    % Var explained: 72.79
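
The selected mtry and the full resampling profile are available through the standard components of the caret train object:

m_rf_tu$bestTune       # mtry selected by CV
head(m_rf_tu$results)  # RMSE/Rsquared/MAE per candidate mtry
plot(m_rf_tu)          # RMSE profile across the grid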

p_rf_tu <- predict(m_rf_tu, testing[-1])

Correlation:

cor(p_rf_tu, testing$outcome)
[1] 0.8669796

MAE between predicted and actual values:

MAE(p_rf_tu, testing$outcome)
[1] 24.97651

Distribution of predicted (red) and actual (green) values:

4 Analysis with ranger

4.1 Training

set.seed(12345)
m_rf2 <- ranger(outcome ~ ., data = training, 
                importance = "impurity") # impurity, impurity_corrected, permutation

summary(m_rf2)
                          Length Class         Mode     
predictions               143    -none-        numeric  
num.trees                   1    -none-        numeric  
num.independent.variables   1    -none-        numeric  
mtry                        1    -none-        numeric  
min.node.size               1    -none-        numeric  
variable.importance       120    -none-        numeric  
prediction.error            1    -none-        numeric  
forest                      7    ranger.forest list     
splitrule                   1    -none-        character
treetype                    1    -none-        character
r.squared                   1    -none-        numeric  
call                        4    -none-        call     
importance.mode             1    -none-        character
num.samples                 1    -none-        numeric  
replace                     1    -none-        logical  
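
Because the model was fitted with importance = "impurity", the scores are stored on the object (the variable.importance component above), for example:

# Top ten predictors by impurity importance.
head(sort(m_rf2$variable.importance, decreasing = TRUE), 10)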

4.2 Prediction

p_rf2 <- predict(m_rf2, testing[-1])  # ranger's predict() returns an object; see $predictions below

Correlation:

cor(p_rf2$predictions, testing$outcome)
[1] 0.9141625

MAE between predicted and actual values:

MAE(p_rf2$predictions, testing$outcome)
[1] 25.19897

Distribution of predicted (red) and actual (green) values:

4.3 Tuning

modelLookup("ranger")
   model     parameter                         label forReg forClass probModel
1 ranger          mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE
2 ranger     splitrule                Splitting Rule   TRUE     TRUE      TRUE
3 ranger min.node.size             Minimal Node Size   TRUE     TRUE      TRUE

Using a grid of parameters to explore:

grid_tu <- expand.grid(mtry = seq(5, ncol(training) - 1, 1),
                       splitrule = c("variance", "extratrees", "maxstat", "beta"),
                       min.node.size = seq(3, 30, 1))

nrow(grid_tu)
[1] 12992
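
With 12,992 combinations under 3-fold CV this exhaustive search is expensive, hence the parallel backend below. A lighter alternative (a sketch, not what this report uses) is caret's random search, which samples the tuning space instead of exhausting it:

# Random-search sketch (assumption: not run here).
fit_control_rs <- trainControl(method = "cv", number = 3, search = "random")
# Passing tuneLength = 50 and trControl = fit_control_rs to train() would then
# evaluate 50 random parameter combinations instead of the full grid.
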
p_load(doParallel)

cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)

set.seed(12345)
m_rf2_tu <- train(outcome ~ .,
                  method = "ranger",
                  data = training,
                  # weights = training_weights, 
                  metric = "RMSE",
                  # metric = "Rsquared",
                  tuneGrid = grid_tu,
                  trControl = fit_control)

stopCluster(cl)
p_unload(doParallel)

m_rf2_tu$finalModel
Ranger result

Call:
 ranger::ranger(dependent.variable.name = ".outcome", data = x,      mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size,      splitrule = as.character(param$splitrule), write.forest = TRUE,      probability = classProbs, ...) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      143 
Number of independent variables:  120 
Mtry:                             65 
Target node size:                 4 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       1339.663 
R squared (OOB):                  0.7352148 
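
Note that the final model reports Variable importance mode: none; caret does not request importance scores unless told to. As a sketch (an assumption, not run in this report), train() forwards extra arguments to the underlying ranger() call, so importance could be retained with:

# Pass-through of ranger arguments via train() (sketch; would refit the model).
m_rf2_tu <- train(outcome ~ .,
                  method = "ranger",
                  data = training,
                  metric = "RMSE",
                  tuneGrid = grid_tu,
                  trControl = fit_control,
                  importance = "impurity")
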
p_rf2_tu <- predict(m_rf2_tu, testing[-1])

Correlation:

cor(p_rf2_tu, testing$outcome)
[1] 0.8993273

MAE between predicted and actual values:

MAE(p_rf2_tu, testing$outcome)
[1] 22.60483

Distribution of predicted (red) and actual (green) values: