Using info from chapter 7 of the Machine Learning with R book,
the caret package manual, and
the randomForest package.
3-fold CV.
fit_control <- trainControl(method = "cv",
                            number = 3)

randomForest

set.seed(12345)
m_rf <- randomForest(outcome ~ ., data = training)

                 Length Class  Mode
call 3 -none- call
type 1 -none- character
predicted 143 -none- numeric
mse 500 -none- numeric
rsq 500 -none- numeric
oob.times 143 -none- numeric
importance 120 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 143 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
p_rf <- predict(m_rf, testing[-1])

Correlation:
cor(p_rf, testing$outcome)
[1] 0.9109422
MAE between predicted and actual values:
MAE(p_rf, testing$outcome)
[1] 21.61364
Distribution of predicted (red) and actual (green):
modelLookup("rf")
  model parameter                         label forReg forClass probModel
1 rf mtry #Randomly Selected Predictors TRUE TRUE TRUE
Using grid of parameters to explore:
# Tuning grid for randomForest: try every mtry value from 5 up to the
# number of predictor columns (all columns except the outcome).
grid_tu <- expand.grid(
  mtry = seq(from = 5, to = ncol(training) - 1, by = 1)
)
nrow(grid_tu)
[1] 116
# Tune the randomForest model over grid_tu with caret, running the
# cross-validation folds in parallel on all available cores.
p_load(doParallel)
cluster <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cluster)

set.seed(12345)
m_rf_tu <- train(outcome ~ .,
                 data = training,
                 method = "rf",
                 # weights = training_weights,
                 metric = "RMSE",
                 # metric = "Rsquared",
                 tuneGrid = grid_tu,
                 trControl = fit_control)

# Release the worker processes once training is done.
stopCluster(cluster)
p_unload(doParallel)

m_rf_tu$finalModel
Call:
randomForest(x = x, y = y, mtry = param$mtry)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 99
Mean of squared residuals: 1367.154
% Var explained: 72.79
p_rf_tu <- predict(m_rf_tu, testing[-1])

Correlation:
cor(p_rf_tu, testing$outcome)
[1] 0.8669796
MAE between predicted and actual values:
MAE(p_rf_tu, testing$outcome)
[1] 24.97651
Distribution of predicted (red) and actual (green):
ranger

set.seed(12345)
m_rf2 <- ranger(outcome ~ ., data = training,
                importance = "impurity") # impurity, impurity_corrected, permutation

                          Length Class         Mode
predictions 143 -none- numeric
num.trees 1 -none- numeric
num.independent.variables 1 -none- numeric
mtry 1 -none- numeric
min.node.size 1 -none- numeric
variable.importance 120 -none- numeric
prediction.error 1 -none- numeric
forest 7 ranger.forest list
splitrule 1 -none- character
treetype 1 -none- character
r.squared 1 -none- numeric
call 4 -none- call
importance.mode 1 -none- character
num.samples 1 -none- numeric
replace 1 -none- logical
p_rf2 <- predict(m_rf2, testing[-1])

Correlation:
cor(p_rf2$predictions, testing$outcome)
[1] 0.9141625
MAE between predicted and actual values:
MAE(p_rf2$predictions, testing$outcome)
[1] 25.19897
Distribution of predicted (red) and actual (green):
modelLookup("ranger")
   model     parameter                         label forReg forClass probModel
1 ranger mtry #Randomly Selected Predictors TRUE TRUE TRUE
2 ranger splitrule Splitting Rule TRUE TRUE TRUE
3 ranger min.node.size Minimal Node Size TRUE TRUE TRUE
Using grid of parameters to explore:
# Tuning grid for ranger: full cross of mtry (5 .. number of
# predictors), the four split rules, and minimum node sizes 3 .. 30.
grid_tu <- expand.grid(
  mtry          = seq(from = 5, to = ncol(training) - 1, by = 1),
  splitrule     = c("variance", "extratrees", "maxstat", "beta"),
  min.node.size = seq(from = 3, to = 30, by = 1)
)
nrow(grid_tu)
[1] 12992
# Tune the ranger model over grid_tu with caret, running the
# cross-validation folds in parallel on all available cores.
p_load(doParallel)
cluster <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cluster)

set.seed(12345)
m_rf2_tu <- train(outcome ~ .,
                  data = training,
                  method = "ranger",
                  # weights = training_weights,
                  metric = "RMSE",
                  # metric = "Rsquared",
                  tuneGrid = grid_tu,
                  trControl = fit_control)

# Release the worker processes once training is done.
stopCluster(cluster)
p_unload(doParallel)

m_rf2_tu$finalModel
Ranger result
Call:
ranger::ranger(dependent.variable.name = ".outcome", data = x, mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size, splitrule = as.character(param$splitrule), write.forest = TRUE, probability = classProbs, ...)
Type: Regression
Number of trees: 500
Sample size: 143
Number of independent variables: 120
Mtry: 65
Target node size: 4
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 1339.663
R squared (OOB): 0.7352148
p_rf2_tu <- predict(m_rf2_tu, testing[-1])

Correlation:
cor(p_rf2_tu, testing$outcome)
[1] 0.8993273
MAE between predicted and actual values:
MAE(p_rf2_tu, testing$outcome)
[1] 22.60483
Distribution of predicted (red) and actual (green):