Using information from chapter 6 of the Machine Learning with R book, the caret package manual, and the Cubist package vignette.
3-fold CV:
fit_control <- trainControl(method = "cv",
                            number = 3)
rpart
set.seed(12345)
m_rpart <- rpart(outcome ~ ., data = training)
# m_rpart <- rpart(outcome ~ ., data = training, weights = training_weights)
n= 143
node), split, n, deviance, yval
* denotes terminal node
1) root 143 718439.400 1071.7690
2) staffordshire_bull_terrier>=0.0857157 20 83244.550 954.6500
4) cavalier_king_charles_spaniel< 0.01754728 7 16307.430 882.7143 *
5) cavalier_king_charles_spaniel>=0.01754728 13 11209.080 993.3846 *
3) staffordshire_bull_terrier< 0.0857157 123 316248.700 1090.8130
6) poodle< 0.0699912 79 157894.000 1068.0000
12) american_staffordshire_terrier>=0.01869916 31 56149.940 1037.7420
24) cavalier_king_charles_spaniel< 0.03437848 15 16890.930 1006.9330 *
25) cavalier_king_charles_spaniel>=0.03437848 16 11673.750 1066.6250 *
13) american_staffordshire_terrier< 0.01869916 48 55031.920 1087.5420
26) irish_wolfhound< 0.00690611 40 34580.400 1080.2000
52) greyhound< 0.003563465 9 3006.000 1053.0000 *
53) greyhound>=0.003563465 31 22982.710 1088.0970
106) japanese_spitz< 0.004057787 24 14109.960 1079.5420 *
107) japanese_spitz>=0.004057787 7 1093.714 1117.4290 *
27) irish_wolfhound>=0.00690611 8 7515.500 1124.2500 *
7) poodle>=0.0699912 44 43421.730 1131.7730
14) jack_russell_terrier>=0.03578173 18 10485.610 1112.7220 *
15) jack_russell_terrier< 0.03578173 26 21880.960 1144.9620 *
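The fitted tree can also be visualised; a minimal sketch, assuming the rpart.plot package is available:
p_load(rpart.plot)
# Draw the regression tree with terminal nodes aligned along the bottom
rpart.plot(m_rpart, digits = 4, fallen.leaves = TRUE)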
p_rpart <- predict(m_rpart, testing[-1])
Correlation:
cor(p_rpart, testing$outcome)
[1] 0.6417208
MAE between predicted and actual values:
MAE(p_rpart, testing$outcome)
[1] 42.47538
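MAE() is caret's mean-absolute-error helper; an equivalent hand-rolled version, shown only as a sketch to make explicit what is being computed:
# Mean absolute error: average absolute difference between predictions and observations
mae <- function(pred, obs) mean(abs(pred - obs))
mae(p_rpart, testing$outcome)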
Distribution of predicted (red) and actual (green) values:
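A minimal sketch of how such a comparison plot could be drawn, assuming ggplot2 and tidyr are loaded and that red/green map to predicted/actual as in the caption (the original plot may differ):
p_load(tidyverse)
# Overlay density curves of predicted vs actual outcome values
tibble(predicted = p_rpart, actual = testing$outcome) %>%
  pivot_longer(everything(), names_to = "series", values_to = "value") %>%
  ggplot(aes(value, colour = series)) +
  geom_density() +
  scale_colour_manual(values = c(predicted = "red", actual = "green"))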
modelLookup("rpart")
model parameter label forReg forClass probModel
1 rpart cp Complexity Parameter TRUE TRUE TRUE
Using a grid of parameters to explore:
grid_tu <- expand.grid(cp = seq(0, 1, 0.01))
nrow(grid_tu)
[1] 101
p_load(doParallel)
cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)
set.seed(12345)
m_rpart_tu <- train(outcome ~ .,
                    method = "rpart",
                    # method = "rpart2", # uses max tree depth
                    data = training,
                    # weights = training_weights,
                    metric = "RMSE",
                    # metric = "Rsquared",
                    tuneGrid = grid_tu,
                    trControl = fit_control)
stopCluster(cl)
p_unload(doParallel)
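Which complexity parameter won the cross-validation can be checked with the usual caret accessors (a quick sketch):
# cp value selected by 3-fold CV and the RMSE profile across the grid
m_rpart_tu$bestTune
plot(m_rpart_tu)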
m_rpart_tu$finalModel
n= 143
node), split, n, deviance, yval
* denotes terminal node
1) root 143 718439.40 1071.7690
2) staffordshire_bull_terrier>=0.0857157 20 83244.55 954.6500
4) cavalier_king_charles_spaniel< 0.01754728 7 16307.43 882.7143 *
5) cavalier_king_charles_spaniel>=0.01754728 13 11209.08 993.3846 *
3) staffordshire_bull_terrier< 0.0857157 123 316248.70 1090.8130
6) poodle< 0.0699912 79 157894.00 1068.0000
12) american_staffordshire_terrier>=0.01869916 31 56149.94 1037.7420
24) cavalier_king_charles_spaniel< 0.03437848 15 16890.93 1006.9330 *
25) cavalier_king_charles_spaniel>=0.03437848 16 11673.75 1066.6250 *
13) american_staffordshire_terrier< 0.01869916 48 55031.92 1087.5420 *
7) poodle>=0.0699912 44 43421.73 1131.7730 *
p_rpart_tu <- predict(m_rpart_tu, testing[-1])
Correlation:
cor(p_rpart_tu, testing$outcome)
[1] 0.599047
MAE between predicted and actual values:
MAE(p_rpart_tu, testing$outcome)
[1] 42.81571
Distribution of predicted (red) and actual (green) values:
Cubist
set.seed(12345)
m_cubist <- cubist(x = training[-1], y = training$outcome)
Call:
cubist.default(x = training[-1], y = training$outcome)
Cubist [Release 2.07 GPL Edition] Fri Feb 10 21:18:18 2023
---------------------------------
Target attribute `outcome'
Read 143 cases (121 attributes) from undefined.data
Model:
Rule 1: [8 cases, mean 918.2, range 795 to 1024, est err 29.1]
if
poodle <= 0.02212389
then
outcome = 1024 - 5640 poodle - 703 staffordshire_bull_terrier
Rule 2: [135 cases, mean 1080.9, range 899 to 1198, est err 24.3]
if
poodle > 0.02212389
then
outcome = 1141.1 - 1304 american_staffordshire_terrier
- 743 staffordshire_bull_terrier - 787 fox_terrier
+ 588 poodle - 1124 rottweiler + 219 labrador_retriever
- 1898 bullmastiff + 285 border_collie
- 579 jack_russell_terrier - 151 shih_tzu
+ 859 scottish_terrier + 153 schnauzer
Evaluation on training data (143 cases):
Average |error| 24.0
Relative |error| 0.44
Correlation coefficient 0.90
Attribute usage:
Conds Model
100% 100% poodle
100% staffordshire_bull_terrier
94% american_staffordshire_terrier
94% border_collie
94% bullmastiff
94% fox_terrier
94% jack_russell_terrier
94% labrador_retriever
94% rottweiler
94% schnauzer
94% shih_tzu
94% scottish_terrier
Time: 0.0 secs
p_cubist <- predict(m_cubist, testing[-1])
Correlation:
cor(p_cubist, testing$outcome)
[1] 0.8647995
MAE between predicted and actual values:
MAE(testing$outcome, p_cubist)
[1] 25.19107
Distribution of predicted (red) and actual (green) values:
Variable importance:
varImp(m_cubist) %>%
as_tibble(rownames = "breed")
# A tibble: 120 × 2
breed Overall
<chr> <dbl>
1 poodle 100
2 staffordshire_bull_terrier 50
3 american_staffordshire_terrier 47
4 border_collie 47
5 bullmastiff 47
6 fox_terrier 47
7 jack_russell_terrier 47
8 labrador_retriever 47
9 rottweiler 47
10 schnauzer 47
# … with 110 more rows
# vip::vi(m_cubist)
modelLookup("cubist")
model parameter label forReg forClass probModel
1 cubist committees #Committees TRUE FALSE FALSE
2 cubist neighbors #Instances TRUE FALSE FALSE
Using a grid of parameters to explore:
grid_tu <- expand.grid(committees = (50:100),
                       neighbors = (5:9))
nrow(grid_tu)
[1] 255
p_load(doParallel)
cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)
set.seed(12345)
m_cubist_tu <- train(outcome ~ .,
                     method = "cubist",
                     data = training,
                     # weights = training_weights,
                     metric = "RMSE",
                     # metric = "Rsquared",
                     tuneGrid = grid_tu,
                     trControl = fit_control)
stopCluster(cl)
p_unload(doParallel)
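The selected committees/neighbors pair can be inspected the same way; caret fits the final Cubist model with the chosen number of committees only and applies neighbors at prediction time, which is consistent with the call shown below. A quick sketch:
# Best committees/neighbors combination found by CV, plus the tuning profile
m_cubist_tu$bestTune
plot(m_cubist_tu)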
m_cubist_tu$finalModel
Call:
cubist.default(x = x, y = y, committees = param$committees)
Number of samples: 143
Number of predictors: 120
Number of committees: 83
Number of rules per committee: 2, 2, 2, 1, 3, 1, 3, 1, 2, 1, 3, 1, 3, 1, 2, 1, 2, 1, 3, 4 ...
p_cubist_tu <- predict(m_cubist_tu, testing[-1])
Correlation:
cor(p_cubist_tu, testing$outcome)
[1] 0.8919657
MAE between predicted and actual values:
MAE(p_cubist_tu, testing$outcome)
[1] 21.46933
Distribution of predicted (red) and actual (green) values:
Variable importance:
varImp(m_cubist_tu)
cubist variable importance
only 20 most important variables shown (out of 120)
Overall
poodle 100
staffordshire_bull_terrier 76
american_staffordshire_terrier 74
fox_terrier 58
rottweiler 43
west_highland_white_terrier 43
bull_arab 38
german_shepherd 37
cavalier_king_charles_spaniel 36
golden_retriever 34
labrador_retriever 32
bullmastiff 30
maltese 27
border_collie 27
shih_tzu 24
jack_russell_terrier 22
cocker_spaniel 22
shar_pei 22
keeshond 20
italian_greyhound 19
Rules:
p_load(tidyrules, DT)
tidyRules(m_cubist_tu$finalModel) %>%
datatable()