1 Background

Using info from chapter 6 of Machine Learning with R book, caret package manual, and Cubist package vignette.

2 CV set up

3-fold CV.

fit_control <- trainControl(method = "cv",
                            number = 3)

3 Analysis with rpart

3.1 Data

3.2 Training

set.seed(12345)
m_rpart <- rpart(outcome ~ ., data = training)
# m_rpart <- rpart(outcome ~ ., data = training, weights = training_weights)
n= 143 

node), split, n, deviance, yval
      * denotes terminal node

  1) root 143 718439.400 1071.7690  
    2) staffordshire_bull_terrier>=0.0857157 20  83244.550  954.6500  
      4) cavalier_king_charles_spaniel< 0.01754728 7  16307.430  882.7143 *
      5) cavalier_king_charles_spaniel>=0.01754728 13  11209.080  993.3846 *
    3) staffordshire_bull_terrier< 0.0857157 123 316248.700 1090.8130  
      6) poodle< 0.0699912 79 157894.000 1068.0000  
       12) american_staffordshire_terrier>=0.01869916 31  56149.940 1037.7420  
         24) cavalier_king_charles_spaniel< 0.03437848 15  16890.930 1006.9330 *
         25) cavalier_king_charles_spaniel>=0.03437848 16  11673.750 1066.6250 *
       13) american_staffordshire_terrier< 0.01869916 48  55031.920 1087.5420  
         26) irish_wolfhound< 0.00690611 40  34580.400 1080.2000  
           52) greyhound< 0.003563465 9   3006.000 1053.0000 *
           53) greyhound>=0.003563465 31  22982.710 1088.0970  
            106) japanese_spitz< 0.004057787 24  14109.960 1079.5420 *
            107) japanese_spitz>=0.004057787 7   1093.714 1117.4290 *
         27) irish_wolfhound>=0.00690611 8   7515.500 1124.2500 *
      7) poodle>=0.0699912 44  43421.730 1131.7730  
       14) jack_russell_terrier>=0.03578173 18  10485.610 1112.7220 *
       15) jack_russell_terrier< 0.03578173 26  21880.960 1144.9620 *

3.3 Prediction

p_rpart <- predict(m_rpart, testing[-1])

Correlation:

cor(p_rpart, testing$outcome)
[1] 0.6417208

MAE between predicted and actual values:

MAE(p_rpart, testing$outcome)
[1] 42.47538

Distro of predicted (red) nd actual (green):

3.4 Tuning

modelLookup("rpart")
  model parameter                label forReg forClass probModel
1 rpart        cp Complexity Parameter   TRUE     TRUE      TRUE

Using grid of parameters to explore:

grid_tu <- expand.grid(cp = seq(0, 1, 0.01))

nrow(grid_tu)
[1] 101
p_load(doParallel)

cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)

set.seed(12345)
m_rpart_tu <- train(outcome ~ .,
                    method = "rpart",
                    # method = "rpart2", # uses max tree depth
                    data = training,
                    # weights = training_weights, 
                    metric = "RMSE",
                    # metric = "Rsquared",
                    tuneGrid = grid_tu,
                    trControl = fit_control)

stopCluster(cl)
p_unload(doParallel)

m_rpart_tu$finalModel
n= 143 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 143 718439.40 1071.7690  
   2) staffordshire_bull_terrier>=0.0857157 20  83244.55  954.6500  
     4) cavalier_king_charles_spaniel< 0.01754728 7  16307.43  882.7143 *
     5) cavalier_king_charles_spaniel>=0.01754728 13  11209.08  993.3846 *
   3) staffordshire_bull_terrier< 0.0857157 123 316248.70 1090.8130  
     6) poodle< 0.0699912 79 157894.00 1068.0000  
      12) american_staffordshire_terrier>=0.01869916 31  56149.94 1037.7420  
        24) cavalier_king_charles_spaniel< 0.03437848 15  16890.93 1006.9330 *
        25) cavalier_king_charles_spaniel>=0.03437848 16  11673.75 1066.6250 *
      13) american_staffordshire_terrier< 0.01869916 48  55031.92 1087.5420 *
     7) poodle>=0.0699912 44  43421.73 1131.7730 *

p_rpart_tu <- predict(m_rpart_tu, testing[-1])

Correlation:

cor(p_rpart_tu, testing$outcome)
[1] 0.599047

MAE between predicted and actual values:

MAE(p_rpart_tu, testing$outcome)
[1] 42.81571

Distro of predicted (red) nd actual (green):

4 Analysis with Cubist

4.1 Training

set.seed(12345)
m_cubist <- cubist(x = training[-1], y = training$outcome)

Call:
cubist.default(x = training[-1], y = training$outcome)


Cubist [Release 2.07 GPL Edition]  Fri Feb 10 21:18:18 2023
---------------------------------

    Target attribute `outcome'

Read 143 cases (121 attributes) from undefined.data

Model:

  Rule 1: [8 cases, mean 918.2, range 795 to 1024, est err 29.1]

    if
    poodle <= 0.02212389
    then
    outcome = 1024 - 5640 poodle - 703 staffordshire_bull_terrier

  Rule 2: [135 cases, mean 1080.9, range 899 to 1198, est err 24.3]

    if
    poodle > 0.02212389
    then
    outcome = 1141.1 - 1304 american_staffordshire_terrier
              - 743 staffordshire_bull_terrier - 787 fox_terrier
              + 588 poodle - 1124 rottweiler + 219 labrador_retriever
              - 1898 bullmastiff + 285 border_collie
              - 579 jack_russell_terrier - 151 shih_tzu
              + 859 scottish_terrier + 153 schnauzer


Evaluation on training data (143 cases):

    Average  |error|               24.0
    Relative |error|               0.44
    Correlation coefficient        0.90


    Attribute usage:
      Conds  Model

      100%   100%    poodle
             100%    staffordshire_bull_terrier
              94%    american_staffordshire_terrier
              94%    border_collie
              94%    bullmastiff
              94%    fox_terrier
              94%    jack_russell_terrier
              94%    labrador_retriever
              94%    rottweiler
              94%    schnauzer
              94%    shih_tzu
              94%    scottish_terrier


Time: 0.0 secs

4.2 Prediction

p_cubist <- predict(m_cubist, testing[-1])

Correlation:

cor(p_cubist, testing$outcome)
[1] 0.8647995

MAE between predicted and actual values:

MAE(testing$outcome, p_cubist)
[1] 25.19107

Distro of predicted (red) nd actual (green):

Variable importance:

varImp(m_cubist) %>% 
  as_tibble(rownames = "breed")
# A tibble: 120 × 2
   breed                          Overall
   <chr>                            <dbl>
 1 poodle                             100
 2 staffordshire_bull_terrier          50
 3 american_staffordshire_terrier      47
 4 border_collie                       47
 5 bullmastiff                         47
 6 fox_terrier                         47
 7 jack_russell_terrier                47
 8 labrador_retriever                  47
 9 rottweiler                          47
10 schnauzer                           47
# … with 110 more rows
# vip::vi(m_cubist)

4.3 Tuning

modelLookup("cubist")
   model  parameter       label forReg forClass probModel
1 cubist committees #Committees   TRUE    FALSE     FALSE
2 cubist  neighbors  #Instances   TRUE    FALSE     FALSE

Using grid of parameters to explore:

grid_tu <-  expand.grid(committees = (50:100), 
                        neighbors = (5:9))

nrow(grid_tu)
[1] 255
p_load(doParallel)

cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)

set.seed(12345)
m_cubist_tu <- train(outcome ~ .,
                     method = "cubist",
                     data = training,
                     # weights = training_weights, 
                     metric = "RMSE",
                     # metric = "Rsquared",
                     tuneGrid = grid_tu,
                     trControl = fit_control)

stopCluster(cl)
p_unload(doParallel)

m_cubist_tu$finalModel

Call:
cubist.default(x = x, y = y, committees = param$committees)

Number of samples: 143 
Number of predictors: 120 

Number of committees: 83 
Number of rules per committee: 2, 2, 2, 1, 3, 1, 3, 1, 2, 1, 3, 1, 3, 1, 2, 1, 2, 1, 3, 4 ... 
p_cubist_tu <- predict(m_cubist_tu, testing[-1])

Correlation:

cor(p_cubist_tu, testing$outcome)
[1] 0.8919657

MAE between predicted and actual values:

MAE(p_cubist_tu, testing$outcome)
[1] 21.46933

Distro of predicted (red) nd actual (green):

Variable importance:

varImp(m_cubist_tu)
cubist variable importance

  only 20 most important variables shown (out of 120)

                               Overall
poodle                             100
staffordshire_bull_terrier          76
american_staffordshire_terrier      74
fox_terrier                         58
rottweiler                          43
west_highland_white_terrier         43
bull_arab                           38
german_shepherd                     37
cavalier_king_charles_spaniel       36
golden_retriever                    34
labrador_retriever                  32
bullmastiff                         30
maltese                             27
border_collie                       27
shih_tzu                            24
jack_russell_terrier                22
cocker_spaniel                      22
shar_pei                            22
keeshond                            20
italian_greyhound                   19

Rules:

p_load(tidyrules)

tidyRules(m_cubist_tu$finalModel) %>% 
  datatable()