1 Background

Using information from chapter 7 of the Machine Learning with R book, the caret package manual, and the e1071 package vignette.

2 Data

2.1 Prepared data

Data preparation is in file 00.Rmd.

Using the Index of Education and Occupation (IEO) as the outcome. More information about the index is available from the ABS.

# wide_all_n <- read_rds("data/wide_all_n.Rds") %>% 
#   sf::st_drop_geometry() %>% 
#   # as_tibble() %>% 
#   select(IEO, IEO_d, 
#          akita:last_col())

wide_all_p <- read_rds("data/wide_all_p.Rds") %>% 
  sf::st_drop_geometry() %>% 
  # as_tibble() %>% 
  select(IEO, IEO_d, 
         akita:last_col()) %>% 
  rename(outcome = IEO,
         outcome_d = IEO_d)

# tail(names(wide_all_p), n = 1)
# skimr::skim(wide_all_p)
# report::report(wide_all_p)

2.2 Near-zero variance

# Identify near zero-variance predictors; keep the metrics for inspection
nzv_metrics <- nearZeroVar(wide_all_p, saveMetrics = TRUE)

# Column indices of the flagged predictors, then drop them
nzv <- nearZeroVar(wide_all_p)
wide_all_p <- wide_all_p[, -nzv]
# wide_all_n <- wide_all_n[, -nzv]
rm(nzv)
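
The flagged predictors can be reviewed through the saved metrics (a small sketch; freqRatio, percentUnique and the logical nzv flag are the columns returned by nearZeroVar(saveMetrics = TRUE)):

# Frequency ratio and percent of unique values for the flagged predictors
nzv_metrics[nzv_metrics$nzv, c("freqRatio", "percentUnique")]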

2.3 Train split

A 75/25 train/test split, stratified by the distribution of the outcome deciles:

inTrain <- createDataPartition(
  y = wide_all_p$outcome_d,
  p = .75,
  list = FALSE
)

training <- wide_all_p[ inTrain, ]
testing  <- wide_all_p[-inTrain, ]

# training_weights <- wide_all_n[ inTrain,]
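
A quick sanity check on the resulting split sizes (a small addition, not part of the original output):

# Number of rows in each split (target proportion: 75% / 25%)
nrow(training)
nrow(testing)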

Training data:

outcome_d <categorical> 
# total N=143 valid N=143 mean=5.45 sd=2.88

Value |  N | Raw % | Valid % | Cum. %
-------------------------------------
    1 | 14 |  9.79 |    9.79 |   9.79
    2 | 15 | 10.49 |   10.49 |  20.28
    3 | 15 | 10.49 |   10.49 |  30.77
    4 | 15 | 10.49 |   10.49 |  41.26
    5 | 14 |  9.79 |    9.79 |  51.05
    6 | 14 |  9.79 |    9.79 |  60.84
    7 | 14 |  9.79 |    9.79 |  70.63
    8 | 14 |  9.79 |    9.79 |  80.42
    9 | 14 |  9.79 |    9.79 |  90.21
   10 | 14 |  9.79 |    9.79 | 100.00
 <NA> |  0 |  0.00 |    <NA> |   <NA>

Testing data:

outcome_d <categorical> 
# total N=40 valid N=40 mean=5.50 sd=2.91

Value | N | Raw % | Valid % | Cum. %
------------------------------------
    1 | 4 |    10 |      10 |     10
    2 | 4 |    10 |      10 |     20
    3 | 4 |    10 |      10 |     30
    4 | 4 |    10 |      10 |     40
    5 | 4 |    10 |      10 |     50
    6 | 4 |    10 |      10 |     60
    7 | 4 |    10 |      10 |     70
    8 | 4 |    10 |      10 |     80
    9 | 4 |    10 |      10 |     90
   10 | 4 |    10 |      10 |    100
 <NA> | 0 |     0 |    <NA> |   <NA>

Outcome distribution checks:
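
The checks themselves are not reproduced above; one way to compare the outcome across the two splits is an overlaid density plot (a minimal sketch with ggplot2, not necessarily what the original used):

library(ggplot2)
library(dplyr)

# Overlay the outcome densities of the training and testing splits
bind_rows(
  training %>% mutate(split = "training"),
  testing  %>% mutate(split = "testing")
) %>%
  ggplot(aes(x = outcome, colour = split)) +
  geom_density() +
  labs(title = "Outcome (IEO) by split", x = "IEO", y = "density")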

3 Analysis with e1071

3.1 Training

set.seed(12345)
m_svm <- svm(outcome ~ ., data = training,
             type = "eps-regression",
             kernel = "radial")

m_svm

Call:
svm(formula = outcome ~ ., data = training, type = "eps-regression", 
    kernel = "radial")


Parameters:
   SVM-Type:  eps-regression 
 SVM-Kernel:  radial 
       cost:  1 
      gamma:  0.008333333 
    epsilon:  0.1 


Number of Support Vectors:  119

3.2 Prediction

p_svm <- predict(m_svm, testing)

Correlation:

cor(p_svm, testing$outcome)
[1] 0.8064335

MAE between predicted and actual values:

MAE(p_svm, testing$outcome)
[1] 32.38492
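
MAE() is presumably caret's helper here; the same figure can be computed directly as the mean absolute difference:

# Mean absolute error computed by hand
mean(abs(p_svm - testing$outcome))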

Distribution of predicted (red) and actual (green) values:
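
The figure itself is not reproduced; a minimal base-graphics sketch of how such an overlay could be drawn (colours as described above):

# Overlay densities of actual (green) and predicted (red) outcome values
plot(density(testing$outcome), col = "green",
     main = "SVM (e1071): predicted vs actual outcome")
lines(density(p_svm), col = "red")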

4 Analysis with kernlab

4.1 Tuning

modelLookup("svmRadial")
      model parameter label forReg forClass probModel
1 svmRadial     sigma Sigma   TRUE     TRUE      TRUE
2 svmRadial         C  Cost   TRUE     TRUE      TRUE

Using a grid of tuning parameters to explore:

grid_tu <-  expand.grid(sigma = seq(0.00001, 0.0002, 0.00001),
                        C = seq(10, 20, 1))

nrow(grid_tu)
[1] 220
p_load(doParallel)

cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)

set.seed(12345)
m_svm_tu <- train(outcome ~ .,
                  method = "svmRadial",
                  data = training,
                  # weights = training_weights,
                  metric = "RMSE",
                  # metric = "Rsquared",
                  tuneGrid = grid_tu)

stopCluster(cl)
p_unload(doParallel)
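
No trControl is passed to train(), so caret falls back to its default bootstrap resampling. If cross-validation were preferred instead, it could be requested along these lines (a sketch only, not what was run above; m_svm_cv is a hypothetical name):

# 10-fold cross-validation repeated 3 times instead of the default bootstrap
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

m_svm_cv <- train(outcome ~ .,
                  method = "svmRadial",
                  data = training,
                  metric = "RMSE",
                  tuneGrid = grid_tu,
                  trControl = ctrl)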

m_svm_tu$finalModel
Support Vector Machine object of class "ksvm" 

SV type: eps-svr  (regression) 
 parameter : epsilon = 0.1  cost C = 17 

Gaussian Radial Basis kernel function. 
 Hyperparameter : sigma =  0.00009 

Number of Support Vectors : 121 

Objective Function Value : -731.0119 
Training error : 0.148886 
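
Beyond the final model, the tuning profile can be inspected with standard caret accessors (a quick sketch):

# Winning combination of sigma and C, and the RMSE profile across the grid
m_svm_tu$bestTune
plot(m_svm_tu)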
4.2 Prediction

p_svm_tu <- predict(m_svm_tu, testing)

Correlation:

cor(p_svm_tu, testing$outcome)
[1] 0.847875

MAE between predicted and actual values:

MAE(p_svm_tu, testing$outcome)
[1] 30.82691
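
For a side-by-side comparison of the untuned e1071 fit and the tuned kernlab fit, caret's postResample() reports RMSE, R-squared and MAE in one call (a sketch, assuming both prediction vectors are still in scope):

# Compare the two models on the held-out testing data
rbind(
  e1071_default = postResample(p_svm,    testing$outcome),
  kernlab_tuned = postResample(p_svm_tu, testing$outcome)
)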

Distribution of predicted (red) and actual (green) values:
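
Again the figure is not reproduced; as a complementary check (not part of the original), predicted values plotted against actual values with the identity line:

# Predicted vs actual outcome; points on the dashed line are perfect predictions
plot(testing$outcome, p_svm_tu,
     xlab = "Actual IEO", ylab = "Predicted IEO")
abline(a = 0, b = 1, lty = 2)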