Based on chapter 7 of the Machine Learning with R book, the caret package manual, and the e1071 package vignette. Data preparation is in file 00.Rmd. The outcome is the Index of Education and Occupation (IEO); more information about the index is available from the ABS.
# wide_all_n <- read_rds("data/wide_all_n.Rds") %>%
# sf::st_drop_geometry() %>%
# # as_tibble() %>%
# select(IEO, IEO_d,
# akita:last_col())
wide_all_p <- read_rds("data/wide_all_p.Rds") %>%
  sf::st_drop_geometry() %>%
  # as_tibble() %>%
  select(IEO, IEO_d,
         akita:last_col()) %>%
  rename(outcome = IEO,
         outcome_d = IEO_d)
# tail(names(wide_all_p), n = 1)
# skimr::skim(wide_all_p)
# report::report(wide_all_p)
nzv <- nearZeroVar(wide_all_p, saveMetrics = TRUE)
nzv <- nearZeroVar(wide_all_p)
wide_all_p <- wide_all_p[, -nzv]
# wide_all_n <- wide_all_n[, -nzv]
rm(nzv)
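Before the flagged columns are dropped, the saveMetrics = TRUE result can be inspected; a minimal sketch (the nzv_metrics name is illustrative, not part of the analysis above):

nzv_metrics <- nearZeroVar(wide_all_p, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])   # flagged predictors with their frequency ratios
sum(nzv_metrics$nzv)                   # number of columns the filter removes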
A 75/25 train/test split, stratified on the outcome deciles:
inTrain <- createDataPartition(
  y = wide_all_p$outcome_d,
  p = .75,
  list = FALSE
)

training <- wide_all_p[ inTrain, ]
testing  <- wide_all_p[-inTrain, ]
# training_weights <- wide_all_n[ inTrain,]
Training data:
outcome_d <categorical>
# total N=143 valid N=143 mean=5.45 sd=2.88
Value | N | Raw % | Valid % | Cum. %
-------------------------------------
1 | 14 | 9.79 | 9.79 | 9.79
2 | 15 | 10.49 | 10.49 | 20.28
3 | 15 | 10.49 | 10.49 | 30.77
4 | 15 | 10.49 | 10.49 | 41.26
5 | 14 | 9.79 | 9.79 | 51.05
6 | 14 | 9.79 | 9.79 | 60.84
7 | 14 | 9.79 | 9.79 | 70.63
8 | 14 | 9.79 | 9.79 | 80.42
9 | 14 | 9.79 | 9.79 | 90.21
10 | 14 | 9.79 | 9.79 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
Testing data:
outcome_d <categorical>
# total N=40 valid N=40 mean=5.50 sd=2.91
Value | N | Raw % | Valid % | Cum. %
------------------------------------
1 | 4 | 10 | 10 | 10
2 | 4 | 10 | 10 | 20
3 | 4 | 10 | 10 | 30
4 | 4 | 10 | 10 | 40
5 | 4 | 10 | 10 | 50
6 | 4 | 10 | 10 | 60
7 | 4 | 10 | 10 | 70
8 | 4 | 10 | 10 | 80
9 | 4 | 10 | 10 | 90
10 | 4 | 10 | 10 | 100
<NA> | 0 | 0 | <NA> | <NA>
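The decile tables above have the shape of sjmisc::frq() output; assuming that package, a sketch of how the check could be reproduced for both splits:

library(sjmisc)

# Sketch (assuming sjmisc): decile frequencies, to confirm the stratified split
frq(training$outcome_d)
frq(testing$outcome_d)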
Outcome distribution checks
e1071
set.seed(12345)
m_svm <- svm(outcome ~ ., data = training,
             type = "eps-regression",
             kernel = "radial")
Call:
svm(formula = outcome ~ ., data = training, type = "eps-regression",
kernel = "radial")
Parameters:
SVM-Type: eps-regression
SVM-Kernel: radial
cost: 1
gamma: 0.008333333
epsilon: 0.1
Number of Support Vectors: 119
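With no tuning arguments supplied, svm() falls back on cost = 1, epsilon = 0.1, and gamma = 1/(number of feature columns); the reported gamma of 0.008333333 is 1/120. A brief check, with the defaults written out explicitly (the m_svm_explicit call is illustrative only, not part of the original analysis):

1 / 120   # = 0.008333333, the default gamma reported above

# Sketch: the same model with the default hyperparameters spelled out
m_svm_explicit <- svm(outcome ~ ., data = training,
                      type = "eps-regression", kernel = "radial",
                      cost = 1, epsilon = 0.1, gamma = 1/120)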
p_svm <- predict(m_svm, testing)
Correlation:
cor(p_svm, testing$outcome)
[1] 0.8064335
MAE between predicted and actual values:
MAE(p_svm, testing$outcome)
[1] 32.38492
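MAE() here averages the absolute prediction errors (assuming the caret helper), so the value is easy to verify by hand:

# Sketch: mean absolute error computed manually; should match MAE(p_svm, testing$outcome)
mean(abs(p_svm - testing$outcome))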
Distribution of predicted (red) and actual (green) values:
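The plot itself is not reproduced here; a minimal ggplot2 sketch (package choice and colours are assumptions) of such an overlay:

library(ggplot2)

# Sketch: overlaid density curves of predicted (red) and actual (green) IEO values
plot_df <- data.frame(value  = c(p_svm, testing$outcome),
                      series = rep(c("predicted", "actual"), each = length(p_svm)))
ggplot(plot_df, aes(x = value, colour = series)) +
  geom_density() +
  scale_colour_manual(values = c(predicted = "red", actual = "green"))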
kernlab
modelLookup("svmRadial")
model parameter label forReg forClass probModel
1 svmRadial sigma Sigma TRUE TRUE TRUE
2 svmRadial C Cost TRUE TRUE TRUE
Using a grid of parameters to explore:
grid_tu <- expand.grid(sigma = seq(0.00001, 0.0002, 0.00001),
                       C = seq(10, 20, 1))
nrow(grid_tu)
[1] 220
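The 220 rows are simply the 20 sigma values crossed with the 11 cost values; a quick check:

# 20 sigma values x 11 cost values = 220 candidate parameter pairs
length(seq(0.00001, 0.0002, 0.00001)) * length(seq(10, 20, 1))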
p_load(doParallel)
cl <- makePSOCKcluster(parallel::detectCores())
registerDoParallel(cl)
set.seed(12345)
m_svm_tu <- train(outcome ~ .,
                  method = "svmRadial",
                  data = training,
                  # weights = training_weights,
                  metric = "RMSE",
                  # metric = "Rsquared",
                  tuneGrid = grid_tu)
stopCluster(cl)
p_unload(doParallel)
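Because no trControl is supplied, train() scores each of the 220 candidates with its default resampling, 25 bootstrap repetitions. A hedged sketch of how repeated cross-validation could be requested instead (an alternative, not what was run above):

# Sketch: explicit resampling control instead of the default 25 bootstraps
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

m_svm_cv <- train(outcome ~ .,
                  method = "svmRadial",
                  data = training,
                  metric = "RMSE",
                  trControl = ctrl,
                  tuneGrid = grid_tu)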
m_svm_tu$finalModel
Support Vector Machine object of class "ksvm"
SV type: eps-svr (regression)
parameter : epsilon = 0.1 cost C = 17
Gaussian Radial Basis kernel function.
Hyperparameter : sigma = 0.00009
Number of Support Vectors : 121
Objective Function Value : -731.0119
Training error : 0.148886
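Beyond the final model print-out, the tuning results can be inspected directly; a short sketch:

# Sketch: chosen parameters and the resampling profile across the grid
m_svm_tu$bestTune        # sigma and C with the lowest resampled RMSE
head(m_svm_tu$results)   # per-candidate RMSE, Rsquared and MAE
plot(m_svm_tu)           # RMSE across the sigma / cost grid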
p_svm_tu <- predict(m_svm_tu, testing)
Correlation:
cor(p_svm_tu, testing$outcome)
[1] 0.847875
MAE between predicted and actual values:
MAE(p_svm_tu, testing$outcome)
[1] 30.82691
Distribution of predicted (red) and actual (green) values:
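Tuning lifts the test-set correlation from 0.806 to 0.848 and lowers MAE from 32.38 to 30.83; a small sketch collecting both fits in one table:

# Sketch: test-set comparison of the default e1071 fit and the tuned caret/kernlab fit
data.frame(model = c("svm (defaults)", "svmRadial (tuned)"),
           cor   = c(cor(p_svm, testing$outcome), cor(p_svm_tu, testing$outcome)),
           MAE   = c(MAE(p_svm, testing$outcome), MAE(p_svm_tu, testing$outcome)))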