Introduction to Machine Learning
Lecturer: Emi Tanaka
Department of Econometrics and Business Statistics
rsample
v-fold cross-validation (with vfold_cv() in the rsample package)
# 5-fold cross-validation
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [5390/1348]> Fold1
2 <split [5390/1348]> Fold2
3 <split [5390/1348]> Fold3
4 <split [5391/1347]> Fold4
5 <split [5391/1347]> Fold5
scroll
rsplit
object:
library(yardstick)
# Test-set accuracy for each model: one row per model, one column per metric.
results_test1 %>%
  group_by(.model) %>%
  # yardstick: bundle several metrics and apply them to truth (price) and
  # estimate (.pred) columns in one pass
  metric_set(rmse, mae, mape, mpe, rsq)(., price, .pred) %>%
  # NOTE: `id_cols` must be named — passing it positionally is an error in
  # tidyr >= 1.2 (everything after `data` goes through `...`)
  pivot_wider(id_cols = .model, names_from = .metric, values_from = .estimate)
# A tibble: 3 × 6
.model rmse mae mape mpe rsq
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 knn 7808. 6631. 70.4 -64.7 0.188
2 reg 5364. 4123. 33.7 -8.23 0.186
3 tree 5380. 4121. 33.8 -8.67 0.178
# Fit all three candidate models on the training portion of every fold.
# Each model column holds a list of fitted objects, one per fold.
toyota_models <- toyota_folds %>%
  mutate(
    # linear regression on the log10 price scale
    reg  = map(splits, function(.s) lm(log10(price) ~ year,
                                       data = training(.s))),
    # regression tree (ANOVA splitting rule)
    tree = map(splits, function(.s) rpart(log10(price) ~ year,
                                          data = training(.s),
                                          method = "anova")),
    # k-nearest neighbours with kknn's built-in tuning
    knn  = map(splits, function(.s) train.kknn(log10(price) ~ year,
                                               data = training(.s)))
  )
toyota_models
# 5-fold cross-validation
# A tibble: 5 × 5
splits id reg tree knn
<list> <chr> <list> <list> <list>
1 <split [5390/1348]> Fold1 <lm> <rpart> <trn.kknn>
2 <split [5390/1348]> Fold2 <lm> <rpart> <trn.kknn>
3 <split [5390/1348]> Fold3 <lm> <rpart> <trn.kknn>
4 <split [5391/1347]> Fold4 <lm> <rpart> <trn.kknn>
5 <split [5391/1347]> Fold5 <lm> <rpart> <trn.kknn>
scroll
# For every fold and every model: predict on that fold's test set and
# collect the accuracy metrics as a one-row tibble (stored in a list column
# named "<model>_metrics").
toyota_metrics <- toyota_models %>%
  mutate(across(c(reg, tree, knn), function(models) {
    # now for every fold and model,
    map2(splits, models, function(.split, .model) {
      testing(.split) %>%
        # compute prediction for testing set
        # (10^ back-transforms from the log10(price) modelling scale)
        mutate(.pred = 10^predict(.model, .)) %>%
        # then get metrics
        metric_set(rmse, mae, mape, mpe, rsq)(., price, .pred) %>%
        # in a one-row data frame such that
        # column names are metric,
        # values are the accuracy measure
        # NOTE: `id_cols` must be named — passing it positionally is an
        # error in tidyr >= 1.2
        pivot_wider(id_cols = -.estimator,
                    names_from = .metric,
                    values_from = .estimate)
    })
  }, .names = "{.col}_metrics"))
toyota_metrics
# 5-fold cross-validation
# A tibble: 5 × 8
splits id reg tree knn reg_metrics
<list> <chr> <list> <list> <list> <list>
1 <split [5390/1348]> Fold1 <lm> <rpart> <trn.kknn> <tibble [1 × 5]>
2 <split [5390/1348]> Fold2 <lm> <rpart> <trn.kknn> <tibble [1 × 5]>
3 <split [5390/1348]> Fold3 <lm> <rpart> <trn.kknn> <tibble [1 × 5]>
4 <split [5391/1347]> Fold4 <lm> <rpart> <trn.kknn> <tibble [1 × 5]>
5 <split [5391/1347]> Fold5 <lm> <rpart> <trn.kknn> <tibble [1 × 5]>
tree_metrics knn_metrics
<list> <list>
1 <tibble [1 × 5]> <tibble [1 × 5]>
2 <tibble [1 × 5]> <tibble [1 × 5]>
3 <tibble [1 × 5]> <tibble [1 × 5]>
4 <tibble [1 × 5]> <tibble [1 × 5]>
5 <tibble [1 × 5]> <tibble [1 × 5]>
# A tibble: 1 × 5
rmse mae mape mpe rsq
<dbl> <dbl> <dbl> <dbl> <dbl>
1 5364. 4123. 33.7 -8.23 0.186
# Flatten the per-fold metric tibbles into one tidy table:
# one row per (fold, model), one column per metric.
toyota_metrics_wide <- toyota_metrics %>%
  # expand each one-row metrics tibble into prefixed columns
  # (e.g. reg_metrics_rmse, tree_metrics_rsq, ...)
  unnest_wider(ends_with("_metrics"), names_sep = "_") %>%
  # stack those columns into (model, metric, value) triples
  pivot_longer(contains("metrics"),
               names_to = c("model", "metric"),
               names_pattern = "(.*)_metrics_(.*)",
               values_to = "value") %>%
  # NOTE: `id_cols` must be named — passing it positionally is an error in
  # tidyr >= 1.2 (everything after `data` goes through `...`)
  pivot_wider(id_cols = c(id, model),
              names_from = metric,
              values_from = value)
toyota_metrics_wide
# A tibble: 15 × 7
id model rmse mae mape mpe rsq
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Fold1 reg 5364. 4123. 33.7 -8.23 0.186
2 Fold1 tree 5380. 4121. 33.8 -8.67 0.178
3 Fold1 knn 7808. 6631. 70.4 -64.7 0.188
4 Fold2 reg 5633. 4227. 33.6 -6.62 0.193
5 Fold2 tree 5585. 4184. 33.4 -6.45 0.207
6 Fold2 knn 7508. 6285. 65.2 -57.6 0.205
7 Fold3 reg 5845. 4240. 34.5 -7.94 0.245
8 Fold3 tree 5794. 4197. 34.9 -8.42 0.262
9 Fold3 knn 8880. 7337. 78.0 -71.7 0.192
10 Fold4 reg 6010. 4200. 33.3 -7.79 0.184
11 Fold4 tree 5964. 4149. 33.0 -7.93 0.196
12 Fold4 knn 8512. 7092. 73.8 -68.9 0.192
13 Fold5 reg 5984. 4273. 33.3 -6.84 0.195
14 Fold5 tree 5949. 4219. 33.0 -7.00 0.201
15 Fold5 knn 7902. 6602. 68.9 -63.3 0.207
# Average every accuracy metric across the five folds, per model.
toyota_metrics_wide %>%
  group_by(model) %>%
  summarise(across(c(rmse, mae, mape, mpe, rsq), ~ mean(.x)))
# A tibble: 3 × 6
model rmse mae mape mpe rsq
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 knn 8122. 6790. 71.3 -65.2 0.197
2 reg 5767. 4212. 33.7 -7.48 0.201
3 tree 5734. 4174. 33.6 -7.69 0.209
Results
knn
has a large variation in the metrics — this means this model has a high variance, which is not desirable.
tree and reg have a large variation in rmse and rsq — they are somewhat similar in performance.
The rsample
package has a specific function for this special case that is essentially similar to above:
# Leave-one-out cross-validation
# A tibble: 6,738 × 2
splits id
<list> <chr>
1 <split [6737/1]> Resample1
2 <split [6737/1]> Resample2
3 <split [6737/1]> Resample3
4 <split [6737/1]> Resample4
5 <split [6737/1]> Resample5
6 <split [6737/1]> Resample6
7 <split [6737/1]> Resample7
8 <split [6737/1]> Resample8
9 <split [6737/1]> Resample9
10 <split [6737/1]> Resample10
# … with 6,728 more rows
# A tibble: 6,738 × 9
model year price trans…¹ mileage fuelT…² tax
<chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 C-HR 2019 26499 Automa… 1970 Hybrid 140
2 Aygo 2018 7800 Manual 12142 Petrol 145
3 Yaris 2015 6490 Manual 36100 Petrol 30
4 Yaris 2018 10500 Manual 9290 Petrol 145
5 Yaris 2018 9595 Manual 20740 Petrol 145
6 Auris 2016 17490 Automa… 29031 Hybrid 0
7 Yaris 2014 8498 Automa… 57677 Hybrid 0
8 PROA… 2019 28456 Automa… 9119 Diesel 145
9 Yaris 2017 7998 Manual 63978 Petrol 150
10 Auris 2017 15095 Automa… 43405 Hybrid 0
# … with 6,728 more rows, 2 more variables:
# mpg <dbl>, engineSize <dbl>, and abbreviated
# variable names ¹transmission, ²fuelType
rsample::bootstraps
function ensures the testing data only contains out-of-bag (OOB) samples.
# Bootstrap sampling
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [6738/2481]> Bootstrap01
2 <split [6738/2483]> Bootstrap02
3 <split [6738/2461]> Bootstrap03
4 <split [6738/2545]> Bootstrap04
5 <split [6738/2474]> Bootstrap05
6 <split [6738/2468]> Bootstrap06
7 <split [6738/2521]> Bootstrap07
8 <split [6738/2475]> Bootstrap08
9 <split [6738/2479]> Bootstrap09
10 <split [6738/2495]> Bootstrap10
outside: the initial resampling produces the split into training and testing data for multiple folds/iterations, then
inside: the resampling for the initial training data split into training and validation data for multiple folds/iterations.
rsample::nested_cv()
# Nested resampling:
# outer: 5-fold cross-validation
# inner: Bootstrap sampling
# A tibble: 5 × 3
splits id inner_resamples
<list> <chr> <list>
1 <split [5390/1348]> Fold1 <boot [10 × 2]>
2 <split [5390/1348]> Fold2 <boot [10 × 2]>
3 <split [5390/1348]> Fold3 <boot [10 × 2]>
4 <split [5391/1347]> Fold4 <boot [10 × 2]>
5 <split [5391/1347]> Fold5 <boot [10 × 2]>
rsample::nested_cv
gives some warning for bad combinations — but be cautious of this!
Warning: Using bootstrapping as the outer resample is dangerous since the inner
resample might have the same data point in both the analysis and assessment
set.
# Nested resampling:
# outer: Bootstrap sampling
# inner: 5-fold cross-validation
# A tibble: 10 × 3
splits id inner_resamples
<list> <chr> <list>
1 <split [6738/2457]> Bootstrap01 <vfold [5 × 2]>
2 <split [6738/2457]> Bootstrap02 <vfold [5 × 2]>
3 <split [6738/2462]> Bootstrap03 <vfold [5 × 2]>
4 <split [6738/2503]> Bootstrap04 <vfold [5 × 2]>
5 <split [6738/2462]> Bootstrap05 <vfold [5 × 2]>
6 <split [6738/2501]> Bootstrap06 <vfold [5 × 2]>
7 <split [6738/2472]> Bootstrap07 <vfold [5 × 2]>
8 <split [6738/2507]> Bootstrap08 <vfold [5 × 2]>
9 <split [6738/2508]> Bootstrap09 <vfold [5 × 2]>
10 <split [6738/2475]> Bootstrap10 <vfold [5 × 2]>
ETC3250/5250 Week 3