After a few weeks of hiatus from #TidyTuesday, I am back with the 27-July-’21 dataset, which is about the Olympics. In this exercise, I will try to develop a model to predict medalists using a bagged tree model. This post is intended to walk through a bare-bones modeling workflow, so we are not really focused on the results.

Libraries

library(tidyverse)
library(tidymodels)
library(skimr)

library(tidytuesdayR)

library(parallel)
library(doParallel)
library(tictoc)

tidymodels_prefer()
conflicted::conflict_prefer("vi", "vip")

Set parallel processing

# Enable parallel processing

# Reserve one physical core for the OS; use TRUE/FALSE, never the
# reassignable shorthand `F`.
cores <- detectCores(logical = FALSE) - 1
# cores

core_cluster <- makePSOCKcluster(cores)
# core_cluster

registerDoParallel(core_cluster)
# NOTE(review): call stopCluster(core_cluster) at the end of the script so
# the worker processes are released.

Get Data

# Download the 2021-07-27 TidyTuesday release (olympics + regions).
tt_data <- tt_load("2021-07-27")
## 
##  Downloading file 1 of 2: `olympics.csv`
##  Downloading file 2 of 2: `regions.csv`
# Pull the two tables out of the downloaded bundle.
olympics <- tt_data[["olympics"]]
regions <- tt_data[["regions"]]

Some exploration

# Quick overview of column types, missingness, and distributions.
skim(olympics)
Data summary
Name Piped data
Number of rows 271116
Number of columns 15
_______________________
Column type frequency:
character 10
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1.00 2 108 0 134731 0
sex 0 1.00 1 1 0 2 0
team 0 1.00 2 47 0 1184 0
noc 0 1.00 3 3 0 230 0
games 0 1.00 11 11 0 51 0
season 0 1.00 6 6 0 2 0
city 0 1.00 4 22 0 42 0
sport 0 1.00 4 25 0 66 0
event 0 1.00 15 85 0 765 0
medal 231333 0.15 4 6 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 68248.95 39022.29 1 34643 68205 102097.2 135571 ▇▇▇▇▇
age 9474 0.97 25.56 6.39 10 21 24 28.0 97 ▇▃▁▁▁
height 60171 0.78 175.34 10.52 127 168 175 183.0 226 ▁▂▇▂▁
weight 62875 0.77 70.70 14.35 25 60 70 79.0 214 ▃▇▁▁▁
year 0 1.00 1978.38 29.88 1896 1960 1988 2002.0 2016 ▁▂▃▆▇

On the medal field, we see that 14.7% got medals. Age, height, and weight have some missing data.

Cleaned Dataframe

We are going to do binary classification, where the target is to predict whether someone is a medalist or not.

# Derive the binary target: "Y" if a medal was won, "N" otherwise.
# Setting levels explicitly puts "Y" first (the event level for yardstick),
# matching factor() + fct_relevel(medalist, "Y").
olympics_clean <- olympics %>%
  mutate(
    medalist = factor(
      if_else(is.na(medal), "N", "Y"),
      levels = c("Y", "N")
    )
  ) %>%
  select(-c(medal, id, name, team, games, event, city))


olympics_clean %>%
  glimpse()
## Rows: 271,116
## Columns: 9
## $ sex      <chr> "M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "~
## $ age      <dbl> 24, 23, 24, 34, 21, 21, 25, 25, 27, 27, 31, 31, 31, 31, 33, 3~
## $ height   <dbl> 180, 170, NA, NA, 185, 185, 185, 185, 185, 185, 188, 188, 188~
## $ weight   <dbl> 80, 60, NA, NA, 82, 82, 82, 82, 82, 82, 75, 75, 75, 75, 75, 7~
## $ noc      <chr> "CHN", "CHN", "DEN", "DEN", "NED", "NED", "NED", "NED", "NED"~
## $ year     <dbl> 1992, 2012, 1920, 1900, 1988, 1988, 1992, 1992, 1994, 1994, 1~
## $ season   <chr> "Summer", "Summer", "Summer", "Summer", "Winter", "Winter", "~
## $ sport    <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Speed Skatin~
## $ medalist <fct> N, N, N, Y, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N~

Partition

Then I split the data to train and test set. For further cross-validation, I split the train set in 5 stratified folds.

set.seed(123)
# Stratified train/test split on the imbalanced outcome (default proportion).
split <- initial_split(olympics_clean, strata = medalist)

train <- training(split)
test <- testing(split)

set.seed(123)
# Five stratified cross-validation folds on the training data.
folds <- vfold_cv(train, v = 5, strata = medalist)

Recipe

At pre-processing, we are only going to impute missing values in the age, height, and weight columns with a bagged tree model, using sex, noc, and sport as predictors.

# Pre-processing recipe: guard nominal predictors against novel/unknown
# levels, then bag-impute the three numeric columns with missing data.
rec_base <- recipe(medalist ~ ., data = train) %>%
  step_novel(all_nominal_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_impute_bag(
    age, height, weight,
    impute_with = imp_vars(sex, noc, sport)
  )


# Apply the recipe to the training data to inspect the result.
baked <- rec_base %>%
  prep() %>%
  bake(new_data = NULL)


skim(baked)
Data summary
Name Piped data
Number of rows 203336
Number of columns 9
_______________________
Column type frequency:
factor 5
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 M: 147575, F: 55761, new: 0, unk: 0
noc 0 1 FALSE 229 USA: 14169, FRA: 9539, GBR: 9158, ITA: 7960
season 0 1 FALSE 2 Sum: 167111, Win: 36225, new: 0, unk: 0
sport 0 1 FALSE 65 Ath: 29120, Gym: 19982, Swi: 17364, Sho: 8636
medalist 0 1 FALSE 2 N: 173499, Y: 29837

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 25.62 6.36 10 22 25 28.00 97 ▇▃▁▁▁
height 0 1 175.60 9.82 127 170 176 182.00 226 ▁▂▇▁▁
weight 0 1 71.24 13.28 25 62 71 78.94 198 ▂▇▁▁▁
year 0 1 1978.38 29.88 1896 1960 1988 2002.00 2016 ▁▂▃▆▇

After imputation, you can see that no missing data in our train set.

Model

Next I will be using a bagged tree model for training with tree depth, minimum number of data points in node and cost complexity as tuning parameters.

# Bagged decision tree (rpart engine) with all three main
# hyper-parameters marked for tuning.
bag_tree_rpart_spec <-
  baguette::bag_tree(
    cost_complexity = tune(),
    tree_depth = tune(),
    min_n = tune()
  ) %>%
  set_mode("classification") %>%
  set_engine("rpart")

Workflow

Next, I construct the workflow.

# Bundle the recipe and the model spec into a single workflow.
bagT_wf <- workflow() %>%
  add_model(bag_tree_rpart_spec) %>%
  add_recipe(rec_base)

Tune Grid

For hyper-parameter tuning we define 10 different combinations based on max entropy.

set.seed(123)

# Build a 10-point max-entropy grid over the tunable parameters.
# extract_parameter_set_dials() replaces the deprecated parameters()
# method for workflows (tune/hardhat >= 1.0).
grid_bagT <- grid_max_entropy(
  extract_parameter_set_dials(bagT_wf),
  size = 10
)

grid_bagT
## # A tibble: 10 x 3
##    cost_complexity tree_depth min_n
##              <dbl>      <int> <int>
##  1        6.95e- 5         12     5
##  2        8.27e- 3          4     7
##  3        2.40e- 9          8    36
##  4        1.89e- 8          3     3
##  5        7.52e- 5          9    27
##  6        4.29e- 2         13    21
##  7        1.90e- 2          3    33
##  8        5.45e- 6          3    19
##  9        2.49e- 8         13    24
## 10        1.59e-10          1    20

Training time

tic()

set.seed(123)
# Tune the workflow over the 5 folds with the 10-candidate grid.
tune_bagT <- tune_grid(
  bagT_wf,
  resamples = folds,
  grid = grid_bagT,
  control = control_grid(parallel_over = NULL)
)

toc()

Tune results

# `metric` must be passed by name; positional use (show_best("roc_auc"))
# is deprecated in recent versions of tune.
tune_bagT %>%
  show_best(metric = "roc_auc")
## # A tibble: 5 x 9
##   cost_complexity tree_depth min_n .metric .estimator  mean     n  std_err
##             <dbl>      <int> <int> <chr>   <chr>      <dbl> <int>    <dbl>
## 1   0.0000000249          13    24 roc_auc binary     0.856     5 0.000830
## 2   0.0000695             12     5 roc_auc binary     0.842     5 0.00156 
## 3   0.0000752              9    27 roc_auc binary     0.801     5 0.00153 
## 4   0.00000000240          8    36 roc_auc binary     0.794     5 0.00100 
## 5   0.00000545             3    19 roc_auc binary     0.625     5 0.0101  
## # ... with 1 more variable: .config <chr>

The best model roc_auc is 0.856.

Finalize workflow with best parameters

# Best parameters

best_params<-tune_bagT%>%
  select_best(metric="roc_auc")

# Create final workflow and train on whole training set
final_wf<-bagT_wf%>%
  finalize_workflow(best_params)%>%
  fit(train)

Predict on validation set

# Attach class and probability predictions to the test set.
test_preds <- final_wf %>%
  augment(test)


# Named `roc_plot` so we don't shadow yardstick::roc_curve() with a
# local variable of the same name.
roc_plot <- test_preds %>%
  roc_curve(truth = medalist, .pred_Y) %>%
  autoplot()

roc_plot

# ROC AUC on the held-out test set.
roc_auc(test_preds, truth = medalist, .pred_Y)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.858
# Confusion matrix plus the full set of derived classification metrics.
conf_mat(test_preds, truth = medalist, .pred_class) %>%
  summary()
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.887 
##  2 kap                  binary        0.394 
##  3 sens                 binary        0.305 
##  4 spec                 binary        0.987 
##  5 ppv                  binary        0.806 
##  6 npv                  binary        0.892 
##  7 mcc                  binary        0.452 
##  8 j_index              binary        0.292 
##  9 bal_accuracy         binary        0.646 
## 10 detection_prevalence binary        0.0555
## 11 precision            binary        0.806 
## 12 recall               binary        0.305 
## 13 f_meas               binary        0.442

We get an accuracy of 0.887 with roc_auc of 0.858.

Variable importance

# Pull the fitted baguette model out of the workflow.
vi <- final_wf %>%
  extract_fit_parsnip()

# geom_col() is the idiomatic replacement for geom_bar(stat = "identity"),
# and fct_reorder() (forcats, loaded via tidyverse) for base reorder().
vi$fit$imp %>%
  ggplot(aes(x = fct_reorder(term, value), y = value)) +
  geom_point(color = "#dd1144") +
  geom_col(width = 0.05, fill = "#dd1144") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Variable",
    y = "Importance"
  )

Nation and sport seems to be the most important variables.