- Understand bias-variance trade-off in modeling
- Compare models based on predictive performance
Perform variable selection
- Readings
- ISLR ch 7.1-3
Perform variable selection
Optimal penalty is unknown, needs to be estimated
train = lfs %>% sample_frac(.70) test = setdiff( lfs, train) library(rpart) tree_out = rpart( hrlyearn ~ immig + age_6 + sex + marstat + educ + naics_21 + noc_10 + noc_40, data = train, control = rpart.control( minsplit = 20, cp = 0)) library(modelr) train %>% add_predictions( tree_out) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 5.355669 test %>% add_predictions( tree_out) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 8.535006
xerror = tree_out$cptable[,"xerror"]; xstd = tree_out$cptable[,"xstd"] cp_ind = min( which( xerror - xstd < min(xerror) ) ) cp_opt = tree_out$cptable[ cp_ind, "CP"] tree_reg = prune( tree_out, cp = cp_opt ) train %>% add_predictions( tree_reg) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 5.82472 test %>% add_predictions( tree_reg) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 8.344568
rpart()
plotcp(tree_out, )
step()
function in R performs variable selection
X, Z
& fit linear modellfs_ = lfs %>% mutate( X = rnorm(nrow(lfs)), Z = rnorm(nrow(lfs))) train_ = lfs_ %>% sample_frac(.50); test_ = setdiff( lfs_, train_) lm_out = lm( hrlyearn ~ immig + age_6 + sex + marstat + educ + noc_10 + X + Z, data = train_) lm_out %>% glance() %>% pull(r.squared) ## [1] 0.4035515 train_ %>% add_predictions( lm_out) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 7.345609 test_ %>% add_predictions( lm_out ) %>% summarise( sd( hrlyearn - pred)) %>% pull() ## [1] 7.373807
step()
to remove "redundant" variableslm_step = step(lm_out, trace = 0) lm_step$anova # see removed variables ## Step Df Deviance Resid. Df Resid. Dev AIC ## 1 NA NA 3767 204878.4 15208.20 ## 2 - X 1 41.00885 3768 204919.4 15206.96 ## 3 - Z 1 65.31969 3769 204984.7 15206.17 lm_out %>% glance() %>% pull(r.squared) ## [1] 0.4035515 train_ %>% add_predictions( lm_step ) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 7.347515 test_ %>% add_predictions( lm_step ) %>% summarise( sd( hrlyearn - pred)) %>% pull() ## [1] 7.370567