- Understand bias-variance trade-off in modeling
- Compare models based on predictive performance
Perform variable selection
- Readings
- ISLR ch 7.1-3
Perform variable selection
Optimal penalty is unknown, needs to be estimated
train = lfs %>% sample_frac(.70)
test = setdiff( lfs, train)
library(rpart)
tree_out = rpart( hrlyearn ~ immig + age_6 + sex + marstat +
educ + naics_21 + noc_10 + noc_40, data = train,
control = rpart.control( minsplit = 20, cp = 0))
library(modelr)
train %>% add_predictions( tree_out) %>%
summarise( sd(hrlyearn - pred)) %>% pull()
## [1] 5.355669
test %>% add_predictions( tree_out) %>%
summarise( sd(hrlyearn - pred)) %>% pull()
## [1] 8.535006
xerror = tree_out$cptable[,"xerror"]; xstd = tree_out$cptable[,"xstd"] cp_ind = min( which( xerror - xstd < min(xerror) ) ) cp_opt = tree_out$cptable[ cp_ind, "CP"] tree_reg = prune( tree_out, cp = cp_opt ) train %>% add_predictions( tree_reg) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 5.82472 test %>% add_predictions( tree_reg) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 8.344568
rpart()plotcp(tree_out, )
step() function in R performs variable selection
X, Z & fit linear modellfs_ = lfs %>% mutate( X = rnorm(nrow(lfs)), Z = rnorm(nrow(lfs))) train_ = lfs_ %>% sample_frac(.50); test_ = setdiff( lfs_, train_) lm_out = lm( hrlyearn ~ immig + age_6 + sex + marstat + educ + noc_10 + X + Z, data = train_) lm_out %>% glance() %>% pull(r.squared) ## [1] 0.4035515 train_ %>% add_predictions( lm_out) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 7.345609 test_ %>% add_predictions( lm_out ) %>% summarise( sd( hrlyearn - pred)) %>% pull() ## [1] 7.373807
step() to remove "redundant" variableslm_step = step(lm_out, trace = 0) lm_step$anova # see removed variables ## Step Df Deviance Resid. Df Resid. Dev AIC ## 1 NA NA 3767 204878.4 15208.20 ## 2 - X 1 41.00885 3768 204919.4 15206.96 ## 3 - Z 1 65.31969 3769 204984.7 15206.17 lm_out %>% glance() %>% pull(r.squared) ## [1] 0.4035515 train_ %>% add_predictions( lm_step ) %>% summarise( sd(hrlyearn - pred)) %>% pull() ## [1] 7.347515 test_ %>% add_predictions( lm_step ) %>% summarise( sd( hrlyearn - pred)) %>% pull() ## [1] 7.370567