- Work with multiple features
- Understand linear & nonlinear classification
- Perform multivariate classification in R with
- Logistic Regression
- Classification Trees
Readings - ISLR ch 4.3–4.4 (optional)
Information can be reflected in multiple variables
Linear methods all produce a vector of coefficients (\(\vec{\beta}\)), the normal vector of the decision boundary
We will focus on logistic regression
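Concretely, logistic regression ties the class probability to a linear score in the features:

\[
P(y = M \mid \vec{x}) \;=\; \frac{1}{1 + e^{-(\beta_0 + \vec{\beta} \cdot \vec{x})}}
\]

so the \(P = 0.5\) decision boundary is the hyperplane \(\beta_0 + \vec{\beta} \cdot \vec{x} = 0\), and a point's side of that hyperplane determines its predicted class.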
glm()
```r
glm_out = glm(
  diagnosis ~ fract.dim.m + radius.m,
  family = "binomial",
  data = wdbc %>% mutate(diagnosis = factor(diagnosis))
)
broom::tidy(glm_out)
## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -18.6       2.06      -9.01 1.99e-19
## 2 fract.dim.m   52.1       6.36       8.19 2.69e-16
## 3 radius.m       0.811     0.122      6.65 2.84e-11
```
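The fitted coefficients are additive contributions to the log-odds. A minimal sanity check (a sketch, assuming the `wdbc` data frame and `glm_out` above) recovers the linear score by hand and compares it with `predict()`:

```r
# Linear predictor computed by hand: beta0 + beta1 * fract.dim.m + beta2 * radius.m
b = coef(glm_out)
manual = b[1] + b[2] * wdbc$fract.dim.m + b[3] * wdbc$radius.m

# should agree with the default (link-scale) predictions
all.equal(unname(manual), unname(predict(glm_out)))
```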
predict()
returns the linear predictor (log-odds), a signed distance from the decision boundary
```r
( glm_pred = predict(glm_out) ) %>% sample(5)
##       164       448       232       500       532
## -4.317983 -2.263426 -8.326930  9.108325 -4.899920

wdbc %>%
  modelr::add_predictions(glm_out, var = "distance") %>%
  mutate(predicted = ifelse(distance < 0, "B", "M")) %>%
  xtabs(~ predicted + diagnosis, data = .) %>%
  prop.table()
##          diagnosis
## predicted          B          M
##         B 0.60105448 0.03514938
##         M 0.02636204 0.33743409
```
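The link-scale distances map to class probabilities through the logistic function, or can be requested directly. A sketch, assuming `glm_out` from above:

```r
# Probabilities P(diagnosis = "M") for each case
probs = predict(glm_out, type = "response")
head(probs, 3)

# equivalent to pushing the link-scale distances through the logistic function
all.equal(probs, plogis(predict(glm_out)))
```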
rpart
(recursive partitioning)

```r
library(rpart)
rpart_out = rpart(
  diagnosis ~ . - id,
  data = wdbc,
  method = "class",
  control = rpart.control(minsplit = 50)
)
rpart_out
## n= 569
##
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
##
## 1) root 569 212 B (0.62741652 0.37258348)
##   2) concavity.w< 16.795 379 33 B (0.91292876 0.08707124)
##     4) fract.dim.m< 0.1358 333 5 B (0.98498498 0.01501502) *
##     5) fract.dim.m>=0.1358 46 18 M (0.39130435 0.60869565) *
##   3) concavity.w>=16.795 190 11 M (0.05789474 0.94210526) *
```
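The printout can be read as nested if/else rules. A sketch applying those rules by hand (thresholds copied from the printout above, not part of the rpart API):

```r
# Manual re-reading of the fitted tree's splits
classify_by_tree = function(concavity.w, fract.dim.m) {
  if (concavity.w >= 16.795) return("M")   # node 3: right branch of the root
  if (fract.dim.m >= 0.1358) return("M")   # node 5
  "B"                                      # node 4 (terminal, mostly benign)
}
classify_by_tree(concavity.w = 20, fract.dim.m = 0.10)
## [1] "M"
```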
rpart.plot
```r
library(rpart.plot)
rpart.plot(rpart_out)
```
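The node labels can be tuned with rpart.plot's `type` and `extra` arguments (a sketch; see `?rpart.plot` for the full set of options):

```r
# extra = 104 adds per-class probabilities and the percentage of observations per node
rpart.plot(rpart_out, type = 2, extra = 104)
```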
predict()
gives class probabilities; use type = "class" to get class labels

```r
predict(rpart_out) %>% head(2)
##            B         M
## 1 0.05789474 0.9421053
## 2 0.05789474 0.9421053

wdbc %>%
  modelr::add_predictions(rpart_out, type = "class") %>%
  xtabs(~ pred + diagnosis, data = .) %>%
  prop.table()
##     diagnosis
## pred           B           M
##    B 0.576449912 0.008787346
##    M 0.050966608 0.363796134
```
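The confusion table summarizes to a single error rate, which makes the tree easy to compare with the logistic regression above. A sketch, assuming the objects and pipe already loaded:

```r
# Overall misclassification rate from the confusion table
conf = wdbc %>%
  modelr::add_predictions(rpart_out, type = "class") %>%
  xtabs(~ pred + diagnosis, data = .)

1 - sum(diag(conf)) / sum(conf)   # fraction of cases off the diagonal
```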