\[y_i = \alpha + \beta_1 x_{1,i} + \beta_{2}x_{2,i} + \cdots + \epsilon_i\]
Find parameters by minimizing squared errors (MSE)
library(broom) lm( CO2_EMISSIONS ~ ENGINE_SIZE + CYLINDERS, data = fcr) %>% tidy() ## # A tibble: 3 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 122. 3.05 40.0 5.45e-214 ## 2 ENGINE_SIZE 21.7 1.75 12.4 5.78e- 33 ## 3 CYLINDERS 10.9 1.27 8.58 3.45e- 17
lm_out %>% augment() %>% ggplot(aes(x = .fitted, y = .resid)) + geom_point() + geom_hline( yintercept = 0)
( lm_out = fcr %>% filter(MAKE %in% c("FORD", "HONDA", "VOLKSWAGEN")) %>% lm( CO2_EMISSIONS ~ ENGINE_SIZE + MAKE - 1, data = .) ) %>% tidy() ## # A tibble: 4 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 ENGINE_SIZE 33.9 2.04 16.6 3.21e-37 ## 2 MAKEFORD 158. 6.94 22.7 7.25e-53 ## 3 MAKEHONDA 121. 5.81 20.8 4.40e-48 ## 4 MAKEVOLKSWAGEN 143. 6.91 20.7 8.76e-48
( - 1
in model removes overall intercept )
fcr %>% lm( CO2_EMISSIONS ~ ENGINE_SIZE + MAKE - 1, data = .) %>% tidy() %>% filter(str_detect(term, "MAKE")) %>% arrange(estimate) %>% slice( c(1:5) ) ## # A tibble: 5 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 MAKEMAZDA 117. 5.44 21.4 1.59e- 84 ## 2 MAKEACURA 120. 8.92 13.5 3.75e- 38 ## 3 MAKEHONDA 121. 4.54 26.7 8.30e-120 ## 4 MAKELEXUS 123. 5.81 21.1 1.09e- 82 ## 5 MAKECHRYSLER 124. 9.08 13.7 2.80e- 39
(Mazda is actually most fuel-efficient automaker)
fcr %>% filter(MAKE %in% c("FORD", "HONDA", "VOLKSWAGEN")) %>% lm( CO2_EMISSIONS ~ ENGINE_SIZE + MAKE, data = .) %>% tidy() ## # A tibble: 4 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 158. 6.94 22.7 7.25e-53 ## 2 ENGINE_SIZE 33.9 2.04 16.6 3.21e-37 ## 3 MAKEHONDA -37.1 5.55 -6.69 3.29e-10 ## 4 MAKEVOLKSWAGEN -15.0 6.40 -2.35 1.99e- 2
factor | dummy_A | dummy_B | dummy_C | intercept | |
---|---|---|---|---|---|
categ. A | "A" (1) | 1 | 0 | 0 | 1 |
categ. B | "B" (2) | 0 | 1 | 0 | 1 |
categ. C | "C" (3) | 0 | 0 | 1 | 1 |
fastDummies
package creates dummy variableslibrary(fastDummies) tibble( fct = c("A","B","A","C") ) %>% dummy_cols( select_columns = "fct" ) ## # A tibble: 4 x 4 ## fct fct_A fct_B fct_C ## <chr> <int> <int> <int> ## 1 A 1 0 0 ## 2 B 0 1 0 ## 3 A 1 0 0 ## 4 C 0 0 1
fcr %>% filter(MAKE %in% c("FORD", "HONDA", "VOLKSWAGEN")) %>% dummy_cols( "MAKE" )%>% lm( CO2_EMISSIONS ~ ENGINE_SIZE + MAKE_FORD + MAKE_HONDA + MAKE_VOLKSWAGEN -1, data = .) %>% tidy() ## # A tibble: 4 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 ENGINE_SIZE 33.9 2.04 16.6 3.21e-37 ## 2 MAKE_FORD 158. 6.94 22.7 7.25e-53 ## 3 MAKE_HONDA 121. 5.81 20.8 4.40e-48 ## 4 MAKE_VOLKSWAGEN 143. 6.91 20.7 8.76e-48
X1 * X2
)
fcr %>% filter(MAKE %in% c("FORD", "HONDA", "VOLKSWAGEN")) %>% lm( CO2_EMISSIONS ~ ENGINE_SIZE * MAKE, data = .) %>% tidy() ## # A tibble: 6 x 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 161. 7.39 21.8 2.52e-50 ## 2 ENGINE_SIZE 32.9 2.19 15.1 1.01e-32 ## 3 MAKEHONDA -58.5 15.1 -3.88 1.52e- 4 ## 4 MAKEVOLKSWAGEN -14.1 24.8 -0.569 5.70e- 1 ## 5 ENGINE_SIZE:MAKEHONDA 10.4 6.81 1.53 1.27e- 1 ## 6 ENGINE_SIZE:MAKEVOLKSWAGEN -1.00 11.3 -0.0886 9.30e- 1
forcats
package for working with factors
fct_lump()
: collapse infrequent categories into "other"fct_relevel()
: reorder factor levels by handfct_reorder()
: reorder factor by another variablefct_infreq()
: reorder factor by value frequencyfcr %>% mutate(MAKE = fct_lump(MAKE, n = 3)) %>% head(3) ## # A tibble: 3 x 15 ## YEAR MAKE MODEL CLASS ENGINE_SIZE CYLINDERS TRANSMISSION FUEL CITY ## <dbl> <fct> <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> ## 1 2018 Other ILX COMP~ 2.4 4 AM8 Z 9.4 ## 2 2018 Other MDX ~ SUV ~ 3.5 6 AS9 Z 12.6 ## 3 2018 Other MDX ~ SUV ~ 3.5 6 AS9 Z 12.2 ## # ... with 6 more variables: HWY <dbl>, COMB <dbl>, MPG <dbl>, ## # CO2_EMISSIONS <dbl>, CO2_RATING <dbl>, SMOG_RATING <dbl>