- Learn how to:
- Transform data
- Summarize data
- Perform these operations by groups
- Readings
- R4DS: ch. 5.5-5.7, 12, 18
- Data Wrangling cheatsheet
Often need to modify original data (i.e. apply functions)
mutate()
appends new variable/column to data
transmute()
only keeps new variabledata_frame(V1=1:3) %>% transmute(V2=V1*(1:2)) ## Warning in V1 * (1:2): longer object length is not a multiple of shorter ## object length ## # A tibble: 3 x 1 ## V2 ## <int> ## 1 1 ## 2 4 ## 3 3
mutate()
+ replace()
to change/fix data valuesdata_frame(V1 = 1:3, V2 = c("a","N/A","b")) %>% mutate(V2 = replace(V2, V2=="N/A", NA) ) %>% mutate(V1 = replace(V1, V2=="a", 5) ) ## # A tibble: 3 x 2 ## V1 V2 ## <dbl> <chr> ## 1 5 a ## 2 2 <NA> ## 3 3 b
summarise()
creates new data-frame with summary value(s)
E.g. number of distinct restaurants inspected
dinesafe %>% filter(ESTABLISHMENTTYPE == "Restaurant") %>% summarise(n_dist_rest = n_distinct(ESTABLISHMENT_ID)) ## # A tibble: 1 x 1 ## n_dist_rest ## <int> ## 1 6977
range(dinesafe$INSPECTION_DATE) ## [1] "2016-09-23" "2018-09-21" dinesafe %>% summarise( MIN_DATE = min(INSPECTION_DATE), MAX_DATE = max(INSPECTION_DATE)) ## # A tibble: 1 x 2 ## MIN_DATE MAX_DATE ## <date> <date> ## 1 2016-09-23 2018-09-21
group_by()
does not alter data
ungroup()
removes groupingmy_df %>% group_by(V2) %>% slice(1) ## # A tibble: 3 x 3 ## # Groups: V2 [3] ## V1 V2 V3 ## <int> <chr> <lgl> ## 1 1 a FALSE ## 2 2 b TRUE ## 3 3 c FALSE my_df %>% group_by(V2) %>% ungroup() %>% slice(1) ## # A tibble: 1 x 3 ## V1 V2 V3 ## <int> <chr> <lgl> ## 1 1 a FALSE
dinesafe %>% group_by(ESTABLISHMENTTYPE) %>% summarise( n_distinct(ESTABLISHMENT_ID) ) ## # A tibble: 55 x 2 ## ESTABLISHMENTTYPE `n_distinct(ESTABLISHMENT_ID)` ## <chr> <int> ## 1 Bake Shop 46 ## 2 Bakery 381 ## 3 Banquet Facility 151 ## 4 Bed & Breakfast 1 ## 5 Boarding / Lodging Home - Kitchen 67 ## 6 Bottling Plant 5 ## 7 Brew Your Own Beer / Wine 5 ## 8 Butcher Shop 157 ## 9 Cafeteria 186 ## 10 Cafeteria - Public Access 107 ## # ... with 45 more rows
my_df %>% group_by(V2,V3) %>% summarise(M=mean(V1)) ## # A tibble: 4 x 3 ## # Groups: V2 [3] ## V2 V3 M ## <chr> <lgl> <dbl> ## 1 a FALSE 1 ## 2 a TRUE 4 ## 3 b TRUE 3.5 ## 4 c FALSE 3 my_df %>% group_by(V2,V3) %>% summarise(M=mean(V1)) %>% summarise(MM=mean(M)) ## # A tibble: 3 x 2 ## V2 MM ## <chr> <dbl> ## 1 a 2.5 ## 2 b 3.5 ## 3 c 3
my_df %>% group_by(V2) %>% mutate( group_percent = V1 / sum(V1)) ## # A tibble: 5 x 4 ## # Groups: V2 [3] ## V1 V2 V3 group_percent ## <int> <chr> <lgl> <dbl> ## 1 1 a FALSE 0.2 ## 2 2 b TRUE 0.286 ## 3 3 c FALSE 1 ## 4 4 a TRUE 0.8 ## 5 5 b TRUE 0.714