- Learn how to:
- Organize data into tables (tidy data)
- Subset data (extract rows/columns)
- Use pipelines in R programming
- Readings
- R4DS: ch. 5.1-5.4, 12, 18
- Data Wrangling cheatsheet
# Attach the tidyverse (ggplot2, tibble, tidyr, readr, purrr, dplyr, stringr,
# forcats). Run the install line once if the package is not yet installed.
# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.7
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Browse the package documentation:
# help(package = "tidyverse")
# Base R data frame: 26 rows, an integer column and a letter column.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default; the output recorded below (var2 as a Factor)
# shows the conversion.
my_df <- data.frame(
  var1 = 1L:26L,
  var2 = letters,
  stringsAsFactors = TRUE
)
str(my_df)
## 'data.frame':    26 obs. of  2 variables:
##  $ var1: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ var2: Factor w/ 26 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
# Same data as a tibble. tibble() replaces the deprecated data_frame()
# constructor; as the recorded output shows, var2 stays a character vector
# (no string-to-factor conversion).
my_tbl <- tibble(
  var1 = 1L:26L,
  var2 = letters
)
str(my_tbl)
## Classes 'tbl_df', 'tbl' and 'data.frame':    26 obs. of  2 variables:
##  $ var1: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ var2: chr  "a" "b" "c" "d" ...
filter()
picks observations/rows
| | | |
|---|---|---|
| `<` | `<=` | `&` (and) |
| `>` | `>=` | `\|` (or) |
| `==` | `!=` | `any()` |
| `is.na()` | `%in%` | `all()` |
slice()
picks rows by position (integer)
distinct()
removes duplicate rows
sample_n()/sample_frac()
randomly samples rows
top_n()
picks top n values of a variable
select()
picks variables/columns
contains()/matches()
starts_with()/ends_with()
num_range()
e.g. V1 – V5
spread()/gather()
separate()/unite()
arrange()
%>%
passes object on its left as (first) argument of function to its right
x %>% f(y) = f(x, y)
data %>% filter( ) %>% select( ) %>% arrange( )
arrange( select( filter(data) ) )
# Compact overview of the dinesafe data: one line per column, showing the
# column type and the first few values (recorded output below).
glimpse(dinesafe)
## Observations: 90,520
## Variables: 16
## $ ROW_ID                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,...
## $ ESTABLISHMENT_ID            <int> 1222579, 1222579, 1222579, 1222579...
## $ INSPECTION_ID               <int> 103868579, 104063869, 104246429, 1...
## $ ESTABLISHMENT_NAME          <chr> "SAI-LILA KHAMAN DHOKLA HOUSE", "S...
## $ ESTABLISHMENTTYPE           <chr> "Food Take Out", "Food Take Out", ...
## $ ESTABLISHMENT_ADDRESS       <chr> "870 MARKHAM RD", "870 MARKHAM RD"...
## $ LATITUDE                    <dbl> 43.76798, 43.76798, 43.76798, 43.7...
## $ LONGITUDE                   <dbl> -79.22903, -79.22903, -79.22903, -...
## $ ESTABLISHMENT_STATUS        <chr> "Pass", "Pass", "Pass", "Pass", "P...
## $ MINIMUM_INSPECTIONS_PERYEAR <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ INFRACTION_DETAILS          <chr> "Operator fail to properly wash eq...
## $ INSPECTION_DATE             <date> 2016-12-21, 2017-10-04, 2018-06-2...
## $ SEVERITY                    <chr> "M - Minor", NA, "NA - Not Applica...
## $ ACTION                      <chr> "Notice to Comply", NA, "Notice to...
## $ COURT_OUTCOME               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ AMOUNT_FINED                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA...