- Learn how to:
  - Analyse textual data
  - Use regular expressions for string manipulation
- Readings
  - R4DS: ch. 14
  - String Manipulation cheatsheet
- `character` data type in R, indicated by double quotes (" ")
typeof("my text")
## [1] "character"
a = "string"
b = c("v", "e", "c", "t", "o", "r")
rev(a)
## [1] "string"
rev(b)
## [1] "r" "o" "t" "c" "e" "v"
dinesafe = read_csv("../data/dinesafe.csv")
details = dinesafe %>% select(INFRACTION_DETAILS)
head(details)
## # A tibble: 6 x 1
## INFRACTION_DETAILS
## <chr>
## 1 Operator fail to properly wash equipment
## 2 <NA>
## 3 Fail to Hold a Valid Food Handler's Certificate. Muncipal Code Chapter 5~
## 4 Operator fail to properly wash equipment
## 5 Operator fail to properly wash surfaces in rooms
## 6 Operate food premise - equipment not arranged to permit cleaning - Sec. 9
- `str_detect()` returns logical index (match/no match)
- `str_which()` returns integer index (row # of match)
- `str_count()` returns # of matches
details %>%
mutate( no_wash = str_detect( INFRACTION_DETAILS,
"fail to properly wash" ) ) %>%
summarise( mean(no_wash, na.rm = TRUE) )
## # A tibble: 1 x 1
## `mean(no_wash, na.rm = TRUE)`
## <dbl>
## 1 0.248
- `.` \(\rightarrow\) any character (except newline)
- `\\s` \(\rightarrow\) any whitespace
- `\\d` \(\rightarrow\) any digit (0-9)
- `\\w` \(\rightarrow\) any word character (letter, digit, or underscore)
- `^` \(\rightarrow\) start of string
- `$` \(\rightarrow\) end of string
- `*` \(\rightarrow\) zero or more
- `+` \(\rightarrow\) one or more
- `a|b` \(\rightarrow\) a or b
- `[abc]` \(\rightarrow\) a, b, or c (same as `[a-c]`)
- `(ab)|a` \(\rightarrow\) ab or a, vs
- `a(b|a)` \(\rightarrow\) ab or aa
- `a(?=b)` \(\rightarrow\) a followed by b (e.g. abba)
- `(?<=a)b` \(\rightarrow\) b preceded by a (e.g. abba)
- `str_view()`/`str_view_all()`
name = c("Tajinder", "Mustafa", "Liu Wei")
str_view(name, "(^.a)|(.a$)")
- `str_subset()` returns the strings that contain a match
- `str_extract()` returns the matching pattern
dinesafe %>%
  distinct(ESTABLISHMENT_ID, .keep_all = TRUE) %>%
  pull(ESTABLISHMENT_NAME) %>%
  str_subset( pattern = "PIZZA" ) %>%
  length()
## [1] 492
- `str_replace()` replaces pattern with string
- `str_to_lower()`/`str_to_upper()` convert to lower-/upper-case
dinesafe %>%
  distinct(ESTABLISHMENT_ID, .keep_all = TRUE) %>%
  mutate( ESTABLISHMENT_NAME = str_to_upper(ESTABLISHMENT_NAME) ) %>%
  filter( str_detect(ESTABLISHMENT_NAME, "PIZZA") ) %>%
  dim_desc()
## [1] "[549 x 16]"
- `str_c(..., sep = " ")` joins strings; `sep` defines the separator string
- `collapse = ""` collapses the result vector into a single string
str_c(1:3, name, sep = " - ")
## [1] "1 - Tajinder" "2 - Mustafa" "3 - Liu Wei"
str_c(1:3, name, sep = " - ", collapse = ", ")
## [1] "1 - Tajinder, 2 - Mustafa, 3 - Liu Wei"
- `str_split()` splits string along pattern
- `str_split_fixed()` returns fixed # of pieces
str_split(name, " ")
## [[1]]
## [1] "Tajinder"
##
## [[2]]
## [1] "Mustafa"
##
## [[3]]
## [1] "Liu" "Wei"
str_split_fixed(name, "\\s", 2)
##      [,1]       [,2]
## [1,] "Tajinder" ""
## [2,] "Mustafa"  ""
## [3,] "Liu"      "Wei"
- `str_trim()` trims whitespace
- `str_pad()` pads strings to constant width
- `str_trunc()` truncates strings to constant width
- `str_wrap()` wraps string to fixed-width paragraph
str_pad(name, width = 10, side = "right")
## [1] "Tajinder  " "Mustafa   " "Liu Wei   "
str_pad(name, width = 10, side = "right") %>% str_trim()
## [1] "Tajinder" "Mustafa" "Liu Wei"