- Learn how to:
- Analyse textual data
- Use regular expressions for string manipulation
- Readings
- R4DS: ch. 14
- String Manipulation cheatsheet
char
data type in R, indicated by double quotes (" ")
typeof("my text") ## [1] "character"
a = "string" b = c("v", "e", "c", "t", "o", "r") rev(a) ## [1] "string" rev(b) ## [1] "r" "o" "t" "c" "e" "v"
dinesafe = read_csv("../data/dinesafe.csv") details = dinesafe %>% select(INFRACTION_DETAILS) head(details) ## # A tibble: 6 x 1 ## INFRACTION_DETAILS ## <chr> ## 1 Operator fail to properly wash equipment ## 2 <NA> ## 3 Fail to Hold a Valid Food Handler's Certificate. Muncipal Code Chapter 5~ ## 4 Operator fail to properly wash equipment ## 5 Operator fail to properly wash surfaces in rooms ## 6 Operate food premise - equipment not arranged to permit cleaning - Sec. 9
str_detect()
returns logical index (match/no match)
str_which()
returns integer index (row # of match)
str_count()
returns # of matches
details %>% mutate( no_wash = str_detect( INFRACTION_DETAILS, "fail to properly wash" ) ) %>% summarise( mean(no_wash, na.rm = TRUE) ) ## # A tibble: 1 x 1 ## `mean(no_wash, na.rm = TRUE)` ## <dbl> ## 1 0.248
. \(\rightarrow\) any character (except newline)
\\s \(\rightarrow\) any whitespace
\\d \(\rightarrow\) any digit (0-9)
\\w \(\rightarrow\) any word character (letter, digit, or underscore)
^ \(\rightarrow\) start of string
$ \(\rightarrow\) end of string
* \(\rightarrow\) zero or more of the preceding element
+ \(\rightarrow\) one or more of the preceding element
a|b \(\rightarrow\) a or b
[abc] \(\rightarrow\) a, b, or c (same as [a-c])
(ab)|a \(\rightarrow\) ab or a, vs
a(b|a) \(\rightarrow\) ab or aa
a(?=b) \(\rightarrow\) a followed by b (e.g. the first a in abba)
(?<=a)b \(\rightarrow\) b preceded by a (e.g. the first b in abba)
str_view()/str_view_all()
name = c("Tajinder", "Mustafa", "Liu Wei") str_view(name, "(^.a)|(.a$)")
str_subset()
returns the strings that contain a match
str_extract()
returns the matching part of each string
dinesafe %>% distinct(ESTABLISHMENT_ID, .keep_all = TRUE) %>% pull(ESTABLISHMENT_NAME) %>% str_subset( pattern = "PIZZA" ) %>% length() ## [1] 492
str_replace()
replace pattern with string
str_to_lower/upper()
convert to lower-/upper-case
dinesafe %>% distinct(ESTABLISHMENT_ID, .keep_all = TRUE) %>% mutate( ESTABLISHMENT_NAME = str_to_upper(ESTABLISHMENT_NAME) ) %>% filter( str_detect(ESTABLISHMENT_NAME, "PIZZA") ) %>% dim_desc() ## [1] "[549 x 16]"
str_c( , sep = " ")
defines separator string
collapse = ""
collapses result vector into single string
str_c(1:3, name, sep = " - ") ## [1] "1 - Tajinder" "2 - Mustafa" "3 - Liu Wei" str_c(1:3, name, sep = " - ", collapse = ", ") ## [1] "1 - Tajinder, 2 - Mustafa, 3 - Liu Wei"
str_split()
split string along pattern
str_split_fixed()
returns fixed # of pieces
str_split(name, " ") ## [[1]] ## [1] "Tajinder" ## ## [[2]] ## [1] "Mustafa" ## ## [[3]] ## [1] "Liu" "Wei" str_split_fixed(name, "\\s", 2) ## [,1] [,2] ## [1,] "Tajinder" "" ## [2,] "Mustafa" "" ## [3,] "Liu" "Wei"
str_trim()
trim whitespace
str_pad()
pad strings to constant width
str_trunc()
truncate strings to constant width
str_wrap()
wrap string to fixed width paragraph
str_pad(name, width = 10, side = "right") ## [1] "Tajinder  " "Mustafa   " "Liu Wei   " str_pad(name, width = 10, side = "right") %>% str_trim() ## [1] "Tajinder" "Mustafa" "Liu Wei"