: @denis_mongin
09/11/2020
: @denis_mongin
# Fetch the demo material (zip archive of the R_regex folder) and unpack it
# in the working directory.
# mode = "wb" is set explicitly: the URL ends in "?path=R_regex", not ".zip",
# so download.file() cannot auto-detect that the file is binary on Windows.
download.file(
  url = "https://gitlab.com/dmongin/r-demos/-/archive/master/r-demos-master.zip?path=R_regex",
  destfile = "regex_demo.zip",
  mode = "wb"
)
unzip(zipfile = "regex_demo.zip")

# stringr provides the str_*() regex helpers; dplyr provides %>%, filter()
# and mutate().
library(stringr)
library(dplyr)
grep
is for finding regular expressions!
Let’s consider the string
test_string <- "r3%gne8§//g74mpe21??x0"
I want to extract any lower case letter followed by a number.
# Collect every lower-case letter that is immediately followed by a digit
# (the digit itself is not part of the match), then glue the letters together.
paste(
  unlist(str_extract_all(string = test_string, pattern = "[a-z](?=[0-9])")),
  collapse = ""
)
## [1] "regex"
[a-z](?=[0-9])
r
, e
, g
, e
and x
[a-z](?=[0-9])
[a-z]
is any lower case letter, once
(?=something)
means followed by something
[0-9]
any digit, once
N
will match N
.Nope
will match N
followed by o
, followed by p
, followed by e
Let’s describe the functions we will use:
stringr
(Hadley Wickham again)First thing:
the functions are vectorized
Meaning they apply to whole vectors.
So you can use %>%
between them
str_detect outputs a vector of booleans saying whether it found the pattern or not:
# str_detect() returns one logical per input string: TRUE where the pattern
# occurs, FALSE otherwise.
str_detect(
  c("Nope", "plouf", "Nein", "no"), # the vector of strings
  "N"                               # the pattern
)
## [1] TRUE FALSE TRUE FALSE
subset places with a "M"
inside
# Small example table: four place names plus a column of random letters.
# sample() is not seeded, so the x column changes from run to run.
df <- data.frame(
  place = c("Munich", "Madrid", "Valencia", "Estepona"),
  x = sample(letters, 4)
)
df
## place x ## 1 Munich v ## 2 Madrid x ## 3 Valencia u ## 4 Estepona m
# Base R row subsetting: keep the rows whose place contains an upper-case "M".
df[str_detect(string = df$place, pattern = "M"), ]
## place x ## 1 Munich v ## 2 Madrid x
or
# Same subset written with dplyr::filter(); str_detect() supplies the
# logical condition evaluated on the place column.
library(dplyr)
df %>%
  filter(str_detect(place, "M"))
## place x ## 1 Munich v ## 2 Madrid x
Extract: extract the pattern from the string
# str_extract() returns the first match in each string, or NA when the
# pattern is absent.
str_extract(
  c("Nope", "plouf", "Nein", "no"), # vector of strings
  "N"                               # the pattern
)
## [1] "N" NA "N" NA
Replace the matched pattern by something
# str_replace() swaps the first match in each string for the replacement;
# strings without a match are returned unchanged.
str_replace(
  c("Nope", "plouf", "Nein", "no"), # vector of strings
  "N",                              # pattern
  "P"                               # replacement
)
## [1] "Pope" "plouf" "Pein" "no"
_all
variant, where they output a list of all matches
str_extract(c("nope","plouf","nein","no"), # vector of string
            "n")
## [1] "n" NA "n" "n"
# str_extract_all() returns a list with one element per input string, each
# holding every match found (character(0) when there is none).
str_extract_all(
  c("nope", "plouf", "nein", "no"), # vector of strings
  "n"
)
## [[1]] ## [1] "n" ## ## [[2]] ## character(0) ## ## [[3]] ## [1] "n" "n" ## ## [[4]] ## [1] "n"
# [Nn] accepts either case of the first letter, so both "No" and "no" match.
str_extract(
  string = c("Norwegian", "Nope", "nominative"),
  pattern = "[Nn]o"
)
## [1] "No" "No" "no"
[Nn]
is a character class.
The useful ones:
[0-9]
any digit. You can also use \d
[a-z]
any lower case letter.
[a-dn-z]
: any lower case between a
and d
, and between n
and z
[ ]
: a space character. \s is broader: it matches any whitespace (space, tab, newline, …)
[A-z]
: any upper case and lower case letter, plus the six characters that sit between Z and a in ASCII ([ \ ] ^ _ `). Use [A-Za-z] to match letters only.
.
: any character
Regex will stop at the first match
# Only the first match is returned. [A-Za-z] is used rather than [A-z],
# because the A-z range also covers the punctuation between "Z" and "a".
str_extract("Hello my name is denis", "[A-Za-z]")
## [1] "H"
You can specify the number of times a block repeats in your pattern.
*
: any number of times (including zero)
+
: one or more times
{n}
: exactly n times
{n,}
: n times or more
{n,m}
: between n and m times (inclusive)
str_extract_all("Hello my name is denis", "[A-z]+")
## [[1]] ## [1] "Hello" "my" "name" "is" "denis"
# {3,5} keeps runs of three to five letters. [A-Za-z] is used rather than
# [A-z], because the A-z range also covers the punctuation between "Z" and "a".
str_extract_all("Hello my name is denis", "[A-Za-z]{3,5}")
## [[1]] ## [1] "Hello" "name" "denis"
remove all extra spaces, using str_replace_all
text <- "Hello my name is denis"
working:
str_replace_all("Hello my name is denis", "[ ]{1,}"," ")
## [1] "Hello my name is denis"
not working:
str_replace_all("Hello my name is denis", "[ ]","")
## [1] "Hellomynameisdenis"
Extract the 32 character hex! (https://stackoverflow.com/questions/64732417/extract-a-32-character-hex-hash-from-different-positions-in-a-url-in-r)
urls <- c( "https://example.com/123456789/checkouts/c7bc070823007e32bd88f6c2d621b262/thank_you", "https://example.com/123456789/orders/d85e0fcf1a7e9e1c3c441c0777326775", "https://example.com/123456789/orders/90415327c38f5e80e31e875052f9bbaa?validate=1")
# A hex hash only uses the digits 0-9 and the letters a-f, so the class is
# restricted accordingly: [a-z0-9] would also accept any 32-character run of
# ordinary lower-case text or digits in the URL.
str_extract(urls, "[0-9a-f]{32}")
## [1] "c7bc070823007e32bd88f6c2d621b262" "d85e0fcf1a7e9e1c3c441c0777326775" ## [3] "90415327c38f5e80e31e875052f9bbaa"
Calculate the mean of ages !
# Load the example ages (read.csv2: semicolon separator, comma as decimal
# mark) and print them.
df <- read.csv2("./data/example2.csv")
df
## age ## 1 12,2 ## 2 14,5 ## 3 16,9 ## 4 18,3 not sure ## 5 10,9 ## 6 12
# Keep only the leading run of digits and commas, which drops free-text
# remarks such as " not sure".
tmp <- df$age %>%
  str_extract("[0-9,]+")
tmp
## [1] "12,2" "14,5" "16,9" "18,3" "10,9" "12"
# Turn the decimal comma into a dot so that as.numeric() can parse the values.
tmp <- tmp %>%
  str_replace(",", ".")
tmp
## [1] "12.2" "14.5" "16.9" "18.3" "10.9" "12"
# Convert the cleaned strings to numbers and average them.
mean(as.numeric(tmp))
## [1] 14.13333
Here I want the tel with 10 digits, just numbers, no country indication
tellist <- c( "06 32 66 34 55", "+34 6 22 56 38 99", "0034 6 22.56 38 99", "0689335588 (telfono del papa)" )
Good start
# Delete spaces, letters, dots, parentheses and "+" so only digits remain.
# [A-Za-z] is used rather than [A-z], because the A-z range also covers the
# punctuation between "Z" and "a".
tmp <- tellist %>%
  str_replace_all("[ A-Za-z.()+]+", "")
tmp
## [1] "0632663455" "34622563899" "0034622563899" "0689335588"
or
tellist %>% str_extract_all("[0-9]+")
## [[1]] ## [1] "06" "32" "66" "34" "55" ## ## [[2]] ## [1] "34" "6" "22" "56" "38" "99" ## ## [[3]] ## [1] "0034" "6" "22" "56" "38" "99" ## ## [[4]] ## [1] "0689335588"
# Extract every run of digits, then paste the runs of each number together;
# map_chr() returns the character vector directly.
tellist %>%
  str_extract_all("[0-9]+") %>%
  purrr::map_chr(paste0, collapse = "")
## [1] "0632663455" "34622563899" "0034622563899" "0689335588"
Now I want to remove the country indication and replace it with the 0:
# Unanchored attempt: every "34" is replaced, including those inside a number.
str_replace_all(string = tmp, pattern = "34", replacement = "0")
## [1] "063266055" "0622563899" "000622563899" "0689335588"
gargl
$
is for the end of the string
^
is for the beginning of the string
\b
is for a word boundary
|
is for or: pattern1|pattern2
matches pattern1 or pattern2
Here:
# Anchoring with ^ limits the replacement to a country prefix at the very
# start of the number.
str_replace_all(string = tmp, pattern = "^34|^0034", replacement = "0")
## [1] "0632663455" "0622563899" "0622563899" "0689335588"
Or we could also:
tmp %>% str_replace_all("^[0]{0,2}34","0")
## [1] "0632663455" "0622563899" "0622563899" "0689335588"
Remove the space at the beginning or at the end! Use str_replace_all
text <- c(" A banana ", "A banana","An orange ", " A second orange")
^[ ]+
[ ]+$
# Remove runs of spaces at the start (^[ ]+) or at the end ([ ]+$) of each
# string; spaces inside the text are left alone.
str_replace_all(
  string = text,
  pattern = "^[ ]+|[ ]+$",
  replacement = ""
)
## [1] "A banana" "A banana" "An orange" "A second orange"
Find a list of words: find
keywords <- c("banana", "apple", "grape")
in
df <- data.frame(id = c(1,2,4,5,6,7), full_text = c("I like banana", "I ate an apple", "I prefer bananas and apples", "grapes", "My applepie is tasty", "Fruitsalad"))
(https://stackoverflow.com/questions/64388370/find-matching-words-in-a-text-from-a-vector)
paste(keywords,collapse = "|")
## [1] "banana|apple|grape"
# Join the keywords with "|" to build one alternation pattern, then list
# every keyword found in each text.
str_extract_all(
  string = df$full_text,
  pattern = paste(keywords, collapse = "|")
)
## [[1]] ## [1] "banana" ## ## [[2]] ## [1] "apple" ## ## [[3]] ## [1] "banana" "apple" ## ## [[4]] ## [1] "grape" ## ## [[5]] ## [1] "apple" ## ## [[6]] ## character(0)
A bit of wrangling:
# Add a `found` column holding the matched keywords of each row as one
# comma-separated string (empty when nothing matched).
df %>%
  mutate(
    found = full_text %>%
      str_extract_all(paste(keywords, collapse = "|")) %>%
      purrr::map_chr(paste, collapse = ", ")
  )
## id full_text found ## 1 1 I like banana banana ## 2 2 I ate an apple apple ## 3 4 I prefer bananas and apples banana, apple ## 4 5 grapes grape ## 5 6 My applepie is tasty apple ## 6 7 Fruitsalad
str_extract("gogogogogoooo", "go+")
## [1] "go"
str_extract("gogogogogoooo", "(go)+")
## [1] "gogogogogo"
()
can be referred to with \\n
n being the number of the group:
text <- "3 squared is 9, 4 squared is 16, 6 squared is 36"
# \\1 re-inserts the first capture group (the digit), so "3 squared"
# becomes "3x3".
str_replace_all(
  string = text,
  pattern = "([0-9])( squared)",
  replacement = "\\1x\\1"
)
## [1] "3x3 is 9, 4x4 is 16, 6x6 is 36"
str_match
gives a matrix with one column per captured group
room_info <- c( "Lounge (3.66m x 3.66m (12'0\" x 12'0\"))", "Dining Kitchen (4.39m x 3.66m (14'5 x 12'0\"))", "Bedroom One (3.73m x 3.73m (12'3\" x 12'3\"))", "Bedroom Two (3.53m x 1.98m (11'7\" x 6'6\"))", "Shower Room (2.06m x 1.52m (6'9\" x 5'0\"))", "Occasional Loft Room (4.04m x 3.78m (13'3\" x 12'5\"))", "En-Suite Bathroom (3.18m x 1.98m (10'5\" x 6'6\"))" )
# str_match() returns a character matrix: column 1 is the whole match, the
# other columns are the capture groups (room name, then the two metric
# dimensions).
# The name class is [A-Za-z -]: the trailing hyphen keeps "En-Suite" whole
# (it was truncated to "Suite" before), and A-Za-z avoids the punctuation
# that the A-z range would also accept.
room_info %>%
  str_match("([A-Za-z -]+)\\(([0-9.]+)m x ([0-9.]+)m")
##      [,1]                                  [,2]                    [,3]
## [1,] "Lounge (3.66m x 3.66m"               "Lounge "               "3.66"
## [2,] "Dining Kitchen (4.39m x 3.66m"       "Dining Kitchen "       "4.39"
## [3,] "Bedroom One (3.73m x 3.73m"          "Bedroom One "          "3.73"
## [4,] "Bedroom Two (3.53m x 1.98m"          "Bedroom Two "          "3.53"
## [5,] "Shower Room (2.06m x 1.52m"          "Shower Room "          "2.06"
## [6,] "Occasional Loft Room (4.04m x 3.78m" "Occasional Loft Room " "4.04"
## [7,] "En-Suite Bathroom (3.18m x 1.98m"    "En-Suite Bathroom "    "3.18"
##      [,4]
## [1,] "3.66"
## [2,] "3.66"
## [3,] "3.73"
## [4,] "1.98"
## [5,] "1.52"
## [6,] "3.78"
## [7,] "1.98"
# Keep the three capture groups (columns 2 to 4 of the match matrix), turn
# them into a tibble (columns V1, V2, V3) and compute the area from the two
# dimensions.
# The name class is [A-Za-z -]: the trailing hyphen keeps "En-Suite" whole
# (it was truncated to "Suite" before), and A-Za-z avoids the punctuation
# that the A-z range would also accept.
room_info %>%
  str_match("([A-Za-z -]+)\\(([0-9.]+)m x ([0-9.]+)m") %>%
  .[, 2:4] %>%
  as_tibble() %>%
  mutate(area = as.numeric(V2) * as.numeric(V3))
## # A tibble: 7 x 4
##   V1                      V2    V3     area
##   <chr>                   <chr> <chr> <dbl>
## 1 "Lounge "               3.66  3.66  13.4
## 2 "Dining Kitchen "       4.39  3.66  16.1
## 3 "Bedroom One "          3.73  3.73  13.9
## 4 "Bedroom Two "          3.53  1.98   6.99
## 5 "Shower Room "          2.06  1.52   3.13
## 6 "Occasional Loft Room " 4.04  3.78  15.3
## 7 "En-Suite Bathroom "    3.18  1.98   6.30
Another one:
dist <- c("Benalmadena Torremolinos: 15 min\n Malaga Fuengirola: 45 min\n Malaga Sevilla: 130 min\n Malaga Grenada: 90 min")
Make the df with provenance, destination, and time of travel. Use str_split
first, and then str_match
# Split the text into one string per line, then capture origin, destination
# and travel time. [A-Za-z] is used rather than [A-z], because the A-z range
# also covers the punctuation between "Z" and "a".
dist %>%
  str_split("\n") %>%
  unlist() %>%
  str_match("([A-Za-z]+)[ ]+([A-Za-z]+):[ ]+([0-9]+)")
## [,1] [,2] [,3] [,4] ## [1,] "Benalmadena Torremolinos: 15" "Benalmadena" "Torremolinos" "15" ## [2,] "Malaga Fuengirola: 45" "Malaga" "Fuengirola" "45" ## [3,] "Malaga Sevilla: 130" "Malaga" "Sevilla" "130" ## [4,] "Malaga Grenada: 90" "Malaga" "Grenada" "90"
(go)
, how do you do ?⟶ escape with \
.
.
, \
, (
, )
, \s
, \d
, etc)! you must escape
\
too !
# Unescaped parentheses form a capture group, so this pattern matches the
# bare word "catchme" and not the literal brackets around it.
text <- "go(catchme)gogogoooo"
str_extract(text, "(catchme)")
## [1] "catchme"
str_extract(text,"\\(catchme\\)")
## [1] "(catchme)"
str_extract(text,"[(]catchme[)]")
## [1] "(catchme)"
str_extract(text,"\\(.*\\)")
## [1] "(catchme)"
If you want something followed by or preceded by something else:
(?=something)
: followed by something
(?!something)
: not followed by something
(?<=something)
: preceded by something
(?<!something)
: not preceded by something
Find the sentence with a weight (use str_detect
and a positive lookahead)
text <- c("Anja weight 65 kg, is 172 cm tall and live 32 km from berlin", "Carolina is 132 cm tall, and live 54 km from Munich", "Pepa is 52 kg, and live 1 km from Malaga", "Julia does not care about kg")
[0-9]+
, or \\d+
(?= kg)
# TRUE for the sentences containing a number directly followed by " kg";
# the lookahead checks for the unit without making it part of the match.
str_detect(string = text, pattern = "[0-9]+(?= kg)")
## [1] TRUE FALSE TRUE FALSE
find all sequences of length 6, finishing with ttc
and followed by aa
.
gene <- c("cgtatatcaagaagcattcacttaccatgacacagcttcagatttcattattgctgacag ctactatatcactactccatctagtagtggccacgccctatgaggcatatcctatcggaa aacaataccccccagtggcaagagtcaatgaatcgtttacatttcaaatttccaatgata cctataaatcgtctgtagacaagacagctcaaataacatacaattgcttcgacttaccga gctggctttcgtttgactctagttctagaacgttctcaggtgaaccttcttctgacttac tatctgatgcgaacaccacgttgtatttcaatgtaatactcgagggtacggactctgccg acagcacgtctttgaacaatacataccaatttgttgttacaaaccgtccatccatctcgc tatcgtcagatttcaatctattggcgttgttaaaaaactatggttatactaacggcaaaa acgctctgaaactagatcctaatgaagtcttcaacgtgacttttgaccgttcaatgttca ctaacgaagaatccattgtgtcgtattacggacgttctcagttgtataatgcgccgttac ccaattggctgttcttcgattctggcgagttgaagtttactgggacggcaccggtgataa actcggcgattgctccagaaacaagctacagttttgtcatcatcgctacagacattgaag ctattcaaaatagtttgataatcaacgttactgacacaggtaacgtttcatatgacttac")
So 6 nucleotides, ending with ttc:
[ctga]{3}
ttc
aa
: (?=aa)
# Three arbitrary nucleotides followed by "ttc", kept only when "aa" comes
# right after (the lookahead leaves "aa" out of the match).
str_extract_all(string = gene, pattern = "[cgta]{3}ttc(?=aa)")
## [[1]] ## [1] "catttc" "tatttc" "gatttc" "gtcttc" "ccgttc" "ctattc"
str_replace
# The replacement can be a function: it receives each match (here the number
# in front of " pounds") and returns the text to put in its place.
# round() is written as an ordinary call because %>% binds tighter than *:
# `as.numeric(x) * 0.454 %>% round(., 2)` rounds the factor to 0.45 first and
# only then multiplies. The result is converted to character, the type a
# replacement is expected to be.
"5 pounds of silver, 7 pounds of gold, 7 kg of metal" %>%
  str_replace_all(
    "[0-9]+(?= pounds)",
    function(x) {
      as.character(round(as.numeric(x) * 0.454, 2))
    }
  ) %>%
  str_replace_all("pounds", "kg")
## [1] "2.27 kg of silver, 3.18 kg of gold, 7 kg of metal"
Put an upper case letter at the beginning of each sentence. The function is toupper
text <- c("no upper case. never, and it is bad. you should put upper case letters at the begining of each sentence. try it, see if you master regex now")
.
and a space
[a-z]
) preceded by a dot and a space (?<=\\. )
^[a-z]
toupper
# Upper-case a lower-case letter when it follows ". " (lookbehind) or when
# it is the very first character; toupper() is applied to each match.
str_replace_all(
  string = text,
  pattern = "(?<=\\. )[a-z]|^[a-z]",
  replacement = toupper
)
## [1] "No upper case. Never, and it is bad. You should put upper case letters at the begining of each sentence. Try it, see if you master regex now"
: @denis_mongin
##Ex 12 calculate delay in seconds from the column Time (format is mm:ss)
# Read the timings, capture minutes and seconds from the mm:ss strings
# (columns 2 and 3 of the match matrix become V1 and V2), convert both to
# numbers and compute the delay in seconds.
# across(everything(), ...) replaces the superseded mutate_all().
df <- read.csv("./data/exemple3.csv", sep = ";")
df$Time %>%
  str_match("([0-9]{2}):([0-9]{2})") %>%
  .[, 2:3] %>%
  as_tibble() %>%
  mutate(across(everything(), as.numeric)) %>%
  mutate(delay = V2 + V1 * 60)
## # A tibble: 115 x 3 ## V1 V2 delay ## <dbl> <dbl> <dbl> ## 1 0 9 9 ## 2 0 12 12 ## 3 0 15 15 ## 4 0 19 19 ## 5 0 24 24 ## 6 0 28 28 ## 7 0 32 32 ## 8 0 35 35 ## 9 0 38 38 ## 10 0 40 40 ## # ... with 105 more rows
Express the dose per day. d is per day, w is per week, m is per month
# Read the doses, capture the amount (V1) and the period letter (V2), look up
# the number of days each period letter stands for, and divide to get the
# dose per day.
df <- read.csv("./data/exemple4.csv", sep = ";")
# Days per period: d = day, w = week, m = month (counted as 30 days).
corresp <- c(d = 1, w = 7, m = 30)
df$dose %>%
  str_match("([0-9]+) ([dwm])") %>%
  .[, 2:3] %>%
  as_tibble() %>%
  mutate(perday = corresp[V2]) %>%
  mutate(doseperday = as.numeric(V1) / perday)
## # A tibble: 9 x 4 ## V1 V2 perday doseperday ## <chr> <chr> <dbl> <dbl> ## 1 1 d 1 1 ## 2 2 d 1 2 ## 3 3 w 7 0.429 ## 4 3 w 7 0.429 ## 5 2 w 7 0.286 ## 6 10 m 30 0.333 ## 7 12 m 30 0.4 ## 8 2 d 1 2 ## 9 15 d 1 15