18/09/2020
: @denis_mongin
⟶ 6000 lines of and no life in 4 months
library(REDCapR) token = readChar("token.txt",file.info("token.txt")$size) uri = readChar("uri.txt",file.info("uri.txt")$size) dump <- redcap_read(batch_size = 1000, redcap_uri = uri, verbose = F, token = token) head(dump$data)
## record_id name surname gusta quetegusta___1 quetegusta___2 quetegusta___3 ## 1 1 sergio smith 0 0 0 0 ## 2 2 Gonzales pepe 1 1 1 1 ## 3 3 sanchez pepe 0 0 0 0 ## fecha1 rhispano_complete ## 1 2020-09-17 2 ## 2 2020-09-17 2 ## 3 2020-09-18 2
test on Rstudio
plouf <- data.frame(record_id = "10", name = "mongin", surname = "denis", gusta = "1", quetegusta___1 = "1", quetegusta___2 = "1", quetegusta___3 = "0", fecha1 = "2020-09-18", rhispano_complete = "2") redcap_write(token = token, redcap_uri = uri, ds_to_write = plouf)
* main problem in the beginning: people get several tests
⟶ detect duplicated entries
I use data.table
(the best R package ever for data management).
df$a <- 1
df$a[df$b == 0] <- 1
df %>% group_by(b)%>% mutate(c = a - mean(a))
dt[, a := 1]
dt[b == 0, a := 1]
dt[,c:= a - mean(a),by = b]
df$name %>% tolower(.) %>% chartr("âäëéèêóôöüûçîï", "aaeeeeooouucii", .)%>% gsub("\\bla\\b|\\bde\\b|\\bda\\b|\\bdu\\b|\\bl'\\b", "", .) %>% gsub("[ ]+|[-]|[,]", "", .)
## [1] "mongin" "mongin" "smith" "cruz" "cruz" "smith" "mongin" "smith" ## [9] "smith"
clean_name <- function(name) { name_clean <- name %>% tolower(.) %>% chartr("âäëéèêóôöüûçîï", "aaeeeeooouucii", .) %>% gsub("\\bla\\b|\\bde\\b|\\bda\\b|\\bdu\\b|\\bl'\\b", "", .) %>% gsub("[ ]+|[-]|[,]", "", .) return(name_clean) } clean_name(df$surname)
## [1] "denis" "denisdiego" "paul" "sofia" "sophiaelena" ## [6] "paul" "denis" "paul" "etienne"
df[,name_clean := clean_name(name)] df[,surname_clean := clean_name(surname)]
persons: index for each common name, surname and birthdate:
df[,person := .GRP,by = .(name_clean,surname_clean,birthdate)]
library(qualV) comp_names <- function(a, b) { a <- clean_name(a) # cleaning" b <- clean_name(b) av <- strsplit(a, '')[[1]] # splitting all characters bv <- strsplit(b, '')[[1]] match_str <- paste(qualV::LCS(av, bv)$LCS, collapse='') pc <- round(nchar(match_str)/(min(nchar(a), nchar(b)))*100) return(pc) }
comp_names("denis","denis diego")
## [1] 100
100% similar
comp_names("denis","john")
## [1] 25
25% similar: there is a common n
over 5 letters
The distance is just 100 - similarity
. Here the matrix:
comp_names_matrix <- function(vec) { mat <- sapply(vec, function(x) { 100 - as.numeric(Vectorize(comp_names)(x, vec)) }) rownames(mat) <- vec return(mat) }
comp_names_matrix(df$surname)
## denis denis diego paul sofia sophia, Elèna Paul dénis Paul ## denis 0 0 100 80 60 100 0 100 ## denis diego 0 0 100 60 70 100 0 100 ## paul 100 100 0 75 25 0 100 0 ## sofia 80 60 75 0 20 75 80 75 ## sophia, Elèna 60 70 25 20 0 25 60 25 ## Paul 100 100 0 75 25 0 100 0 ## dénis 0 0 100 80 60 100 0 100 ## Paul 100 100 0 75 25 0 100 0 ## Etienne 60 57 100 80 57 100 60 100 ## Etienne ## denis 60 ## denis diego 57 ## paul 100 ## sofia 80 ## sophia, Elèna 57 ## Paul 100 ## dénis 60 ## Paul 100 ## Etienne 0
clustering, to identify persons which are the same:
df$surname %>% comp_names_matrix(.) %>% stats::as.dist(.) %>% stats::hclust(.) %>% plot
I cut for a defined threshold, to define an index for same identified name:
compare_tree <- function(vec, thres = 20) { if(length(vec)>1){ output <- vec %>% comp_names_matrix(.) %>% stats::as.dist(.) %>% stats::hclust(.) %>% # cluster stats::cutree(h = thres)}else{output <- as.integer(1)} return(output) } compare_tree(df$surname,20)
## denis denis diego paul sofia sophia, Elèna ## 1 1 2 3 3 ## Paul dénis Paul Etienne ## 2 1 2 4
Perfect !!
Do that on name, surname and birthdate:
df[,name_ind := compare_tree(name,30)] df[,surname_ind := compare_tree(surname,30)] df[,birthdate_ind := compare_tree(birthdate,30)]
define an index for the common detected name, surname and birthdate:
df[,person := .GRP,by = .(birthdate_ind,surname_ind,name_ind)]
my computer:
I have 60’000 patients, so 3.6 billion of distances
Gargl
⟶ restrict degress of freedom!
index of similar birthdate and same name and surname
df[,idx1 := paste0("1_",.GRP,"_",compare_tree(birthdate,20)), by = .(name_clean,surname_clean)]
index of similar surname and same name and birthdate
df[,idx2 := paste0("2_",.GRP,"_",compare_tree(surname,30)), by = .(name_clean,birthdate)]
index of similar name and same surname and birthdate
df[,idx3 := paste0("3_",.GRP,"_",compare_tree(name,30)), by = .(surname_clean,birthdate)]
df[, rn := .I] DT <- rbindlist(list( df[, .(s=idx1, e=idx2, rn)], df[, .(s=idx1, e=idx3, rn)], df[, .(s=idx2, e=idx3, rn)]))
define the graph:
#find linked clusters library(igraph) g <- graph_from_data_frame(DT, directed=FALSE) print(g)
## IGRAPH 6db954e UN-- 20 27 -- ## + attr: name (v/c), rn (e/n) ## + edges from 6db954e (vertex names): ## [1] 1_1_1--2_1_1 1_2_1--2_1_1 1_3_1--2_2_1 1_4_1--2_3_1 1_5_1--2_3_1 ## [6] 1_3_1--2_4_1 1_1_1--2_5_1 1_3_1--2_2_1 1_6_1--2_2_2 1_1_1--3_1_1 ## [11] 1_2_1--3_2_1 1_3_1--3_3_1 1_4_1--3_4_1 1_5_1--3_5_1 1_3_1--3_6_1 ## [16] 1_1_1--3_7_1 1_3_1--3_3_1 1_6_1--3_8_1 2_1_1--3_1_1 2_1_1--3_2_1 ## [21] 2_2_1--3_3_1 2_3_1--3_4_1 2_3_1--3_5_1 2_4_1--3_6_1 2_5_1--3_7_1 ## [26] 2_2_1--3_3_1 2_2_2--3_8_1
obtain cluster, i.e. persons!
cl <- clusters(g)$membership DT[, g := cl[s]] #look up grouping for each vertex df[unique(DT, by="rn"), on=.(rn), person := g]
Few minutes with more than 60000 persons to look for
detect_duplicate(df[,.(name,surname,birthdate)])
## name surname birthdate person ## 1: Mongin denis 29-08-1986 1 ## 2: mongin denis diego 29-08-1986 1 ## 3: smith paul 02-06-1945 2 ## 4: De la cruz sofia 02-03-2000 3 ## 5: de-la-cruz sophia, Elèna 02-03-2000 3 ## 6: smith Paul 20-06-1945 2 ## 7: mongin dénis 29-07-1986 1 ## 8: SMITH Paul 02-06-1945 2 ## 9: smith Etienne 02-06-1945 4
I want to find peoples in a data frame (eg lab results, or hospitalisations)
## name surname birthdate ## 1: mongin denis 29-09-1986
I create variable to indicate line and provenance of each data frame:
tofind[,rn_df := .I] df[,rn_df := .I] tofind[,dfframe := "tosearch"] df[,dfframe := "database"]
dftot <- rbind(tofind,df[,.(name,surname,birthdate,rn_df,dfframe)]) head(dftot)
## name surname birthdate rn_df dfframe ## 1: mongin denis 29-09-1986 1 tosearch ## 2: Mongin denis 29-08-1986 1 database ## 3: mongin denis diego 29-08-1986 2 database ## 4: smith paul 02-06-1945 3 database ## 5: De la cruz sofia 02-03-2000 4 database ## 6: de-la-cruz sophia, Elèna 02-03-2000 5 database
res <- detect_duplicate(dftot)
We can get the line correspondance between the two data frames:
dftot[,expand.grid(rn_tofind = rn_df[dfframe == "tosearch"], rn_tosearch = rn_df[dfframe == "database"]), by = person]
## person rn_tofind rn_tosearch ## 1: 1 1 1 ## 2: 1 1 2 ## 3: 1 1 7
Masterpiece of COVID response of Geneva !
source: https://gitlab.com/dmongin/r-demos/-/tree/master/R_COVID_Rhispano
Follow me: @denis_mongin