Partial match column in dataframe to create new dataframe

122 views Asked by At

I'm running into an issue with encoding and partial matching.

I have two data frames, A and B. A called in via UTF-8 encoding and B on Latin1. This could already be part of the issue although I'm not sure. This was the only way I knew how to import it properly.

edit: I should clarify. This is just sample data. Both dataframes contain a large number of rows and other columns as well.

           A                                                        B
ID       Name    Expense                              Employee           Category
1    Mike Adall   3                                   Lothar Fiend          B2
2   Brian Adams   4                                   Rohan Sudarsh         A2
3        Adrián   1                                   Adrián Silva          A1
4     Floyd Oid   1                                   Semi Ajayi            A1
5    Semi Ajayi   4                                   Micheal Adall         A1
6      Jomu Aké   3                                   Jomü Ria Aké          B1
                                                      Brian Adams           B2
                                                      Floyd Öid Matheus     B1       

            

I've been trying to extract the B$Employee$ and partially match them with A$Name to create a new df C that would include B$Category. This is the output that I would like.

edit: With Category, I would also want to include all the other columns of both A & B excluding Employee.

             C
ID       Name    Expense   Category
1    Mike Adall   3        A1
2   Brian Adams   4        B2
3        Adrián   1        A1
4     Floyd Oid   1        B1
5    Semi Ajayi   4        A1
6      Jomu Aké   3        B1

So far I've got it to match 80% of the characters using the fuzzyjoin package.

C <- A %>% fuzzy_inner_join(B, by = c(Name = "Employee"))

The main issue seems to be these odd latin characters such as Ö,ß, etc. or sometimes when it occurs at the end of a name like 'Aké'. The results seem to vary from name to name.

How could I get it to partially match all the names?

2

There are 2 answers

1
Onyambu On

In base R, you could use both agrep and adist as follows:

d<-sapply(A$Name,agrep, B$Employee)
d[e]<-max.col(-adist(e<-names(Filter(Negate(length),d)), B$Employee))
cbind(A,B[unlist(d),])

 ID        Name Expense          Employee Category
5  1  Mike Adall       3     Micheal Adall       A1
7  2 Brian Adams       4       Brian Adams       B2
3  3      Adrián       1      Adrián Silva       A1
8  4   Floyd Oid       1 Floyd Öid Matheus       B1
4  5  Semi Ajayi       4        Semi Ajayi       A1
6  6    Jomu Aké       3      Jomü Ria Aké       B1

EDIT:

using the stringdist package: You could do:

cbind(A, B[max.col(-t(sapply(A$Name,stringdist::stringdist,B$Employee,"lcs"))),])
  ID        Name Expense          Employee Category
5  1  Mike Adall       3     Micheal Adall       A1
7  2 Brian Adams       4       Brian Adams       B2
3  3      Adrián       1      Adrián Silva       A1
8  4   Floyd Oid       1 Floyd Öid Matheus       B1
4  5  Semi Ajayi       4        Semi Ajayi       A1
6  6    Jomu Aké       3      Jomü Ria Aké       B1
1
Ferroao On

This method will only result in one match (column match), because which.min and max.col are length one even when there are distance ties.

It is important to check manually ties. Ties can be checked in data.frame res, column minMatchSeveral, or in the second script below.

require(stringdist)
{
firstvector <-A$Name
secondvector<-B$Employee   
threshold <- 14   # max 14 characters of divergence

lenMin<-mindist<-integer()
match <- minMatchSeveral <- sortedmatches <- character()

for (i in 1:length(firstvector) ) {
  matchdist <- stringdist::stringdist(firstvector[i],secondvector,"lcs") # several methods available
  matchdist <- ifelse(matchdist>threshold,NA,matchdist)
  sortedmatches[i] <- paste(secondvector[order(matchdist, na.last=NA)], collapse = ", ")
  mindist[i]<- tryCatch(ifelse(is.integer(which.min(matchdist)),matchdist[which.min(matchdist)],NA), error = function(e){NA})
  lenMin[i] <- tryCatch(length(matchdist[which(matchdist == min(matchdist, na.rm=T) ) ]),warning = function(w){""} )
  match[i]<-ifelse(length(secondvector[which.min(matchdist)])==0,NA,
                   secondvector[which.min(matchdist)] )
  minMatchSeveral[i] <- ifelse(lenMin[i]>1, 
                               suppressWarnings(ifelse(length(secondvector[which(matchdist==min(matchdist, na.rm=T) )  ] )==0,
                                                       NA,
                                                       paste(secondvector[ which(matchdist==min(matchdist, na.rm=T) )  ], collapse = ", " )
                               ))
                               , NA) 
}

res<-data.frame(firstvector=firstvector,
                match=match,divergence=mindist, 
                lenMin= lenMin,
                minMatchSeveral = minMatchSeveral,
                sortedmatches=sortedmatches, 
                stringsAsFactors = F)
}
res
  firstvector             match divergence lenMin              minMatchSeveral                                                                                   sortedmatches
1  Mike Adall     Micheal Adall          5      2 Micheal Adall, Micheol Adall                                           Micheal Adall, Micheol Adall, Brian Adams, Semi Ajayi
2 Brian Adams       Brian Adams          0      1                         <NA>              Brian Adams, Rohan Sudarsh, Micheal Adall, Adrián Silva, Semi Ajayi, Micheol Adall
3      Adrian      Adrián Silva          8      1                         <NA> Adrián Silva, Brian Adams, Lothar Fiend, Semi Ajayi, Micheal Adall, Micheol Adall, Jomü Ria Aké
4   Floyd Oid Floyd Öid Matheus         10      1                         <NA>                                                                 Floyd Öid Matheus, Lothar Fiend
5  Semi Ajayi        Semi Ajayi          0      1                         <NA>                                                           Semi Ajayi, Brian Adams, Jomü Ria Aké
6    Jomu Aké      Jomü Ria Aké          6      1                         <NA>                                                                        Jomü Ria Aké, Semi Ajayi

A$match<-match
# For large tables, consider using data.table::merge
C <- merge(A, B, by.x="match", by.y = "Employee", all.x=T)
C[,2:ncol(C)]

  ID        Name Expense Category
1  3      Adrián       1       A1
2  2 Brian Adams       4       B2
3  4   Floyd Oid       1       B1
4  6    Jomu Aké       3       B1
5  1  Mike Adall       3       A1
6  5  Semi Ajayi       4       A1

From the ?stringdist-metrics

The longest common substring (method='lcs') is defined as the longest string that can be obtained by pairing characters from a and b while keeping the order of characters intact. The lcs-distance is defined as the number of unpaired characters. The distance is equivalent to the edit distance allowing only deletions and insertions, each with weight one.

In addition you can take a look at stringi::stri_trans_general

EDIT: another way to visualize ties

{
mm  <- -t(sapply(A$Name,stringdist::stringdist,B$Employee,"lcs"))
idx <- mm[cbind(seq_along(max.col(mm)),max.col(mm))]
ties <-sapply(seq_along(mm[,1]), function(x) which(mm[x,] %in% idx[x]) )
list <-sapply(ties, function(x) paste(B[x,] ), simplify=F)
my<-as.matrix(do.call("rbind",list) )
dimnames( my)[[2]] <- c("closestMatch","Category") 
cbind(A, my )  
}

  ID        Name Expense                        closestMatch      Category
1  1  Mike Adall       3 c("Micheal Adall", "Micheol Adall") c("A1", "A1")
2  2 Brian Adams       4                         Brian Adams            B2
3  3      Adrian       1                        Adrián Silva            A1
4  4   Floyd Oid       1                   Floyd Öid Matheus            B1
5  5  Semi Ajayi       4                          Semi Ajayi            A1
6  6    Jomu Aké       3                        Jomü Ria Aké            B1

data

{
A<-read.table(text="ID       Name    Expense
1    \"Mike Adall\"   3                             
2   \"Brian Adams\"   4                             
3        \"Adrian\"   1                             
4     \"Floyd Oid\"   1                             
5    \"Semi Ajayi\"   4                             
6      \"Jomu Aké\"   3 ", header=T, stringsAsFactors = F)                            
 
B<-read.table(text="Employee           Category
\"Lothar Fiend\"          B2
\"Rohan Sudarsh\"         A2
\"Adrián Silva\"          A1
\"Semi Ajayi\"            A1
\"Micheal Adall\"         A1
\"Micheol Adall\"         A1 # testing ties
\"Jomü Ria Aké\"          B1
\"Brian Adams\"           B2
\"Floyd Öid Matheus\"     B1", header=T, stringsAsFactors = F)
}