Fuzzy matching (and overwriting) vector entries

114 views Asked by At

I have 5 vectors with column names, which are similar, but not identical.

I am trying to find a way to correct the entries in vector2, vector3, vector4, vector5, based on the names in vector1.

I have been getting some ideas here and here, leading to the code below. But in the end, I get stuck even comparing the first two vectors, let alone overwriting them.

library(dplyr)
library(fuzzyjoin)


# Reference names; the entries of the other vectors should be corrected
# towards these.
vector1 <- c("something", "nothing", "anything", "number4")
# Similar, but not identical, variants of the reference names.
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")

I started out as follows:

# Edit-distance matrix: one row per vector1 entry, one column per vector2
# entry.
edit_dist <- adist(x = vector1, y = vector2)

# For every vector1 entry, the position of the nearest vector2 entry
# (which.min picks the first position when several are tied).
nearest_idx <- apply(edit_dist, 1, which.min)
nearest_idx

# Pair each reference name with its nearest vector2 entry.
data.frame(string_to_match = vector1,
           closest_match = vector2[nearest_idx])

           
  string_to_match closest_match
1       something    some thing
2         nothing      no thing
3        anything      anything
4         number4       number4

Is there any way to add the distance to this solution and to overwrite the vector based on the distance?

Desired result:

  string_to_match closest_match  distance
1       something    some thing   1
2         nothing      no thing   1
3        anything      anything   0
4         number4       number4   0

# Desired content of the vectors once the entries have been corrected
# towards the names in vector1.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("something", "nothing", "addition", "anything", "number4")
vector3 <- c("something", "nothing", "anything")
vector4 <- c("something", "nothing", "anything", "number4")
vector5 <- c("something", "nothing", "anything", "number4")

Is there anyone who can put me on the right track?

1

There is 1 answer

0
Arthur Yip On BEST ANSWER

fuzzyjoin functions will add the distance metric. You don't need to overwrite if you just select the closest_match column/vector.

library(fuzzyjoin); library(dplyr)
# Example data from the question: vector1 holds the reference names,
# vector2..vector5 hold the variants to be matched against it.
vector1 <- c("something", "nothing", "anything", "number4")
vector2 <- c("some thing", "no thing", "addition", "anything", "number4")
vector3 <- c("some thing wrong", "nothing", "anything_")
vector4 <- c("something", "nothingg", "anything", "number_4")
vector5 <- c("something", "nothing", "anything happening", "number4")

# Solution for the desired output for vector 2: pair each vector1 name with
# the vector2 entries at most 1 edit away and record that distance in a
# "distance" column.
tibble(things = vector1) %>%
  stringdist_left_join(y = tibble(things = vector2),
                       max_dist = 1, distance_col = "distance")
#> Joining by: "things"
#> # A tibble: 4 x 3
#>   things.x  things.y   distance
#>   <chr>     <chr>         <dbl>
#> 1 something some thing        1
#> 2 nothing   no thing          1
#> 3 anything  anything          0
#> 4 number4   number4           0

# Fuller solution for vector 3 (or any other vector): list every pairing of a
# vector3 entry with a vector1 name, together with its distance. max_dist = 99
# is well above any distance between these short strings, so no pairing is
# dropped.
(full_table_of_possible_matches_for_vector3 <-
   tibble(things = vector3) %>%
   stringdist_left_join(y = tibble(things = vector1),
                        max_dist = 99, distance_col = "distance"))
#> Joining by: "things"
#> # A tibble: 12 x 3
#>    things.x         things.y  distance
#>    <chr>            <chr>        <dbl>
#>  1 some thing wrong something        7
#>  2 some thing wrong nothing         10
#>  3 some thing wrong anything        11
#>  4 some thing wrong number4         14
#>  5 nothing          something        3
#>  6 nothing          nothing          0
#>  7 nothing          anything         2
#>  8 nothing          number4          6
#>  9 anything_        something        5
#> 10 anything_        nothing          3
#> 11 anything_        anything         1
#> 12 anything_        number4          8
# Keep only the nearest vector1 name for every vector3 entry.
# NOTE: grouping on things.x treats repeated vector3 entries as one group, so
# this assumes the entries of vector3 are unique.
(table_of_closest_matches <- full_table_of_possible_matches_for_vector3 %>%
  group_by(things.x) %>%
  # row_number() gives tied distances distinct ranks, so exactly one row per
  # group has rank 1. filter() is used rather than
  # slice_min(distance, with_ties = FALSE) because slice_min would change the
  # row order, and the order must line up with vector3.
  mutate(rank = row_number(distance)) %>%
  filter(rank == 1) %>%
  # Drop the grouping so later verbs on this table do not run per group.
  ungroup())
#> # A tibble: 3 x 4
#>   things.x         things.y  distance  rank
#>   <chr>            <chr>        <dbl> <int>
#> 1 some thing wrong something        7     1
#> 2 nothing          nothing          0     1
#> 3 anything_        anything         1     1
# Corrected vector3: the matched vector1 name for each original entry.
(new_vector3 <- table_of_closest_matches[["things.y"]])
#> [1] "something" "nothing"   "anything"

# Snap every entry of vector2 to its closest name in vector1.
# A row id is carried through the join so that repeated entries in vector2 are
# ranked separately. Grouping on the string itself would merge duplicates into
# one group and return fewer elements than vector2 has.
(new_vector2 <- stringdist_left_join(x = tibble(row_id = seq_along(vector2),
                                                things = vector2),
                                     y = tibble(things = vector1),
                                     max_dist = 99, distance_col = "distance") %>%
    group_by(row_id) %>%
    # row_number() gives tied distances distinct ranks, so exactly one match
    # is kept per entry; filter() keeps the rows in vector2 order.
    mutate(rank = row_number(distance)) %>%
    filter(rank == 1) %>%
    ungroup() %>%
    pull(things.y))
#> Joining by: "things"
#> [1] "something" "nothing"   "anything"  "anything"  "number4"

Created on 2021-01-06 by the reprex package (v0.3.0)