I need to run Jaro-Winkler 1,500,000 times to find the similarity between a given []byte
and existing []byte values.
I took the code from https://rosettacode.org/wiki/Jaro-Winkler_distance#Go and modified it to work directly on []byte
instead of string
.
func JaroWrinkler(a, b []byte) float64 {
lenPrefix := len(commonPrefix(a, b))
if lenPrefix > 4 {
lenPrefix = 4
}
// Return similarity.
similarity := JaroSim(a, b)
return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity))
}
// JaroSim returns the Jaro similarity of str1 and str2 in [0, 1],
// treating each byte as one character. 1 means identical, 0 means
// no matching bytes within the matching window.
func JaroSim(str1, str2 []byte) float64 {
	// Two empty inputs are identical by convention.
	if len(str1) == 0 && len(str2) == 0 {
		return 1
	}
	if len(str1) == 0 || len(str2) == 0 {
		return 0
	}
	// Matching window: floor(max(len)/2) - 1, clamped to >= 0.
	// BUG FIX: the original let this underflow to -1 for single-byte
	// inputs, so JaroSim([]byte("a"), []byte("a")) returned 0 instead of 1.
	matchDistance := len(str1)
	if len(str2) > matchDistance {
		matchDistance = len(str2)
	}
	matchDistance = matchDistance/2 - 1
	if matchDistance < 0 {
		matchDistance = 0
	}
	str1Matches := make([]bool, len(str1))
	str2Matches := make([]bool, len(str2))
	// Count matching bytes: equal bytes within matchDistance of each
	// other, each byte of str2 usable at most once.
	matches := 0
	for i := range str1 {
		start := i - matchDistance
		if start < 0 {
			start = 0
		}
		end := i + matchDistance + 1
		if end > len(str2) {
			end = len(str2)
		}
		for k := start; k < end; k++ {
			if str2Matches[k] || str1[i] != str2[k] {
				continue
			}
			str1Matches[i] = true
			str2Matches[k] = true
			matches++
			break
		}
	}
	if matches == 0 {
		return 0
	}
	// Transpositions: matched bytes that appear in a different order,
	// counted pairwise (hence the final halving).
	transpositions := 0
	k := 0
	for i := range str1 {
		if !str1Matches[i] {
			continue
		}
		for !str2Matches[k] {
			k++
		}
		if str1[i] != str2[k] {
			transpositions++
		}
		k++
	}
	m := float64(matches)
	t := float64(transpositions) / 2
	return (m/float64(len(str1)) +
		m/float64(len(str2)) +
		(m-t)/m) / 3
}
// commonPrefix returns the longest common leading byte sequence of
// first and second (a sub-slice of first; empty when they differ at
// byte 0 or either input is empty).
func commonPrefix(first, second []byte) []byte {
	// Bound the scan by the shorter input; no swap or re-conversion
	// needed (the original converted an already-[]byte value with
	// []byte(second) and sliced the longer operand for no benefit).
	n := len(first)
	if len(second) < n {
		n = len(second)
	}
	i := 0
	for i < n && first[i] == second[i] {
		i++
	}
	return first[:i]
}
I benchmarked it against the https://github.com/adrg/strutil package, and it outperformed that package.
However, it is still too slow for my use case. Can it be optimised further?