Optimizing Jaro-Winkler algorithm in Golang

I need to run Jaro-Winkler 1,500,000 times to find the similarity between a given []byte and existing []byte values. I took the code from https://rosettacode.org/wiki/Jaro-Winkler_distance#Go and modified it to work directly on []byte instead of string.

// JaroWinkler returns the Jaro-Winkler similarity of a and b: the Jaro
// similarity boosted by the common-prefix length (capped at 4) scaled by 0.1.
func JaroWinkler(a, b []byte) float64 {
    lenPrefix := len(commonPrefix(a, b))
    if lenPrefix > 4 {
        lenPrefix = 4
    }
    similarity := JaroSim(a, b)
    return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity))
}

// JaroSim returns the Jaro similarity of str1 and str2 as a value between 0 and 1.
func JaroSim(str1, str2 []byte) float64 {
    if len(str1) == 0 && len(str2) == 0 {
        return 1
    }
    if len(str1) == 0 || len(str2) == 0 {
        return 0
    }
    // Two characters match only if they are equal and no more than
    // matchDistance positions apart.
    matchDistance := len(str1)
    if len(str2) > matchDistance {
        matchDistance = len(str2)
    }
    matchDistance = matchDistance/2 - 1
    str1Matches := make([]bool, len(str1))
    str2Matches := make([]bool, len(str2))
    matches := 0.
    transpositions := 0.
    // Count matching characters within the sliding window.
    for i := range str1 {
        start := i - matchDistance
        if start < 0 {
            start = 0
        }
        end := i + matchDistance + 1
        if end > len(str2) {
            end = len(str2)
        }
        for k := start; k < end; k++ {
            if str2Matches[k] {
                continue
            }
            if str1[i] != str2[k] {
                continue
            }
            str1Matches[i] = true
            str2Matches[k] = true
            matches++
            break
        }
    }
    if matches == 0 {
        return 0
    }
    // Count transpositions: matched characters that appear in a different order.
    k := 0
    for i := range str1 {
        if !str1Matches[i] {
            continue
        }
        for !str2Matches[k] {
            k++
        }
        if str1[i] != str2[k] {
            transpositions++
        }
        k++
    }
    transpositions /= 2
    return (matches/float64(len(str1)) +
        matches/float64(len(str2)) +
        (matches-transpositions)/matches) / 3
}

// commonPrefix returns the longest common prefix of first and second.
// The []byte conversion from the original string version is no longer needed.
func commonPrefix(first, second []byte) []byte {
    if len(first) > len(second) {
        first, second = second, first
    }
    var commonLen int
    for i, b := range first {
        if b != second[i] {
            break
        }
        commonLen++
    }
    return second[:commonLen]
}
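
Assuming these functions sit in a main package, a quick sanity check might look like the sketch below; the input pair is my own example, and for the classic "MARTHA"/"MARHTA" pair it prints about 0.961.

package main

import "fmt"

func main() {
    // Classic textbook pair: Jaro-Winkler similarity is roughly 0.961.
    a := []byte("MARTHA")
    b := []byte("MARHTA")
    fmt.Printf("%.3f\n", JaroWinkler(a, b))
}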


I benchmarked it against the https://github.com/adrg/strutil package, and my version outperformed it.

(benchmark results screenshot)
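
For anyone reproducing the comparison, a minimal benchmark of the []byte version can be written with Go's testing package; the inputs and file layout below are placeholders I chose, not the setup from the screenshot.

package main

import "testing"

// Package-level sink so the compiler cannot elide the call under test.
var sink float64

// Run with: go test -bench=JaroWinkler -benchmem
func BenchmarkJaroWinkler(b *testing.B) {
    x := []byte("accommodation")
    y := []byte("acommodation")
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        sink = JaroWinkler(x, y)
    }
}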

It is still too slow for my use case. Can it be optimised further?
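
One direction worth trying, offered only as my own hedged sketch rather than anything from the question: JaroSim allocates two []bool slices on every call, which adds up over 1,500,000 calls. Reusing scratch buffers and clearing them between calls removes that per-call allocation; the jaroScratch type and reuse helper below are hypothetical names introduced purely for illustration.

// jaroScratch holds reusable match buffers so repeated similarity calls
// do not allocate two fresh []bool slices each time.
// Not safe for concurrent use; give each worker goroutine its own scratch.
type jaroScratch struct {
    m1, m2 []bool
}

// reuse returns buf resized to n with every element reset to false,
// allocating a new slice only when the existing capacity is too small.
func reuse(buf []bool, n int) []bool {
    if cap(buf) < n {
        return make([]bool, n)
    }
    buf = buf[:n]
    for i := range buf {
        buf[i] = false
    }
    return buf
}

Inside JaroSim, each make([]bool, ...) would then be replaced by something like scratch.m1 = reuse(scratch.m1, len(str1)); str1Matches := scratch.m1, with the scratch value owned by the caller (one per goroutine), so the hot loop clears memory instead of allocating it.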
