Aho-Corasick for partial text matching

550 views Asked by At

I need to extend the Aho-Corasick algorithm to match partial dictionary entries. For that, I modified an implementation in JS and I made some tests but it does not work well yet

Here is my code:

var AhoCorasick = function (keywords) {
  this._buildTables(keywords);
};

AhoCorasick.prototype._buildTables = function (keywords) {
  var gotoFn = {
    0: {},
  };
  var output = {};

  var state = 0;
  keywords.forEach(function (word) {
    var curr = 0;
    for (var i = 0; i < word.length; i++) {
      var l = word[i];
      if (gotoFn[curr] && l in gotoFn[curr]) {
        curr = gotoFn[curr][l];
      } else {
        state++;
        gotoFn[curr][l] = state;
        gotoFn[state] = {};
        curr = state;
        output[state] = [];
      }
    }

    output[curr].push(word);
  });

  var failure = {};
  var xs = [];

  // f(s) = 0 for all states of depth 1 (the ones from which the 0 state can transition to)
  for (var l in gotoFn[0]) {
    var state = gotoFn[0][l];
    failure[state] = 0;
    xs.push(state);
  }

  while (xs.length) {
    var r = xs.shift();
    // for each symbol a such that g(r, a) = s
    for (var l in gotoFn[r]) {
      var s = gotoFn[r][l];
      xs.push(s);

      // set state = f(r)
      var state = failure[r];
      while (state > 0 && !(l in gotoFn[state])) {
        state = failure[state];
      }

      if (l in gotoFn[state]) {
        var fs = gotoFn[state][l];
        failure[s] = fs;
        output[s] = output[s].concat(output[fs]);
      } else {
        failure[s] = 0;
      }
    }
  }

  this.gotoFn = gotoFn;
  this.output = output;
  this.failure = failure;
};

AhoCorasick.prototype.search = function (string) {
  //console.log(this.gotoFn);
  var noErrorString = '';
  var state = 0;
  var results = [];
  var h = 0;
  var init = false;
  aho_loop: for (var i = 0; i < string.length; i++) {
    var l = string[i];
    while (state > 0 && !(l in this.gotoFn[state])) {
      state = this.failure[state];
      noErrorString = '';
    }
    if (!(l in this.gotoFn[state])) {
      if (h != 0 || !init) {
        h++;
        init = true;
        var results2 = [];
        for (key in this.gotoFn) {
          for (subkey in this.gotoFn[key]) {
            if (subkey == l) {
              results2.push(this.gotoFn[key][l]);
            }
          }
        }
      }
      console.log(l);
      console.log(results2);
      while (h < results2.length) {
        state = results2[h];
        noErrorString += l;
        h++;
        continue aho_loop;
      }
      h = 0;
      init = false;

      continue;
    }

    state = this.gotoFn[state][l];
    noErrorString += l;
    h = 0;
    init = false;

    if ((noErrorString.match(/ /g) || []).length > 1) {
      var foundStrs = noErrorString;

      if (string !== undefined) {
        if (i + 1 < string.length) {
          if (!string[i + 1].match(/\w+/g)) {
            if (typeof string[i - foundStrs.length] === 'undefined') {
              results.push([i, foundStrs]); // + 1
            } else if (!string[i - foundStrs.length].match(/\w+/g)) {
              results.push([i, foundStrs]); // + 1
            }
          }
        } else {
          results.push([i, foundStrs]);
        }
      }
    }
  }

  return results;
};

var ac = new AhoCorasick([
  'brownie t test keyword1 hello stop',
  'lele brown',
  'etc',
]);
var results = ac.search('test keyword1');

When i enter this entry in the dictionary:

brownie t test keyword1 hello stop

The result is:

"test keyword1 "

So is good, but if i enter this:

brownie test test keyword1 hello stop

The result is:

"st keyword1 "

And when i enter this:

brownie t t test keyword1 hello stop

The result is empty...

Can someone help me find my error?

Thanks!

0

There are 0 answers