Word frequency for array of key/values on javascript

4.4k views Asked by At

I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:

[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]

I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var newArray = [];
    $.each(wordArray, function (ix, word) {
        if (newArray.length >= 1){
            newArray.some(function (w){
                if (w.text === word){
                    w.size++;
                } else {
                    newArray.push({text: word, size: 1});
                }
            });
        } else {
            newArray.push({text: word, size: 1});
        }
    });
    return newArray;
}
5

There are 5 answers

0
LJᛃ On BEST ANSWER

Array.prototype.some expects the given callback to return true or false and returns true as soon as your callback returns true for a given element, otherwise it returns false.

So some iterates over all elements, with your given callback, and your callback checks if the given element text equals the search word and if not adds a new object. Introducing a new element the some function can iterate over.

So to make this clear, for every word thats in the newArray before the word you're searching, you're adding a new object containing your word.

Suppose your newArray looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"}]

after calling your function for the word even it looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"},{word:"even"},{word:"even"},{word:"even"},{word:"even"}]

Using Array.prototype.filter would be the better approach here, finding you the matching element, note that I also replaced $.each with Array.prototype.forEach:

function wordFrequency(txt){
  var wordArray = txt.split(/[ .?!,*'"]/);
  var newArray = [], wordObj;
  wordArray.forEach(function (word) {
    wordObj = newArray.filter(function (w){
      return w.text == word;
    });
    if (wordObj.length) {
      wordObj[0].size += 1;
    } else {
      newArray.push({text: word, size: 1});
    }
  });
  return newArray;
}
document.write(JSON.stringify(wordFrequency("count everything, count all the words, count all the words!").sort(function(a,b){return a.size<b.size})).split("},").join("}<br/>"));

1
Ahmad Mageed On

To tell the frequency all you need is a hash map approach. Your algorithm is quadratic, since the some method is nested in the each method, so you're always looping over the newArray just to find an entry and increment the size.

A map approach is easily achievable using a JavaScript object. It also gives you constant look-up time, which is better performance than the nested loops approach.

Try this approach instead:

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var map = {};
    $.each(wordArray, function(ix, word) {
      // skip empty results
      if (!word.length) {
        return;
      }
      // add word to map
      if (!map[word]) {
        map[word] = 0;
      } 
      map[word]++;
    });
    return map;
}

To use the function:

var text = "hello!world*hello foo  'bar'foo";
var result = wordFrequency(text);

// iterate over results
Object.keys(result).forEach(function(w) {
  console.log(w + ": " + result[w]);
});

// or use for...in
for (var w in result) {
  console.log(w + ": " + result[w]);
}

If you really wanted to, you could then map the result into your desired array format with text and size properties:

var mappedResult = Object.keys(result).map(function(w) {
  return { text: w, size: result[w] };
});
console.log(mappedResult);

Also, depending on your target browsers, you might consider using the array forEach instead of the jQuery $.each, similar to what I did with the Object.keys portion.

Here's the JSBin example.

7
masswerk On

You would probably want to avoid any iterations on duplicate elements and keep your results array unique. Since any of the iterators of Array.prototype will include each of the elements, they might not be the ideal solution for this. Sometimes plain old loops do the job best ... (You may also want to expressively escape any special characters in your regular expression).

function wordFrequency(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [];
    for (var i = 0; i < words.length; i++) {
        var w = words[i],
            found = false;
        for (var j = 0; j < seen.length; j++) {
            if (w === seen[j].text) {
                seen[j].size++;
                found = true;
                break;
            }
        }
        if (!found) seen.push( { text: w, size: 1 } );
    }
    return seen;
}

(Note that the inner for-loop isn't visited for the first word, so the first word will be pushed to the seen-stack and the inner for-loop will start with the second word compared to the first one. Only words that we haven't seen already are added to the seen-stack, making it an array of unique elements.)

And here is the equivalent using Array.prototype.forEach() and Array.prototype.indexOf(), but we have to add another intermediate results stack for the latter one. So we'll have to add another iteration to produce the final result. (We wouldn't have to do this using Array.prototype.findIndex(), but this is not a standard method.)

function wordFrequency2(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [],
        freq = [];
    // get frequencies
    words.forEach(function (w) {
        var idx = seen.indexOf(w);
        if (idx >= 0) {
            freq[idx]++;
        }
        else {
            seen.push(w);
            freq.push(1);
        }
    });
    // produce the results array
    var r = [];
    seen.forEach(function (w, idx) {
        r.push( { text: w, size: freq[idx] } );
    });
    return r;
}

Putting optimization into account, the first version using explicit loops will be probably performing faster ...

0
Alnitak On

It would be simpler and far more efficient to create a direct map from word to frequency, and only afterwards convert that to your array structure. Given an array words create a map of the words:

var freq = words.reduce(function(p, c) {
    p[c] = (p[c] || 0) + 1;
    return p;
}, {});

and the convert that map into your array:

var array = Object.keys(freq).map(function(key) {
   return { text: key, size: freq[key] };
});
0
Andrew On
var words = (function(){

var sWords = document.body.innerText.toLowerCase().trim().replace(/[,;.]/g,'').split(/[\s\/]+/g).sort();
var iWordsCount = sWords.length; // count w/ duplicates

// array of words to ignore
var ignore = ['and','the','to','a','of','for','as','i','with','it','is','on','that','this','can','in','be','has','if'];
ignore = (function(){
var o = {}; // object prop checking > in array checking
var iCount = ignore.length;
for (var i=0;i<iCount;i++){
o[ignore[i]] = true;
}
return o;
}());

var counts = {}; // object for math
for (var i=0; i<iWordsCount; i++) {
var sWord = sWords[i];
if (!ignore[sWord]) {
counts[sWord] = counts[sWord] || 0;
counts[sWord]++;
}
}

var arr = []; // an array of objects to return
for (sWord in counts) {
arr.push({
text: sWord,
frequency: counts[sWord]
});
}

// sort array by descending frequency | http://stackoverflow.com/a/8837505
return arr.sort(function(a,b){
return (a.frequency > b.frequency) ? -1 : ((a.frequency < b.frequency) ? 1 : 0);
});

}());

(function(){
var iWordsCount = words.length; // count w/o duplicates
for (var i=0; i<iWordsCount; i++) {
var word = words[i];
console.log(word.frequency, word.text);
}
}());