Word frequency for array of key/values on javascript

Question

Word frequency for array of key/values on javascript

4.5k views Asked by diegodacal At 12 November 2014 at 00:06

I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:

[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]

I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var newArray = [];
    $.each(wordArray, function (ix, word) {
        if (newArray.length >= 1){
            newArray.some(function (w){
                if (w.text === word){
                    w.size++;
                } else {
                    newArray.push({text: word, size: 1});
                }
            });
        } else {
            newArray.push({text: word, size: 1});
        }
    });
    return newArray;
}

Original Q&A

There are 5 answers

Ahmad Mageed On 12 November 2014 at 00:50

To tell the frequency all you need is a hash map approach. Your algorithm is quadratic, since the some method is nested in the each method, so you're always looping over the newArray just to find an entry and increment the size.

A map approach is easily achievable using a JavaScript object. It also gives you constant look-up time, which is better performance than the nested loops approach.

Try this approach instead:

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var map = {};
    $.each(wordArray, function(ix, word) {
      // skip empty results
      if (!word.length) {
        return;
      }
      // add word to map
      if (!map[word]) {
        map[word] = 0;
      } 
      map[word]++;
    });
    return map;
}

To use the function:

var text = "hello!world*hello foo  'bar'foo";
var result = wordFrequency(text);

// iterate over results
Object.keys(result).forEach(function(w) {
  console.log(w + ": " + result[w]);
});

// or use for...in
for (var w in result) {
  console.log(w + ": " + result[w]);
}

If you really wanted to, you could then map the result into your desired array format with text and size properties:

var mappedResult = Object.keys(result).map(function(w) {
  return { text: w, size: result[w] };
});
console.log(mappedResult);

Also, depending on your target browsers, you might consider using the array forEach instead of the jQuery $.each, similar to what I did with the Object.keys portion.

Here's the JSBin example.

masswerk On 12 November 2014 at 00:51

You would probably want to avoid any iterations on duplicate elements and keep your results array unique. Since any of the iterators of Array.prototype will include each of the elements, they might not be the ideal solution for this. Sometimes plain old loops do the job best ... (You may also want to expressively escape any special characters in your regular expression).

function wordFrequency(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [];
    for (var i = 0; i < words.length; i++) {
        var w = words[i],
            found = false;
        for (var j = 0; j < seen.length; j++) {
            if (w === seen[j].text) {
                seen[j].size++;
                found = true;
                break;
            }
        }
        if (!found) seen.push( { text: w, size: 1 } );
    }
    return seen;
}

(Note that the inner for-loop isn't visited for the first word, so the first word will be pushed to the seen-stack and the inner for-loop will start with the second word compared to the first one. Only words that we haven't seen already are added to the seen-stack, making it an array of unique elements.)

And here is the equivalent using Array.prototype.forEach() and Array.prototype.indexOf(), but we have to add another intermediate results stack for the latter one. So we'll have to add another iteration to produce the final result. (We wouldn't have to do this using Array.prototype.findIndex(), but this is not a standard method.)

function wordFrequency2(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [],
        freq = [];
    // get frequencies
    words.forEach(function (w) {
        var idx = seen.indexOf(w);
        if (idx >= 0) {
            freq[idx]++;
        }
        else {
            seen.push(w);
            freq.push(1);
        }
    });
    // produce the results array
    var r = [];
    seen.forEach(function (w, idx) {
        r.push( { text: w, size: freq[idx] } );
    });
    return r;
}

Putting optimization into account, the first version using explicit loops will be probably performing faster ...

Alnitak On 12 November 2014 at 00:54

It would be simpler and far more efficient to create a direct map from word to frequency, and only afterwards convert that to your array structure. Given an array words create a map of the words:

var freq = words.reduce(function(p, c) {
    p[c] = (p[c] || 0) + 1;
    return p;
}, {});

and the convert that map into your array:

var array = Object.keys(freq).map(function(key) {
   return { text: key, size: freq[key] };
});

Andrew On 12 November 2014 at 01:37

var words = (function(){

var sWords = document.body.innerText.toLowerCase().trim().replace(/[,;.]/g,'').split(/[\s\/]+/g).sort();
var iWordsCount = sWords.length; // count w/ duplicates

// array of words to ignore
var ignore = ['and','the','to','a','of','for','as','i','with','it','is','on','that','this','can','in','be','has','if'];
ignore = (function(){
var o = {}; // object prop checking > in array checking
var iCount = ignore.length;
for (var i=0;i<iCount;i++){
o[ignore[i]] = true;
}
return o;
}());

var counts = {}; // object for math
for (var i=0; i<iWordsCount; i++) {
var sWord = sWords[i];
if (!ignore[sWord]) {
counts[sWord] = counts[sWord] || 0;
counts[sWord]++;
}
}

var arr = []; // an array of objects to return
for (sWord in counts) {
arr.push({
text: sWord,
frequency: counts[sWord]
});
}

// sort array by descending frequency | http://stackoverflow.com/a/8837505
return arr.sort(function(a,b){
return (a.frequency > b.frequency) ? -1 : ((a.frequency < b.frequency) ? 1 : 0);
});

}());

(function(){
var iWordsCount = words.length; // count w/o duplicates
for (var i=0; i<iWordsCount; i++) {
var word = words[i];
console.log(word.frequency, word.text);
}
}());

**LJᛃ** · Accepted Answer · 2014-11-12T00:28:51+00:00

Array.prototype.some expects the given callback to return true or false and returns true as soon as your callback returns true for a given element, otherwise it returns false.

So some iterates over all elements, with your given callback, and your callback checks if the given element text equals the search word and if not adds a new object. Introducing a new element the some function can iterate over.

So to make this clear, for every word thats in the newArray before the word you're searching, you're adding a new object containing your word.

Suppose your newArray looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"}]

after calling your function for the word even it looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"},{word:"even"},{word:"even"},{word:"even"},{word:"even"}]

Using Array.prototype.filter would be the better approach here, finding you the matching element, note that I also replaced $.each with Array.prototype.forEach:

function wordFrequency(txt){
  var wordArray = txt.split(/[ .?!,*'"]/);
  var newArray = [], wordObj;
  wordArray.forEach(function (word) {
    wordObj = newArray.filter(function (w){
      return w.text == word;
    });
    if (wordObj.length) {
      wordObj[0].size += 1;
    } else {
      newArray.push({text: word, size: 1});
    }
  });
  return newArray;
}
document.write(JSON.stringify(wordFrequency("count everything, count all the words, count all the words!").sort(function(a,b){return a.size<b.size})).split("},").join("}<br/>"));

TechQA.

Word frequency for array of key/values on javascript

There are 5 answers

Related Questions in JAVASCRIPT

Related Questions in ARRAYS

Related Questions in WORD-FREQUENCY

Popular Questions

Trending Questions