bigram calculation - Memory error, large file problem


Here is my code for n-gram calculation from a text corpus:

import sys
import csv
import string

import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from collections import Counter

tokenizer   = RegexpTokenizer(r'\w+')

with open('hugefile.csv', 'r') as file:
    text = file.read()

    token = tokenizer.tokenize(text)
    trigrams = ngrams(token, 3)
    count = Counter(trigrams)

    with open('out.csv', 'w') as csvfile:
        for tag, count in count.items():
            csvfile.write('{},{},{},{}\n'.format(tag[0], tag[1], tag[2], count))

The error:

Traceback (most recent call last):
  File "C:\OSPanel\domains\vad\freq.py", line 17, in <module>
    text = file.read();
           ^^^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
MemoryError

The code works correctly when hugefile.csv is relatively small. The problems begin when the file is really huge (several GB): the script stops working with the error above.

How can I solve this problem? Thanks.
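My guess is that file.read() loads the entire multi-gigabyte file into memory as a single string, so memory usage grows with the file size. Reading the file line by line should avoid holding everything at once; a rough sketch (using the same tokenizer as above):

with open('hugefile.csv', 'r') as file:
    for line in file:
        # only one line is in memory at a time
        tokens = tokenizer.tokenize(line)
        # ...build the n-grams incrementally from these tokens...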

Updated:

import sys
import csv
import string

import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from collections import deque
from collections import Counter

tokenizer   = RegexpTokenizer(r'\w+')

def ngram_iter(it, n=3):
    
    fifo = deque(maxlen=3)
    for line in it:
        for token in tokenizer.tokenize(line):
            fifo.append(token)
            if len(fifo) == n:
                yield list(fifo)
                
# Build the counter from the streaming trigram iterator

with open('file.txt', 'r') as file:
    count = Counter(ngram_iter(file, 3))

    with open('out_2.csv', 'w') as csvfile:
        for tag, freq in count.items():
            csvfile.write('{},{},{},{}\n'.format(tag[0], tag[1], tag[2], freq))

Error:

Traceback (most recent call last):
  File "freq2.py", line 30, in <module>
    count = Counter(ngram_iter(file, 3))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\collections\__init__.py", line 599, in __init__
    self.update(iterable, **kwds)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1776.0_x64__qbz5n2kfra8p0\Lib\collections\__init__.py", line 690, in update
    _count_elements(self, iterable)
TypeError: unhashable type: 'list'
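I suspect this happens because Counter keys must be hashable and a list is not. Yielding an immutable tuple instead of a list should make each n-gram usable as a key (an untested guess on my side):

            if len(fifo) == n:
                yield tuple(fifo)   # tuples are hashable, so Counter can count them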

1 Answer

Answer by Mad Physicist:

Assuming that I understand what n-grams are correctly, you can make the following iterator that technically avoids loading the entire file at once. You might still have an overly large Counter if there are insufficient repetitions among the n-grams, but that's a separate problem (which you can solve by writing the intermediate n-gram data to disk and sorting to emulate the behavior of a disk-based counter).

from collections import deque

def ngram_iter(it, n=3):
    """
    it: any iterator of strings, be it file, list, generator, or whatever
    """
    fifo = deque(maxlen=n)          # keep only the n most recent tokens
    for line in it:
        # tokenizer is the RegexpTokenizer(r'\w+') defined in the question
        for token in tokenizer.tokenize(line):
            fifo.append(token)
            if len(fifo) == n:
                yield ' '.join(fifo)

Now you can create the counter without loading the entire file at once:

Counter(ngram_iter(file, 3))
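For completeness, here is a minimal end-to-end sketch under the same assumptions as the question (RegexpTokenizer(r'\w+'), input file.txt, output out_2.csv). It yields tuples instead of joined strings so the three columns can be written out directly, which is a small variation on the iterator above, not the answer verbatim:

from collections import Counter, deque
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

def ngram_iter(it, n=3):
    # it: any iterator of strings (file object, list, generator, ...)
    fifo = deque(maxlen=n)
    for line in it:
        for token in tokenizer.tokenize(line):
            fifo.append(token)
            if len(fifo) == n:
                yield tuple(fifo)   # hashable, so Counter can count it

with open('file.txt', 'r') as file:
    count = Counter(ngram_iter(file, 3))

with open('out_2.csv', 'w') as csvfile:
    for tag, freq in count.items():
        csvfile.write('{},{},{},{}\n'.format(tag[0], tag[1], tag[2], freq))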