Issues with Foma fst when using python

405 views Asked by At

I am trying to run morphological analysis on every .txt file in a folder.

using https://code.google.com/archive/p/foma/

This is the code I have written. I pass each word to the foma FST from Python, but after processing 143 of the 1900 files the loop hangs indefinitely. When I commented out the foma apply-up call in the loop, all the files were written to the new folder without any issues (in non-root form).

class Lemmatizer:
    """Run every ``.txt`` file under a folder through a foma transducer.

    The transducer file named by ``fomaBinFilePath`` is loaded once, in
    ``__init__``, and the resulting apply handle is kept on the instance so
    that ``lemmatize_folder`` can reuse it for every word.

    NOTE(review): the code is written for Python 2 (``word.encode('utf-8')``
    is written to a file opened without an encoding) -- confirm before
    running it under Python 3.
    """

    def __init__(self, inputFolderPath=None, outputFolderPath=None, fomaBinFilePath="Konkani.bin"):
        """Store the folder paths and load the transducer.

        Args:
            inputFolderPath: folder that is walked recursively for ``.txt`` files.
            outputFolderPath: folder the processed files are written to
                (created by ``lemmatize_folder`` if it does not exist).
            fomaBinFilePath: path handed to ``foma.foma_fsm_read_binary_file``.
        """
        self.inputFolderPath = inputFolderPath
        self.outputFolderPath = outputFolderPath
        self.fomaBinFilePath = fomaBinFilePath
        self.net = foma.foma_fsm_read_binary_file(fomaBinFilePath)
        self.ah = foma.foma_apply_init(self.net)

    def lemmatize_folder(self):
        """Process each ``.txt`` file found under ``self.inputFolderPath``.

        For every input file, a file with the same base name is created
        directly in ``self.outputFolderPath`` (sub-folder structure is not
        reproduced, so equal names in different sub-folders overwrite each
        other). Each token from ``nltk.word_tokenize`` is passed to
        ``foma.foma_apply_up``; a space is written after every token.
        """
        # Reuse the apply handle built in __init__ rather than reading the
        # binary file and initialising a second handle on every call.
        ah = self.ah

        if not os.path.exists(self.outputFolderPath):
            os.makedirs(self.outputFolderPath)

        for root, dirs, files in os.walk(self.inputFolderPath):
            for fileName in files:
                if not fileName.endswith('.txt'):
                    continue
                with codecs.open(os.path.join(self.outputFolderPath, fileName), 'w') as outputFile:
                    with codecs.open(os.path.join(root, fileName), 'r', 'utf-8') as inputFile:
                        for line in inputFile:
                            for word in nltk.word_tokenize(line):
                                result = foma.foma_apply_up(ah, word)
                                # Debugging toggle: replace the call above with
                                # `result = None` to bypass foma entirely.
                                if result is not None:
                                    # Debug state: the text before the first '+'
                                    # is printed, not written, so the output
                                    # file gets only the separator for this
                                    # word.
                                    # NOTE(review): presumably the final version
                                    # should instead do
                                    # outputFile.write(result.split('+', 1)[0])
                                    print(fileName)
                                    print(result.split('+', 1)[0])
                                else:
                                    # No analysis returned: keep the token as-is.
                                    outputFile.write(word.encode('utf-8'))
                                outputFile.write(' ')
                        # NOTE(review): this newline is written once per file,
                        # after all lines, so the input's line breaks are not
                        # preserved -- confirm whether it belongs inside the
                        # `for line` loop.
                        outputFile.write('\n')

Has anyone faced a similar issue? Does the foma FST have a limit on the number of times it can be called once it has been initialised?

0

There are 0 answers