How can I replace a hard-coded list with text files and pass their contents as an argument to a function defined in Python?

112 views Asked by At

How can I pass two text files as arguments to a function, instead of passing a list?

My code contains three pairs of sentences defined in a list, which is passed as an argument to the function em_run.

Now I need to read two corpora, i.e. two separate text files, and pass their contents to the function instead of these three pairs of sentences.

Here's my code:

#!/usr/bin/env python
"""An implementation of the IBM Model 1 expectation-maximization algorithm  for learning word alignments."""

from collections import defaultdict
import copy
import itertools
import operator


def em_run(sentence_pairs, max_iterations=1000):
    """Learn word-translation probabilities with expectation-maximization.

    Every sentence pair is expanded into all of its one-to-one alignments
    (the source tokens zipped with each permutation of the target tokens;
    `zip` stops at the shorter side).  Each pass scores the alignments with
    the current probabilities (E-step) and then re-estimates the
    probabilities from the normalized alignment scores (M-step).

    Args:
        sentence_pairs: iterable of `(source_tokens, target_tokens)` pairs,
            where `source_tokens` is a list of tokens in the source language
            and `target_tokens` is a list of tokens for a translationally
            equivalent sentence in the target language.
        max_iterations: upper bound on the number of EM passes; `None`
            removes the bound.  The loop otherwise stops only when a pass
            leaves the table exactly unchanged.

    Returns:
        A mapping `(t1, t2) => p` where `t1` is a source-language token,
        `t2` is a target-language token, and `p` represents P(t1|t2).
        An empty dict is returned when there are no tokens to align.
    """
    # The pairs are walked more than once (vocabulary, then alignments),
    # so copy them in case the caller handed over a one-shot iterator.
    sentence_pairs = [(list(source), list(target))
                      for source, target in sentence_pairs]

    source_vocabulary = set(itertools.chain.from_iterable(
        source for source, _ in sentence_pairs))
    target_vocabulary = set(itertools.chain.from_iterable(
        target for _, target in sentence_pairs))
    if not source_vocabulary or not target_vocabulary:
        return {}

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    # Alignments are stored as tuples: they are reused on every pass and
    # serve as dict keys (a lazy `zip` object would allow neither).
    alignments = [[tuple(zip(source, target_perm))
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence (or until the iteration cap is reached)
    conditional_probs_old = None
    iterations = 0
    while (conditional_probs_old != conditional_probs
           and (max_iterations is None or iterations < max_iterations)):
        iterations += 1
        # A fresh dict is built below, so keeping a reference is enough.
        conditional_probs_old = conditional_probs

        # Group by target-language word and collect the corresponding
        # source-language probability mass.
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignments:
            # E-step: an alignment's score is the product of its pair
            # probabilities.  Keying by the alignment tuple merges
            # duplicates that arise when `zip` truncated a permutation.
            alignment_probs = {}
            for alignment in sentence_alignments:
                prob = 1.0
                for pair in alignment:
                    prob *= conditional_probs[pair]
                alignment_probs[alignment] = prob

            # Normalize alignment probabilities within the sentence
            total = float(sum(alignment_probs.values()))
            for alignment, prob in alignment_probs.items():
                weight = prob / total
                for source_word, target_word in alignment:
                    word_translations[target_word][source_word] += weight

        # M-step: ungroup the `word_translations` tree and normalize the
        # values into conditional probabilities.
        conditional_probs = {}
        for target_word, translations in word_translations.items():
            total = float(sum(translations.values()))
            for source_word, score in translations.items():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    """Train on three toy sentence pairs and print the learned table."""
    raw_pairs = (
        ('mi casa verde', 'my green house'),
        ('casa verde', 'green house'),
        ('la casa', 'the house'),
    )
    # Tokenize both sides of every pair on whitespace.
    SENTENCES = [(source.split(), target.split())
                 for source, target in raw_pairs]
    print(em_run(SENTENCES))


if __name__ == '__main__':
    main()
2

There are 2 answers

7
Michael Kazarian On

I see two ways:

  1. Use persistence (serialization). This way you can save any object to some store, e.g. a file, but you cannot always edit the stored data by hand.
  2. Write a string representation to a file and parse it. This is a good way for simple objects, e.g. numbers, strings, etc., but it requires manual parsing.

If you read the data from files, suppose they have the format below:

English

my green house
green house
the house

Malayalam

mi casa verde
casa verde
la casa

Below is your code, adapted to read from files instead of lists.

#!/usr/bin/env python

from itertools import izip
from collections import defaultdict
import copy
import itertools
import operator


def em_run(sentence_pairs, max_iterations=1000):
    """Learn word-translation probabilities with expectation-maximization.

    Every sentence pair is expanded into all of its one-to-one alignments
    (the source tokens zipped with each permutation of the target tokens;
    `zip` stops at the shorter side).  Each pass scores the alignments with
    the current probabilities (E-step) and then re-estimates the
    probabilities from the normalized alignment scores (M-step).

    Args:
        sentence_pairs: iterable of `(source_tokens, target_tokens)` pairs,
            where `source_tokens` is a list of tokens in the source language
            and `target_tokens` is a list of tokens for a translationally
            equivalent sentence in the target language.
        max_iterations: upper bound on the number of EM passes; `None`
            removes the bound.  The loop otherwise stops only when a pass
            leaves the table exactly unchanged.

    Returns:
        A mapping `(t1, t2) => p` where `t1` is a source-language token,
        `t2` is a target-language token, and `p` represents P(t1|t2).
        An empty dict is returned when there are no tokens to align.
    """
    # The pairs are walked more than once (vocabulary, then alignments),
    # so copy them in case the caller handed over a one-shot iterator.
    sentence_pairs = [(list(source), list(target))
                      for source, target in sentence_pairs]

    source_vocabulary = set(itertools.chain.from_iterable(
        source for source, _ in sentence_pairs))
    target_vocabulary = set(itertools.chain.from_iterable(
        target for _, target in sentence_pairs))
    if not source_vocabulary or not target_vocabulary:
        return {}

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    # Alignments are stored as tuples: they are reused on every pass and
    # serve as dict keys (a lazy `zip` object would allow neither).
    alignments = [[tuple(zip(source, target_perm))
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence (or until the iteration cap is reached)
    conditional_probs_old = None
    iterations = 0
    while (conditional_probs_old != conditional_probs
           and (max_iterations is None or iterations < max_iterations)):
        iterations += 1
        # A fresh dict is built below, so keeping a reference is enough.
        conditional_probs_old = conditional_probs

        # Group by target-language word and collect the corresponding
        # source-language probability mass.
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignments:
            # E-step: an alignment's score is the product of its pair
            # probabilities.  Keying by the alignment tuple merges
            # duplicates that arise when `zip` truncated a permutation.
            alignment_probs = {}
            for alignment in sentence_alignments:
                prob = 1.0
                for pair in alignment:
                    prob *= conditional_probs[pair]
                alignment_probs[alignment] = prob

            # Normalize alignment probabilities within the sentence
            total = float(sum(alignment_probs.values()))
            for alignment, prob in alignment_probs.items():
                weight = prob / total
                for source_word, target_word in alignment:
                    word_translations[target_word][source_word] += weight

        # M-step: ungroup the `word_translations` tree and normalize the
        # values into conditional probabilities.
        conditional_probs = {}
        for target_word, translations in word_translations.items():
            total = float(sum(translations.values()))
            for source_word, score in translations.items():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    """Run `em_run` on the built-in sample, then on pairs read from files.

    The files "datafile_english" and "datafile_malayalam" are read in
    lockstep: line N of one file is paired with line N of the other, and
    each line is split on whitespace into tokens.
    """
    SENTENCES = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    print("Original SENTENCES")
    # Single-argument print with %-formatting gives the same text whether
    # `print` is a statement or a function.
    print("Original results %s" % (em_run(SENTENCES),))
    print("******** Read words from files ********************")
    NEWSENTENCES = []
    with open("datafile_english") as textEn, open("datafile_malayalam") as textMal:
        # `izip` (imported at the top of the script) pairs the lines lazily.
        for x, y in izip(textEn, textMal):
            x = x.strip().split()
            y = y.strip().split()
            # Blank lines would yield empty sentences with nothing to
            # align, so skip them.
            if x and y:
                # Same order as SENTENCES: the second file's tokens first,
                # the English tokens second.
                NEWSENTENCES.append((y, x))
    print("NEWRESULT %s" % (em_run(NEWSENTENCES),))


if __name__ == '__main__':
    main()

Output (these figures correspond to a single EM pass over the data):

Original SENTENCES
Original results {('mi', 'green'): 0.16666666666666669, ('verde', 'my'): 0.3333333333333333, ('la', 'the'): 0.5, ('mi', 'my'): 0.3333333333333333, ('mi', 'house'): 0.1111111111111111, ('casa', 'the'): 0.5, ('casa', 'my'): 0.3333333333333333, ('verde', 'house'): 0.27777777777777773, ('casa', 'house'): 0.4444444444444444, ('casa', 'green'): 0.4166666666666667, ('verde', 'green'): 0.4166666666666667, ('la', 'house'): 0.16666666666666666}
******** Read words from files ********************
NEWRESULT {('mi', 'green'): 0.16666666666666669, ('verde', 'my'): 0.3333333333333333, ('la', 'the'): 0.5, ('mi', 'my'): 0.3333333333333333, ('mi', 'house'): 0.1111111111111111, ('casa', 'the'): 0.5, ('casa', 'my'): 0.3333333333333333, ('verde', 'house'): 0.27777777777777773, ('casa', 'house'): 0.4444444444444444, ('casa', 'green'): 0.4166666666666667, ('verde', 'green'): 0.4166666666666667, ('la', 'house'): 0.16666666666666666}
8
Matthew Trevor On

If each file contains one part of the pair of sentences, and each line of the first file corresponds to the same line in the second, all you need to do is open the files and zip them together:

# em_run expects each sentence as a list of tokens; a raw line is a string
# and would be iterated character by character, so split every line first.
# The `with` block closes both files once the pairs have been read.
with open('file1') as source_file, open('file2') as target_file:
    translation_probs = em_run(
        [(source_line.split(), target_line.split())
         for source_line, target_line in zip(source_file, target_file)])