How can I pass two text files as arguments to a function, instead of passing a list?
My code contains three pairs of sentences defined in a list, which is passed as an argument to the function em_run.
Now I need to read two corpora — two separate text files — into the function instead of these three sentence pairs.
Here's my code:
#!/usr/bin/env python
"""An implementation of the IBM Model 1 expectation-maximization algorithm for learning word alignments."""
import copy
import functools
import itertools
import operator
from collections import defaultdict
def em_run(sentence_pairs):
#Run expectation-maximization on a list of pairs of the form
# `(source_tokens, target_tokens)`
# where `source_tokens` is a list of tokens in the source language and
#`target_tokens` is a list of tokens for a translationally equivalent
#sentence in the target language.
#Returns a mapping `(t1, t2) => p` where `t1` is a source-language
#token, `t2` is a target-language token, and the value `p` represents
#$P(t1|t2)$.
source_sentences, target_sentences = zip(*sentence_pairs)
source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
target_vocabulary = set(itertools.chain.from_iterable(target_sentences))
# Value with which to initialize each conditional probability
uniform_prob = 1.0 / len(source_vocabulary)
conditional_probs_old = None
conditional_probs = {(source_w, target_w): uniform_prob
for source_w in source_vocabulary
for target_w in target_vocabulary}
alignments = [[zip(source, target_perm)
for target_perm in itertools.permutations(target)]
for source, target in sentence_pairs]
# Repeat until convergence
i = 0
while conditional_probs_old != conditional_probs:
conditional_probs_old = copy.copy(conditional_probs)
alignment_probs = {
i: {
tuple(alignment):
reduce(operator.mul, [conditional_probs[pair]
for pair in alignment])
for alignment in sentence_alignments
}
for i, sentence_alignments in enumerate(alignments)
}
# Normalize alignment probabilities
for sentence_idx, sentence_alignments in alignment_probs.iteritems():
total = float(sum(sentence_alignments.values()))
probs = {alignment: value / total
for alignment, value in sentence_alignments.iteritems()}
alignment_probs[sentence_idx] = probs
# Now join all alignments and begin the maximization step: group
# by target-language word and collect corresponding
# source-language probabilities
word_translations = defaultdict(lambda: defaultdict(float))
for sentence_alignments in alignment_probs.itervalues():
for word_pairs, prob in sentence_alignments.iteritems():
for source_word, target_word in word_pairs:
word_translations[target_word][source_word] += prob
# Now calculate new conditional probability mapping, ungrouping
# the `word_translations` tree and normalizing values into
# conditional probabilities
conditional_probs = {}
for target_word, translations in word_translations.iteritems():
total = float(sum(translations.values()))
for source_word, score in translations.iteritems():
conditional_probs[source_word, target_word] = score / total
return conditional_probs
def main():
    """Demonstrate em_run on a tiny toy parallel corpus."""
    sentences = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    # Call form of print works on both Python 2 and Python 3; the
    # original Python-2-only `print expr` statement is a syntax error
    # on Python 3.
    print(em_run(sentences))


if __name__ == '__main__':
    main()
I see two ways to do this.
If you read the data from files, suppose each file has the format below:
English
Malayalam
Below is your code, adapted to read from the two files instead of a list. Output: