Python: Create multiple dictionaries of letter transitions

886 views Asked by At

My groupmates and I are trying to build a Markov model that finds the probability of letter transitions in a text file. The text file contains the words "Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates". In our code, a space is added before the first letter and after the last letter of each word. The problem we are having is writing a function that puts the letter transitions into separate dictionaries. For example, all the e transitions (e.g. "_e", "ea", etc., where _ stands for a space) would go into one dictionary, and the t, s, a, and m transitions would each go into their own.

This is the code we have so far:

import random
import re

# Read the whole corpus in lower case. The context manager closes the file
# even when reading raises.
with open("markov.txt", 'r') as inFile:
    file = inFile.read().lower()

# Replace every run of characters other than a-z, space and apostrophe with a
# single space. The pattern is a raw string and the class lists exactly those
# characters, so stray brackets and punctuation are removed as well.
file = re.sub(r"[^a-z ']+", " ", file)

# The same words in the three shapes the rest of the script uses.
fileTuple = tuple(file.split())
fileList = list(fileTuple)
fileString = file


def addSpaces(atuple):
    """Return every string in atuple wrapped in single spaces, concatenated.

    For ('ab', 'cd') the result is ' ab  cd '; an empty input gives ''.
    """
    padded = [' ' + word + ' ' for word in atuple]
    return ''.join(padded)

# Show the words as read, each one padded with a space on both sides.
print('The words in the text file:', addSpaces(fileTuple))


# Key the dictionary by each word with '_' standing in for the leading and
# trailing space. Every value starts as an empty string, and a repeated word
# maps onto the same key.
fileDict = {}
for i in fileList:
    fileDict.setdefault('_' + i + '_', '')

print("This is a dictionary of the words in the text file with underscores as spaces:", fileDict)

def countTotalWords(atuple):
    """Return the number of items produced by iterating over atuple."""
    return sum(1 for _ in atuple)

# Report how many whitespace-separated words were read from the file.
print('Total amount of words:',countTotalWords(fileTuple))

def findFirstLetter(aDict):
    """Set every value of aDict to the first two characters of its key.

    The mapping is changed in place and the same object is returned. For a
    key shaped like '_word_' the stored value is '_' plus the first letter.
    """
    prefixes = {key: key[:2] for key in aDict}
    aDict.update(prefixes)
    return aDict

# findFirstLetter fills fileDict in place, so this call has to run before the
# two snapshots below are taken.
print('The first letters of each word in the file:', findFirstLetter(fileDict))


# Parallel snapshots: keyList holds the underscored words and valueList holds
# the two-character prefix stored for each of them, in the same order.
keyList = list(fileDict)
valueList = list(fileDict.values())



def countFirstLetters(alist):
    """Count how many times each item occurs in alist.

    Returns a plain dict mapping each distinct item to its number of
    occurrences; an empty input gives an empty dict.
    """
    tally = {}
    for item in alist:
        tally[item] = tally.get(item, 0) + 1
    return tally

# Tally how often each stored word prefix in valueList occurs.
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))

def countFirstLettersProbability(alist):
    """Return the relative frequency of each item in alist.

    Each distinct item maps to (occurrences of the item) / (number of items
    in alist), so the values always sum to 1 for a non-empty input. The
    total is taken from alist itself rather than from a module-level word
    tuple; the two agree whenever alist has one entry per word. It is
    computed once instead of on every iteration.

    An empty alist yields an empty dict.
    """
    counts = {}
    for character in alist:
        counts[character] = counts.get(character, 0) + 1

    # float() keeps this a true division on Python 2 as well.
    total = float(sum(counts.values()))
    return {character: count / total for character, count in counts.items()}


# Report, for each stored word prefix in valueList, its share of the words.
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))


def countAllLetters(alist):
    """Count every character across all the strings in alist.

    Returns a plain dict mapping each character to the number of times it
    appears in any of the strings; an empty input gives an empty dict.
    """
    totals = {}
    for char in (c for word in alist for c in word):
        totals[char] = totals.get(char, 0) + 1
    return totals

# Count characters word by word, so the spaces that separate the words in the
# raw text are not tallied as letters.
print('Total amount of occurences of each letter:', countAllLetters(fileList))
1

There is 1 answer

0
Hugh Bothwell On

Here is a solid start; I've rewritten your code as a Markov class.

from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip

def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
    """Return text with every non-letter run collapsed to one space.

    Letters (either case) and apostrophes are kept. Substitution happens
    before trimming, so punctuation at either end of text does not leave a
    leading or trailing space in the result.

        text
            string to clean
        reg
            compiled pattern matching the runs to replace; the default is
            compiled once, when the function is defined
    """
    return reg.sub(' ', text).strip()

def nwise(iterable, n):
    """Return the sliding windows of n consecutive items from iterable.

    s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ...

    Each window is a tuple. Nothing is produced when iterable has fewer
    than n items.
    """
    iterators = tee(iterable, n)
    for offset, it in enumerate(iterators):
        # Advance the k-th copy by k items so the copies are staggered.
        for _ in range(offset):
            next(it, None)
    # The built-in zip exists on both Python 2 and Python 3; itertools.izip
    # is Python 2 only.
    return zip(*iterators)

class Markov():
    """
    Character-level Markov chain built from one or more strings

    Every stem of CHAINLEN - 1 consecutive characters maps to the list of
    characters seen immediately after it. Repeats are kept in that list,
    so a random pick from it follows the observed frequencies. Each added
    string is padded on both sides with PRE, which therefore serves as
    both the start stem and the terminal stem.
    """
    CHAINLEN = 3
    PRE = ' '*(CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        """
        Create a model from a text file, adding each line as one string

            fname
                path of the file to read
        """
        with open(fname) as inf:
            # cls rather than Markov, so a subclass gets its own type back
            return cls(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

            text
                Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0

        # A lone string is added whole. On Python 3 str defines __iter__,
        # so the hasattr test alone would feed it in one character at a
        # time; the hasattr test still covers Python 2 string types.
        if isinstance(text, str) or not hasattr(text, '__iter__'):
            self.add_text(text)
        else:
            for s in text:
                self.add_text(s)

    def add_text(self, text):
        """
        Add a string to the lookup table

            text
                string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        # Slide a CHAINLEN-wide window over the padded text: the first
        # CHAINLEN - 1 characters form the stem, the last is its follower.
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

            upto
                maximum length of string to be generated

        Returns an empty string when nothing has been added to the model.
        Generating never inserts new keys into the lookup table.
        """
        pad = len(Markov.PRE)
        s = Markov.PRE
        res = []
        # At most upto visible characters plus the pad characters that
        # close the string; the pad-sized tail is dropped below.
        for i in range(upto + pad):
            options = self.lookup.get(s)
            if not options:        # empty model or unseen stem
                break
            ch = choice(options)
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:    # terminal string
                break
        return ''.join(res[:max(len(res) - pad, 0)])

    def __str__(self):
        """Return one line per stem, sorted, showing its follower list"""
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))

def main():
    """Build a model from the sample word list, print it, and print ten
    generated strings."""
    # To train from a file instead: mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))

    # Single-argument print(...) produces the same output whether print is
    # a statement (Python 2) or a function (Python 3).
    print('{} {}'.format(mc.strings, mc.words))
    print(mc)

    for i in range(10):
        print(mc.gen_text())

# Run the demo only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()