I have a dataframe constructed from the voice transcription of multiple audio files (one per person):

#      Name   Start_Time           Duration   Transcript
# Person A    12:12:2018 12:12:00  3.5        Transcript from Person A
# Person B    12:12:2018 12:14:00  5.5        Transcript from Person B
# .........................
# .........................
# Person N    12:12:2018 13:00:00  9.0        Transcript from Person N

Is there a way to:

  1. Find the 'n' most frequent words spoken per 'x' minutes of the complete conversation.
  2. Plot the 'n' most frequent words per 'x' minutes of the complete duration of the conversation.

For part 2, I was thinking of a bar plot per 'x'-minute interval, with the height of each bar scaled to the sum of the occurrences of the 'n' most common words in that interval (a rough sketch follows below). Is there a more intuitive way of showing this information graphically?
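
Something along these lines is what I have in mind for the bar heights (just a rough sketch, assuming the dataframe is indexed by a DatetimeIndex built from Start_Time; top_n_counts is a made-up helper, and 'x' and 'n' are parameters):

from collections import Counter

def top_n_counts(transcripts, n):
    # count words over all transcripts that fall into one time window
    words = " ".join(transcripts.astype(str)).lower().split()
    return Counter(words).most_common(n)

x, n = 1, 3  # window size in minutes, number of top words
# bar height per window = sum of the occurrences of the n most common words
bar_heights = df.resample('{}T'.format(x))['Transcript'].apply(
    lambda t: sum(count for _, count in top_n_counts(t, n)))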

EDIT: I am attaching a basic, minimal IPython notebook of what I have right now:

IPython notebook

Problems:

  1. Resampling the dataframe per 60 s doesn't do the trick, as some of the conversations last longer than 60 seconds. For example, the first and fourth rows in the dataframe below last 114 seconds each. I'm not sure whether these can be split into exact 60-second pieces; even if they can, the split may cascade into the next one-minute time slot and push its duration over 60 seconds too, as with the first and second rows below (one rough idea for such a split is sketched after this list).
Start Time                  Name      Start Time                  End Time                    Duration  Transcript
2019-04-13 18:51:22.567532  Person A  2019-04-13 18:51:22.567532  2019-04-13 18:53:16.567532  114       A dude meows on this cool guy my gardener met yesterday for no apparent reason.
2019-04-13 18:53:24.567532  Person D  2019-04-13 18:53:24.567532  2019-04-13 18:54:05.567532  41        Your homie flees from the king for a disease.
2019-04-13 18:57:14.567532  Person B  2019-04-13 18:57:14.567532  2019-04-13 18:57:55.567532  41        The king hacks some guy because the sky is green.
2019-04-13 18:59:32.567532  Person D  2019-04-13 18:59:32.567532  2019-04-13 19:01:26.567532  114       A cat with rabies spies on a cat with rabies for a disease.
  2. As each minute interval has a different set of top-'n' frequent words, the bar plot didn't seem possible, so I settled for a dot plot, which looks confusing because the different words sit at different heights on the y-axis. Is there a better plot to visualise this data?
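
The only idea I've had so far for problem 1 is to explode each row into one row per minute it overlaps before resampling; a rough sketch (using the column names above; explode_to_minutes is a made-up helper that simply copies the whole transcript into every overlapped minute rather than splitting the words proportionally):

def explode_to_minutes(row):
    # one output row per minute boundary the conversation overlaps
    minutes = pd.date_range(row['Start Time'].floor('T'), row['End Time'], freq='T')
    return pd.DataFrame({'Minute': minutes,
                         'Transcript': row['Transcript'],              # full transcript repeated per minute
                         'Duration': row['Duration'] / len(minutes)})  # duration split evenly across minutes

sliced = pd.concat([explode_to_minutes(r) for _, r in df.iterrows()], ignore_index=True)
sliced = sliced.set_index('Minute')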

Also listing the complete code here:

import numpy as np
import pandas as pd
import random
import urllib
import plotly
import plotly.graph_objs as go
from datetime import datetime,timedelta
from collections import Counter
from IPython.core.display import display, HTML
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

def printdfhtml(df):
    # temporarily widen max_colwidth so full transcripts are visible in the HTML table
    old_width = pd.get_option('display.max_colwidth')
    pd.set_option('display.max_colwidth', -1)
    display(HTML(df.to_html(index=True)))
    pd.set_option('display.max_colwidth', old_width)

def removeStopwords(wordlist, stopwords):
    return [w for w in wordlist if w not in stopwords]

def stripNonAlphaNum(text):
    # split the text on runs of non-alphanumeric characters
    import re
    return re.compile(r'\W+', re.UNICODE).split(text)

def sortFreqDict(freqdict):
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux

def sortDictKeepTopN(freqdict,keepN):
    return dict(Counter(freqdict).most_common(keepN))

def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(zip(wordlist,wordfreq))
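
# (illustrative only, not part of the original pipeline) quick check of the helpers
# above on a made-up sentence, with a tiny stopword list just for this example:
example_words = stripNonAlphaNum("the king hacks some guy because the sky is green")
example_words = removeStopwords(example_words, ['the', 'some', 'because', 'is'])
print(sortDictKeepTopN(wordListToFreqDict(example_words), 3))  # e.g. {'king': 1, 'hacks': 1, 'guy': 1}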

s_nouns = ["A dude", "My bat", "The king", "Some guy", "A cat with rabies", "A sloth", "Your homie", "This cool guy my gardener met yesterday", "Superman"]
p_nouns = ["These dudes", "Both of my cars", "All the kings of the world", "Some guys", "All of a cattery's cats", "The multitude of sloths living under your bed", "Your homies", "Like, these, like, all these people", "Supermen"]
s_verbs = ["eats", "kicks", "gives", "treats", "meets with", "creates", "hacks", "configures", "spies on", "retards", "meows on", "flees from", "tries to automate", "explodes"]
p_verbs = ["eat", "kick", "give", "treat", "meet with", "create", "hack", "configure", "spy on", "retard", "meow on", "flee from", "try to automate", "explode"]
infinitives = ["to make a pie.", "for no apparent reason.", "because the sky is green.", "for a disease.", "to be able to make toast explode.", "to know more about archeology."]

people = ["Person A","Person B","Person C","Person D"]
start_time = datetime.now() - timedelta(minutes = 10)
complete_transcript = pd.DataFrame(columns=['Name','Start Time','End Time','Duration','Transcript'])
for i in range(1,10):
    start_time = start_time + timedelta(seconds = random.randint(10,240)) # random delay bw ppl talking 10sec to 4 mins
    curr_transcript = " ".join([random.choice(s_nouns), random.choice(s_verbs), random.choice(s_nouns).lower() or random.choice(p_nouns).lower(), random.choice(infinitives)])
    talk_duration = random.randint(5,120) # 5 sec to 2 min talk
    end_time = start_time + timedelta(seconds = talk_duration)
    complete_transcript.loc[i] = [random.choice(people),
                                 start_time,
                                 end_time,
                                 talk_duration,
                                 curr_transcript] 


df = complete_transcript.copy()
df = df.sort_values(['Start Time'])
df.index=df['Start Time']
printdfhtml(df)

re_df = df.copy()
re_df = re_df.drop("Name", axis=1)
re_df = re_df.drop("End Time", axis=1)
re_df = re_df.drop("Start Time", axis=1)
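# resample into 60-second bins: .sum() adds up the Duration values and, since the
# Transcript column holds strings, concatenates the transcripts whose Start Time
# falls in each bin (each row goes to the bin of its Start Time, however long it lasts)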
re_df = re_df.resample('60S').sum()
printdfhtml(re_df)

stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves','']

x_trace = np.linspace(1, len(re_df.index), len(re_df.index))
n_top_words = 3
y_trace1 = []
y_trace2 = []
y_trace3 = []
for index, row in re_df.iterrows():
    str_to_check = str(row['Transcript']).lower()
    top_words = []
    if (str_to_check != '0') and (str_to_check != ''):
        print('-----------------------------')
        wordlist = stripNonAlphaNum(str_to_check)
        wordlist = removeStopwords(wordlist, stopwords)
        dictionary = wordListToFreqDict(wordlist)
        print('text: ', str_to_check)
        print('words dropped dict: ', dictionary)
        sorteddict = sortDictKeepTopN(dictionary, n_top_words)
        top_words = list(sorteddict)
        print(top_words)
    # pad with None so every y trace stays the same length as x_trace,
    # even for empty minutes or minutes with fewer than n_top_words words
    top_words += [None] * (n_top_words - len(top_words))
    y_trace1.append(top_words[0])
    y_trace2.append(top_words[1])
    y_trace3.append(top_words[2])

trace1 = {"x": x_trace, 
          "y": y_trace1, 
          "marker": {"color": "pink", "size": 12}, 
          "mode": "markers", 
          "name": "1st", 
          "type": "scatter"
}

trace2 = {"x": x_trace, 
          "y": y_trace2, 
          "marker": {"color": "blue", "size": 12}, 
          "mode": "markers", 
          "name": "2nd", 
          "type": "scatter", 
}

trace3 = {"x": x_trace, 
          "y": y_trace3, 
          "marker": {"color": "grey", "size": 12}, 
          "mode": "markers", 
          "name": "3rd", 
          "type": "scatter", 
}

data = [trace3, trace2, trace1]
layout = {"title": "Most Frequent Words per Minute", 
          "xaxis": {"title": "Time (in Minutes)", }, 
          "yaxis": {"title": "Words"}}

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
