How to get pos-tag lemmatiser to iterate through df

91 views Asked by At

I want to use POS-labelling and lemmatisation on my text data. I've found this example code from kaggle. This applies it to a sentence, but I want to modify this code in order to apply it to a column of a dataframe.

#Kaggle example code: 
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet (Morphy) POS tag.

    Only the first two characters of the Penn tag are significant, so
    e.g. 'VB', 'VBD' and 'VBZ' all map to 'v'.

    :param penntag: Penn Treebank tag string, e.g. 'NN', 'JJR', 'VBZ'.
    :return: one of 'n', 'a', 'v', 'r'; unknown tags fall back to 'n'.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    # Catch only the expected failure (unknown tag prefix). The original
    # bare `except:` would also hide real bugs, e.g. a None penntag.
    try:
        return morphy_tag[penntag[:2]]
    except KeyError:
        return 'n'  # if mapping isn't found, fall back to Noun.
    
# `pos_tag` takes the tokenized sentence as input, i.e. list of string,
# and returns (word, tag) pairs, i.e. a list of tuples of strings,
# so we need to get the tag from the 2nd element.

walking_tagged = pos_tag(word_tokenize('He is walking to school'))
#print(walking_tagged)

testing["text"].apply(penn2morphy)
#[wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) for word, tag in walking_tagged]

I presumed you would just use the apply function, but that doesn't work. The first line where `pos_tag` is being applied is just labelling each row as "n", so I presume it isn't iterating through each row.

#Example data
    r1 = ["he, has, a, glass, of, water, together, with, a, mirror"],"Pass"
    r2 = ["lamp, lens, right, left"], "Fail"
    r3 = ["candle, clock, vase, spoon"], "Fail"
    d=(r1,r2,r3)
    ex_df = pd.DataFrame(d, columns=["col1", "col2"])

walking_tagged2 = ex_df["col1"].apply(pos_tag)
[wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) for word, tag in walking_tagged2]

Any ideas? Thank you

1

There are 1 answers

3
alvas On

Take a look at https://github.com/alvations/pywsd/blob/master/pywsd/utils.py#L124

import re

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Module-level singletons shared as default arguments by the functions below.
porter = PorterStemmer()
wnl = WordNetLemmatizer()


def lemmatize(ambiguous_word: str, pos: str = None, neverstem=False,
              lemmatizer=wnl, stemmer=porter) -> str:
    """
    Map a surface word to its lemma; fall back to its stem when the
    lemma is unknown to WordNet.

    This handles the case where the caller supplies a surface form
    (not a lemma) as the ambiguous word.
    """
    # No POS supplied: tag the single word and use its most likely POS.
    if not pos:
        penn = pos_tag([ambiguous_word])[0][1]
        pos = penn2morphy(penn, default_to_noun=True)

    candidate = lemmatizer.lemmatize(ambiguous_word, pos=pos)
    if wn.synsets(candidate):
        return candidate

    # The lemma is not in WordNet; optionally try the stem instead.
    if neverstem:
        return ambiguous_word
    stemmed = stemmer.stem(ambiguous_word)
    return stemmed if wn.synsets(stemmed) else ambiguous_word


def penn2morphy(penntag, returnNone=False, default_to_noun=False) -> str:
    """
    Converts tags from Penn format (input: single string) to Morphy.

    :param penntag: Penn Treebank POS tag, e.g. 'NN', 'VBZ', 'JJR'.
    :param returnNone: if True, return None for unmappable tags.
    :param default_to_noun: if True, return 'n' for unmappable tags.
    :return: 'n', 'a', 'v' or 'r' for known tags; otherwise None, 'n'
             or '' depending on the flags (returnNone takes precedence).
    """
    # Only the first two characters of a Penn tag are significant here.
    morphy_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    # Catch only the lookup failure; a bare `except` would also swallow
    # unrelated bugs such as a non-string penntag.
    except KeyError:
        if returnNone:
            return None
        elif default_to_noun:
            return 'n'
        else:
            return ''

def lemmatize_sentence(sentence: str, neverstem=False, keepWordPOS=False,
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter) -> list:
    """
    Tokenize, POS-tag and lemmatize a whole sentence.

    Returns the list of lemmas, or the triple (words, lemmas, pos_tags)
    when keepWordPOS is True, with unmappable ('') POS entries replaced
    by None.
    """
    tagged = postagger(tokenizer(sentence))
    words = [word for word, _ in tagged]
    poss = [penn2morphy(tag) for _, tag in tagged]
    lemmas = [lemmatize(word.lower(), pos, neverstem, lemmatizer, stemmer)
              for word, pos in zip(words, poss)]

    if keepWordPOS:
        return words, lemmas, [None if pos == '' else pos for pos in poss]

    return lemmas

After you have a lemmatize_sentence function like above,

# If you want to use a pre-coded version:
# pip install -U pywsd

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


from pywsd.utils import lemmatize_sentence

lemmatize_sentence("he has glasses of water together with many mirrors")


[out]:

['he', 'have', 'glass', 'of', 'water', 'together', 'with', 'many', 'mirror']

With dataframes:

import pandas as pd

df = pd.DataFrame({'text': ["he has glasses of water together with many mirrors", 
                            "lamp, lens, right, left"]})

df['text'].apply(lemmatize_sentence)