I have a Dataframe of some tweets about the Russia-Ukraine conflict and I have pos_tagged the tweets after cleaning and want to lemmatize postagged column. My code returns only the first pos_tagged word as a lemma. How can I correctly lemmatize the pos_tagged column?
Function to pos tag cleaned tweets:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
tags = pos_tag(word_tokenize(text))
newlist = []
for word, tag in tags:
if word.lower() not in set(stopwords.words('english')):
newlist.append(tuple([word, pos_dict.get(tag[0])]))
return newlist
df['POS_tagged'] = df['cleanedTweets'].apply(token_stop_pos)
Function to lemmatize df['POS_tagged'] column:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
lemma_rew = " "
for word, pos in pos_data:
if not pos:
lemma = word
lemma_rew = lemma_rew + " " + lemma
else:
lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
lemma_rew = lemma_rew + " " + lemma
return lemma_rew
df['Lemma'] = df['POS_tagged'].apply(lemmatize)