Adding a lemma for a new word and the concept of normalization/lemmatization in spaCy


Following the examples from the documentation on tokenization, I have the following code:

import spacy
from spacy.symbols import ORTH, NORM

nlp = spacy.load("en_core_web_sm")
special_case = [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

doc = nlp("gimme that. he gave me that. Going to someplace.")

Then I check the tokenization:

doc[0].norm_  # 'give'  (as expected)

But the lemmatizer does not return the same output:

lemmatizer = nlp.get_pipe("lemmatizer")
lemmatizer.lemmatize(doc[0])  # ['gim']  (expected ['give'])

On the other hand:

lemmatizer.lemmatize(doc[5]) # ['give']
lemmatizer.lemmatize(doc[9])  # ['go']

What am I doing wrong, and how can I fix it? In spaCy, what is the difference between normalized tokens and lemmatized tokens? How can I "teach" the lemmatization of a single token (like the gim token in this example)?


There are 2 answers

Vaibhav Patil (accepted answer):

In your code you've customized the tokenizer to split the special case "gimme" into "gim" + "me" and normalize "gim" to "give". The lemmatizer, however, does not consult the norm: it works from the token's text (and part-of-speech tag), which is why "gim" comes back unchanged.

Here's how you can achieve consistent lemmatization results with your custom normalization:

import spacy
from spacy.language import Language
from spacy.symbols import ORTH, NORM

nlp = spacy.load("en_core_web_sm")
special_case = [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Define a custom component that fixes the lemma based on the norm
@Language.component("custom_lemmatizer")
def custom_lemmatizer_function(doc):
    for token in doc:
        if token.norm_ == "give":
            token.lemma_ = "give"
    # Add more custom rules for other words if needed
    return doc

# Add the custom component to the pipeline, after the default lemmatizer
nlp.add_pipe("custom_lemmatizer", after="lemmatizer")

doc = nlp("gimme that. he gave me that. Going to someplace.")
print(doc[0].lemma_)  # 'give' (as expected)
print(doc[5].lemma_)  # 'give' (as expected)
print(doc[9].lemma_)  # 'go' (as expected)
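
A note on the design choice: this component runs after the default lemmatizer and simply overrides its output for matching tokens. If you only need to pin the lemmas of a few surface forms, spaCy's built-in attribute_ruler (already present in en_core_web_sm, ordered before the lemmatizer) can do the same without a custom component. A minimal sketch, assuming the same "gimme" special case; it relies on the default lemmatizer leaving already-set lemmas alone (its overwrite setting defaults to False):

import spacy
from spacy.symbols import ORTH, NORM

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}])

# Map the surface form "gim" directly to the lemma "give"
ruler = nlp.get_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "gim"}]], attrs={"LEMMA": "give"})

doc = nlp("gimme that. he gave me that. Going to someplace.")
print(doc[0].lemma_)  # 'give'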
Vaibhav Patil:
import spacy
from spacy.language import Language
from spacy.symbols import ORTH, NORM

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a special case that splits "gimme" into "gim" + "me", normalizing "gim" to "give"
special_case = [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Define a custom lemmatization function
@Language.component(name="custom_lemmatizer")
def custom_lemmatizer_function(doc):
    for token in doc:
        # Check if the normalized form of the token is "give"
        if token.norm_ == "give":
            # Override the lemma to "give"
            token.lemma_ = "give"
    # Add more custom rules for other words if needed
    return doc

# Add the custom lemmatizer to the pipeline after the default lemmatizer
nlp.add_pipe("custom_lemmatizer", name="custom_lemmatizer", after="lemmatizer")

# Process a text with the spaCy pipeline
doc = nlp("gimme that. he gave me that. Going to someplace.")

# Check the lemmatization results
print(doc[0].lemma_)  # 'give' (as expected)
print(doc[5].lemma_)  # 'give' (as expected)
print(doc[9].lemma_)  # 'go' (as expected)
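
To make the norm/lemma distinction from the question concrete: norm_ stores a normalized surface form (the attribute the NORM setting in the special case controls, used as a feature by the statistical components), while lemma_ stores the base form assigned later by the lemmatizer (or by a component like the one above). A quick check over the doc built above, with the expected output shown as comments:

for token in doc:
    print(token.text, token.norm_, token.lemma_)

# gim give give     <- norm set by the special case; lemma set by the custom component
# gave gave give    <- norm defaults to the lowercased text; lemma comes from the default lemmatizer
# Going going go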