I've found this nice example of a sentiment analysis implementation; however, it does not really work with spaCy v3. I managed to get it working with spaCy's later versions like this (feel free to use it!):
import spacy
import os
import random
from spacy.util import minibatch, compounding
import pandas as pd
from spacy.training.example import Example
def load_training_data(
data_directory: str = "../trainPath", split: float = 0.8, limit: int = 0
) -> tuple:
# Load from files
reviews = []
for label in ["pos", "neg"]:
labeled_directory = f"{data_directory}/{label}"
for review in os.listdir(labeled_directory):
if review.endswith(".txt"):
with open(f"{labeled_directory}/{review}", encoding="utf-8") as f:
text = f.read()
text = text.replace("<br />", "\n\n")
if text.strip():
spacy_label = {
"cats": {
"pos": "pos" == label,
"neg": "neg" == label,
}
}
reviews.append((text, spacy_label))
random.shuffle(reviews)
if limit:
reviews = reviews[:limit]
split = int(len(reviews) * split)
return reviews[:split], reviews[split:]
def train_model(training_data: list, test_data: list, iterations: int = 20):
print(f"Loading the nlp model")
# nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_lg")
nlp = spacy.blank("en")
print(f"Adding textcat pipe")
if "textcat" not in nlp.pipe_names:
# textcat = nlp.add_pipe("textcat",config=single_label_cnn_config, last=True)
textcat = nlp.add_pipe("textcat", last=True)
else:
print("Textcat pipe already present.")
textcat = nlp.get_pipe("texcat")
textcat.add_label("pos")
textcat.add_label("neg")
print()
# optimizer = nlp.create_optimizer()
    optimizer = nlp.begin_training()  # deprecated alias of nlp.initialize() in spaCy v3
print("Beginning training")
print("Loss\tPrecision\tRecall\tF-score")
    batch_sizes = compounding(4.0, 32.0, 1.001)  # batch size grows from 4 toward 32
print("Shuffling the data:")
random.shuffle(training_data)
# for datum in training_data:
# print(datum)
for i in range(iterations):
print(f"Training iteration {i}")
loss = {}
random.shuffle(training_data)
batches = minibatch(training_data, size=batch_sizes)
for batch in batches:
examples = []
for text, labels in batch:
examples.append(Example.from_dict(nlp(text), labels))
nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)
evaluation_results = evaluate_model(tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data)
print(
f"{loss['textcat']}\t{evaluation_results['precision']}"
f"\t{evaluation_results['recall']}"
f"\t{evaluation_results['f-score']}"
)
# print(f"Value of optimizer averages: {optimizer.averages}")
# with textcat.model.use_params(optimizer.averages):
# print("test")
# Save model
with nlp.use_params(optimizer.averages):
nlp.to_disk("model_artifacts")
def evaluate_model(
tokenizer, textcat, test_data: list
) -> dict:
reviews, labels = zip(*test_data)
reviews = (tokenizer(review) for review in reviews)
true_positives = 0
false_positives = 1e-8 # Can't be 0 because of presence in denominator
true_negatives = 0
false_negatives = 1e-8
for i, review in enumerate(textcat.pipe(reviews)):
true_label = labels[i]
pos_label = true_label["cats"].get("pos", False)
        # Score the "pos" category directly so that reviews predicted as
        # negative still count toward the true/false negative tallies
        score = review.cats["pos"]
if score >= 0.5 and pos_label:
true_positives += 1
elif score >= 0.5 and not pos_label:
false_positives += 1
elif score < 0.5 and not pos_label:
true_negatives += 1
elif score < 0.5 and pos_label:
false_negatives += 1
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
if precision + recall == 0:
f_score = 0
else:
f_score = 2 * (precision * recall) / (precision + recall)
return {"precision": precision, "recall": recall, "f-score": f_score}
def test_model(input_data):
# Load saved trained model
loaded_model = spacy.load("model_artifacts")
# Generate prediction
parsed_text = loaded_model(input_data)
# Determine prediction to return
if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
prediction = "Positive"
score = parsed_text.cats["pos"]
else:
prediction = "Negative"
score = parsed_text.cats["neg"]
print(
f"Review text: {input_data}\nPredicted sentiment: {prediction}"
f"\tScore: {score}"
)
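For reference, this is roughly how I invoke the functions above (the limit and the review text are arbitrary, and the default data_directory has to point at a pos/neg folder layout):
if __name__ == "__main__":
    train, test = load_training_data(limit=2500)
    train_model(train, test)
    test_model("This movie was surprisingly well made and I enjoyed every minute of it.")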
The only issue is that it only works with blank nlp models. If I try to load a model with more components than just textcat, this line:
nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)
raises the following error: "ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset"
I would like to add an entity-ruler component to the pipeline, so I was wondering whether anyone has an idea how to modify this so that it works with pre-trained models.
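For context, the kind of pipeline I'm aiming for looks roughly like this (the pattern is just a placeholder):
import spacy

nlp = spacy.load("en_core_web_sm")
# rule-based entities on top of the statistical NER
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
# ...plus the textcat component trained as in the script above
textcat = nlp.add_pipe("textcat", last=True)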
I tried using
optimizer = nlp.create_optimizer()
and
optimizer = nlp.resume_training()
instead of begin_training() with a non-blank model, as well as nlp.make_doc(text) instead of nlp(text) to obtain a doc object containing only the tokens, but the error persists.
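Concretely, the variants looked roughly like this (both raise the same error as soon as nlp is a loaded model rather than spacy.blank("en")):
nlp = spacy.load("en_core_web_sm")
textcat = nlp.add_pipe("textcat", last=True)
textcat.add_label("pos")
textcat.add_label("neg")

# variant 1: plain optimizer
optimizer = nlp.create_optimizer()
# variant 2: resume_training, meant for updating loaded pipelines
optimizer = nlp.resume_training()

# ...and inside the batch loop, tokens-only docs:
examples.append(Example.from_dict(nlp.make_doc(text), labels))

Any suggestions?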