SpaCy Sentiment Analysis: Non-blank NLP model raises error during training update

I found this nice example of a sentiment analysis implementation; however, it does not really work with spaCy v3. I managed to get it working with spaCy 3.x like this (feel free to use it!):

import spacy
import os
import random
from spacy.util import minibatch, compounding
import pandas as pd
from spacy.training.example import Example


def load_training_data(
    data_directory: str = "../trainPath", split: float = 0.8, limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}", encoding="utf-8") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label,
                            }
                        }
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]

def train_model(training_data: list, test_data: list, iterations: int = 20):
    print(f"Loading the nlp model")
    # nlp = spacy.load("en_core_web_sm")
    # nlp = spacy.load("en_core_web_lg")
    nlp = spacy.blank("en")


    print(f"Adding textcat pipe")
    if "textcat" not in nlp.pipe_names:
        # textcat = nlp.add_pipe("textcat",config=single_label_cnn_config, last=True)
        textcat = nlp.add_pipe("textcat",  last=True)
    else:
        print("Textcat pipe already present.")
        textcat = nlp.get_pipe("texcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    print()
    # optimizer = nlp.create_optimizer()
    optimizer = nlp.begin_training()
    print("Beginning training")
    print("Loss\tPrecision\tRecall\tF-score")
    batch_sizes = compounding(4.0, 32.0, 1.001)
    print("Shuffling the data:")
    random.shuffle(training_data)
    # for datum in training_data:
    #     print(datum)
    for i in range(iterations):
        print(f"Training iteration {i}")
        loss = {}
        random.shuffle(training_data)
        batches = minibatch(training_data, size=batch_sizes)
        for batch in batches:
            examples = []
            for text, labels in batch:
                examples.append(Example.from_dict(nlp(text), labels))
            nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)

        evaluation_results = evaluate_model(tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data)
        print(
            f"{loss['textcat']}\t{evaluation_results['precision']}"
            f"\t{evaluation_results['recall']}"
            f"\t{evaluation_results['f-score']}"
        )
    # print(f"Value of optimizer averages: {optimizer.averages}")
        # with textcat.model.use_params(optimizer.averages):
        #     print("test")
    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]
        pos_label = true_label["cats"].get("pos", False)

        # Score the review using the probability assigned to the "pos" label;
        # with two mutually exclusive labels this is enough to fill the
        # confusion matrix below.
        score = review.cats["pos"]

        if score >= 0.5 and pos_label:
            true_positives += 1
        elif score >= 0.5 and not pos_label:
            false_positives += 1
        elif score < 0.5 and not pos_label:
            true_negatives += 1
        elif score < 0.5 and pos_label:
            false_negatives += 1

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

def test_model(input_data):
    #  Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )
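
For completeness, this is roughly how I drive these functions; the limit and the sample review below are arbitrary:

if __name__ == "__main__":
    train, test = load_training_data(limit=2500)
    train_model(train, test, iterations=20)
    test_model("Transcendently beautiful movie, I loved every minute of it.")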

The only issue is that it works only with blank nlp models. If I try to load a model with more than the textcat component, this line:

                nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)

Raises the following error: "ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset"

I would like to add an entity ruler component to the pipeline, so I was wondering whether anyone has an idea how to modify this so that it works with pre-trained models.
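
To be concrete, the failing variant only changes how the nlp object is created (the rest of train_model stays exactly as above), e.g.:

    # Failing variant: start from a pre-trained pipeline instead of a blank one
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.add_pipe("textcat", last=True)
    textcat.add_label("pos")
    textcat.add_label("neg")
    optimizer = nlp.begin_training()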

I tried to use

        optimizer = nlp.create_optimizer()

And

        optimizer = nlp.resume_training()

Instead of begin_training(), coupled with a non-blank model, as well as using nlp.make_doc(text) instead of nlp(text) to obtain a Doc containing only the tokens, but the error persists.
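
For clarity, the make_doc variant of the batch loop looks roughly like this (same labels dictionaries as above):

    for batch in batches:
        examples = []
        for text, labels in batch:
            # Tokenize only; don't run the pipeline components on the raw text
            doc = nlp.make_doc(text)
            examples.append(Example.from_dict(doc, labels))
        nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)

Any suggestions?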
