I've found this nice example of a sentiment analysis implementation; however, it does not really work with spaCy v3. I managed to get it working with spaCy's later versions like this (feel free to use it!):
import spacy
import os
import random
from spacy.util import minibatch, compounding
import pandas as pd
from spacy.training.example import Example
def load_training_data(
data_directory: str = "../trainPath", split: float = 0.8, limit: int = 0
) -> tuple:
# Load from files
reviews = []
for label in ["pos", "neg"]:
labeled_directory = f"{data_directory}/{label}"
for review in os.listdir(labeled_directory):
if review.endswith(".txt"):
with open(f"{labeled_directory}/{review}", encoding="utf-8") as f:
text = f.read()
text = text.replace("<br />", "\n\n")
if text.strip():
spacy_label = {
"cats": {
"pos": "pos" == label,
"neg": "neg" == label,
}
}
reviews.append((text, spacy_label))
random.shuffle(reviews)
if limit:
reviews = reviews[:limit]
split = int(len(reviews) * split)
return reviews[:split], reviews[split:]
def train_model(training_data: list, test_data: list, iterations: int = 20):
print(f"Loading the nlp model")
# nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_lg")
nlp = spacy.blank("en")
print(f"Adding textcat pipe")
if "textcat" not in nlp.pipe_names:
# textcat = nlp.add_pipe("textcat",config=single_label_cnn_config, last=True)
textcat = nlp.add_pipe("textcat", last=True)
else:
print("Textcat pipe already present.")
textcat = nlp.get_pipe("texcat")
textcat.add_label("pos")
textcat.add_label("neg")
print()
# optimizer = nlp.create_optimizer()
    optimizer = nlp.begin_training()  # deprecated alias of nlp.initialize() in spaCy v3
print("Beginning training")
print("Loss\tPrecision\tRecall\tF-score")
    batch_sizes = compounding(4.0, 32.0, 1.001)  # batch size grows from 4 toward 32
print("Shuffling the data:")
random.shuffle(training_data)
# for datum in training_data:
# print(datum)
for i in range(iterations):
print(f"Training iteration {i}")
loss = {}
random.shuffle(training_data)
batches = minibatch(training_data, size=batch_sizes)
for batch in batches:
examples = []
for text, labels in batch:
examples.append(Example.from_dict(nlp(text), labels))
nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)
evaluation_results = evaluate_model(tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data)
print(
f"{loss['textcat']}\t{evaluation_results['precision']}"
f"\t{evaluation_results['recall']}"
f"\t{evaluation_results['f-score']}"
)
# print(f"Value of optimizer averages: {optimizer.averages}")
# with textcat.model.use_params(optimizer.averages):
# print("test")
# Save model
with nlp.use_params(optimizer.averages):
nlp.to_disk("model_artifacts")
def evaluate_model(
tokenizer, textcat, test_data: list
) -> dict:
reviews, labels = zip(*test_data)
reviews = (tokenizer(review) for review in reviews)
true_positives = 0
false_positives = 1e-8 # Can't be 0 because of presence in denominator
true_negatives = 0
false_negatives = 1e-8
for i, review in enumerate(textcat.pipe(reviews)):
true_label = labels[i]
pos_label = true_label["cats"].get("pos", False)
        # Score the "pos" category directly so that reviews predicted as
        # negative still count toward the true/false negative tallies
        score = review.cats["pos"]
if score >= 0.5 and pos_label:
true_positives += 1
elif score >= 0.5 and not pos_label:
false_positives += 1
elif score < 0.5 and not pos_label:
true_negatives += 1
elif score < 0.5 and pos_label:
false_negatives += 1
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
if precision + recall == 0:
f_score = 0
else:
f_score = 2 * (precision * recall) / (precision + recall)
return {"precision": precision, "recall": recall, "f-score": f_score}
def test_model(input_data):
# Load saved trained model
loaded_model = spacy.load("model_artifacts")
# Generate prediction
parsed_text = loaded_model(input_data)
# Determine prediction to return
if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
prediction = "Positive"
score = parsed_text.cats["pos"]
else:
prediction = "Negative"
score = parsed_text.cats["neg"]
print(
f"Review text: {input_data}\nPredicted sentiment: {prediction}"
f"\tScore: {score}"
)
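For reference, this is roughly how I invoke the functions above (the limit and the review text are arbitrary, and the default data_directory has to point at a pos/neg folder layout):
if __name__ == "__main__":
    train, test = load_training_data(limit=2500)
    train_model(train, test)
    test_model("This movie was surprisingly well made and I enjoyed every minute of it.")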
The only issue is that it only works with blank nlp models. If I try to load a model with more components than just textcat, this line:
nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)
raises the following error: "ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset"
I would like to add an entity-ruler component to the pipeline, so I was wondering whether anyone has an idea how to modify this so that it works with pre-trained models.
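For context, the kind of pipeline I'm aiming for looks roughly like this (the pattern is just a placeholder):
import spacy

nlp = spacy.load("en_core_web_sm")
# rule-based entities on top of the statistical NER
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
# ...plus the textcat component trained as in the script above
textcat = nlp.add_pipe("textcat", last=True)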
I tried using
optimizer = nlp.create_optimizer()
and
optimizer = nlp.resume_training()
instead of begin_training() with a non-blank model, as well as nlp.make_doc(text) instead of nlp(text) to obtain a doc object containing only the tokens, but the error persists.
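Concretely, the variants looked roughly like this (both raise the same error as soon as nlp is a loaded model rather than spacy.blank("en")):
nlp = spacy.load("en_core_web_sm")
textcat = nlp.add_pipe("textcat", last=True)
textcat.add_label("pos")
textcat.add_label("neg")

# variant 1: plain optimizer
optimizer = nlp.create_optimizer()
# variant 2: resume_training, meant for updating loaded pipelines
optimizer = nlp.resume_training()

# ...and inside the batch loop, tokens-only docs:
examples.append(Example.from_dict(nlp.make_doc(text), labels))

Any suggestions?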