I am using the preprocessing script below to create annotations from my pattern, which the model will then be trained on. Although spacy debug data reports all of the data as valid and containing annotated spans, the model will not train on it, and the scorer outputs zero values across all metrics.
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("span_ruler", name="ruler")

# Match a run of four or more DATE tokens followed by the literal (the "Effective Date")
patterns = [{"label": "Effective Date", "pattern": [
    {"ENT_TYPE": "DATE", "OP": "{4,}"},
    {"TEXT": "("},
    {"TEXT": "the"},
    {"TEXT": '"'},
    {"TEXT": "Effective"},
    {"TEXT": "Date"},
    {"TEXT": '"'},
    {"TEXT": ")"},
]}]
ruler.add_patterns(patterns)

for i in range(100, 510):
    with open(f"inputfiles/ ({i}).txt", "r", encoding="utf8") as text_file:
        text = text_file.read()
    doc = nlp(text)
    # one doc per .spacy file; the corpus reader picks up every file in the train directory
    db = DocBin()
    for span in doc.spans["ruler"]:
        print(span.text, span.label_)
    db.add(doc)
    db.to_disk(f"./train/{i}.spacy")
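As a sanity check (a minimal sketch; the file name just reuses the first index from the loop above), the serialized docs can be read back to confirm that the spans really are stored under the "ruler" key that spancat's spans_key expects:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# DocBin stores the strings it needs, so a blank vocab is enough to inspect the docs
docs = list(DocBin().from_disk("./train/100.spacy").get_docs(nlp.vocab))
for doc in docs:
    for key, group in doc.spans.items():
        print(key, [(span.text, span.label_) for span in group])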
With the data passing spacy debug data, I am led to believe that something is wrong with my config instead:
[paths]
train = "train"
dev = "dev"
vectors = null
init_tok2vec = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "en"
pipeline = ["tok2vec", "span_ruler", "spancat"]
batch_size = 10
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.span_ruler]
factory = "span_ruler"
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "ruler"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [11,12,13,14,15]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = ["span_ruler"]
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
spans_sc_f = null
spans_sc_p = 0.0
spans_sc_r = 0.0
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.components.span_ruler]
[initialize.components.span_ruler.patterns]
@readers = "srsly.read_jsonl.v1"
path = "span_ruler/patterns.jsonl"
[initialize.tokenizer]
Any ideas what could cause this? Is there a specific component or annotating component that needs to be added to the config so that this works? I previously managed to train a model using the sentencizer as an annotating component, since I was annotating labels at the sentence level, but now I am using a simpler pattern that matches a date expression followed by the literal text (the "Effective Date"), rather than whole sentences.
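For illustration, this is the kind of text the pattern is meant to match. The sentence is invented (a hypothetical contract opener), it reuses the nlp pipeline with the span ruler from the preprocessing script above, and the match only fires if the NER component tags the date phrase as DATE:

sample = 'This Agreement is made on the 1st day of January, 2022 (the "Effective Date").'
doc = nlp(sample)
for span in doc.spans["ruler"]:
    print(span.text, span.label_)  # expected: one span labelled "Effective Date"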
I have been trying to figure this out for about two weeks now, so any help is greatly appreciated!
The score weights need to be updated to refer to the custom spans key. The spancat scorer names its metrics after spans_key, so with spans_key = "ruler" it reports spans_ruler_f, spans_ruler_p, and spans_ruler_r; the [training.score_weights] block above only references the default spans_sc_* names, which never exist in this pipeline, so the overall score is always zero:
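[training.score_weights]
spans_ruler_f = 1.0
spans_ruler_p = 0.0
spans_ruler_r = 0.0

The 1.0/0.0/0.0 split mirrors spaCy's default weighting for spancat: the span F-score alone drives the overall score used to select the best model, while precision and recall are still logged.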