Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data

654 views

I want to train new NER entities with the following code:

def train_spacy_model(data, model='en_core_web_trf', n_iter=50):
    """Update the NER component of an existing spaCy v3 pipeline.

    Args:
        data: iterable of ``(text, annotations)`` pairs in spaCy's training
            format, e.g. ``("...", {"entities": [(start, end, label)]})``.
        model: name of an installed spaCy pipeline to load and update.
        n_iter: number of passes over the training data.

    Returns:
        The updated ``Language`` object.

    Raises:
        ValueError: if ``model`` is None — this function only updates an
            existing pipeline; it does not build one from scratch.
    """
    if model is None:
        # The original code fell through with `nlp` undefined (NameError);
        # fail fast with an explicit message instead.
        raise ValueError("A base model name is required, e.g. 'en_core_web_trf'.")
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    ner = nlp.get_pipe("ner")

    # Build Example objects and register every entity label found in the
    # data; without add_label the NER head has no output class for new
    # entity types.
    examples = []
    for text, annotations in data:
        examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for _start, _end, label in annotations.get("entities", []):
            ner.add_label(label)

    # NOTE(review): initialize() re-initializes weights of the loaded
    # pipeline. For incrementally updating a pretrained model, spaCy
    # recommends nlp.resume_training() instead — kept as-is to preserve
    # the original behavior.
    nlp.initialize(lambda: examples)

    # Freeze every other component so only the NER weights are updated.
    # select_pipes is the spaCy v3 name for the deprecated disable_pipes.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.select_pipes(disable=other_pipes):
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, size=compounding(4.0, 64.0, 1.2)):
                # v3 API: update() takes the Example batch directly. The old
                # v2-style `texts, annotations = zip(*batch)` was dead code
                # (and fails on Example objects); removed.
                nlp.update(
                    batch,
                    drop=0.20,
                    losses=losses,
                )
            print("Losses", losses)

    return nlp

# Train on the prepared dataset; `dataset` is defined elsewhere in the
# notebook as a list of (text, annotations) pairs in spaCy training format.
nlp = train_spacy_model(data=dataset, n_iter=30)

I keep getting this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[296], line 40
     36             print("Losses", losses)
     38     return nlp
---> 40 nlp = train_spacy_model(data=no_verlaps_dataset, n_iter=30)
     42 # save model to output directory
     43 output_dir = '_data/models/actor_ner'

Cell In[296], line 16, in train_spacy_model(data, model, n_iter)
     14 for text, annotations in TRAIN_DATA:
     15     examples.append(Example.from_dict(nlp.make_doc(text), annotations))
---> 16 nlp.initialize(lambda: examples)
     17     # for ent in annotations.get('entities'):
     18     #     ner.add_label(ent[2])
     20 pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:1290, in Language.initialize(self, get_examples, sgd)
   1288 config = self.config.interpolate()
   1289 # These are the settings provided in the [initialize] block in the config
-> 1290 I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
   1291 before_init = I["before_init"]
   1292 if before_init is not None:

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:746, in registry.resolve(cls, config, schema, overrides, validate)
    737 @classmethod
    738 def resolve(
    739     cls,
   (...)
    744     validate: bool = True,
    745 ) -> Dict[str, Any]:
--> 746     resolved, _ = cls._make(
    747         config, schema=schema, overrides=overrides, validate=validate, resolve=True
    748     )
    749     return resolved

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:795, in registry._make(cls, config, schema, overrides, resolve, validate)
    793 if not is_interpolated:
    794     config = Config(orig_config).interpolate()
--> 795 filled, _, resolved = cls._fill(
    796     config, schema, validate=validate, overrides=overrides, resolve=resolve
    797 )
    798 filled = Config(filled, section_order=section_order)
    799 # Check that overrides didn't include invalid properties not in config

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:867, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
    864     getter = cls.get(reg_name, func_name)
    865     # We don't want to try/except this and raise our own error
    866     # here, because we want the traceback if the function fails.
--> 867     getter_result = getter(*args, **kwargs)
    868 else:
    869     # We're not resolving and calling the function, so replace
    870     # the getter_result with a Promise class
    871     getter_result = Promise(
    872         registry=reg_name, name=func_name, args=args, kwargs=kwargs
    873     )

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:108, in load_lookups_data(lang, tables)
    105 @registry.misc("spacy.LookupsDataLoader.v1")
    106 def load_lookups_data(lang, tables):
    107     util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
--> 108     lookups = load_lookups(lang=lang, tables=tables)
    109     return lookups

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/lookups.py:30, in load_lookups(lang, tables, strict)
     28 if lang not in registry.lookups:
     29     if strict and len(tables) > 0:
---> 30         raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
     31     return lookups
     32 data = registry.lookups.get(lang)

ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

I have installed the package:

pip install spacy-lookups-data
Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 MB 25.9 MB/s eta 0:00:00

But it still persists.

How can I fix this error so I can continue updating the model to detect new entities for NER tasks?

EDIT: The error went away when I restarted the kernel of the Jupyter notebook this code was running in.

2

There are 2 answers

0
mCs On BEST ANSWER

I've been running this code in Jupyter Notebook and the error persisted until I restarted the kernel. So the answer is to restart the notebook kernel.

1
aab On

To answer the narrow question: you probably need to restart your runtime in order for the tables in spacy-lookups-data to be registered.


To answer the question you didn't ask: the quoted script looks like it was only partially updated from v2 and I wouldn't recommend using it, in particular not for en_core_web_trf. One recommended way to update ner components in spacy v3 pipelines is shown in this demo project:

https://github.com/explosion/projects/tree/v3/pipelines/ner_demo_update

It handles a lot of the pipeline/config/training details for you in order to update ner without affecting the performance of the other components in the pipeline. A walkthrough of how to run a project is shown in the v2->v3 examples README.