I am using spaCy's NER model "en_core_web_lg". This is the basic function I have:
def find_pii_in_text(text):
    """Run spaCy NER over *text* and return a dict of flagged PII entities.

    HTML tags are stripped and runs of non-ASCII characters are collapsed
    to a single space before the model sees the text.  The spaCy pipeline
    is loaded lazily into the module-level ``nlp`` global so repeated
    calls reuse a single model instance.

    Args:
        text: raw input string (may contain HTML and non-ASCII bytes).

    Returns:
        dict produced by ``process_flagged_data`` mapping a group key to
        ``{"n<idx>": {"text": ..., "type": ...}}`` entries.
    """
    global nlp
    # `if not nlp:` raises NameError when the global was never bound, so
    # the lazy-load guard must probe the namespace instead of the name.
    if globals().get("nlp") is None:
        nlp = spacy.load("en_core_web_lg")
    # Strip HTML tags. (re caches compiled patterns internally, so there
    # is no need to rebuild a compiled pattern object on every call.)
    text = re.sub(r"<.*?>", "", text)
    # Collapse runs of non-ASCII characters into one space.
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # process_flagged_data mutates and returns the dict we hand it, so the
    # original update-into-itself round trip was redundant.
    return process_flagged_data(nlp(text), {})
def process_flagged_data(doc_list, pii_dict=None, pii_loc="flagged"):
    """Collect NER entities from *doc_list* into *pii_dict*.

    Args:
        doc_list: a spaCy ``Doc`` — anything exposing an ``.ents``
            iterable whose items carry ``.text`` and ``.label_``.
        pii_dict: optional dict to merge results into.  A fresh dict is
            created when omitted.  (The original ``pii_dict={}`` mutable
            default leaked entities across unrelated calls.)
        pii_loc: key under which entities are grouped.  (``pii_loc`` was
            an undefined name in the original — a guaranteed NameError as
            soon as any entity was found.)

    Returns:
        *pii_dict*, with ``pii_dict[pii_loc]["n<idx>"]`` set to
        ``{"text": entity_text, "type": entity_label}`` for each entity.
    """
    if pii_dict is None:
        pii_dict = {}
    for idx, entity in enumerate(doc_list.ents):
        # NOTE(review): the original had an "# Ignore some labels" comment
        # but no filter, and built an unused f-string here; the intended
        # label filter presumably belongs at this point — confirm with the
        # author before adding one.
        pii_dict.setdefault(pii_loc, {})[f"n{idx}"] = {
            "text": entity.text,
            "type": entity.label_,
        }
    return pii_dict
Now, when I pass sentences to this function, I get false positives in the form of fragments of words.
For example,
- the word 'movement' was flagged as 'movem' : PERSON.
- the word 'beginner' was flagged as 'begi' : PERSON.
Any idea why this is happening? Is there a configuration option to tell spaCy not to split words into sub-word tokens?