Python regex capturing group extension

105 views Asked by At

I am using named capturing groups, written (?P<name>...), with a list of verbs and word stems related to the coronavirus pandemic.

import regex
import pandas as pd


# Sample frame: one numeric id and one short sentence per row.
data = {
    'id': [1, 2, 3, 4, 5],
    'text': [
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
}
df = pd.DataFrame(data)

def covid_lang(text):
    """Collect predicate/subject pairs found in ``text``.

    Two passes are made with ``regex.finditer``. The first looks for a
    predicate stem followed, without an intervening ``.``, by a subject
    stem; the second looks for a subject stem followed by a predicate
    stem. Every match contributes a two-item list whose items are in the
    order they were searched for.

    Returns:
        A list of two-item lists; empty when nothing matches.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['Corona', 'corona', 'Covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_alt = "|".join(predicates)
    subject_alt = "|".join(subjects)

    # NOTE: the trailing [a-z]* sits inside the alternation, so it extends
    # only the last alternative ('outbreak'), not every subject stem.
    predicate_first = fr'(?<=\b(?P<predicate>{predicate_alt}))[^\.]*(?P<subject>{subject_alt}[a-z]*)'
    subject_first = fr'\b(?P<subject>{subject_alt})[^\.]*(?<=\b(?P<predicate>{predicate_alt}))'

    # Predicate-before-subject matches come first in the output ...
    result = [
        [hit.group('predicate'), hit.group('subject')]
        for hit in regex.finditer(predicate_first, text, regex.S)
    ]
    # ... followed by subject-before-predicate matches.
    result.extend(
        [hit.group('subject'), hit.group('predicate')]
        for hit in regex.finditer(subject_first, text, regex.S)
    )
    return result

# Run covid_lang on every entry of the 'text' column and store the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

When there is a match, I would like to return as the subject not only the stem of the word but the whole word (i.e., 'pandemic' and 'pandemy' instead of 'pandem'). I have tried adding [a-z]* right after the list of words, so that the capturing group only stops when the word ends, but it does not change anything.

Also, is it possible to combine the two queries (predicate before subject, subject before predicate) into a single query? I've tried using (p1)|(p2), but it didn't work with named capturing groups.

Lastly, is it possible to match both uppercase and lowercase spellings, such as Corona and corona, with a single entry in the word list?

1

There is 1 answer

0
Jesper On BEST ANSWER

This should do all three:

from xml.etree.ElementPath import prepare_descendant

import regex
import pandas as pd

# Sample input: five ids paired with five short sentences.
data = dict(
    id=[1, 2, 3, 4, 5],
    text=[
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
)
df = pd.DataFrame(data)

def expand_word(word):
    """Return a regex fragment: a capturing group holding ``word`` followed by ``[a-z]*``.

    ``word`` is inserted as-is (it is not escaped), so any regex syntax it
    contains stays active.
    """
    template = '({}[a-z]*)'
    return template.format(word)

def construct_named_group_from_list_of_words(word_type, word_list):
    """Build a named group that matches any expanded word from ``word_list``.

    Args:
        word_type: Name given to the group, i.e. the ``name`` in ``(?P<name>...)``.
        word_list: Word stems; each one is passed through ``expand_word``.

    Returns:
        The pattern string ``(?P<word_type>alt1|alt2|...)``.
    """
    alternatives = "|".join(expand_word(stem) for stem in word_list)
    return fr'(?P<{word_type}>{alternatives})'

def covid_lang(text):
    """Return ``[predicate, subject]`` pairs matched in ``text``.

    A single pattern covering both word orders (predicate before subject,
    and subject before predicate) is searched case-insensitively. Both
    alternatives reuse the group names ``predicate`` and ``subject``, so
    each pair is reported predicate first regardless of which order matched.

    Returns:
        A list of two-item lists; empty when nothing matches.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['corona', 'covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_group = construct_named_group_from_list_of_words("predicate", predicates)
    subject_group = construct_named_group_from_list_of_words("subject", subjects)

    # Each word order gets its own sub-pattern; neither spans a literal '.'.
    predicate_first = fr'(?<=\b{predicate_group})[^\.]*{subject_group}'
    subject_first = fr'\b{subject_group}[^\.]*(?<=\b{predicate_group})'
    either_order = fr'({predicate_first})|({subject_first})'

    flags = regex.S | regex.IGNORECASE
    return [
        [hit.group('predicate'), hit.group('subject')]
        for hit in regex.finditer(either_order, text, flags)
    ]


# Run covid_lang on every entry of the 'text' column and store the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

# Print the frame, including the new 'result' column.
print(df)

Output:

   id                                      text                  result
0   1                  The pandemy is spreading  [[spreading, pandemy]]
1   2                   He is fighting Covid-19     [[fight, Covid-19]]
2   3                The pandemic virus spreads   [[spreads, pandemic]]
3   4  This sentence is about a different topic                      []
4   5                How do we stop the virus ?         [[stop, virus]]

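However, I'm not sure whether you always want the predicate to be output first. If not (that is, if each pair should list the two words in the order in which they appear in the text), this should do it: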

from xml.etree.ElementPath import prepare_descendant

import regex
import pandas as pd


# Example rows: an integer id and the sentence to be searched.
data = {
    'id': [1, 2, 3, 4, 5],
    'text': [
        'The pandemy is spreading',
        'He is fighting Covid-19',
        'The pandemic virus spreads',
        'This sentence is about a different topic',
        'How do we stop the virus ?',
    ],
}
df = pd.DataFrame(data)

def expand_word(word):
    """Wrap ``word`` plus an optional run of letters ``[a-z]*`` in a capturing group.

    The word is placed into the pattern unescaped.
    """
    return '({}[a-z]*)'.format(word)

def construct_named_group_from_list_of_words(word_type, word_list):
    """Return a named-group pattern offering every word in ``word_list`` as an alternative.

    Args:
        word_type: Group name used in ``(?P<...>)``.
        word_list: Word stems, each expanded with ``expand_word`` first.

    Returns:
        A string of the form ``(?P<word_type>alt1|alt2|...)``.
    """
    joined_alternatives = "|".join(map(expand_word, word_list))
    return fr'(?P<{word_type}>{joined_alternatives})'

def covid_lang(text):
    """Return word pairs matched in ``text``, each in the order it was searched for.

    The predicate-before-subject pattern is applied first and yields
    ``[predicate, subject]`` lists; the subject-before-predicate pattern is
    applied second and yields ``[subject, predicate]`` lists. Both searches
    are case-insensitive.

    Returns:
        A list of two-item lists; empty when nothing matches.
    """
    predicates = ['avoid', 'contain', 'track', 'spread', 'contact', 'stop', 'combat', 'fight']
    subjects = ['corona', 'covid-19', 'epidem', 'infect', 'virus', 'pandem', 'disease', 'outbreak']

    predicate_group = construct_named_group_from_list_of_words("predicate", predicates)
    subject_group = construct_named_group_from_list_of_words("subject", subjects)

    # Neither pattern lets the gap between the two words span a literal '.'.
    predicate_first = fr'(?<=\b{predicate_group})[^\.]*{subject_group}'
    subject_first = fr'\b{subject_group}[^\.]*(?<=\b{predicate_group})'

    flags = regex.S | regex.IGNORECASE

    result = [
        [hit.group('predicate'), hit.group('subject')]
        for hit in regex.finditer(predicate_first, text, flags)
    ]
    result.extend(
        [hit.group('subject'), hit.group('predicate')]
        for hit in regex.finditer(subject_first, text, flags)
    )
    return result


# Run covid_lang on every entry of the 'text' column and store the returned list in a new 'result' column.
df['result'] = df['text'].apply(covid_lang)

# Print the frame, including the new 'result' column.
print(df)

Output:

   id                                      text                  result
0   1                  The pandemy is spreading  [[pandemy, spreading]]
1   2                   He is fighting Covid-19     [[fight, Covid-19]]
2   3                The pandemic virus spreads   [[pandemic, spreads]]
3   4  This sentence is about a different topic                      []
4   5                How do we stop the virus ?         [[stop, virus]]