parsing ingredient list with parsimonious

238 views Asked by At
from parsimonious.grammar import Grammar

# Attempt 1. `item` is one or more of either `ingredient '(' ingredient ')'`
# or a bare `ingredient`. An `ingredient` is a run of (optional "<b>" tag,
# one or more words, optional whitespace, optional bracketed percentage such
# as "(43.3%)"), optionally followed by a comma and whitespace.
# NOTE: `grammar` is rebound by the later `grammar = Grammar(...)` assignments
# in this file, so this object is not the one `grammar.parse` ends up using.
grammar = Grammar(
     '''
     # item = ( ( ingredient+ '(' ingredient+ ')' comma ws?) + / comma+ / ws+ / ingredient )+
     item = ((ingredient '(' ingredient ')') / ingredient )+

     ingredient = ( ( bold_tag? word+ ws? percent? )+ comma? ws? )
     word = ~"[A-Z0-9:]+"i
     ws = ~"\s*"+
     comma = ","
     bold_tag = "<b>"
     percent = ~"\([\d\.%]+\)"
     ''')

# Attempt 2. The first rule, `ingredient_item`, requires, in order: `words`,
# an opening bracket, zero or more `subingredient`s, a closing bracket and a
# comma. A `subingredient` is plain text, a "<b>" tag, or a bracketed
# percentage. `text_preceding_colon` and `space` are defined but are not
# referenced by any other rule in this grammar.
# NOTE: `grammar` is rebound again by the next `grammar = Grammar(...)`
# assignment, so this object is not the one `grammar.parse` ends up using.
grammar = Grammar(
     """
     ingredient_item = words open_bracket (subingredient)* closed_bracket comma
     text_preceding_colon = (text colon)*
     text       = ~"[A-Z 0-9]*"i
     space = ~"[\\s]*"
     colon = ":"
     words = text+
     open_bracket = "("
     closed_bracket = ")"
     subingredient = text / b_tag / percentage
     percentage = open_bracket percentage_num closed_bracket
     percentage_num = ~"[0-9\.%]*"
     b_tag = "<b>"
     comma = ","
     """)

# Attempt 3 (the grammar actually used by `grammar.parse` below): a flat token
# grammar. `item` is one or more groups of tokens, each group optionally
# terminated by a comma; a token (`ingredient`) is a word, a "<b>" tag, a run
# of whitespace, or a bracketed percentage such as "(43.3%)".
#
# Why the alternatives are NOT optional: PEG choice (`/`) is ordered and takes
# the first alternative that succeeds. With `word? / bold_tag? / ...` the first
# alternative always succeeds (possibly matching nothing), so the remaining
# alternatives were never tried and the parse stalled at the first space.
# For the same reason `ws` must consume at least one character (`\s+`).
#
# Limitation: brackets are only recognised as percentages, so nested
# sub-ingredient lists such as "Cereal Grains (Whole Grain ... (28.3%), ...)"
# or codes such as "(B3)" are not handled by this grammar.
#
# A raw string keeps the regex backslashes (`\s`, `\(`, `\d`, `\.`) literal
# without relying on Python passing unknown escapes through.
grammar = Grammar(
     r'''
     item = (ingredient+ comma? )+
     ingredient = word / bold_tag / ws / percent
     comma = ','
     word = ~"[A-Z0-9:]+"i
     bold_tag = "<b>"
     ws = ~"\s+"
     percent = ~"\([\d\.%]+\)"
     '''
)

# Sample input: a flat, comma-separated ingredient list containing inline
# "<b>" markers and one bracketed percentage.
test_string = '''Partially Inverted <b>Brown <b>Sugar Syrup (43.3%), Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil'''

# Under Python 2 the literal above is a byte string and is decoded to text.
# Under Python 3 it is already `str`, which has no `.decode`, so decode only
# when the value really is bytes.
if isinstance(test_string, bytes):
    test_string = test_string.decode('utf-8')

# Parse with whichever `grammar` was bound last above.
tree = grammar.parse(test_string)

Hi, I'm trying to parse an ingredients list using parsimonious, without any luck. Above are all the grammars I've tried to write so far. I want to split the list on commas into its constituent ingredients, without splitting on the commas inside (possibly nested) brackets, as in the more complex test string below:

# Harder sample: the first entry carries a bracketed sub-ingredient list whose
# members have their own bracketed percentages; later entries include
# bracketed codes such as "(B3)" and free-text sentences.
test_string2 = '''Cereal Grains (Whole Grain <b>Oat Flour (28.3%), Whole Grain <b>Wheat (28.3%), Whole Grain <b>Barley Flour (17.1%), Whole Grain Maize Flour (2.0%), Whole Grain Rice Flour (2.0%)), Sugar, <b>Wheat Starch, Partially Inverted Brown Sugar Syrup, Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil, Colours: Carotene, Caramel and Annatto, Antioxidant: Tocopherols, Vitamins and Minerals: Vitamin C, Niacin (B3), Pantothenic Acid, Riboflavin (B2), Vitamin B6, Folic Acid, Vitamin D, Calcium Carbonate, Iron, To produce 100g of this product we have used 77.7g of Whole Grain, We guarantee every Nestlé Cereal with the green banner contains at least 8g of Whole Grain per serving'''

Each ingredient consists of one or more words and may also have a list of sub-ingredients enclosed in '(' and ')', a percentage, or a word like 'B2'.

0

There are 0 answers