Select specific range of elements from a python dictionary based on condition

141 views Asked by At

I have the following dictionary:

# Sample input: document id -> {image id: (page tag, text)}, listed in page order.
# Page tags: "FP" = first page, "LP" = last page, "Others" = any other page.
# The opening brace must sit on the assignment line; a bare `ip_dict =`
# followed by a newline is a SyntaxError.
ip_dict = {
    "doc_1" : {
                "img_1" : ("FP","some long text"),
                "img_2" : ("LP", "another long text"),
                "img_3" : ("Others", "long text"),
                "img_4" : ("Others", "some loong text"),
                "img_5" : ("FP", "one more text"),
                "img_6" : ("FP", "another one"),
                "img_7" : ("LP", "ANOTHER ONE"),
                "img_8" : ("Others", "some text"),
                "img_9" : ("Others", "some moretext"),
                "img_10" : ("FP", "more text"),
                "img_11" : ("Others", "whatever"),
                "img_12" : ("Others", "more whatever"),
                "img_13" : ("LP", "SoMe TeXt"),
                "img_14" : ("Others", "some moretext"),
                "img_15" : ("FP", "whatever"),
                "img_16" : ("Others", "whatever"),
                "img_17" : ("LP", "whateverrr")
            },

    "doc_2" : {
                "img_1" : ("FP", "text"),
                "img_2" : ("FP", "more text"),
                "img_3" : ("LP", "more more text"),
                "img_4" : ("Others", "some more"),
                "img_5" : ("Others", "text text"),
                "img_6" : ("FP", "more more text"),
                "img_7" : ("Others", "lot of text"),
                "img_8" : ("LP", "still more text")
            }

}

Here FP represents the first page and LP the last page. For all the docs I only want to extract the FP and LP. For the Others, if they lie between FP and LP only then extract them, as they represent the pages between FP and LP. If they lie outside FP and LP then ignore them. Also for FP which are not followed by a LP, treat them as a single page and extract them. So my output dictionary would look like:

# Expected output for the sample input: document id -> list of page groups.
# Each group is an "FP" page, any "Others" up to the closing "LP", and that
# "LP"; an "FP" that is never closed forms a group on its own.
# The opening brace must sit on the assignment line; a bare `op_dict =`
# followed by a newline is a SyntaxError.
op_dict = {
    "doc_1" : [
                {
                "img_1" : ("FP","some long text"),
                "img_2" : ("LP", "another long text")
                },

                {
                    "img_5" : ("FP", "one more text")
                },

                {
                    "img_6" : ("FP", "another one"),
                    "img_7" : ("LP", "ANOTHER ONE")
                },

                {
                    "img_10" : ("FP", "more text"),
                    "img_11" : ("Others", "whatever"),
                    "img_12" : ("Others", "more whatever"),
                    "img_13" : ("LP", "SoMe TeXt"),
                },

                {
                    "img_15" : ("FP", "whatever"),
                    "img_16" : ("Others", "whatever"),
                    "img_17" : ("LP", "whateverrr"),
                }
            ],


    "doc_2" : [

                {
                "img_1" : ("FP", "text")
                },

                {
                "img_2" : ("FP", "more text"),
                "img_3" : ("LP", "more more text")
                },

                {
                "img_6" : ("FP", "more more text"),
                "img_7" : ("Others", "lot of text"),
                "img_8" : ("LP", "still more text")
                },

            ]
}

As you can see, all the FP and LP have been extracted, but also those Others which are in between FP and LP have also been extracted and stored in a dictionary. Also those FP which are not followed by a LP have also been extracted.

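PS: An LP that is not preceded by an FP should likewise be treated as a single page and extracted on its own. For example: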

# Edge case: the document starts with an "LP" that has no preceding "FP".
# The opening brace must sit on the assignment line; a bare `ip_dict =`
# followed by a newline is a SyntaxError.
ip_dict = {
    "doc_1" : {
                "img_1" : ("LP","some long text"),
                "img_2" : ("Others", "another long text"),
                "img_3" : ("Others", "long text"),
                "img_4" : ("FP", "long text"),
                "img_5" : ("Others", "long text"),
                "img_6" : ("LP", "long text")
            }
}

# Expected result: the leading "LP" becomes a single-page group, and the
# "Others" that follow it (outside any FP..LP range) are dropped.
op_dict =     {
        "doc_1" : [{
                    "img_1" : ("LP","some long text")
                },
                    {
                    "img_4" : ("FP", "long text"),
                    "img_5" : ("Others", "long text"),
                    "img_6" : ("LP", "long text")
                    }
                  ]

              }

Any help is appreciated!

4

There are 4 answers

10
RomanPerekhrest On BEST ANSWER

With extended sequential logic:

def select_page_ranges(d: dict) -> dict:
    """Group each document's pages into first-page/last-page ranges.

    Args:
        d: Mapping of document id to a mapping of page id to a tuple whose
            first element is the page tag: ``'FP'`` (first page), ``'LP'``
            (last page) or anything else (an in-between page). Pages are
            processed in the mapping's iteration order.

    Returns:
        A dict mapping every document id to a list of groups (dicts of
        page id -> page tuple), in the order the groups start:

        * ``FP ... LP``: the FP, every page between, and the LP.
        * An FP that is followed by another FP, or by the end of the
          document, before any LP: a group holding only that FP.
        * An LP with no open FP before it: a group holding only that LP.

        Pages outside any FP..LP range are dropped.
    """
    res = {}
    for doc_key, pages in d.items():
        groups = []
        current = None    # pages collected since the latest unclosed FP
        first_key = None  # key of the FP that opened `current`
        for key, value in pages.items():
            mark = value[0]
            if mark == 'FP':
                if current is not None:
                    # The previous FP never met an LP: keep it as a single
                    # page and discard whatever was collected after it.
                    groups.append({first_key: current[first_key]})
                current = {key: value}
                first_key = key
            elif mark == 'LP':
                if current is None:
                    # Orphan LP: emit it on its own instead of attaching it
                    # to an already closed (or non-existent) group.
                    groups.append({key: value})
                else:
                    current[key] = value
                    groups.append(current)
                    current = None
            elif current is not None:
                current[key] = value
        if current is not None:
            # The document ended while an FP was still open.
            groups.append({first_key: current[first_key]})
        res[doc_key] = groups
    return res

# Print the grouped page ranges computed from the `ip_dict` sample input.
print(select_page_ranges(ip_dict))

{'doc_1': [{'img_1': ('FP', 'some long text'),
            'img_2': ('LP', 'another long text')},
           {'img_5': ('FP', 'one more text')},
           {'img_6': ('FP', 'another one'), 'img_7': ('LP', 'ANOTHER ONE')},
           {'img_61': ('FP', 'another one'), 'img_71': ('LP', 'ANOTHER ONE')},
           {'img_62': ('FP', 'another one'), 'img_72': ('LP', 'ANOTHER ONE')},
           {'img_54': ('FP', 'one more text')},
           {'img_540': ('FP', 'one more text')},
           {'img_541': ('FP', 'one more text')},
           {'img_13': ('FP', 'more text'),
            'img_14': ('Others', 'whatever'),
            'img_140': ('Others', 'whatever'),
            'img_141': ('Others', 'whatever'),
            'img_142': ('Others', 'whatever'),
            'img_15': ('Others', 'more whatever'),
            'img_16': ('LP', 'SoMe TeXt')},
           {'img_18': ('FP', 'whatever'),
            'img_19': ('Others', 'whatever'),
            'img_20': ('LP', 'whateverrr')}],
 'doc_2': [{'img_1': ('FP', 'text')},
           {'img_2': ('FP', 'more text'), 'img_3': ('LP', 'more more text')},
           {'img_6': ('FP', 'more more text'),
            'img_7': ('Others', 'lot of text'),
            'img_8': ('LP', 'still more text')},
           {'img_69': ('FP', 'more more text')}]}
1
kritserv On

this is my solution which is pretty long:

# For every document, print the pages from each 'FP' through the next 'LP'.
for doc, pages in ip_dict.items():
    print('\n', doc, '\n')

    # True from an 'FP' page up to and including the next 'LP' page.
    printing = False

    for img, entry in pages.items():
        page_type = entry[0]  # 'FP', 'LP' or another tag
        text = entry[1]       # the page text

        # An 'FP' switches printing on (or leaves it on).
        printing = printing or page_type == 'FP'

        if printing:
            print(img,' :\t', page_type, '/', text)

        # An 'LP' is printed itself and then switches printing off.
        printing = printing and page_type != 'LP'

result:

doc_1 

img_1  :     FP / some long text
img_2  :     LP / another long text
img_5  :     FP / one more text
img_6  :     FP / another one
img_7  :     LP / ANOTHER ONE
img_10  :    FP / more text
img_11  :    Others / whatever
img_12  :    Others / more whatever
img_13  :    LP / SoMe TeXt
img_15  :    FP / whatever
img_16  :    Others / whatever
img_17  :    LP / whateverrr

doc_2 

img_1  :     FP / text
img_2  :     FP / more text
img_3  :     LP / more more text
img_6  :     FP / more more text
img_7  :     Others / lot of text
img_8  :     LP / still more text
0
Debi Prasad On

Try this method. It is a classic use of the flag technique. As noted in the comments, it only works if the items were inserted into the dictionary in page order. As of now, it gives the desired output.


def process(ip_dict):
    """Split each document's pages into FP..LP groups.

    Args:
        ip_dict: Mapping of document id to a mapping of page id to a tuple
            whose first element is the page tag: "FP" (first page), "LP"
            (last page) or anything else. Pages are read in the mapping's
            iteration order, so they must have been inserted in page order.

    Returns:
        A dict mapping each document id to a list of dicts. Every dict is
        one group: an "FP" with the pages up to and including its closing
        "LP"; a lone "FP" when no "LP" follows before the next "FP" or the
        end of the document; or a lone "LP" that has no open "FP" before
        it. Pages outside any group are dropped.
    """
    op_dict = {}
    for doc_key, pages in ip_dict.items():
        groups = []
        # (key, value) pairs gathered since the latest unclosed "FP";
        # empty whenever no range is open.
        open_pages = []
        for page_key, page in pages.items():
            tag = page[0]
            if tag == "FP":
                if open_pages:
                    # Previous "FP" was never closed: keep only that page.
                    groups.append(dict(open_pages[:1]))
                open_pages = [(page_key, page)]
            elif tag == "LP":
                # Closes the open range; with nothing open, the "LP" ends
                # up as a single-page group of its own.
                open_pages.append((page_key, page))
                groups.append(dict(open_pages))
                open_pages = []
            elif open_pages:
                open_pages.append((page_key, page))
        if open_pages:
            # Document ended with an unclosed "FP".
            groups.append(dict(open_pages[:1]))
        op_dict[doc_key] = groups
    return op_dict

# Print the grouped result that process() builds from `ip_dict`.
print(process(ip_dict))     
2
John Collins On

One possible approach:

# Build op_dict: document id -> list of page groups taken from ip_dict.
# A group is an "FP" page with everything up to and including its closing
# "LP"; a lone "FP" when no "LP" follows before the next "FP" or the end of
# the document; or a lone "LP" that has no open "FP" before it.
op_dict = {}
for doc, imgs in ip_dict.items():
    op_dict[doc] = []
    # Range state is reset for every document so that an unclosed "FP" in
    # one document can never leak into the next one.
    new = {}            # pages gathered since the latest unclosed "FP"
    first_page = False  # True while an "FP" is waiting for its "LP"
    for k, v in imgs.items():
        if v[0] == "FP":
            if first_page:
                # The previous "FP" was never closed: keep only that page.
                start = next(iter(new))
                op_dict[doc].append({start: new[start]})
            new = {k: v}
            first_page = True
        elif v[0] == "LP":
            # Close the open range; `new` is empty when no range is open,
            # so an orphan "LP" becomes a single-page group.
            new[k] = v
            op_dict[doc].append(new)
            new = {}
            first_page = False
        elif first_page:
            new[k] = v
    if first_page:
        # Document ended with an unclosed "FP": emit the "FP" page itself,
        # not whichever page happened to be iterated last.
        start = next(iter(new))
        op_dict[doc].append({start: new[start]})

which gives:

{'doc_1': [{'img_1': ('FP', 'some long text'),
   'img_2': ('LP', 'another long text')},
  {'img_5': ('FP', 'one more text')},
  {'img_6': ('FP', 'another one'), 'img_7': ('LP', 'ANOTHER ONE')},
  {'img_61': ('FP', 'another one'), 'img_71': ('LP', 'ANOTHER ONE')},
  {'img_62': ('FP', 'another one'), 'img_72': ('LP', 'ANOTHER ONE')},
  {'img_54': ('FP', 'one more text')},
  {'img_540': ('FP', 'one more text')},
  {'img_541': ('FP', 'one more text')},
  {'img_13': ('FP', 'more text'),
   'img_14': ('Others', 'whatever'),
   'img_140': ('Others', 'whatever'),
   'img_141': ('Others', 'whatever'),
   'img_142': ('Others', 'whatever'),
   'img_15': ('Others', 'more whatever'),
   'img_16': ('LP', 'SoMe TeXt')},
  {'img_18': ('FP', 'whatever'),
   'img_19': ('Others', 'whatever'),
   'img_20': ('LP', 'whateverrr')}],
 'doc_2': [{'img_1': ('FP', 'text')},
  {'img_2': ('FP', 'more text'), 'img_3': ('LP', 'more more text')},
  {'img_6': ('FP', 'more more text'),
   'img_7': ('Others', 'lot of text'),
   'img_8': ('LP', 'still more text')},
  {'img_69': ('FP', 'more more text')}]}