Parent Child Path hierarchy in python using pandas

69 views Asked by At

I'm trying to get child parent hierarchy in new pandas column. Currently my code looks like this, but it search for all parents, but it should search for the one in corresponding row.

Here is the example how it should work.

Base table (input):

CHILD_ID PARENT_ID
ITEM_1 ITEM_A
ITEM_2 ITEM_B
ITEM_A ITEM_X
ITEM_A ITEM_Y
ITEM_B ITEM_Z

First iteration

CHILD_ID PARENT_ID Hierarchy
ITEM_1 ITEM_A ITEM_1, ITEM_A
ITEM_2 ITEM_B ITEM_2, ITEM_B
ITEM_A ITEM_X ITEM_A, ITEM_X
ITEM_A ITEM_Y ITEM_A, ITEM_Y
ITEM_B ITEM_Z ITEM_B, ITEM_Z

Second iteration (in this case desired output as there is no more parents)

CHILD_ID PARENT_ID Hierarchy
ITEM_1 ITEM_A ITEM_1, ITEM_A, ITEM_X
ITEM_1 ITEM_A ITEM_1, ITEM_A, ITEM_Y
ITEM_2 ITEM_B ITEM_2, ITEM_B, ITEM_Z
ITEM_A ITEM_X ITEM_A, ITEM_X
ITEM_A ITEM_Y ITEM_A, ITEM_Y
ITEM_B ITEM_Z ITEM_B, ITEM_Z

I'm struggling to achieve this result, here's my code:

def get_parents(child_id):
    list_of_parents = []

    def dfs(child_id, parents_list):
        parent_ids = df[df["child_id"] == child_id]["parent_id"].values
        if len(parent_ids) == 0:
            return
        for parent_id in parent_ids:
            if parent_id not in parents_list:
                parents_list.append(parent_id)
                dfs(parent_id, parents_list)

    dfs(child_id, list_of_parents)
    return list_of_parents

df["parent_hierarchy"] = df["child_id"].apply(get_parents)

example problem:

When trying to run the code on whole dataframe this error ocurrs: NetworkXNoPath: No path between 4614837_12_13200 and 4975995_5_13200.

After filtering dataframe to only those ID's in both sides, code worked but still doesnt work for the input dataframe

parent_id child_id
4975995_5_13200 4789551_5_13200
5003373_3_13200 4614837_12_13200
4975995_5_13200 4602153_19_13200
4975995_5_13200 4789551_7_13200
4974894_5_13200 4614837_12_13200
4975995_5_13200 4789551_10_13200
4975995_5_13200 4789551_3_13200
4975995_5_13200 4789551_6_13200
4975995_5_13200 4789551_2_13200

SQL Code that was used in draft but the will be finally in Python.

SELECT
CONCAT_WS(
  "|", 
  concat(
    bl1.parent_item_id 
  ), 
  IF(
    bl2.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl2.parent_item_id 
    )
  ), 
  IF(
    bl3.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl3.parent_item_id 
    )
  ), 
  IF(
    bl4.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl4.parent_item_id 
    )
  ), 
  IF(
    bl5.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl5.parent_item_id 
    )
  ), 
  IF(
    bl6.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl6.parent_item_id 
    )
  ), 
  IF(
    bl7.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl7.parent_item_id 
    )
  ), 
  IF(
    bl8.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl8.parent_item_id 
    )
  ), 
  IF(
    bl9.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl9.parent_item_id 
    )
  ), 
  IF(
    bl10.parent_item_id IS NULL, 
    NULL, 
    concat(
      bl10.parent_item_id 
    )
  )
) AS PATH -- joins to find 10 levels of items
FROM 
  baseln AS bl1 
  LEFT JOIN baseln AS bl2 ON bl1.parent_item_id = bl2.child_item_id 
  AND bl1.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl3 ON bl2.parent_item_id = bl3.child_item_id 
  AND bl2.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl4 ON bl2.parent_item_id = bl4.child_item_id 
  AND bl3.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl5 ON bl2.parent_item_id = bl5.child_item_id 
  AND bl4.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl6 ON bl2.parent_item_id = bl6.child_item_id 
  AND bl5.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl7 ON bl2.parent_item_id = bl7.child_item_id 
  AND bl6.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl8 ON bl2.parent_item_id = bl8.child_item_id 
  AND bl7.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl9 ON bl2.parent_item_id = bl9.child_item_id 
  AND bl8.child_item_id IS NOT NULL 
  LEFT JOIN baseln AS bl10 ON bl2.parent_item_id = bl10.child_item_id 
  AND bl9.child_item_id IS NOT NULL 

NetworkX does throw an error with below dataframe

{'parent_id':['4974894_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','5003373_3_13200','4975995_5_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','4975995_5_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','4975995_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','4975995_5_13200','5003373_3_13200','4974894_5_13200','5003373_3_13200','4974894_5_13200','4975995_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','4975995_5_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200','4974894_5_13200','4975995_5_13200','5003373_3_13200','4974894_5_13200','4974894_5_13200','5003373_3_13200','5003373_3_13200','5003373_3_13200'],'child_id':['4602363_15_13200','4613145_13_13200','4602528_21_13200','4613145_11_13200','4614837_8_13200','4613145_11_13200','4613370_11_13200','4595322_17_13200','4602576_18_13200','4595310_14_13200','4595310_14_13200','4602528_23_13200','4602528_22_13200','4595322_16_13200','4595310_13_13200','4602384_16_13200','4602153_18_13200','4602384_17_13200','4602576_17_13200','4602528_18_13200','4789551_5_13200','4613046_18_13200','4602576_17_13200','4595310_9_13200','4602363_10_13200','4595322_16_13200','4602528_21_13200','4602171_17_13200','4602153_18_13200','4602153_12_13200','4595322_17_13200','4602576_12_13200','4613145_9_13200','4602576_11_13200','4602363_15_13200','4602171_12_13200','4602528_15_13200','4613370_15_13200','4614837_12_13200','4613370_14_13200','4602363_16_13200','4595322_11_13200','4595322_10_13200','4613370_15_13200','4602153_19_13200','4614837_11_13200','4613046_13_13200','4602384_10_13200','4602363_16_13200','4613370_14_13200','4789551_7_13200','4602171_10_13200','4595322_13_13200','4613046_18_13200','4602363_14_13200','4614837_12_13200','4602384_17_13200','4602576_16_13200','4613370_16_13200','4602528_18_13200','4602528_22_13200','4613145_14_13200','4602384_16_13200','4789551_10_13200','4602528_23_13200','4613046_17_13200','4602153_19_13200','4602363_14_13200','4789551_3_13200','4595310_13_13200','4602153_19_13200','4613370_16_13200','4602363_9_13200','4789551_6_13200','4602576_18_13200','4613145_14_13200','4595322_13_13200','4602384_13_13200','4602576_16_13200','4789551_2_13200','4614837_11_13200','4613145_13_13200','4602171_17_13200','4613046_17_13200','4614837_7_13200','4602153_11_13200']}
2

There are 2 answers

10
mozway On BEST ANSWER

IIUC, use networkx to create the hierarchy:

import networkx as nx

# create graph
G = nx.from_pandas_edgelist(df, create_using=nx.DiGraph,
                            source='CHILD_ID', target='PARENT_ID')

out = {}
# for each subgraph, form pairs of nodes -> leaf
# find the path
for c in nx.weakly_connected_components(G):
    H = G.subgraph(c)
    leaves = {n for n, d in H.out_degree() if d==0}
    for n in c-leaves:
        for l in leaves:
            try:
                out.setdefault(n, []).append(nx.shortest_path(H, n, l))
            except nx.NetworkXNoPath:
                pass

# merge to unique CHILD_ID
# recreate PARENT_ID
# reorder based on original DataFrame
out = df.merge(df[['CHILD_ID']].drop_duplicates()
                 .merge(pd.Series(out, name='Hierarchy').explode(),
                        left_on='CHILD_ID', right_index=True)
                 .assign(PARENT_ID=lambda d: d['Hierarchy'].str[1]),
               how='left')

Output:

  CHILD_ID PARENT_ID                 Hierarchy
0   ITEM_1    ITEM_A  [ITEM_1, ITEM_A, ITEM_X]
1   ITEM_1    ITEM_A  [ITEM_1, ITEM_A, ITEM_Y]
2   ITEM_2    ITEM_B  [ITEM_2, ITEM_B, ITEM_Z]
3   ITEM_A    ITEM_X          [ITEM_A, ITEM_X]
4   ITEM_A    ITEM_Y          [ITEM_A, ITEM_Y]
5   ITEM_B    ITEM_Z          [ITEM_B, ITEM_Z]

Graph:

enter image description here

2
manas pandya On

usw a combination of dataframe operations and a recursive function. here's a new code, you should be able to implement this after you load the data

def get_hierarchy(child_id, hierarchy=[]):
    hierarchy.append(child_id)
    parent_id = df.loc[df["CHILD_ID"] == child_id, "PARENT_ID"].values
    if len(parent_id) > 0:
        get_hierarchy(parent_id[0], hierarchy)
    return hierarchy

df["Hierarchy"] = df["CHILD_ID"].apply(get_hierarchy)