merge dataframe but do not sort by merge key

64 views Asked by At

I wish to merge df1 and df2 to df3 and keep order. my demo code add sort=False but it not work as expected.

import pandas as pd

data1 = [
    ['4A', 1],
    ['3B', 2],
    ['2C', 3],
    ['1D', 4],
    ]
data2 = [
    ['2C', 9],
    ['4A', 3],
    ['6F', 2],
    ['5G', 1],
    ]

df1 = pd.DataFrame(data1, columns=['name', 'value'])
df2 = pd.DataFrame(data2, columns=['name', 'value'])

df3 = pd.merge(df1, df2, how='outer', on='name', sort=False)
df3 = df3.rename({'value_x': 'v1', 'value_y': 'y2'}, axis=1)
print(df1)
print(df2)
print(df3)

Output:

  name  value
0   4A      1
1   3B      2
2   2C      3
3   1D      4
  name  value
0   2C      9
1   4A      3
2   6F      2
3   5G      1
  name   v1   y2
0   1D  4.0  NaN
1   2C  3.0  9.0
2   3B  2.0  NaN
3   4A  1.0  3.0
4   5G  NaN  1.0
5   6F  NaN  2.0

Expected output:

  name   v1   y2
0   4A  1.0  3.0
1   3B  2.0  NaN
2   2C  3.0  9.0
3   1D  4.0  NaN
4   6F  NaN  2.0
5   5G  NaN  1.0
1

There are 1 answers

2
mozway On BEST ANSWER

Assuming sorted range indexes, reset_index before merge, then sort_values:

df3 = (pd.merge(df1.reset_index(), df2.reset_index(),
                how='outer', on='name', sort=False)
         .sort_values(by=['index_x', 'index_y'],
                      ignore_index=True)
         .drop(columns=['index_x', 'index_y'])
         .rename({'value_x': 'v1','value_y':'y2'}, axis=1)
      )

For a more generic case:

df3 = (pd.merge(df1.assign(index=np.arange(len(df1))),
                df2.assign(index=np.arange(len(df2))),
                how='outer', on='name', sort=False)
         .sort_values(by=['index_x', 'index_y'],
                      ignore_index=True)
         .drop(columns=['index_x', 'index_y'])
         .rename({'value_x': 'v1','value_y':'y2'}, axis=1)
      )

Alternatively, use the union of df1['name']/df2['name'] to reindex:

idx = pd.concat([df1['name'], df2['name']]).drop_duplicates()

df3 = (pd.merge(df1, df2, how='outer', on='name', sort=False)
         .set_index('name').reindex(idx).reset_index()
         .rename({'value_x': 'v1','value_y':'y2'}, axis=1)
      )

Output:

  name   v1   y2
0   4A  1.0  3.0
1   3B  2.0  NaN
2   2C  3.0  9.0
3   1D  4.0  NaN
4   6F  NaN  2.0
5   5G  NaN  1.0