Pandas dataframe grouped aggregation on multiple columns with user defined function

201 views Asked by At

I'm trying to use grouped aggregation on multiple columns with a user-defined function on a data frame. The data frame would look like this. I want to do some aggregation on [volume] and [logRet] group by [monthEndDate] and [stock]

The data frame would look like this

+---------------------------------------------------+
| date          monthEndDate  stock  volume  logRet |
+---------------------------------------------------+
| 1990-01-01    1990-01-31    A      1       NA     |
| 1990-01-02    1990-01-31    A      2       0.2    |
| 1990-02-01    1990-02-28    A      3       0.3    |
| 1990-02-02    1990-02-28    A      4       0.4    |
| ...           ...                                 |
| 1990-01-01    1990-01-31    B      1       NA     |
| 1990-01-02    1990-01-31    B      2       0.08   |
| ...           ...                                 |
| 1990-02-01    1990-02-28    B      0       0.3    |
| 1990-02-02    1990-02-28    B      3       0.4    |
| ...           ...                                 |
+---------------------------------------------------+

I tried the following method:

def check_input(fn):
    def wrapper(_s, *args, **kwargs):
        if not isinstance(_s, (pd.Series, pd.DataFrame, np.array)):
            raise TypeError()
        return fn(_s, *args, **kwargs)
    wrapper.__name__ = fn.__name__
    return wrapper


@check_input
def varLogRet(_s):
    return pd.Series({'varLogRet': np.var(_s[np.nonzero(_s)])})


@check_input
def TotRet1M(_s):
    return pd.Series({'TotRet1M': np.exp(np.sum(_s))})


@check_input
def avgVolume(_s):
    return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})

return_m = price_d.groupby(['monthEndDate', 'tradingItemId']).agg({'logRet': [varLogRet, TotRet1M],
                                                                       'volume': avgVolume})

However, it failed with a ValueError:

Traceback (most recent call last):
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 663, in agg_series
    return self._aggregate_series_fast(obj, func)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 681, in _aggregate_series_fast
    result, counts = grouper.get_result()
  File "pandas/_libs/reduction.pyx", line 429, in pandas._libs.reduction.SeriesGrouper.get_result
  File "pandas/_libs/reduction.pyx", line 413, in pandas._libs.reduction.SeriesGrouper.get_result
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 894, in <lambda>
    f = lambda x: func(x, *args, **kwargs)
  File "<ipython-input-15-37583dd006b9>", line 5, in wrapper
    return fn(_s, *args, **kwargs)
  File "<ipython-input-15-37583dd006b9>", line 22, in avgVolume
    return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1113, in __getitem__
    return self._get_with(key)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1127, in _get_with
    return self._get_values_tuple(key)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1172, in _get_values_tuple
    raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-278a006fcb71>", line 1, in <module>
    return_m = price_d.groupby(['monthEndDate', 'tradingItemId']).agg({'volume': avgVolume})
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 1455, in aggregate
    return super().aggregate(arg, *args, **kwargs)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 229, in aggregate
    result, how = self._aggregate(func, _level=_level, *args, **kwargs)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 506, in _aggregate
    result = _agg(arg, _agg_1dim)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 456, in _agg
    result[fname] = func(fname, agg_how)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 440, in _agg_1dim
    return colg.aggregate(how, _level=(_level or 0) + 1)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 860, in aggregate
    return self._python_agg_general(func_or_funcs, *args, **kwargs)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 900, in _python_agg_general
    result, counts = self.grouper.agg_series(obj, f)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 665, in agg_series
    return self._aggregate_series_pure_python(obj, func)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 694, in _aggregate_series_pure_python
    res = func(group)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 894, in <lambda>
    f = lambda x: func(x, *args, **kwargs)
  File "<ipython-input-15-37583dd006b9>", line 5, in wrapper
    return fn(_s, *args, **kwargs)
  File "<ipython-input-15-37583dd006b9>", line 22, in avgVolume
    return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1113, in __getitem__
    return self._get_with(key)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1127, in _get_with
    return self._get_values_tuple(key)
  File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1172, in _get_values_tuple
    raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
1

There are 1 answers

0
Quan Nguyen On

Adding .iloc fixed the issue for me. The reason is because nonzero() function returns the index of the array in which you need to use iloc for selection.

def varLogRet(_s):
    return pd.Series({'varLogRet': np.var(_s.iloc[_s.to_numpy().nonzero()])})

def TotRet1M(_s):
    return pd.Series({'TotRet1M': np.exp(np.sum(_s))})

def avgVolume(_s):
    return pd.Series({'avgVolume': np.mean(_s.iloc[_s.to_numpy().nonzero()])})

temp.\
    groupby(['monthEndDate', 'stock']).\
    agg({'logRet': [varLogRet, TotRet1M], 'volume': avgVolume})

Output

                            logRet  volume
                varLogRet   TotRet1M    avgVolume
monthEndDate    stock           
1990-01-31  A   0.0000      1.221403    1.5
1990-02-28  B   0.0025      2.013753    3.5
...

Also, I use _s.to_numpy().nonzero() instead of np.nonzero(_s) just because I am using a different version to you. It is recommended to use my version from pandas 0.24.0 - https://pandas.pydata.org/pandas-docs/version/0.25.3/reference/api/pandas.Series.nonzero.html#pandas-series-nonzero.