I'm trying to use grouped aggregation on multiple columns with a user-defined function on a data frame. The data frame would look like this. I want to do some aggregation on [volume] and [logRet] group by [monthEndDate] and [stock]
The data frame would look like this
+---------------------------------------------------+
| date monthEndDate stock volume logRet |
+---------------------------------------------------+
| 1990-01-01 1990-01-31 A 1 NA |
| 1990-01-02 1990-01-31 A 2 0.2 |
| 1990-02-01 1990-02-28 A 3 0.3 |
| 1990-02-02 1990-02-28 A 4 0.4 |
| ... ... |
| 1990-01-01 1990-01-31 B 1 NA |
| 1990-01-02 1990-01-31 B 2 0.08 |
| ... ... |
| 1990-02-01 1990-02-28 B 0 0.3 |
| 1990-02-02 1990-02-28 B 3 0.4 |
| ... ... |
+---------------------------------------------------+
I tried the following method:
def check_input(fn):
def wrapper(_s, *args, **kwargs):
if not isinstance(_s, (pd.Series, pd.DataFrame, np.array)):
raise TypeError()
return fn(_s, *args, **kwargs)
wrapper.__name__ = fn.__name__
return wrapper
@check_input
def varLogRet(_s):
return pd.Series({'varLogRet': np.var(_s[np.nonzero(_s)])})
@check_input
def TotRet1M(_s):
return pd.Series({'TotRet1M': np.exp(np.sum(_s))})
@check_input
def avgVolume(_s):
return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
return_m = price_d.groupby(['monthEndDate', 'tradingItemId']).agg({'logRet': [varLogRet, TotRet1M],
'volume': avgVolume})
However, it failed with a ValueError:
Traceback (most recent call last):
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 663, in agg_series
return self._aggregate_series_fast(obj, func)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 681, in _aggregate_series_fast
result, counts = grouper.get_result()
File "pandas/_libs/reduction.pyx", line 429, in pandas._libs.reduction.SeriesGrouper.get_result
File "pandas/_libs/reduction.pyx", line 413, in pandas._libs.reduction.SeriesGrouper.get_result
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 894, in <lambda>
f = lambda x: func(x, *args, **kwargs)
File "<ipython-input-15-37583dd006b9>", line 5, in wrapper
return fn(_s, *args, **kwargs)
File "<ipython-input-15-37583dd006b9>", line 22, in avgVolume
return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1113, in __getitem__
return self._get_with(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1127, in _get_with
return self._get_values_tuple(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1172, in _get_values_tuple
raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-19-278a006fcb71>", line 1, in <module>
return_m = price_d.groupby(['monthEndDate', 'tradingItemId']).agg({'volume': avgVolume})
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 1455, in aggregate
return super().aggregate(arg, *args, **kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 229, in aggregate
result, how = self._aggregate(func, _level=_level, *args, **kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 506, in _aggregate
result = _agg(arg, _agg_1dim)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 456, in _agg
result[fname] = func(fname, agg_how)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py", line 440, in _agg_1dim
return colg.aggregate(how, _level=(_level or 0) + 1)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py", line 860, in aggregate
return self._python_agg_general(func_or_funcs, *args, **kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 900, in _python_agg_general
result, counts = self.grouper.agg_series(obj, f)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 665, in agg_series
return self._aggregate_series_pure_python(obj, func)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py", line 694, in _aggregate_series_pure_python
res = func(group)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py", line 894, in <lambda>
f = lambda x: func(x, *args, **kwargs)
File "<ipython-input-15-37583dd006b9>", line 5, in wrapper
return fn(_s, *args, **kwargs)
File "<ipython-input-15-37583dd006b9>", line 22, in avgVolume
return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1113, in __getitem__
return self._get_with(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1127, in _get_with
return self._get_values_tuple(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py", line 1172, in _get_values_tuple
raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
Adding
.iloc
fixed the issue for me. The reason is becausenonzero()
function returns the index of the array in which you need to useiloc
for selection.Output
Also, I use
_s.to_numpy().nonzero()
instead ofnp.nonzero(_s)
just because I am using a different version to you. It is recommended to use my version frompandas 0.24.0
- https://pandas.pydata.org/pandas-docs/version/0.25.3/reference/api/pandas.Series.nonzero.html#pandas-series-nonzero.