cryptic DataFrame.agg error when using dictionaries #14421

myyc · 2016-10-13T23:23:18Z

Not sure if this is a bug. This works:

(
    pd.DataFrame({"u": [2,1,4,2,5], "a": ["a", "a", "b", "a", "b"]})
    .groupby("a")
    .agg(lambda x: np.mean(x)/np.std(x))
)

while this returns an error:

(
    pd.DataFrame({"u": [2,1,4,2,5], "a": ["a", "a", "b", "a", "b"]})
    .groupby("a")
    .agg({"blah": lambda x: np.mean(x)/np.std(x)})
)

error: KeyError: 'blah'

## LONG ERROR MESSAGE

KeyError                                  Traceback (most recent call last)
/opt/local/lib/python3.6/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2103             try:
-> 2104                 return self._engine.get_loc(key)
   2105             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)()

KeyError: 'blah'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-100-4f93bf630ec4> in <module>()
      2     pd.DataFrame({"u": [2,1,4,2,5], "a": ["a", "a", "b", "a", "b"]})
      3     .groupby("a")
----> 4     .agg({"blah": lambda x: np.mean(x)/np.std(x)})
      5 )

/opt/local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   3697     @Appender(SelectionMixin._agg_doc)
   3698     def aggregate(self, arg, *args, **kwargs):
-> 3699         return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
   3700 
   3701     agg = aggregate

/opt/local/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   3195 
   3196         _level = kwargs.pop('_level', None)
-> 3197         result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
   3198         if how is None:
   3199             return result

/opt/local/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    547 
    548                 try:
--> 549                     result = _agg(arg, _agg_1dim)
    550                 except SpecificationError:
    551 

/opt/local/lib/python3.6/site-packages/pandas/core/base.py in _agg(arg, func)
    498                 result = compat.OrderedDict()
    499                 for fname, agg_how in compat.iteritems(arg):
--> 500                     result[fname] = func(fname, agg_how)
    501                 return result
    502 

/opt/local/lib/python3.6/site-packages/pandas/core/base.py in _agg_1dim(name, how, subset)
    477                 aggregate a 1-dim with how
    478                 """
--> 479                 colg = self._gotitem(name, ndim=1, subset=subset)
    480                 if colg.ndim != 1:
    481                     raise SpecificationError("nested dictionary is ambiguous "

/opt/local/lib/python3.6/site-packages/pandas/core/groupby.py in _gotitem(self, key, ndim, subset)
   3724         elif ndim == 1:
   3725             if subset is None:
-> 3726                 subset = self.obj[key]
   3727             return SeriesGroupBy(subset, selection=key,
   3728                                  grouper=self.grouper)

/opt/local/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2055             return self._getitem_multilevel(key)
   2056         else:
-> 2057             return self._getitem_column(key)
   2058 
   2059     def _getitem_column(self, key):

/opt/local/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2062         # get column
   2063         if self.columns.is_unique:
-> 2064             return self._get_item_cache(key)
   2065 
   2066         # duplicate columns & possible reduce dimensionality

/opt/local/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/opt/local/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3518 
   3519             if not isnull(item):
-> 3520                 loc = self.items.get_loc(item)
   3521             else:
   3522                 indexer = np.arange(len(self.items))[isnull(self.items)]

/opt/local/lib/python3.6/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2104                 return self._engine.get_loc(key)
   2105             except KeyError:
-> 2106                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2107 
   2108         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4160)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)()

KeyError: 'blah'


<details>

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.0.alpha.3
python-bits: 64
OS: Linux
OS-release: 3.14.32-xxxx-grs-ipv6-64
machine: x86_64
processor: 
byteorder: little
LC_ALL: None
LANG: en_IE.UTF-8
LOCALE: en_IE.UTF-8

pandas: 0.19.0
nose: None
pip: 8.1.2
setuptools: 28.3.0
Cython: 0.24.1
numpy: 1.11.2
scipy: 0.18.1
statsmodels: None
xarray: None
IPython: 5.1.0
sphinx: None
patsy: None
dateutil: 2.5.3
pytz: 2016.7
blosc: None
bottleneck: None
tables: None
numexpr: None
matplotlib: 2.0.0b3
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: 3.6.4
bs4: 4.5.1
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 1.0.13
pymysql: None
psycopg2: 2.6.1 (dt dec pq3 ext lo64)
jinja2: 2.8
boto: None
pandas_datareader: None
</details>

The text was updated successfully, but these errors were encountered:

jreback · 2016-10-13T23:25:46Z

pls show the error

note we have no support for 3.6 atm (it may work but not tested at all)

myyc · 2016-10-13T23:26:39Z

Sorry, updated with the error message. I believe Python 3.6.0 is a detail; I encountered this error on 3.5 too.

jreback · 2016-10-13T23:28:09Z

yes 3.6 doesn't matter

how is that cryptic?

you get a KeyError
blah doesn't exist

jreback · 2016-10-13T23:30:19Z

http://pandas.pydata.org/pandas-docs/stable/groupby.html#applying-different-functions-to-dataframe-columns

you have to have a valid column name when aggregating in a dataframe and a dict of name to function

myyc · 2016-10-13T23:32:18Z

So the name of the column in the dict has to be the name of a column in the DataFrame. However if I do things like .agg([np.sum, lambda x: func1(x), func2]) i get three columns, the second of which will have <lambda> as name. Wouldn't it be wise to have (as an option) something that allows the opposite behaviour? As in, passing a dictionary like {"col": lambda x: func1(x)} to obtain a column named col with func1 as its result?

Apologies if there is already an option to do so.

jreback · 2016-10-13T23:35:18Z

a list works because it applies to all columns

while a dict is selective at mapping column to function

pretty straightforward / you are suggesting something even more confusing

myyc · 2016-10-13T23:37:49Z

Sure. Until you try doing this

func1 = lambda x: x.sum()/x.std()
func2 = lambda x: x.mean()/x.std()

(
    pd.DataFrame({"u": [2,1,4,2,5], "v": [3,1,5,2,3], "a": ["a", "a", "b", "a", "b"]})
    .groupby("a")
    .agg([np.sum, np.mean, func1, func2])
)

and you see it throwing this error: SpecificationError: Function names must be unique, found multiple named <lambda>.

I don't think that aggregating by some random function and renaming the aggregated column is that weird ...

myyc · 2016-10-13T23:39:23Z

What I mean is something like this

(
    pd.DataFrame({"u": [2,1,4,2,5], "v": [3,1,5,2,3], "a": ["a", "a", "b", "a", "b"]})
    .groupby("a")
    .agg2({"s": np.sum, "m": np.mean, "f1": func1, "f2": func2})
)

Returning an object with (in this case) a multi-index on the column level, with s, m, f1 and f2 below each column – which would be exactly like agg's behaviour with [np.sum, np.mean] as an argument.

jreback · 2016-10-13T23:41:21Z

Please read the docs: you can do exactly that if you use a Series groupby.

myyc · 2016-10-13T23:41:56Z

I know it works exactly this way with Series. I am not sure why there can't be a similar option for DataFrames?

jreback · 2016-10-13T23:47:50Z

you can pass a nested dictionary

myyc · 2016-10-13T23:50:04Z

Ok so this does exactly what I wanted:

func1 = lambda x: x.sum()/x.std()
func2 = lambda x: x.mean()/x.std()

d = {"s": np.sum, "m": np.mean, "f1": func1, "f2": func2}

(
    pd.DataFrame({"u": [2,1,4,2,5], "v": [3,1,5,2,3], "a": ["a", "a", "b", "a", "b"]})
    .groupby("a")
    .agg({"u": d, "v": d})
)

Would you still think it's completely pointless to have something like .agg2(d) with the same behaviour?

myyc · 2016-10-13T23:50:25Z

Pointless / confusing

jreback · 2016-10-13T23:55:11Z

too confusing to have another function which does exactly the same thing

however the documentation could use an update on the nested dict case with a dataframe
if you would like to do a pr

myyc · 2016-10-13T23:56:53Z

I can update the documentation. I didn't mean to necessarily create another function but to try and non-destructively add this option to agg, as the above example falls to pieces when a DataFrame has more columns than just u and v. Thanks anyway.

jorisvandenbossche · 2016-10-14T15:05:40Z

@myyc I certainly understand the use case, but the problem is basically that it is not possible to support this with the same API, as you wouldn't be able to make the distinction anymore between "do you want to apply this function to the specific column?" and "do you want to give this name to the result of this function?"
We already had some discussion about this before, see e.g. #9052 (specifically my post in that thread, where I give an overview of the different possibilities).

I was just thinking: a possible way to give a list of functions to be applied to all columns and also be able to name the results would be to accept dicts or tuples in this list of functions. Currently you can pass a function or a string (for certain known functions); we could maybe extend this to dicts or tuples like:

df.groupby(..).agg(['mean', np.mean, {'average': np.mean}, ('average', np.mean)])

myyc · 2016-10-14T17:56:44Z

@jorisvandenbossche Yeah I think that would do the trick, although the syntax is a bit weird (e.g. what happens if you pass stuff like [np.mean, {'average': np.mean, 'avg': np.mean}]?).

I understand it's impossible with the current system, but don't you think that my proposal (i.e. hypothetically changing DataFrameGroupBy.agg altogether) would make it consistent with SeriesGroupBy.agg? I don't have usage data to back this change obviously.

jorisvandenbossche · 2016-10-14T19:33:06Z

> the syntax is a bit weird (e.g. what happens if you pass stuff like [np.mean, {'average': np.mean, 'avg': np.mean}]?).

Well yes, that would be a bit strange, as I think we should only accept dicts of length 1 (therefore tuples are maybe more natural, but the dicts are more like how it works for series)

IMO, both usages of the dict are useful (e.g. I myself mostly use the dict to specify different functions for different columns; if you have different data types in different columns that should be aggregated differently, this is very handy. For correct naming I just use a `def` function instead of lambda functions).
And unfortunately, both usages are not compatible with each other. So switching the one usage with the other, does not seem like a clear win to me (breaking code + how do you then specify different functions for different columns?)
It would make it indeed more consistent with the series version, but IMO that's not enough to be worth it.

jreback closed this as completed Oct 13, 2016

jreback added Groupby Usage Question labels Oct 13, 2016

jreback added this to the No action milestone Oct 13, 2016

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cryptic DataFrame.agg error when using dictionaries #14421

cryptic DataFrame.agg error when using dictionaries #14421

myyc commented Oct 13, 2016 •

edited by jorisvandenbossche

Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

jreback commented Oct 13, 2016

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 •

edited

Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

myyc commented Oct 13, 2016 •

edited

Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 •

edited

Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

myyc commented Oct 13, 2016

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 •

edited

Loading

jorisvandenbossche commented Oct 14, 2016

myyc commented Oct 14, 2016

jorisvandenbossche commented Oct 14, 2016

cryptic DataFrame.agg error when using dictionaries #14421

cryptic DataFrame.agg error when using dictionaries #14421

Comments

myyc commented Oct 13, 2016 • edited by jorisvandenbossche Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

jreback commented Oct 13, 2016

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 • edited Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

myyc commented Oct 13, 2016 • edited Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 • edited Loading

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016

myyc commented Oct 13, 2016

jreback commented Oct 13, 2016

myyc commented Oct 13, 2016 • edited Loading

jorisvandenbossche commented Oct 14, 2016

myyc commented Oct 14, 2016

jorisvandenbossche commented Oct 14, 2016

myyc commented Oct 13, 2016 •

edited by jorisvandenbossche

Loading

myyc commented Oct 13, 2016 •

edited

Loading

myyc commented Oct 13, 2016 •

edited

Loading

myyc commented Oct 13, 2016 •

edited

Loading

myyc commented Oct 13, 2016 •

edited

Loading