From b8d86e2fd5170a5b82a0519587b489ee05e89ccc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 1 Apr 2018 10:28:46 -0400 Subject: [PATCH] BUG: groupby with categorical and other columns closes #14942 --- doc/source/whatsnew/v0.23.0.txt | 51 +++++ pandas/core/groupby.py | 77 ++----- pandas/tests/groupby/test_categorical.py | 244 +++++++++++------------ 3 files changed, 187 insertions(+), 185 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 09bd09b06d9b93..87563a092e1ab3 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -479,6 +479,57 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use 'Taxes': -200, 'Net result': 300}).sort_index() +.. _whatsnew_0230.api_breaking.categorical_grouping: + +Categorical Grouping no longer expands to all possible groupers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, grouping by one or more categorical columns would result in an index that was the Cartesian product of all of the categories for each grouper, not just the observed values. This is inconsistent with the output for other dtypes, can potentially cast to different dtypes (as missing values are introduced), and could cause a huge frame to be generated. Pandas will now return only the observed values, regardless of whether you are grouping on a categorical column; note that the categorical dtype is *still* preserved, so you will still have categorical columns (:issue:`14942`). + + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 + df + +Previous Behavior: + +.. 
code-block:: python + + In [4]: df.groupby(['A', 'B', 'C']).count() + Out[4]: + values + A B C + a c bar NaN + foo 1.0 + d bar 1.0 + foo NaN + y bar NaN + foo NaN + b c bar NaN + foo 1.0 + d bar 1.0 + foo NaN + y bar NaN + foo NaN + z c bar NaN + foo NaN + d bar NaN + foo NaN + y bar NaN + foo NaN + +New Behavior: + +.. ipython:: python + + df.groupby(['A', 'B', 'C']).count() + .. _whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7c89cab6b1428b..dc4871442e830e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2336,10 +2336,13 @@ def result_index(self): if not self.compressed and len(self.groupings) == 1: return self.groupings[0].group_index.rename(self.names[0]) - return MultiIndex(levels=[ping.group_index for ping in self.groupings], - labels=self.recons_labels, - verify_integrity=False, - names=self.names) + labels = self.recons_labels + levels = [ping.group_index for ping in self.groupings] + result = MultiIndex(levels=levels, + labels=labels, + verify_integrity=False, + names=self.names) + return result.remove_unused_levels() def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: @@ -4151,7 +4154,7 @@ def first_not_none(values): not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: - key_index = MultiIndex.from_tuples(keys, names=key_names) + key_index = self.grouper.result_index else: ping = self.grouper.groupings[0] @@ -4241,8 +4244,9 @@ def first_not_none(values): # normally use vstack as its faster than concat # and if we have mi-columns - if isinstance(v.index, - MultiIndex) or key_index is None: + if (isinstance(v.index, MultiIndex) or + key_index is None or + isinstance(key_index, MultiIndex)): stacked_values = np.vstack(map(np.asarray, values)) result = DataFrame(stacked_values, index=key_index, columns=index) @@ -4280,7 +4284,7 @@ def first_not_none(values): else: result = 
result._convert(datetime=True) - return self._reindex_output(result) + return result # values are not series or array-like but scalars else: @@ -4661,7 +4665,7 @@ def _wrap_aggregated_output(self, output, names=None): if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return result._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): return DataFrame(output, index=self.obj.index) @@ -4682,60 +4686,7 @@ def _wrap_agged_blocks(self, items, blocks): if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) - - def _reindex_output(self, result): - """ - if we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. may have all been - nan groups) - - This can re-expand the output space - """ - groupings = self.grouper.groupings - if groupings is None: - return result - elif len(groupings) == 1: - return result - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): - return result - - levels_list = [ping.group_index for ping in groupings] - index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() - - if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. 
- - # Select in-axis groupers - in_axis_grps = [(i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis] - g_nums, g_names = zip(*in_axis_grps) - - result = result.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index - ).reindex(index, copy=False) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) - - return result.reset_index(drop=True) + return result._convert(datetime=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index bcd0da28b5a340..b9a8a1db448ecd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -5,11 +5,9 @@ import pytest import numpy as np -from numpy import nan - import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series, Interval) + DataFrame, Categorical, Series) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm from .common import MixIn @@ -28,11 +26,11 @@ def test_groupby(self): result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A") @@ -41,33 +39,6 @@ def test_groupby(self): result = gb.sum() 
tm.assert_frame_equal(result, expected) - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], @@ -186,37 +157,45 @@ def get_stats(group): result = self.df.groupby(cats).D.apply(get_stats) assert result.index.names[0] == 'C' - def test_apply_categorical_data(self): + @pytest.mark.parametrize('ordered', [True, False]) + def test_apply_categorical_data(self, ordered): # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - 
assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) + + dense = Categorical(list('abc'), ordered=ordered) + + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_arrays( + [missing, dense], names=['missing', 'dense']) + expected = DataFrame([0, 1, 2.], + index=idx, + columns=['values']) + + result = grouped.apply(lambda x: np.mean(x)) + assert_frame_equal(result, expected) + + # we coerce back to ints + expected = expected.astype('int64') + result = grouped.mean() + assert_frame_equal(result, expected) + + result = grouped.agg(np.mean) + assert_frame_equal(result, expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_arrays([missing, dense], + names=['missing', 'dense']) + expected = Series(1, index=idx) + result = grouped.apply(lambda x: 1) + assert_series_equal(result, expected) def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] @@ -258,6 +237,51 @@ def test_groupby_categorical(self): tm.assert_index_equal((desc_result.stack().index .get_level_values(1)), exp) + def test_cats_with_multiple_columns(self): + # multiple groupers + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": 
cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 + + # multiple groupers with a non-cat + gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2, ['foo', 'bar'] * 2], + names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + [1, 2, 3, 4], index=exp_index)}).sort_index() + result = gb.sum() + tm.assert_frame_equal(result, expected) + + gb = df.groupby(['A', 'B']) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, 3, 4]}, + index=exp_index) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # gh-14942 + # don't re-expand the output space + # of the grouper + + df = DataFrame({ + 'cat': np.random.randint(0, 255, size=30000), + 'int_id': np.random.randint(0, 255, size=30000), + 'other_id': np.random.randint(0, 10000, size=30000), + 'foo': 0}) + df['cat'] = df.cat.astype(str).astype('category') + + grouped = df.groupby(['cat', 'int_id', 'other_id']) + result = grouped.count() + assert result.index.levels[0].nunique() == df.cat.nunique() + assert result.index.levels[1].nunique() == df.int_id.nunique() + assert result.index.levels[2].nunique() == df.other_id.nunique() + def test_groupby_datetime_categorical(self): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) @@ -373,28 +397,26 @@ def test_groupby_multi_categorical_as_index(self): 'A': [10, 11, 11], 'B': [101, 102, 103]}) result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # function grouper f = lambda r: df.loc[r, 'A'] result = df.groupby(['cat', f], 
as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 22], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # another not in-axis grouper s = Series(['a', 'b', 'b'], name='cat2') result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # GH18872: conflicting names in desired index @@ -402,12 +424,12 @@ def test_groupby_multi_categorical_as_index(self): s.rename('cat')]).sum()) # is original index dropped? - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - group_columns = ['cat', 'A'] + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) for name in [None, 'X', 'B', 'cat']: df.index = Index(list("abc"), name=name) @@ -420,7 +442,7 @@ def test_groupby_multi_categorical_as_index(self): else: result = df.groupby(group_columns, as_index=False).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) + tm.assert_frame_equal(result, expected) def test_groupby_preserve_categories(self): # GH-13179 @@ -472,23 +494,6 @@ def test_groupby_preserve_categorical_dtype(self): tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), 
- 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) @@ -619,10 +624,10 @@ def test_groupby_categorical_two_columns(self): ordered=True), 'ints': [1, 1, 2, 2], 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) + df = pd.DataFrame(d) # Grouping on a single column - groups_single_key = test.groupby("cat") + groups_single_key = df.groupby("cat") res = groups_single_key.agg('mean') exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", @@ -632,39 +637,34 @@ def test_groupby_categorical_two_columns(self): tm.assert_frame_equal(res, exp) # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) + groups_double_key = df.groupby(["cat", "ints"]) res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) + exp = DataFrame( + {"val": [10, 30, 20, 40], + "cat": pd.Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), + "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) tm.assert_frame_equal(res, exp) # GH 10132 for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: c, i = key result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] + expected = df[(df.cat == c) & (df.ints == i)] assert_frame_equal(result, expected) d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) + df = pd.DataFrame(d) + values = pd.cut(df['C1'], [1, 2, 3, 
6]) values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) + groups_double_key = df.groupby([values, 'C2']) res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical([Interval(1, 2), Interval(2, 3), - Interval(3, 6)], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [3, 3, 4, 5], + "C3": [10, 100, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) def test_empty_sum(self):