BUG: groupby with categorical and other columns
jreback committed Apr 2, 2018
1 parent 5edc5c4 commit 5220a20
Showing 3 changed files with 187 additions and 185 deletions.
51 changes: 51 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -479,6 +479,57 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
'Taxes': -200,
'Net result': 300}).sort_index()

.. _whatsnew_0230.api_breaking.categorical_grouping:

Categorical grouping no longer expands to all possible groups
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, grouping by one or more categorical columns would produce an index that was the cartesian product of all of the categories for each grouper, not just the observed values. This was inconsistent with the output for other dtypes, could cast to different dtypes (as missing values were introduced), and could generate a huge frame. Pandas will now return only the observed values when grouping on a categorical column; note that the categorical dtype is *still* preserved, so you will still have categorical columns (:issue:`14942`).


.. ipython:: python

cat1 = pd.Categorical(["a", "a", "b", "b"],
categories=["a", "b", "z"], ordered=True)
cat2 = pd.Categorical(["c", "d", "c", "d"],
categories=["c", "d", "y"], ordered=True)
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df['C'] = ['foo', 'bar'] * 2
df

Previous Behavior:

.. code-block:: python

In [4]: df.groupby(['A', 'B', 'C']).count()
Out[4]:
           values
A B C
a c bar       NaN
    foo       1.0
  d bar       1.0
    foo       NaN
  y bar       NaN
    foo       NaN
b c bar       NaN
    foo       1.0
  d bar       1.0
    foo       NaN
  y bar       NaN
    foo       NaN
z c bar       NaN
    foo       NaN
  d bar       NaN
    foo       NaN
  y bar       NaN
    foo       NaN

New Behavior:

.. ipython:: python

df.groupby(['A', 'B', 'C']).count()
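
For reference, the directive above should render only the four observed
combinations, with integer counts since no missing groups are introduced
(an illustrative sketch of the expected output, not actual build output):

.. code-block:: python

In [5]: df.groupby(['A', 'B', 'C']).count()
Out[5]:
         values
A B C
a c foo       1
  d bar       1
b c foo       1
  d bar       1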

.. _whatsnew_0230.api_breaking.deprecate_panel:

Deprecate Panel
77 changes: 14 additions & 63 deletions pandas/core/groupby/groupby.py
@@ -2336,10 +2336,13 @@ def result_index(self):
         if not self.compressed and len(self.groupings) == 1:
             return self.groupings[0].group_index.rename(self.names[0])
 
-        return MultiIndex(levels=[ping.group_index for ping in self.groupings],
-                          labels=self.recons_labels,
-                          verify_integrity=False,
-                          names=self.names)
+        labels = self.recons_labels
+        levels = [ping.group_index for ping in self.groupings]
+        result = MultiIndex(levels=levels,
+                            labels=labels,
+                            verify_integrity=False,
+                            names=self.names)
+        return result.remove_unused_levels()
 
     def get_group_levels(self):
         if not self.compressed and len(self.groupings) == 1:
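
The substantive change above is the `remove_unused_levels()` call: levels built
from categorical groupers carry *every* category, and trimming the unused ones
is what restricts the result index to the observed groups. A minimal standalone
sketch of that method (using the 0.23-era `labels` keyword, later renamed
`codes`):

    import pandas as pd

    # A MultiIndex whose first level contains a value ('z') that no row uses.
    mi = pd.MultiIndex(levels=[['a', 'b', 'z'], ['c', 'd']],
                       labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                       names=['A', 'B'])
    print(list(mi.levels[0]))                         # ['a', 'b', 'z']

    # remove_unused_levels() drops 'z' and remaps the codes accordingly.
    print(list(mi.remove_unused_levels().levels[0]))  # ['a', 'b']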
@@ -4151,7 +4154,7 @@ def first_not_none(values):
                                             not_indexed_same=not_indexed_same)
             elif self.grouper.groupings is not None:
                 if len(self.grouper.groupings) > 1:
-                    key_index = MultiIndex.from_tuples(keys, names=key_names)
+                    key_index = self.grouper.result_index
 
                 else:
                     ping = self.grouper.groupings[0]
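
Using the grouper's `result_index` here, rather than rebuilding the index from
the key tuples, also preserves dtype: `MultiIndex.from_tuples` materializes
plain object levels, while `result_index` keeps the categorical levels (and,
per the change above, drops the unused ones). A rough illustration assuming
~0.23 semantics (`.grouper` is a pandas-internal attribute):

    import pandas as pd

    df = pd.DataFrame({'A': pd.Categorical(['a', 'b'],
                                           categories=['a', 'b', 'z']),
                       'B': ['c', 'd'],
                       'v': [1, 2]})
    g = df.groupby(['A', 'B'])

    # Rebuilding from the key tuples loses the categorical dtype of 'A' ...
    from_keys = pd.MultiIndex.from_tuples(sorted(g.groups), names=['A', 'B'])
    print(from_keys.levels[0].dtype)               # object

    # ... while the grouper's result_index keeps it.
    print(g.grouper.result_index.levels[0].dtype)  # category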
@@ -4241,8 +4244,9 @@ def first_not_none(values):
 
                     # normally use vstack as its faster than concat
                     # and if we have mi-columns
-                    if isinstance(v.index,
-                                  MultiIndex) or key_index is None:
+                    if (isinstance(v.index, MultiIndex) or
+                            key_index is None or
+                            isinstance(key_index, MultiIndex)):
                         stacked_values = np.vstack(map(np.asarray, values))
                         result = DataFrame(stacked_values, index=key_index,
                                            columns=index)
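
The "vstack" comment above describes the fast path this branch takes: per-group
result Series are stacked into a single 2-D block instead of being concatenated
frame by frame. A standalone sketch of the pattern (the names here are
illustrative, not the internals):

    import numpy as np
    import pandas as pd

    # One result Series per group, all sharing the same column positions.
    values = [pd.Series([1.0, 2.0]), pd.Series([3.0, 4.0])]

    # A single vstack builds the whole block at once, which is cheaper than
    # repeatedly concatenating small frames.
    stacked_values = np.vstack([np.asarray(v) for v in values])
    result = pd.DataFrame(stacked_values,
                          index=['g1', 'g2'],   # one row per group key
                          columns=['x', 'y'])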
@@ -4280,7 +4284,7 @@ def first_not_none(values):
             else:
                 result = result._convert(datetime=True)
 
-            return self._reindex_output(result)
+            return result
 
         # values are not series or array-like but scalars
         else:
@@ -4661,7 +4665,7 @@ def _wrap_aggregated_output(self, output, names=None):
         if self.axis == 1:
             result = result.T
 
-        return self._reindex_output(result)._convert(datetime=True)
+        return result._convert(datetime=True)
 
     def _wrap_transformed_output(self, output, names=None):
         return DataFrame(output, index=self.obj.index)
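
Note that `_wrap_transformed_output` still indexes by `self.obj.index`: a
transform yields one value per input row, aligned to the original frame, so the
observed-versus-expanded question never arises there. A quick sketch of that
contract:

    import pandas as pd

    df = pd.DataFrame({'k': ['a', 'a', 'b'], 'v': [1, 2, 3]})

    # transform output shares the caller's index: one value per original row.
    out = df.groupby('k')['v'].transform('sum')
    assert out.index.equals(df.index)
    print(list(out))  # [3, 3, 3]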
@@ -4682,60 +4686,7 @@ def _wrap_agged_blocks(self, items, blocks):
         if self.axis == 1:
             result = result.T
 
-        return self._reindex_output(result)._convert(datetime=True)
-
-    def _reindex_output(self, result):
-        """
-        if we have categorical groupers, then we want to make sure that
-        we have a fully reindex-output to the levels. These may have not
-        participated in the groupings (e.g. may have all been
-        nan groups)
-
-        This can re-expand the output space
-        """
-        groupings = self.grouper.groupings
-        if groupings is None:
-            return result
-        elif len(groupings) == 1:
-            return result
-        elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
-                     for ping in groupings):
-            return result
-
-        levels_list = [ping.group_index for ping in groupings]
-        index, _ = MultiIndex.from_product(
-            levels_list, names=self.grouper.names).sortlevel()
-
-        if self.as_index:
-            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
-            return result.reindex(**d)
-
-        # GH 13204
-        # Here, the categorical in-axis groupers, which need to be fully
-        # expanded, are columns in `result`. An idea is to do:
-        # result = result.set_index(self.grouper.names)
-        #                .reindex(index).reset_index()
-        # but special care has to be taken because of possible not-in-axis
-        # groupers.
-        # So, we manually select and drop the in-axis grouper columns,
-        # reindex `result`, and then reset the in-axis grouper columns.
-
-        # Select in-axis groupers
-        in_axis_grps = [(i, ping.name) for (i, ping)
-                        in enumerate(groupings) if ping.in_axis]
-        g_nums, g_names = zip(*in_axis_grps)
-
-        result = result.drop(labels=list(g_names), axis=1)
-
-        # Set a temp index and reindex (possibly expanding)
-        result = result.set_index(self.grouper.result_index
-                                  ).reindex(index, copy=False)
-
-        # Reset in-axis grouper columns
-        # (using level numbers `g_nums` because level names may not be unique)
-        result = result.reset_index(level=g_nums)
-
-        return result.reset_index(drop=True)
+        return result._convert(datetime=True)
 
     def _iterate_column_groupbys(self):
         for i, colname in enumerate(self._selected_obj.columns):
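
The deleted `_reindex_output` is where the old cartesian expansion lived: build
the full product of every grouper's levels with `MultiIndex.from_product` and
reindex the aggregated result onto it. A rough standalone equivalent of its
`as_index=True` branch, assuming the post-change (observed-only) groupby
behavior of this commit:

    import pandas as pd

    df = pd.DataFrame({'A': pd.Categorical(['a', 'b'],
                                           categories=['a', 'b', 'z']),
                       'B': ['c', 'd'],
                       'v': [1, 2]})
    res = df.groupby(['A', 'B']).count()   # observed groups only

    # What the removed code did: expand to every category combination,
    # NaN-filling groups that never occurred (upcasting counts to float).
    full, _ = pd.MultiIndex.from_product(
        [df['A'].cat.categories, df['B'].unique()],
        names=['A', 'B']).sortlevel()
    expanded = res.reindex(full)
    print(expanded)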