Skip to content

Commit

Permalink
Backport PR pandas-dev#38649: BUG: Fix regression for groupby.indices…
Browse files Browse the repository at this point in the history
… in case of unused categories
  • Loading branch information
phofl authored and meeseeksmachine committed Dec 29, 2020
1 parent 5bdee11 commit e10a87d
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 8 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Fixed regressions
- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
- :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
-
- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)

.. ---------------------------------------------------------------------------
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,13 +582,8 @@ def indices(self):
if isinstance(self.grouper, ops.BaseGrouper):
return self.grouper.indices

# Return a dictionary of {group label: [indices belonging to the group label]}
# respecting whether sort was specified
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
return {
category: np.flatnonzero(codes == i)
for i, category in enumerate(Index(uniques))
}
values = Categorical(self.grouper)
return values._reverse_indexer()

@property
def codes(self) -> np.ndarray:
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import ABCCategoricalIndex
from pandas.core.dtypes.missing import isna, maybe_fill

import pandas.core.algorithms as algorithms
Expand Down Expand Up @@ -244,6 +245,11 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
@cache_readonly
def indices(self):
    """
    Return a dict mapping each group name to that group's indices.
    """
    lone_categorical = len(self.groupings) == 1 and isinstance(
        self.result_index, ABCCategoricalIndex
    )
    if lone_categorical:
        # GH#38642: defer to the single grouping's own ``indices`` so that
        # unused categories are shown in the result as well.
        return self.groupings[0].indices

    # General case: build the mapping from every grouping's codes and
    # group index.
    codes_list = [ping.codes for ping in self.groupings]
    keys = [ping.group_index for ping in self.groupings]
    return get_indexer_dict(codes_list, keys)
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
df_grp = df.groupby(["a", "b"], observed=observed)
result = getattr(df_grp, func)()
tm.assert_frame_equal(result, expected)


def test_groupby_categorical_indices_unused_categories():
    """
    ``grouped.indices`` must contain an entry for every category of a
    categorical grouping key, with an empty index array for categories
    that never occur in the data (GH#38642).
    """
    # GH#38642
    df = DataFrame(
        {
            # "c" is declared as a category but never appears in the data.
            "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
            "col": range(3),
        }
    )
    # Ask for unobserved categories explicitly rather than relying on the
    # default of ``observed``: the unused category "c" can only show up in
    # the result when unobserved categories are kept.
    grouped = df.groupby("key", sort=False, observed=False)
    result = grouped.indices
    expected = {
        "b": np.array([0, 1], dtype="int64"),
        "a": np.array([2], dtype="int64"),
        "c": np.array([], dtype="int64"),
    }
    # Comparing the key views checks that the same labels are present on
    # both sides; the loop then checks the positions stored for each label.
    assert result.keys() == expected.keys()
    for key in result.keys():
        tm.assert_numpy_array_equal(result[key], expected[key])

0 comments on commit e10a87d

Please sign in to comment.