diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index a756239ee6798..804886fb987ad 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -18,7 +18,7 @@ Fixed regressions - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) -- +- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e3196904fa56f..26fb23087ed55 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -556,13 +556,8 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - # Return a dictionary of {group label: [indices belonging to the group label]} - # respecting whether sort was specified - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) - return { - category: np.flatnonzero(codes == i) - for i, category in enumerate(Index(uniques)) - } + values = Categorical(self.grouper) + return values._reverse_indexer() @property def codes(self) -> np.ndarray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 06d01d46b64f7..3262407a99c0a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -53,6 +53,7 @@ is_timedelta64_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.generic import ABCCategoricalIndex from pandas.core.dtypes.missing import isna, maybe_fill import pandas.core.algorithms as algorithms @@ -241,6 +242,11 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ + if len(self.groupings) == 1 and isinstance( + self.result_index, ABCCategoricalIndex + ): + # This shows unused categories in indices GH#38642 + return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8cf77ca6335f4..f0bc58cbf07bf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( df_grp = df.groupby(["a", "b"], observed=observed) result = getattr(df_grp, func)() tm.assert_frame_equal(result, expected) + + +def test_groupby_categorical_indices_unused_categories(): + # GH#38642 + df = DataFrame( + { + "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]), + "col": range(3), + } + ) + grouped = df.groupby("key", sort=False) + result = grouped.indices + expected = { + "b": np.array([0, 1], dtype="int64"), + "a": np.array([2], dtype="int64"), + "c": np.array([], dtype="int64"), + } + assert result.keys() == expected.keys() + for key in result.keys(): + tm.assert_numpy_array_equal(result[key], expected[key])