Backport PR pandas-dev#38649: BUG: Fix regression for groupby.indices in case of unused categories
meeseeksmachine · Dec 29, 2020 · e10a87d · e10a87d
1 parent 5bdee11
commit e10a87d
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 8 deletions.
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -17,7 +17,7 @@ Fixed regressions
 - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
 - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
 - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
--
+- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -582,13 +582,8 @@ def indices(self):
         if isinstance(self.grouper, ops.BaseGrouper):
             return self.grouper.indices
 
-        # Return a dictionary of {group label: [indices belonging to the group label]}
-        # respecting whether sort was specified
-        codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
-        return {
-            category: np.flatnonzero(codes == i)
-            for i, category in enumerate(Index(uniques))
-        }
+        values = Categorical(self.grouper)
+        return values._reverse_indexer()
 
     @property
     def codes(self) -> np.ndarray:

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -53,6 +53,7 @@
     is_timedelta64_dtype,
     needs_i8_conversion,
 )
+from pandas.core.dtypes.generic import ABCCategoricalIndex
 from pandas.core.dtypes.missing import isna, maybe_fill
 
 import pandas.core.algorithms as algorithms
@@ -244,6 +245,11 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
     @cache_readonly
     def indices(self):
         """ dict {group name -> group indices} """
+        if len(self.groupings) == 1 and isinstance(
+            self.result_index, ABCCategoricalIndex
+        ):
+            # This shows unused categories in indices GH#38642
+            return self.groupings[0].indices
         codes_list = [ping.codes for ping in self.groupings]
         keys = [ping.group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
     df_grp = df.groupby(["a", "b"], observed=observed)
     result = getattr(df_grp, func)()
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_categorical_indices_unused_categories():
+    # GH#38642
+    df = DataFrame(
+        {
+            "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
+            "col": range(3),
+        }
+    )
+    grouped = df.groupby("key", sort=False)
+    result = grouped.indices
+    expected = {
+        "b": np.array([0, 1], dtype="int64"),
+        "a": np.array([2], dtype="int64"),
+        "c": np.array([], dtype="int64"),
+    }
+    assert result.keys() == expected.keys()
+    for key in result.keys():
+        tm.assert_numpy_array_equal(result[key], expected[key])