BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical

rhshadrach · Sep 24, 2023 · 1325855 · 1325855
1 parent d01669f
commit 1325855
Show file tree

Hide file tree

Showing 9 changed files with 248 additions and 88 deletions.
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
@@ -124,7 +124,7 @@ jobs:
       run: |
         cd asv_bench
         asv machine --yes
-        asv run --quick --dry-run --durations=30 --python=same
+        asv run --quick --dry-run --durations=30 --python=same --show-stderr
 
   build_docker_dev_environment:
     name: Build Docker Dev Environment

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -328,7 +328,7 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
--
+- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`)
 -
 
 Reshaping

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1185,15 +1185,23 @@ def nsmallest(
     def idxmin(
         self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
     ) -> Series:
-        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
-        return result.astype(self.obj.index.dtype) if result.empty else result
+        if axis is not lib.no_default:
+            axis = self.obj._get_axis_number(axis)
+            self._deprecate_axis(axis, "idxmin")
+        else:
+            axis = 0
+        return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)
 
     @doc(Series.idxmax.__doc__)
     def idxmax(
         self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
     ) -> Series:
-        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
-        return result.astype(self.obj.index.dtype) if result.empty else result
+        if axis is not lib.no_default:
+            axis = self.obj._get_axis_number(axis)
+            self._deprecate_axis(axis, "idxmax")
+        else:
+            axis = 0
+        return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)
 
     @doc(Series.corr.__doc__)
     def corr(
@@ -2195,14 +2203,9 @@ def idxmax(
         else:
             axis = self.axis
 
-        def func(df):
-            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
-        func.__name__ = "idxmax"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
+        return self._idxmax_idxmin(
+            "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
         )
-        return result.astype(self.obj.index.dtype) if result.empty else result
 
     def idxmin(
         self,
@@ -2290,14 +2293,9 @@ def idxmin(
         else:
             axis = self.axis
 
-        def func(df):
-            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
-        func.__name__ = "idxmin"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
+        return self._idxmax_idxmin(
+            "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
         )
-        return result.astype(self.obj.index.dtype) if result.empty else result
 
     boxplot = boxplot_frame_groupby
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2015,10 +2015,16 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             with com.temp_setattr(self, "as_index", True):
                 # GH#49834 - result needs groups in the index for
                 # _wrap_transform_fast_result
-                if engine is not None:
-                    kwargs["engine"] = engine
-                    kwargs["engine_kwargs"] = engine_kwargs
-                result = getattr(self, func)(*args, **kwargs)
+                if func in ["idxmin", "idxmax"]:
+                    func = cast(Literal["idxmin", "idxmax"], func)
+                    result = self._idxmax_idxmin(
+                        func, ignore_unobserved=True, axis=self.axis, **kwargs
+                    )
+                else:
+                    if engine is not None:
+                        kwargs["engine"] = engine
+                        kwargs["engine_kwargs"] = engine_kwargs
+                    result = getattr(self, func)(*args, **kwargs)
 
             return self._wrap_transform_fast_result(result)
 
@@ -5720,6 +5726,92 @@ def sample(
         sampled_indices = np.concatenate(sampled_indices)
         return self._selected_obj.take(sampled_indices, axis=self.axis)
 
+    def _idxmax_idxmin(
+        self,
+        how: Literal["idxmax", "idxmin"],
+        axis: Axis = 0,
+        numeric_only: bool = False,
+        skipna: bool = True,
+        ignore_unobserved: bool = False,
+    ):
+        """Compute idxmax/idxmin.
+
+        Parameters
+        ----------
+        how: {"idxmin", "idxmax"}
+            Whether to compute idxmin or idxmax.
+        axis : {{0 or 'index', 1 or 'columns'}}, default None
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+            If axis is not provided, grouper's axis is used.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ignore_unobserved : bool, default False
+            When True and an unobserved group is encountered, do not raise. This used
+            for transform where unobserved groups do not play an impact on the result.
+
+        Returns
+        -------
+        Series or DataFrame
+            idxmax or idxmin for the groupby operation.
+        """
+        if not self.observed and any(
+            ping._passed_categorical for ping in self.grouper.groupings
+        ):
+            expected_len = np.prod(
+                [len(ping.group_index) for ping in self.grouper.groupings]
+            )
+            assert len(self.grouper.result_index) <= expected_len
+            # result_index only contains observed groups
+            has_unobserved = len(self.grouper.result_index) < expected_len
+            raise_err = not ignore_unobserved and has_unobserved
+        else:
+            raise_err = False
+        if raise_err:
+            raise ValueError(
+                f"Can't get {how} of an empty group due to unobserved categories. "
+                "Specify observed=True in groupby instead."
+            )
+
+        try:
+            if self.obj.ndim == 1:
+                result = self._op_via_apply(how, skipna=skipna)
+            else:
+
+                def func(df):
+                    method = getattr(df, how)
+                    return method(axis=axis, skipna=skipna, numeric_only=numeric_only)
+
+                func.__name__ = how
+                result = self._python_apply_general(
+                    func, self._obj_with_exclusions, not_indexed_same=True
+                )
+        except ValueError as err:
+            name = "argmax" if how == "idxmax" else "argmin"
+            if f"attempt to get {name} of an empty sequence" in str(err):
+                raise ValueError(
+                    f"Can't get {how} of an empty group due to unobserved categories. "
+                    "Specify observed=True in groupby instead."
+                )
+            raise
+
+        result = result.astype(self.obj.index.dtype) if result.empty else result
+
+        if not skipna:
+            has_na_value = result.isnull().any(axis=None)
+            if has_na_value:
+                warnings.warn(
+                    f"The behavior of {type(self).__name__}.{how} with all-NA "
+                    "values, or any-NA and skipna=False, is deprecated. In a future "
+                    "version this will raise ValueError",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+
+        return result
+
 
 @doc(GroupBy)
 def get_groupby(

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
         return
 
     agg = getattr(series_groupby, reduction_func)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            agg(*args)
+        return
+
     result = agg(*args)
 
     assert len(result) == expected_length
@@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
 
     series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
     agg = getattr(series_groupby, reduction_func)
+
+    if reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            agg(*args)
+        return
+
     result = agg(*args)
 
     zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
@@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
     df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)
 
     args = get_groupby_method_args(reduction_func, df)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            getattr(df_grp, reduction_func)(*args)
+        return
+
     res = getattr(df_grp, reduction_func)(*args)
 
     expected = _results_for_groupbys_with_missing_categories[reduction_func]
@@ -1883,14 +1910,7 @@ def test_category_order_reducer(
     request, as_index, sort, observed, reduction_func, index_kind, ordered
 ):
     # GH#48749
-    if (
-        reduction_func in ("idxmax", "idxmin")
-        and not observed
-        and index_kind != "multi"
-    ):
-        msg = "GH#10694 - idxmax/min fail with unused categories"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-    elif reduction_func == "corrwith" and not as_index:
+    if reduction_func == "corrwith" and not as_index:
         msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
     elif index_kind != "range" and not as_index:
@@ -1912,6 +1932,15 @@ def test_category_order_reducer(
         df = df.set_index(keys)
     args = get_groupby_method_args(reduction_func, df)
     gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            getattr(gb, reduction_func)(*args)
+        return
+
     op_result = getattr(gb, reduction_func)(*args)
     if as_index:
         result = op_result.index.get_level_values("a").categories
@@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
         gb = gb["b"]
     args = get_groupby_method_args(reduction_func, df)
 
+    if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            gb.agg([reduction_func], *args)
+        return
+
     result = gb.agg([reduction_func], *args)
     expected = getattr(gb, reduction_func)(*args)
 

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1():
             gb2.idxmax(axis=1)
 
 
+@pytest.mark.parametrize(
+    "func, values, expected_values, warn",
+    [
+        ("idxmin", [0, 1, 2], [0, 2], None),
+        ("idxmax", [0, 1, 2], [1, 2], None),
+        ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning),
+        ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning),
+        ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning),
+        ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning),
+    ],
+)
+@pytest.mark.parametrize("test_series", [True, False])
+def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series):
+    # GH#54234
+    df = DataFrame(
+        {
+            "a": [1, 1, 2],
+            "b": values,
+        }
+    )
+    gb = df.groupby("a")
+    index = Index([1, 2], name="a")
+    expected = DataFrame({"b": expected_values}, index=index)
+    if test_series:
+        gb = gb["b"]
+        expected = expected["b"]
+    klass = "Series" if test_series else "DataFrame"
+    msg = f"The behavior of {klass}GroupBy.{func} with all-NA values"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = getattr(gb, func)(skipna=False)
+    tm.assert_equal(result, expected)
+
+
 @pytest.mark.parametrize("numeric_only", [True, False, None])
 def test_axis1_numeric_only(request, groupby_func, numeric_only):
     if groupby_func in ("idxmax", "idxmin"):

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error():
 @pytest.mark.parametrize(
     "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
 )
-def test_empty_groupby(
-    columns, keys, values, method, op, request, using_array_manager, dropna
-):
+def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna):
     # GH8093 & GH26411
     override_dtype = None
 
-    if (
-        isinstance(values, Categorical)
-        and len(keys) == 1
-        and op in ["idxmax", "idxmin"]
-    ):
-        mark = pytest.mark.xfail(
-            raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
-        )
-        request.node.add_marker(mark)
-
     if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
         # We expect to get Int64 back for these
         override_dtype = "Int64"
@@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected():
     is_dt64 = df.dtypes.iloc[0].kind == "M"
     is_cat = isinstance(values, Categorical)
 
-    if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
-        msg = f"Cannot perform {op} with non-ordered Categorical"
-        with pytest.raises(TypeError, match=msg):
+    if (
+        isinstance(values, Categorical)
+        and not values.ordered
+        and op in ["min", "max", "idxmin", "idxmax"]
+    ):
+        if op in ["min", "max"]:
+            msg = f"Cannot perform {op} with non-ordered Categorical"
+            klass = TypeError
+        else:
+            msg = f"Can't get {op} of an empty group due to unobserved categories"
+            klass = ValueError
+        with pytest.raises(klass, match=msg):
             get_result()
 
-        if isinstance(columns, list):
+        if op in ["min", "max"] and isinstance(columns, list):
             # i.e. DataframeGroupBy, not SeriesGroupBy
             result = get_result(numeric_only=True)
             expected = get_categorical_invalid_expected()