Skip to content

Commit

Permalink
BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach committed Sep 24, 2023
1 parent d01669f commit 1325855
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 88 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
run: |
cd asv_bench
asv machine --yes
asv run --quick --dry-run --durations=30 --python=same
asv run --quick --dry-run --durations=30 --python=same --show-stderr
build_docker_dev_environment:
name: Build Docker Dev Environment
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ Plotting

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-
- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`)
-

Reshaping
Expand Down
34 changes: 16 additions & 18 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,15 +1185,23 @@ def nsmallest(
def idxmin(
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
) -> Series:
result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
return result.astype(self.obj.index.dtype) if result.empty else result
if axis is not lib.no_default:
axis = self.obj._get_axis_number(axis)
self._deprecate_axis(axis, "idxmin")
else:
axis = 0
return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)

@doc(Series.idxmax.__doc__)
def idxmax(
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
) -> Series:
result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
return result.astype(self.obj.index.dtype) if result.empty else result
if axis is not lib.no_default:
axis = self.obj._get_axis_number(axis)
self._deprecate_axis(axis, "idxmax")
else:
axis = 0
return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)

@doc(Series.corr.__doc__)
def corr(
Expand Down Expand Up @@ -2195,14 +2203,9 @@ def idxmax(
else:
axis = self.axis

def func(df):
return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = "idxmax"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
return self._idxmax_idxmin(
"idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
)
return result.astype(self.obj.index.dtype) if result.empty else result

def idxmin(
self,
Expand Down Expand Up @@ -2290,14 +2293,9 @@ def idxmin(
else:
axis = self.axis

def func(df):
return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = "idxmin"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
return self._idxmax_idxmin(
"idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
)
return result.astype(self.obj.index.dtype) if result.empty else result

boxplot = boxplot_frame_groupby

Expand Down
100 changes: 96 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2015,10 +2015,16 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
with com.temp_setattr(self, "as_index", True):
# GH#49834 - result needs groups in the index for
# _wrap_transform_fast_result
if engine is not None:
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
result = getattr(self, func)(*args, **kwargs)
if func in ["idxmin", "idxmax"]:
func = cast(Literal["idxmin", "idxmax"], func)
result = self._idxmax_idxmin(
func, ignore_unobserved=True, axis=self.axis, **kwargs
)
else:
if engine is not None:
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
result = getattr(self, func)(*args, **kwargs)

return self._wrap_transform_fast_result(result)

Expand Down Expand Up @@ -5720,6 +5726,92 @@ def sample(
sampled_indices = np.concatenate(sampled_indices)
return self._selected_obj.take(sampled_indices, axis=self.axis)

def _idxmax_idxmin(
self,
how: Literal["idxmax", "idxmin"],
axis: Axis = 0,
numeric_only: bool = False,
skipna: bool = True,
ignore_unobserved: bool = False,
):
"""Compute idxmax/idxmin.
Parameters
----------
how: {"idxmin", "idxmax"}
Whether to compute idxmin or idxmax.
axis : {{0 or 'index', 1 or 'columns'}}, default None
The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
If axis is not provided, grouper's axis is used.
numeric_only : bool, default False
Include only float, int, boolean columns.
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
ignore_unobserved : bool, default False
When True and an unobserved group is encountered, do not raise. This used
for transform where unobserved groups do not play an impact on the result.
Returns
-------
Series or DataFrame
idxmax or idxmin for the groupby operation.
"""
if not self.observed and any(
ping._passed_categorical for ping in self.grouper.groupings
):
expected_len = np.prod(
[len(ping.group_index) for ping in self.grouper.groupings]
)
assert len(self.grouper.result_index) <= expected_len
# result_index only contains observed groups
has_unobserved = len(self.grouper.result_index) < expected_len
raise_err = not ignore_unobserved and has_unobserved
else:
raise_err = False
if raise_err:
raise ValueError(
f"Can't get {how} of an empty group due to unobserved categories. "
"Specify observed=True in groupby instead."
)

try:
if self.obj.ndim == 1:
result = self._op_via_apply(how, skipna=skipna)
else:

def func(df):
method = getattr(df, how)
return method(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = how
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
except ValueError as err:
name = "argmax" if how == "idxmax" else "argmin"
if f"attempt to get {name} of an empty sequence" in str(err):
raise ValueError(
f"Can't get {how} of an empty group due to unobserved categories. "
"Specify observed=True in groupby instead."
)
raise

result = result.astype(self.obj.index.dtype) if result.empty else result

if not skipna:
has_na_value = result.isnull().any(axis=None)
if has_na_value:
warnings.warn(
f"The behavior of {type(self).__name__}.{how} with all-NA "
"values, or any-NA and skipna=False, is deprecated. In a future "
"version this will raise ValueError",
FutureWarning,
stacklevel=find_stack_level(),
)

return result


@doc(GroupBy)
def get_groupby(
Expand Down
52 changes: 44 additions & 8 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
return

agg = getattr(series_groupby, reduction_func)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
agg(*args)
return

result = agg(*args)

assert len(result) == expected_length
Expand Down Expand Up @@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(

series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
agg = getattr(series_groupby, reduction_func)

if reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
agg(*args)
return

result = agg(*args)

zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
Expand Down Expand Up @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

args = get_groupby_method_args(reduction_func, df)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(df_grp, reduction_func)(*args)
return

res = getattr(df_grp, reduction_func)(*args)

expected = _results_for_groupbys_with_missing_categories[reduction_func]
Expand Down Expand Up @@ -1883,14 +1910,7 @@ def test_category_order_reducer(
request, as_index, sort, observed, reduction_func, index_kind, ordered
):
# GH#48749
if (
reduction_func in ("idxmax", "idxmin")
and not observed
and index_kind != "multi"
):
msg = "GH#10694 - idxmax/min fail with unused categories"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif reduction_func == "corrwith" and not as_index:
if reduction_func == "corrwith" and not as_index:
msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif index_kind != "range" and not as_index:
Expand All @@ -1912,6 +1932,15 @@ def test_category_order_reducer(
df = df.set_index(keys)
args = get_groupby_method_args(reduction_func, df)
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(gb, reduction_func)(*args)
return

op_result = getattr(gb, reduction_func)(*args)
if as_index:
result = op_result.index.get_level_values("a").categories
Expand Down Expand Up @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
gb = gb["b"]
args = get_groupby_method_args(reduction_func, df)

if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
gb.agg([reduction_func], *args)
return

result = gb.agg([reduction_func], *args)
expected = getattr(gb, reduction_func)(*args)

Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1():
gb2.idxmax(axis=1)


@pytest.mark.parametrize(
"func, values, expected_values, warn",
[
("idxmin", [0, 1, 2], [0, 2], None),
("idxmax", [0, 1, 2], [1, 2], None),
("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning),
("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning),
("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning),
("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning),
],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series):
# GH#54234
df = DataFrame(
{
"a": [1, 1, 2],
"b": values,
}
)
gb = df.groupby("a")
index = Index([1, 2], name="a")
expected = DataFrame({"b": expected_values}, index=index)
if test_series:
gb = gb["b"]
expected = expected["b"]
klass = "Series" if test_series else "DataFrame"
msg = f"The behavior of {klass}GroupBy.{func} with all-NA values"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb, func)(skipna=False)
tm.assert_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only):
if groupby_func in ("idxmax", "idxmin"):
Expand Down
31 changes: 14 additions & 17 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error():
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
columns, keys, values, method, op, request, using_array_manager, dropna
):
def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna):
# GH8093 & GH26411
override_dtype = None

if (
isinstance(values, Categorical)
and len(keys) == 1
and op in ["idxmax", "idxmin"]
):
mark = pytest.mark.xfail(
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
)
request.node.add_marker(mark)

if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"
Expand Down Expand Up @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected():
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)

if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
with pytest.raises(TypeError, match=msg):
if (
isinstance(values, Categorical)
and not values.ordered
and op in ["min", "max", "idxmin", "idxmax"]
):
if op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
klass = TypeError
else:
msg = f"Can't get {op} of an empty group due to unobserved categories"
klass = ValueError
with pytest.raises(klass, match=msg):
get_result()

if isinstance(columns, list):
if op in ["min", "max"] and isinstance(columns, list):
# i.e. DataframeGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)
expected = get_categorical_invalid_expected()
Expand Down
Loading

0 comments on commit 1325855

Please sign in to comment.