Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise errors for unsupported operations on certain types #15712

Merged
merged 12 commits into from
May 21, 2024
42 changes: 41 additions & 1 deletion python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ from functools import singledispatch

from pandas.errors import DataError

from cudf.api.types import is_string_dtype
from cudf.api.types import _is_categorical_dtype, is_string_dtype
from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import (
CategoricalDtype,
Expand Down Expand Up @@ -167,6 +167,46 @@ cdef class GroupBy:
included_aggregations_i = []
col_aggregations = []
for agg in aggs:
str_agg = str(agg)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Is it not enough to raise the TypeError if the if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations: condition below fails?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally yes, but that is a very loose condition for many operations vs dtype matrix we have and libcudf seems to be silently returning empty columns for those. We need to dig deeper for other types aswell.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I suppose this PR is a good enough intermediate solution until we can centeralize this in get_valid_aggregation or similar.

if (
is_string_dtype(col)
and agg not in _STRING_AGGS
and
(
str_agg in {"cumsum", "cummin", "cummax"}
or not (
any(a in str_agg for a in {
"count",
"max",
"min",
"first",
"last",
"nunique",
"unique",
"nth"
})
or (agg is list)
)
)
):
raise TypeError(
f"function is not supported for this dtype: {agg}"
)
elif (
_is_categorical_dtype(col)
and agg not in _CATEGORICAL_AGGS
and (
str_agg in {"cumsum", "cummin", "cummax"}
or
not (
any(a in str_agg for a in {"count", "max", "min", "unique"})
)
)
):
raise TypeError(
f"{col.dtype} type does not support {agg} operations"
)

agg_obj = make_aggregation(agg)
if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations:
included_aggregations_i.append((agg, agg_obj.kind))
Expand Down
45 changes: 39 additions & 6 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions(
func, dtype, dataset, groupby_jit_datasets
):
dataset = groupby_jit_datasets[dataset]

groupby_apply_jit_reductions_test_inner(func, dataset, dtype)


Expand Down Expand Up @@ -1891,9 +1890,6 @@ def test_groupby_nth(n, by):
assert_groupby_results_equal(expect, got, check_dtype=False)


@pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43209",
)
def test_raise_data_error():
pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
gdf = cudf.from_pandas(pdf)
Expand All @@ -1904,12 +1900,13 @@ def test_raise_data_error():
)


def test_drop_unsupported_multi_agg():
def test_multi_agg():
gdf = cudf.DataFrame(
{"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
)
pdf = gdf.to_pandas()
assert_groupby_results_equal(
gdf.groupby("a").agg(["count", "mean"]),
pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
)

Expand Down Expand Up @@ -3852,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func):
)
result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
assert_eq(expected, result)


@pytest.mark.parametrize(
"op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"]
)
def test_group_by_raises_string_error(op):
df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]})

with pytest.raises(TypeError):
df.groupby(df.a).agg(op)


@pytest.mark.parametrize(
"op",
[
"cummax",
"cummin",
"cumprod",
"cumsum",
"mean",
"median",
"prod",
"sum",
list,
],
)
def test_group_by_raises_category_error(op):
df = cudf.DataFrame(
{
"a": [1, 2, 3, 4, 5],
"b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"),
}
)

with pytest.raises(TypeError):
df.groupby(df.a).agg(op)
Loading