Skip to content

Commit

Permalink
Raise errors for unsupported operations on certain types (#15712)
Browse files Browse the repository at this point in the history
Fixes: #15668

This PR raises errors for groupby operations on un-supported types.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #15712
  • Loading branch information
galipremsagar authored May 21, 2024
1 parent 4da00ea commit 6b1248e
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 7 deletions.
42 changes: 41 additions & 1 deletion python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ from functools import singledispatch

from pandas.errors import DataError

from cudf.api.types import is_string_dtype
from cudf.api.types import _is_categorical_dtype, is_string_dtype
from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import (
CategoricalDtype,
Expand Down Expand Up @@ -167,6 +167,46 @@ cdef class GroupBy:
included_aggregations_i = []
col_aggregations = []
for agg in aggs:
str_agg = str(agg)
if (
is_string_dtype(col)
and agg not in _STRING_AGGS
and
(
str_agg in {"cumsum", "cummin", "cummax"}
or not (
any(a in str_agg for a in {
"count",
"max",
"min",
"first",
"last",
"nunique",
"unique",
"nth"
})
or (agg is list)
)
)
):
raise TypeError(
f"function is not supported for this dtype: {agg}"
)
elif (
_is_categorical_dtype(col)
and agg not in _CATEGORICAL_AGGS
and (
str_agg in {"cumsum", "cummin", "cummax"}
or
not (
any(a in str_agg for a in {"count", "max", "min", "unique"})
)
)
):
raise TypeError(
f"{col.dtype} type does not support {agg} operations"
)

agg_obj = make_aggregation(agg)
if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations:
included_aggregations_i.append((agg, agg_obj.kind))
Expand Down
45 changes: 39 additions & 6 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions(
func, dtype, dataset, groupby_jit_datasets
):
dataset = groupby_jit_datasets[dataset]

groupby_apply_jit_reductions_test_inner(func, dataset, dtype)


Expand Down Expand Up @@ -1891,9 +1890,6 @@ def test_groupby_nth(n, by):
assert_groupby_results_equal(expect, got, check_dtype=False)


@pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43209",
)
def test_raise_data_error():
pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
gdf = cudf.from_pandas(pdf)
Expand All @@ -1904,12 +1900,13 @@ def test_raise_data_error():
)


def test_drop_unsupported_multi_agg():
def test_multi_agg():
gdf = cudf.DataFrame(
{"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
)
pdf = gdf.to_pandas()
assert_groupby_results_equal(
gdf.groupby("a").agg(["count", "mean"]),
pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
)

Expand Down Expand Up @@ -3852,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func):
)
result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
assert_eq(expected, result)


@pytest.mark.parametrize(
"op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"]
)
def test_group_by_raises_string_error(op):
df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]})

with pytest.raises(TypeError):
df.groupby(df.a).agg(op)


@pytest.mark.parametrize(
"op",
[
"cummax",
"cummin",
"cumprod",
"cumsum",
"mean",
"median",
"prod",
"sum",
list,
],
)
def test_group_by_raises_category_error(op):
df = cudf.DataFrame(
{
"a": [1, 2, 3, 4, 5],
"b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"),
}
)

with pytest.raises(TypeError):
df.groupby(df.a).agg(op)

0 comments on commit 6b1248e

Please sign in to comment.