From 9b4a81cd4a480930eddd471029179ae2d7183a46 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 10 May 2024 13:38:07 +0000 Subject: [PATCH 1/5] Raise errors for unsupported operations on certain types --- python/cudf/cudf/_lib/groupby.pyx | 34 +++++++++++++++++++++++++- python/cudf/cudf/tests/test_groupby.py | 18 +++++--------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index d5e97439180..e930336e8ee 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -3,7 +3,7 @@ from functools import singledispatch from pandas.errors import DataError -from cudf.api.types import is_string_dtype +from cudf.api.types import _is_categorical_dtype, is_string_dtype from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ( CategoricalDtype, @@ -167,6 +167,38 @@ cdef class GroupBy: included_aggregations_i = [] col_aggregations = [] for agg in aggs: + if ( + is_string_dtype(col) + and agg not in _STRING_AGGS + and not ( + agg in { + "count", + "max", + "min", + "first", + "last", + "nunique", + "unique", + } + or "count" in str(agg) + or (agg is list) + or "nth" in str(agg) + ) + ): + raise TypeError( + f"function is not supported for this dtype: {agg}" + ) + elif ( + _is_categorical_dtype(col) + and agg not in _CATEGORICAL_AGGS + and not ( + agg in {"count", "max", "min", "unique"} or "count" in str(agg) + ) + ): + raise TypeError( + f"{col.dtype} type does not support {agg} operations" + ) + agg_obj = make_aggregation(agg) if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations: included_aggregations_i.append((agg, agg_obj.kind)) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c139b06d20f..70cc1f8123a 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) @@ -560,12 +559,9 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - with expect_warning_if( - func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning - ): - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) @pytest.mark.parametrize("dtype", ["float64"]) @@ -1891,9 +1887,6 @@ def test_groupby_nth(n, by): assert_groupby_results_equal(expect, got, check_dtype=False) -@pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43209", -) def test_raise_data_error(): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) gdf = cudf.from_pandas(pdf) @@ -1904,12 +1897,13 @@ def test_raise_data_error(): ) -def test_drop_unsupported_multi_agg(): +def test_multi_agg(): gdf = cudf.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} ) + pdf = gdf.to_pandas() assert_groupby_results_equal( - gdf.groupby("a").agg(["count", "mean"]), + pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), ) From 9d990ada9b36b8023e2761aba40433914e87bf31 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 10 May 2024 21:19:08 +0000 Subject: [PATCH 2/5] Fix pytest --- python/cudf/cudf/tests/test_groupby.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 70cc1f8123a..6d11bee6385 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -559,9 +559,12 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) + with expect_warning_if( + func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning + ): + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) @pytest.mark.parametrize("dtype", ["float64"]) From d0750475cd336a4fd0c3ca28bb2d9beab92eece9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 14 May 2024 19:27:46 +0000 Subject: [PATCH 3/5] simplify --- python/cudf/cudf/_lib/groupby.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 16075c6f85f..f26ec6404b4 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -167,11 +167,12 @@ cdef class GroupBy: included_aggregations_i = [] col_aggregations = [] for agg in aggs: + str_agg = str(agg) if ( is_string_dtype(col) and agg not in _STRING_AGGS and not ( - agg in { + any(a in str_agg for a in { "count", "max", "min", @@ -179,10 +180,9 @@ cdef class GroupBy: "last", "nunique", "unique", - } - or "count" in str(agg) + "nth" + }) or (agg is list) - or "nth" in str(agg) ) ): raise TypeError( @@ -192,7 +192,7 @@ cdef class GroupBy: _is_categorical_dtype(col) and agg not in _CATEGORICAL_AGGS and not ( - agg in {"count", "max", "min", "unique"} or "count" in str(agg) + any(a in str_agg for a in {"count", "max", "min", "unique"}) ) ): raise TypeError( From 9373e7271538a0a59cce4c4bdf3739726ad2cf1e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 16 May 2024 01:48:12 +0000 Subject: [PATCH 4/5] handle more error cases --- python/cudf/cudf/_lib/groupby.pyx | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f26ec6404b4..9d18e023fe8 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -171,7 +171,10 @@ cdef class GroupBy: if ( is_string_dtype(col) and agg not in _STRING_AGGS - and not ( + and + ( + str_agg in {"cumsum", "cummin", "cummax"} + or not ( any(a in str_agg for a in { "count", "max", @@ -183,6 +186,7 @@ cdef class GroupBy: "nth" }) or (agg is list) + ) ) ): raise TypeError( @@ -191,8 +195,12 @@ cdef class GroupBy: elif ( _is_categorical_dtype(col) and agg not in _CATEGORICAL_AGGS - and not ( - any(a in str_agg for a in {"count", "max", "min", "unique"}) + and ( + str_agg in {"cumsum", "cummin", "cummax"} + or + not ( + any(a in str_agg for a in {"count", "max", "min", "unique"}) + ) ) ): raise TypeError( From 0faaeeeeb8658e6acdf14f793fd72c4317839fe7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 17 May 2024 15:04:43 +0000 Subject: [PATCH 5/5] Add tests --- python/cudf/cudf/tests/test_groupby.py | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 6d11bee6385..674f694a224 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3849,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func): ) result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) assert_eq(expected, result) + + +@pytest.mark.parametrize( + "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"] +) +def test_group_by_raises_string_error(op): + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]}) + + with pytest.raises(TypeError): + df.groupby(df.a).agg(op) + + +@pytest.mark.parametrize( + "op", + [ + "cummax", + "cummin", + "cumprod", + "cumsum", + "mean", + "median", + "prod", + "sum", + list, + ], +) +def test_group_by_raises_category_error(op): + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"), + } + ) + + with pytest.raises(TypeError): + df.groupby(df.a).agg(op)