From 9b4a81cd4a480930eddd471029179ae2d7183a46 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 10 May 2024 13:38:07 +0000
Subject: [PATCH 1/5] Raise errors for unsupported operations on certain types

---
 python/cudf/cudf/_lib/groupby.pyx      | 34 +++++++++++++++++++++++++-
 python/cudf/cudf/tests/test_groupby.py | 18 +++++---------
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index d5e97439180..e930336e8ee 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -3,7 +3,7 @@ from functools import singledispatch
 
 from pandas.errors import DataError
 
-from cudf.api.types import is_string_dtype
+from cudf.api.types import _is_categorical_dtype, is_string_dtype
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.dtypes import (
     CategoricalDtype,
@@ -167,6 +167,38 @@ cdef class GroupBy:
             included_aggregations_i = []
             col_aggregations = []
             for agg in aggs:
+                if (
+                    is_string_dtype(col)
+                    and agg not in _STRING_AGGS
+                    and not (
+                        agg in {
+                            "count",
+                            "max",
+                            "min",
+                            "first",
+                            "last",
+                            "nunique",
+                            "unique",
+                        }
+                        or "count" in str(agg)
+                        or (agg is list)
+                        or "nth" in str(agg)
+                    )
+                ):
+                    raise TypeError(
+                        f"function is not supported for this dtype: {agg}"
+                    )
+                elif (
+                    _is_categorical_dtype(col)
+                    and agg not in _CATEGORICAL_AGGS
+                    and not (
+                        agg in {"count", "max", "min", "unique"} or "count" in str(agg)
+                    )
+                ):
+                    raise TypeError(
+                        f"{col.dtype} type does not support {agg} operations"
+                    )
+
                 agg_obj = make_aggregation(agg)
                 if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations:
                     included_aggregations_i.append((agg, agg_obj.kind))
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index c139b06d20f..70cc1f8123a 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions(
     func, dtype, dataset, groupby_jit_datasets
 ):
     dataset = groupby_jit_datasets[dataset]
-
     groupby_apply_jit_reductions_test_inner(func, dataset, dtype)
 
 
@@ -560,12 +559,9 @@ def test_groupby_apply_jit_reductions_special_vals(
     func, dtype, dataset, groupby_jit_datasets, special_val
 ):
     dataset = groupby_jit_datasets[dataset]
-    with expect_warning_if(
-        func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning
-    ):
-        groupby_apply_jit_reductions_special_vals_inner(
-            func, dataset, dtype, special_val
-        )
+    groupby_apply_jit_reductions_special_vals_inner(
+        func, dataset, dtype, special_val
+    )
 
 
 @pytest.mark.parametrize("dtype", ["float64"])
@@ -1891,9 +1887,6 @@ def test_groupby_nth(n, by):
     assert_groupby_results_equal(expect, got, check_dtype=False)
 
 
-@pytest.mark.xfail(
-    reason="https://github.com/pandas-dev/pandas/issues/43209",
-)
 def test_raise_data_error():
     pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
     gdf = cudf.from_pandas(pdf)
@@ -1904,12 +1897,13 @@ def test_raise_data_error():
     )
 
 
-def test_drop_unsupported_multi_agg():
+def test_multi_agg():
     gdf = cudf.DataFrame(
         {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
     )
+    pdf = gdf.to_pandas()
     assert_groupby_results_equal(
-        gdf.groupby("a").agg(["count", "mean"]),
+        pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
         gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
     )
 

From 9d990ada9b36b8023e2761aba40433914e87bf31 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 10 May 2024 21:19:08 +0000
Subject: [PATCH 2/5] Fix pytest

---
 python/cudf/cudf/tests/test_groupby.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 70cc1f8123a..6d11bee6385 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -559,9 +559,12 @@ def test_groupby_apply_jit_reductions_special_vals(
     func, dtype, dataset, groupby_jit_datasets, special_val
 ):
     dataset = groupby_jit_datasets[dataset]
-    groupby_apply_jit_reductions_special_vals_inner(
-        func, dataset, dtype, special_val
-    )
+    with expect_warning_if(
+        func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning
+    ):
+        groupby_apply_jit_reductions_special_vals_inner(
+            func, dataset, dtype, special_val
+        )
 
 
 @pytest.mark.parametrize("dtype", ["float64"])

From d0750475cd336a4fd0c3ca28bb2d9beab92eece9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 14 May 2024 19:27:46 +0000
Subject: [PATCH 3/5] simplify

---
 python/cudf/cudf/_lib/groupby.pyx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 16075c6f85f..f26ec6404b4 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -167,11 +167,12 @@ cdef class GroupBy:
             included_aggregations_i = []
             col_aggregations = []
             for agg in aggs:
+                str_agg = str(agg)
                 if (
                     is_string_dtype(col)
                     and agg not in _STRING_AGGS
                     and not (
-                        agg in {
+                        any(a in str_agg for a in {
                             "count",
                             "max",
                             "min",
@@ -179,10 +180,9 @@ cdef class GroupBy:
                             "last",
                             "nunique",
                             "unique",
-                        }
-                        or "count" in str(agg)
+                            "nth"
+                        })
                         or (agg is list)
-                        or "nth" in str(agg)
                     )
                 ):
                     raise TypeError(
@@ -192,7 +192,7 @@ cdef class GroupBy:
                     _is_categorical_dtype(col)
                     and agg not in _CATEGORICAL_AGGS
                     and not (
-                        agg in {"count", "max", "min", "unique"} or "count" in str(agg)
+                        any(a in str_agg for a in {"count", "max", "min", "unique"})
                     )
                 ):
                     raise TypeError(

From 9373e7271538a0a59cce4c4bdf3739726ad2cf1e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 16 May 2024 01:48:12 +0000
Subject: [PATCH 4/5] handle more error cases

---
 python/cudf/cudf/_lib/groupby.pyx | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index f26ec6404b4..9d18e023fe8 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -171,7 +171,10 @@ cdef class GroupBy:
                 if (
                     is_string_dtype(col)
                     and agg not in _STRING_AGGS
-                    and not (
+                    and
+                    (
+                        str_agg in {"cumsum", "cummin", "cummax"}
+                        or not (
                         any(a in str_agg for a in {
                             "count",
                             "max",
@@ -183,6 +186,7 @@ cdef class GroupBy:
                             "nth"
                         })
                         or (agg is list)
+                        )
                     )
                 ):
                     raise TypeError(
@@ -191,8 +195,12 @@ cdef class GroupBy:
                 elif (
                     _is_categorical_dtype(col)
                     and agg not in _CATEGORICAL_AGGS
-                    and not (
-                        any(a in str_agg for a in {"count", "max", "min", "unique"})
+                    and (
+                        str_agg in {"cumsum", "cummin", "cummax"}
+                        or
+                        not (
+                            any(a in str_agg for a in {"count", "max", "min", "unique"})
+                        )
                     )
                 ):
                     raise TypeError(

From 0faaeeeeb8658e6acdf14f793fd72c4317839fe7 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 17 May 2024 15:04:43 +0000
Subject: [PATCH 5/5] Add tests

---
 python/cudf/cudf/tests/test_groupby.py | 36 ++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 6d11bee6385..674f694a224 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3849,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func):
     )
     result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
     assert_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"]
+)
+def test_group_by_raises_string_error(op):
+    df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]})
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+        "mean",
+        "median",
+        "prod",
+        "sum",
+        list,
+    ],
+)
+def test_group_by_raises_category_error(op):
+    df = cudf.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5],
+            "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"),
+        }
+    )
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)