From 4e0584b7105209038394d01836d6d3e0b21c2ac3 Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Thu, 26 Aug 2021 18:19:14 -0400
Subject: [PATCH] Add handling for nested dicts in dask-cudf groupby (#9054)

Closes #9017

Adds handling for nested dict (renamed) aggregations supplied to dask-cudf's groupby, by storing the new aggregation names when standardizing the `aggs` input and applying them in `_finalize_gb_agg()`.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/9054
---
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/dask_cudf/dask_cudf/groupby.py         | 80 +++++++++++--------
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 29 +++++++
 3 files changed, 78 insertions(+), 33 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 721ebf22de7..0d833a7d341 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2585,7 +2585,7 @@ def columns(self, columns):
 
         if not len(columns) == len(self._data.names):
             raise ValueError(
-                f"Length mismatch: expected {len(self._data.names)} elements ,"
+                f"Length mismatch: expected {len(self._data.names)} elements, "
                 f"got {len(columns)} elements"
             )
 
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 600d6cc7412..7e2c3a4f36c 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -198,34 +198,8 @@ def groupby_agg(
         in `dask.dataframe`, because it allows the cudf backend to
         perform multiple aggregations at once.
     """
-
-    # Deal with default split_out and split_every params
-    if split_every is False:
-        split_every = ddf.npartitions
-    split_every = split_every or 8
-    split_out = split_out or 1
-
-    # Standardize `gb_cols` and `columns` lists
-    aggs = _redirect_aggs(aggs_in.copy())
-    if isinstance(gb_cols, str):
-        gb_cols = [gb_cols]
-    columns = [c for c in ddf.columns if c not in gb_cols]
-    str_cols_out = False
-    if isinstance(aggs, dict):
-        # Use `str_cols_out` to specify if the output columns
-        # will have str (rather than MultiIndex/tuple) names.
-        # This happens when all values in the `aggs` dict are
-        # strings (no lists)
-        str_cols_out = True
-        for col in aggs:
-            if isinstance(aggs[col], str) or callable(aggs[col]):
-                aggs[col] = [aggs[col]]
-            else:
-                str_cols_out = False
-            if col in gb_cols:
-                columns.append(col)
-
     # Assert that aggregations are supported
+    aggs = _redirect_aggs(aggs_in)
     _supported = {
         "count",
         "mean",
@@ -244,10 +218,39 @@ def groupby_agg(
             f"Aggregations must be specified with dict or list syntax."
         )
 
-    # Always convert aggs to dict for consistency
+    # Deal with default split_out and split_every params
+    if split_every is False:
+        split_every = ddf.npartitions
+    split_every = split_every or 8
+    split_out = split_out or 1
+
+    # Standardize `gb_cols`, `columns`, and `aggs`
+    if isinstance(gb_cols, str):
+        gb_cols = [gb_cols]
+    columns = [c for c in ddf.columns if c not in gb_cols]
     if isinstance(aggs, list):
         aggs = {col: aggs for col in columns}
 
+    # Assert if our output will have a MultiIndex; this will be the case if
+    # any value in the `aggs` dict is not a string (i.e. multiple/named
+    # aggregations per column)
+    str_cols_out = True
+    aggs_renames = {}
+    for col in aggs:
+        if isinstance(aggs[col], str) or callable(aggs[col]):
+            aggs[col] = [aggs[col]]
+        elif isinstance(aggs[col], dict):
+            str_cols_out = False
+            col_aggs = []
+            for k, v in aggs[col].items():
+                aggs_renames[col, v] = k
+                col_aggs.append(v)
+            aggs[col] = col_aggs
+        else:
+            str_cols_out = False
+        if col in gb_cols:
+            columns.append(col)
+
     # Begin graph construction
     dsk = {}
     token = tokenize(ddf, gb_cols, aggs)
@@ -314,6 +317,13 @@ def groupby_agg(
         for col in aggs:
             _aggs[col] = _aggs[col][0]
     _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
+    if aggs_renames:
+        col_array = []
+        agg_array = []
+        for col, agg in _meta.columns:
+            col_array.append(col)
+            agg_array.append(aggs_renames.get((col, agg), agg))
+        _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
     for s in range(split_out):
         dsk[(gb_agg_name, s)] = (
             _finalize_gb_agg,
@@ -326,6 +336,7 @@ def groupby_agg(
             sort,
             sep,
             str_cols_out,
+            aggs_renames,
         )
 
     divisions = [None] * (split_out + 1)
@@ -350,6 +361,10 @@ def _redirect_aggs(arg):
         for col in arg:
             if isinstance(arg[col], list):
                 new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]]
+            elif isinstance(arg[col], dict):
+                new_arg[col] = {
+                    k: redirects.get(v, v) for k, v in arg[col].items()
+                }
             else:
                 new_arg[col] = redirects.get(arg[col], arg[col])
         return new_arg
@@ -367,6 +382,8 @@ def _is_supported(arg, supported: set):
             for col in arg:
                 if isinstance(arg[col], list):
                     _global_set = _global_set.union(set(arg[col]))
+                elif isinstance(arg[col], dict):
+                    _global_set = _global_set.union(set(arg[col].values()))
                 else:
                     _global_set.add(arg[col])
         else:
@@ -460,10 +477,8 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
         agg = col.split(sep)[-1]
         if agg in ("count", "sum"):
             agg_dict[col] = ["sum"]
-        elif agg in ("min", "max"):
+        elif agg in ("min", "max", "collect"):
             agg_dict[col] = [agg]
-        elif agg == "collect":
-            agg_dict[col] = ["collect"]
         else:
             raise ValueError(f"Unexpected aggregation: {agg}")
 
@@ -508,6 +523,7 @@ def _finalize_gb_agg(
     sort,
     sep,
     str_cols_out,
+    aggs_renames,
 ):
     """ Final aggregation task.
 
@@ -564,7 +580,7 @@ def _finalize_gb_agg(
         else:
             name, agg = col.split(sep)
             col_array.append(name)
-            agg_array.append(agg)
+            agg_array.append(aggs_renames.get((name, agg), agg))
     if str_cols_out:
         gb.columns = col_array
     else:
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 61fa32b76ed..6569ffa94c5 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -645,3 +645,32 @@ def test_groupby_with_list_of_series():
     dd.assert_eq(
         gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"])
     )
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda df: df.groupby("x").agg({"y": {"foo": "sum"}}),
+        lambda df: df.groupby("x").agg({"y": {"foo": "sum", "bar": "count"}}),
+    ],
+)
+def test_groupby_nested_dict(func):
+    pdf = pd.DataFrame(
+        {
+            "x": np.random.randint(0, 5, size=10000),
+            "y": np.random.normal(size=10000),
+        }
+    )
+
+    ddf = dd.from_pandas(pdf, npartitions=5)
+    c_ddf = ddf.map_partitions(cudf.from_pandas)
+
+    a = func(ddf).compute()
+    b = func(c_ddf).compute().to_pandas()
+
+    a.index.name = None
+    a.name = None
+    b.index.name = None
+    b.name = None
+
+    dd.assert_eq(a, b)