Merge pull request #352 from PyPSA/groupby-df-with-name-column

groupby: fix pandas dataframe with column `name` as grouper
PyPSA · Sep 24, 2024 · b491595 · b491595
2 parents 705194a + 9e26384
commit b491595
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 14 deletions.
diff --git a/doc/release_notes.rst b/doc/release_notes.rst
@@ -4,6 +4,8 @@ Release Notes
 Upcoming Version
 ----------------
 
+* The group dimension when grouping by a pandas dataframe is now always `group`. This fixes the case where the dataframe contains a column named `name`.
+
 Version 0.3.14
 --------------
 

diff --git a/linopy/expressions.py b/linopy/expressions.py
@@ -148,12 +148,17 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy:
         xarray.core.groupby.DataArrayGroupBy
             The groupby object.
         """
-        if isinstance(self.group, (pd.Series, pd.DataFrame)):
+        if isinstance(self.group, pd.DataFrame):
             raise ValueError(
-                "Grouping by pandas objects is only supported in sum function."
+                "Grouping by a DataFrame only supported for `sum` operation with `use_fallback=False`."
             )
+        if isinstance(self.group, pd.Series):
+            group_name = self.group.name or "group"
+            group = DataArray(self.group, name=group_name)
+        else:
+            group = self.group  # type: ignore
 
-        return self.data.groupby(group=self.group, **self.kwargs)
+        return self.data.groupby(group=group, **self.kwargs)
 
     def map(
         self, func: Callable, shortcut: bool = False, args: tuple[()] = (), **kwargs
@@ -210,7 +215,11 @@ def sum(self, use_fallback: bool = False, **kwargs) -> LinearExpression:
         non_fallback_types = (pd.Series, pd.DataFrame, xr.DataArray)
         if isinstance(self.group, non_fallback_types) and not use_fallback:
             group: pd.Series | pd.DataFrame | xr.DataArray = self.group
-            group_name = getattr(group, "name", "group") or "group"
+            if isinstance(group, pd.DataFrame):
+                # dataframes do not have a name, so we need to set it
+                group_name = "group"
+            else:
+                group_name = getattr(group, "name", "group") or "group"
 
             if isinstance(group, DataArray):
                 group = group.to_pandas()
@@ -224,7 +233,9 @@ def sum(self, use_fallback: bool = False, **kwargs) -> LinearExpression:
 
             group_dim = group.index.name
             if group_name == group_dim:
-                raise ValueError("Group name cannot be the same as group dimension")
+                raise ValueError(
+                    "Group name cannot be the same as group dimension in non-fallback mode."
+                )
 
             arrays = [group, group.groupby(group).cumcount()]
             idx = pd.MultiIndex.from_arrays(

diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py
@@ -690,42 +690,74 @@ def test_linear_expression_groupby_with_name(v, use_fallback):
     assert grouped.nterm == 10
 
 
-def test_linear_expression_groupby_with_series(v):
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_series(v, use_fallback):
     expr = 1 * v
     groups = pd.Series([1] * 10 + [2] * 10, index=v.indexes["dim_2"])
-    grouped = expr.groupby(groups).sum()
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
     assert "group" in grouped.dims
     assert (grouped.data.group == [1, 2]).all()
     assert grouped.nterm == 10
 
 
-def test_linear_expression_groupby_with_series_false(v):
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_series_with_name(v, use_fallback):
+    expr = 1 * v
+    groups = pd.Series([1] * 10 + [2] * 10, index=v.indexes[v.dims[0]], name="my_group")
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    assert "my_group" in grouped.dims
+    assert (grouped.data.my_group == [1, 2]).all()
+    assert grouped.nterm == 10
+
+
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_series_false(v, use_fallback):
     expr = 1 * v
     groups = pd.Series([1] * 10 + [2] * 10, index=v.indexes["dim_2"])
     groups.name = "dim_2"
-    with pytest.raises(ValueError):
-        expr.groupby(groups).sum()
+    if not use_fallback:
+        with pytest.raises(ValueError):
+            expr.groupby(groups).sum(use_fallback=use_fallback)
+        return
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
+    assert "dim_2" in grouped.dims
+    assert (grouped.data.dim_2 == [1, 2]).all()
+    assert grouped.nterm == 10
 
 
-def test_linear_expression_groupby_with_dataframe(v):
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_dataframe(v, use_fallback):
     expr = 1 * v
     groups = pd.DataFrame(
         {"a": [1] * 10 + [2] * 10, "b": list(range(4)) * 5}, index=v.indexes["dim_2"]
     )
-    grouped = expr.groupby(groups).sum()
+    if use_fallback:
+        with pytest.raises(ValueError):
+            expr.groupby(groups).sum(use_fallback=use_fallback)
+        return
+
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
     index = pd.MultiIndex.from_frame(groups)
     assert "group" in grouped.dims
     assert set(grouped.data.group.values) == set(index.values)
     assert grouped.nterm == 3
 
 
-def test_linear_expression_groupby_with_dataarray(v):
+@pytest.mark.parametrize("use_fallback", [True, False])
+def test_linear_expression_groupby_with_dataarray(v, use_fallback):
     expr = 1 * v
     df = pd.DataFrame(
         {"a": [1] * 10 + [2] * 10, "b": list(range(4)) * 5}, index=v.indexes["dim_2"]
     )
     groups = xr.DataArray(df)
-    grouped = expr.groupby(groups).sum()
+
+    # this should not be the case, see https://github.com/PyPSA/linopy/issues/351
+    if use_fallback:
+        with pytest.raises(KeyError):
+            expr.groupby(groups).sum(use_fallback=use_fallback)
+        return
+
+    grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
     index = pd.MultiIndex.from_frame(df)
     assert "group" in grouped.dims
     assert set(grouped.data.group.values) == set(index.values)
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,8 @@ Release Notes @@
     Upcoming Version
     ----------------
+    * The group dimension when grouping by a pandas dataframe is now always `group`. This fixes the case where the dataframe contains a column named `name`.
     Version 0.3.14
     --------------
@@ Expand Down @@