From 7cc4d5375e57277f316c5a8bb17a58ab8003c5f1 Mon Sep 17 00:00:00 2001 From: janus Date: Wed, 12 Aug 2020 15:42:53 +0200 Subject: [PATCH 01/23] Fix GH-29442 DataFrame.groupby doesn't preserve _metadata This bug is a regression in v1.1.0 and was introduced by the fix for GH-34214 in commit [6f065b]. Underlying cause is that the `*Splitter` classes do not use the `._constructor` property and do not call `__finalize__`. Please note that the method name used for `__finalize__` calls was my best guess since documentation for the value has been hard to find. [6f065b]: https://github.com/pandas-dev/pandas/commit/6f065b6d423ea211d803e8be93c27f547541c372 --- pandas/core/groupby/ops.py | 5 +- pandas/tests/groupby/test_custom_metadata.py | 83 ++++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/groupby/test_custom_metadata.py diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 64eb413fe78fa..b3baa39fe3f49 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -955,7 +955,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + return sdata._constructor(mgr, name=sdata.name, fastpath=True)\ + .__finalize__(sdata, method='groupby') class FrameSplitter(DataSplitter): @@ -971,7 +972,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + return sdata._constructor(mgr).__finalize__(sdata, method='groupby') def get_splitter( diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py new file mode 100644 index 0000000000000..f1b5d50d30b22 --- /dev/null +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -0,0 +1,83 @@ +""" +Test metadata propagation in groupby + +The PandasTable class below is implemented according to the [guidelines], and as such would +expect `__finalize__` to always be called so that the `_pandastable_metadata` is always populated. + +[guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties +""" + +import pytest +import pandas as pd +from warnings import warn +from typing import List + + +_TABLE_METADATA_FIELD_NAME = '_pandastable_metadata' + + +def _combine_metadata(data: List[str]) -> str: + """ + A mock implementation for testing + """ + return '+'.join(data) + + +class PandasTable(pd.DataFrame): + """ + A pandas dataframe subclass with associated table metadata. + """ + + _metadata = [_TABLE_METADATA_FIELD_NAME] # Register metadata fieldnames here + + @property + def _constructor(self): + return PandasTable + + def __finalize__(self, other, method=None, **kwargs): + """ + This method is responsible for populating metadata when creating new Table-object. + + The method argument is subject to change, and a robust handling of this is implemented + """ + src = [other] #more logic here in actual implementation + metadata = _combine_metadata([d.get_metadata() for d in src if isinstance(d, PandasTable)]) + + if not metadata: + warn('__finalize__ unable to combine metadata for method "{method}", falling back to DataFrame') + return pd.DataFrame(self) + object.__setattr__(self, _TABLE_METADATA_FIELD_NAME, metadata) + return self + + def get_metadata(self): + #return object.__getattribute__(self, _TABLE_METADATA_FIELD_NAME) + metadata = getattr(self, _TABLE_METADATA_FIELD_NAME, None) + if metadata is None: + warn('PandasTable object not correctly initialized: no metadata') + return metadata + + @staticmethod + def from_table_data(df: pd.DataFrame, metadata) -> 'PandasTable': + df = PandasTable(df) + object.__setattr__(df, _TABLE_METADATA_FIELD_NAME, metadata) + return df + + +@pytest.fixture +def dft(): + df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={'a','b','g'}) + return PandasTable.from_table_data(df, 'My metadata') + + +def test_initial_metadata(dft): + assert dft.get_metadata() == 'My metadata' + + +def test_basic_propagation(dft): + gg = dft.loc[dft.g==0, :] + assert gg.get_metadata() == 'My metadata' + + +def test_groupby(dft): + gg = [ab for g, ab in dft.groupby('g')] + assert gg[0].get_metadata() is not None From 0ab766db189bd935557563e9638769dfc38c55bc Mon Sep 17 00:00:00 2001 From: janus Date: Wed, 12 Aug 2020 16:01:04 +0200 Subject: [PATCH 02/23] Fix most PEP8 issues I don't know how to truncate remaining long lines without loosing information. --- pandas/tests/groupby/test_custom_metadata.py | 23 +++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index f1b5d50d30b22..4556566880617 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,8 +1,9 @@ """ Test metadata propagation in groupby -The PandasTable class below is implemented according to the [guidelines], and as such would -expect `__finalize__` to always be called so that the `_pandastable_metadata` is always populated. +The PandasTable class below is implemented according to the [guidelines], +and as such would expect `__finalize__` to always be called so that the +`_pandastable_metadata` is always populated. [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties """ @@ -38,19 +39,21 @@ def __finalize__(self, other, method=None, **kwargs): """ This method is responsible for populating metadata when creating new Table-object. - The method argument is subject to change, and a robust handling of this is implemented + The method argument is subject to change, and a robust handling of this + is implemented. """ - src = [other] #more logic here in actual implementation - metadata = _combine_metadata([d.get_metadata() for d in src if isinstance(d, PandasTable)]) + src = [other] # more logic here in actual implementation + metadata = _combine_metadata( + [d.get_metadata() for d in src if isinstance(d, PandasTable)]) if not metadata: - warn('__finalize__ unable to combine metadata for method "{method}", falling back to DataFrame') + warn('__finalize__ unable to combine metadata for method "{method}", ' + 'falling back to DataFrame') return pd.DataFrame(self) object.__setattr__(self, _TABLE_METADATA_FIELD_NAME, metadata) return self def get_metadata(self): - #return object.__getattribute__(self, _TABLE_METADATA_FIELD_NAME) metadata = getattr(self, _TABLE_METADATA_FIELD_NAME, None) if metadata is None: warn('PandasTable object not correctly initialized: no metadata') @@ -65,8 +68,8 @@ def from_table_data(df: pd.DataFrame, metadata) -> 'PandasTable': @pytest.fixture def dft(): - df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={'a','b','g'}) - return PandasTable.from_table_data(df, 'My metadata') + df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={'a', 'b', 'g'}) + return PandasTable.from_table_data(df, 'My metadata') def test_initial_metadata(dft): @@ -74,7 +77,7 @@ def test_initial_metadata(dft): def test_basic_propagation(dft): - gg = dft.loc[dft.g==0, :] + gg = dft.loc[dft.g == 0, :] assert gg.get_metadata() == 'My metadata' From d7b42e3d292b402293cb22925567857ef9cafa36 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 13 Aug 2020 09:10:43 +0200 Subject: [PATCH 03/23] Fix remaining PEP8 issues --- pandas/tests/groupby/test_custom_metadata.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 4556566880617..46176fcc15d3f 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -5,7 +5,7 @@ and as such would expect `__finalize__` to always be called so that the `_pandastable_metadata` is always populated. -[guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties +[guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa """ import pytest @@ -37,10 +37,9 @@ def _constructor(self): def __finalize__(self, other, method=None, **kwargs): """ - This method is responsible for populating metadata when creating new Table-object. + This method be called after constructor to populate metadata - The method argument is subject to change, and a robust handling of this - is implemented. + The "method" argument is subject to change and must be handled robustly. """ src = [other] # more logic here in actual implementation metadata = _combine_metadata( From fbc602c6e183c22c340e7decbc0ef66732f287f0 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 13 Aug 2020 12:56:13 +0200 Subject: [PATCH 04/23] Apply "black" styling and fix typo This should hopefully resolve remaining linter issues. --- pandas/core/groupby/ops.py | 7 +++-- pandas/tests/groupby/test_custom_metadata.py | 29 +++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b3baa39fe3f49..4eef300eb6596 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -955,8 +955,9 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return sdata._constructor(mgr, name=sdata.name, fastpath=True)\ - .__finalize__(sdata, method='groupby') + return sdata._constructor(mgr, name=sdata.name, fastpath=True).__finalize__( + sdata, method="groupby" + ) class FrameSplitter(DataSplitter): @@ -972,7 +973,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return sdata._constructor(mgr).__finalize__(sdata, method='groupby') + return sdata._constructor(mgr).__finalize__(sdata, method="groupby") def get_splitter( diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 46176fcc15d3f..3e4d75fa51112 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -14,14 +14,14 @@ from typing import List -_TABLE_METADATA_FIELD_NAME = '_pandastable_metadata' +_TABLE_METADATA_FIELD_NAME = "_pandastable_metadata" def _combine_metadata(data: List[str]) -> str: """ A mock implementation for testing """ - return '+'.join(data) + return "+".join(data) class PandasTable(pd.DataFrame): @@ -37,17 +37,20 @@ def _constructor(self): def __finalize__(self, other, method=None, **kwargs): """ - This method be called after constructor to populate metadata + This method will be called after constructor to populate metadata The "method" argument is subject to change and must be handled robustly. """ src = [other] # more logic here in actual implementation metadata = _combine_metadata( - [d.get_metadata() for d in src if isinstance(d, PandasTable)]) + [d.get_metadata() for d in src if isinstance(d, PandasTable)] + ) if not metadata: - warn('__finalize__ unable to combine metadata for method "{method}", ' - 'falling back to DataFrame') + warn( + '__finalize__ unable to combine metadata for method "{method}", ' + "falling back to DataFrame" + ) return pd.DataFrame(self) object.__setattr__(self, _TABLE_METADATA_FIELD_NAME, metadata) return self @@ -55,11 +58,11 @@ def __finalize__(self, other, method=None, **kwargs): def get_metadata(self): metadata = getattr(self, _TABLE_METADATA_FIELD_NAME, None) if metadata is None: - warn('PandasTable object not correctly initialized: no metadata') + warn("PandasTable object not correctly initialized: no metadata") return metadata @staticmethod - def from_table_data(df: pd.DataFrame, metadata) -> 'PandasTable': + def from_table_data(df: pd.DataFrame, metadata) -> "PandasTable": df = PandasTable(df) object.__setattr__(df, _TABLE_METADATA_FIELD_NAME, metadata) return df @@ -67,19 +70,19 @@ def from_table_data(df: pd.DataFrame, metadata) -> 'PandasTable': @pytest.fixture def dft(): - df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={'a', 'b', 'g'}) - return PandasTable.from_table_data(df, 'My metadata') + df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={"a", "b", "g"}) + return PandasTable.from_table_data(df, "My metadata") def test_initial_metadata(dft): - assert dft.get_metadata() == 'My metadata' + assert dft.get_metadata() == "My metadata" def test_basic_propagation(dft): gg = dft.loc[dft.g == 0, :] - assert gg.get_metadata() == 'My metadata' + assert gg.get_metadata() == "My metadata" def test_groupby(dft): - gg = [ab for g, ab in dft.groupby('g')] + gg = [ab for g, ab in dft.groupby("g")] assert gg[0].get_metadata() is not None From e9c64ac3b1aaefbe24985dc03106b9c1c7ae9b4f Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 13 Aug 2020 13:24:04 +0200 Subject: [PATCH 05/23] Fix import sorting issues reported by `isort` --- pandas/tests/groupby/test_custom_metadata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 3e4d75fa51112..87742fdfecb1f 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -8,11 +8,12 @@ [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa """ -import pytest -import pandas as pd -from warnings import warn from typing import List +from warnings import warn +import pytest + +import pandas as pd _TABLE_METADATA_FIELD_NAME = "_pandastable_metadata" From ec7fd00ffe5be5a2ea1eecf2bd4446e8bc9a3198 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 14 Aug 2020 10:58:13 +0200 Subject: [PATCH 06/23] Make testcase more concise --- pandas/tests/groupby/test_custom_metadata.py | 90 +++----------------- 1 file changed, 13 insertions(+), 77 deletions(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 87742fdfecb1f..90d4c78d1f0c7 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,89 +1,25 @@ -""" -Test metadata propagation in groupby +from pandas import DataFrame -The PandasTable class below is implemented according to the [guidelines], -and as such would expect `__finalize__` to always be called so that the -`_pandastable_metadata` is always populated. -[guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa -""" - -from typing import List -from warnings import warn - -import pytest - -import pandas as pd - -_TABLE_METADATA_FIELD_NAME = "_pandastable_metadata" - - -def _combine_metadata(data: List[str]) -> str: - """ - A mock implementation for testing +class CustomDataFrame(DataFrame): """ - return "+".join(data) - + Extension of DataFrame as described in [guidelines] -class PandasTable(pd.DataFrame): - """ - A pandas dataframe subclass with associated table metadata. + [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa """ - _metadata = [_TABLE_METADATA_FIELD_NAME] # Register metadata fieldnames here + _metadata = ["_custom_metadata"] @property def _constructor(self): - return PandasTable - - def __finalize__(self, other, method=None, **kwargs): - """ - This method will be called after constructor to populate metadata - - The "method" argument is subject to change and must be handled robustly. - """ - src = [other] # more logic here in actual implementation - metadata = _combine_metadata( - [d.get_metadata() for d in src if isinstance(d, PandasTable)] - ) - - if not metadata: - warn( - '__finalize__ unable to combine metadata for method "{method}", ' - "falling back to DataFrame" - ) - return pd.DataFrame(self) - object.__setattr__(self, _TABLE_METADATA_FIELD_NAME, metadata) - return self - - def get_metadata(self): - metadata = getattr(self, _TABLE_METADATA_FIELD_NAME, None) - if metadata is None: - warn("PandasTable object not correctly initialized: no metadata") - return metadata - - @staticmethod - def from_table_data(df: pd.DataFrame, metadata) -> "PandasTable": - df = PandasTable(df) - object.__setattr__(df, _TABLE_METADATA_FIELD_NAME, metadata) - return df - - -@pytest.fixture -def dft(): - df = pd.DataFrame([[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns={"a", "b", "g"}) - return PandasTable.from_table_data(df, "My metadata") - - -def test_initial_metadata(dft): - assert dft.get_metadata() == "My metadata" - + return CustomDataFrame -def test_basic_propagation(dft): - gg = dft.loc[dft.g == 0, :] - assert gg.get_metadata() == "My metadata" +def test_groupby_with_custom_metadata(): + custom_df = CustomDataFrame( + [[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns=["a", "b", "g"] + ) + custom_df._custom_metadata = "Custom metadata" -def test_groupby(dft): - gg = [ab for g, ab in dft.groupby("g")] - assert gg[0].get_metadata() is not None + for _, group_df in custom_df.groupby("g"): + assert group_df._custom_metadata == "Custom metadata" From 14056678c7c51af5015bf5b9024615c599dd96d0 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 27 Aug 2020 09:28:10 +0200 Subject: [PATCH 07/23] Include test from original issue Added testcase `test_groupby_sum_with_custom_metadata` for functionality exercised in the #GH-29442. Testcase fails on current code. --- pandas/tests/groupby/test_custom_metadata.py | 36 +++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 90d4c78d1f0c7..65999e67be8d0 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,25 +1,43 @@ -from pandas import DataFrame +import pandas as pd -class CustomDataFrame(DataFrame): +class SubclassedDataFrame2(pd.DataFrame): """ Extension of DataFrame as described in [guidelines] [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa """ - _metadata = ["_custom_metadata"] + # temporary properties + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ["added_property"] @property def _constructor(self): - return CustomDataFrame + return SubclassedDataFrame2 def test_groupby_with_custom_metadata(): - custom_df = CustomDataFrame( + custom_df = SubclassedDataFrame2( [[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns=["a", "b", "g"] ) - custom_df._custom_metadata = "Custom metadata" - - for _, group_df in custom_df.groupby("g"): - assert group_df._custom_metadata == "Custom metadata" + custom_df.added_property = "hello_pandas" + grouped = custom_df.groupby("g") + for _, group_df in grouped: + assert group_df.added_property == "hello_pandas" + + +def test_groupby_sum_with_custom_metadata(): + my_data_as_dictionary = { + "mycategorical": [1, 1, 2], + "myfloat1": [1.0, 2.0, 3.0], + "myfloat2": [1.0, 2.0, 3.0], + } + sdf = SubclassedDataFrame2(my_data_as_dictionary) + sdf.added_property = "hello pandas" + grouped = sdf.groupby("mycategorical")[["myfloat1", "myfloat2"]] + df = grouped.sum() + assert df.added_property == "hello pandas" From 8d3d896bcb20f7bcb0d3ac2b71098dfb05fff11e Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 27 Aug 2020 09:42:25 +0200 Subject: [PATCH 08/23] Fix metadata handling for `.groupby(...).sum()` In order to propagate metadata fields, the `__finalize__` method must be called for the resulting DataFrame with a reference to input. By implementing this in `_GroupBy._agg_general`, this is performed as late as possible for the `.sum()` (and similar) code-paths. Fixes #GH-29442 --- pandas/core/groupby/groupby.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4597afeeaddbf..95bfe35d9b22a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -993,8 +993,9 @@ def _agg_general( ): with _group_selection_context(self): # try a cython aggregation if we can + result = None try: - return self._cython_agg_general( + result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, @@ -1009,10 +1010,10 @@ def _agg_general( # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + if result is None: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result.__finalize__(self.obj, method='groupby') def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 From 39e9f33d741b05905fcf5fac614881059940055d Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 27 Aug 2020 10:02:24 +0200 Subject: [PATCH 09/23] Apply black format --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 95bfe35d9b22a..b28a9c6ad0311 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1013,7 +1013,7 @@ def _agg_general( # apply a non-cython aggregation if result is None: result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result.__finalize__(self.obj, method='groupby') + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 From 7075ef28463cfb1c3db5eb9aa3659791066e84c7 Mon Sep 17 00:00:00 2001 From: janus Date: Wed, 23 Sep 2020 08:50:57 +0200 Subject: [PATCH 10/23] Remove xfail decorator for `sum` aggregation test --- pandas/tests/generic/test_finalize.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4d0f1a326225d..3cf45e61ee9c1 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -778,6 +778,20 @@ def test_categorical_accessor(method): [ operator.methodcaller("sum"), lambda x: x.agg("sum"), + ], +) +def test_groupby_passing(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ lambda x: x.agg(["sum", "count"]), lambda x: x.transform(lambda y: y), lambda x: x.apply(lambda y: y), From 4720c979dea17b2942694ec223fe2db487c299e6 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 08:33:37 +0200 Subject: [PATCH 11/23] Move `__finalize__` outsize context handler --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a3d2efb8a8e6a..cc25ac1c40f8c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1017,7 +1017,7 @@ def _agg_general( # apply a non-cython aggregation if result is None: result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result.__finalize__(self.obj, method="groupby") + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 From 6b290e8eceada67a65f2b685409bd90676e25e54 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 09:15:22 +0200 Subject: [PATCH 12/23] Revert __finalize__ call in DataSplitter The call was an attempt to ensure that `__finalize__` was called for DataFrames returned when iterating over grouping result, but it caused unreasonable performance degradation. The relevant test has been marked with `xfail` in order to preserve it. When a solution to the issue is implemented, this test should be reactivated. --- pandas/core/groupby/ops.py | 6 ++---- pandas/tests/groupby/test_custom_metadata.py | 10 ++++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b0b1217143b4f..84efde93c222c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -909,9 +909,7 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return sdata._constructor(mgr, name=sdata.name, fastpath=True).__finalize__( - sdata, method="groupby" - ) + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -927,7 +925,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return sdata._constructor(mgr).__finalize__(sdata, method="groupby") + return sdata._constructor(mgr) def get_splitter( diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 65999e67be8d0..1f95d17472f9e 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,3 +1,5 @@ +import pytest + import pandas as pd @@ -8,10 +10,6 @@ class SubclassedDataFrame2(pd.DataFrame): [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa """ - # temporary properties - _internal_names = pd.DataFrame._internal_names + ["internal_cache"] - _internal_names_set = set(_internal_names) - # normal properties _metadata = ["added_property"] @@ -20,6 +18,10 @@ def _constructor(self): return SubclassedDataFrame2 +@pytest.mark.xfail( + reason="Missing high-performance implementation of metadata support for groupby. " + "__finalize__ is not called for grouped dataframes" +) def test_groupby_with_custom_metadata(): custom_df = SubclassedDataFrame2( [[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns=["a", "b", "g"] From a2bdc3d3dfe80018bb6cabce3583fdf9df6cdca9 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 09:19:27 +0200 Subject: [PATCH 13/23] Add performance test counting __finalize__ calls --- pandas/tests/groupby/test_custom_metadata.py | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 1f95d17472f9e..9b4c69f575d0a 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -43,3 +44,38 @@ def test_groupby_sum_with_custom_metadata(): grouped = sdf.groupby("mycategorical")[["myfloat1", "myfloat2"]] df = grouped.sum() assert df.added_property == "hello pandas" + + +def test_groupby_apply_performance_with_custom_metadata(): + """ + Check that __finalize__ is not called for each group during .apply + """ + counter = {"value": 0} + + class SubclassedDataFrame3(pd.DataFrame): + # normal properties + _metadata = ["added_property"] + + @property + def _constructor(self): + return SubclassedDataFrame3 + + def __finalize__(self, *args, **kwargs): + counter["value"] += 1 + return super().__finalize__(*args, **kwargs) + + N = 10 ** 4 + labels = np.random.randint(0, N // 5, size=N) + labels2 = np.random.randint(0, 3, size=N) + df = SubclassedDataFrame3( + { + "key": labels, + "key2": labels2, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) + df.index = pd.CategoricalIndex(df.key) + df.groupby(level="key").apply(lambda x: 1) + finalize_call_count = counter["value"] + assert finalize_call_count < 42 # Random number << len(labels2) From 2af844716d4b1ade43cf0cebffd751fa4b8317d7 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 09:58:04 +0200 Subject: [PATCH 14/23] Add whatsnew entry --- doc/source/whatsnew/v1.1.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 15777abcb8084..87748735ec9ae 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -58,6 +58,7 @@ Bug fixes - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) - Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`) +- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()` and similar to not preserve metadata (:issue:`29442`) .. --------------------------------------------------------------------------- From ec3eeb154972de907b5ab8a227066657b6418db0 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 10:37:57 +0200 Subject: [PATCH 15/23] Move __finalize__ inside context manager In attempt to address test failure in pandas\tests\plotting\test_groupby.py that I cannot reproduce locally. --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cc25ac1c40f8c..a3d2efb8a8e6a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1017,7 +1017,7 @@ def _agg_general( # apply a non-cython aggregation if result is None: result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result.__finalize__(self.obj, method="groupby") + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 From 54875637014830710a779a97e627e646a46f0740 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 2 Oct 2020 12:01:05 +0200 Subject: [PATCH 16/23] Fix typo in whatsnew entry --- doc/source/whatsnew/v1.1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 87748735ec9ae..501fe8efd10cd 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -58,7 +58,7 @@ Bug fixes - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) - Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`) -- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()` and similar to not preserve metadata (:issue:`29442`) +- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) .. --------------------------------------------------------------------------- From 8bfd08d75e36a30886b7536b3ee26126a50344ab Mon Sep 17 00:00:00 2001 From: janus Date: Tue, 6 Oct 2020 10:08:20 +0200 Subject: [PATCH 17/23] Revert 1.1.3 whatsnew entry --- doc/source/whatsnew/v1.1.3.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 501fe8efd10cd..15777abcb8084 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -58,7 +58,6 @@ Bug fixes - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) - Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`) -- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) .. --------------------------------------------------------------------------- From 7878041149bee9389b623d46afd0650f8c58c62a Mon Sep 17 00:00:00 2001 From: janus Date: Tue, 6 Oct 2020 10:10:02 +0200 Subject: [PATCH 18/23] Add 1.1.4 whatsnew entry --- doc/source/whatsnew/v1.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index e63912ebc8fee..7617d8d36c07d 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) .. --------------------------------------------------------------------------- From 554869960ea9439950f470bf5142465a74033828 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 8 Oct 2020 15:34:33 +0200 Subject: [PATCH 19/23] Revert all changes not strictly related to issue --- pandas/core/groupby/ops.py | 4 ++-- pandas/tests/groupby/test_custom_metadata.py | 14 -------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 84efde93c222c..6051aa3022da1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -909,7 +909,7 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return sdata._constructor(mgr, name=sdata.name, fastpath=True) + return type(sdata)(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -925,7 +925,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return sdata._constructor(mgr) + return type(sdata)(mgr) def get_splitter( diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 9b4c69f575d0a..5f951ac588dee 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -19,20 +19,6 @@ def _constructor(self): return SubclassedDataFrame2 -@pytest.mark.xfail( - reason="Missing high-performance implementation of metadata support for groupby. " - "__finalize__ is not called for grouped dataframes" -) -def test_groupby_with_custom_metadata(): - custom_df = SubclassedDataFrame2( - [[11, 12, 0], [21, 22, 0], [31, 32, 1]], columns=["a", "b", "g"] - ) - custom_df.added_property = "hello_pandas" - grouped = custom_df.groupby("g") - for _, group_df in grouped: - assert group_df.added_property == "hello_pandas" - - def test_groupby_sum_with_custom_metadata(): my_data_as_dictionary = { "mycategorical": [1, 1, 2], From 18e8ff5f434e3db1c1d057ab6db48c75173f4877 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 8 Oct 2020 15:40:07 +0200 Subject: [PATCH 20/23] Use descriptive names for unit tests --- pandas/tests/generic/test_finalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4056fa2227324..5507e98d974c1 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -774,7 +774,7 @@ def test_categorical_accessor(method): lambda x: x.agg("sum"), ], ) -def test_groupby_passing(obj, method): +def test_groupby_finalize(obj, method): obj.attrs = {"a": 1} result = method(obj.groupby([0, 0])) assert result.attrs == {"a": 1} @@ -792,7 +792,7 @@ def test_groupby_passing(obj, method): ], ) @not_implemented_mark -def test_groupby(obj, method): +def test_groupby_finalize_not_implemented(obj, method): obj.attrs = {"a": 1} result = method(obj.groupby([0, 0])) assert result.attrs == {"a": 1} From 57694ae6d1724c01f5a5a0c6e5016bfdfc899aa9 Mon Sep 17 00:00:00 2001 From: janus Date: Thu, 8 Oct 2020 16:02:13 +0200 Subject: [PATCH 21/23] Remove extraneous imports --- pandas/tests/groupby/test_custom_metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py index 5f951ac588dee..28c2ad2729fc0 100644 --- a/pandas/tests/groupby/test_custom_metadata.py +++ b/pandas/tests/groupby/test_custom_metadata.py @@ -1,5 +1,4 @@ import numpy as np -import pytest import pandas as pd From d86ae78b3fbcd902a2b0757eb6b5e90a0160aeca Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 9 Oct 2020 14:40:36 +0200 Subject: [PATCH 22/23] Delete tests for custom metadata --- pandas/tests/groupby/test_custom_metadata.py | 66 -------------------- 1 file changed, 66 deletions(-) delete mode 100644 pandas/tests/groupby/test_custom_metadata.py diff --git a/pandas/tests/groupby/test_custom_metadata.py b/pandas/tests/groupby/test_custom_metadata.py deleted file mode 100644 index 28c2ad2729fc0..0000000000000 --- a/pandas/tests/groupby/test_custom_metadata.py +++ /dev/null @@ -1,66 +0,0 @@ -import numpy as np - -import pandas as pd - - -class SubclassedDataFrame2(pd.DataFrame): - """ - Extension of DataFrame as described in [guidelines] - - [guidelines]: https://pandas.pydata.org/pandas-docs/stable/development/extending.html#override-constructor-properties # noqa - """ - - # normal properties - _metadata = ["added_property"] - - @property - def _constructor(self): - return SubclassedDataFrame2 - - -def test_groupby_sum_with_custom_metadata(): - my_data_as_dictionary = { - "mycategorical": [1, 1, 2], - "myfloat1": [1.0, 2.0, 3.0], - "myfloat2": [1.0, 2.0, 3.0], - } - sdf = SubclassedDataFrame2(my_data_as_dictionary) - sdf.added_property = "hello pandas" - grouped = sdf.groupby("mycategorical")[["myfloat1", "myfloat2"]] - df = grouped.sum() - assert df.added_property == "hello pandas" - - -def test_groupby_apply_performance_with_custom_metadata(): - """ - Check that __finalize__ is not called for each group during .apply - """ - counter = {"value": 0} - - class SubclassedDataFrame3(pd.DataFrame): - # normal properties - _metadata = ["added_property"] - - @property - def _constructor(self): - return SubclassedDataFrame3 - - def __finalize__(self, *args, **kwargs): - counter["value"] += 1 - return super().__finalize__(*args, **kwargs) - - N = 10 ** 4 - labels = np.random.randint(0, N // 5, size=N) - labels2 = np.random.randint(0, 3, size=N) - df = SubclassedDataFrame3( - { - "key": labels, - "key2": labels2, - "value1": np.random.randn(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), - } - ) - df.index = pd.CategoricalIndex(df.key) - df.groupby(level="key").apply(lambda x: 1) - finalize_call_count = counter["value"] - assert finalize_call_count < 42 # Random number << len(labels2) From 6227894372829d4e92231401906bbe229c9d1908 Mon Sep 17 00:00:00 2001 From: janus Date: Fri, 9 Oct 2020 14:41:54 +0200 Subject: [PATCH 23/23] Remove reference to internal class from whatsnew --- doc/source/whatsnew/v1.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index 7617d8d36c07d..c9dde182ab831 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :class:`BaseGroupedBy` causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) +- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) .. ---------------------------------------------------------------------------