From 429c49e0f987c4e19008e2062b28273227e3edd7 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Sep 2023 08:39:00 +0200 Subject: [PATCH 1/4] Use own mad if not available --- tests/dataframe/test_groupby_pytest.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index e6be6c05..52de90f0 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -24,6 +24,9 @@ from tests.common import TestData +PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0]) + + class TestGroupbyDataFrame(TestData): funcs = ["max", "min", "mean", "sum"] filter_data = [ @@ -211,14 +214,28 @@ def test_groupby_dataframe_mad(self): pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"]) - pd_mad = pd_flights.groupby("DestCountry").mad() + # The mean absolute difference (mad) aggregation has been removed from + # pandas with major version 2: + # https://github.com/pandas-dev/pandas/issues/11787 + # To compare whether eland's version of it works, we need to implement + # it here ourselves. + def mad(x): + return abs(x - x.mean()).mean() + + if PANDAS_MAJOR_VERSION < 2: + pd_mad = pd_flights.groupby("DestCountry").mad() + else: + pd_mad = pd_flights.groupby("DestCountry").aggregate(mad) ed_mad = ed_flights.groupby("DestCountry").mad() assert_index_equal(pd_mad.columns, ed_mad.columns) assert_index_equal(pd_mad.index, ed_mad.index) assert_series_equal(pd_mad.dtypes, ed_mad.dtypes) - pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) + if PANDAS_MAJOR_VERSION < 2: + pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) + else: + pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", mad]) ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"]) assert_index_equal(pd_min_mad.columns, ed_min_mad.columns) From af31be813884ac1824042a2a3f311ca9823c1cff Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Sep 2023 21:12:34 +0200 Subject: [PATCH 2/4] Use own mad another time --- tests/dataframe/test_groupby_pytest.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 52de90f0..14cd1e5e 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -27,6 +27,15 @@ PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0]) +# The mean absolute difference (mad) aggregation has been removed from +# pandas with major version 2: +# https://github.com/pandas-dev/pandas/issues/11787 +# To compare whether eland's version of it works, we need to implement +# it here ourselves. +def mad(x): + return abs(x - x.mean()).mean() + + class TestGroupbyDataFrame(TestData): funcs = ["max", "min", "mean", "sum"] filter_data = [ @@ -74,7 +83,7 @@ def test_groupby_aggregate_single_aggs(self, pd_agg): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("pd_agg", ["max", "min", "mean", "sum", "median"]) def test_groupby_aggs_numeric_only_true(self, pd_agg, dropna): - # Pandas has numeric_only applicable for the above aggs with groupby only. + # Pandas has numeric_only applicable for the above aggs with groupby only. pd_flights = self.pd_flights().filter(self.filter_data) ed_flights = self.ed_flights().filter(self.filter_data) @@ -98,7 +107,12 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): pd_flights = self.pd_flights().filter(self.filter_data) ed_flights = self.ed_flights().filter(self.filter_data) - pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() + # The mad aggregation has been removed in Pandas 2, so we need to use + # our own implementation if we run the tests with Pandas 2 or higher + if PANDAS_MAJOR_VERSION >= 2 and pd_agg == "mad": + pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).aggregate(mad) + else: + pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)( numeric_only=True ) @@ -214,13 +228,6 @@ def test_groupby_dataframe_mad(self): pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"]) - # The mean absolute difference (mad) aggregation has been removed from - # pandas with major version 2: - # https://github.com/pandas-dev/pandas/issues/11787 - # To compare whether eland's version of it works, we need to implement - # it here ourselves. - def mad(x): - return abs(x - x.mean()).mean() if PANDAS_MAJOR_VERSION < 2: pd_mad = pd_flights.groupby("DestCountry").mad() From 86d5fbcdef09c52f70074225d3f98fb6ccc618bf Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Wed, 11 Oct 2023 12:10:01 +0400 Subject: [PATCH 3/4] Trigger CI From 66fd3c30739ab69e71ec024b1847c5fd18d0ff61 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 16 Oct 2023 14:38:11 +0200 Subject: [PATCH 4/4] Reformat unit test using black --- tests/dataframe/test_groupby_pytest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 14cd1e5e..73ab836c 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -23,8 +23,7 @@ from tests.common import TestData - -PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0]) +PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0]) # The mean absolute difference (mad) aggregation has been removed from @@ -112,7 +111,9 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): if PANDAS_MAJOR_VERSION >= 2 and pd_agg == "mad": pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).aggregate(mad) else: - pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() + pd_groupby = getattr( + pd_flights.groupby("Cancelled", dropna=dropna), pd_agg + )() ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)( numeric_only=True ) @@ -228,7 +229,6 @@ def test_groupby_dataframe_mad(self): pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"]) - if PANDAS_MAJOR_VERSION < 2: pd_mad = pd_flights.groupby("DestCountry").mad() else: