From 429c49e0f987c4e19008e2062b28273227e3edd7 Mon Sep 17 00:00:00 2001
From: Bart Broere <mail@bartbroere.eu>
Date: Thu, 14 Sep 2023 08:39:00 +0200
Subject: [PATCH 1/4] Use own mad implementation when pandas lacks it

---
 tests/dataframe/test_groupby_pytest.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py
index e6be6c05..52de90f0 100644
--- a/tests/dataframe/test_groupby_pytest.py
+++ b/tests/dataframe/test_groupby_pytest.py
@@ -24,6 +24,9 @@
 from tests.common import TestData
 
 
+PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0])
+
+
 class TestGroupbyDataFrame(TestData):
     funcs = ["max", "min", "mean", "sum"]
     filter_data = [
@@ -211,14 +214,28 @@ def test_groupby_dataframe_mad(self):
         pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
         ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
 
-        pd_mad = pd_flights.groupby("DestCountry").mad()
+        # The mean absolute difference (mad) aggregation has been removed from
+        # pandas with major version 2:
+        # https://github.com/pandas-dev/pandas/issues/11787
+        # To compare whether eland's version of it works, we need to implement
+        # it here ourselves.
+        def mad(x):
+            return abs(x - x.mean()).mean()
+
+        if PANDAS_MAJOR_VERSION < 2:
+            pd_mad = pd_flights.groupby("DestCountry").mad()
+        else:
+            pd_mad = pd_flights.groupby("DestCountry").aggregate(mad)
         ed_mad = ed_flights.groupby("DestCountry").mad()
 
         assert_index_equal(pd_mad.columns, ed_mad.columns)
         assert_index_equal(pd_mad.index, ed_mad.index)
         assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
 
-        pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
+        if PANDAS_MAJOR_VERSION < 2:
+            pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
+        else:
+            pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", mad])
         ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
 
         assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)

From af31be813884ac1824042a2a3f311ca9823c1cff Mon Sep 17 00:00:00 2001
From: Bart Broere <mail@bartbroere.eu>
Date: Thu, 14 Sep 2023 21:12:34 +0200
Subject: [PATCH 2/4] Use own mad in test_groupby_aggs_mad_var_std too

---
 tests/dataframe/test_groupby_pytest.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py
index 52de90f0..14cd1e5e 100644
--- a/tests/dataframe/test_groupby_pytest.py
+++ b/tests/dataframe/test_groupby_pytest.py
@@ -27,6 +27,15 @@
 PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0])
 
 
+# The mean absolute difference (mad) aggregation has been removed from
+# pandas with major version 2:
+# https://github.com/pandas-dev/pandas/issues/11787
+# To check that eland's version of it still works, we need to implement
+# it here ourselves.
+def mad(x):
+    return abs(x - x.mean()).mean()
+
+
 class TestGroupbyDataFrame(TestData):
     funcs = ["max", "min", "mean", "sum"]
     filter_data = [
@@ -74,7 +83,7 @@ def test_groupby_aggregate_single_aggs(self, pd_agg):
     @pytest.mark.parametrize("dropna", [True, False])
     @pytest.mark.parametrize("pd_agg", ["max", "min", "mean", "sum", "median"])
     def test_groupby_aggs_numeric_only_true(self, pd_agg, dropna):
-        # Pandas has numeric_only  applicable for the above aggs with groupby only.
+        # Pandas has numeric_only applicable for the above aggs with groupby only.
 
         pd_flights = self.pd_flights().filter(self.filter_data)
         ed_flights = self.ed_flights().filter(self.filter_data)
@@ -98,7 +107,12 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna):
         pd_flights = self.pd_flights().filter(self.filter_data)
         ed_flights = self.ed_flights().filter(self.filter_data)
 
-        pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)()
+        # The mad aggregation has been removed in Pandas 2, so we need to use
+        # our own implementation if we run the tests with Pandas 2 or higher
+        if PANDAS_MAJOR_VERSION >= 2 and pd_agg == "mad":
+            pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).aggregate(mad)
+        else:
+            pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)()
         ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)(
             numeric_only=True
         )
@@ -214,13 +228,6 @@ def test_groupby_dataframe_mad(self):
         pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
         ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
 
-        # The mean absolute difference (mad) aggregation has been removed from
-        # pandas with major version 2:
-        # https://github.com/pandas-dev/pandas/issues/11787
-        # To compare whether eland's version of it works, we need to implement
-        # it here ourselves.
-        def mad(x):
-            return abs(x - x.mean()).mean()
 
         if PANDAS_MAJOR_VERSION < 2:
             pd_mad = pd_flights.groupby("DestCountry").mad()

From 86d5fbcdef09c52f70074225d3f98fb6ccc618bf Mon Sep 17 00:00:00 2001
From: Quentin Pradet <quentin.pradet@elastic.co>
Date: Wed, 11 Oct 2023 12:10:01 +0400
Subject: [PATCH 3/4] Trigger CI


From 66fd3c30739ab69e71ec024b1847c5fd18d0ff61 Mon Sep 17 00:00:00 2001
From: Bart Broere <mail@bartbroere.eu>
Date: Mon, 16 Oct 2023 14:38:11 +0200
Subject: [PATCH 4/4] Reformat unit test using black

---
 tests/dataframe/test_groupby_pytest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py
index 14cd1e5e..73ab836c 100644
--- a/tests/dataframe/test_groupby_pytest.py
+++ b/tests/dataframe/test_groupby_pytest.py
@@ -23,8 +23,7 @@
 
 from tests.common import TestData
 
-
-PANDAS_MAJOR_VERSION = int(pd.__version__.split('.')[0])
+PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0])
 
 
 # The mean absolute difference (mad) aggregation has been removed from
@@ -112,7 +111,9 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna):
         if PANDAS_MAJOR_VERSION >= 2 and pd_agg == "mad":
             pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).aggregate(mad)
         else:
-            pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)()
+            pd_groupby = getattr(
+                pd_flights.groupby("Cancelled", dropna=dropna), pd_agg
+            )()
         ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)(
             numeric_only=True
         )
@@ -228,7 +229,6 @@ def test_groupby_dataframe_mad(self):
         pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
         ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
 
-
         if PANDAS_MAJOR_VERSION < 2:
             pd_mad = pd_flights.groupby("DestCountry").mad()
         else: