From ef7e1e685f615f7763998b076dafb206cf46985d Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Sun, 24 Dec 2023 14:31:12 -0800 Subject: [PATCH] [SPARK-46471][PS][TESTS][FOLLOWUPS] Move `OpsOnDiffFramesEnabledTests` to `pyspark.pandas.tests.diff_frames_ops.*` ### What changes were proposed in this pull request? Move `OpsOnDiffFramesEnabledTests` to `pyspark.pandas.tests.diff_frames_ops.*` ### Why are the changes needed? test code clean up ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44471 from zhengruifeng/ps_test_diff_ops_3. Authored-by: Ruifeng Zheng Signed-off-by: Dongjoon Hyun --- dev/sparktestsupport/modules.py | 4 +- .../test_parity_basic.py} | 11 ++- .../tests/diff_frames_ops/test_arithmetic.py | 45 +++++++++ .../test_basic.py} | 91 ++----------------- 4 files changed, 59 insertions(+), 92 deletions(-) rename python/pyspark/pandas/tests/connect/{test_parity_ops_on_diff_frames.py => diff_frames_ops/test_parity_basic.py} (79%) rename python/pyspark/pandas/tests/{test_ops_on_diff_frames.py => diff_frames_ops/test_basic.py} (74%) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 939e88bf95b24..e4e3803a8f87e 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -862,7 +862,6 @@ def __hash__(self): "pyspark.pandas.tests.groupby.test_stat_prod", "pyspark.pandas.tests.groupby.test_value_counts", "pyspark.pandas.tests.test_indexing", - "pyspark.pandas.tests.test_ops_on_diff_frames", "pyspark.pandas.tests.diff_frames_ops.test_align", "pyspark.pandas.tests.diff_frames_ops.test_arithmetic", "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext", @@ -872,6 +871,7 @@ def __hash__(self): "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext_float", "pyspark.pandas.tests.diff_frames_ops.test_assign_frame", "pyspark.pandas.tests.diff_frames_ops.test_assign_series", + 
"pyspark.pandas.tests.diff_frames_ops.test_basic", "pyspark.pandas.tests.diff_frames_ops.test_bitwise", "pyspark.pandas.tests.diff_frames_ops.test_combine_first", "pyspark.pandas.tests.diff_frames_ops.test_compare_series", @@ -1235,7 +1235,6 @@ def __hash__(self): "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map", "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property", "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round", - "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float", @@ -1244,6 +1243,7 @@ def __hash__(self): "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext_float", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_frame", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_series", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series", diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic.py similarity index 79% rename from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py rename to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic.py index 777c3620a4a81..339427f4e00c5 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_basic.py @@ -16,20 +16,21 @@ # import unittest -from pyspark.pandas.tests.test_ops_on_diff_frames import OpsOnDiffFramesEnabledTestsMixin - from 
pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.pandas.tests.diff_frames_ops.test_basic import BasicMixin -class OpsOnDiffFramesEnabledParityTests( - OpsOnDiffFramesEnabledTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class BasicParityTests( + BasicMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames import * # noqa: F401 + from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_arithmetic.py b/python/pyspark/pandas/tests/diff_frames_ops/test_arithmetic.py index 8af0e80c6e604..8e14fa768779c 100644 --- a/python/pyspark/pandas/tests/diff_frames_ops/test_arithmetic.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_arithmetic.py @@ -114,6 +114,29 @@ def pdf2(self): index=list(range(9)), ) + @property + def pdf5(self): + return pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [4, 5, 6, 3, 2, 1, 0, 0, 0], + "c": [4, 5, 6, 3, 2, 1, 0, 0, 0], + }, + index=[0, 1, 3, 5, 6, 8, 9, 10, 11], + ).set_index(["a", "b"]) + + @property + def pdf6(self): + return pd.DataFrame( + { + "a": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "b": [0, 0, 0, 4, 5, 6, 1, 2, 3], + "c": [9, 8, 7, 6, 5, 4, 3, 2, 1], + "e": [4, 5, 6, 3, 2, 1, 0, 0, 0], + }, + index=list(range(9)), + ).set_index(["a", "b"]) + @property def pser1(self): midx = pd.MultiIndex( @@ -130,10 +153,32 @@ def pser2(self): ) return pd.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx) + @property + def psdf5(self): + return ps.from_pandas(self.pdf5) + + @property + def psdf6(self): + return ps.from_pandas(self.pdf6) + def test_arithmetic(self): self._test_arithmetic_frame(self.pdf1, self.pdf2, check_extension=False) self._test_arithmetic_series(self.pser1, self.pser2, 
check_extension=False) + def test_multi_index_arithmetic(self): + psdf5 = self.psdf5 + psdf6 = self.psdf6 + pdf5 = self.pdf5 + pdf6 = self.pdf6 + + # Series + self.assert_eq((psdf5.c - psdf6.e).sort_index(), (pdf5.c - pdf6.e).sort_index()) + + self.assert_eq((psdf5["c"] / psdf6["e"]).sort_index(), (pdf5["c"] / pdf6["e"]).sort_index()) + + # DataFrame + self.assert_eq((psdf5 + psdf6).sort_index(), (pdf5 + pdf6).sort_index(), almost=True) + class ArithmeticTests( ArithmeticMixin, diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/diff_frames_ops/test_basic.py similarity index 74% rename from python/pyspark/pandas/tests/test_ops_on_diff_frames.py rename to python/pyspark/pandas/tests/diff_frames_ops/test_basic.py index 75410a65227d2..1075188779a10 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_basic.py @@ -26,7 +26,7 @@ from pyspark.testing.sqlutils import SQLTestUtils -class OpsOnDiffFramesEnabledTestsMixin: +class BasicMixin: @classmethod def setUpClass(cls): super().setUpClass() @@ -65,53 +65,6 @@ def pdf4(self): index=list(range(9)), ) - @property - def pdf5(self): - return pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "b": [4, 5, 6, 3, 2, 1, 0, 0, 0], - "c": [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, - index=[0, 1, 3, 5, 6, 8, 9, 10, 11], - ).set_index(["a", "b"]) - - @property - def pdf6(self): - return pd.DataFrame( - { - "a": [9, 8, 7, 6, 5, 4, 3, 2, 1], - "b": [0, 0, 0, 4, 5, 6, 1, 2, 3], - "c": [9, 8, 7, 6, 5, 4, 3, 2, 1], - "e": [4, 5, 6, 3, 2, 1, 0, 0, 0], - }, - index=list(range(9)), - ).set_index(["a", "b"]) - - @property - def pser1(self): - midx = pd.MultiIndex( - [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length", "power"]], - [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]], - ) - return pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx) - - @property - def pser2(self): - midx = pd.MultiIndex( - 
[["lama", "cow", "falcon"], ["speed", "weight", "length"]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ) - return pd.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx) - - @property - def pser3(self): - midx = pd.MultiIndex( - [["koalas", "cow", "falcon"], ["speed", "weight", "length"]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 2, 0, 0, 2, 2, 2, 1]], - ) - return pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) - @property def psdf1(self): return ps.from_pandas(self.pdf1) @@ -128,26 +81,6 @@ def psdf3(self): def psdf4(self): return ps.from_pandas(self.pdf4) - @property - def psdf5(self): - return ps.from_pandas(self.pdf5) - - @property - def psdf6(self): - return ps.from_pandas(self.pdf6) - - @property - def psser1(self): - return ps.from_pandas(self.pser1) - - @property - def psser2(self): - return ps.from_pandas(self.pser2) - - @property - def psser3(self): - return ps.from_pandas(self.pser3) - def test_ranges(self): self.assert_eq( (ps.range(10) + ps.range(10)).sort_index(), @@ -286,29 +219,17 @@ def test_different_columns(self): self.assert_eq((psdf1 + psdf4).sort_index(), (pdf1 + pdf4).sort_index(), almost=True) - def test_multi_index_arithmetic(self): - psdf5 = self.psdf5 - psdf6 = self.psdf6 - pdf5 = self.pdf5 - pdf6 = self.pdf6 - - # Series - self.assert_eq((psdf5.c - psdf6.e).sort_index(), (pdf5.c - pdf6.e).sort_index()) - - self.assert_eq((psdf5["c"] / psdf6["e"]).sort_index(), (pdf5["c"] / pdf6["e"]).sort_index()) - - # DataFrame - self.assert_eq((psdf5 + psdf6).sort_index(), (pdf5 + pdf6).sort_index(), almost=True) - -class OpsOnDiffFramesEnabledTests( - OpsOnDiffFramesEnabledTestsMixin, PandasOnSparkTestCase, SQLTestUtils +class BasicTests( + BasicMixin, + PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.test_ops_on_diff_frames import * # noqa: F401 + from pyspark.pandas.tests.diff_frames_ops.test_basic import * # noqa: F401 try: import xmlrunner