From bb2195554e6d4330be287d93a281f154348438cc Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Sun, 28 Jan 2024 19:01:56 -0800 Subject: [PATCH] [SPARK-46874][PYTHON] Remove `pyspark.pandas` dependency from `assertDataFrameEqual` ### What changes were proposed in this pull request? This PR proposes to remove the `pyspark.pandas` dependency from `assertDataFrameEqual`. The `pyspark.pandas` and `PandasOnSparkTestUtils` imports are deferred until both pandas and pyarrow have been confirmed to be importable. ### Why are the changes needed? To allow `assertDataFrameEqual` to be used when pandas is not installed. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI should pass; the change was also tested manually. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44899 from itholic/remove_deps_from_assertDataFrameEqual. Authored-by: Haejoon Lee Signed-off-by: Dongjoon Hyun --- python/pyspark/testing/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index 5da6f47174382..17f74960f8c71 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -758,16 +758,25 @@ def assertDataFrameEqual( has_pandas = False try: # If pandas dependencies are available, allow pandas or pandas-on-Spark DataFrame - import pyspark.pandas as ps import pandas as pd - from pyspark.testing.pandasutils import PandasOnSparkTestUtils has_pandas = True except ImportError: # no pandas, so we won't call pandasutils functions pass - if has_pandas: + has_arrow = False + try: + import pyarrow + + has_arrow = True + except ImportError: + pass + + if has_pandas and has_arrow: + import pyspark.pandas as ps + from pyspark.testing.pandasutils import PandasOnSparkTestUtils + if ( isinstance(actual, pd.DataFrame) or isinstance(expected, pd.DataFrame)