forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-46620][PS][CONNECT] Introduce a basic fallback mechanism for f…
…rame methods ### What changes were proposed in this pull request? 1, Introduce a basic fallback mechanism for frame methods, with a new option `compute.pandas_fallback` default false; 2, implement `Frame.asfreq` and `Frame.asof` ### Why are the changes needed? for pandas parity ### Does this PR introduce _any_ user-facing change? yes ``` In [1]: import pyspark.pandas as ps ...: import pandas as pd ...: ...: index = pd.date_range('1/1/2000', periods=4, freq='min') ...: series = pd.Series([0.0, None, 2.0, 3.0], index=index) ...: pdf = pd.DataFrame({'s': series}) ...: psdf = ps.from_pandas(pdf) In [2]: psdf.asfreq(freq='30s') --------------------------------------------------------------------------- PandasNotImplementedError Traceback (most recent call last) Cell In[2], line 1 ----> 1 psdf.asfreq(freq='30s') File ~/Dev/spark/python/pyspark/pandas/missing/__init__.py:23, in unsupported_function.<locals>.unsupported_function(*args, **kwargs) 22 def unsupported_function(*args, **kwargs): ---> 23 raise PandasNotImplementedError( 24 class_name=class_name, method_name=method_name, reason=reason 25 ) PandasNotImplementedError: The method `pd.DataFrame.asfreq()` is not implemented yet. In [3]: ps.set_option("compute.pandas_fallback", True) In [4]: psdf.asfreq(freq='30s') /Users/ruifeng.zheng/Dev/spark/python/pyspark/pandas/utils.py:1015: PandasAPIOnSparkAdviceWarning: `asfreq` is executed in fallback mode. It loads partial data into the driver's memory to infer the schema, and loads all data into one executor's memory to compute. It should only be used if the pandas DataFrame is expected to be small. warnings.warn(message, PandasAPIOnSparkAdviceWarning) /Users/ruifeng.zheng/Dev/spark/python/pyspark/pandas/utils.py:1015: PandasAPIOnSparkAdviceWarning: If the type hints is not specified for `groupby.apply`, it is expensive to infer the data type internally. 
warnings.warn(message, PandasAPIOnSparkAdviceWarning) Out[4]: s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 NaN 2000-01-01 00:01:30 NaN 2000-01-01 00:02:00 2.0 2000-01-01 00:02:30 NaN 2000-01-01 00:03:00 3.0 ``` ### How was this patch tested? added ut ### Was this patch authored or co-authored using generative AI tooling? no Closes apache#44869 from zhengruifeng/ps_df_fallback. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
- Loading branch information
1 parent
95ea2a6
commit 8e1fa56
Showing
8 changed files
with
335 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
python/pyspark/pandas/tests/connect/frame/test_parity_asfreq.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
from pyspark.pandas.tests.frame.test_asfreq import AsFreqMixin | ||
from pyspark.testing.connectutils import ReusedConnectTestCase | ||
from pyspark.testing.pandasutils import PandasOnSparkTestUtils | ||
|
||
|
||
class AsFreqParityTests(AsFreqMixin, PandasOnSparkTestUtils, ReusedConnectTestCase):
    """Run the shared ``AsFreqMixin`` cases on top of ``ReusedConnectTestCase``."""
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.connect.frame.test_parity_asfreq import *  # noqa: F401

    # Use the XML-reporting runner when xmlrunner is installed; otherwise pass
    # None so unittest falls back to its default runner.
    try:
        import xmlrunner  # type: ignore[import]
    except ImportError:
        testRunner = None
    else:
        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)

    unittest.main(testRunner=testRunner, verbosity=2)
41 changes: 41 additions & 0 deletions
41
python/pyspark/pandas/tests/connect/frame/test_parity_asof.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
from pyspark.pandas.tests.frame.test_asof import AsOfMixin | ||
from pyspark.testing.connectutils import ReusedConnectTestCase | ||
from pyspark.testing.pandasutils import PandasOnSparkTestUtils | ||
|
||
|
||
class AsOfParityTests(AsOfMixin, PandasOnSparkTestUtils, ReusedConnectTestCase):
    """Run the shared ``AsOfMixin`` cases on top of ``ReusedConnectTestCase``."""
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.connect.frame.test_parity_asof import *  # noqa: F401

    # Use the XML-reporting runner when xmlrunner is installed; otherwise pass
    # None so unittest falls back to its default runner.
    try:
        import xmlrunner  # type: ignore[import]
    except ImportError:
        testRunner = None
    else:
        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)

    unittest.main(testRunner=testRunner, verbosity=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
import pandas as pd | ||
|
||
import pyspark.pandas as ps | ||
from pyspark.pandas.exceptions import PandasNotImplementedError | ||
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils | ||
|
||
|
||
class AsFreqMixin:
    """Shared test cases for ``DataFrame.asfreq`` under ``compute.pandas_fallback``.

    This is a mixin: the concrete test class it is combined with must supply
    ``assert_eq`` and ``assertRaises``.
    """

    @property
    def pdf(self):
        """Build a one-column pandas DataFrame on a 4-point minutely DatetimeIndex."""
        index = pd.date_range("1/1/2000", periods=4, freq="min")
        series = pd.Series([0.0, None, 2.0, 3.0], index=index)
        return pd.DataFrame({"s": series})

    @property
    def psdf(self):
        """Build the pandas-on-Spark counterpart of :attr:`pdf`."""
        return ps.from_pandas(self.pdf)

    def _check_asfreq_matches_pandas(self):
        """Compare ``asfreq`` results between pandas and pandas-on-Spark."""
        for kwargs in ({}, {"fill_value": 9.0}, {"method": "bfill"}):
            self.assert_eq(
                self.pdf.asfreq(freq="30s", **kwargs),
                self.psdf.asfreq(freq="30s", **kwargs),
            )

    def test_disabled(self):
        """Without the fallback option enabled, ``asfreq`` is reported as not implemented."""
        with self.assertRaises(PandasNotImplementedError):
            self.psdf.asfreq(freq="30s")

    def test_fallback(self):
        """With the fallback option enabled, ``asfreq`` matches pandas."""
        # option_context restores the previous values even when an assertion
        # fails, so a failure here cannot leak options into other tests.
        with ps.option_context("compute.pandas_fallback", True):
            self._check_asfreq_matches_pandas()

            # test with schema inferred from partial dataset, len(pdf)==4
            with ps.option_context("compute.shortcut_limit", 2):
                self._check_asfreq_matches_pandas()
|
||
|
||
class AsFreqTests(AsFreqMixin, PandasOnSparkTestCase, TestUtils):
    """Run the shared ``AsFreqMixin`` cases on top of ``PandasOnSparkTestCase``."""
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.frame.test_asfreq import *  # noqa: F401

    # Use the XML-reporting runner when xmlrunner is installed; otherwise pass
    # None so unittest falls back to its default runner.
    try:
        import xmlrunner
    except ImportError:
        testRunner = None
    else:
        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)

    unittest.main(testRunner=testRunner, verbosity=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
import unittest | ||
|
||
import pandas as pd | ||
|
||
import pyspark.pandas as ps | ||
from pyspark.pandas.exceptions import PandasNotImplementedError | ||
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils | ||
|
||
|
||
class AsOfMixin:
    """Shared test cases for ``DataFrame.asof`` under ``compute.pandas_fallback``.

    This is a mixin: the concrete test class it is combined with must supply
    ``assert_eq`` and ``assertRaises``.
    """

    @property
    def pdf(self):
        """Build a two-column pandas DataFrame on a 5-point DatetimeIndex.

        Column ``b`` is missing everywhere except the last row.
        """
        return pd.DataFrame(
            {"a": [10.0, 20.0, 30.0, 40.0, 50.0], "b": [None, None, None, None, 500]},
            index=pd.DatetimeIndex(
                [
                    "2018-02-27 09:01:00",
                    "2018-02-27 09:02:00",
                    "2018-02-27 09:03:00",
                    "2018-02-27 09:04:00",
                    "2018-02-27 09:05:00",
                ]
            ),
        )

    @property
    def psdf(self):
        """Build the pandas-on-Spark counterpart of :attr:`pdf`."""
        return ps.from_pandas(self.pdf)

    @staticmethod
    def _where():
        """Return the lookup timestamps, which fall between rows of :attr:`pdf`."""
        return pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])

    def _check_asof_matches_pandas(self):
        """Compare ``asof`` results between pandas and pandas-on-Spark."""
        for kwargs in ({}, {"subset": ["a"]}):
            self.assert_eq(
                self.pdf.asof(self._where(), **kwargs),
                self.psdf.asof(self._where(), **kwargs),
            )

    def test_disabled(self):
        """Without the fallback option enabled, ``asof`` is reported as not implemented."""
        with self.assertRaises(PandasNotImplementedError):
            self.psdf.asof(self._where())

    def test_fallback(self):
        """With the fallback option enabled, ``asof`` matches pandas."""
        # option_context restores the previous values even when an assertion
        # fails, so a failure here cannot leak options into other tests.
        with ps.option_context("compute.pandas_fallback", True):
            self._check_asof_matches_pandas()

            # test with schema inferred from partial dataset, len(pdf)==5
            with ps.option_context("compute.shortcut_limit", 2):
                self._check_asof_matches_pandas()
|
||
|
||
# Named after the mixin it runs: this module tests ``asof``. The previous name,
# ``AsFreqTests``, duplicated the class name used in test_asfreq.py.
class AsOfTests(AsOfMixin, PandasOnSparkTestCase, TestUtils):
    """Run the shared ``AsOfMixin`` cases on top of ``PandasOnSparkTestCase``."""
|
||
|
||
if __name__ == "__main__":
    from pyspark.pandas.tests.frame.test_asof import *  # noqa: F401

    # Use the XML-reporting runner when xmlrunner is installed; otherwise pass
    # None so unittest falls back to its default runner.
    try:
        import xmlrunner
    except ImportError:
        testRunner = None
    else:
        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)

    unittest.main(testRunner=testRunner, verbosity=2)