diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml
new file mode 100644
index 00000000000..3fc716c4b79
--- /dev/null
+++ b/.github/workflows/fuzzydata-test.yml
@@ -0,0 +1,36 @@
+name: fuzzy
+on: pull_request
+jobs:
+  test-fuzzydata:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    strategy:
+      matrix:
+        python-version: ["3.8"]
+        engine: ["ray", "dask"]
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: modin
+          environment-file: environment-dev.yml
+          python-version: 3.8
+          channel-priority: strict
+          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+      - name: Conda environment
+        run: |
+          conda info
+          conda list
+      - name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}})
+        run: python -m pytest modin/experimental/fuzzydata/test/test_fuzzydata.py -Wignore::UserWarning
+        env:
+          MODIN_ENGINE: ${{matrix.engine}}
+      - uses: actions/upload-artifact@v3
+        with:
+          name: fuzzydata-test-workflow-${{matrix.engine}}
+          path: /tmp/fuzzydata-test-wf-${{matrix.engine}}/* # Must match output dir in test_fuzzydata.py
+          if-no-files-found: error
diff --git a/README.md b/README.md
index 24431748e80..bcfdf3c7261 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,18 @@

 Scale your pandas workflows by changing one line of code
 
+
+| Dev Community & Support | Forums | Socials | Docs |
+|:---: | :---: | :---: | :---: |
+| [![Slack](https://img.shields.io/badge/Slack-4A154B?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA) | [![Stack Overflow](https://img.shields.io/badge/-Stackoverflow-FE7A16?style=for-the-badge&logo=stack-overflow&logoColor=white)](https://stackoverflow.com/questions/tagged/modin) | Twitter Follow | |
+
+
-Slack
+PyPI version

diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst
index 9135a41de5d..2505e7330b9 100644
--- a/docs/release_notes/release_notes-0.15.0.rst
+++ b/docs/release_notes/release_notes-0.15.0.rst
@@ -65,6 +65,7 @@ Key Features and Updates
   * DOCS-#4469: Say that commit messages can start with PERF (#4470).
   * DOCS-#4466: Recommend GitHub issues over bug_reports@modin.org (#4474).
   * DOCS-#4487: Recommend GitHub issues over feature_requests@modin.org (#4489).
+  * DOCS-#4545: Add socials to README (#4555).
 * Dependencies
   * FIX-#4327: Update min pin for xgboost version (#4328)
   * FIX-#4383: Remove `pathlib` from deps (#4384)
diff --git a/docs/release_notes/release_notes-0.16.0.rst b/docs/release_notes/release_notes-0.16.0.rst
index 1931719b41a..574632ddf09 100644
--- a/docs/release_notes/release_notes-0.16.0.rst
+++ b/docs/release_notes/release_notes-0.16.0.rst
@@ -9,6 +9,9 @@ Key Features and Updates
   * FIX-#4570: Replace ``np.bool`` -> ``np.bool_`` (#4571)
   * FIX-#4543: Fix `read_csv` in case skiprows=<0, []> (#4544)
   * FIX-#4059: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
+  * FIX-#4589: Pin protobuf<4.0.0 to fix ray (#4590)
+  * FIX-#4577: Set attribute of Modin dataframe to updated value (#4588)
+  * FIX-#4411: Fix binary_op between datetime64 Series and pandas timedelta (#4592)
   * FIX-#4582: Inherit custom log layer (#4583)
 * Performance enhancements
   * PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
@@ -31,8 +34,9 @@ Key Features and Updates
 * Documentation improvements
   * DOCS-#4552: Change default sphinx language to en to fix sphinx >= 5.0.0 build (#4553)
 * Dependencies
-  *
+  * FEAT-#4598: Add support for pandas 1.4.3 (#4599)
 * New Features
+  * FEAT-#4463: Add experimental fuzzydata integration for testing against a randomized dataframe workflow (#4556)
 
 Contributors
 ------------
@@ -40,3 +44,6 @@ Contributors
 @NickCrews
 @prutskov
 @vnlitvinov
+@pyrito
+@suhailrehman
+@RehanSD
diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt
index 60eb5fe301b..6028ad30302 100644
--- a/docs/requirements-doc.txt
+++ b/docs/requirements-doc.txt
@@ -12,6 +12,9 @@ sphinx-click
 # Pin ray to < 1.13.0 to work around GH#4564
 # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
 ray[default]>=1.4.0,<1.13.0
+# Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+# because versions >= 4.0.0 are incompatible with ray<1.13.0.
+protobuf<4.0.0
 git+https://github.com/modin-project/modin.git@master#egg=modin[all]
 sphinxcontrib_plantuml
 sphinx-issues
diff --git a/docs/usage_guide/advanced_usage/index.rst b/docs/usage_guide/advanced_usage/index.rst
index 0287bb37f09..d0d3daa6fdb 100644
--- a/docs/usage_guide/advanced_usage/index.rst
+++ b/docs/usage_guide/advanced_usage/index.rst
@@ -89,9 +89,9 @@ internal execution flow.
 Logging with Modin
 ------------------
 
-Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata,
-and system memory. Logging is disabled by default, but when it is enabled, log files are written to a local `.modin` directory
-at the same directory level as the notebook/script used to run Modin. See our :doc:`Logging with Modin documentation `
+Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata,
+and system memory. Logging is disabled by default, but when it is enabled, log files are written to a local `.modin` directory
+at the same directory level as the notebook/script used to run Modin. See our :doc:`Logging with Modin documentation `
 for usage information.
 
 Batch Pipeline API
@@ -100,6 +100,13 @@ Modin provides an experimental batched API that pipelines row parallel queries.
 for a walkthrough on how to use this feature, as well as :doc:`Batch Pipeline API documentation `
 for more information about the API.
 
+Fuzzydata Testing
+-----------------
+
+An experimental GitHub Action has been added to Modin that, on each pull request, automatically runs the Modin codebase against
+`fuzzydata`, a random dataframe workflow generator. The resulting workflow that was used to test the Modin codebase can be
+downloaded as an artifact from the GitHub Actions tab for further inspection. See `fuzzydata`_ for more details.
+
 .. _`blog post`: https://medium.com/riselab/why-every-data-scientist-using-pandas-needs-modin-bringing-sql-to-dataframes-3b216b29a7c0
 .. _`Modin SQL documentation`: modin_sql.html
 .. _`Modin Spreadsheet API documentation`: spreadsheets_api.html
@@ -109,3 +116,4 @@ for more information about the API.
 .. _`Slack`: https://modin.org/slack.html
 .. _`tqdm`: https://github.com/tqdm/tqdm
 .. _`distributed XGBoost`: https://medium.com/intel-analytics-software/distributed-xgboost-with-modin-on-ray-fc17edef7720
+.. _`fuzzydata`: https://github.com/suhailrehman/fuzzydata
diff --git a/environment-dev.yml b/environment-dev.yml
index a635b3f002a..4f565ea2b69 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -2,7 +2,7 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.4.2
+  - pandas==1.4.3
   - numpy>=1.18.5
   - pyarrow>=4.0.1
   - dask[complete]>=2.22.0,<2022.2.0
@@ -42,8 +42,11 @@ dependencies:
   - tqdm
   - git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
   # Pin ray to < 1.13.0 to work around GH#4564
-  # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
+  # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
   - ray[default]>=1.4.0,<1.13.0
+  # Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+  # because versions >= 4.0.0 are incompatible with ray<1.13.0.
+  - protobuf<4.0.0
   - connectorx>=0.2.6a4
   # TODO: remove when resolving GH#4398
   - redis>=3.5.0,<4.0.0
@@ -51,3 +54,5 @@ dependencies:
   - flake8
   # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
   - numpydoc==1.1.0
+  # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
+  - fuzzydata>=0.0.6
diff --git a/modin/experimental/fuzzydata/__init__.py b/modin/experimental/fuzzydata/__init__.py
new file mode 100644
index 00000000000..153148d879b
--- /dev/null
+++ b/modin/experimental/fuzzydata/__init__.py
@@ -0,0 +1,14 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""Module holds experimental fuzzydata specific functionality for Modin."""
diff --git a/modin/experimental/fuzzydata/test/__init__.py b/modin/experimental/fuzzydata/test/__init__.py
new file mode 100644
index 00000000000..cae6413e559
--- /dev/null
+++ b/modin/experimental/fuzzydata/test/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/modin/experimental/fuzzydata/test/test_fuzzydata.py b/modin/experimental/fuzzydata/test/test_fuzzydata.py
new file mode 100644
index 00000000000..d110f940a1c
--- /dev/null
+++ b/modin/experimental/fuzzydata/test/test_fuzzydata.py
@@ -0,0 +1,65 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import glob
+import uuid
+import shutil
+from fuzzydata.core.generator import generate_workflow
+from fuzzydata.clients.modin import ModinWorkflow
+
+from modin.config import Engine
+
+
+def test_fuzzydata_sample_workflow():
+    # Workflow Generation Options
+    wf_name = str(uuid.uuid4())[:8]  # Unique name for the generated workflow
+    num_versions = 10  # Number of unique CSV files to generate
+    cols = 33  # Columns in Base Artifact
+    rows = 1000  # Rows in Base Artifact
+    bfactor = 1.0  # Branching Factor - 0.1 is linear, 10.0 is star-like
+    exclude_ops = ["groupby"]  # In-Memory groupby operations cause issue #4287
+    matfreq = 2  # How many operations to chain before materialization
+
+    engine = Engine.get().lower()
+
+    # Create Output Directory for Workflow Data
+    base_out_directory = (
+        f"/tmp/fuzzydata-test-wf-{engine}/"  # Must match corresponding github-action
+    )
+    if os.path.exists(base_out_directory):
+        shutil.rmtree(base_out_directory)
+    output_directory = f"{base_out_directory}/{wf_name}/"
+    os.makedirs(output_directory, exist_ok=True)
+
+    # Start Workflow Generation
+    workflow = generate_workflow(
+        workflow_class=ModinWorkflow,
+        name=wf_name,
+        num_versions=num_versions,
+        base_shape=(cols, rows),
+        out_directory=output_directory,
+        bfactor=bfactor,
+        exclude_ops=exclude_ops,
+        matfreq=matfreq,
+        wf_options={"modin_engine": engine},
+    )
+
+    # Assertions that the workflow generation worked correctly
+    assert len(workflow) == num_versions
+    assert len(list(glob.glob(f"{output_directory}/artifacts/*.csv"))) == len(
+        workflow.artifact_dict
+    )
+    assert os.path.exists(f"{output_directory}/{workflow.name}_operations.json")
+    assert os.path.getsize(f"{output_directory}/{workflow.name}_operations.json") > 0
+    assert os.path.exists(f"{output_directory}/{workflow.name}_gt_graph.csv")
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index dfc6d3aea01..a30871a42a6 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -14,7 +14,7 @@
 import pandas
 import warnings
 
-__pandas_version__ = "1.4.2"
+__pandas_version__ = "1.4.3"
 
 if pandas.__version__ != __pandas_version__:
     warnings.warn(
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 943c97e2652..4e14333f7b4 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -222,10 +222,7 @@ def _validate_other(
         self,
         other,
         axis,
-        numeric_only=False,
-        numeric_or_time_only=False,
-        numeric_or_object_only=False,
-        comparison_dtypes_only=False,
+        dtype_check=False,
         compare_index=False,
     ):
         """
@@ -239,14 +236,8 @@
             Specifies axis along which to do validation. When `1` or `None` is specified,
             validation is done along `index`, if `0` is specified validation is done
             along `columns` of `other` frame.
-        numeric_only : bool, default: False
-            Validates that both frames have only numeric dtypes.
-        numeric_or_time_only : bool, default: False
-            Validates that both frames have either numeric or time dtypes.
-        numeric_or_object_only : bool, default: False
-            Validates that both frames have either numeric or object dtypes.
-        comparison_dtypes_only : bool, default: False
-            Validates that both frames have either numeric or time or equal dtypes.
+        dtype_check : bool, default: False
+            Validates that both frames have compatible dtypes.
         compare_index : bool, default: False
             Compare Index if True.
 
@@ -300,22 +291,10 @@ def _validate_other(
             if not self.index.equals(other.index):
                 raise TypeError("Cannot perform operation with non-equal index")
         # Do dtype checking.
-        if numeric_only:
-            if not all(
-                is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError("Cannot do operation on non-numeric dtypes")
-        elif numeric_or_object_only:
+        if dtype_check:
             if not all(
                 (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
                 or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError("Cannot do operation non-numeric dtypes")
-        elif comparison_dtypes_only:
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
                 or (
                     is_datetime_or_timedelta_dtype(self_dtype)
                     and is_datetime_or_timedelta_dtype(other_dtype)
@@ -323,21 +302,7 @@
                 or is_dtype_equal(self_dtype, other_dtype)
                 for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
             ):
-                raise TypeError(
-                    "Cannot do operation non-numeric objects with numeric objects"
-                )
-        elif numeric_or_time_only:
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
-                or (
-                    is_datetime_or_timedelta_dtype(self_dtype)
-                    and is_datetime_or_timedelta_dtype(other_dtype)
-                )
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError(
-                    "Cannot do operation non-numeric objects with numeric objects"
-                )
+                raise TypeError("Cannot do operation with improper dtypes")
         return result
 
     def _validate_function(self, func, on_invalid=None):
@@ -416,7 +381,7 @@ def _binary_op(self, op, other, **kwargs):
             return self._default_to_pandas(
                 getattr(self._pandas_class, op), other, **kwargs
             )
-        other = self._validate_other(other, axis, numeric_or_object_only=True)
+        other = self._validate_other(other, axis, dtype_check=True)
         exclude_list = [
             "__add__",
             "__radd__",
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 6abeda7d50b..fa3bcb07026 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -2498,6 +2498,10 @@ def __setattr__(self, key, value):
             pass
         elif key in self and key not in dir(self):
             self.__setitem__(key, value)
+            # Note: return immediately so we don't keep this `key` as dataframe state.
+            # `__getattr__` will return the columns not present in `dir(self)`, so we do not need
+            # to manually track this state in the `dir`.
+            return
         elif isinstance(value, pandas.Series):
             warnings.warn(
                 "Modin doesn't allow columns to be created via a new attribute name - see "
diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index e54d1af6619..87d612b780d 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -248,7 +248,8 @@ def test_inplace_series_ops(data):
     df_equals(modin_df, pandas_df)
 
 
-def test___setattr__():
+# Note: Tests setting an attribute that is not an existing column label
+def test___setattr__not_column():
     pandas_df = pandas.DataFrame([1, 2, 3])
     modin_df = pd.DataFrame([1, 2, 3])
 
@@ -257,6 +258,37 @@ def test___setattr__():
 
     df_equals(modin_df, pandas_df)
 
+    # While `new_col` is not a column of the dataframe,
+    # it should be accessible with __getattr__.
+    assert modin_df.new_col == pandas_df.new_col
+
+
+def test___setattr__mutating_column():
+    # Use case from issue #4577
+    pandas_df = pandas.DataFrame([[1]], columns=["col0"])
+    modin_df = pd.DataFrame([[1]], columns=["col0"])
+
+    # Replacing a column with a list should mutate the column in place.
+    pandas_df.col0 = [3]
+    modin_df.col0 = [3]
+
+    df_equals(modin_df, pandas_df)
+    # Check that the col0 attribute reflects the value update.
+    df_equals(modin_df.col0, pandas_df.col0)
+
+    pandas_df.col0 = pd.Series([5])
+    modin_df.col0 = pd.Series([5])
+
+    # Check that the col0 attribute reflects this update
+    df_equals(modin_df, pandas_df)
+
+    pandas_df.loc[0, "col0"] = 4
+    modin_df.loc[0, "col0"] = 4
+
+    # Check that the col0 attribute reflects update via loc
+    df_equals(modin_df, pandas_df)
+    assert modin_df.col0.equals(modin_df["col0"])
+
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_isin(data):
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 04c7142f7b6..d7e6822a40c 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -1141,6 +1141,14 @@ def test_between_time():
     )
 
 
+def test_add_series_to_timedeltaindex():
+    # Make a pandas.core.indexes.timedeltas.TimedeltaIndex
+    deltas = pd.to_timedelta([1], unit="h")
+    test_series = create_test_series(np.datetime64("2000-12-12"))
+    eval_general(*test_series, lambda s: s + deltas)
+    eval_general(*test_series, lambda s: s - deltas)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_bfill(data):
     modin_series, pandas_series = create_test_series(data)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 5777f4eb0a2..76b6c29656b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,4 @@
-pandas==1.4.2
+pandas==1.4.3
 numpy>=1.18.5
 pyarrow>=4.0.1
 dask[complete]>=2.22.0,<2022.2.0
@@ -6,6 +6,9 @@ distributed>=2.22.0,<2022.2.0
 # Pin ray to < 1.13.0 to work around GH#4564
 # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
 ray[default]>=1.4.0,<1.13.0
+# Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+# because versions >= 4.0.0 are incompatible with ray<1.13.0.
+protobuf<4.0.0
 redis>=3.5.0,<4.0.0
 psutil
 fsspec
@@ -40,4 +43,6 @@ connectorx>=0.2.6a4
 black
 flake8
 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
-numpydoc==1.1.0
\ No newline at end of file
+numpydoc==1.1.0
+# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
+fuzzydata>=0.0.6
diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml
index 4478242d2b5..aa4de7a1d04 100644
--- a/requirements/env_omnisci.yml
+++ b/requirements/env_omnisci.yml
@@ -2,7 +2,7 @@ name: modin_on_omnisci
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.4.2
+  - pandas==1.4.3
   - pyarrow=6
   - numpy>=1.18.5
   - fsspec
diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml
index 93240e89788..b6746552c66 100644
--- a/requirements/requirements-no-engine.yml
+++ b/requirements/requirements-no-engine.yml
@@ -1,7 +1,7 @@
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.4.2
+  - pandas==1.4.3
   - numpy>=1.18.5
   - pyarrow>=4.0.1
   - fsspec
diff --git a/setup.py b/setup.py
index 193980b4292..2a709c5e8e9 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,14 @@
 # TODO: remove redis dependency when resolving GH#4398
 # Pin ray to < 1.13.0 to work around GH#4564
 # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
-ray_deps = ["ray[default]>=1.4.0,<1.13.0", "pyarrow>=4.0.1", "redis>=3.5.0,<4.0.0"]
+# Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+# because versions >= 4.0.0 are incompatible with ray<1.13.0.
+ray_deps = [
+    "ray[default]>=1.4.0,<1.13.0",
+    "pyarrow>=4.0.1",
+    "redis>=3.5.0,<4.0.0",
+    "protobuf<4.0.0",
+]
 remote_deps = ["rpyc==4.1.5", "cloudpickle", "boto3"]
 spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
 sql_deps = ["dfsql>=0.4.2", "pyparsing<=2.4.7"]
@@ -25,7 +32,7 @@
     url="https://github.com/modin-project/modin",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["pandas==1.4.2", "packaging", "numpy>=1.18.5", "fsspec", "psutil"],
+    install_requires=["pandas==1.4.3", "packaging", "numpy>=1.18.5", "fsspec", "psutil"],
     extras_require={
         # can be installed by pip install modin[dask]
         "dask": dask_deps,
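
Reviewer notes (not part of the patch):

To try the new fuzzy CI job locally before it runs on a pull request, the steps in .github/workflows/fuzzydata-test.yml above boil down to installing the dev environment and invoking the fuzzydata test with the desired engine. The sketch below mirrors the workflow's run step; driving pytest from Python via pytest.main (rather than python -m pytest) is a convenience assumed here, not something the workflow itself does.

    import os
    import pytest

    # Select the engine the same way the CI matrix does ("ray" or "dask").
    # MODIN_ENGINE must be set before the test module imports Modin.
    os.environ["MODIN_ENGINE"] = "ray"

    # Equivalent to the workflow step:
    #   python -m pytest modin/experimental/fuzzydata/test/test_fuzzydata.py -Wignore::UserWarning
    exit_code = pytest.main(
        [
            "modin/experimental/fuzzydata/test/test_fuzzydata.py",
            "-W",
            "ignore::UserWarning",
        ]
    )

    # The generated workflow is written to /tmp/fuzzydata-test-wf-<engine>/,
    # the same directory the CI job uploads as an artifact.
    print("pytest exit code:", exit_code)

Run this from the repository root, inside the modin conda environment created from environment-dev.yml, so the relative test path resolves.

Beyond the new fuzzydata suite, this PR carries two user-visible fixes: FIX-#4577 (attribute assignment to an existing column must update the column and be reflected by later attribute access) and FIX-#4411 (binary ops between a datetime64 Series and a timedelta must not be rejected by dtype validation). The following is a minimal sanity-check sketch that mirrors the new tests test___setattr__mutating_column and test_add_series_to_timedeltaindex; it assumes a working Ray or Dask engine.

    import numpy as np
    import modin.pandas as pd

    # FIX-#4577: assigning a list to an existing column through attribute access
    # should update the column in place (see test___setattr__mutating_column).
    df = pd.DataFrame([[1]], columns=["col0"])
    df.col0 = [3]
    assert df["col0"][0] == 3
    assert df.col0.equals(df["col0"])

    # FIX-#4411: a datetime64 Series combined with a TimedeltaIndex should no
    # longer raise a dtype error (see test_add_series_to_timedeltaindex).
    s = pd.Series([np.datetime64("2000-12-12")])
    deltas = pd.to_timedelta([1], unit="h")
    print(s + deltas)  # shifted one hour forward
    print(s - deltas)  # shifted one hour back

Both blocks simply mirror assertions already present in the new tests, so they should pass on this branch as written.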