Merge branch 'master' into fix-log-level-inheritance

modin-project · Jun 27, 2022 · 3961336 · 3961336
2 parents 7113296 + af7f4ed
commit 3961336
Show file tree

Hide file tree

Showing 19 changed files with 236 additions and 58 deletions.
diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml
@@ -0,0 +1,36 @@
+name: fuzzy
+on: pull_request
+jobs:
+  test-fuzzydata:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}  # login shell, as required by setup-miniconda to activate the environment
+    strategy:
+      matrix:
+        python-version: ["3.8"]
+        engine: ["ray", "dask"]
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: modin
+          environment-file: environment-dev.yml
+          python-version: ${{matrix.python-version}}  # follow the matrix instead of a hard-coded float
+          channel-priority: strict
+          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+      - name: Conda environment
+        run: |
+          conda info
+          conda list
+      - name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}})
+        run: python -m pytest modin/experimental/fuzzydata/test/test_fuzzydata.py -Wignore::UserWarning
+        env:
+          MODIN_ENGINE: ${{matrix.engine}}
+      - uses: actions/upload-artifact@v3
+        with:
+          name: fuzzydata-test-workflow-${{matrix.engine}}
+          path: /tmp/fuzzydata-test-wf-${{matrix.engine}}/* # Must match output dir in test_fuzzydata.py
+          if-no-files-found: error
diff --git a/README.md b/README.md
@@ -1,12 +1,18 @@
 <p align="center"><a href="https://modin.readthedocs.io"><img width=77% alt="" src="https://github.com/modin-project/modin/blob/3d6368edf311995ad231ec5342a51cd9e4e3dc20/docs/img/MODIN_ver2_hrz.png?raw=true"></a></p>
 <h2 align="center">Scale your pandas workflows by changing one line of code</h2>
 
+<div align="center">
+
+| <h3>Dev Community & Support</h3> | <h3>Forums</h3> | <h3>Socials</h3> | <h3>Docs</h3> |
+|:---: | :---: | :---: | :---: |
+| [![Slack](https://img.shields.io/badge/Slack-4A154B?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA) | [![Stack Overflow](https://img.shields.io/badge/-Stackoverflow-FE7A16?style=for-the-badge&logo=stack-overflow&logoColor=white)](https://stackoverflow.com/questions/tagged/modin) | <img alt="Twitter Follow" src="https://img.shields.io/twitter/follow/modin_project?style=social" height=28 align="center"> | <a href="https://modin.readthedocs.io/en/latest/?badge=latest"><img alt="" src="https://readthedocs.org/projects/modin/badge/?version=latest" height=28 align="center"></a> |
+
+</div>
+
 <p align="center">
-<a href="https://discuss.modin.org"><img alt="" src="https://img.shields.io/badge/discourse-forum-purple.svg?logo=discourse&logoColor=white" align="center"></a>
-<a href='https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA'><img src='https://img.shields.io/static/v1?label=chat&logo=slack&message=Slack&color=brightgreen' alt='Slack'  align="center"/></a>
+<a href="https://pepy.tech/project/modin"><img src="https://static.pepy.tech/personalized-badge/modin?period=total&units=international_system&left_color=black&right_color=blue&left_text=Downloads" align="center"></a>
 <a href="https://codecov.io/gh/modin-project/modin"><img src="https://codecov.io/gh/modin-project/modin/branch/master/graph/badge.svg" align="center"/></a>
 <a href="https://github.com/modin-project/modin/actions"><img src="https://github.com/modin-project/modin/workflows/master/badge.svg" align="center"></a>
-<a href="https://modin.readthedocs.io/en/latest/?badge=latest"><img alt="" src="https://readthedocs.org/projects/modin/badge/?version=latest" align="center"></a>
 <a href="https://pypi.org/project/modin/"><img src="https://badge.fury.io/py/modin.svg" alt="PyPI version" align="center"></a>
 <a href="https://modin.org/modin-bench/#/"><img src="https://img.shields.io/badge/benchmarked%20by-asv-blue.svg" align="center"></a>
 </p>

diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst
@@ -65,6 +65,7 @@ Key Features and Updates
   * DOCS-#4469: Say that commit messages can start with PERF (#4470).
   * DOCS-#4466: Recommend GitHub issues over bug_reports@modin.org (#4474).  
   * DOCS-#4487: Recommend GitHub issues over feature_requests@modin.org (#4489).
+  * DOCS-#4545: Add socials to README (#4555).
 * Dependencies
   * FIX-#4327: Update min pin for xgboost version (#4328)
   * FIX-#4383: Remove `pathlib` from deps (#4384)

diff --git a/docs/release_notes/release_notes-0.16.0.rst b/docs/release_notes/release_notes-0.16.0.rst
@@ -9,6 +9,9 @@ Key Features and Updates
   * FIX-#4570: Replace ``np.bool`` -> ``np.bool_`` (#4571)
   * FIX-#4543: Fix `read_csv` in case skiprows=<0, []> (#4544)
   * FIX-#4059: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
+  * FIX-#4589: Pin protobuf<4.0.0 to fix ray (#4590)
+  * FIX-#4577: Set attribute of Modin dataframe to updated value (#4588)
+  * FIX-#4411: Fix binary_op between datetime64 Series and pandas timedelta (#4592)
   * FIX-#4582: Inherit custom log layer (#4583)
 * Performance enhancements
   * PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
@@ -31,12 +34,16 @@ Key Features and Updates
 * Documentation improvements
   * DOCS-#4552: Change default sphinx language to en to fix sphinx >= 5.0.0 build (#4553)
 * Dependencies
-  *
+  * FEAT-#4598: Add support for pandas 1.4.3 (#4599)
 * New Features
+  * FEAT-#4463: Add experimental fuzzydata integration for testing against a randomized dataframe workflow (#4556)
 
 Contributors
 ------------
 @mvashishtha
 @NickCrews
 @prutskov
 @vnlitvinov
+@pyrito
+@suhailrehman
+@RehanSD
diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt
@@ -12,6 +12,9 @@ sphinx-click
 # Pin ray to < 1.13.0 to work around GH#4564
 # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
 ray[default]>=1.4.0,<1.13.0
+# Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+# because versions >= 4.0.0 are incompatible with ray<1.13.0.
+protobuf<4.0.0
 git+https://github.com/modin-project/modin.git@master#egg=modin[all]
 sphinxcontrib_plantuml
 sphinx-issues

diff --git a/docs/usage_guide/advanced_usage/index.rst b/docs/usage_guide/advanced_usage/index.rst
@@ -89,9 +89,9 @@ internal execution flow.
 Logging with Modin
 ------------------
 
-Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata, 
-and system memory. Logging is disabled by default, but when it is enabled, log files are written to a local `.modin` directory 
-at the same directory level as the notebook/script used to run Modin. See our :doc:`Logging with Modin documentation <modin_logging>` 
+Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata,
+and system memory. Logging is disabled by default, but when it is enabled, log files are written to a local `.modin` directory
+at the same directory level as the notebook/script used to run Modin. See our :doc:`Logging with Modin documentation <modin_logging>`
 for usage information.
 
 Batch Pipeline API
@@ -100,6 +100,13 @@ Modin provides an experimental batched API that pipelines row parallel queries.
 for a walkthrough on how to use this feature, as well as :doc:`Batch Pipeline API documentation </flow/modin/experimental/batch>`
 for more information about the API.
 
+Fuzzydata Testing
+-----------------
+
+An experimental GitHub Action that runs on every pull request has been added to Modin. It automatically tests the Modin
+codebase against `fuzzydata`, a random dataframe workflow generator. The workflow that was generated to test the Modin
+codebase can be downloaded as an artifact from the GitHub Actions tab for further inspection. See `fuzzydata`_ for more details.
+
 .. _`blog post`: https://medium.com/riselab/why-every-data-scientist-using-pandas-needs-modin-bringing-sql-to-dataframes-3b216b29a7c0
 .. _`Modin SQL documentation`: modin_sql.html
 .. _`Modin Spreadsheet API documentation`: spreadsheets_api.html
@@ -109,3 +116,4 @@ for more information about the API.
 .. _`Slack`: https://modin.org/slack.html
 .. _`tqdm`: https://github.com/tqdm/tqdm
 .. _`distributed XGBoost`: https://medium.com/intel-analytics-software/distributed-xgboost-with-modin-on-ray-fc17edef7720
+.. _`fuzzydata`: https://github.com/suhailrehman/fuzzydata
diff --git a/environment-dev.yml b/environment-dev.yml
@@ -2,7 +2,7 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.4.2
+  - pandas==1.4.3
   - numpy>=1.18.5
   - pyarrow>=4.0.1
   - dask[complete]>=2.22.0,<2022.2.0
@@ -42,12 +42,17 @@ dependencies:
       - tqdm
       - git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
       # Pin ray to < 1.13.0 to work around GH#4564
-      # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.      
+      # TODO(https://github.com/modin-project/modin/issues/4564): let ray go past 1.13.0.
       - ray[default]>=1.4.0,<1.13.0
+      # Following https://github.com/ray-project/ray/pull/25648, pin protobuf < 4,
+      # because versions >= 4.0.0 are incompatible with ray<1.13.0.
+      - protobuf<4.0.0
       - connectorx>=0.2.6a4
       # TODO: remove when resolving GH#4398
       - redis>=3.5.0,<4.0.0
       - black
       - flake8
       # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
       - numpydoc==1.1.0
+      # The experimental fuzzydata integration requires fuzzydata>=0.0.6 so that all dependencies resolve successfully.
+      - fuzzydata>=0.0.6
diff --git a/modin/experimental/fuzzydata/__init__.py b/modin/experimental/fuzzydata/__init__.py
@@ -0,0 +1,14 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""Module holds experimental fuzzydata specific functionality for Modin."""
diff --git a/modin/experimental/fuzzydata/test/__init__.py b/modin/experimental/fuzzydata/test/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/modin/experimental/fuzzydata/test/test_fuzzydata.py b/modin/experimental/fuzzydata/test/test_fuzzydata.py
@@ -0,0 +1,65 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import glob
+import uuid
+import shutil
+from fuzzydata.core.generator import generate_workflow
+from fuzzydata.clients.modin import ModinWorkflow
+
+from modin.config import Engine
+
+
+def test_fuzzydata_sample_workflow():
+    # Workflow generation options passed to fuzzydata's `generate_workflow` below
+    wf_name = str(uuid.uuid4())[:8]  # Unique name for the generated workflow
+    num_versions = 10  # Number of unique CSV files to generate
+    cols = 33  # Columns in Base Artifact
+    rows = 1000  # Rows in Base Artifact
+    bfactor = 1.0  # Branching Factor - 0.1 is linear, 10.0 is star-like
+    exclude_ops = ["groupby"]  # In-Memory groupby operations cause issue #4287
+    matfreq = 2  # How many operations to chain before materialization
+
+    engine = Engine.get().lower()  # Lowercased name of the configured Modin engine
+
+    # Create Output Directory for Workflow Data (any output from a previous run is removed first)
+    base_out_directory = (
+        f"/tmp/fuzzydata-test-wf-{engine}/"  # Must match the artifact path in the fuzzydata GitHub Actions workflow
+    )
+    if os.path.exists(base_out_directory):
+        shutil.rmtree(base_out_directory)
+    output_directory = f"{base_out_directory}/{wf_name}/"
+    os.makedirs(output_directory, exist_ok=True)
+
+    # Start Workflow Generation
+    workflow = generate_workflow(
+        workflow_class=ModinWorkflow,
+        name=wf_name,
+        num_versions=num_versions,
+        base_shape=(cols, rows),
+        out_directory=output_directory,
+        bfactor=bfactor,
+        exclude_ops=exclude_ops,
+        matfreq=matfreq,
+        wf_options={"modin_engine": engine},
+    )
+
+    # Check the generated workflow: version count, one CSV per artifact, and the operations/graph files on disk
+    assert len(workflow) == num_versions
+    assert len(list(glob.glob(f"{output_directory}/artifacts/*.csv"))) == len(
+        workflow.artifact_dict
+    )
+    assert os.path.exists(f"{output_directory}/{workflow.name}_operations.json")
+    assert os.path.getsize(f"{output_directory}/{workflow.name}_operations.json") > 0
+    assert os.path.exists(f"{output_directory}/{workflow.name}_gt_graph.csv")
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
@@ -14,7 +14,7 @@
 import pandas
 import warnings
 
-__pandas_version__ = "1.4.2"
+__pandas_version__ = "1.4.3"
 
 if pandas.__version__ != __pandas_version__:
     warnings.warn(

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -222,10 +222,7 @@ def _validate_other(
         self,
         other,
         axis,
-        numeric_only=False,
-        numeric_or_time_only=False,
-        numeric_or_object_only=False,
-        comparison_dtypes_only=False,
+        dtype_check=False,
         compare_index=False,
     ):
         """
@@ -239,14 +236,8 @@ def _validate_other(
             Specifies axis along which to do validation. When `1` or `None`
             is specified, validation is done along `index`, if `0` is specified
             validation is done along `columns` of `other` frame.
-        numeric_only : bool, default: False
-            Validates that both frames have only numeric dtypes.
-        numeric_or_time_only : bool, default: False
-            Validates that both frames have either numeric or time dtypes.
-        numeric_or_object_only : bool, default: False
-            Validates that both frames have either numeric or object dtypes.
-        comparison_dtypes_only : bool, default: False
-            Validates that both frames have either numeric or time or equal dtypes.
+        dtype_check : bool, default: False
+            Validates that both frames have compatible dtypes.
         compare_index : bool, default: False
             Compare Index if True.
 
@@ -300,44 +291,18 @@ def _validate_other(
             if not self.index.equals(other.index):
                 raise TypeError("Cannot perform operation with non-equal index")
         # Do dtype checking.
-        if numeric_only:
-            if not all(
-                is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError("Cannot do operation on non-numeric dtypes")
-        elif numeric_or_object_only:
+        if dtype_check:
             if not all(
                 (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
                 or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError("Cannot do operation non-numeric dtypes")
-        elif comparison_dtypes_only:
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
                 or (
                     is_datetime_or_timedelta_dtype(self_dtype)
                     and is_datetime_or_timedelta_dtype(other_dtype)
                 )
                 or is_dtype_equal(self_dtype, other_dtype)
                 for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
             ):
-                raise TypeError(
-                    "Cannot do operation non-numeric objects with numeric objects"
-                )
-        elif numeric_or_time_only:
-            if not all(
-                (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
-                or (
-                    is_datetime_or_timedelta_dtype(self_dtype)
-                    and is_datetime_or_timedelta_dtype(other_dtype)
-                )
-                for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes)
-            ):
-                raise TypeError(
-                    "Cannot do operation non-numeric objects with numeric objects"
-                )
+                raise TypeError("Cannot do operation with improper dtypes")
         return result
 
     def _validate_function(self, func, on_invalid=None):
@@ -416,7 +381,7 @@ def _binary_op(self, op, other, **kwargs):
             return self._default_to_pandas(
                 getattr(self._pandas_class, op), other, **kwargs
             )
-        other = self._validate_other(other, axis, numeric_or_object_only=True)
+        other = self._validate_other(other, axis, dtype_check=True)
         exclude_list = [
             "__add__",
             "__radd__",

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -2498,6 +2498,10 @@ def __setattr__(self, key, value):
             pass
         elif key in self and key not in dir(self):
             self.__setitem__(key, value)
+            # Note: return immediately so we don't keep this `key` as dataframe state.
+            # `__getattr__` will return the columns not present in `dir(self)`, so we do not need
+            # to manually track this state in the `dir`.
+            return
         elif isinstance(value, pandas.Series):
             warnings.warn(
                 "Modin doesn't allow columns to be created via a new attribute name - see "