Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6890: Modin implementation of DataFrame API standard #7216

Merged
merged 7 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ jobs:
- run: python -m pytest modin/tests/test_utils.py
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/tests/interchange/dataframe_protocol/base
- run: python -m pytest modin/tests/test_dataframe_api_standard.py
- run: python -m pytest modin/tests/test_logging.py
- uses: ./.github/actions/upload-coverage

Expand Down
9 changes: 9 additions & 0 deletions docs/getting_started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ storage formats or for different functionalities of Modin. Here is a list of dep

pip install "modin[mpi]" # If you want to use MPI through unidist execution engine


Consortium Standard-compatible implementation based on Modin
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

.. code-block:: bash

pip install "modin[consortium-standard]"


Installing on Google Colab
"""""""""""""""""""""""""""

Expand Down
1 change: 1 addition & 0 deletions environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ dependencies:
- isort>=5.12

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
Expand Down
18 changes: 18 additions & 0 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
_inherit_docstrings,
expanduser_path_arg,
hashable,
import_optional_dependency,
try_cast_to_pandas,
)

Expand Down Expand Up @@ -2892,6 +2893,23 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
nan_as_null=nan_as_null, allow_copy=allow_copy
)

def __dataframe_consortium_standard__(
    self, *, api_version: str | None = None
):  # noqa: PR01, RT01
    """
    Provide entry point to the Consortium DataFrame Standard API.

    The call is delegated to
    ``dataframe_api_compat.modin_standard.convert_to_standard_compliant_dataframe``,
    with `api_version` forwarded unchanged as a keyword argument.

    This is developed and maintained outside of Modin.
    Please report any issues to https://github.com/data-apis/dataframe-api-compat.
    """
    # The optional package is resolved at call time, not at module import.
    # NOTE(review): the second argument is handed to `import_optional_dependency`
    # as-is; confirm against that helper that "implementation" produces a
    # sensible message when the package is missing.
    compat = import_optional_dependency("dataframe_api_compat", "implementation")
    return compat.modin_standard.convert_to_standard_compliant_dataframe(
        self, api_version=api_version
    )

@property
def attrs(self) -> dict: # noqa: RT01, D200
"""
Expand Down
22 changes: 21 additions & 1 deletion modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@
from modin.config import PersistentPickle
from modin.logging import disable_logging
from modin.pandas.io import from_pandas, to_pandas
from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings
from modin.utils import (
MODIN_UNNAMED_SERIES_LABEL,
_inherit_docstrings,
import_optional_dependency,
)

from .accessor import CachedAccessor, SparseAccessor
from .base import _ATTRS_NO_LOOKUP, BasePandasDataset
Expand Down Expand Up @@ -222,6 +226,22 @@ def __array__(self, dtype=None) -> np.ndarray: # noqa: PR01, RT01, D200
"""
return super(Series, self).__array__(dtype).flatten()

def __column_consortium_standard__(
    self, *, api_version: str | None = None
):  # noqa: PR01, RT01
    """
    Provide entry point to the Consortium DataFrame Standard API.

    The call is delegated to
    ``dataframe_api_compat.modin_standard.convert_to_standard_compliant_column``,
    with `api_version` forwarded unchanged as a keyword argument.

    This is developed and maintained outside of Modin.
    Please report any issues to https://github.com/data-apis/dataframe-api-compat.
    """
    # The optional package is resolved at call time, not at module import.
    # NOTE(review): the second argument is handed to `import_optional_dependency`
    # as-is; confirm against that helper that "implementation" produces a
    # sensible message when the package is missing.
    compat = import_optional_dependency("dataframe_api_compat", "implementation")
    to_standard_column = compat.modin_standard.convert_to_standard_compliant_column
    return to_standard_column(self, api_version=api_version)

def __contains__(self, key: Hashable) -> bool:
"""
Check if `key` in the `Series.index`.
Expand Down
37 changes: 37 additions & 0 deletions modin/tests/test_dataframe_api_standard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

import modin.pandas


def test_dataframe_consortium() -> None:
    """
    Smoke-test the dataframe consortium standard entry points.

    Full testing is done at https://github.com/data-apis/dataframe-api-compat,
    this is just to check that the entry point works as expected.
    """
    # Skip (rather than fail) when the optional compat package is unavailable.
    pytest.importorskip("dataframe_api_compat")

    # DataFrame entry point: column names must round-trip through the wrapper.
    frame = modin.pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    standard_frame = frame.__dataframe_consortium_standard__()
    assert standard_frame.get_column_names() == ["a", "b"]

    # Series entry point: positional lookup on the wrapped column.
    series = modin.pandas.Series([1, 2, 3])
    standard_column = series.__column_consortium_standard__()
    assert standard_column.get_value(1) == 2
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tqdm>=4.60.0
numexpr<2.8.5
# Latest modin-spreadsheet with widget fix
git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5

## dependencies for making release
PyGithub>=1.58.0
Expand Down
1 change: 1 addition & 0 deletions requirements/env_hdk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ dependencies:
- mypy>=1.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
- numpydoc==1.1.0
1 change: 1 addition & 0 deletions requirements/env_unidist_win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dependencies:
- pandas-stubs>=2.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5
# Fixes breaking ipywidgets changes, but hasn't been released yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
- connectorx>=0.2.6a4
Expand Down
1 change: 1 addition & 0 deletions requirements/requirements-no-engine.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies:
- flake8-print>=5.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5
- asv==0.5.1
# no conda package for windows
- connectorx>=0.2.6a4
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
ray_deps = ["ray[default]>=2.1.0,!=2.5.0", "pyarrow>=7.0.0"]
mpi_deps = ["unidist[mpi]>=0.2.1"]
# Optional `consortium-standard` extra: pinned to one exact upstream commit
# through a direct Git reference instead of a version range.
# NOTE(review): package indexes may refuse uploads whose metadata contains
# direct URL references — confirm this extra is publishable before release.
consortium_standard_deps = ["dataframe-api-compat@git+https://github.com/data-apis/dataframe-api-compat.git@42969958d4dada47dae318b470d9855ee8bbe3a5"]
spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
# Currently, Modin does not include `mpi` option in `all`.
# Otherwise, installation of modin[all] would fail because
# users need to have a working MPI implementation and
# certain software installed beforehand.
all_deps = dask_deps + ray_deps + spreadsheet_deps
all_deps = dask_deps + ray_deps + spreadsheet_deps + consortium_standard_deps

# Distribute 'modin-autoimport-pandas.pth' along with binary and source distributions.
# This file provides the "import pandas before Ray init" feature if specific
Expand Down Expand Up @@ -62,6 +63,7 @@ def make_distribution(self):
"dask": dask_deps,
"ray": ray_deps,
"mpi": mpi_deps,
"consortium-standard": consortium_standard_deps,
"spreadsheet": spreadsheet_deps,
"all": all_deps,
},
Expand Down
Loading