FEAT-#6890: Modin implementation of DataFrame API standard (#7216)
Co-authored-by: Iaroslav Igoshev <Poolliver868@mail.ru>
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
anmyachev and YarShev authored Apr 25, 2024
1 parent bbb136d commit 9ca33b4
Showing 11 changed files with 94 additions and 2 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -115,6 +115,7 @@ jobs:
- run: python -m pytest modin/tests/test_utils.py
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/tests/interchange/dataframe_protocol/base
- run: python -m pytest modin/tests/test_dataframe_api_standard.py
- run: python -m pytest modin/tests/test_logging.py
- uses: ./.github/actions/upload-coverage

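The CI step added above can be reproduced locally. A minimal sketch (assumes a development install of Modin with the dataframe-api-compat package available; the "-v" flag is optional):

# Run the new test module locally, mirroring the CI step added above.
import sys

import pytest

sys.exit(pytest.main(["modin/tests/test_dataframe_api_standard.py", "-v"]))
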
9 changes: 9 additions & 0 deletions docs/getting_started/installation.rst
@@ -74,6 +74,15 @@ storage formats or for different functionalities of Modin. Here is a list of dep
pip install "modin[mpi]" # If you want to use MPI through unidist execution engine
Consortium Standard-compatible implementation based on Modin
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

.. code-block:: bash

    pip install "modin[consortium-standard]"
Installing on Google Colab
"""""""""""""""""""""""""""

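After installing the consortium-standard extra shown above, the new entry points become available on Modin objects. A minimal post-install check, assuming only the behavior exercised by the test added in this commit:

# Verify the optional dependency behind "modin[consortium-standard]" is present
# before using the new entry point.
import importlib.util

import modin.pandas as pd

if importlib.util.find_spec("dataframe_api_compat") is None:
    raise SystemExit('run: pip install "modin[consortium-standard]"')

df = pd.DataFrame({"a": [1, 2, 3]})
# Entry point added by this commit (see modin/pandas/dataframe.py below).
print(df.__dataframe_consortium_standard__().get_column_names())  # ['a']
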
1 change: 1 addition & 0 deletions environment-dev.yml
@@ -61,6 +61,7 @@ dependencies:
- isort>=5.12

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@main
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
18 changes: 18 additions & 0 deletions modin/pandas/dataframe.py
@@ -67,6 +67,7 @@
    _inherit_docstrings,
    expanduser_path_arg,
    hashable,
    import_optional_dependency,
    try_cast_to_pandas,
)

@@ -2892,6 +2893,23 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
            nan_as_null=nan_as_null, allow_copy=allow_copy
        )

    def __dataframe_consortium_standard__(
        self, *, api_version: str | None = None
    ):  # noqa: PR01, RT01
        """
        Provide entry point to the Consortium DataFrame Standard API.
        This is developed and maintained outside of Modin.
        Please report any issues to https://github.com/data-apis/dataframe-api-compat.
        """
        dataframe_api_compat = import_optional_dependency(
            "dataframe_api_compat", "implementation"
        )
        convert_to_standard_compliant_dataframe = (
            dataframe_api_compat.modin_standard.convert_to_standard_compliant_dataframe
        )
        return convert_to_standard_compliant_dataframe(self, api_version=api_version)

    @property
    def attrs(self) -> dict:  # noqa: RT01, D200
        """
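A hedged usage sketch (not part of the diff) of the DataFrame entry point added above; only get_column_names() is exercised by the new test in this commit, so no further standard API surface is assumed here.

# Convert a Modin DataFrame into a standard-compliant wrapper object.
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Raises if dataframe-api-compat is not installed (via import_optional_dependency).
# api_version=None lets dataframe-api-compat pick its default supported version.
std_df = df.__dataframe_consortium_standard__(api_version=None)

# Method defined by the consortium standard and exercised in the new test.
assert std_df.get_column_names() == ["a", "b"]
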
22 changes: 21 additions & 1 deletion modin/pandas/series.py
@@ -34,7 +34,11 @@
from modin.config import PersistentPickle
from modin.logging import disable_logging
from modin.pandas.io import from_pandas, to_pandas
from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings
from modin.utils import (
    MODIN_UNNAMED_SERIES_LABEL,
    _inherit_docstrings,
    import_optional_dependency,
)

from .accessor import CachedAccessor, SparseAccessor
from .base import _ATTRS_NO_LOOKUP, BasePandasDataset
@@ -222,6 +226,22 @@ def __array__(self, dtype=None) -> np.ndarray: # noqa: PR01, RT01, D200
"""
return super(Series, self).__array__(dtype).flatten()

    def __column_consortium_standard__(
        self, *, api_version: str | None = None
    ):  # noqa: PR01, RT01
        """
        Provide entry point to the Consortium DataFrame Standard API.
        This is developed and maintained outside of Modin.
        Please report any issues to https://github.com/data-apis/dataframe-api-compat.
        """
        dataframe_api_compat = import_optional_dependency(
            "dataframe_api_compat", "implementation"
        )
        return dataframe_api_compat.modin_standard.convert_to_standard_compliant_column(
            self, api_version=api_version
        )

    def __contains__(self, key: Hashable) -> bool:
        """
        Check if `key` in the `Series.index`.
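The Series entry point mirrors the DataFrame one. A short sketch using the same get_value() call as the new test below:

# Wrap a Modin Series as a standard-compliant column.
import modin.pandas as pd

ser = pd.Series([10, 20, 30])
col = ser.__column_consortium_standard__()

# get_value(i) returns the element at position i (see the test below).
assert col.get_value(1) == 20
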
37 changes: 37 additions & 0 deletions modin/tests/test_dataframe_api_standard.py
@@ -0,0 +1,37 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

import modin.pandas


def test_dataframe_api_standard() -> None:
    """
    Test some basic methods of the dataframe consortium standard.
    Full testing is done at https://github.com/data-apis/dataframe-api-compat,
    this is just to check that the entry point works as expected.
    """
    pytest.importorskip("dataframe_api_compat")
    df_pd = modin.pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = df_pd.__dataframe_consortium_standard__()
    result_1 = df.get_column_names()
    expected_1 = ["a", "b"]
    assert result_1 == expected_1

    ser = modin.pandas.Series([1, 2, 3])
    col = ser.__column_consortium_standard__()
    result_2 = col.get_value(1)
    expected_2 = 2
    assert result_2 == expected_2
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -35,6 +35,7 @@ tqdm>=4.60.0
numexpr<2.8.5
# Latest modin-spreadsheet with widget fix
git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
git+https://github.com/data-apis/dataframe-api-compat.git@main

## dependencies for making release
PyGithub>=1.58.0
1 change: 1 addition & 0 deletions requirements/env_hdk.yml
@@ -43,5 +43,6 @@ dependencies:
- mypy>=1.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@main
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
- numpydoc==1.1.0
1 change: 1 addition & 0 deletions requirements/env_unidist_win.yml
@@ -54,6 +54,7 @@ dependencies:
- pandas-stubs>=2.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@main
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
- connectorx>=0.2.6a4
1 change: 1 addition & 0 deletions requirements/requirements-no-engine.yml
@@ -45,6 +45,7 @@ dependencies:
- flake8-print>=5.0.0

- pip:
- git+https://github.com/data-apis/dataframe-api-compat.git@main
- asv==0.5.1
# no conda package for windows
- connectorx>=0.2.6a4
4 changes: 3 additions & 1 deletion setup.py
@@ -9,12 +9,13 @@
# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
ray_deps = ["ray[default]>=2.1.0,!=2.5.0", "pyarrow>=7.0.0"]
mpi_deps = ["unidist[mpi]>=0.2.1"]
consortium_standard_deps = ["dataframe-api-compat@git+https://github.com/data-apis/dataframe-api-compat.git@main"]
spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
# Currently, Modin does not include `mpi` option in `all`.
# Otherwise, installation of modin[all] would fail because
# users need to have a working MPI implementation and
# certain software installed beforehand.
all_deps = dask_deps + ray_deps + spreadsheet_deps
all_deps = dask_deps + ray_deps + spreadsheet_deps + consortium_standard_deps

# Distribute 'modin-autoimport-pandas.pth' along with binary and source distributions.
# This file provides the "import pandas before Ray init" feature if specific
@@ -62,6 +63,7 @@ def make_distribution(self):
"dask": dask_deps,
"ray": ray_deps,
"mpi": mpi_deps,
"consortium-standard": consortium_standard_deps,
"spreadsheet": spreadsheet_deps,
"all": all_deps,
},
