Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions to compare Column objects with iterable references and to compare DataFrame objects with mapping references #66

Merged
merged 12 commits into from
Jan 24, 2024
6 changes: 5 additions & 1 deletion dataframe_api_compat/pandas_standard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,16 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType:
return Namespace.Float32()
if dtype == "Float32":
return Namespace.Float32()
if dtype == "bool":
if dtype in ("bool", "boolean"):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I discovered it by accident while experimenting. It is possible that this is no longer necessary for the current changes.

# Also for `pandas.core.arrays.boolean.BooleanDtype`
return Namespace.Bool()
if dtype == "object":
return Namespace.String()
if dtype == "string":
return Namespace.String()
if hasattr(dtype, "name"):
# For types like `numpy.dtypes.DateTime64DType`
dtype = dtype.name
if dtype.startswith("datetime64["):
match = re.search(r"datetime64\[(\w{1,2})", dtype)
assert match is not None
Expand Down
2 changes: 2 additions & 0 deletions dataframe_api_compat/pandas_standard/column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
"UInt16": "uint16",
"UInt8": "uint8",
"boolean": "bool",
"Float64": "float64",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also discovered this by accident: it seems that the float type was missing. If it was left out on purpose, I can try to redo it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I probably just forgot it - let's add Float32 too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

"Float32": "float32",
}


Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ ignore = [
[tool.ruff.isort]
force-single-line = true

[tool.black]
line-length = 90
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To sync with pre-commit.


[tool.pytest.ini_options]
filterwarnings = [
"error",
Expand Down
28 changes: 13 additions & 15 deletions tests/column/and_or_test.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,44 @@
from __future__ import annotations

import pandas as pd

from tests.utils import bool_dataframe_1
from tests.utils import interchange_to_pandas
from tests.utils import compare_column_with_reference


def test_column_and(library: str) -> None:
df = bool_dataframe_1(library, api_version="2023.09-beta")
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign((ser & other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, False], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, False]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_or(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign((ser | other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, True], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, True]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_and_with_scalar(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = True
result = df.assign((other & ser).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, False], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, False]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_or_with_scalar(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = True
result = df.assign((other | ser).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, True], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, True]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
16 changes: 6 additions & 10 deletions tests/column/cast_test.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
import pandas as pd

from tests.utils import compare_dataframe_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import interchange_to_pandas


def test_cast_integers(library: str) -> None:
df = integer_dataframe_1(library)
pdx = df.__dataframe_namespace__()
result = df.assign(df.col("a").cast(pdx.Int32()))
expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).astype(
{"a": "int32", "b": "int64"},
)
result_pd = interchange_to_pandas(result)
pd.testing.assert_frame_equal(result_pd, expected)
ns = df.__dataframe_namespace__()
result = df.assign(df.col("a").cast(ns.Int32()))
expected = {"a": [1, 2, 3], "b": [4, 5, 6]}
expected_dtype = {"a": ns.Int32, "b": ns.Int64}
compare_dataframe_with_reference(result, expected, expected_dtype)
54 changes: 15 additions & 39 deletions tests/column/col_sorted_indices_test.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,42 @@
from __future__ import annotations

import pandas as pd

from tests.utils import compare_dataframe_with_reference
from tests.utils import integer_dataframe_6
from tests.utils import interchange_to_pandas


def test_expression_sorted_indices_ascending(library: str) -> None:
df = integer_dataframe_6(library)
df.__dataframe_namespace__()
ns = df.__dataframe_namespace__()
col = df.col
sorted_indices = col("b").sorted_indices()
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [2, 2, 1, 1, 1],
"b": [1, 2, 3, 4, 4],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_expression_sorted_indices_descending(library: str) -> None:
df = integer_dataframe_6(library)
df.__dataframe_namespace__()
ns = df.__dataframe_namespace__()
col = df.col
sorted_indices = col("b").sorted_indices(ascending=False)
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [1, 1, 1, 2, 2],
"b": [4, 4, 3, 2, 1],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_column_sorted_indices_ascending(library: str) -> None:
df = integer_dataframe_6(library).persist()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I deleted the .persist() call in several places, since the same call occurs in the new comparison functions; calling it again generates warnings, which the repository settings turn into errors. If this is incorrect, then we need a public way to check the ._is_persisted field, so as not to call the method several times.

df = integer_dataframe_6(library)
ns = df.__dataframe_namespace__()
sorted_indices = df.col("b").sorted_indices()
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [2, 2, 1, 1, 1],
"b": [1, 2, 3, 4, 4],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_column_sorted_indices_descending(library: str) -> None:
df = integer_dataframe_6(library).persist()
df = integer_dataframe_6(library)
ns = df.__dataframe_namespace__()
sorted_indices = df.col("b").sorted_indices(ascending=False)
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [1, 1, 1, 2, 2],
"b": [4, 4, 3, 2, 1],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
92 changes: 47 additions & 45 deletions tests/column/comparisons_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,86 @@

from typing import Any

import pandas as pd
import pytest

from tests.utils import compare_column_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import integer_dataframe_7
from tests.utils import interchange_to_pandas


@pytest.mark.parametrize(
("comparison", "expected_data"),
("comparison", "expected_data", "expected_dtype"),
[
("__eq__", [True, True, False]),
("__ne__", [False, False, True]),
("__ge__", [True, True, False]),
("__gt__", [False, False, False]),
("__le__", [True, True, True]),
("__lt__", [False, False, True]),
("__add__", [2, 4, 7]),
("__sub__", [0, 0, -1]),
("__mul__", [1, 4, 12]),
("__truediv__", [1, 1, 0.75]),
("__floordiv__", [1, 1, 0]),
("__pow__", [1, 4, 81]),
("__mod__", [0, 0, 3]),
("__eq__", [True, True, False], "Bool"),
("__ne__", [False, False, True], "Bool"),
("__ge__", [True, True, False], "Bool"),
("__gt__", [False, False, False], "Bool"),
("__le__", [True, True, True], "Bool"),
("__lt__", [False, False, True], "Bool"),
("__add__", [2, 4, 7], "Int64"),
("__sub__", [0, 0, -1], "Int64"),
("__mul__", [1, 4, 12], "Int64"),
("__truediv__", [1, 1, 0.75], "Float64"),
("__floordiv__", [1, 1, 0], "Int64"),
("__pow__", [1, 4, 81], "Int64"),
("__mod__", [0, 0, 3], "Int64"),
],
)
def test_column_comparisons(
library: str,
comparison: str,
expected_data: list[object],
expected_dtype: str,
) -> None:
ser: Any
df = integer_dataframe_7(library).persist()
df = integer_dataframe_7(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
if library in ("polars", "polars-lazy") and comparison == "__pow__":
expected_ns_dtype = getattr(ns, expected_dtype)
if comparison == "__pow__" and library in ("polars", "polars-lazy"):
# TODO
result_pd = result_pd.astype("int64")
pd.testing.assert_series_equal(result_pd, expected)
result = result.cast({"result": ns.Int64()})
expected_ns_dtype = ns.Int64
compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)


@pytest.mark.parametrize(
("comparison", "expected_data"),
("comparison", "expected_data", "expected_dtype"),
[
("__eq__", [False, False, True]),
("__ne__", [True, True, False]),
("__ge__", [False, False, True]),
("__gt__", [False, False, False]),
("__le__", [True, True, True]),
("__lt__", [True, True, False]),
("__add__", [4, 5, 6]),
("__sub__", [-2, -1, 0]),
("__mul__", [3, 6, 9]),
("__truediv__", [1 / 3, 2 / 3, 1]),
("__floordiv__", [0, 0, 1]),
("__pow__", [1, 8, 27]),
("__mod__", [1, 2, 0]),
("__eq__", [False, False, True], "Bool"),
("__ne__", [True, True, False], "Bool"),
("__ge__", [False, False, True], "Bool"),
("__gt__", [False, False, False], "Bool"),
("__le__", [True, True, True], "Bool"),
("__lt__", [True, True, False], "Bool"),
("__add__", [4, 5, 6], "Int64"),
("__sub__", [-2, -1, 0], "Int64"),
("__mul__", [3, 6, 9], "Int64"),
("__truediv__", [1 / 3, 2 / 3, 1], "Float64"),
("__floordiv__", [0, 0, 1], "Int64"),
("__pow__", [1, 8, 27], "Int64"),
("__mod__", [1, 2, 0], "Int64"),
],
)
def test_column_comparisons_scalar(
library: str,
comparison: str,
expected_data: list[object],
expected_dtype: str,
) -> None:
ser: Any
df = integer_dataframe_1(library).persist()
df = integer_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = 3
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
expected_ns_dtype = getattr(ns, expected_dtype)
if comparison == "__pow__" and library in ("polars", "polars-lazy"):
result_pd = result_pd.astype("int64")
pd.testing.assert_series_equal(result_pd, expected)
result = result.cast({"result": ns.Int64()})
expected_ns_dtype = ns.Int64
compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)


@pytest.mark.parametrize(
Expand All @@ -96,10 +99,9 @@ def test_right_column_comparisons(
) -> None:
# 1,2,3
ser: Any
df = integer_dataframe_7(library).persist()
df = integer_dataframe_7(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = 2
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
pd.testing.assert_series_equal(result_pd, expected)
compare_column_with_reference(result.col("result"), expected_data, dtype=ns.Int64)
15 changes: 8 additions & 7 deletions tests/column/cumulative_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import pandas as pd
import pytest
from packaging.version import Version
from packaging.version import parse

from tests.utils import compare_column_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import interchange_to_pandas


@pytest.mark.parametrize(
Expand All @@ -21,17 +23,16 @@ def test_cumulative_functions_column(
func: str,
expected_data: list[float],
) -> None:
df = integer_dataframe_1(library).persist()
df = integer_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
expected = pd.Series(expected_data, name="result")
result = df.assign(getattr(ser, func)().rename("result"))
result_pd = interchange_to_pandas(result)["result"]

if (
tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0)
and library == "pandas-nullable"
parse(pd.__version__) < Version("2.0.0") and library == "pandas-nullable"
): # pragma: no cover
# Upstream bug
result_pd = result_pd.astype("int64")
result = result.cast({"result": ns.Int64()})

pd.testing.assert_series_equal(result_pd, expected)
compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64)
Loading