Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) #59376

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,6 @@ def nullable_string_dtype(request):
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
]
)
def string_storage(request):
Expand All @@ -1305,7 +1304,6 @@ def string_storage(request):

* 'python'
* 'pyarrow'
* 'pyarrow_numpy'
Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the string_storage fixture will no longer cover the "pyarrow + NaN" variant of the string dtype directly with this change, but:

  • Most tests using this fixture are inside the pandas/tests/io submodule, which had an override of this fixture to not include "pyarrow_numpy" anyway (and I removed this override)
  • I checked/updated the other tests and none of them really need this.

"""
return request.param

Expand Down
18 changes: 17 additions & 1 deletion pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from collections.abc import Callable
import os
from typing import Any

import pandas._config.config as cf
from pandas._config.config import (
Expand Down Expand Up @@ -455,12 +456,27 @@ def is_terminal() -> bool:
``future.infer_string`` is set to True.
"""


def is_valid_string_storage(value: Any) -> None:
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
legal_values = ["python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
# TODO: we can remove extra message after 3.0
msg += (
". 'pyarrow_numpy' was specified, but this option should be "
"enabled using pandas.options.future.infer_string instead"
)
Comment on lines +464 to +469
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this custom message to the error in case someone actually did end up using pd.options.mode.string_storage = "pyarrow_numpy" to point them to the right option to enable.

raise ValueError(msg)


with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
"python",
string_storage_doc,
validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
# validator=is_one_of_factory(["python", "pyarrow"]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# validator=is_one_of_factory(["python", "pyarrow"]),

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to leave this line in so it's easier to go back to that, given that this extra message is only meant to stay for 2.3, and for 3.0 we should already remove it again (because the infer_string option will be enabled by default in 3.0, so then there is no point to let users set that manually)

validator=is_valid_string_storage,
)


Expand Down
32 changes: 8 additions & 24 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,50 +513,34 @@ def test_arrow_array(dtype):
assert arr.equals(expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
# roundtrip possible from arrow 1.0.0
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(
dtype, string_storage2, request, using_infer_string
):
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
# GH-41040
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -566,10 +550,10 @@ def test_arrow_load_from_zero_chunks(
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,18 @@ def test_eq_all_na():


def test_config(string_storage, request, using_infer_string):
if using_infer_string and string_storage != "pyarrow_numpy":
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
if string_storage == "pyarrow_numpy":
if using_infer_string and string_storage == "python":
# python string storage with na_value=NaN is not yet implemented
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

dtype = StringDtype(string_storage)
dtype = StringDtype(
string_storage, na_value=np.nan if using_infer_string else pd.NA
)
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
with option_context("mode.string_storage", string_storage):
df.astype("string")
tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
# GH#51073 - variant of the above test with explicit dtype instances
Comment on lines +903 to +904
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this variant because the test above with the string_storage option and astype("string") will now only test the NA-variants of the string dtype.

df = DataFrame({"a": ["a", "b", val]})
expected = df.copy()
df.astype(any_string_dtype)
tm.assert_frame_equal(df, expected)
5 changes: 2 additions & 3 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@


class TestConvertDtypes:
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
Expand All @@ -18,9 +20,6 @@ def test_convert_dtypes(
):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
if using_infer_string:
string_storage = "pyarrow_numpy"

df = pd.DataFrame(
{
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
Expand Down
16 changes: 0 additions & 16 deletions pandas/tests/io/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,19 +224,3 @@ def compression_format(request):
@pytest.fixture(params=_compression_formats_params)
def compression_ext(request):
return request.param[0]


@pytest.fixture(
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
]
)
def string_storage(request):
"""
Parametrized fixture for pd.options.mode.string_storage.

* 'python'
* 'pyarrow'
"""
return request.param
Loading