Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: map builtin str alias to StringDtype #59685

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f444105
String dtype: map builtin str alias to StringDtype
jorisvandenbossche Sep 2, 2024
630d41c
fix tests
jorisvandenbossche Sep 2, 2024
d127770
fix datetimelike astype and more tests
jorisvandenbossche Sep 2, 2024
38d011a
remove xfails
jorisvandenbossche Sep 2, 2024
c4ed9d3
try fix typing
jorisvandenbossche Sep 2, 2024
cad7d8f
fix copy_view tests
jorisvandenbossche Sep 2, 2024
189e26d
fix remaining tests with infer_string enabled
jorisvandenbossche Sep 2, 2024
1089eb3
ignore typing issue for now
jorisvandenbossche Sep 2, 2024
8f7e968
move to common.py
jorisvandenbossche Sep 2, 2024
51900f1
Merge remote-tracking branch 'upstream/main' into string-dtype-astype…
jorisvandenbossche Sep 2, 2024
15f45d2
simplify Categorical._str_get_dummies
jorisvandenbossche Sep 2, 2024
4464fb1
small cleanup
jorisvandenbossche Sep 2, 2024
650f694
fix ensure_string_array to not modify extension arrays inplace
jorisvandenbossche Sep 2, 2024
49297f0
Merge remote-tracking branch 'upstream/main' into string-dtype-astype…
jorisvandenbossche Sep 2, 2024
9164dbb
fix ensure_string_array once more + fix is_extension_array_dtype for str
jorisvandenbossche Sep 3, 2024
cf9f855
still xfail TestArrowArray::test_astype_str when not using infer_string
jorisvandenbossche Sep 3, 2024
bd79fc9
ensure maybe_convert_objects copies object dtype input array when inf…
jorisvandenbossche Sep 3, 2024
4c775d1
update test_1d_object_array_does_not_copy test
jorisvandenbossche Sep 3, 2024
3b0b779
Merge remote-tracking branch 'upstream/main' into string-dtype-astype…
jorisvandenbossche Sep 10, 2024
b0276b2
update constructor copy test + do not copy in maybe_convert_objects?
jorisvandenbossche Sep 10, 2024
d413fc6
skip str.get_dummies test for now
jorisvandenbossche Sep 10, 2024
d634da2
Merge remote-tracking branch 'upstream/main' into string-dtype-astype…
jorisvandenbossche Sep 16, 2024
e791330
Merge remote-tracking branch 'upstream/main' into string-dtype-astype…
jorisvandenbossche Sep 21, 2024
db8900c
use pandas_dtype() instead of registry.find
jorisvandenbossche Sep 21, 2024
e6aad17
fix corner cases for calling pandas_dtype
jorisvandenbossche Sep 21, 2024
4e6cf04
add TODO comment in ensure_string_array
jorisvandenbossche Sep 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,11 @@ cpdef ndarray[object] ensure_string_array(

if hasattr(arr, "to_numpy"):

if hasattr(arr, "dtype") and arr.dtype.kind in "mM":
if (
hasattr(arr, "dtype")
and arr.dtype.kind in "mM"
and not hasattr(arr, "_pa_array")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit ugly, but essentially it avoids calling astype(str) here for an ArrowExtensionArray (if that happens to have a datetimelike dtype), because that implementation itself uses ensure_string_array, which would cause a recursion error.

An alternative would be a better check than arr.dtype.kind in "mM" that restricts itself to DatetimeLikeArrayMixin (essentially I need an isinstance(arr, DatetimeLikeArrayMixin), I think). Could potentially do that with some ABC check.

Another alternative is to handle astype(str) specifically in ArrowExtensionArray (currently astype is not implemented there, and it falls back entirely on the base class implementation).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think eventually it would be good to implement ArrowExtensionArray.astype, so I would support eventually handling str there. Could you add a TODO comment here noting that this check can be removed once astype is implemented?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a TODO comment

):
# dtype check to exclude DataFrame
# GH#41409 TODO: not a great place for this
out = arr.astype(str).astype(object)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@

COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
if using_string_dtype():
STRING_DTYPES: list[Dtype] = [str, "U"]
STRING_DTYPES: list[Dtype] = ["U"]
else:
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2685,7 +2685,9 @@ def _str_get_dummies(self, sep: str = "|"):
# sep may not be in categories. Just bail on this.
from pandas.core.arrays import NumpyExtensionArray

return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies(
sep
)

# ------------------------------------------------------------------------
# GroupBy Methods
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True):

return self._box_values(self.asi8.ravel()).reshape(self.shape)

elif is_string_dtype(dtype):
if isinstance(dtype, ExtensionDtype):
arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type]
cls = dtype.construct_array_type()
return cls._from_sequence(arr_object, dtype=dtype, copy=False)
else:
return self._format_native_types()

elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
elif is_string_dtype(dtype):
return self._format_native_types()
elif dtype.kind in "iu":
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ def _from_sequence(
# zero_copy_only to True which caused problems see GH#52076
scalars = np.array(scalars)
# convert non-na-likes to str, and nan-likes to StringDtype().na_value
result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy)
result = lib.ensure_string_array(scalars, na_value=na_value, copy=True)

# Manually creating new array avoids the validation step in the __init__, so is
# faster. Refactor need for validation?
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def _from_sequence(
return cls(pc.cast(scalars, pa.large_string()))

# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
result = lib.ensure_string_array(scalars, copy=True)
return cls(pa.array(result, type=pa.large_string(), from_pandas=True))

@classmethod
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

import numpy as np

from pandas._config import using_string_dtype

from pandas._libs import (
Interval,
Period,
Expand Down Expand Up @@ -1703,6 +1705,12 @@ def pandas_dtype(dtype) -> DtypeObj:
elif isinstance(dtype, (np.dtype, ExtensionDtype)):
return dtype

# builtin aliases
if dtype is str and using_string_dtype():
from pandas.core.arrays.string_ import StringDtype

return StringDtype(na_value=np.nan)

# registered extension types
result = registry.find(dtype)
if result is not None:
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6261,7 +6261,11 @@ def _should_compare(self, other: Index) -> bool:
return False

dtype = _unpack_nested_dtype(other)
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
return (
self._is_comparable_dtype(dtype)
or is_object_dtype(dtype)
or is_string_dtype(dtype)
)

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
is_number,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
Expand Down Expand Up @@ -712,7 +713,7 @@ def _get_indexer(
# left/right get_indexer, compare elementwise, equality -> match
indexer = self._get_indexer_unique_sides(target)

elif not is_object_dtype(target.dtype):
elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)):
# homogeneous scalar index: use IntervalTree
# we should always have self._should_partial_index(target) here
target = self._maybe_convert_i8(target)
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/arrays/floating/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string):

if using_infer_string:
expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan))
tm.assert_extension_array_equal(a.astype("str"), expected)

# TODO(infer_string) this should also be a string array like above
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_extension_array_equal(a.astype(str), expected)
tm.assert_extension_array_equal(a.astype("str"), expected)
else:
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string):

if using_infer_string:
expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan))
tm.assert_extension_array_equal(a.astype("str"), expected)

# TODO(infer_string) this should also be a string array like above
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_extension_array_equal(a.astype(str), expected)
tm.assert_extension_array_equal(a.astype("str"), expected)
else:
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/sparse/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype):
),
(
SparseArray([0, 1, 10]),
str,
SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
np.str_,
SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")),
),
(SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/sparse/test_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string):
[
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
(SparseDtype(int, 1), str, SparseDtype(object, "1")),
(SparseDtype(int, 1), np.str_, SparseDtype(object, "1")),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we even need this test case? I'm not sure what expectations we have around the np.str_ data type

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we can also leave out this test case entirely. I am not entirely sure yet what dtype=np.str_ should mean; I was planning to open an issue about that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be fine with removing it; I feel like this opens up a Pandora's box of issues that aren't worth tracking down at this point in time.

(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
],
)
Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ def test_astype_string_and_object(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
if new_dtype == "string":
# cast to string has to copy to avoid mutating the original during
# the call to ensure_string_array -> never a delayed copy
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df2.iloc[0, 0] = "x"
tm.assert_frame_equal(df, df_orig)
Expand All @@ -105,7 +110,12 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df2 = df.astype(new_dtype)
df_orig = df2.copy()
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
if new_dtype == "string":
# cast to string has to copy to avoid mutating the original during
# the call to ensure_string_array -> never a delayed copy
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df.iloc[0, 0] = "x"
tm.assert_frame_equal(df2, df_orig)
Expand Down Expand Up @@ -220,7 +230,7 @@ def test_convert_dtypes():
df_orig = df.copy()
df2 = df.convert_dtypes()

assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage):
"pyarrow" if HAS_PYARROW else "python", na_value=np.nan
)

with pd.option_context("future.infer_string", True):
# with the default string_storage setting
result = pandas_dtype(str)
assert result == pd.StringDtype(
"pyarrow" if HAS_PYARROW else "python", na_value=np.nan
)

with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
assert result == pd.StringDtype(string_storage, na_value=np.nan)

with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype(str)
assert result == pd.StringDtype(string_storage, na_value=np.nan)

with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/casting.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def test_tolist(self, data):
assert result == expected

def test_astype_str(self, data):
result = pd.Series(data[:5]).astype(str)
expected = pd.Series([str(x) for x in data[:5]], dtype=str)
result = pd.Series(data[:2]).astype(str)
expected = pd.Series([str(x) for x in data[:2]], dtype=str)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,8 @@ def astype(self, dtype, copy=True):
return self.copy()
return self
elif isinstance(dtype, StringDtype):
value = self.astype(str) # numpy doesn't like nested dicts
arr_cls = dtype.construct_array_type()
return arr_cls._from_sequence(value, dtype=dtype, copy=False)
return arr_cls._from_sequence(self, dtype=dtype, copy=False)
elif not copy:
return np.asarray([dict(x) for x in self], dtype=dtype)
else:
Expand Down
20 changes: 0 additions & 20 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
pa_version_under13p0,
pa_version_under14p0,
)
import pandas.util._test_decorators as td

from pandas.core.dtypes.dtypes import (
ArrowDtype,
Expand Down Expand Up @@ -312,25 +311,6 @@ def test_astype_str(self, data, request):
)
super().test_astype_str(data)

@pytest.mark.parametrize(
"nullable_string_dtype",
[
"string[python]",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_astype_string(self, data, nullable_string_dtype, request):
pa_dtype = data.dtype.pyarrow_dtype
if (
pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
) or pa.types.is_duration(pa_dtype):
request.applymarker(
pytest.mark.xfail(
reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
)
)
super().test_astype_string(data, nullable_string_dtype)

def test_from_dtype(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype):
Expand Down
17 changes: 9 additions & 8 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,21 +168,21 @@ def test_astype_str(self):
"d": list(map(str, d._values)),
"e": list(map(str, e._values)),
},
dtype="object",
dtype="str",
)

tm.assert_frame_equal(result, expected)

def test_astype_str_float(self):
def test_astype_str_float(self, using_infer_string):
# see GH#11302
result = DataFrame([np.nan]).astype(str)
expected = DataFrame(["nan"], dtype="object")
expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str")

tm.assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(str)

val = "1.1234567890123457"
expected = DataFrame([val], dtype="object")
expected = DataFrame([val], dtype="str")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("dtype_class", [dict, Series])
Expand Down Expand Up @@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self):
result = df.astype(dtypes)
expected = DataFrame(
{
0: Series(vals[:, 0].astype(str), dtype=object),
0: Series(vals[:, 0].astype(str), dtype="str"),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
Expand Down Expand Up @@ -647,25 +647,26 @@ def test_astype_dt64tz(self, timezone_frame):
# dt64tz->dt64 deprecated
timezone_frame.astype("datetime64[ns]")

def test_astype_dt64tz_to_str(self, timezone_frame):
def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string):
# str formatting
result = timezone_frame.astype(str)
na_value = np.nan if using_infer_string else "NaT"
expected = DataFrame(
[
[
"2013-01-01",
"2013-01-01 00:00:00-05:00",
"2013-01-01 00:00:00+01:00",
],
["2013-01-02", "NaT", "NaT"],
["2013-01-02", na_value, na_value],
[
"2013-01-03",
"2013-01-03 00:00:00-05:00",
"2013-01-03 00:00:00+01:00",
],
],
columns=timezone_frame.columns,
dtype="object",
dtype="str",
)
tm.assert_frame_equal(result, expected)

Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/frame/methods/test_select_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
ei = df[["a"]]
tm.assert_frame_equal(ri, ei)

ri = df.select_dtypes(include=[str])
tm.assert_frame_equal(ri, ei)

def test_select_dtypes_exclude_using_list_like(self):
df = DataFrame(
{
Expand Down Expand Up @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self):
@pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
@pytest.mark.parametrize("arg", ["include", "exclude"])
def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string):
if using_infer_string and dtype == "str":
if using_infer_string and (dtype == "str" or dtype is str):
# this is tested below
pytest.skip("Selecting string columns works with future strings")
df = DataFrame(
Expand Down
Loading
Loading