
Commit 3ad1539

Merge branch 'main' into ref-get_repr

jbrockmendel committed Oct 4, 2023
2 parents: caa0014 + da01e38

Showing 8 changed files with 105 additions and 49 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -23,11 +23,13 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
-

9 changes: 9 additions & 0 deletions pandas/_typing.py
@@ -509,3 +509,12 @@ def closed(self) -> bool:

# Offsets
OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"]

# read_csv: usecols
UsecolsArgType = Union[
SequenceNotStr[Hashable],
range,
AnyArrayLike,
Callable[[HashableT], bool],
None,
]
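
A minimal sketch (not part of the diff) of how downstream code might annotate with the new alias. The function `load_subset` and the file path are hypothetical, and the import is kept under ``TYPE_CHECKING`` since ``pandas._typing`` is a private module:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
    from pandas._typing import UsecolsArgType  # private module, type-check only


def load_subset(path: str, usecols: UsecolsArgType = None) -> pd.DataFrame:
    # The alias covers every accepted form of ``usecols``: a sequence of
    # labels, a range, an array-like, a callable filter, or None.
    return pd.read_csv(path, usecols=usecols)
```
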
31 changes: 25 additions & 6 deletions pandas/core/arrays/arrow/array.py
@@ -1747,7 +1747,7 @@ def __setitem__(self, key, value) -> None:
data = pa.chunked_array([data])
self._pa_array = data

def _rank(
def _rank_calc(
self,
*,
axis: AxisInt = 0,
@@ -1756,9 +1756,6 @@ def _rank(
ascending: bool = True,
pct: bool = False,
):
"""
See Series.rank.__doc__.
"""
if pa_version_under9p0 or axis != 0:
ranked = super()._rank(
axis=axis,
@@ -1773,7 +1770,7 @@ def _rank(
else:
pa_type = pa.uint64()
result = pa.array(ranked, type=pa_type, from_pandas=True)
return type(self)(result)
return result

data = self._pa_array.combine_chunks()
sort_keys = "ascending" if ascending else "descending"
@@ -1812,7 +1809,29 @@ def _rank(
divisor = pc.count(result)
result = pc.divide(result, divisor)

return type(self)(result)
return result

def _rank(
self,
*,
axis: AxisInt = 0,
method: str = "average",
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
):
"""
See Series.rank.__doc__.
"""
return type(self)(
self._rank_calc(
axis=axis,
method=method,
na_option=na_option,
ascending=ascending,
pct=pct,
)
)

def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
"""
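
A hedged sketch of the split this refactor makes: a shared `_rank_calc` that returns the raw pyarrow result, and a thin `_rank` wrapper that decides the output container. `ToyArrowArray` is a stand-in for illustration only (not the pandas class) and assumes pyarrow >= 9.0 for `pc.rank`:

```python
import pyarrow as pa
import pyarrow.compute as pc


class ToyArrowArray:
    """Toy stand-in for an Arrow-backed array, not the pandas class."""

    def __init__(self, values) -> None:
        # Accept either a pyarrow Array or ChunkedArray.
        if isinstance(values, pa.ChunkedArray):
            self._pa_array = values
        else:
            self._pa_array = pa.chunked_array([values])

    def _rank_calc(self, *, ascending: bool = True):
        # Do the computation once and return the raw pyarrow result, so
        # each subclass can decide how to wrap or convert it.
        sort_keys = "ascending" if ascending else "descending"
        return pc.rank(self._pa_array.combine_chunks(), sort_keys=sort_keys)

    def _rank(self, *, ascending: bool = True) -> "ToyArrowArray":
        # Base behavior: wrap the raw result back into the array type.
        # A string subclass can instead convert it to an integer dtype,
        # which is what the ArrowStringArray override in the next file does.
        return type(self)(self._rank_calc(ascending=ascending))


ranked = ToyArrowArray(pa.array(["b", "a", None, "c"]))._rank()
print(ranked._pa_array)
```
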
28 changes: 27 additions & 1 deletion pandas/core/arrays/string_arrow.py
@@ -53,6 +53,7 @@
from collections.abc import Sequence

from pandas._typing import (
AxisInt,
Dtype,
Scalar,
npt,
@@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
def _convert_int_dtype(self, result):
return Int64Dtype().__from_arrow__(result)

def _rank(
self,
*,
axis: AxisInt = 0,
method: str = "average",
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
):
"""
See Series.rank.__doc__.
"""
return self._convert_int_dtype(
self._rank_calc(
axis=axis,
method=method,
na_option=na_option,
ascending=ascending,
pct=pct,
)
)


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
@@ -584,7 +607,10 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))

def _convert_int_dtype(self, result):
result = result.to_numpy()
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result
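
For context on the new `isinstance` branch: a plain pyarrow `Array` will not convert to NumPy zero-copy when it contains nulls, while a `ChunkedArray` converts without the flag. A small illustration (assumes pyarrow is installed):

```python
import pyarrow as pa

ranks = pa.array([1, 2, None, 3], type=pa.uint64())

# Array.to_numpy() defaults to zero_copy_only=True, which raises for data
# containing nulls; zero_copy_only=False falls back to a copy in which
# nulls are represented as NaN.
print(ranks.to_numpy(zero_copy_only=False))

# A ChunkedArray converts without that flag, which is presumably why the
# single-path .to_numpy() call was sufficient before _rank_calc started
# handing a plain Array through this code path.
print(pa.chunked_array([ranks]).to_numpy())
```
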
5 changes: 5 additions & 0 deletions pandas/core/construction.py
@@ -562,7 +562,12 @@ def sanitize_array(
if not is_list_like(data):
if index is None:
raise ValueError("index must be specified when data is not list-like")
if isinstance(data, str) and using_pyarrow_string_dtype():
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype("pyarrow_numpy")
data = construct_1d_arraylike_from_scalar(data, len(index), dtype)

return data

elif isinstance(data, ABCExtensionArray):
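
For reference, the user-facing behavior this change enables, mirroring the new indexing test further down (requires pyarrow):

```python
import pandas as pd

df = pd.DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
    # Setting a scalar string into a new column now infers the
    # arrow-backed string dtype instead of object (GH#55366).
    df.loc[df["x"] == 1, "y"] = "1"

print(df["y"].dtype)  # string[pyarrow_numpy]
```
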
54 changes: 12 additions & 42 deletions pandas/io/parsers/readers.py
@@ -77,11 +77,11 @@
DtypeArg,
DtypeBackend,
FilePath,
HashableT,
IndexLabel,
ReadCsvBuffer,
Self,
StorageOptions,
UsecolsArgType,
)
_doc_read_csv_and_table = (
r"""
@@ -142,7 +142,7 @@
Note: ``index_col=False`` can be used to force pandas to *not* use the first
column as the index, e.g., when you have a malformed file with delimiters at
the end of each line.
usecols : list of Hashable or Callable, optional
usecols : Sequence of Hashable or Callable, optional
Subset of columns to select, denoted either by column labels or column indices.
If list-like, all elements must either
be positional (i.e. integer indices into the document columns) or strings
@@ -645,10 +645,7 @@ def read_csv(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -707,10 +704,7 @@ def read_csv(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -770,10 +764,7 @@ def read_csv(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -833,10 +824,7 @@ def read_csv(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -907,10 +895,7 @@ def read_csv(
header: int | Sequence[int] | None | Literal["infer"] = "infer",
names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
index_col: IndexLabel | Literal[False] | None = None,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = None,
usecols: UsecolsArgType = None,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
@@ -1005,10 +990,7 @@ def read_table(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -1065,10 +1047,7 @@ def read_table(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -1125,10 +1104,7 @@ def read_table(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -1185,10 +1161,7 @@ def read_table(
header: int | Sequence[int] | None | Literal["infer"] = ...,
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = ...,
usecols: UsecolsArgType = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters: Mapping[Hashable, Callable] | None = ...,
@@ -1258,10 +1231,7 @@ def read_table(
header: int | Sequence[int] | None | Literal["infer"] = "infer",
names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
index_col: IndexLabel | Literal[False] | None = None,
usecols: list[HashableT]
| tuple[HashableT]
| Callable[[Hashable], bool]
| None = None,
usecols: UsecolsArgType = None,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
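
As a usage reminder (not part of the diff), every form named in `UsecolsArgType` is accepted by `read_csv` at runtime:

```python
import io

import pandas as pd

data = "a,b,c\n1,2,3\n4,5,6\n"

print(pd.read_csv(io.StringIO(data), usecols=["a", "c"]))                # column labels
print(pd.read_csv(io.StringIO(data), usecols=range(2)))                  # positional range
print(pd.read_csv(io.StringIO(data), usecols=lambda name: name != "b"))  # callable filter
```
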
13 changes: 13 additions & 0 deletions pandas/tests/frame/indexing/test_indexing.py
@@ -1905,6 +1905,19 @@ def test_adding_new_conditional_column() -> None:
tm.assert_frame_equal(df, expected)


def test_add_new_column_infer_string():
# GH#55366
pytest.importorskip("pyarrow")
df = DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame(
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
)
tm.assert_frame_equal(df, expected)


class TestSetitemValidation:
# This is adapted from pandas/tests/arrays/masked/test_indexing.py
# but checks for warnings instead of errors.
12 changes: 12 additions & 0 deletions pandas/tests/frame/methods/test_rank.py
@@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected):
df.rank()
result = df.rank(numeric_only=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype, exp_dtype",
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
)
def test_rank_string_dtype(self, dtype, exp_dtype):
# GH#55362
pytest.importorskip("pyarrow")
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
result = obj.rank(method="first")
expected = Series([1, 2, None, 3], dtype=exp_dtype)
tm.assert_series_equal(result, expected)
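
For reference, the user-facing behavior this new rank test pins down (requires pyarrow):

```python
import pandas as pd

s_pa = pd.Series(["foo", "foo", None, "foo"], dtype="string[pyarrow]")
print(s_pa.rank(method="first"))   # 1, 2, <NA>, 3 with Int64 dtype

s_npy = pd.Series(["foo", "foo", None, "foo"], dtype="string[pyarrow_numpy]")
print(s_npy.rank(method="first"))  # 1.0, 2.0, NaN, 3.0 with float64 dtype
```
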
