From 7134f2c14e614980bcf366f979a0f85aafacbde6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 15 Sep 2023 11:10:52 +0200 Subject: [PATCH 01/17] CoW: Clear dead references every time we add a new one (#55008) --- pandas/_libs/internals.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 7a9a3b84fd69f..3b1a6bc7436c3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -897,6 +897,11 @@ cdef class BlockValuesRefs: else: self.referenced_blocks = [] + def _clear_dead_references(self) -> None: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. @@ -905,6 +910,7 @@ cdef class BlockValuesRefs: blk : Block The block that the new references should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(blk)) def add_index_reference(self, index: object) -> None: @@ -915,6 +921,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(index)) def has_reference(self) -> bool: @@ -927,8 +934,6 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self._clear_dead_references() # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 From cc58350c92a9e24bef586cd1b898c01640b33abc Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 15 Sep 2023 16:06:24 -0400 Subject: [PATCH 02/17] PERF: groupby aggregations on pyarrow timestamp and duration types (#55131) * PERF: groupby aggregations on pyarrow timestamp and duration types * mypy * update --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 54e855f61905a..03b69b53836ad 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -177,6 +177,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) +- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2b2e0c843564f..a329c37c77449 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1993,9 +1993,17 @@ def _groupby_op( **kwargs, ) - masked = self._to_masked() + # maybe convert to a compatible dtype optimized for groupby + values: ExtensionArray + pa_type = self._pa_array.type + if pa.types.is_timestamp(pa_type): + values = self._to_datetimearray() + elif pa.types.is_duration(pa_type): + values = self._to_timedeltaarray() + else: + values = 
self._to_masked() - result = masked._groupby_op( + result = values._groupby_op( how=how, has_dropped_na=has_dropped_na, min_count=min_count, From 85062f0e09590b7da3edfe1cabd427f224180b9e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 15 Sep 2023 22:03:21 -0400 Subject: [PATCH 03/17] CI: Unpin Cython (#54784) * CI: Unpin Cython * Update actions-311-numpydev.yaml * disable cache for now * try bumping meson * bump meson everywhere --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 12 ++++++------ ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-39-minimum_versions.yaml | 4 ++-- ci/deps/actions-39.yaml | 4 ++-- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-310-arm64.yaml | 4 ++-- environment.yml | 2 +- meson.build | 2 +- pyproject.toml | 2 +- requirements-dev.txt | 2 +- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f2b426269098b..98c6226b14075 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,8 +233,8 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -271,8 +271,8 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . 
python -m pip list --no-cache-dir @@ -342,10 +342,10 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata "cython<3.0.1" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index python -m pip list diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 927003b13d6be..682708d811878 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 00df41cce3bae..d09d4e5dea648 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index acf94fce5288b..7fd3a65ec91f8 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,9 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 + - cython>=0.29.33 # test dependencies - pytest>=7.3.2 @@ -25,7 +26,6 @@ dependencies: - pip - pip: - - "cython<3.0.1" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index e850f69f945c5..893341350f4ef 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 - - cython>=0.29.33, <3.0.1 + - meson[ninja]=1.2.1 + - cython>=0.29.33 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d50ea20da1e0c..f458546fc3a1b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 10862630bd596..b7cc6e9e891ce 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,8 +8,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 
904b55a813a9f..4ee8dc0c6d3fc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 23c3706f43dad..4923c94ab08f3 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 4060cea73e7f6..c520fa17551e0 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/environment.yml b/environment.yml index 8deae839f5408..8103ee29c84a7 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - cython=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/meson.build b/meson.build index e0e533ffade97..68018046c081f 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'c', 'cpp', 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.0.1', + meson_version: '>=1.2.1', default_options: [ 'buildtype=release', 'c_std=c99' diff --git a/pyproject.toml b/pyproject.toml index 9e579036c128b..7807a6cc6368d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. 
requires = [ "meson-python==0.13.1", - "meson==1.0.1", + "meson==1.2.1", "wheel", "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default diff --git a/requirements-dev.txt b/requirements-dev.txt index 01e0701bc39a7..77afa9005029c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ pip versioneer[toml] cython==0.29.33 -meson[ninja]==1.0.1 +meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov From 7b4df5a0b7a58566c673bd32e7280ce74a95fcb9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 16 Sep 2023 15:46:33 +0200 Subject: [PATCH 04/17] DOCS: temporary pin pydata-sphinx-theme to 0.13 (#55159) --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 8103ee29c84a7..5caa57ef37ee8 100644 --- a/environment.yml +++ b/environment.yml @@ -85,7 +85,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme + - pydata-sphinx-theme=0.13 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/requirements-dev.txt b/requirements-dev.txt index 77afa9005029c..b8e7e376378c6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -60,7 +60,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme +pydata-sphinx-theme==0.13 pytest-cython sphinx sphinx-design From 52bed6dfa9ba99df21fba2740da2a5a2832ef96c Mon Sep 17 00:00:00 2001 From: Deepak George Date: Sun, 17 Sep 2023 14:45:25 +0200 Subject: [PATCH 05/17] DOC: fix see also links in pandas.DataFrame.agg docstring (#55162) --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8fcb91c846826..124c21839afe7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9829,11 +9829,11 @@ def _gotitem( -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. - pandas.core.groupby.GroupBy : Perform operations over groups. - pandas.core.resample.Resampler : Perform operations over resampled bins. - pandas.core.window.Rolling : Perform operations over rolling window. - pandas.core.window.Expanding : Perform operations over expanding window. - pandas.core.window.ExponentialMovingWindow : Perform operation over exponential + pandas.DataFrame.groupby : Perform operations over groups. + pandas.DataFrame.resample : Perform operations over resampled bins. + pandas.DataFrame.rolling : Perform operations over rolling window. + pandas.DataFrame.expanding : Perform operations over expanding window. + pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential weighted window. """ ) From f4f598fb36c0809da01cade2d5d832ee09564101 Mon Sep 17 00:00:00 2001 From: Elahe Sharifi Date: Sun, 17 Sep 2023 14:48:09 +0200 Subject: [PATCH 06/17] DOC: remove duplicated sentence in return section of pandas.DataFrame.agg/asof docstrings (#55165) --- pandas/core/generic.py | 2 -- pandas/core/shared_docs.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5c303e2a73bd7..bbaa6d3faeb05 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8323,8 +8323,6 @@ def asof(self, where, subset=None): * DataFrame : when `self` is a DataFrame and `where` is an array-like - Return scalar, Series, or DataFrame. 
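For reference, the return-shape contract that the bullet list above keeps (and that the deleted sentence merely repeated) looks like this in a minimal sketch, not part of the patch:

>>> import numpy as np
>>> import pandas as pd
>>> ser = pd.Series([1.0, 2.0, np.nan, 4.0], index=[10, 20, 30, 40])
>>> ser.asof(25)  # scalar ``where`` -> scalar; NaN rows are skipped
2.0
>>> ser.asof([25, 35])  # list-like ``where`` -> Series
25    2.0
35    2.0
dtype: float64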
- See Also -------- merge_asof : Perform an asof merge. Similar to left join. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 9da103e13f691..ec219941a3afc 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -34,8 +34,6 @@ * scalar : when Series.agg is called with single function * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. {see_also} Notes ----- From 36aa531b0f0e31cac32e56633ab90eb4b3fccda8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 18 Sep 2023 16:15:09 +0000 Subject: [PATCH 07/17] ENH: add Series.struct accessor for ArrowDtype[struct] (#54977) Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.explode() -- convert all fields into a DataFrame --- doc/source/reference/series.rst | 23 ++ doc/source/whatsnew/v2.2.0.rst | 28 +++ pandas/core/arrays/arrow/__init__.py | 3 +- pandas/core/arrays/arrow/accessors.py | 196 ++++++++++++++++++ pandas/core/series.py | 2 + .../series/accessors/test_struct_accessor.py | 147 +++++++++++++ 6 files changed, 398 insertions(+), 1 deletion(-) create mode 100644 pandas/core/arrays/arrow/accessors.py create mode 100644 pandas/tests/series/accessors/test_struct_accessor.py diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 58351bab07b22..9acbab7a42800 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.from_coo Series.sparse.to_coo + +.. _api.series.struct: + +Struct accessor +~~~~~~~~~~~~~~~ + +Arrow struct-dtype specific methods and attributes are provided under the +``Series.struct`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + Series.struct.dtypes + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.struct.field + Series.struct.explode + + .. _api.series.flags: Flags diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 03b69b53836ad..9e55399afe5af 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -36,6 +36,34 @@ There are two advantages of this engine: For more, see :ref:`io.calamine` in the user guide on IO tools. +.. _whatsnew_220.enhancements.struct_accessor: + +Series.struct accessor for PyArrow structured data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.struct`` accessor provides attributes and methods for processing +data with ``struct[pyarrow]`` dtype Series. For example, +:meth:`Series.struct.explode` converts PyArrow structured data to a pandas +DataFrame. (:issue:`54938`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + {"project": "pandas", "version": "2.2.0"}, + {"project": "numpy", "version": "1.25.2"}, + {"project": "pyarrow", "version": "13.0.0"}, + ], + dtype=pd.ArrowDtype( + pa.struct([ + ("project", pa.string()), + ("version", pa.string()), + ]) + ), + ) + series.struct.explode() + .. 
_whatsnew_220.enhancements.enhancement2: enhancement2 diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index 58b268cbdd221..a3d33f91f597d 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -1,3 +1,4 @@ +from pandas.core.arrays.arrow.accessors import StructAccessor from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray"] +__all__ = ["ArrowExtensionArray", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000000..e4ed255476e8e --- /dev/null +++ b/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,196 @@ +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.dtypes.dtypes import ArrowDtype + +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + + +class StructAccessor: + """ + Accessor object for structured data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow struct data. + """ + + _validation_msg = ( + "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}." + ) + + def __init__(self, data=None) -> None: + self._parent = data + self._validate(data) + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not pa.types.is_struct(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def dtypes(self) -> Series: + """ + Return the dtype object of each child field of the struct. + + Returns + ------- + pandas.Series + The data type of each child field. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes + version int64[pyarrow] + project string[pyarrow] + dtype: object + """ + from pandas import ( + Index, + Series, + ) + + pa_type = self._parent.dtype.pyarrow_dtype + types = [ArrowDtype(struct.type) for struct in pa_type] + names = [struct.name for struct in pa_type] + return Series(types, index=Index(names)) + + def field(self, name_or_index: str | int) -> Series: + """ + Extract a child field of a struct as a Series. + + Parameters + ---------- + name_or_index : str | int + Name or index of the child field to extract. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. 
+ + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + """ + from pandas import Series + + pa_arr = self._parent.array._pa_array + if isinstance(name_or_index, int): + index = name_or_index + elif isinstance(name_or_index, str): + index = pa_arr.type.get_field_index(name_or_index) + else: + raise ValueError( + "name_or_index must be an int or str, " + f"got {type(name_or_index).__name__}" + ) + + pa_field = pa_arr.type[index] + field_arr = pc.struct_field(pa_arr, [index]) + return Series( + field_arr, + dtype=ArrowDtype(field_arr.type), + index=self._parent.index, + name=pa_field.name, + ) + + def explode(self) -> DataFrame: + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + from pandas import concat + + pa_type = self._parent.dtype.pyarrow_dtype + return concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9b5c8829fd5ff..e0e27581ef7e2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -101,6 +101,7 @@ from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.arrow import StructAccessor from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor from pandas.core.construction import ( @@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) + struct = CachedAccessor("struct", StructAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py new file mode 100644 index 0000000000000..c645bb6807052 --- /dev/null +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -0,0 +1,147 @@ +import re + +import pytest + +from pandas import ( + ArrowDtype, + DataFrame, + Index, + Series, +) +import pandas._testing as tm +from pandas.core.arrays.arrow.accessors import StructAccessor + +pa = pytest.importorskip("pyarrow") + + +def test_struct_accessor_dtypes(): + ser = Series( + [], + dtype=ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("string_col", pa.string()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ), + ), + ] + ) + ), + ) + actual = ser.struct.dtypes + expected = Series( + [ + ArrowDtype(pa.int64()), + ArrowDtype(pa.string()), + ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ) + ), 
+ ], + index=Index(["int_col", "string_col", "struct_col"]), + ) + tm.assert_series_equal(actual, expected) + + +def test_struct_accessor_field(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"rice": 1.0, "maize": -1, "wheat": "a"}, + {"rice": 2.0, "maize": 0, "wheat": "b"}, + {"rice": 3.0, "maize": 1, "wheat": "c"}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("rice", pa.float64()), + ("maize", pa.int64()), + ("wheat", pa.string()), + ] + ) + ), + index=index, + ) + by_name = ser.struct.field("maize") + by_name_expected = Series( + [-1, 0, 1], + dtype=ArrowDtype(pa.int64()), + index=index, + name="maize", + ) + tm.assert_series_equal(by_name, by_name_expected) + + by_index = ser.struct.field(2) + by_index_expected = Series( + ["a", "b", "c"], + dtype=ArrowDtype(pa.string()), + index=index, + name="wheat", + ) + tm.assert_series_equal(by_index, by_index_expected) + + +def test_struct_accessor_field_with_invalid_name_or_index(): + ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) + + with pytest.raises(ValueError, match="name_or_index must be an int or str"): + ser.struct.field(1.1) + + +def test_struct_accessor_explode(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"painted": 1, "snapping": {"sea": "green"}}, + {"painted": 2, "snapping": {"sea": "leatherback"}}, + {"painted": 3, "snapping": {"sea": "hawksbill"}}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("painted", pa.int64()), + ("snapping", pa.struct([("sea", pa.string())])), + ] + ) + ), + index=index, + ) + actual = ser.struct.explode() + expected = DataFrame( + { + "painted": Series([1, 2, 3], index=index, dtype=ArrowDtype(pa.int64())), + "snapping": Series( + [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}], + index=index, + dtype=ArrowDtype(pa.struct([("sea", pa.string())])), + ), + }, + ) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize( + "invalid", + [ + pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"), + pytest.param( + Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow" + ), + ], +) +def test_struct_accessor_api_for_invalid(invalid): + msg = re.escape(StructAccessor._validation_msg.format(dtype=invalid.dtype)) + + with pytest.raises(AttributeError, match=msg): + invalid.struct From 6a2598052dd214bfb84ac8d0f6ecbb7586546cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= <8431159+mtsokol@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:17:33 +0200 Subject: [PATCH 08/17] MAINT: Cleanup expired ndarray methods (#55180) --- doc/source/user_guide/gotchas.rst | 2 +- pandas/io/stata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index c00a236ff4e9d..99c85ac66623d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -379,7 +379,7 @@ constructors using something similar to the following: .. 
ipython:: python x = np.array(list(range(10)), ">i4") # big endian - newx = x.byteswap().newbyteorder() # force native byteorder + newx = x.byteswap().view(x.dtype.newbyteorder()) # force native byteorder s = pd.Series(newx) See `the NumPy documentation on byte order diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7f19e62f40774..a6d17a604a23f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1773,7 +1773,7 @@ def read( self._data_read = True # if necessary, swap the byte order to native here if self._byteorder != self._native_byteorder: - raw_data = raw_data.byteswap().newbyteorder() + raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) if convert_categoricals: self._read_value_labels() From 8768cea987ae16e1e2c150d4acc2581ffff5421f Mon Sep 17 00:00:00 2001 From: jfadia <90651438+jfadia@users.noreply.github.com> Date: Mon, 18 Sep 2023 09:24:43 -0700 Subject: [PATCH 09/17] DOC: Update read_csv docs to reflect behaviour with `parse_dates` (#55176) * Update read_csv docs * Fix line length --- pandas/io/parsers/readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e826aad478059..acf35ebd6afe5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -238,7 +238,8 @@ default False The behavior is as follows: - * ``bool``. If ``True`` -> try parsing the index. + * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to + ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse From 699a8e40a8257bba02b7b5bfa2218f272bdf1743 Mon Sep 17 00:00:00 2001 From: Moritz Schubert <35040095+mcnoat@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:26:38 +0200 Subject: [PATCH 10/17] DOC: complete parameter descriptions for DataFrame.insert's docstring (#55172) complete parameter descriptions for DataFrame.insert's docstring issue: https://github.com/jorisvandenbossche/pydata-amsterdam-pandas-sprint/issues/1 - bullet point item #5: "complete the parameter descriptions" --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 124c21839afe7..4e87e90278e7b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4915,7 +4915,9 @@ def insert( column : str, number, or hashable object Label of the inserted column. value : Scalar, Series, or array-like + Content of the inserted column. allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. 
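A quick sketch of how the two parameters documented above interact (illustrative only; ``insert`` modifies the frame in place):

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "c": [5, 6]})
>>> df.insert(1, "b", [3, 4])  # value may be a scalar, Series, or array-like
>>> df.insert(0, "a", [0, 0], allow_duplicates=True)  # duplicate label "a"
>>> list(df.columns)
['a', 'a', 'b', 'c']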
See Also -------- From 2b8decdccea5a70dc92ad58283edecf8b1b049bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Sep 2023 09:32:27 -0700 Subject: [PATCH 11/17] CLN: de-duplicate code in _matplotlib.converter (#55155) * CLN: de-duplicate _matplotlib.converters * REF: avoid changing type of vmin/vmax * CLN: de-duplicate * REF: de-duplicate, annotate * REF: standardize * comment * mypy fixup --- pandas/_libs/tslibs/dtypes.pyx | 1 - pandas/plotting/_matplotlib/converter.py | 202 +++++++++++------------ pandas/tests/plotting/test_converter.py | 4 +- 3 files changed, 104 insertions(+), 103 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index bafde9f3b237b..f0f73d242cdf0 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -460,7 +460,6 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil: return NPY_DATETIMEUNIT.NPY_FR_D -# TODO: use in _matplotlib.converter? cpdef int64_t periods_per_day( NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns ) except? -1: diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 33aeaa6d81406..3f77a32014bff 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -11,7 +11,6 @@ from typing import ( TYPE_CHECKING, Any, - Final, cast, ) @@ -30,8 +29,14 @@ Timestamp, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup -from pandas._typing import F +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + periods_per_day, +) +from pandas._typing import ( + F, + npt, +) from pandas.core.dtypes.common import ( is_float, @@ -60,15 +65,6 @@ from pandas._libs.tslibs.offsets import BaseOffset -# constants -HOURS_PER_DAY: Final = 24.0 -MIN_PER_HOUR: Final = 60.0 -SEC_PER_MIN: Final = 60.0 - -SEC_PER_HOUR: Final = SEC_PER_MIN * MIN_PER_HOUR -SEC_PER_DAY: Final = SEC_PER_HOUR * HOURS_PER_DAY - -MUSEC_PER_DAY: Final = 10**6 * SEC_PER_DAY _mpl_units = {} # Cache for units overwritten by us @@ -495,7 +491,7 @@ def _get_default_annual_spacing(nyears) -> tuple[int, int]: return (min_spacing, maj_spacing) -def period_break(dates: PeriodIndex, period: str) -> np.ndarray: +def _period_break(dates: PeriodIndex, period: str) -> npt.NDArray[np.intp]: """ Returns the indices where the given period changes. @@ -506,12 +502,17 @@ def period_break(dates: PeriodIndex, period: str) -> np.ndarray: period : str Name of the period to monitor. """ + mask = _period_break_mask(dates, period) + return np.nonzero(mask)[0] + + +def _period_break_mask(dates: PeriodIndex, period: str) -> npt.NDArray[np.bool_]: current = getattr(dates, period) previous = getattr(dates - 1 * dates.freq, period) - return np.nonzero(current - previous)[0] + return current != previous -def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: +def has_level_label(label_flags: npt.NDArray[np.intp], vmin: float) -> bool: """ Returns true if the ``label_flags`` indicate there is at least one label for this level. 
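The de-duplication in the hunks below rests on one observation: a period "breaks" on a field exactly where that field differs from the same field one step earlier. A standalone illustration using only public API (the private helpers are a vectorized form of this):

>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.period_range("2000-01-28", periods=6, freq="D")
>>> mask = dates.month != (dates - 1 * dates.freq).month
>>> np.nonzero(mask)[0]  # position of 2000-02-01, the month break
array([4])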
@@ -527,54 +528,59 @@ def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: return True -def _daily_finder(vmin, vmax, freq: BaseOffset): +def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] freq_group = FreqGroup.from_period_dtype_code(dtype_code) - periodsperday = -1 + ppd = -1 # placeholder for above-day freqs if dtype_code >= FreqGroup.FR_HR.value: - if freq_group == FreqGroup.FR_NS: - periodsperday = 24 * 60 * 60 * 1000000000 - elif freq_group == FreqGroup.FR_US: - periodsperday = 24 * 60 * 60 * 1000000 - elif freq_group == FreqGroup.FR_MS: - periodsperday = 24 * 60 * 60 * 1000 - elif freq_group == FreqGroup.FR_SEC: - periodsperday = 24 * 60 * 60 - elif freq_group == FreqGroup.FR_MIN: - periodsperday = 24 * 60 - elif freq_group == FreqGroup.FR_HR: - periodsperday = 24 - else: # pragma: no cover - raise ValueError(f"unexpected frequency: {dtype_code}") - periodsperyear = 365 * periodsperday - periodspermonth = 28 * periodsperday - + # error: "BaseOffset" has no attribute "_creso" + ppd = periods_per_day(freq._creso) # type: ignore[attr-defined] + ppm = 28 * ppd + ppy = 365 * ppd elif freq_group == FreqGroup.FR_BUS: - periodsperyear = 261 - periodspermonth = 19 + ppm = 19 + ppy = 261 elif freq_group == FreqGroup.FR_DAY: - periodsperyear = 365 - periodspermonth = 28 + ppm = 28 + ppy = 365 elif freq_group == FreqGroup.FR_WK: - periodsperyear = 52 - periodspermonth = 3 - else: # pragma: no cover - raise ValueError("unexpected frequency") + ppm = 3 + ppy = 52 + elif freq_group == FreqGroup.FR_MTH: + ppm = 1 + ppy = 12 + elif freq_group == FreqGroup.FR_QTR: + ppm = -1 # placeholder + ppy = 4 + elif freq_group == FreqGroup.FR_ANN: + ppm = -1 # placeholder + ppy = 1 + else: + raise NotImplementedError(f"Unsupported frequency: {dtype_code}") + + return ppd, ppm, ppy + + +def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + + periodsperday, periodspermonth, periodsperyear = _get_periods_per_ymd(freq) # save this for later usage vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 - (vmin, vmax) = ( - Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq), + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, ) - assert isinstance(vmin, Period) - assert isinstance(vmax, Period) - span = vmax.ordinal - vmin.ordinal + 1 - dates_ = period_range(start=vmin, end=vmax, freq=freq) + # Initialize the output info = np.zeros( span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] ) @@ -595,45 +601,38 @@ def first_label(label_flags): # Case 1. 
Less than a month if span <= periodspermonth: - day_start = period_break(dates_, "day") - month_start = period_break(dates_, "month") + day_start = _period_break(dates_, "day") + month_start = _period_break(dates_, "month") + year_start = _period_break(dates_, "year") - def _hour_finder(label_interval, force_year_start) -> None: - _hour = dates_.hour - _prev_hour = (dates_ - 1 * dates_.freq).hour - hour_start = (_hour - _prev_hour) != 0 + def _hour_finder(label_interval: int, force_year_start: bool) -> None: + target = dates_.hour + mask = _period_break_mask(dates_, "hour") info_maj[day_start] = True - info_min[hour_start & (_hour % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" if force_year_start and not has_level_label(year_start, vmin_orig): info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" - def _minute_finder(label_interval) -> None: - hour_start = period_break(dates_, "hour") - _minute = dates_.minute - _prev_minute = (dates_ - 1 * dates_.freq).minute - minute_start = (_minute - _prev_minute) != 0 + def _minute_finder(label_interval: int) -> None: + target = dates_.minute + hour_start = _period_break(dates_, "hour") + mask = _period_break_mask(dates_, "minute") info_maj[hour_start] = True - info_min[minute_start & (_minute % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" - def _second_finder(label_interval) -> None: - minute_start = period_break(dates_, "minute") - _second = dates_.second - _prev_second = (dates_ - 1 * dates_.freq).second - second_start = (_second - _prev_second) != 0 - info["maj"][minute_start] = True - info["min"][second_start & (_second % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + def _second_finder(label_interval: int) -> None: + target = dates_.second + minute_start = _period_break(dates_, "minute") + mask = _period_break_mask(dates_, "second") + info_maj[minute_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M:%S" info_fmt[day_start] = "%H:%M:%S\n%d-%b" info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" @@ -672,8 +671,6 @@ def _second_finder(label_interval) -> None: else: info_maj[month_start] = True info_min[day_start] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] info_fmt[day_start] = "%d" info_fmt[month_start] = "%d\n%b" info_fmt[year_start] = "%d\n%b\n%Y" @@ -685,15 +682,15 @@ def _second_finder(label_interval) -> None: # Case 2. 
Less than three months elif span <= periodsperyear // 4: - month_start = period_break(dates_, "month") + month_start = _period_break(dates_, "month") info_maj[month_start] = True if dtype_code < FreqGroup.FR_HR.value: info["min"] = True else: - day_start = period_break(dates_, "day") + day_start = _period_break(dates_, "day") info["min"][day_start] = True - week_start = period_break(dates_, "week") - year_start = period_break(dates_, "year") + week_start = _period_break(dates_, "week") + year_start = _period_break(dates_, "year") info_fmt[week_start] = "%d" info_fmt[month_start] = "\n\n%b" info_fmt[year_start] = "\n\n%b\n%Y" @@ -704,9 +701,9 @@ def _second_finder(label_interval) -> None: info_fmt[first_label(month_start)] = "\n\n%b\n%Y" # Case 3. Less than 14 months ............... elif span <= 1.15 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") - week_start = period_break(dates_, "week") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") + week_start = _period_break(dates_, "week") info_maj[month_start] = True info_min[week_start] = True info_min[year_start] = False @@ -717,17 +714,17 @@ def _second_finder(label_interval) -> None: info_fmt[first_label(month_start)] = "%b\n%Y" # Case 4. Less than 2.5 years ............... elif span <= 2.5 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") + month_start = _period_break(dates_, "month") info_maj[quarter_start] = True info_min[month_start] = True info_fmt[quarter_start] = "%b" info_fmt[year_start] = "%b\n%Y" # Case 4. Less than 4 years ................. elif span <= 4 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") info_maj[year_start] = True info_min[month_start] = True info_min[year_start] = False @@ -738,15 +735,15 @@ def _second_finder(label_interval) -> None: info_fmt[year_start] = "%b\n%Y" # Case 5. Less than 11 years ................ elif span <= 11 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") info_maj[year_start] = True info_min[quarter_start] = True info_min[year_start] = False info_fmt[year_start] = "%Y" # Case 6. More than 12 years ................ else: - year_start = period_break(dates_, "year") + year_start = _period_break(dates_, "year") year_break = dates_[year_start].year nyears = span / periodsperyear (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) @@ -759,8 +756,8 @@ def _second_finder(label_interval) -> None: return info -def _monthly_finder(vmin, vmax, freq): - periodsperyear = 12 +def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -795,6 +792,7 @@ def _monthly_finder(vmin, vmax, freq): quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True # TODO: Check the following : is it really info['fmt'] ? 
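The monthly finder can work on raw ordinals because ``freq="M"`` ordinals are anchored at 1970-01, so ``ordinal % 12 == 0`` falls on a January and ``ordinal % 3 == 0`` on a quarter start. A quick check, outside the patch:

>>> import pandas as pd
>>> pd.Period("2000-01", freq="M").ordinal  # 30 years * 12 months
360
>>> pd.Period("2000-04", freq="M").ordinal % 3  # April opens a quarter
0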
+ # 2023-09-15 this is reached in test_finder_monthly info["fmt"][quarter_start] = True info["min"] = True @@ -829,8 +827,8 @@ def _monthly_finder(vmin, vmax, freq): return info -def _quarterly_finder(vmin, vmax, freq): - periodsperyear = 4 +def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 @@ -876,7 +874,8 @@ def _quarterly_finder(vmin, vmax, freq): return info -def _annual_finder(vmin, vmax, freq): +def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -889,8 +888,9 @@ def _annual_finder(vmin, vmax, freq): (min_anndef, maj_anndef) = _get_default_annual_spacing(span) major_idx = dates_ % maj_anndef == 0 + minor_idx = dates_ % min_anndef == 0 info["maj"][major_idx] = True - info["min"][(dates_ % min_anndef == 0)] = True + info["min"][minor_idx] = True info["fmt"][major_idx] = "%Y" return info @@ -1087,7 +1087,7 @@ def format_timedelta_ticks(x, pos, n_decimals: int) -> str: """ Convert seconds to 'D days HH:MM:SS.F' """ - s, ns = divmod(x, 10**9) + s, ns = divmod(x, 10**9) # TODO(non-nano): this looks like it assumes ns m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 0108079f1110f..7d574b86cef36 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,6 +10,8 @@ import pandas._config.config as cf +from pandas._libs.tslibs import to_offset + from pandas import ( Index, Period, @@ -390,7 +392,7 @@ def test_quarterly_finder(year_span): pytest.skip("the quarterly finder is only invoked if the span is >= 45") nyears = span / 4 (min_anndef, maj_anndef) = converter._get_default_annual_spacing(nyears) - result = converter._quarterly_finder(vmin, vmax, "Q") + result = converter._quarterly_finder(vmin, vmax, to_offset("Q")) quarters = PeriodIndex( arrays.PeriodArray(np.array([x[0] for x in result]), dtype="period[Q]") ) From abbb86e7cc69e53f5ee22fdf3bd5a78e21ad2351 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:37:34 +0200 Subject: [PATCH 12/17] docs: add explanation about `freq` units in `Timedelta.round()` (#55149) --- pandas/_libs/tslibs/timedeltas.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 2d9fe93c397cb..2178e972b4d08 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1942,6 +1942,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the rounding resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Returns ------- @@ -1969,6 +1970,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the flooring resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Examples -------- @@ -1988,6 +1990,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the ceiling resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. 
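In practice the accepted unit strings are exactly those the constructor parses, e.g. ``"h"``, ``"min"``, ``"s"``. A short illustration (assuming a pandas 2.x install):

>>> import pandas as pd
>>> td = pd.Timedelta("1 day 12:34:56")
>>> td.round("min")
Timedelta('1 days 12:35:00')
>>> td.floor("h")
Timedelta('1 days 12:00:00')
>>> td.ceil("h")
Timedelta('1 days 13:00:00')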
Examples -------- From e0c5b87850f924a2e7ab680979c628784d96181b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 18 Sep 2023 12:39:40 -0400 Subject: [PATCH 13/17] BUG: Index.difference not always returning a unique set of values (#55113) * BUG: Index.difference not always returning a unique set of values * whatsnew * udpate test * update test --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/test_setops.py | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9e55399afe5af..b229c9d22367e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -262,7 +262,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8756bb3f3c81b..3adb4dfa227db 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3598,14 +3598,14 @@ def difference(self, other, sort=None): if len(other) == 0: # Note: we do not (yet) sort even if sort=None GH#24959 - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result if not self._should_compare(other): # Nothing matches -> difference is everything - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a64994efec85a..d6304774b87c4 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -796,11 +796,21 @@ def test_difference_name_preservation(self, index, second_name, expected, sort): assert result.name == expected def test_difference_empty_arg(self, index, sort): - first = index[5:20] + first = index.copy() + first = first[5:20] first.name = "name" result = first.difference([], sort) + expected = index[5:20].unique() + expected.name = "name" + tm.assert_index_equal(result, expected) - tm.assert_index_equal(result, first) + def test_difference_should_not_compare(self): + # GH 55113 + left = Index([1, 1]) + right = Index([True]) + result = left.difference(right) + expected = Index([1]) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_difference_identity(self, index, sort): From 271144ae64d44399e9e23d147aa83281925a21fc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Sep 2023 06:54:12 -1000 Subject: [PATCH 14/17] BUG: Interval with Timestamp with tz shows tz (#55035) * BUG: Interval with Timestamp with tz shows tz * Add whatsnew * Fix tests --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_libs/interval.pyx | 21 +++--------------- pandas/_libs/tslibs/timestamps.pyx | 12 ---------- pandas/core/indexes/interval.py | 10 +++++---- pandas/tests/frame/methods/test_to_csv.py | 2 +- pandas/tests/indexes/interval/test_formats.py | 22 ++++++++++++++++++- pandas/tests/io/excel/test_writers.py | 8 +++---- 7 files changed, 36 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b229c9d22367e..e32f357ad3777 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -257,7 
+257,7 @@ Strings Interval ^^^^^^^^ -- +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) - Indexing diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index e07d80dd04b31..82f69c1dedd53 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -478,31 +478,16 @@ cdef class Interval(IntervalMixin): args = (self.left, self.right, self.closed) return (type(self), args) - def _repr_base(self): - left = self.left - right = self.right - - # TODO: need more general formatting methodology here - if isinstance(left, _Timestamp) and isinstance(right, _Timestamp): - left = left._short_repr - right = right._short_repr - - return left, right - def __repr__(self) -> str: - - left, right = self._repr_base() - disp = str if isinstance(left, np.generic) else repr + disp = str if isinstance(self.left, (np.generic, _Timestamp)) else repr name = type(self).__name__ - repr_str = f"{name}({disp(left)}, {disp(right)}, closed={repr(self.closed)})" + repr_str = f"{name}({disp(self.left)}, {disp(self.right)}, closed={repr(self.closed)})" # noqa: E501 return repr_str def __str__(self) -> str: - - left, right = self._repr_base() start_symbol = "[" if self.closed_left else "(" end_symbol = "]" if self.closed_right else ")" - return f"{start_symbol}{left}, {right}{end_symbol}" + return f"{start_symbol}{self.left}, {self.right}{end_symbol}" def __add__(self, y): if ( diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 944a2b0e97382..65d0d454ac817 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1078,18 +1078,6 @@ cdef class _Timestamp(ABCTimestamp): return result - @property - def _short_repr(self) -> str: - # format a Timestamp with only _date_repr if possible - # otherwise _repr_base - if (self.hour == 0 and - self.minute == 0 and - self.second == 0 and - self.microsecond == 0 and - self.nanosecond == 0): - return self._date_repr - return self._repr_base - # ----------------------------------------------------------------- # Conversion Methods diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b36672df32e61..eb8d25bcea592 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1039,8 +1039,9 @@ def interval_range( >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... end=pd.Timestamp('2017-01-04')) - IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], - (2017-01-03, 2017-01-04]], + IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00], + (2017-01-02 00:00:00, 2017-01-03 00:00:00], + (2017-01-03 00:00:00, 2017-01-04 00:00:00]], dtype='interval[datetime64[ns], right]') The ``freq`` parameter specifies the frequency between the left and right. @@ -1056,8 +1057,9 @@ def interval_range( >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... 
periods=3, freq='MS') - IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], - (2017-03-01, 2017-04-01]], + IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00], + (2017-02-01 00:00:00, 2017-03-01 00:00:00], + (2017-03-01 00:00:00, 2017-04-01 00:00:00]], dtype='interval[datetime64[ns], right]') Specify ``start``, ``end``, and ``periods``; the frequency is generated diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 294da02e259b7..9f45347c31165 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1326,6 +1326,6 @@ def test_to_csv_categorical_and_interval(self): ) df["a"] = df["a"].astype("category") result = df.to_csv() - expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] + expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 4d6f3a62d4dd0..f003211abd857 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -80,7 +80,11 @@ def test_repr_floats(self): ((Timestamp("20180102"), Timestamp("20180103"))), ], "both", - ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + [ + "[2018-01-01 00:00:00, 2018-01-02 00:00:00]", + "NaN", + "[2018-01-02 00:00:00, 2018-01-03 00:00:00]", + ], ), ( [ @@ -103,3 +107,19 @@ def test_to_native_types(self, tuples, closed, expected_data): result = index._format_native_types() expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) + + def test_timestamp_with_timezone(self): + # GH 55035 + index = IntervalIndex( + [ + Interval( + Timestamp("2020-01-01", tz="UTC"), Timestamp("2020-01-02", tz="UTC") + ) + ] + ) + result = repr(index) + expected = ( + "IntervalIndex([(2020-01-01 00:00:00+00:00, 2020-01-02 00:00:00+00:00]], " + "dtype='interval[datetime64[ns, UTC], right]')" + ) + assert result == expected diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 9cbb29605a1ec..3944599b167c1 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -307,10 +307,10 @@ def test_multiindex_interval_datetimes(self, ext): [ range(4), [ - "(2020-01-31, 2020-07-31]", - "(2020-07-31, 2021-01-31]", - "(2021-01-31, 2021-07-31]", - "(2021-07-31, 2022-01-31]", + "(2020-01-31 00:00:00, 2020-07-31 00:00:00]", + "(2020-07-31 00:00:00, 2021-01-31 00:00:00]", + "(2021-01-31 00:00:00, 2021-07-31 00:00:00]", + "(2021-07-31 00:00:00, 2022-01-31 00:00:00]", ], ] ), From 66a49457900a6b50b9d2bf2bcf6664ef7351475b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Sep 2023 18:56:11 +0200 Subject: [PATCH 15/17] BUG df.plot.box handles matplotlib Axes with sharey=True (#54940) * BUG manage sharey in plot.box with vert=False * fix * add entry in whats new * iter --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/plotting/_matplotlib/boxplot.py | 45 ++++++++++++-------- pandas/tests/plotting/test_boxplot_method.py | 16 +++++++ 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e32f357ad3777..2afd190755b4c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -288,7 +288,7 @@ Period Plotting ^^^^^^^^ -- +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and 
a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) - Groupby/resample/rolling diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 83cb8a6ab67dd..5fcea796a9c6e 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -40,6 +40,24 @@ from pandas._typing import MatplotlibColor +def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None: + """Set the tick labels of a given axis. + + Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the + case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of + labels. + """ + ticks = ax.get_xticks() if is_vertical else ax.get_yticks() + if len(ticks) != len(labels): + i, remainder = divmod(len(ticks), len(labels)) + assert remainder == 0, remainder + labels *= i + if is_vertical: + ax.set_xticklabels(labels, **kwargs) + else: + ax.set_yticklabels(labels, **kwargs) + + class BoxPlot(LinePlot): @property def _kind(self) -> Literal["box"]: @@ -193,7 +211,9 @@ def _make_plot(self) -> None: ) self.maybe_color_bp(bp) self._return_obj[label] = ret - self._set_ticklabels(ax, ticklabels) + _set_ticklabels( + ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical" + ) else: y = self.data.values.T ax = self._get_ax(0) @@ -209,13 +229,9 @@ def _make_plot(self) -> None: labels = [pprint_thing(left) for left in labels] if not self.use_index: labels = [pprint_thing(key) for key in range(len(labels))] - self._set_ticklabels(ax, labels) - - def _set_ticklabels(self, ax: Axes, labels: list[str]) -> None: - if self.orientation == "vertical": - ax.set_xticklabels(labels) - else: - ax.set_yticklabels(labels) + _set_ticklabels( + ax=ax, labels=labels, is_vertical=self.orientation == "vertical" + ) def _make_legend(self) -> None: pass @@ -382,16 +398,9 @@ def plot_group(keys, values, ax: Axes, **kwds): ax.tick_params(axis="both", labelsize=fontsize) # GH 45465: x/y are flipped when "vert" changes - is_vertical = kwds.get("vert", True) - ticks = ax.get_xticks() if is_vertical else ax.get_yticks() - if len(ticks) != len(keys): - i, remainder = divmod(len(ticks), len(keys)) - assert remainder == 0, remainder - keys *= i - if is_vertical: - ax.set_xticklabels(keys, rotation=rot) - else: - ax.set_yticklabels(keys, rotation=rot) + _set_ticklabels( + ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot + ) maybe_color_bp(bp, **kwds) # Return axes in multiplot case, maybe revisit later # 985 diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 555b9fd0c82c2..76f7fa1f22eec 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -329,6 +329,22 @@ def test_plot_xlabel_ylabel(self, vert): assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel + @pytest.mark.parametrize("vert", [True, False]) + def test_plot_box(self, vert): + # GH 54941 + rng = np.random.default_rng(2) + df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + + xlabel, ylabel = "x", "y" + _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) + df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) + df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + for ax in axs: + assert ax.get_xlabel() == xlabel + assert ax.get_ylabel() == ylabel + mpl.pyplot.close() + 
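# --- Illustrative sketch (editorial addition, not part of the patch) ---
# A minimal reproduction of the bug the new test_plot_box above covers:
# before this fix, df.plot.box(vert=False) on matplotlib Axes created with
# sharey=True could raise, because shared axes install a FixedLocator and
# the tick count no longer matched the label count; the new _set_ticklabels
# helper repeats the labels to match. Assumes matplotlib is installed; the
# data and names below are hypothetical.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD"))
_, axes = plt.subplots(ncols=2, sharey=True)
df.plot.box(ax=axes[0], vert=False)  # horizontal boxes, shared y-axis
df.plot.box(ax=axes[1], vert=False)  # previously could raise a ValueError
plt.close("all")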
@pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( From 95b6057ebd5d774bf5f3ab90514fe5fcee2cbe90 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Sep 2023 10:20:38 -0700 Subject: [PATCH 16/17] DEPR: downcasting in replace (#54710) * DEPR: downcasting in replace * GH refs * fix docbuild i hope * suppress doc warnings * avoid warning in docs --- doc/source/user_guide/missing_data.rst | 6 +- doc/source/whatsnew/v2.2.0.rst | 3 +- pandas/core/internals/blocks.py | 68 ++++++++++++++++++--- pandas/tests/frame/methods/test_replace.py | 62 +++++++++++++------ pandas/tests/indexing/test_coercion.py | 30 +++++++-- pandas/tests/io/excel/test_writers.py | 4 +- pandas/tests/series/methods/test_replace.py | 38 ++++++++---- 7 files changed, 162 insertions(+), 49 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index e0e752099b77a..4a2aa565dd15c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -401,7 +401,7 @@ Limit the number of NA values filled df.ffill(limit=1) -NA values can be replaced with corresponding value from a :class:`Series`` or :class:`DataFrame`` +NA values can be replaced with corresponding value from a :class:`Series` or :class:`DataFrame` where the index and column aligns between the original object and the filled object. .. ipython:: python @@ -660,7 +660,7 @@ Pass a list of regular expressions that will replace matches with a scalar. .. ipython:: python - df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -669,7 +669,7 @@ dictionary. .. ipython:: python - df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder") .. note:: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 2afd190755b4c..2a7b06ce0c4af 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -190,7 +190,8 @@ Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`) +- Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. 
To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`) +- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6399f85723ae5..57833c1d626ee 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -761,7 +761,23 @@ def replace( if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False, using_cow=using_cow) + if get_option("future.no_silent_downcasting") is True: + blocks = [blk] + else: + blocks = blk.convert(copy=False, using_cow=using_cow) + if len(blocks) > 1 or blocks[0].dtype != blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) else: blocks = [blk] return blocks @@ -836,7 +852,21 @@ def _replace_regex( replace_regex(block.values, rx, value, mask) - return block.convert(copy=False, using_cow=using_cow) + nbs = block.convert(copy=False, using_cow=using_cow) + opt = get_option("future.no_silent_downcasting") + if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call `result.infer_objects(copy=False)`. 
" + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return nbs @final def replace_list( @@ -902,6 +932,7 @@ def replace_list( else: rb = [self if inplace else self.copy()] + opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -939,14 +970,33 @@ def replace_list( b.refs.referenced_blocks.index(ref) ) - if convert and blk.is_object and not all(x is None for x in dest_list): + if ( + not opt + and convert + and blk.is_object + and not all(x is None for x in dest_list) + ): # GH#44498 avoid unwanted cast-back - result = extend_blocks( - [ - b.convert(copy=True and not using_cow, using_cow=using_cow) - for b in result - ] - ) + nbs = [] + for res_blk in result: + converted = res_blk.convert( + copy=True and not using_cow, using_cow=using_cow + ) + if len(converted) > 1 or converted[0].dtype != res_blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated " + "and will be removed in a future version. To " + "retain the old behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + nbs.extend(converted) + result = nbs new_rb.extend(result) rb = new_rb return rb diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 61e44b4e24c08..f07c53060a06b 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -289,7 +289,9 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) - result = df.replace({"Type": {"Q": 0, "T": 1}}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): @@ -301,16 +303,20 @@ def test_regex_replace_list_to_scalar(self, mix_abc): "c": [np.nan, np.nan, np.nan, "d"], } ) - res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - return_value = res2.replace( - [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) assert return_value is None - return_value = res3.replace( - regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -520,7 +526,9 @@ def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) m = {"foo": 1, "bar": 2, "bah": 3} - rep = df.replace(m) + msg = "Downcasting behavior in 
`replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes tm.assert_series_equal(expec, res) @@ -838,7 +846,12 @@ def test_replace_for_new_dtypes(self, datetime_frame): ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = frame.replace(to_replace, value) + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): @@ -927,7 +940,9 @@ def test_replace_dict_no_regex(self): "Strongly Disagree": 1, } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_series_no_regex(self): @@ -950,7 +965,9 @@ def test_replace_series_no_regex(self): } ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): @@ -1076,7 +1093,9 @@ def test_replace_period(self): expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) assert expected.dtypes.iloc[0] == "Period[M]" - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetime(self): @@ -1106,7 +1125,9 @@ def test_replace_datetime(self): ) assert set(df.fname.values) == set(d["fname"].keys()) expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetimetz(self): @@ -1307,10 +1328,12 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, request, replacer): + def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) - result = df.replace({"a": replacer, "b": replacer}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) @@ -1564,12 +1587,15 @@ def test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) expected_df1 = DataFrame({"A": [1], "B": [1]}) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) expected_df2 = DataFrame({"A": [1], "B": ["1"]}) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df2 = df2.replace(to_replace="0", 
value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2c39729097487..82368c67dc6d4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -836,8 +836,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): # tested below return - result = obj.replace(replacer) - if (from_key == "float64" and to_key in ("int64")) or ( from_key == "complex128" and to_key in ("int64", "float64") ): @@ -851,6 +849,17 @@ def test_replace_series(self, how, to_key, from_key, replacer): exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning + if ( + exp.dtype == obj.dtype + or exp.dtype == object + or (exp.dtype.kind in "iufc" and obj.dtype.kind in "iufc") + ): + warn = None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -866,11 +875,14 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning if exp.dtype != object else None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -888,16 +900,22 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") + warn = FutureWarning if isinstance(obj.dtype, pd.DatetimeTZDtype) and isinstance( exp.dtype, pd.DatetimeTZDtype ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) + warn = None else: assert exp.dtype == to_key + if to_key == from_key: + warn = None + + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) tm.assert_series_equal(result, exp) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 3944599b167c1..bbd427387625b 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1197,7 +1197,9 @@ def test_render_as_column_name(self, path): def test_true_and_false_value_options(self, path): # see gh-13347 df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - expected = df.replace({"foo": True, "bar": False}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.replace({"foo": True, "bar": False}) df.to_excel(path) read_frame = pd.read_excel( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ec9db8c3830d6..f08966c3816c0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -76,7 +76,9 @@ def test_replace(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -84,7 +86,8 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -92,11 +95,13 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() @@ -294,7 +299,9 @@ def test_replace2(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -302,7 +309,8 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -310,11 +318,13 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -373,7 +383,9 @@ def test_replace_unicode_with_number(self): def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) - result = s.replace([2, "4"], np.nan) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) @@ -387,7 +399,9 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) - result = ser.replace({"A": 1, "B": 2}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. 
categories should be [1, 2] even if there are no "B"s present @@ -710,7 +724,9 @@ def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): From 1496630d35337425d860128cad2e8bc624b9b25d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Sep 2023 10:22:09 -0700 Subject: [PATCH 17/17] DEPR: fillna downcasting from object dtype (#54261) * DEPR: fillna downcasting from object dtype * GH ref * suppress warning * update test * Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/generic.py | 9 ++++++- pandas/core/internals/blocks.py | 28 +++++++++++++++++++-- pandas/io/formats/xml.py | 9 ++++++- pandas/io/json/_json.py | 11 +++++++- pandas/io/stata.py | 9 ++++++- pandas/plotting/_matplotlib/core.py | 8 +++++- pandas/tests/extension/test_masked.py | 11 +++++++- pandas/tests/frame/indexing/test_where.py | 2 ++ pandas/tests/frame/methods/test_fillna.py | 7 ++++-- pandas/tests/frame/test_arithmetic.py | 12 ++++++--- pandas/tests/frame/test_logical_ops.py | 1 + pandas/tests/frame/test_reductions.py | 1 + pandas/tests/frame/test_stack_unstack.py | 2 ++ pandas/tests/groupby/test_function.py | 1 + pandas/tests/series/methods/test_reindex.py | 8 ++++-- pandas/tests/series/test_api.py | 1 + pandas/tests/series/test_arithmetic.py | 10 +++++--- pandas/tests/series/test_logical_ops.py | 1 + 19 files changed, 113 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 2a7b06ce0c4af..55a3419e95703 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -198,6 +198,7 @@ Deprecations - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) +- Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) .. --------------------------------------------------------------------------- .. 
_whatsnew_220.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bbaa6d3faeb05..271ad40a98272 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10393,7 +10393,14 @@ def _where( # make sure we are boolean fill_value = bool(inplace) - cond = cond.fillna(fill_value) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + cond = cond.fillna(fill_value) + cond = cond.infer_objects(copy=False) msg = "Boolean array expected for the condition, not {dtype}" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 57833c1d626ee..66b01dfb59f7f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -498,7 +498,11 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: @final def _maybe_downcast( - self, blocks: list[Block], downcast, using_cow: bool, caller: str + self, + blocks: list[Block], + downcast, + using_cow: bool, + caller: str, ) -> list[Block]: if downcast is False: return blocks @@ -510,9 +514,29 @@ def _maybe_downcast( # but ATM it breaks too much existing code. # split and convert the blocks + if caller == "fillna" and get_option("future.no_silent_downcasting"): + return blocks + nbs = extend_blocks( [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) + if caller == "fillna": + if len(nbs) != len(blocks) or not all( + x.dtype == y.dtype for x, y in zip(nbs, blocks) + ): + # GH#54261 + warnings.warn( + "Downcasting object dtype arrays on .fillna, .ffill, .bfill " + "is deprecated and will change in a future version. " + "Call result.infer_objects(copy=False) instead. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return nbs elif downcast is None: return blocks @@ -1549,7 +1573,7 @@ def pad_or_backfill( data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow, caller="pad_or_backfill") + return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") @final def interpolate( diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 76b938755755a..a6ee8407988ec 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, ) +import warnings from pandas.errors import AbstractMethodError from pandas.util._decorators import doc @@ -202,7 +203,13 @@ def process_dataframe(self) -> dict[int | str, dict[str, Any]]: df = df.reset_index() if self.na_rep is not None: - df = df.fillna(self.na_rep) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + df = df.fillna(self.na_rep) return df.to_dict(orient="index") diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 52ea072d1483f..ecab14a54beff 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1217,7 +1217,16 @@ def _try_convert_data( if not self.dtype: if all(notna(data)): return data, False - return data.fillna(np.nan), True + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = data.fillna(np.nan) + + return filled, True elif self.dtype is True: pass diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 
a6d17a604a23f..d630a5ff8a41c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2983,7 +2983,14 @@ def _prepare_data(self) -> np.rec.recarray: for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + dc = data[col].fillna("") + data[col] = dc.apply(_pad_bytes, args=(typ,)) stype = f"S{typ}" dtypes[col] = stype data[col] = data[col].astype(stype) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c62f73271577d..d88605db60720 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1538,7 +1538,13 @@ def _kind(self) -> Literal["area"]: def __init__(self, data, **kwargs) -> None: kwargs.setdefault("stacked", True) - data = data.fillna(value=0) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index f5b0b6f4efa98..d27e9b8b9e983 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -13,6 +13,8 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +import warnings + import numpy as np import pytest @@ -186,7 +188,14 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): if sdtype.kind in "iu": if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype("Float64") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = expected.fillna(np.nan) + expected = filled.astype("Float64") else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 1eb67671da0b8..4576a86ad27cd 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -96,6 +96,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -170,6 +171,7 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 812150bb860e9..52b4b64ee279f 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -360,7 +360,9 @@ def test_fillna_dtype_conversion(self): expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) tm.assert_series_equal(result, expected) - result = df.fillna(1) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(1) expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 
5]) tm.assert_frame_equal(result, expected) @@ -817,7 +819,8 @@ def test_fillna_nones_inplace(): [[None, None], [None, None]], columns=["A", "B"], ) - with tm.assert_produces_warning(False): + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): df.fillna(value={"A": 1, "B": 2}, inplace=True) expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 878e94c15e16b..1488fa65fabc0 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1254,7 +1254,9 @@ def test_operators_none_as_na(self, op): # since filling converts dtypes from object, changed expected to be # object - filled = df.fillna(np.nan) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + filled = df.fillna(np.nan) result = op(df, 3) expected = op(filled, 3).astype(object) expected[pd.isna(expected)] = np.nan @@ -1265,10 +1267,14 @@ def test_operators_none_as_na(self, op): expected[pd.isna(expected)] = np.nan tm.assert_frame_equal(result, expected) - result = op(df, df.fillna(7)) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df, df.fillna(7)) tm.assert_frame_equal(result, expected) - result = op(df.fillna(7), df) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df.fillna(7), df) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 2cc3b67e7ac02..a15d7d7f93f01 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -151,6 +151,7 @@ def _check_unary_op(op): _check_unary_op(operator.inv) # TODO: belongs elsewhere + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_logical_with_nas(self): d = DataFrame({"a": [np.nan, False], "b": [True, True]}) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e7b6a0c0b39b0..74473bc54d51e 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1155,6 +1155,7 @@ def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame): def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na): getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all_bool_frame(self, opname, bool_frame_with_na): # GH#12863: numpy gives back non-boolean data for object type diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index b54a795af4fdc..9b76ae093e8c4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1184,6 +1184,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): ) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "index, columns", [ @@ -1194,6 +1195,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): ) def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): # 
GH-28301 + df = DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack(future_stack=future_stack) new_index = MultiIndex.from_tuples(stacked.index.to_numpy()) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 287310a18c7df..41bbfcf6840a9 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1534,6 +1534,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method(*args, **kwargs) +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("dtype", [bool, int, float, object]) def test_deprecate_numeric_only_series(dtype, groupby_func, request): # GH#46560 diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 016208f2d2026..f3075c116883a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -152,7 +152,9 @@ def test_reindex_inference(): # inference of new dtype s = Series([True, False, False, True], index=list("abcd")) new_index = "agc" - result = s.reindex(list(new_index)).ffill() + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.reindex(list(new_index)).ffill() expected = Series([True, True, False], index=list(new_index)) tm.assert_series_equal(result, expected) @@ -160,7 +162,9 @@ def test_reindex_inference(): def test_reindex_downcasting(): # GH4618 shifted series downcasting s = Series(False, index=range(5)) - result = s.shift(1).bfill() + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.shift(1).bfill() expected = Series(False, index=range(5)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index be63d9500ce73..a39b3ff7e6f2b 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -203,6 +203,7 @@ def test_series_datetimelike_attribute_access_invalid(self): with pytest.raises(AttributeError, match=msg): ser.weekday + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "kernel, has_numeric_only", [ diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 44121cb5f784f..55fc77fb5705f 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -639,10 +639,12 @@ def test_comparison_operators_with_nas(self, comparison_op): result = comparison_op(ser, val) expected = comparison_op(ser.dropna(), val).reindex(ser.index) - if comparison_op is operator.ne: - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + if comparison_op is operator.ne: + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 26046ef9ba295..2146e154dc7fa 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -15,6 +15,7 @@ class TestSeriesLogicalOps: + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("bool_op", [operator.and_, 
operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs
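# --- Illustrative sketch (editorial addition, not part of the patches) ---
# A hedged summary of the user-facing effect of PATCH 16 and PATCH 17,
# assuming a pandas build with both changes applied: object-dtype results
# of replace/fillna are still silently downcast, but now emit a
# FutureWarning, and the future.no_silent_downcasting option opts in to
# the future behavior (no downcast, no warning).

import numpy as np
import pandas as pd

obj = pd.Series([1, 2, np.nan], dtype=object)

# PATCH 17: downcasts object -> int64 and warns
# "Downcasting object dtype arrays on .fillna, .ffill, .bfill ..."
filled = obj.fillna(0)

# PATCH 16: downcasts object -> int64 and warns
# "Downcasting behavior in `replace` is deprecated ..."
replaced = pd.Series(["0", "1"], dtype=object).replace({"0": 0, "1": 1})

# Opt in to the future behavior: no warning, and the result keeps object
# dtype; call result.infer_objects(copy=False) explicitly if needed.
pd.set_option("future.no_silent_downcasting", True)
still_object = obj.fillna(0)
assert still_object.dtype == object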