From b7aaaf9c8831f7da91c9e31fe1da273744dab5ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:02:19 +0200 Subject: [PATCH] Backport PR #55621 on branch 2.1.x (BUG: mode not sorting values for arrow backed strings) (#55680) BUG: mode not sorting values for arrow backed strings (#55621) * BUG: mode not sorting values for arrow backed strings * Fix tests * Change to pa_installed variable * Update pyarrow.py * Fix * Fix (cherry picked from commit bb2d2e00524fb31d08a43fc54706035f810d2489) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 1 + pandas/tests/extension/test_arrow.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 18 +++++++++++------- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index cca508a4d04b9..a608cd136c559 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -32,6 +32,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) +- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 875461d39a93e..9c8f28d660450 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1886,6 +1886,7 @@ def _mode(self, dropna: bool = True) -> Self: if pa.types.is_temporal(pa_type): most_common = most_common.cast(pa_type) + most_common = most_common.take(pc.array_sort_indices(most_common)) return type(self)(most_common) def _maybe_convert_setitem_value(self, value): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 608cb023a3584..fef4fbea2e485 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1474,7 +1474,7 @@ def test_quantile(data, interpolation, quantile, request): @pytest.mark.parametrize( "take_idx, exp_idx", - [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]], + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], ids=["multi_mode", "single_mode"], ) def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx): @@ -1492,7 +1492,7 @@ def test_mode_dropna_false_mode_na(data): expected = pd.Series([None], dtype=data.dtype) tm.assert_series_equal(result, expected) - expected = pd.Series([None, data[0]], dtype=data.dtype) + expected = pd.Series([data[0], None], dtype=data.dtype) result = expected.mode(dropna=False) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7726dbe6994ef..49ae217513018 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,11 +5,11 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2518,10 +2518,7 @@ def test_groupby_column_index_name_lost(func): "infer_string", [ False, - pytest.param( - True, - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), - ), + pytest.param(True, marks=td.skip_if_no("pyarrow")), ], ) def test_groupby_duplicate_columns(infer_string): @@ -2751,13 +2748,20 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -def test_by_column_values_with_same_starting_value(): +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_by_column_values_with_same_starting_value(dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": ["sad", "happy", "happy"], + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"}