diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 569864ba71122..cc51e22265d7c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) +- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b55a40f5488ea..4bc30d6dbd029 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -15,8 +15,6 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( VALID_CLOSED, @@ -1233,43 +1231,10 @@ def value_counts(self, dropna: bool = True) -> Series: # --------------------------------------------------------------------- # Rendering Methods - def _format_data(self) -> str: - # TODO: integrate with categorical and make generic - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_str = ", ".join(head) - tail_str = ", ".join(tail) - summary = f"[{head_str} ... {tail_str}]" - else: - tail = [formatter(x) for x in self] - tail_str = ", ".join(tail) - summary = f"[{tail_str}]" - - return summary - - def __repr__(self) -> str: - data = self._format_data() - class_name = f"<{type(self).__name__}>\n" - - template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" - return template + def _formatter(self, boxed: bool = False): + # returning 'str' here causes us to render as e.g. "(0, 1]" instead of + # "Interval(0, 1, closed='right')" + return str # --------------------------------------------------------------------- # Vectorized Interval Properties/Attributes diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d630a5ff8a41c..d00f1c666d5d6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -977,7 +977,7 @@ def __init__(self) -> None: # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [(i, np.dtype(f"S{i}")) for i in range(1, 245)] + [ (251, np.dtype(np.int8)), (252, np.dtype(np.int16)), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 7dffa7bb242d5..55d4a6c3b39fa 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -281,7 +281,7 @@ def test_frame_from_records_utc(self): def test_from_records_to_records(self): # from numpy documentation - arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr = np.zeros((2,), dtype=("i4,f4,S10")) arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] DataFrame.from_records(arr) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 67dd5b6217187..a38d2c6fd016a 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self): expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"] - ) + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 985a9e3602410..f0222e5cec9b5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): ) tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index fa8c4e4811ea6..fab90b112fa94 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -253,7 +253,7 @@ def test_to_records_with_categorical(self): ), # Pass in a dtype instance. ( - {"column_dtypes": np.dtype("unicode")}, + {"column_dtypes": np.dtype(np.str_)}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 21d1630af9de2..488f79eea0d11 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -182,7 +182,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), False, ), @@ -190,7 +190,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (1, "B"), (2, "A"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -198,7 +198,7 @@ def test_symmetric_difference(self): "union", np.array( [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort): index1 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) index2 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4280f8617517f..2767078674632 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -604,7 +604,7 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype("unicode") + df_mixed.columns = df_mixed.columns.astype(np.str_) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 0ff84bcf136cd..f031ac46c670c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -303,8 +303,10 @@ def test_store_dropna(tmp_path, setup_path): tm.assert_frame_equal(df_without_missing, reloaded) -def test_keyword_deprecation(): +def test_keyword_deprecation(tmp_path, setup_path): # GH 54229 + path = tmp_path / setup_path + msg = ( "Starting with pandas version 3.0 all arguments of to_hdf except for the " "argument 'path_or_buf' will be keyword-only." @@ -312,7 +314,7 @@ def test_keyword_deprecation(): df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_hdf("example", "key") + df.to_hdf(path, "key") def test_to_hdf_with_min_itemsize(tmp_path, setup_path): diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 386f363c81557..81ca227fb7afb 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort): tm.assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1 = np.zeros((2,), dtype=("i4,f4,S10")) arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2 = np.zeros((3,), dtype=("i4,f4,S10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b6c409397c9fb..03fc6cba2902a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -403,12 +403,12 @@ def test_astype_unicode(self): # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" ser = Series([item.encode()]) - result = ser.astype("unicode") + result = ser.astype(np.str_) expected = Series([item]) tm.assert_series_equal(result, expected) for ser in test_series: - res = ser.astype("unicode") + res = ser.astype(np.str_) expec = ser.map(str) tm.assert_series_equal(res, expec)