Merge branch 'main' into ref-format

jbrockmendel · Oct 10, 2023 · 072db8f
2 parents d4d91df + 1025151
commit 072db8f
Show file tree

Hide file tree

Showing 12 changed files with 62 additions and 57 deletions.
diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
@@ -16,6 +16,7 @@ Fixed regressions
 - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
 - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
 - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
+- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.bug_fixes:

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -15,8 +15,6 @@
 
 import numpy as np
 
-from pandas._config import get_option
-
 from pandas._libs import lib
 from pandas._libs.interval import (
     VALID_CLOSED,
@@ -1233,43 +1231,10 @@ def value_counts(self, dropna: bool = True) -> Series:
     # ---------------------------------------------------------------------
     # Rendering Methods
 
-    def _format_data(self) -> str:
-        # TODO: integrate with categorical and make generic
-        n = len(self)
-        max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10)
-
-        formatter = str
-
-        if n == 0:
-            summary = "[]"
-        elif n == 1:
-            first = formatter(self[0])
-            summary = f"[{first}]"
-        elif n == 2:
-            first = formatter(self[0])
-            last = formatter(self[-1])
-            summary = f"[{first}, {last}]"
-        else:
-            if n > max_seq_items:
-                n = min(max_seq_items // 2, 10)
-                head = [formatter(x) for x in self[:n]]
-                tail = [formatter(x) for x in self[-n:]]
-                head_str = ", ".join(head)
-                tail_str = ", ".join(tail)
-                summary = f"[{head_str} ... {tail_str}]"
-            else:
-                tail = [formatter(x) for x in self]
-                tail_str = ", ".join(tail)
-                summary = f"[{tail_str}]"
-
-        return summary
-
-    def __repr__(self) -> str:
-        data = self._format_data()
-        class_name = f"<{type(self).__name__}>\n"
-
-        template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
-        return template
+    def _formatter(self, boxed: bool = False):
+        # returning 'str' here causes us to render as e.g. "(0, 1]" instead of
+        #  "Interval(0, 1, closed='right')"
+        return str
 
     # ---------------------------------------------------------------------
     # Vectorized Interval Properties/Attributes

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -977,7 +977,7 @@ def __init__(self) -> None:
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
+            [(i, np.dtype(f"S{i}")) for i in range(1, 245)]
             + [
                 (251, np.dtype(np.int8)),
                 (252, np.dtype(np.int16)),

diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
@@ -281,7 +281,7 @@ def test_frame_from_records_utc(self):
 
     def test_from_records_to_records(self):
         # from numpy documentation
-        arr = np.zeros((2,), dtype=("i4,f4,a10"))
+        arr = np.zeros((2,), dtype=("i4,f4,S10"))
         arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
 
         DataFrame.from_records(arr)

diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self):
         expected = df3.reindex(columns=[])
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.parametrize(
-        "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"]
-    )
+    @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
     @pytest.mark.parametrize("arg", ["include", "exclude"])
     def test_select_dtypes_str_raises(self, dtype, arg):
         df = DataFrame(

diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
@@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending):
             )
 
         tm.assert_frame_equal(result, expected)
+
+
+def test_sort_index_with_sliced_multiindex():
+    # GH 55379
+    mi = MultiIndex.from_tuples(
+        [
+            ("a", "10"),
+            ("a", "18"),
+            ("a", "25"),
+            ("b", "16"),
+            ("b", "26"),
+            ("a", "45"),
+            ("b", "28"),
+            ("a", "5"),
+            ("a", "50"),
+            ("a", "51"),
+            ("b", "4"),
+        ],
+        names=["group", "str"],
+    )
+
+    df = DataFrame({"x": range(len(mi))}, index=mi)
+    result = df.iloc[0:6].sort_index()
+
+    expected = DataFrame(
+        {"x": [0, 1, 2, 5, 3, 4]},
+        index=MultiIndex.from_tuples(
+            [
+                ("a", "10"),
+                ("a", "18"),
+                ("a", "25"),
+                ("a", "45"),
+                ("b", "16"),
+                ("b", "26"),
+            ],
+            names=["group", "str"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py
@@ -253,7 +253,7 @@ def test_to_records_with_categorical(self):
             ),
             # Pass in a dtype instance.
             (
-                {"column_dtypes": np.dtype("unicode")},
+                {"column_dtypes": np.dtype(np.str_)},
                 np.rec.array(
                     [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                     dtype=[

diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
@@ -182,23 +182,23 @@ def test_symmetric_difference(self):
                 "intersection",
                 np.array(
                     [(1, "A"), (2, "A"), (1, "B"), (2, "B")],
-                    dtype=[("num", int), ("let", "a1")],
+                    dtype=[("num", int), ("let", "S1")],
                 ),
                 False,
             ),
             (
                 "intersection",
                 np.array(
                     [(1, "A"), (1, "B"), (2, "A"), (2, "B")],
-                    dtype=[("num", int), ("let", "a1")],
+                    dtype=[("num", int), ("let", "S1")],
                 ),
                 None,
             ),
             (
                 "union",
                 np.array(
                     [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")],
-                    dtype=[("num", int), ("let", "a1")],
+                    dtype=[("num", int), ("let", "S1")],
                 ),
                 None,
             ),
@@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort):
         index1 = Index(
             np.array(
                 [(1, "A"), (2, "A"), (1, "B"), (2, "B")],
-                dtype=[("num", int), ("let", "a1")],
+                dtype=[("num", int), ("let", "S1")],
             )
         )
         index2 = Index(
             np.array(
                 [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")],
-                dtype=[("num", int), ("let", "a1")],
+                dtype=[("num", int), ("let", "S1")],
             )
         )
 

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -604,7 +604,7 @@ def test_blocks_compat_GH9037(self):
         )
 
         # JSON deserialisation always creates unicode strings
-        df_mixed.columns = df_mixed.columns.astype("unicode")
+        df_mixed.columns = df_mixed.columns.astype(np.str_)
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
         tm.assert_frame_equal(

diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
@@ -303,16 +303,18 @@ def test_store_dropna(tmp_path, setup_path):
     tm.assert_frame_equal(df_without_missing, reloaded)
 
 
-def test_keyword_deprecation():
+def test_keyword_deprecation(tmp_path, setup_path):
     # GH 54229
+    path = tmp_path / setup_path
+
     msg = (
         "Starting with pandas version 3.0 all arguments of to_hdf except for the "
         "argument 'path_or_buf' will be keyword-only."
     )
     df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}])
 
     with tm.assert_produces_warning(FutureWarning, match=msg):
-        df.to_hdf("example", "key")
+        df.to_hdf(path, "key")
 
 
 def test_to_hdf_with_min_itemsize(tmp_path, setup_path):

diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
@@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort):
         tm.assert_frame_equal(df5, expected)
 
     def test_append_records(self):
-        arr1 = np.zeros((2,), dtype=("i4,f4,a10"))
+        arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
         arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
 
-        arr2 = np.zeros((3,), dtype=("i4,f4,a10"))
+        arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
         arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
 
         df1 = DataFrame(arr1)

diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
@@ -403,12 +403,12 @@ def test_astype_unicode(self):
             #  bytes with obj.decode() instead of str(obj)
             item = "野菜食べないとやばい"
             ser = Series([item.encode()])
-            result = ser.astype("unicode")
+            result = ser.astype(np.str_)
             expected = Series([item])
             tm.assert_series_equal(result, expected)
 
         for ser in test_series:
-            res = ser.astype("unicode")
+            res = ser.astype(np.str_)
             expec = ser.map(str)
             tm.assert_series_equal(res, expec)