From 1320ff12ce4f7d58dab43396df7f3d2e9e1c4792 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Tue, 13 Oct 2020 20:44:17 +1100
Subject: [PATCH 01/14] Fixed #36562 * Use special sorting comparator for tuple
 arrays which can be created when consolidate_first is called on DataFrames
 with MultiIndex which contain nan and string values

---
 doc/source/whatsnew/v1.2.0.rst                |  3 +-
 pandas/core/algorithms.py                     | 48 +++++++++++++++++--
 .../indexing/multiindex/test_multiindex.py    | 23 +++++++++
 pandas/tests/test_sorting.py                  |  7 +++
 4 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ddee06aeab779..5e5af52678b37 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -363,7 +363,8 @@ MultiIndex
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
--
+- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` with message ``"'<' not supported between instances of 'float' and 'str'"`` (:issue:`36562`)
+
 
 I/O
 ^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index d2005d46bbbf1..811db799c46e7 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import operator
+import functools
 from textwrap import dedent
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
 from warnings import catch_warnings, simplefilter, warn
@@ -2055,13 +2056,52 @@ def sort_mixed(values):
         strs = np.sort(values[str_pos])
         return np.concatenate([nums, np.asarray(strs, dtype=object)])
 
+    def sort_tuples(values):
+        # sorts tuples with mixed values. can handle nan vs string comparisons.
+        def cmp_func(index_x, index_y):
+            x = values[index_x]
+            y = values[index_y]
+            if x == y:
+                return 0
+            len_x = len(x)
+            len_y = len(y)
+            for i in range(max(len_x, len_y)):
+                # check if the tuples have different lengths (shorter tuples
+                # first)
+                if i >= len_x:
+                    return -1
+                if i >= len_y:
+                    return +1
+                x_i_na = isna(x[i])
+                y_i_na = isna(y[i])
+                # values are the same -> resolve tie with next element
+                if (x_i_na and y_i_na) or (x[i] == y[i]):
+                    continue
+                # check for nan values (sort nan to the end which is consistent
+                # with numpy
+                if x_i_na and not y_i_na:
+                    return +1
+                if not x_i_na and y_i_na:
+                    return -1
+                # normal greater/less than comparison
+                if x[i] < y[i]:
+                    return -1
+                return +1
+            return 0
+
+        ixs = np.arange(len(values))
+        ixs = sorted(ixs, key=functools.cmp_to_key(cmp_func))
+        return values[ixs]
+
     sorter = None
-    if (
-        not is_extension_array_dtype(values)
-        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
-    ):
+
+    ext_arr = is_extension_array_dtype(values)
+    if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer":
         # unorderable in py3 if mixed str/int
         ordered = sort_mixed(values)
+    elif not ext_arr and values.size and isinstance(values[0], tuple):
+        # 1-D arrays with tuples of potentially mixed type (solves GH36562)
+        ordered = sort_tuples(values)
     else:
         try:
             sorter = values.argsort()
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 4565d79c632de..1bc984d1dc8c3 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -91,3 +91,26 @@ def test_multiindex_get_loc_list_raises(self):
         msg = "unhashable type"
         with pytest.raises(TypeError, match=msg):
             idx.get_loc([])
+
+
+def test_combine_first_with_nan_index():
+    mi1 = pd.MultiIndex.from_arrays(
+        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]],
+        names=["a", "b"]
+    )
+    df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
+    mi2 = pd.MultiIndex.from_arrays(
+        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
+    )
+    s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
+    df_combined = df.combine_first(pd.DataFrame({"col": s}))
+    mi_expected = pd.MultiIndex.from_arrays([
+        ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
+        [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6,]
+    ], names=["a", "b"])
+    assert (df_combined.index == mi_expected).all()
+    exp_col = np.asarray(
+        [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan]
+    )
+    act_col = df_combined['col'].values
+    assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True)
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index deb7434694d01..3553456d58e03 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel):
         expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
         tm.assert_extension_array_equal(result, expected_values)
         tm.assert_numpy_array_equal(codes, expected_codes)
+
+
+def test_mixed_str_nan():
+    values = np.array(["b", np.nan, "a", "b"], dtype=object)
+    result = safe_sort(values)
+    expected = np.array([np.nan, "a", "b", "b"], dtype=object)
+    tm.assert_numpy_array_equal(result, expected)

From 1f76d216f8cf4b4a8097bf947a8947dddcf6f7ee Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Tue, 13 Oct 2020 20:49:58 +1100
Subject: [PATCH 02/14] Fixed flake issue

---
 pandas/tests/indexing/multiindex/test_multiindex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 1bc984d1dc8c3..c1028ebcda95b 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -106,7 +106,7 @@ def test_combine_first_with_nan_index():
     df_combined = df.combine_first(pd.DataFrame({"col": s}))
     mi_expected = pd.MultiIndex.from_arrays([
         ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
-        [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6,]
+        [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6]
     ], names=["a", "b"])
     assert (df_combined.index == mi_expected).all()
     exp_col = np.asarray(

From 7076841ba4f5b50dddc20daf5e980c8e20f2f273 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Tue, 13 Oct 2020 21:22:59 +1100
Subject: [PATCH 03/14] fixed linting errors

---
 pandas/core/algorithms.py                        |  2 +-
 .../tests/indexing/multiindex/test_multiindex.py | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 811db799c46e7..6a2e4e34f92cc 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -4,8 +4,8 @@
 """
 from __future__ import annotations
 
-import operator
 import functools
+import operator
 from textwrap import dedent
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
 from warnings import catch_warnings, simplefilter, warn
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index c1028ebcda95b..8881266a3f3ce 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -95,8 +95,7 @@ def test_multiindex_get_loc_list_raises(self):
 
 def test_combine_first_with_nan_index():
     mi1 = pd.MultiIndex.from_arrays(
-        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]],
-        names=["a", "b"]
+        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
     )
     df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
     mi2 = pd.MultiIndex.from_arrays(
@@ -104,13 +103,16 @@ def test_combine_first_with_nan_index():
     )
     s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
     df_combined = df.combine_first(pd.DataFrame({"col": s}))
-    mi_expected = pd.MultiIndex.from_arrays([
-        ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
-        [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6]
-    ], names=["a", "b"])
+    mi_expected = pd.MultiIndex.from_arrays(
+        [
+            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
+            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
+        ],
+        names=["a", "b"],
+    )
     assert (df_combined.index == mi_expected).all()
     exp_col = np.asarray(
         [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan]
     )
-    act_col = df_combined['col'].values
+    act_col = df_combined["col"].values
     assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True)

From 225675db4433f523699a69042175c55bfd07c99d Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Wed, 14 Oct 2020 11:19:25 +1100
Subject: [PATCH 04/14] Addressed review comments

---
 pandas/core/algorithms.py                     | 32 +++++++++----------
 .../indexing/multiindex/test_multiindex.py    | 16 ++++++----
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6a2e4e34f92cc..631d3a5e4c9cc 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2061,32 +2061,32 @@ def sort_tuples(values):
         def cmp_func(index_x, index_y):
             x = values[index_x]
             y = values[index_y]
+            # shortcut loop in case both tuples are the same
             if x == y:
                 return 0
-            len_x = len(x)
-            len_y = len(y)
-            for i in range(max(len_x, len_y)):
+            # lexicographic sorting
+            for i in range(max(len(x), len(y))):
                 # check if the tuples have different lengths (shorter tuples
                 # first)
-                if i >= len_x:
+                if i >= len(x):
                     return -1
-                if i >= len_y:
+                if i >= len(y):
                     return +1
-                x_i_na = isna(x[i])
-                y_i_na = isna(y[i])
+                x_is_na = isna(x[i])
+                y_is_na = isna(y[i])
                 # values are the same -> resolve tie with next element
-                if (x_i_na and y_i_na) or (x[i] == y[i]):
+                if (x_is_na and y_is_na) or (x[i] == y[i]):
                     continue
-                # check for nan values (sort nan to the end which is consistent
-                # with numpy
-                if x_i_na and not y_i_na:
+                # check for nan values (sort nan to the end)
+                if x_is_na and not y_is_na:
                     return +1
-                if not x_i_na and y_i_na:
+                if not x_is_na and y_is_na:
                     return -1
                 # normal greater/less than comparison
                 if x[i] < y[i]:
                     return -1
                 return +1
+            # both values are the same (should already have been caught)
             return 0
 
         ixs = np.arange(len(values))
@@ -2095,12 +2095,10 @@ def cmp_func(index_x, index_y):
 
     sorter = None
 
-    ext_arr = is_extension_array_dtype(values)
-    if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer":
-        # unorderable in py3 if mixed str/int
+    is_ea = is_extension_array_dtype(values)
+    if not is_ea and lib.infer_dtype(values, skipna=False) == "mixed-integer":
         ordered = sort_mixed(values)
-    elif not ext_arr and values.size and isinstance(values[0], tuple):
-        # 1-D arrays with tuples of potentially mixed type (solves GH36562)
+    elif not is_ea and values.size and isinstance(values[0], tuple):
         ordered = sort_tuples(values)
     else:
         try:
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 8881266a3f3ce..fd3f85c1c3e72 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -93,7 +93,7 @@ def test_multiindex_get_loc_list_raises(self):
             idx.get_loc([])
 
 
-def test_combine_first_with_nan_index():
+def test_combine_first_with_nan_multiindex():
     mi1 = pd.MultiIndex.from_arrays(
         [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
     )
@@ -102,7 +102,7 @@ def test_combine_first_with_nan_index():
         [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
     )
     s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
-    df_combined = df.combine_first(pd.DataFrame({"col": s}))
+    res = df.combine_first(pd.DataFrame({"d": s}))
     mi_expected = pd.MultiIndex.from_arrays(
         [
             ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
@@ -110,9 +110,11 @@ def test_combine_first_with_nan_index():
         ],
         names=["a", "b"],
     )
-    assert (df_combined.index == mi_expected).all()
-    exp_col = np.asarray(
-        [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan]
+    expected = pd.DataFrame(
+        {
+            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
+            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
+        },
+        index=mi_expected,
     )
-    act_col = df_combined["col"].values
-    assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True)
+    tm.assert_frame_equal(res, expected)

From 2677166fe280a999cb81dc3883074a54dd932315 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Fri, 16 Oct 2020 11:05:21 +1100
Subject: [PATCH 05/14] Make `sort_tuples` last resort sorting strategy to
 allow faster sorters to execute first

---
 pandas/core/algorithms.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 631d3a5e4c9cc..bfb36b676cbed 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1023,10 +1023,11 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
         to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
     else:
         to_raise = (
-            (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]
-        ).any() or (
-            (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
-        ).any()
+            ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
+            or (
+                (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
+            ).any()
+        )
 
     if to_raise:
         raise OverflowError("Overflow in int64 addition")
@@ -2095,18 +2096,24 @@ def cmp_func(index_x, index_y):
 
     sorter = None
 
-    is_ea = is_extension_array_dtype(values)
-    if not is_ea and lib.infer_dtype(values, skipna=False) == "mixed-integer":
+    if (
+        not is_extension_array_dtype(values)
+        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
+    ):
         ordered = sort_mixed(values)
-    elif not is_ea and values.size and isinstance(values[0], tuple):
-        ordered = sort_tuples(values)
     else:
         try:
             sorter = values.argsort()
             ordered = values.take(sorter)
         except TypeError:
             # try this anyway
-            ordered = sort_mixed(values)
+            try:
+                ordered = sort_mixed(values)
+            except TypeError:
+                if values.size and isinstance(values[0], tuple):
+                    ordered = sort_tuples(values)
+                else:
+                    raise
 
     # codes:
 

From 3688238654368efef6aee016c3661d755a7ece6d Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Fri, 16 Oct 2020 11:22:01 +1100
Subject: [PATCH 06/14] manually apply pandas black changes (from CI)

---
 pandas/core/algorithms.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bfb36b676cbed..c3f0d6037157b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1023,11 +1023,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
         to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
     else:
         to_raise = (
-            ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
-            or (
-                (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
-            ).any()
-        )
+            (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]
+        ).any() or (
+            (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
+        ).any()
 
     if to_raise:
         raise OverflowError("Overflow in int64 addition")

From 8ae927913cd6b9f5c27f20ce0b4819ec6ac2b9d6 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Thu, 22 Oct 2020 09:52:42 +1100
Subject: [PATCH 07/14] Modified changelog message as per review comments

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 0540015bfb3d4..e7c6418631aa6 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -412,7 +412,7 @@ MultiIndex
 
 - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
 - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
-- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` with message ``"'<' not supported between instances of 'float' and 'str'"`` (:issue:`36562`)
+- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
 -
 
 I/O

From 7b7d6f8d9d34fdbd1d6cccfbe43bac07dd06e0b3 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Thu, 22 Oct 2020 14:26:36 +1100
Subject: [PATCH 08/14] Removed pd.xyz

---
 pandas/tests/indexing/multiindex/test_multiindex.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index bab0a30815202..f0367817db5a0 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -94,23 +94,23 @@ def test_multiindex_get_loc_list_raises(self):
 
 
 def test_combine_first_with_nan_multiindex():
-    mi1 = pd.MultiIndex.from_arrays(
+    mi1 = MultiIndex.from_arrays(
         [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
     )
-    df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
-    mi2 = pd.MultiIndex.from_arrays(
+    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
+    mi2 = MultiIndex.from_arrays(
         [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
     )
-    s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
+    s = Series([1, 2, 3, 4, 5, 6], index=mi2)
     res = df.combine_first(pd.DataFrame({"d": s}))
-    mi_expected = pd.MultiIndex.from_arrays(
+    mi_expected = MultiIndex.from_arrays(
         [
             ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
             [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
         ],
         names=["a", "b"],
     )
-    expected = pd.DataFrame(
+    expected = DataFrame(
         {
             "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
             "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],

From 8cbfa015724c85cc084d203ad32f4f33cd6c4892 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Thu, 22 Oct 2020 15:46:50 +1100
Subject: [PATCH 09/14] forgot `pd`

---
 pandas/tests/indexing/multiindex/test_multiindex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index f0367817db5a0..244888836c733 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -102,7 +102,7 @@ def test_combine_first_with_nan_multiindex():
         [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
     )
     s = Series([1, 2, 3, 4, 5, 6], index=mi2)
-    res = df.combine_first(pd.DataFrame({"d": s}))
+    res = df.combine_first(DataFrame({"d": s}))
     mi_expected = MultiIndex.from_arrays(
         [
             ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],

From 92e1e3384a1c1410bd59af90ca853aba1e09bdb9 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Sat, 31 Oct 2020 16:31:31 +1100
Subject: [PATCH 10/14] Changed sorting algorithm by using expanded array *
 Extract column arrays and use pandas' internal functions to obtain index
 which sorts the array of tuples * Add function annotations to document
 expected argument types of `sort_tuples()`

---
 pandas/core/algorithms.py | 48 +++++++++------------------------------
 1 file changed, 11 insertions(+), 37 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 7e94765127c95..bd6eac9794656 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -4,7 +4,6 @@
 """
 from __future__ import annotations
 
-import functools
 import operator
 from textwrap import dedent
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
@@ -2071,42 +2070,17 @@ def sort_mixed(values):
         strs = np.sort(values[str_pos])
         return np.concatenate([nums, np.asarray(strs, dtype=object)])
 
-    def sort_tuples(values):
-        # sorts tuples with mixed values. can handle nan vs string comparisons.
-        def cmp_func(index_x, index_y):
-            x = values[index_x]
-            y = values[index_y]
-            # shortcut loop in case both tuples are the same
-            if x == y:
-                return 0
-            # lexicographic sorting
-            for i in range(max(len(x), len(y))):
-                # check if the tuples have different lengths (shorter tuples
-                # first)
-                if i >= len(x):
-                    return -1
-                if i >= len(y):
-                    return +1
-                x_is_na = isna(x[i])
-                y_is_na = isna(y[i])
-                # values are the same -> resolve tie with next element
-                if (x_is_na and y_is_na) or (x[i] == y[i]):
-                    continue
-                # check for nan values (sort nan to the end)
-                if x_is_na and not y_is_na:
-                    return +1
-                if not x_is_na and y_is_na:
-                    return -1
-                # normal greater/less than comparison
-                if x[i] < y[i]:
-                    return -1
-                return +1
-            # both values are the same (should already have been caught)
-            return 0
-
-        ixs = np.arange(len(values))
-        ixs = sorted(ixs, key=functools.cmp_to_key(cmp_func))
-        return values[ixs]
+    def sort_tuples(values: np.ndarray[tuple]):
+        # convert array of tuples (1d) to array or array (2d).
+        # we need to keep the columns separately as they contain different
+        # types and nans (can't use `np.sort` as it may fail when str and nan
+        # are mixed in a column as types cannot be compared).
+        from pandas.core.sorting import lexsort_indexer
+        from pandas.core.internals.construction import to_arrays
+
+        arrays, _ = to_arrays(values, None)
+        indexer = lexsort_indexer(arrays, orders=True)
+        return values[indexer]
 
     sorter = None
 

From 95670f1c4719900a540db77aa7e320ee61a6622a Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Sat, 31 Oct 2020 16:37:49 +1100
Subject: [PATCH 11/14] Add ticket ref to test case

---
 pandas/tests/indexing/multiindex/test_multiindex.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index be0f8e9617fb3..f392cb52cea90 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -86,6 +86,8 @@ def test_nested_tuples_duplicates(self):
 
 
 def test_combine_first_with_nan_multiindex():
+    # GH#36562
+
     mi1 = MultiIndex.from_arrays(
         [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
     )

From d9dcd22a269cbab2313cfc7ffae251a189815a03 Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Sat, 31 Oct 2020 17:51:11 +1100
Subject: [PATCH 12/14] sort import

---
 pandas/core/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6cd9a6f75674e..b67dd7d9b6537 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2075,8 +2075,8 @@ def sort_tuples(values: np.ndarray[tuple]):
         # we need to keep the columns separately as they contain different
         # types and nans (can't use `np.sort` as it may fail when str and nan
         # are mixed in a column as types cannot be compared).
-        from pandas.core.sorting import lexsort_indexer
         from pandas.core.internals.construction import to_arrays
+        from pandas.core.sorting import lexsort_indexer
 
         arrays, _ = to_arrays(values, None)
         indexer = lexsort_indexer(arrays, orders=True)

From 66c30c74fab35f25aa8c6279fc8bf7b4e194cd8a Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Sun, 1 Nov 2020 07:44:07 +1100
Subject: [PATCH 13/14] Address review comments * factor out inner functions *
 relocated test case * simplified try/except

---
 pandas/core/algorithms.py                     | 59 ++++++++++---------
 .../tests/frame/methods/test_combine_first.py | 31 +++++++++-
 .../indexing/multiindex/test_multiindex.py    | 29 ---------
 3 files changed, 61 insertions(+), 58 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b67dd7d9b6537..6f21835ef1def 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2063,45 +2063,25 @@ def safe_sort(
         dtype, _ = infer_dtype_from_array(values)
         values = np.asarray(values, dtype=dtype)
 
-    def sort_mixed(values):
-        # order ints before strings, safe in py3
-        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-        nums = np.sort(values[~str_pos])
-        strs = np.sort(values[str_pos])
-        return np.concatenate([nums, np.asarray(strs, dtype=object)])
-
-    def sort_tuples(values: np.ndarray[tuple]):
-        # convert array of tuples (1d) to array or array (2d).
-        # we need to keep the columns separately as they contain different
-        # types and nans (can't use `np.sort` as it may fail when str and nan
-        # are mixed in a column as types cannot be compared).
-        from pandas.core.internals.construction import to_arrays
-        from pandas.core.sorting import lexsort_indexer
-
-        arrays, _ = to_arrays(values, None)
-        indexer = lexsort_indexer(arrays, orders=True)
-        return values[indexer]
-
     sorter = None
 
     if (
         not is_extension_array_dtype(values)
         and lib.infer_dtype(values, skipna=False) == "mixed-integer"
     ):
-        ordered = sort_mixed(values)
+        ordered = _sort_mixed(values)
     else:
         try:
             sorter = values.argsort()
             ordered = values.take(sorter)
         except TypeError:
-            # try this anyway
-            try:
-                ordered = sort_mixed(values)
-            except TypeError:
-                if values.size and isinstance(values[0], tuple):
-                    ordered = sort_tuples(values)
-                else:
-                    raise
+            # Previous sorters failed or were not applicable, try `_sort_mixed`
+            # which would work, but which fails for special case of 1d arrays
+            # with tuples.
+            if values.size and isinstance(values[0], tuple):
+                ordered = _sort_tuples(values)
+            else:
+                ordered = _sort_mixed(values)
 
     # codes:
 
@@ -2148,3 +2128,26 @@ def sort_tuples(values: np.ndarray[tuple]):
         np.putmask(new_codes, mask, na_sentinel)
 
     return ordered, ensure_platform_int(new_codes)
+
+
+def _sort_mixed(values):
+    """ order ints before strings in 1d arrays, safe in py3 """
+    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos])
+    strs = np.sort(values[str_pos])
+    return np.concatenate([nums, np.asarray(strs, dtype=object)])
+
+
+def _sort_tuples(values: np.ndarray[tuple]):
+    """
+    Convert array of tuples (1d) to array or array (2d).
+    We need to keep the columns separately as they contain different types and
+    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
+    column as types cannot be compared).
+    """
+    from pandas.core.internals.construction import to_arrays
+    from pandas.core.sorting import lexsort_indexer
+
+    arrays, _ = to_arrays(values, None)
+    indexer = lexsort_indexer(arrays, orders=True)
+    return values[indexer]
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index d1f38d90547fd..7c1fd95af03b8 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -4,7 +4,7 @@
 import pytest
 
 import pandas as pd
-from pandas import DataFrame, Index, Series
+from pandas import DataFrame, Index, MultiIndex, Series
 import pandas._testing as tm
 
 
@@ -353,3 +353,32 @@ def test_combine_first_with_asymmetric_other(self, val):
         exp = DataFrame({"isBool": [True], "isNum": [val]})
 
         tm.assert_frame_equal(res, exp)
+
+
+def test_combine_first_with_nan_multiindex():
+    # gh-36562
+
+    mi1 = MultiIndex.from_arrays(
+        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
+    )
+    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
+    mi2 = MultiIndex.from_arrays(
+        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
+    )
+    s = Series([1, 2, 3, 4, 5, 6], index=mi2)
+    res = df.combine_first(DataFrame({"d": s}))
+    mi_expected = MultiIndex.from_arrays(
+        [
+            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
+            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
+        ],
+        names=["a", "b"],
+    )
+    expected = DataFrame(
+        {
+            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
+            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
+        },
+        index=mi_expected,
+    )
+    tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index f392cb52cea90..a3b8d66c92024 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -83,32 +83,3 @@ def test_nested_tuples_duplicates(self):
         df3 = df.copy(deep=True)
         df3.loc[[(dti[0], "a")], "c2"] = 1.0
         tm.assert_frame_equal(df3, expected)
-
-
-def test_combine_first_with_nan_multiindex():
-    # GH#36562
-
-    mi1 = MultiIndex.from_arrays(
-        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
-    )
-    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
-    mi2 = MultiIndex.from_arrays(
-        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
-    )
-    s = Series([1, 2, 3, 4, 5, 6], index=mi2)
-    res = df.combine_first(DataFrame({"d": s}))
-    mi_expected = MultiIndex.from_arrays(
-        [
-            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
-            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
-        ],
-        names=["a", "b"],
-    )
-    expected = DataFrame(
-        {
-            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
-            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
-        },
-        index=mi_expected,
-    )
-    tm.assert_frame_equal(res, expected)

From 9d03dc35fcc631784c795695809d7538ebaad7ff Mon Sep 17 00:00:00 2001
From: ssche <sven.schellenberg@paradynsystems.com>
Date: Wed, 4 Nov 2020 10:53:56 +1100
Subject: [PATCH 14/14] Fixed: committed file was in merge state

---
 .../tests/frame/methods/test_combine_first.py | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index b5289e1e58f48..08c4293323500 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -354,7 +354,18 @@ def test_combine_first_with_asymmetric_other(self, val):
 
         tm.assert_frame_equal(res, exp)
 
-<<<<<<< HEAD
+    def test_combine_first_string_dtype_only_na(self):
+        # GH: 37519
+        df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string")
+        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string")
+        df.set_index(["a", "b"], inplace=True)
+        df2.set_index(["a", "b"], inplace=True)
+        result = df.combine_first(df2)
+        expected = DataFrame(
+            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
+        ).set_index(["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
 
 def test_combine_first_with_nan_multiindex():
     # gh-36562
@@ -383,16 +394,3 @@ def test_combine_first_with_nan_multiindex():
         index=mi_expected,
     )
     tm.assert_frame_equal(res, expected)
-=======
-    def test_combine_first_string_dtype_only_na(self):
-        # GH: 37519
-        df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string")
-        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string")
-        df.set_index(["a", "b"], inplace=True)
-        df2.set_index(["a", "b"], inplace=True)
-        result = df.combine_first(df2)
-        expected = DataFrame(
-            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
-        ).set_index(["a", "b"])
-        tm.assert_frame_equal(result, expected)
->>>>>>> upstream/master