From 1320ff12ce4f7d58dab43396df7f3d2e9e1c4792 Mon Sep 17 00:00:00 2001 From: ssche Date: Tue, 13 Oct 2020 20:44:17 +1100 Subject: [PATCH 01/14] Fixed #36562 * Use special sorting comparator for tuple arrays which can be created when consolidate_first is called on DataFrames with MultiIndex which contain nan and string values --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/algorithms.py | 48 +++++++++++++++++-- .../indexing/multiindex/test_multiindex.py | 23 +++++++++ pandas/tests/test_sorting.py | 7 +++ 4 files changed, 76 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ddee06aeab779..5e5af52678b37 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -363,7 +363,8 @@ MultiIndex ^^^^^^^^^^ - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) -- +- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` with message ``"'<' not supported between instances of 'float' and 'str'"`` (:issue:`36562`) + I/O ^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d2005d46bbbf1..811db799c46e7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -5,6 +5,7 @@ from __future__ import annotations import operator +import functools from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast from warnings import catch_warnings, simplefilter, warn @@ -2055,13 +2056,52 @@ def sort_mixed(values): strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) + def sort_tuples(values): + # sorts tuples with mixed values. can handle nan vs string comparisons. + def cmp_func(index_x, index_y): + x = values[index_x] + y = values[index_y] + if x == y: + return 0 + len_x = len(x) + len_y = len(y) + for i in range(max(len_x, len_y)): + # check if the tuples have different lengths (shorter tuples + # first) + if i >= len_x: + return -1 + if i >= len_y: + return +1 + x_i_na = isna(x[i]) + y_i_na = isna(y[i]) + # values are the same -> resolve tie with next element + if (x_i_na and y_i_na) or (x[i] == y[i]): + continue + # check for nan values (sort nan to the end which is consistent + # with numpy + if x_i_na and not y_i_na: + return +1 + if not x_i_na and y_i_na: + return -1 + # normal greater/less than comparison + if x[i] < y[i]: + return -1 + return +1 + return 0 + + ixs = np.arange(len(values)) + ixs = sorted(ixs, key=functools.cmp_to_key(cmp_func)) + return values[ixs] + sorter = None - if ( - not is_extension_array_dtype(values) - and lib.infer_dtype(values, skipna=False) == "mixed-integer" - ): + + ext_arr = is_extension_array_dtype(values) + if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer": # unorderable in py3 if mixed str/int ordered = sort_mixed(values) + elif not ext_arr and values.size and isinstance(values[0], tuple): + # 1-D arrays with tuples of potentially mixed type (solves GH36562) + ordered = sort_tuples(values) else: try: sorter = values.argsort() diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 4565d79c632de..1bc984d1dc8c3 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -91,3 +91,26 @@ def test_multiindex_get_loc_list_raises(self): msg = "unhashable type" with pytest.raises(TypeError, match=msg): idx.get_loc([]) + + +def test_combine_first_with_nan_index(): + mi1 = pd.MultiIndex.from_arrays( + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], + names=["a", "b"] + ) + df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = pd.MultiIndex.from_arrays( + [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] + ) + s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2) + df_combined = df.combine_first(pd.DataFrame({"col": s})) + mi_expected = pd.MultiIndex.from_arrays([ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6,] + ], names=["a", "b"]) + assert (df_combined.index == mi_expected).all() + exp_col = np.asarray( + [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan] + ) + act_col = df_combined['col'].values + assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index deb7434694d01..3553456d58e03 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel): expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(codes, expected_codes) + + +def test_mixed_str_nan(): + values = np.array(["b", np.nan, "a", "b"], dtype=object) + result = safe_sort(values) + expected = np.array([np.nan, "a", "b", "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) From 1f76d216f8cf4b4a8097bf947a8947dddcf6f7ee Mon Sep 17 00:00:00 2001 From: ssche Date: Tue, 13 Oct 2020 20:49:58 +1100 Subject: [PATCH 02/14] Fixed flake issue --- pandas/tests/indexing/multiindex/test_multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 1bc984d1dc8c3..c1028ebcda95b 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -106,7 +106,7 @@ def test_combine_first_with_nan_index(): df_combined = df.combine_first(pd.DataFrame({"col": s})) mi_expected = pd.MultiIndex.from_arrays([ ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], - [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6,] + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6] ], names=["a", "b"]) assert (df_combined.index == mi_expected).all() exp_col = np.asarray( From 7076841ba4f5b50dddc20daf5e980c8e20f2f273 Mon Sep 17 00:00:00 2001 From: ssche Date: Tue, 13 Oct 2020 21:22:59 +1100 Subject: [PATCH 03/14] fixed linting errors --- pandas/core/algorithms.py | 2 +- .../tests/indexing/multiindex/test_multiindex.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 811db799c46e7..6a2e4e34f92cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,8 +4,8 @@ """ from __future__ import annotations -import operator import functools +import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast from warnings import catch_warnings, simplefilter, warn diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index c1028ebcda95b..8881266a3f3ce 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -95,8 +95,7 @@ def test_multiindex_get_loc_list_raises(self): def test_combine_first_with_nan_index(): mi1 = pd.MultiIndex.from_arrays( - [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], - names=["a", "b"] + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] ) df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) mi2 = pd.MultiIndex.from_arrays( @@ -104,13 +103,16 @@ def test_combine_first_with_nan_index(): ) s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2) df_combined = df.combine_first(pd.DataFrame({"col": s})) - mi_expected = pd.MultiIndex.from_arrays([ - ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], - [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6] - ], names=["a", "b"]) + mi_expected = pd.MultiIndex.from_arrays( + [ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], + ], + names=["a", "b"], + ) assert (df_combined.index == mi_expected).all() exp_col = np.asarray( [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan] ) - act_col = df_combined['col'].values + act_col = df_combined["col"].values assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True) From 225675db4433f523699a69042175c55bfd07c99d Mon Sep 17 00:00:00 2001 From: ssche Date: Wed, 14 Oct 2020 11:19:25 +1100 Subject: [PATCH 04/14] Addressed review comments --- pandas/core/algorithms.py | 32 +++++++++---------- .../indexing/multiindex/test_multiindex.py | 16 ++++++---- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6a2e4e34f92cc..631d3a5e4c9cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2061,32 +2061,32 @@ def sort_tuples(values): def cmp_func(index_x, index_y): x = values[index_x] y = values[index_y] + # shortcut loop in case both tuples are the same if x == y: return 0 - len_x = len(x) - len_y = len(y) - for i in range(max(len_x, len_y)): + # lexicographic sorting + for i in range(max(len(x), len(y))): # check if the tuples have different lengths (shorter tuples # first) - if i >= len_x: + if i >= len(x): return -1 - if i >= len_y: + if i >= len(y): return +1 - x_i_na = isna(x[i]) - y_i_na = isna(y[i]) + x_is_na = isna(x[i]) + y_is_na = isna(y[i]) # values are the same -> resolve tie with next element - if (x_i_na and y_i_na) or (x[i] == y[i]): + if (x_is_na and y_is_na) or (x[i] == y[i]): continue - # check for nan values (sort nan to the end which is consistent - # with numpy - if x_i_na and not y_i_na: + # check for nan values (sort nan to the end) + if x_is_na and not y_is_na: return +1 - if not x_i_na and y_i_na: + if not x_is_na and y_is_na: return -1 # normal greater/less than comparison if x[i] < y[i]: return -1 return +1 + # both values are the same (should already have been caught) return 0 ixs = np.arange(len(values)) @@ -2095,12 +2095,10 @@ def cmp_func(index_x, index_y): sorter = None - ext_arr = is_extension_array_dtype(values) - if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer": - # unorderable in py3 if mixed str/int + is_ea = is_extension_array_dtype(values) + if not is_ea and lib.infer_dtype(values, skipna=False) == "mixed-integer": ordered = sort_mixed(values) - elif not ext_arr and values.size and isinstance(values[0], tuple): - # 1-D arrays with tuples of potentially mixed type (solves GH36562) + elif not is_ea and values.size and isinstance(values[0], tuple): ordered = sort_tuples(values) else: try: diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 8881266a3f3ce..fd3f85c1c3e72 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -93,7 +93,7 @@ def test_multiindex_get_loc_list_raises(self): idx.get_loc([]) -def test_combine_first_with_nan_index(): +def test_combine_first_with_nan_multiindex(): mi1 = pd.MultiIndex.from_arrays( [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] ) @@ -102,7 +102,7 @@ def test_combine_first_with_nan_index(): [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] ) s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2) - df_combined = df.combine_first(pd.DataFrame({"col": s})) + res = df.combine_first(pd.DataFrame({"d": s})) mi_expected = pd.MultiIndex.from_arrays( [ ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], @@ -110,9 +110,11 @@ def test_combine_first_with_nan_index(): ], names=["a", "b"], ) - assert (df_combined.index == mi_expected).all() - exp_col = np.asarray( - [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan] + expected = pd.DataFrame( + { + "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], + "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], + }, + index=mi_expected, ) - act_col = df_combined["col"].values - assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True) + tm.assert_frame_equal(res, expected) From 2677166fe280a999cb81dc3883074a54dd932315 Mon Sep 17 00:00:00 2001 From: ssche Date: Fri, 16 Oct 2020 11:05:21 +1100 Subject: [PATCH 05/14] Make `sort_tuples` last resort sorting strategy to allow faster sorters to execute first --- pandas/core/algorithms.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 631d3a5e4c9cc..bfb36b676cbed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1023,10 +1023,11 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] - ).any() or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) if to_raise: raise OverflowError("Overflow in int64 addition") @@ -2095,18 +2096,24 @@ def cmp_func(index_x, index_y): sorter = None - is_ea = is_extension_array_dtype(values) - if not is_ea and lib.infer_dtype(values, skipna=False) == "mixed-integer": + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): ordered = sort_mixed(values) - elif not is_ea and values.size and isinstance(values[0], tuple): - ordered = sort_tuples(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway - ordered = sort_mixed(values) + try: + ordered = sort_mixed(values) + except TypeError: + if values.size and isinstance(values[0], tuple): + ordered = sort_tuples(values) + else: + raise # codes: From 3688238654368efef6aee016c3661d755a7ece6d Mon Sep 17 00:00:00 2001 From: ssche Date: Fri, 16 Oct 2020 11:22:01 +1100 Subject: [PATCH 06/14] manually apply pandas black changes (from CI) --- pandas/core/algorithms.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bfb36b676cbed..c3f0d6037157b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1023,11 +1023,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() - or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - ) + (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] + ).any() or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() if to_raise: raise OverflowError("Overflow in int64 addition") From 8ae927913cd6b9f5c27f20ce0b4819ec6ac2b9d6 Mon Sep 17 00:00:00 2001 From: ssche Date: Thu, 22 Oct 2020 09:52:42 +1100 Subject: [PATCH 07/14] Modified changelog message as per review comments --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0540015bfb3d4..e7c6418631aa6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -412,7 +412,7 @@ MultiIndex - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) -- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` with message ``"'<' not supported between instances of 'float' and 'str'"`` (:issue:`36562`) +- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - I/O From 7b7d6f8d9d34fdbd1d6cccfbe43bac07dd06e0b3 Mon Sep 17 00:00:00 2001 From: ssche Date: Thu, 22 Oct 2020 14:26:36 +1100 Subject: [PATCH 08/14] Removed pd.xyz --- pandas/tests/indexing/multiindex/test_multiindex.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index bab0a30815202..f0367817db5a0 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -94,23 +94,23 @@ def test_multiindex_get_loc_list_raises(self): def test_combine_first_with_nan_multiindex(): - mi1 = pd.MultiIndex.from_arrays( + mi1 = MultiIndex.from_arrays( [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] ) - df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) - mi2 = pd.MultiIndex.from_arrays( + df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = MultiIndex.from_arrays( [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] ) - s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2) + s = Series([1, 2, 3, 4, 5, 6], index=mi2) res = df.combine_first(pd.DataFrame({"d": s})) - mi_expected = pd.MultiIndex.from_arrays( + mi_expected = MultiIndex.from_arrays( [ ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], ], names=["a", "b"], ) - expected = pd.DataFrame( + expected = DataFrame( { "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], From 8cbfa015724c85cc084d203ad32f4f33cd6c4892 Mon Sep 17 00:00:00 2001 From: ssche Date: Thu, 22 Oct 2020 15:46:50 +1100 Subject: [PATCH 09/14] forgot `pd` --- pandas/tests/indexing/multiindex/test_multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index f0367817db5a0..244888836c733 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -102,7 +102,7 @@ def test_combine_first_with_nan_multiindex(): [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] ) s = Series([1, 2, 3, 4, 5, 6], index=mi2) - res = df.combine_first(pd.DataFrame({"d": s})) + res = df.combine_first(DataFrame({"d": s})) mi_expected = MultiIndex.from_arrays( [ ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], From 92e1e3384a1c1410bd59af90ca853aba1e09bdb9 Mon Sep 17 00:00:00 2001 From: ssche Date: Sat, 31 Oct 2020 16:31:31 +1100 Subject: [PATCH 10/14] Changed sorting algorithm by using expanded array * Extract column arrays and use pandas' internal functions to obtain index which sorts the array of tuples * Add function annotations to document expected argument types of `sort_tuples()` --- pandas/core/algorithms.py | 48 +++++++++------------------------------ 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7e94765127c95..bd6eac9794656 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,7 +4,6 @@ """ from __future__ import annotations -import functools import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast @@ -2071,42 +2070,17 @@ def sort_mixed(values): strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) - def sort_tuples(values): - # sorts tuples with mixed values. can handle nan vs string comparisons. - def cmp_func(index_x, index_y): - x = values[index_x] - y = values[index_y] - # shortcut loop in case both tuples are the same - if x == y: - return 0 - # lexicographic sorting - for i in range(max(len(x), len(y))): - # check if the tuples have different lengths (shorter tuples - # first) - if i >= len(x): - return -1 - if i >= len(y): - return +1 - x_is_na = isna(x[i]) - y_is_na = isna(y[i]) - # values are the same -> resolve tie with next element - if (x_is_na and y_is_na) or (x[i] == y[i]): - continue - # check for nan values (sort nan to the end) - if x_is_na and not y_is_na: - return +1 - if not x_is_na and y_is_na: - return -1 - # normal greater/less than comparison - if x[i] < y[i]: - return -1 - return +1 - # both values are the same (should already have been caught) - return 0 - - ixs = np.arange(len(values)) - ixs = sorted(ixs, key=functools.cmp_to_key(cmp_func)) - return values[ixs] + def sort_tuples(values: np.ndarray[tuple]): + # convert array of tuples (1d) to array or array (2d). + # we need to keep the columns separately as they contain different + # types and nans (can't use `np.sort` as it may fail when str and nan + # are mixed in a column as types cannot be compared). + from pandas.core.sorting import lexsort_indexer + from pandas.core.internals.construction import to_arrays + + arrays, _ = to_arrays(values, None) + indexer = lexsort_indexer(arrays, orders=True) + return values[indexer] sorter = None From 95670f1c4719900a540db77aa7e320ee61a6622a Mon Sep 17 00:00:00 2001 From: ssche Date: Sat, 31 Oct 2020 16:37:49 +1100 Subject: [PATCH 11/14] Add ticket ref to test case --- pandas/tests/indexing/multiindex/test_multiindex.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index be0f8e9617fb3..f392cb52cea90 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -86,6 +86,8 @@ def test_nested_tuples_duplicates(self): def test_combine_first_with_nan_multiindex(): + # GH#36562 + mi1 = MultiIndex.from_arrays( [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] ) From d9dcd22a269cbab2313cfc7ffae251a189815a03 Mon Sep 17 00:00:00 2001 From: ssche Date: Sat, 31 Oct 2020 17:51:11 +1100 Subject: [PATCH 12/14] sort import --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6cd9a6f75674e..b67dd7d9b6537 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2075,8 +2075,8 @@ def sort_tuples(values: np.ndarray[tuple]): # we need to keep the columns separately as they contain different # types and nans (can't use `np.sort` as it may fail when str and nan # are mixed in a column as types cannot be compared). - from pandas.core.sorting import lexsort_indexer from pandas.core.internals.construction import to_arrays + from pandas.core.sorting import lexsort_indexer arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) From 66c30c74fab35f25aa8c6279fc8bf7b4e194cd8a Mon Sep 17 00:00:00 2001 From: ssche Date: Sun, 1 Nov 2020 07:44:07 +1100 Subject: [PATCH 13/14] Address review comments * factor out inner functions * relocated test case * simplified try/except --- pandas/core/algorithms.py | 59 ++++++++++--------- .../tests/frame/methods/test_combine_first.py | 31 +++++++++- .../indexing/multiindex/test_multiindex.py | 29 --------- 3 files changed, 61 insertions(+), 58 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b67dd7d9b6537..6f21835ef1def 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2063,45 +2063,25 @@ def safe_sort( dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) - def sort_mixed(values): - # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) - - def sort_tuples(values: np.ndarray[tuple]): - # convert array of tuples (1d) to array or array (2d). - # we need to keep the columns separately as they contain different - # types and nans (can't use `np.sort` as it may fail when str and nan - # are mixed in a column as types cannot be compared). - from pandas.core.internals.construction import to_arrays - from pandas.core.sorting import lexsort_indexer - - arrays, _ = to_arrays(values, None) - indexer = lexsort_indexer(arrays, orders=True) - return values[indexer] - sorter = None if ( not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == "mixed-integer" ): - ordered = sort_mixed(values) + ordered = _sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: - # try this anyway - try: - ordered = sort_mixed(values) - except TypeError: - if values.size and isinstance(values[0], tuple): - ordered = sort_tuples(values) - else: - raise + # Previous sorters failed or were not applicable, try `_sort_mixed` + # which would work, but which fails for special case of 1d arrays + # with tuples. + if values.size and isinstance(values[0], tuple): + ordered = _sort_tuples(values) + else: + ordered = _sort_mixed(values) # codes: @@ -2148,3 +2128,26 @@ def sort_tuples(values: np.ndarray[tuple]): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) + + +def _sort_mixed(values): + """ order ints before strings in 1d arrays, safe in py3 """ + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + +def _sort_tuples(values: np.ndarray[tuple]): + """ + Convert array of tuples (1d) to array or array (2d). + We need to keep the columns separately as they contain different types and + nans (can't use `np.sort` as it may fail when str and nan are mixed in a + column as types cannot be compared). + """ + from pandas.core.internals.construction import to_arrays + from pandas.core.sorting import lexsort_indexer + + arrays, _ = to_arrays(values, None) + indexer = lexsort_indexer(arrays, orders=True) + return values[indexer] diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index d1f38d90547fd..7c1fd95af03b8 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm @@ -353,3 +353,32 @@ def test_combine_first_with_asymmetric_other(self, val): exp = DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) + + +def test_combine_first_with_nan_multiindex(): + # gh-36562 + + mi1 = MultiIndex.from_arrays( + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] + ) + df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = MultiIndex.from_arrays( + [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] + ) + s = Series([1, 2, 3, 4, 5, 6], index=mi2) + res = df.combine_first(DataFrame({"d": s})) + mi_expected = MultiIndex.from_arrays( + [ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], + ], + names=["a", "b"], + ) + expected = DataFrame( + { + "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], + "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], + }, + index=mi_expected, + ) + tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index f392cb52cea90..a3b8d66c92024 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -83,32 +83,3 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) - - -def test_combine_first_with_nan_multiindex(): - # GH#36562 - - mi1 = MultiIndex.from_arrays( - [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] - ) - df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) - mi2 = MultiIndex.from_arrays( - [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] - ) - s = Series([1, 2, 3, 4, 5, 6], index=mi2) - res = df.combine_first(DataFrame({"d": s})) - mi_expected = MultiIndex.from_arrays( - [ - ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], - [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], - ], - names=["a", "b"], - ) - expected = DataFrame( - { - "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], - "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], - }, - index=mi_expected, - ) - tm.assert_frame_equal(res, expected) From 9d03dc35fcc631784c795695809d7538ebaad7ff Mon Sep 17 00:00:00 2001 From: ssche Date: Wed, 4 Nov 2020 10:53:56 +1100 Subject: [PATCH 14/14] Fixed: committed file was in merge state --- .../tests/frame/methods/test_combine_first.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b5289e1e58f48..08c4293323500 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -354,7 +354,18 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) -<<<<<<< HEAD + def test_combine_first_string_dtype_only_na(self): + # GH: 37519 + df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") + df.set_index(["a", "b"], inplace=True) + df2.set_index(["a", "b"], inplace=True) + result = df.combine_first(df2) + expected = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + ).set_index(["a", "b"]) + tm.assert_frame_equal(result, expected) + def test_combine_first_with_nan_multiindex(): # gh-36562 @@ -383,16 +394,3 @@ def test_combine_first_with_nan_multiindex(): index=mi_expected, ) tm.assert_frame_equal(res, expected) -======= - def test_combine_first_string_dtype_only_na(self): - # GH: 37519 - df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") - df.set_index(["a", "b"], inplace=True) - df2.set_index(["a", "b"], inplace=True) - result = df.combine_first(df2) - expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" - ).set_index(["a", "b"]) - tm.assert_frame_equal(result, expected) ->>>>>>> upstream/master