From 677ff7a9a42b186a22791306e7ce85d093e33a52 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 11 Sep 2022 13:29:10 +0200 Subject: [PATCH 1/6] ENH: Keep dtypes in MultiIndex.union without NAs --- pandas/_libs/lib.pyi | 2 +- pandas/_libs/lib.pyx | 56 ++++++++++++++---------------------- pandas/core/indexes/multi.py | 14 +++++++-- 3 files changed, 35 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 77d3cbe92bef9..079f3ae5546be 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -59,7 +59,7 @@ def is_bool_array(values: np.ndarray, skipna: bool = ...): ... def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... -def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ... +def fast_unique_multiple(left: np.ndarray, right: np.ndarray) -> list: ... def map_infer( arr: np.ndarray, f: Callable[[Any], Any], diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ec7c3d61566dc..5abee4fd0cc3f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,9 +1,7 @@ from collections import abc from decimal import Decimal from enum import Enum -import inspect from typing import Literal -import warnings cimport cython from cpython.datetime cimport ( @@ -31,8 +29,6 @@ from cython cimport ( floating, ) -from pandas.util._exceptions import find_stack_level - import_datetime() import numpy as np @@ -314,51 +310,43 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True): +def fast_unique_multiple(ndarray left, ndarray right): """ - Generate a list of unique values from a list of arrays. + Generate a list indices we have to add to the left to get the union + of both arrays. Parameters ---------- - list : array-like - List of array-like objects. - sort : bool - Whether or not to sort the resulting unique list. + left : np.ndarray + Left array that is used as base. + + right : np.ndarray + right array that is checked for values that are not in left. Returns ------- - list of unique values + list of indices that we have to add to the left array. """ cdef: - ndarray[object] buf - Py_ssize_t k = len(arrays) Py_ssize_t i, j, n - list uniques = [] + list indices = [] dict table = {} object val, stub = 0 - for i in range(k): - buf = arrays[i] - n = len(buf) - for j in range(n): - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) + n = len(left) + for j in range(n): + val = left[j] + if val not in table: + table[val] = stub - if sort is None: - try: - uniques.sort() - except TypeError: - warnings.warn( - "The values in the array are unorderable. " - "Pass `sort=False` to suppress this warning.", - RuntimeWarning, - stacklevel=find_stack_level(inspect.currentframe()), - ) - pass + n = len(right) + for j in range(n): + val = right[j] + if val not in table: + table[val] = stub + indices.append(j) - return uniques + return indices @cython.wraparound(False) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 13c3b9200371f..e1ce4bb454fc3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3644,11 +3644,21 @@ def _union(self, other, sort) -> MultiIndex: # This is only necessary if both sides have nans or one has dups, # fast_unique_multiple is faster result = super()._union(other, sort) + return MultiIndex.from_arrays( + zip(*result), sortorder=None, names=result_names + ) + else: rvals = other._values.astype(object, copy=False) - result = lib.fast_unique_multiple([self._values, rvals], sort=sort) + right_missing = lib.fast_unique_multiple(self._values, rvals) + if right_missing: + result = self.append(other.take(right_missing)) + else: + result = self._get_reconciled_name_object(other) - return MultiIndex.from_arrays(zip(*result), sortorder=None, names=result_names) + if sort is None: + result = algos.safe_sort(result) + return result def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) From 119ed35b7ccc9da891fca76036cc66e25f146493 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 11 Sep 2022 15:00:08 +0200 Subject: [PATCH 2/6] Add tests --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/core/indexes/multi.py | 14 +++++++-- pandas/tests/indexes/multi/test_setops.py | 35 +++++++++++++++++++---- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index ee5085fd9ad89..e67213d2d281b 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -102,6 +102,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`) +- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`50000`) - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e1ce4bb454fc3..9862e5222d534 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3638,10 +3638,9 @@ def _union(self, other, sort) -> MultiIndex: if ( any(-1 in code for code in self.codes) and any(-1 in code for code in other.codes) - or self.has_duplicates or other.has_duplicates ): - # This is only necessary if both sides have nans or one has dups, + # This is only necessary if both sides have nans or other has dups, # fast_unique_multiple is faster result = super()._union(other, sort) return MultiIndex.from_arrays( @@ -3657,7 +3656,16 @@ def _union(self, other, sort) -> MultiIndex: result = self._get_reconciled_name_object(other) if sort is None: - result = algos.safe_sort(result) + try: + result = result.sort_values() + except TypeError: + warnings.warn( + "The values in the array are unorderable. " + "Pass `sort=False` to suppress this warning.", + RuntimeWarning, + stacklevel=find_stack_level(inspect.currentframe()), + ) + pass return result def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index e15809a751b73..3126aa77abb81 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -277,7 +277,9 @@ def test_union_with_regular_index(idx): msg = "The values in the array are unorderable" with tm.assert_produces_warning(RuntimeWarning, match=msg): result2 = idx.union(other) - assert result.equals(result2) + # This is more consistent now, if sorting fails then we don't sort at all + # in the MultiIndex case. + assert not result.equals(result2) def test_intersection(idx, sort): @@ -525,6 +527,26 @@ def test_union_nan_got_duplicated(): tm.assert_index_equal(result, mi2) +@pytest.mark.parametrize("val", [4, 1]) +def test_union_keep_ea_dtype(any_numeric_ea_dtype, val): + # GH#48498 + + arr1 = Series([val, 2], dtype=any_numeric_ea_dtype) + arr2 = Series([2, 1], dtype=any_numeric_ea_dtype) + midx = MultiIndex.from_arrays([arr1, [1, 2]], names=["a", None]) + midx2 = MultiIndex.from_arrays([arr2, [2, 1]]) + result = midx.union(midx2) + if val == 4: + expected = MultiIndex.from_arrays( + [Series([1, 2, 4], dtype=any_numeric_ea_dtype), [1, 2, 1]] + ) + else: + expected = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [1, 2]] + ) + tm.assert_index_equal(result, expected) + + def test_union_duplicates(index, request): # GH#38977 if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): @@ -534,18 +556,19 @@ def test_union_duplicates(index, request): values = index.unique().values.tolist() mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) - result = mi1.union(mi2) + result = mi2.union(mi1) expected = mi2.sort_values() + tm.assert_index_equal(result, expected) + if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all(): # GH#47294 - union uses lib.fast_zip, converting data to Python integers # and loses type information. Result is then unsigned only when values are - # sufficiently large to require unsigned dtype. + # sufficiently large to require unsigned dtype. This happens only if other + # has dups orone of both have missing values expected = expected.set_levels( [expected.levels[0].astype(int), expected.levels[1]] ) - tm.assert_index_equal(result, expected) - - result = mi2.union(mi1) + result = mi1.union(mi2) tm.assert_index_equal(result, expected) From de89d9449ac10fcab16315bfcc85264f2e1a0fc1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 11 Sep 2022 15:01:41 +0200 Subject: [PATCH 3/6] Fix --- pandas/tests/indexes/multi/test_setops.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 3126aa77abb81..31f93c29b29e9 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -261,12 +261,6 @@ def test_union(idx, sort): assert result.equals(idx) -@pytest.mark.xfail( - # This test was commented out from Oct 2011 to Dec 2021, may no longer - # be relevant. - reason="Length of names must match number of levels in MultiIndex", - raises=ValueError, -) def test_union_with_regular_index(idx): other = Index(["A", "B", "C"]) From a40e1d26c9b2cf74cdb78569db6a8f04c4ffde5f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 11 Sep 2022 15:03:54 +0200 Subject: [PATCH 4/6] Add gh refs --- doc/source/whatsnew/v1.6.0.rst | 3 ++- pandas/tests/indexes/multi/test_setops.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index e67213d2d281b..bf6d46e6757b2 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -102,7 +102,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`) -- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`50000`) +- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`) - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) @@ -169,6 +169,7 @@ Missing MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`) +- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48505`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) - diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 31f93c29b29e9..54bdedac55801 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -523,7 +523,7 @@ def test_union_nan_got_duplicated(): @pytest.mark.parametrize("val", [4, 1]) def test_union_keep_ea_dtype(any_numeric_ea_dtype, val): - # GH#48498 + # GH#48505 arr1 = Series([val, 2], dtype=any_numeric_ea_dtype) arr2 = Series([2, 1], dtype=any_numeric_ea_dtype) From 794d257b37b2471c53e5a7d8e5a8147715679d47 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 11 Sep 2022 21:40:22 +0200 Subject: [PATCH 5/6] Fix docstring --- pandas/_libs/lib.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5abee4fd0cc3f..91afd203ce415 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -319,7 +319,6 @@ def fast_unique_multiple(ndarray left, ndarray right): ---------- left : np.ndarray Left array that is used as base. - right : np.ndarray right array that is checked for values that are not in left. From 731583e1b85cf271f79c7dc564fadb53afda1100 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 19:59:42 +0200 Subject: [PATCH 6/6] Refactor --- pandas/_libs/lib.pyx | 10 +++++----- pandas/tests/indexes/multi/test_setops.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 91afd203ce415..f1473392147f9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -310,7 +310,7 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(ndarray left, ndarray right): +def fast_unique_multiple(ndarray left, ndarray right) -> list: """ Generate a list indices we have to add to the left to get the union of both arrays. @@ -321,28 +321,28 @@ def fast_unique_multiple(ndarray left, ndarray right): Left array that is used as base. right : np.ndarray right array that is checked for values that are not in left. + right can not have duplicates. Returns ------- list of indices that we have to add to the left array. """ cdef: - Py_ssize_t i, j, n + Py_ssize_t j, n list indices = [] - dict table = {} + set table = set() object val, stub = 0 n = len(left) for j in range(n): val = left[j] if val not in table: - table[val] = stub + table.add(val) n = len(right) for j in range(n): val = right[j] if val not in table: - table[val] = stub indices.append(j) return indices diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 54bdedac55801..84e750763b96b 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -558,7 +558,7 @@ def test_union_duplicates(index, request): # GH#47294 - union uses lib.fast_zip, converting data to Python integers # and loses type information. Result is then unsigned only when values are # sufficiently large to require unsigned dtype. This happens only if other - # has dups orone of both have missing values + # has dups or one of both have missing values expected = expected.set_levels( [expected.levels[0].astype(int), expected.levels[1]] )