From 677ff7a9a42b186a22791306e7ce85d093e33a52 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 11 Sep 2022 13:29:10 +0200
Subject: [PATCH 1/6] ENH: Keep dtypes in MultiIndex.union without NAs

---
 pandas/_libs/lib.pyi         |  2 +-
 pandas/_libs/lib.pyx         | 56 ++++++++++++++----------------------
 pandas/core/indexes/multi.py | 14 +++++++--
 3 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 77d3cbe92bef9..079f3ae5546be 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -59,7 +59,7 @@ def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
 def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
 def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
 def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
-def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ...
+def fast_unique_multiple(left: np.ndarray, right: np.ndarray) -> list: ...
 def map_infer(
     arr: np.ndarray,
     f: Callable[[Any], Any],
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index ec7c3d61566dc..5abee4fd0cc3f 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1,9 +1,7 @@
 from collections import abc
 from decimal import Decimal
 from enum import Enum
-import inspect
 from typing import Literal
-import warnings
 
 cimport cython
 from cpython.datetime cimport (
@@ -31,8 +29,6 @@ from cython cimport (
     floating,
 )
 
-from pandas.util._exceptions import find_stack_level
-
 import_datetime()
 
 import numpy as np
@@ -314,51 +310,43 @@ def item_from_zerodim(val: object) -> object:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple(list arrays, sort: bool = True):
+def fast_unique_multiple(ndarray left, ndarray right):
     """
-    Generate a list of unique values from a list of arrays.
+    Generate a list indices we have to add to the left to get the union
+    of both arrays.
 
     Parameters
     ----------
-    list : array-like
-        List of array-like objects.
-    sort : bool
-        Whether or not to sort the resulting unique list.
+    left : np.ndarray
+        Left array that is used as base.
+
+    right : np.ndarray
+        right array that is checked for values that are not in left.
 
     Returns
     -------
-    list of unique values
+    list of indices that we have to add to the left array.
     """
     cdef:
-        ndarray[object] buf
-        Py_ssize_t k = len(arrays)
         Py_ssize_t i, j, n
-        list uniques = []
+        list indices = []
         dict table = {}
         object val, stub = 0
 
-    for i in range(k):
-        buf = arrays[i]
-        n = len(buf)
-        for j in range(n):
-            val = buf[j]
-            if val not in table:
-                table[val] = stub
-                uniques.append(val)
+    n = len(left)
+    for j in range(n):
+        val = left[j]
+        if val not in table:
+            table[val] = stub
 
-    if sort is None:
-        try:
-            uniques.sort()
-        except TypeError:
-            warnings.warn(
-                "The values in the array are unorderable. "
-                "Pass `sort=False` to suppress this warning.",
-                RuntimeWarning,
-                stacklevel=find_stack_level(inspect.currentframe()),
-            )
-            pass
+    n = len(right)
+    for j in range(n):
+        val = right[j]
+        if val not in table:
+            table[val] = stub
+            indices.append(j)
 
-    return uniques
+    return indices
 
 
 @cython.wraparound(False)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 13c3b9200371f..e1ce4bb454fc3 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3644,11 +3644,21 @@ def _union(self, other, sort) -> MultiIndex:
             # This is only necessary if both sides have nans or one has dups,
             # fast_unique_multiple is faster
             result = super()._union(other, sort)
+            return MultiIndex.from_arrays(
+                zip(*result), sortorder=None, names=result_names
+            )
+
         else:
             rvals = other._values.astype(object, copy=False)
-            result = lib.fast_unique_multiple([self._values, rvals], sort=sort)
+            right_missing = lib.fast_unique_multiple(self._values, rvals)
+            if right_missing:
+                result = self.append(other.take(right_missing))
+            else:
+                result = self._get_reconciled_name_object(other)
 
-        return MultiIndex.from_arrays(zip(*result), sortorder=None, names=result_names)
+            if sort is None:
+                result = algos.safe_sort(result)
+            return result
 
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         return is_object_dtype(dtype)

From 119ed35b7ccc9da891fca76036cc66e25f146493 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 11 Sep 2022 15:00:08 +0200
Subject: [PATCH 2/6] Add tests

---
 doc/source/whatsnew/v1.6.0.rst            |  1 +
 pandas/core/indexes/multi.py              | 14 +++++++--
 pandas/tests/indexes/multi/test_setops.py | 35 +++++++++++++++++++----
 3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index ee5085fd9ad89..e67213d2d281b 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -102,6 +102,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
+- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`50000`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index e1ce4bb454fc3..9862e5222d534 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3638,10 +3638,9 @@ def _union(self, other, sort) -> MultiIndex:
         if (
             any(-1 in code for code in self.codes)
             and any(-1 in code for code in other.codes)
-            or self.has_duplicates
             or other.has_duplicates
         ):
-            # This is only necessary if both sides have nans or one has dups,
+            # This is only necessary if both sides have nans or other has dups,
             # fast_unique_multiple is faster
             result = super()._union(other, sort)
             return MultiIndex.from_arrays(
@@ -3657,7 +3656,16 @@ def _union(self, other, sort) -> MultiIndex:
                 result = self._get_reconciled_name_object(other)
 
             if sort is None:
-                result = algos.safe_sort(result)
+                try:
+                    result = result.sort_values()
+                except TypeError:
+                    warnings.warn(
+                        "The values in the array are unorderable. "
+                        "Pass `sort=False` to suppress this warning.",
+                        RuntimeWarning,
+                        stacklevel=find_stack_level(inspect.currentframe()),
+                    )
+                    pass
             return result
 
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index e15809a751b73..3126aa77abb81 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -277,7 +277,9 @@ def test_union_with_regular_index(idx):
     msg = "The values in the array are unorderable"
     with tm.assert_produces_warning(RuntimeWarning, match=msg):
         result2 = idx.union(other)
-    assert result.equals(result2)
+    # This is more consistent now, if sorting fails then we don't sort at all
+    # in the MultiIndex case.
+    assert not result.equals(result2)
 
 
 def test_intersection(idx, sort):
@@ -525,6 +527,26 @@ def test_union_nan_got_duplicated():
     tm.assert_index_equal(result, mi2)
 
 
+@pytest.mark.parametrize("val", [4, 1])
+def test_union_keep_ea_dtype(any_numeric_ea_dtype, val):
+    # GH#48498
+
+    arr1 = Series([val, 2], dtype=any_numeric_ea_dtype)
+    arr2 = Series([2, 1], dtype=any_numeric_ea_dtype)
+    midx = MultiIndex.from_arrays([arr1, [1, 2]], names=["a", None])
+    midx2 = MultiIndex.from_arrays([arr2, [2, 1]])
+    result = midx.union(midx2)
+    if val == 4:
+        expected = MultiIndex.from_arrays(
+            [Series([1, 2, 4], dtype=any_numeric_ea_dtype), [1, 2, 1]]
+        )
+    else:
+        expected = MultiIndex.from_arrays(
+            [Series([1, 2], dtype=any_numeric_ea_dtype), [1, 2]]
+        )
+    tm.assert_index_equal(result, expected)
+
+
 def test_union_duplicates(index, request):
     # GH#38977
     if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)):
@@ -534,18 +556,19 @@ def test_union_duplicates(index, request):
     values = index.unique().values.tolist()
     mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
     mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
-    result = mi1.union(mi2)
+    result = mi2.union(mi1)
     expected = mi2.sort_values()
+    tm.assert_index_equal(result, expected)
+
     if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
         # GH#47294 - union uses lib.fast_zip, converting data to Python integers
         # and loses type information. Result is then unsigned only when values are
-        # sufficiently large to require unsigned dtype.
+        # sufficiently large to require unsigned dtype. This happens only if other
+        # has dups orone of both have missing values
         expected = expected.set_levels(
             [expected.levels[0].astype(int), expected.levels[1]]
         )
-    tm.assert_index_equal(result, expected)
-
-    result = mi2.union(mi1)
+    result = mi1.union(mi2)
     tm.assert_index_equal(result, expected)
 
 

From de89d9449ac10fcab16315bfcc85264f2e1a0fc1 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 11 Sep 2022 15:01:41 +0200
Subject: [PATCH 3/6] Fix

---
 pandas/tests/indexes/multi/test_setops.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 3126aa77abb81..31f93c29b29e9 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -261,12 +261,6 @@ def test_union(idx, sort):
         assert result.equals(idx)
 
 
-@pytest.mark.xfail(
-    # This test was commented out from Oct 2011 to Dec 2021, may no longer
-    #  be relevant.
-    reason="Length of names must match number of levels in MultiIndex",
-    raises=ValueError,
-)
 def test_union_with_regular_index(idx):
     other = Index(["A", "B", "C"])
 

From a40e1d26c9b2cf74cdb78569db6a8f04c4ffde5f Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 11 Sep 2022 15:03:54 +0200
Subject: [PATCH 4/6] Add gh refs

---
 doc/source/whatsnew/v1.6.0.rst            | 3 ++-
 pandas/tests/indexes/multi/test_setops.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index e67213d2d281b..bf6d46e6757b2 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -102,7 +102,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
-- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`50000`)
+- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
@@ -169,6 +169,7 @@ Missing
 MultiIndex
 ^^^^^^^^^^
 - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`)
+- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48505`)
 - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
 -
 
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 31f93c29b29e9..54bdedac55801 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -523,7 +523,7 @@ def test_union_nan_got_duplicated():
 
 @pytest.mark.parametrize("val", [4, 1])
 def test_union_keep_ea_dtype(any_numeric_ea_dtype, val):
-    # GH#48498
+    # GH#48505
 
     arr1 = Series([val, 2], dtype=any_numeric_ea_dtype)
     arr2 = Series([2, 1], dtype=any_numeric_ea_dtype)

From 794d257b37b2471c53e5a7d8e5a8147715679d47 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 11 Sep 2022 21:40:22 +0200
Subject: [PATCH 5/6] Fix docstring

---
 pandas/_libs/lib.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5abee4fd0cc3f..91afd203ce415 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -319,7 +319,6 @@ def fast_unique_multiple(ndarray left, ndarray right):
     ----------
     left : np.ndarray
         Left array that is used as base.
-
     right : np.ndarray
         right array that is checked for values that are not in left.
 

From 731583e1b85cf271f79c7dc564fadb53afda1100 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 12 Sep 2022 19:59:42 +0200
Subject: [PATCH 6/6] Refactor

---
 pandas/_libs/lib.pyx                      | 10 +++++-----
 pandas/tests/indexes/multi/test_setops.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 91afd203ce415..f1473392147f9 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -310,7 +310,7 @@ def item_from_zerodim(val: object) -> object:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple(ndarray left, ndarray right):
+def fast_unique_multiple(ndarray left, ndarray right) -> list:
     """
     Generate a list indices we have to add to the left to get the union
     of both arrays.
@@ -321,28 +321,28 @@ def fast_unique_multiple(ndarray left, ndarray right):
         Left array that is used as base.
     right : np.ndarray
         right array that is checked for values that are not in left.
+        right can not have duplicates.
 
     Returns
     -------
     list of indices that we have to add to the left array.
     """
     cdef:
-        Py_ssize_t i, j, n
+        Py_ssize_t j, n
         list indices = []
-        dict table = {}
+        set table = set()
         object val, stub = 0
 
     n = len(left)
     for j in range(n):
         val = left[j]
         if val not in table:
-            table[val] = stub
+            table.add(val)
 
     n = len(right)
     for j in range(n):
         val = right[j]
         if val not in table:
-            table[val] = stub
             indices.append(j)
 
     return indices
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 54bdedac55801..84e750763b96b 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -558,7 +558,7 @@ def test_union_duplicates(index, request):
         # GH#47294 - union uses lib.fast_zip, converting data to Python integers
         # and loses type information. Result is then unsigned only when values are
         # sufficiently large to require unsigned dtype. This happens only if other
-        # has dups orone of both have missing values
+        # has dups or one of both have missing values
         expected = expected.set_levels(
             [expected.levels[0].astype(int), expected.levels[1]]
         )