From fb6d13ca4cee690fb49a579735c9e9cbe983797f Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 May 2020 18:07:19 +0200
Subject: [PATCH 1/4] ENH: allow storing ExtensionArrays in the Index

---
 pandas/conftest.py                     |  1 +
 pandas/core/indexes/base.py            | 58 ++++++++++++++++++++------
 pandas/core/indexing.py                |  1 -
 pandas/tests/base/test_value_counts.py |  4 ++
 pandas/tests/groupby/test_function.py  |  2 +-
 pandas/tests/indexes/common.py         |  5 ++-
 pandas/tests/indexes/test_base.py      |  4 ++
 pandas/tests/indexing/test_indexing.py |  1 +
 8 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 1e7f1b769c856..d4da7a82952a4 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -404,6 +404,7 @@ def _create_mi_with_dt64tz_level():
     "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
     "multi": _create_multiindex(),
     "repeats": Index([0, 0, 1, 1, 2, 2]),
+    "nullable_int": Index(np.arange(100), dtype="Int64"),
 }
 
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b8a9827b5effd..2780670e89d4e 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -346,11 +346,13 @@ def __new__(
                 ea_cls = dtype.construct_array_type()
                 data = ea_cls._from_sequence(data, dtype=dtype, copy=False)
             else:
-                data = np.asarray(data, dtype=object)
+                # TODO clean-up with extract_array ?
+                if isinstance(data, Index):
+                    data = data._data
+                elif isinstance(data, ABCSeries):
+                    data = data.array
 
-            # coerce to the object dtype
-            data = data.astype(object)
-            return Index(data, dtype=object, copy=copy, name=name, **kwargs)
+            return cls._simple_new(data, name)
 
         # index-like
         elif isinstance(data, (np.ndarray, Index, ABCSeries)):
@@ -458,7 +460,7 @@ def _simple_new(cls, values, name: Label = None):
 
         Must be careful not to recurse.
         """
-        assert isinstance(values, np.ndarray), type(values)
+        assert isinstance(values, (np.ndarray, ExtensionArray)), type(values)
 
         result = object.__new__(cls)
         result._data = values
@@ -2126,6 +2128,8 @@ def fillna(self, value=None, downcast=None):
         Series.fillna : Fill NaN Values of a Series.
         """
         self._assert_can_do_op(value)
+        if is_extension_array_dtype(self.dtype):
+            return self._shallow_copy(self._values.fillna(value))
         if self.hasnans:
             result = self.putmask(self._isnan, value)
             if downcast is None:
@@ -2525,7 +2529,9 @@ def _union(self, other, sort):
                 # worth making this faster? a very unusual case
                 value_set = set(lvals)
                 result.extend([x for x in rvals if x not in value_set])
-                result = Index(result)._values  # do type inference here
+                result = Index(
+                    result, dtype=self.dtype
+                )._values  # do type inference here
         else:
             # find indexes of things in "other" that are not in "self"
             if self.is_unique:
@@ -3797,7 +3803,7 @@ def values(self) -> np.ndarray:
         Index.array : Reference to the underlying data.
         Index.to_numpy : A NumPy array representing the underlying data.
         """
-        return self._data.view(np.ndarray)
+        return self._data  # .view(np.ndarray)
 
     @cache_readonly
     @doc(IndexOpsMixin.array)
@@ -3839,7 +3845,10 @@ def _get_engine_target(self) -> np.ndarray:
         """
         Get the ndarray that we can pass to the IndexEngine constructor.
         """
-        return self._values
+        if isinstance(self._values, np.ndarray):
+            return self._values
+        else:
+            return np.asarray(self._values)
 
     @doc(IndexOpsMixin.memory_usage)
     def memory_usage(self, deep: bool = False) -> int:
@@ -4232,10 +4241,18 @@ def equals(self, other: Any) -> bool:
             # d-level MultiIndex can equal d-tuple Index
             return other.equals(self)
 
-        if is_extension_array_dtype(other.dtype):
+        if is_extension_array_dtype(other.dtype) and type(other) != Index:
             # All EA-backed Index subclasses override equals
             return other.equals(self)
 
+        if is_extension_array_dtype(self.dtype):
+            if is_object_dtype(other.dtype):
+                try:
+                    other = other.astype(self.dtype)
+                except Exception:
+                    return False
+            return self._values.equals(other._values)
+
         return array_equivalent(self._values, other._values)
 
     def identical(self, other) -> bool:
@@ -4759,6 +4776,15 @@ def map(self, mapper, na_action=None):
 
         attributes = self._get_attributes_dict()
 
+        if is_extension_array_dtype(self.dtype):
+            # try to coerce back to original dtype
+            # TODO this should use a strict version
+            try:
+                # TODO use existing helper method for this
+                new_values = self._values._from_sequence(new_values, dtype=self.dtype)
+            except Exception:
+                pass
+
         # we can return a MultiIndex
         if new_values.size and isinstance(new_values[0], tuple):
             if isinstance(self, MultiIndex):
@@ -5193,7 +5219,10 @@ def delete(self, loc):
         >>> idx.delete([0, 2])
         Index(['b'], dtype='object')
         """
-        return self._shallow_copy(np.delete(self._data, loc))
+        # this is currently overridden by EA-based Index subclasses
+        keep = np.ones(len(self), dtype=bool)
+        keep[loc] = False
+        return self._shallow_copy(self._data[keep])
 
     def insert(self, loc: int, item):
         """
@@ -5212,9 +5241,14 @@ def insert(self, loc: int, item):
         """
         # Note: this method is overridden by all ExtensionIndex subclasses,
         #  so self is never backed by an EA.
-        arr = np.asarray(self)
         item = self._coerce_scalar_to_index(item)._values
-        idx = np.concatenate((arr[:loc], item, arr[loc:]))
+
+        if is_extension_array_dtype(self.dtype):
+            arr = self._values
+            idx = arr._concat_same_type([arr[:loc], item, arr[loc:]])
+        else:
+            arr = np.asarray(self)
+            idx = np.concatenate((arr[:loc], item, arr[loc:]))
         return Index(idx, name=self.name)
 
     def drop(self, labels, errors: str_t = "raise"):
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index b857a59195695..538b5a80de797 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1766,7 +1766,6 @@ def isetter(loc, v):
                 ):
                     self.obj[item_labels[indexer[info_axis]]] = value
                     return
-
                 indexer = maybe_convert_ix(*indexer)
 
             if isinstance(value, (ABCSeries, dict)):
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index de04c30432e6f..1ef98b83642cd 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -34,6 +34,8 @@ def test_value_counts(index_or_series_obj):
     expected.index = expected.index.astype(obj.dtype)
     if isinstance(obj, pd.MultiIndex):
         expected.index = pd.Index(expected.index)
+    if isinstance(obj.dtype, pd.Int64Dtype):
+        expected = expected.astype("Int64")
 
     # TODO: Order of entries with the same count is inconsistent on CI (gh-32449)
     if obj.duplicated().any():
@@ -69,6 +71,8 @@ def test_value_counts_null(null_obj, index_or_series_obj):
     counter = collections.Counter(obj.dropna())
     expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
     expected.index = expected.index.astype(obj.dtype)
+    if isinstance(obj.dtype, pd.Int64Dtype):
+        expected = expected.astype("Int64")
 
     result = obj.value_counts()
     if obj.duplicated().any():
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 68c8b86250e06..07eefcbff9e6b 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1000,7 +1000,7 @@ def test_apply_to_nullable_integer_returns_float(values, function):
     # https://github.com/pandas-dev/pandas/issues/32219
     output = 0.5 if function == "var" else 1.5
     arr = np.array([output] * 3, dtype=float)
-    idx = pd.Index([1, 2, 3], dtype=object, name="a")
+    idx = pd.Index([1, 2, 3], dtype="Int64", name="a")
     expected = pd.DataFrame({"b": arr}, index=idx)
 
     groups = pd.DataFrame(values, dtype="Int64").groupby("a")
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 0f9509c372bdf..a2158988ed484 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -6,7 +6,7 @@
 
 from pandas._libs import iNaT
 
-from pandas.core.dtypes.common import is_datetime64tz_dtype
+from pandas.core.dtypes.common import is_datetime64tz_dtype, is_extension_array_dtype
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -278,6 +278,9 @@ def test_ensure_copied_data(self, indices):
         elif isinstance(indices, IntervalIndex):
             # checked in test_interval.py
             pass
+        elif is_extension_array_dtype(indices.dtype):
+            # TODO can we check this generally?
+            pass
         else:
             result = index_type(indices.values, copy=False, **init_kwargs)
             tm.assert_numpy_array_equal(
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 9f235dcdbb295..8141ef03fdb71 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -916,6 +916,10 @@ def test_map_dictlike(self, indices, mapper):
         else:
             expected = Index(np.arange(len(indices), 0, -1))
 
+        if isinstance(indices.dtype, pd.Int64Dtype):
+            # map tries to preserve the nullable dtype
+            expected = expected.astype("Int64")
+
         result = indices.map(mapper(expected, indices))
         tm.assert_index_equal(result, expected)
 
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 51a7aa9bb586b..3deb009b06b3f 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -81,6 +81,7 @@ def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id):
                 "Index data must be 1-dimensional",
                 "positional indexers are out-of-bounds",
                 "Indexing a MultiIndex with a multidimensional key is not implemented",
+                "values must be a 1D array",
             ]
         )
 

From cd917b16fc6d2a318e3ba6a94eb7197279adefdd Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 May 2020 22:18:04 +0200
Subject: [PATCH 2/4] use extract_array

---
 pandas/core/indexes/base.py                | 9 +++------
 pandas/tests/arrays/integer/test_dtypes.py | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2780670e89d4e..4c392d227fcc8 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -71,6 +71,7 @@
 from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype
 from pandas.core.base import IndexOpsMixin, PandasObject
 import pandas.core.common as com
+from pandas.core.construction import extract_array
 from pandas.core.indexers import deprecate_ndim_indexing
 from pandas.core.indexes.frozen import FrozenList
 import pandas.core.missing as missing
@@ -345,13 +346,9 @@ def __new__(
                 # coerce to the provided dtype
                 ea_cls = dtype.construct_array_type()
                 data = ea_cls._from_sequence(data, dtype=dtype, copy=False)
-            else:
-                # TODO clean-up with extract_array ?
-                if isinstance(data, Index):
-                    data = data._data
-                elif isinstance(data, ABCSeries):
-                    data = data.array
 
+            # extract array from Series/Index + ensure we don't have PandasArray
+            data = extract_array(data, extract_numpy=True)
             return cls._simple_new(data, name)
 
         # index-like
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index a02501e2dcbf2..1e486b646164b 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -70,7 +70,7 @@ def test_construct_index(all_data, dropna):
         other = all_data
 
     result = pd.Index(integer_array(other, dtype=all_data.dtype))
-    expected = pd.Index(other, dtype=object)
+    expected = pd.Index(other, dtype=all_data.dtype)
 
     tm.assert_index_equal(result, expected)
 

From 099e73a81f1a636a276f156ac5c6bfdab4f7015e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 23 Apr 2021 01:20:20 +0200
Subject: [PATCH 3/4] fixup merge

---
 pandas/core/algorithms.py   | 5 ++++-
 pandas/core/indexes/base.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6f906cf8879ff..0be0a65726c82 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1512,7 +1512,10 @@ def take(
         )
     else:
         # NumPy style
-        result = arr.take(indices, axis=axis)
+        if arr.ndim == 1:
+            result = arr.take(indices)
+        else:
+            result = arr.take(indices, axis=axis)
     return result
 
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 4f784b5445283..7a92f5b6caaf1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -435,6 +435,7 @@ def __new__(
                     return result.astype(dtype, copy=False)
                 return result
 
+            data = extract_array(data)
             disallow_kwargs(kwargs)
             return Index._simple_new(data, name=name)
 
@@ -2570,7 +2571,7 @@ def fillna(self, value=None, downcast=None):
         Series.fillna : Fill NaN Values of a Series.
         """
         value = self._require_scalar(value)
-        if is_extension_array_dtype(self.dtype):
+        if is_extension_array_dtype(self.dtype) and type(self) is Index:
             return self._shallow_copy(self._values.fillna(value))
 
         if self.hasnans:
@@ -4404,6 +4405,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
         Cast the ndarray returned from one of the libjoin.foo_indexer functions
         back to type(self)._data.
         """
+        if is_extension_array_dtype(self.dtype):
+            # TODO use helper method / strict version
+            return self._values._from_sequence(result, dtype=self.dtype)
         return result
 
     @doc(IndexOpsMixin._memory_usage)

From 7a9699c004e3130f0fa8b1142e7b9842a0fb5dea Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 23 Apr 2021 08:32:00 +0200
Subject: [PATCH 4/4] clean-up

---
 pandas/core/indexes/base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 7a92f5b6caaf1..0051cc1ab764b 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5962,11 +5962,10 @@ def insert(self, loc: int, item) -> Index:
         # Use Index constructor to ensure we get tuples cast correctly.
         item = Index([item], dtype=self.dtype)._values
 
+        arr = self._values
         if is_extension_array_dtype(self.dtype):
-            arr = self._values
             idx = arr._concat_same_type([arr[:loc], item, arr[loc:]])
         else:
-            arr = np.asarray(self)
             idx = np.concatenate((arr[:loc], item, arr[loc:]))
 
         return Index(idx, name=self.name)