ENH/POC: ExtensionIndex for arbitrary EAs #37869

Closed · wants to merge 16 commits

Changes from all commits
3 changes: 3 additions & 0 deletions pandas/core/arrays/base.py
@@ -1135,6 +1135,9 @@ def view(self, dtype: Optional[Dtype] = None) -> ArrayLike:
# - The only case that *must* be implemented is with dtype=None,
# giving a view with the same dtype as self.
if dtype is not None:
if dtype is np.ndarray:
# passed in Index.values
return np.asarray(self)
raise NotImplementedError(dtype)
return self[:]

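
The only behavioural change here is the `dtype is np.ndarray` branch, which simply materializes the array for callers such as `Index.values`. A minimal sketch, assuming a nullable `Int64` array that still relies on this base-class `view`:

```python
import numpy as np
import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")

# New special case: callers like Index.values can pass dtype=np.ndarray
# and get the materialized numpy array (object dtype, so it can hold pd.NA).
view = arr.view(np.ndarray)
assert isinstance(view, np.ndarray)

# Any other explicit dtype still falls through to NotImplementedError.
try:
    arr.view("int64")
except NotImplementedError:
    pass
```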
26 changes: 19 additions & 7 deletions pandas/core/indexes/base.py
@@ -345,7 +345,10 @@ def __new__(
stacklevel=2,
)

from pandas.core.arrays import PandasArray
from pandas.core.arrays import (
PandasArray,
StringArray,
)
from pandas.core.indexes.range import RangeIndex

name = maybe_extract_name(name, data, cls)
@@ -357,7 +360,7 @@ def __new__(
validate_tz_from_dtype(dtype, tz)
dtype = tz_to_dtype(tz)

if isinstance(data, PandasArray):
if isinstance(data, PandasArray) and not isinstance(data, StringArray):
# ensure users don't accidentally put a PandasArray in an index.
data = data.to_numpy()
if isinstance(dtype, PandasDtype):
@@ -383,11 +386,13 @@ def __new__(
if klass is not Index:
return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)

from pandas.core.indexes.extension import ExtensionIndex

ea_cls = dtype.construct_array_type()
data = ea_cls._from_sequence(data, dtype=dtype, copy=copy)
data = np.asarray(data, dtype=object)
disallow_kwargs(kwargs)
return Index._simple_new(data, name=name)
data = extract_array(data, extract_numpy=True)
return ExtensionIndex._simple_new(data, name=name)

elif is_ea_or_datetimelike_dtype(data_dtype):
klass = cls._dtype_to_subclass(data_dtype)
@@ -397,9 +402,15 @@ def __new__(
return result.astype(dtype, copy=False)
return result

data = np.array(data, dtype=object, copy=copy)
disallow_kwargs(kwargs)
return Index._simple_new(data, name=name)
if data_dtype == object:
data = np.array(data, dtype=object, copy=copy)
return Index._simple_new(data, name=name)

from pandas.core.indexes.extension import ExtensionIndex

data = extract_array(data, extract_numpy=True)
return ExtensionIndex._simple_new(data, name=name) # TODO: copy?

# index-like
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
@@ -568,7 +579,8 @@ def _simple_new(cls: Type[_IndexT], values, name: Hashable = None) -> _IndexT:

Must be careful not to recurse.
"""
assert isinstance(values, np.ndarray), type(values)
if cls.__name__ != "ExtensionIndex":
assert isinstance(values, np.ndarray), type(values)

result = object.__new__(cls)
result._data = values
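
Taken together, the `__new__` changes and the relaxed assert in `_simple_new` mean that an EA (or an explicit EA dtype) handed to `Index` is wrapped in an `ExtensionIndex` rather than coerced to object. A sketch of the behaviour this branch aims for, mirroring the tests further down (`ExtensionIndex` is internal, so the import path is an implementation detail):

```python
import pandas as pd
from pandas.core.indexes.extension import ExtensionIndex

arr = pd.array([1, 2, 3], dtype="Int64")

# Previously this coerced to an object-dtype Index; on this branch the EA
# is wrapped as-is and the nullable dtype is preserved.
idx = pd.Index(arr)
assert isinstance(idx, ExtensionIndex)
assert idx.dtype == "Int64"

# Passing an EA dtype explicitly routes through the same path via
# construct_array_type()._from_sequence.
idx2 = pd.Index([1, 2, None], dtype="Int64")
assert idx2.dtype == "Int64"
```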
20 changes: 20 additions & 0 deletions pandas/core/indexes/extension.py
@@ -9,6 +9,7 @@

import numpy as np

from pandas._libs import index as libindex
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import (
@@ -257,6 +258,25 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:

# ---------------------------------------------------------------------

@property
def _na_value(self):
return self.dtype.na_value

@property # TODO: cache_readonly?
def _engine_type(self):
# TODO: can we avoid re-calling if get_engine_target is expensive?
dtype = self._get_engine_target().dtype
return {
np.int8: libindex.Int8Engine,
np.int16: libindex.Int16Engine,
np.int32: libindex.Int32Engine,
np.int64: libindex.Int64Engine,
np.object_: libindex.ObjectEngine,
# TODO: missing floats, uints
}[dtype.type]

# ---------------------------------------------------------------------

def _get_engine_target(self) -> np.ndarray:
return np.asarray(self._data)

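
The engine is chosen from the dtype of the engine target (the ndarray returned by `_get_engine_target`), not from the EA dtype itself. A standalone sketch of that lookup; `engine_for` is an illustrative name only, and float/uint targets are still missing, matching the TODO above:

```python
import numpy as np
from pandas._libs import index as libindex

_ENGINES = {
    np.int8: libindex.Int8Engine,
    np.int16: libindex.Int16Engine,
    np.int32: libindex.Int32Engine,
    np.int64: libindex.Int64Engine,
    np.object_: libindex.ObjectEngine,
}

def engine_for(target: np.ndarray):
    # A KeyError here corresponds to the "missing floats, uints" TODO.
    return _ENGINES[target.dtype.type]

assert engine_for(np.arange(3, dtype=np.int64)) is libindex.Int64Engine
assert engine_for(np.array(["a", "b"], dtype=object)) is libindex.ObjectEngine
```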
22 changes: 17 additions & 5 deletions pandas/tests/arithmetic/test_numeric.py
@@ -1371,15 +1371,27 @@ def test_integer_array_add_list_like(
left = container + box_1d_array(data)
right = box_1d_array(data) + container

if Series == box_pandas_1d_array:
if box_pandas_1d_array is Series:
assert_function = tm.assert_series_equal
expected = Series(expected_data, dtype="Int64")
elif Series == box_1d_array:

elif box_1d_array is Series:
assert_function = tm.assert_series_equal
expected = Series(expected_data, dtype="object")
elif Index in (box_pandas_1d_array, box_1d_array):

if box_pandas_1d_array is Index:
expected = Series(expected_data, dtype="Int64")
else:
expected = Series(expected_data, dtype="object")

elif box_pandas_1d_array is Index:
assert_function = tm.assert_index_equal
expected = Int64Index(expected_data)
expected = Index(array(expected_data))
assert expected.dtype == "Int64"

elif box_1d_array is Index:
assert_function = tm.assert_index_equal
expected = Index(expected_data)

else:
assert_function = tm.assert_numpy_array_equal
expected = np.array(expected_data, dtype="object")
4 changes: 3 additions & 1 deletion pandas/tests/arrays/categorical/test_constructors.py
@@ -727,7 +727,9 @@ def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
result = Categorical(arr)
expected = Categorical(Series([pd.NA, pd.NA], dtype="object"))
idx = Index(arr)
assert idx.dtype == arr.dtype
expected = Categorical([np.nan, np.nan], categories=idx[:0])
tm.assert_categorical_equal(result, expected)

def test_from_sequence_copy(self):
5 changes: 3 additions & 2 deletions pandas/tests/arrays/integer/test_dtypes.py
@@ -71,8 +71,9 @@ def test_construct_index(all_data, dropna):
else:
other = all_data

result = pd.Index(pd.array(other, dtype=all_data.dtype))
expected = pd.Index(other, dtype=object)
arr = pd.array(other, dtype=all_data.dtype)
result = pd.Index(arr)
expected = pd.core.indexes.extension.ExtensionIndex._simple_new(arr)

tm.assert_index_equal(result, expected)

17 changes: 16 additions & 1 deletion pandas/tests/extension/json/test_json.py
@@ -312,14 +312,29 @@ def test_groupby_extension_apply(self):
we'll be able to dispatch unique.
"""

@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
"as_index",
[
pytest.param(
True,
marks=pytest.mark.xfail(
reason="Best guess: lack of hashability breaks ExtensionIndex"
),
),
False,
],
)
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super().test_groupby_extension_agg(as_index, data_for_grouping)

@pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
def test_groupby_agg_extension(self, data_for_grouping):
super().test_groupby_agg_extension(data_for_grouping)

@pytest.mark.xfail(reason="Best guess: lack of hashability breaks ExtensionIndex")
def test_groupby_extension_no_sort(self, data_for_grouping):
super().test_groupby_extension_no_sort(data_for_grouping)


class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests):
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
12 changes: 11 additions & 1 deletion pandas/tests/extension/test_string.py
@@ -191,4 +191,14 @@ class TestPrinting(base.BasePrintingTests):


class TestGroupBy(base.BaseGroupbyTests):
pass
@pytest.fixture(
params=[
StringDtype,
pytest.param(
ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="2.0.0")
),
]
)
def dtype(self, request):
# GH#37869 we need pyarrow 2.0+ for some of these tests
return request.param
14 changes: 8 additions & 6 deletions pandas/tests/groupby/test_function.py
@@ -13,10 +13,12 @@
MultiIndex,
Series,
Timestamp,
array as pd_array,
date_range,
isna,
)
import pandas._testing as tm
from pandas.core.indexes.extension import ExtensionIndex
import pandas.core.nanops as nanops
from pandas.util import _test_decorators as td

@@ -122,10 +124,6 @@ def test_intercept_builtin_sum():
tm.assert_series_equal(result2, expected)


# @pytest.mark.parametrize("f", [max, min, sum])
# def test_builtins_apply(f):


@pytest.mark.parametrize("f", [max, min, sum])
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key
def test_builtins_apply(keys, f):
@@ -1106,7 +1104,9 @@ def test_apply_to_nullable_integer_returns_float(values, function):
# https://github.com/pandas-dev/pandas/issues/32219
output = 0.5 if function == "var" else 1.5
arr = np.array([output] * 3, dtype=float)
idx = Index([1, 2, 3], dtype=object, name="a")
idx = Index(pd_array([1, 2, 3]), name="a")
assert isinstance(idx, ExtensionIndex)
assert idx.dtype == "Int64"
expected = DataFrame({"b": arr}, index=idx).astype("Float64")

groups = DataFrame(values, dtype="Int64").groupby("a")
@@ -1126,7 +1126,9 @@ def test_groupby_sum_below_mincount_nullable_integer():
# https://github.com/pandas-dev/pandas/issues/32861
df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
grouped = df.groupby("a")
idx = Index([0, 1, 2], dtype=object, name="a")
idx = Index(pd_array([0, 1, 2]), name="a")
assert isinstance(idx, ExtensionIndex)
assert idx.dtype == "Int64"

result = grouped["b"].sum(min_count=2)
expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")