From fb6d13ca4cee690fb49a579735c9e9cbe983797f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 May 2020 18:07:19 +0200 Subject: [PATCH 1/4] ENH: allow storing ExtensionArrays in the Index --- pandas/conftest.py | 1 + pandas/core/indexes/base.py | 58 ++++++++++++++++++++------ pandas/core/indexing.py | 1 - pandas/tests/base/test_value_counts.py | 4 ++ pandas/tests/groupby/test_function.py | 2 +- pandas/tests/indexes/common.py | 5 ++- pandas/tests/indexes/test_base.py | 4 ++ pandas/tests/indexing/test_indexing.py | 1 + 8 files changed, 61 insertions(+), 15 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 1e7f1b769c856..d4da7a82952a4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -404,6 +404,7 @@ def _create_mi_with_dt64tz_level(): "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), + "nullable_int": Index(np.arange(100), dtype="Int64"), } diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b8a9827b5effd..2780670e89d4e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -346,11 +346,13 @@ def __new__( ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=False) else: - data = np.asarray(data, dtype=object) + # TODO clean-up with extract_array ? + if isinstance(data, Index): + data = data._data + elif isinstance(data, ABCSeries): + data = data.array - # coerce to the object dtype - data = data.astype(object) - return Index(data, dtype=object, copy=copy, name=name, **kwargs) + return cls._simple_new(data, name) # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -458,7 +460,7 @@ def _simple_new(cls, values, name: Label = None): Must be careful not to recurse. """ - assert isinstance(values, np.ndarray), type(values) + assert isinstance(values, (np.ndarray, ExtensionArray)), type(values) result = object.__new__(cls) result._data = values @@ -2126,6 +2128,8 @@ def fillna(self, value=None, downcast=None): Series.fillna : Fill NaN Values of a Series. """ self._assert_can_do_op(value) + if is_extension_array_dtype(self.dtype): + return self._shallow_copy(self._values.fillna(value)) if self.hasnans: result = self.putmask(self._isnan, value) if downcast is None: @@ -2525,7 +2529,9 @@ def _union(self, other, sort): # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here + result = Index( + result, dtype=self.dtype + )._values # do type inference here else: # find indexes of things in "other" that are not in "self" if self.is_unique: @@ -3797,7 +3803,7 @@ def values(self) -> np.ndarray: Index.array : Reference to the underlying data. Index.to_numpy : A NumPy array representing the underlying data. """ - return self._data.view(np.ndarray) + return self._data # .view(np.ndarray) @cache_readonly @doc(IndexOpsMixin.array) @@ -3839,7 +3845,10 @@ def _get_engine_target(self) -> np.ndarray: """ Get the ndarray that we can pass to the IndexEngine constructor. """ - return self._values + if isinstance(self._values, np.ndarray): + return self._values + else: + return np.asarray(self._values) @doc(IndexOpsMixin.memory_usage) def memory_usage(self, deep: bool = False) -> int: @@ -4232,10 +4241,18 @@ def equals(self, other: Any) -> bool: # d-level MultiIndex can equal d-tuple Index return other.equals(self) - if is_extension_array_dtype(other.dtype): + if is_extension_array_dtype(other.dtype) and type(other) != Index: # All EA-backed Index subclasses override equals return other.equals(self) + if is_extension_array_dtype(self.dtype): + if is_object_dtype(other.dtype): + try: + other = other.astype(self.dtype) + except Exception: + return False + return self._values.equals(other._values) + return array_equivalent(self._values, other._values) def identical(self, other) -> bool: @@ -4759,6 +4776,15 @@ def map(self, mapper, na_action=None): attributes = self._get_attributes_dict() + if is_extension_array_dtype(self.dtype): + # try to coerce back to original dtype + # TODO this should use a strict version + try: + # TODO use existing helper method for this + new_values = self._values._from_sequence(new_values, dtype=self.dtype) + except Exception: + pass + # we can return a MultiIndex if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): @@ -5193,7 +5219,10 @@ def delete(self, loc): >>> idx.delete([0, 2]) Index(['b'], dtype='object') """ - return self._shallow_copy(np.delete(self._data, loc)) + # this is currently overriden by EA-based Index subclasses + keep = np.ones(len(self), dtype=bool) + keep[loc] = False + return self._shallow_copy(self._data[keep]) def insert(self, loc: int, item): """ @@ -5212,9 +5241,14 @@ def insert(self, loc: int, item): """ # Note: this method is overridden by all ExtensionIndex subclasses, # so self is never backed by an EA. - arr = np.asarray(self) item = self._coerce_scalar_to_index(item)._values - idx = np.concatenate((arr[:loc], item, arr[loc:])) + + if is_extension_array_dtype(self.dtype): + arr = self._values + idx = arr._concat_same_type([arr[:loc], item, arr[loc:]]) + else: + arr = np.asarray(self) + idx = np.concatenate((arr[:loc], item, arr[loc:])) return Index(idx, name=self.name) def drop(self, labels, errors: str_t = "raise"): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b857a59195695..538b5a80de797 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1766,7 +1766,6 @@ def isetter(loc, v): ): self.obj[item_labels[indexer[info_axis]]] = value return - indexer = maybe_convert_ix(*indexer) if isinstance(value, (ABCSeries, dict)): diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index de04c30432e6f..1ef98b83642cd 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -34,6 +34,8 @@ def test_value_counts(index_or_series_obj): expected.index = expected.index.astype(obj.dtype) if isinstance(obj, pd.MultiIndex): expected.index = pd.Index(expected.index) + if isinstance(obj.dtype, pd.Int64Dtype): + expected = expected.astype("Int64") # TODO: Order of entries with the same count is inconsistent on CI (gh-32449) if obj.duplicated().any(): @@ -69,6 +71,8 @@ def test_value_counts_null(null_obj, index_or_series_obj): counter = collections.Counter(obj.dropna()) expected = pd.Series(dict(counter.most_common()), dtype=np.int64) expected.index = expected.index.astype(obj.dtype) + if isinstance(obj.dtype, pd.Int64Dtype): + expected = expected.astype("Int64") result = obj.value_counts() if obj.duplicated().any(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 68c8b86250e06..07eefcbff9e6b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1000,7 +1000,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) - idx = pd.Index([1, 2, 3], dtype=object, name="a") + idx = pd.Index([1, 2, 3], dtype="Int64", name="a") expected = pd.DataFrame({"b": arr}, index=idx) groups = pd.DataFrame(values, dtype="Int64").groupby("a") diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 0f9509c372bdf..a2158988ed484 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -6,7 +6,7 @@ from pandas._libs import iNaT -from pandas.core.dtypes.common import is_datetime64tz_dtype +from pandas.core.dtypes.common import is_datetime64tz_dtype, is_extension_array_dtype from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -278,6 +278,9 @@ def test_ensure_copied_data(self, indices): elif isinstance(indices, IntervalIndex): # checked in test_interval.py pass + elif is_extension_array_dtype(indices.dtype): + # TODO can we check this generally? + pass else: result = index_type(indices.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9f235dcdbb295..8141ef03fdb71 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -916,6 +916,10 @@ def test_map_dictlike(self, indices, mapper): else: expected = Index(np.arange(len(indices), 0, -1)) + if isinstance(indices.dtype, pd.Int64Dtype): + # map tries to preserve the nullable dtype + expected = expected.astype("Int64") + result = indices.map(mapper(expected, indices)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 51a7aa9bb586b..3deb009b06b3f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -81,6 +81,7 @@ def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id): "Index data must be 1-dimensional", "positional indexers are out-of-bounds", "Indexing a MultiIndex with a multidimensional key is not implemented", + "values must be a 1D array", ] ) From cd917b16fc6d2a318e3ba6a94eb7197279adefdd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 May 2020 22:18:04 +0200 Subject: [PATCH 2/4] use extract_array --- pandas/core/indexes/base.py | 9 +++------ pandas/tests/arrays/integer/test_dtypes.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2780670e89d4e..4c392d227fcc8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -71,6 +71,7 @@ from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing @@ -345,13 +346,9 @@ def __new__( # coerce to the provided dtype ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=False) - else: - # TODO clean-up with extract_array ? - if isinstance(data, Index): - data = data._data - elif isinstance(data, ABCSeries): - data = data.array + # extract array from Series/Index + ensure have don't have PandasArray + data = extract_array(data, extract_numpy=True) return cls._simple_new(data, name) # index-like diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index a02501e2dcbf2..1e486b646164b 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -70,7 +70,7 @@ def test_construct_index(all_data, dropna): other = all_data result = pd.Index(integer_array(other, dtype=all_data.dtype)) - expected = pd.Index(other, dtype=object) + expected = pd.Index(other, dtype=all_data.dtype) tm.assert_index_equal(result, expected) From 099e73a81f1a636a276f156ac5c6bfdab4f7015e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 01:20:20 +0200 Subject: [PATCH 3/4] fixup merge --- pandas/core/algorithms.py | 5 ++++- pandas/core/indexes/base.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6f906cf8879ff..0be0a65726c82 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1512,7 +1512,10 @@ def take( ) else: # NumPy style - result = arr.take(indices, axis=axis) + if arr.ndim == 1: + result = arr.take(indices) + else: + result = arr.take(indices, axis=axis) return result diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4f784b5445283..7a92f5b6caaf1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -435,6 +435,7 @@ def __new__( return result.astype(dtype, copy=False) return result + data = extract_array(data) disallow_kwargs(kwargs) return Index._simple_new(data, name=name) @@ -2570,7 +2571,7 @@ def fillna(self, value=None, downcast=None): Series.fillna : Fill NaN Values of a Series. """ value = self._require_scalar(value) - if is_extension_array_dtype(self.dtype): + if is_extension_array_dtype(self.dtype) and type(self) is Index: return self._shallow_copy(self._values.fillna(value)) if self.hasnans: @@ -4404,6 +4405,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: Cast the ndarray returned from one of the libjoin.foo_indexer functions back to type(self)._data. """ + if is_extension_array_dtype(self.dtype): + # TODO use helper method / strict version + return self._values._from_sequence(result, dtype=self.dtype) return result @doc(IndexOpsMixin._memory_usage) From 7a9699c004e3130f0fa8b1142e7b9842a0fb5dea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 08:32:00 +0200 Subject: [PATCH 4/4] clean-up --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a92f5b6caaf1..0051cc1ab764b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5962,11 +5962,10 @@ def insert(self, loc: int, item) -> Index: # Use Index constructor to ensure we get tuples cast correctly. item = Index([item], dtype=self.dtype)._values + arr = self._values if is_extension_array_dtype(self.dtype): - arr = self._values idx = arr._concat_same_type([arr[:loc], item, arr[loc:]]) else: - arr = np.asarray(self) idx = np.concatenate((arr[:loc], item, arr[loc:])) return Index(idx, name=self.name)