diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index da75e2c49ae10..bdddc99ca094e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -94,6 +94,25 @@ of the Series or columns of a DataFrame will also have string dtype. We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. + +.. _whatsnew_1000.enhancements.unique: + +Changes to the ``unique``-method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The method :meth:`pandas.unique` now supports the keyword ``return_inverse``, which, if passed, +makes the output a tuple where the second component is an ndarray that contains the +mapping from the indices of the values to their location in the returned unique values. + +.. ipython:: python + + idx = pd.Index([1, 0, 0, 1]) + uniques, inverse = pd.unique(idx, return_inverse=True) + uniques + inverse + reconstruct = pd.Index(uniques[inverse]) + reconstruct.equals(idx) + .. _whatsnew_1000.enhancements.other: Other enhancements diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e5ab0d182aff..8392f9c4cc77d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -306,7 +306,7 @@ def match(to_match, values, na_sentinel=-1): return result -def unique(values): +def unique(values, return_inverse=False): """ Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. @@ -316,6 +316,13 @@ def unique(values): Parameters ---------- values : 1d array-like + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple of two np.ndarray. The second component + contains the mapping between the indices of the elements in the + input ``values`` and their locations in the unique values. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -384,19 +391,47 @@ def unique(values): >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')]) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ + from pandas import Index values = _ensure_arraylike(values) if is_extension_array_dtype(values): # Dispatch to extension dtype's unique. + if return_inverse: + # as long as return_inverse is not part of the EA.unique contract, + # test if this works + try: + # make sure that we're not calling from an Index/Series + # container, as these do not support return_inverse yet + ea_val = getattr(values, "array", values) + result, inverse = ea_val.unique(return_inverse=return_inverse) + + if is_categorical_dtype(values) and isinstance(values, Index): + # pd.unique(CategoricalIndex) returns Index not Categorical + result = Index(result) + return result, inverse + except TypeError: + msg = ( + "The Extension Array class for type {dtype} does not " + "yet support the unique-method with " + '"return_inverse=True".'.format(dtype=type(values)) + ) + raise NotImplementedError(msg) return values.unique() original = values htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + if return_inverse: + uniques, inverse = table.unique(values, return_inverse=True) + else: + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, original.dtype, original) + + if return_inverse: + return uniques, inverse return uniques diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bab1127e6e539..a7f9ba80a7ceb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2274,7 +2274,7 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) - def unique(self): + def unique(self, return_inverse=False): """ Return the ``Categorical`` which 
``categories`` and ``codes`` are unique. Unused categories are NOT returned. @@ -2284,9 +2284,21 @@ def unique(self): - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple ``(uniques, inverse)``. The second component + contains the mapping between the indices of the elements in the + calling Categorical and their locations in the unique values. + + .. versionadded:: 1.0.0 + Returns ------- - unique values : ``Categorical`` + uniques : ``Categorical`` + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Categorical``. Examples -------- @@ -2318,7 +2330,10 @@ def unique(self): """ # unlike np.unique, unique1d does not sort - unique_codes = unique1d(self.codes) + if return_inverse: + unique_codes, inverse = unique1d(self.codes, return_inverse=True) + else: + unique_codes = unique1d(self.codes, return_inverse=False) cat = self.copy() # keep nan in codes @@ -2328,7 +2343,11 @@ def unique(self): take_codes = unique_codes[unique_codes != -1] if self.ordered: take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + result = cat.set_categories(cat.categories.take(take_codes)) + + if return_inverse: + return result, inverse + return result def _values_for_factorize(self): codes = self.codes.astype("int64") diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a5706d8baa614..1d76b99a7170d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -31,6 +31,22 @@ from pandas.util.testing import assert_almost_equal +def assert_series_or_index_or_array_or_categorical_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + elif isinstance(left, Index): + tm.assert_index_equal(left, right) + elif isinstance(left, np.ndarray): + 
tm.assert_numpy_array_equal(left, right) + elif isinstance(left, Categorical): + tm.assert_categorical_equal(left, right) + elif isinstance(left, DatetimeArray): + tm.assert_extension_array_equal(left, right) + else: + # will fail + assert isinstance(left, (Series, Index, np.ndarray, Categorical, DatetimeArray)) + + class TestMatch: def test_ints(self): values = np.array([0, 2, 1]) @@ -357,17 +373,22 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): class TestUnique: - def test_ints(self): - arr = np.random.randint(0, 100, size=50) + def test_unique_all_dtypes(self, any_numpy_dtype): + dtype = any_numpy_dtype + arr = np.random.randint(0, 100, size=50).astype(dtype) result = algos.unique(arr) assert isinstance(result, np.ndarray) - def test_objects(self): - arr = np.random.randint(0, 100, size=50).astype("O") + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() - result = algos.unique(arr) - assert isinstance(result, np.ndarray) + result_uniques, result_inverse = algos.unique(arr, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = result_uniques[result_inverse] + tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False) def test_object_refcount_bug(self): lst = ["A", "B", "C", "D", "E"] @@ -420,24 +441,26 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype - def test_timedelta64_dtype_array_returned(self): + @pytest.mark.parametrize("box", [Index, Series, np.array]) + def test_timedelta64_dtype_array_returned(self, box): # GH 9431 expected = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) - result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + obj = box(td_index) - s = 
Series(td_index) - result = algos.unique(s) + result = algos.unique(obj) tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype - arr = s.values - result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + + result_uniques, result_inverse = algos.unique(obj, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_uint64_overflow(self): s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) @@ -450,102 +473,87 @@ def test_nan_in_object_array(self): expected = np.array(["a", np.nan, "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - def test_categorical(self): - - # we are expecting to return in the order - # of appearance - expected = Categorical(list("bac"), categories=list("bac")) + result_uniques, result_inverse = pd.unique( + duplicated_items, return_inverse=True + ) + expected_inverse = np.array([0, 1, 2, 2], dtype="int64") + tm.assert_numpy_array_equal(result_inverse, expected_inverse) - # we are expecting to return in the order - # of the categories - expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True) + @pytest.mark.parametrize("ordered", [True, False]) + @pytest.mark.parametrize( + "box", [lambda x: x, Series, Index], ids=["Categorical", "Series", "Index"] + ) + @pytest.mark.parametrize( + "method", + [lambda x, **kwargs: x.unique(**kwargs), pd.unique], + ids=["classmethod", "toplevel"], + ) + def test_categorical(self, method, box, ordered): - # GH 15939 - c = Categorical(list("baabc")) - result = c.unique() - tm.assert_categorical_equal(result, expected) + categories = list("abc") if ordered else list("bac") + expected = 
Categorical(list("bac"), categories=categories, ordered=ordered) - result = algos.unique(c) - tm.assert_categorical_equal(result, expected) + # Index.unique always returns Index + # pd.unique(Index) stays Index (only) for Categorical + expected = box(expected) if box == Index else expected - c = Categorical(list("baabc"), ordered=True) - result = c.unique() - tm.assert_categorical_equal(result, expected_o) + # GH 15939 + c = box(Categorical(list("baabc"), categories=categories, ordered=ordered)) + result = method(c) - result = algos.unique(c) - tm.assert_categorical_equal(result, expected_o) + assert_series_or_index_or_array_or_categorical_equal(result, expected) - # Series of categorical dtype - s = Series(Categorical(list("baabc")), name="foo") - result = s.unique() - tm.assert_categorical_equal(result, expected) + if method == pd.unique: + # [Series/Index].unique do not yet support return_inverse=True - result = pd.unique(s) - tm.assert_categorical_equal(result, expected) + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + result_uniques, result_inverse = method(c, return_inverse=True) - # CI -> return CI - ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) - expected = CategoricalIndex(expected) - result = ci.unique() - tm.assert_index_equal(result, expected) + assert_series_or_index_or_array_or_categorical_equal( + result_uniques, expected_uniques + ) - result = pd.unique(ci) - tm.assert_index_equal(result, expected) + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, c) - def test_datetime64tz_aware(self): + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize( + "method", + [lambda x, **kwargs: x.unique(**kwargs), pd.unique], + ids=["classmethod", "toplevel"], + ) + def test_datetime64tz_aware(self, method, box): # GH 15939 - result = Series( - Index( - 
[ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ).unique() - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) - ) - tm.assert_extension_array_equal(result, expected) + ts = Timestamp("20160101", tz="US/Eastern") + obj = box([ts, ts]) - result = Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ).unique() - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) - tm.assert_index_equal(result, expected) - - result = pd.unique( - Series( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) + if box == Series: + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) ) - ) - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01", tz="US/Eastern")]) - ) - tm.assert_extension_array_equal(result, expected) - - result = pd.unique( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] + else: # Index + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None ) - ) - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) - tm.assert_index_equal(result, expected) + result = method(obj) + assert_series_or_index_or_array_or_categorical_equal(result, expected) + + # TODO: add support for return_inverse to DatetimeArray/DatetimeIndex, + # as well as [Series/Index].unique + + # # reuse result as expected outcome of return_inverse case + # expected_uniques = result.copy() + # result_uniques, result_inverse = method(obj, return_inverse=True) + # + # assert_series_or_index_or_array_or_categorical_equal( + # result_uniques, expected_uniques) + # + # # reconstruction can only work if inverse is correct + # reconstr = 
box(result_uniques[result_inverse]) + # assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_order_of_appearance(self): # 9346 @@ -557,31 +565,10 @@ def test_order_of_appearance(self): result = pd.unique(Series([2] + [1] * 5)) tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) - result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")])) - expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]") - tm.assert_numpy_array_equal(result, expected) - - result = pd.unique( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) - tm.assert_index_equal(result, expected) - result = pd.unique(list("aabc")) expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Series(Categorical(list("aabc")))) - expected = Categorical(list("abc")) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize( "arg ,expected", [