From d19f073bf7055c7d91ce472d98c785b2a49e9452 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Dec 2018 20:58:20 +0100 Subject: [PATCH] API: add return_inverse to pd.unique --- doc/source/whatsnew/v0.24.0.rst | 18 +++ pandas/core/algorithms.py | 29 ++++- pandas/core/arrays/categorical.py | 28 ++++- pandas/tests/test_algos.py | 189 +++++++++++++++--------------- 4 files changed, 166 insertions(+), 98 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 14bb7e55173704..75dc8b6ca8a8b5 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -320,6 +320,24 @@ Example: See the :ref:`advanced docs on renaming` for more details. +.. _whatsnew_0240.enhancements.unique: + +Changes to the ``unique``-method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The method :meth:`pandas.unique` now supports the keyword ``return_inverse``, which, if passed, +makes the output a tuple where the second component is an ndarray that contains the +mapping from the indices of the values to their location in the returned unique values. + +.. ipython:: python + + idx = pd.Index([1, 0, 0, 1]) + uniques, inverse = pd.unique(idx, return_inverse=True) + uniques + inverse + reconstruct = pd.Index(uniques[inverse]) + reconstruct.equals(idx) + .. _whatsnew_0240.enhancements.other: Other Enhancements diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a4368ee8ea98a..b662201e45ca67 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1): return result -def unique(values): +def unique(values, return_inverse=False): """ Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. @@ -344,18 +344,41 @@ def unique(values): pandas.Index.unique pandas.Series.unique """ + from pandas import Index values = _ensure_arraylike(values) if is_extension_array_dtype(values): # Dispatch to extension dtype's unique. 
+ if return_inverse: + # as long as return_inverse is not part of the EA.unique contract, + # test if this works + try: + # make sure that we're not calling from an Index/Series + # container, as these do not support return_inverse yet + ea_val = getattr(values, 'array', values) + result, inverse = ea_val.unique(return_inverse=return_inverse) + + if is_categorical_dtype(values) and isinstance(values, Index): + # pd.unique(CategoricalIndex) returns Index not Categorical + result = Index(result) + return result, inverse + except TypeError: + msg = ('The Extension Array class for type {dtype} does not ' + 'yet support the unique-method with ' + '"return_inverse=True".'.format(dtype=type(values))) + raise NotImplementedError(msg) return values.unique() original = values htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + if return_inverse: + uniques, inverse = table.unique(values, return_inverse=True) + else: + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -365,6 +388,8 @@ def unique(values): # TODO: it must return DatetimeArray with tz in pandas 2.0 uniques = uniques.astype(object).values + if return_inverse: + return uniques, inverse return uniques diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 938ca53b5fdce3..f96510dc1f5162 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2249,7 +2249,7 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) - def unique(self): + def unique(self, return_inverse=False): """ Return the ``Categorical`` which ``categories`` and ``codes`` are unique. Unused categories are NOT returned. 
@@ -2259,9 +2259,22 @@ def unique(self): - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + return_inverse : boolean, default False + Whether to return the inverse of the unique values. If True, the + output will be a tuple where the second component is again an + np.ndarray that contains the mapping between the indices of the + elements in the calling Categorical and their locations in the + unique values. See examples for how to reconstruct. + + .. versionadded:: 0.24.0 + Returns ------- - unique values : ``Categorical`` + uniques : ``Categorical`` + inverse : np.ndarray (if `return_inverse=True`) + The inverse from the `uniques` back to the calling ``Categorical``. Examples -------- @@ -2293,7 +2306,10 @@ def unique(self): """ # unlike np.unique, unique1d does not sort - unique_codes = unique1d(self.codes) + if return_inverse: + unique_codes, inverse = unique1d(self.codes, return_inverse=True) + else: + unique_codes = unique1d(self.codes, return_inverse=False) cat = self.copy() # keep nan in codes @@ -2303,7 +2319,11 @@ def unique(self): take_codes = unique_codes[unique_codes != -1] if self.ordered: take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + result = cat.set_categories(cat.categories.take(take_codes)) + + if return_inverse: + return result, inverse + return result def _values_for_factorize(self): codes = self.codes.astype('int64') diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c9d403f6696af1..2afbfdf0db6eb8 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -25,6 +25,20 @@ from pandas.util.testing import assert_almost_equal +def assert_series_or_index_or_array_or_categorical_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + elif isinstance(left, Index): + tm.assert_index_equal(left, right) + elif isinstance(left, np.ndarray): + 
tm.assert_numpy_array_equal(left, right) + elif isinstance(left, Categorical): + tm.assert_categorical_equal(left, right) + else: + # will fail + assert isinstance(left, (Series, Index, np.ndarray, Categorical)) + + class TestMatch(object): def test_ints(self): @@ -321,17 +335,22 @@ def test_parametrized_factorize_na_value(self, data, na_value): class TestUnique(object): - def test_ints(self): - arr = np.random.randint(0, 100, size=50) + def test_unique_inverse(self, any_numpy_dtype): + dtype = any_numpy_dtype + arr = np.random.randint(0, 100, size=50).astype(dtype) result = algos.unique(arr) assert isinstance(result, np.ndarray) - def test_objects(self): - arr = np.random.randint(0, 100, size=50).astype('O') + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() - result = algos.unique(arr) - assert isinstance(result, np.ndarray) + result_uniques, result_inverse = algos.unique(arr, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = result_uniques[result_inverse] + tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False) def test_object_refcount_bug(self): lst = ['A', 'B', 'C', 'D', 'E'] @@ -376,24 +395,26 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype - def test_timedelta64_dtype_array_returned(self): + @pytest.mark.parametrize('box', [Index, Series, np.array]) + def test_timedelta64_dtype_array_returned(self, box): # GH 9431 expected = np.array([31200, 45678, 10000], dtype='m8[ns]') td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) - result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + obj = box(td_index) - s = Series(td_index) - result = algos.unique(s) + result = algos.unique(obj) tm.assert_numpy_array_equal(result, expected) - assert result.dtype 
== expected.dtype - arr = s.values - result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + + result_uniques, result_inverse = algos.unique(obj, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) @@ -406,78 +427,80 @@ def test_nan_in_object_array(self): expected = np.array(['a', np.nan, 'c'], dtype=object) tm.assert_numpy_array_equal(result, expected) - def test_categorical(self): + result_uniques, result_inverse = pd.unique(duplicated_items, + return_inverse=True) + expected_inverse = np.array([0, 1, 2, 2], dtype='int64') + tm.assert_numpy_array_equal(result_inverse, expected_inverse) + + @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize('box', [lambda x: x, Series, Index], + ids=['Categorical', 'Series', 'Index']) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique], + ids=['classmethod', 'toplevel']) + def test_categorical(self, method, box, ordered): - # we are expecting to return in the order - # of appearance - expected = Categorical(list('bac'), categories=list('bac')) + categories = list('abc') if ordered else list('bac') + expected = Categorical(list('bac'), categories=categories, + ordered=ordered) - # we are expecting to return in the order - # of the categories - expected_o = Categorical( - list('bac'), categories=list('abc'), ordered=True) + # Index.unique always returns Index + # pd.unique(Index) stays Index (only) for Categorical + expected = box(expected) if box == Index else expected # GH 15939 - c = Categorical(list('baabc')) - result = 
c.unique() - tm.assert_categorical_equal(result, expected) + c = box(Categorical(list('baabc'), categories=categories, + ordered=ordered)) + result = method(c) - result = algos.unique(c) - tm.assert_categorical_equal(result, expected) + assert_series_or_index_or_array_or_categorical_equal(result, expected) - c = Categorical(list('baabc'), ordered=True) - result = c.unique() - tm.assert_categorical_equal(result, expected_o) + if method == pd.unique: + # [Series/Index].unique do not yet support return_inverse=True - result = algos.unique(c) - tm.assert_categorical_equal(result, expected_o) + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + result_uniques, result_inverse = method(c, return_inverse=True) - # Series of categorical dtype - s = Series(Categorical(list('baabc')), name='foo') - result = s.unique() - tm.assert_categorical_equal(result, expected) + assert_series_or_index_or_array_or_categorical_equal( + result_uniques, expected_uniques) - result = pd.unique(s) - tm.assert_categorical_equal(result, expected) + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, c) - # CI -> return CI - ci = CategoricalIndex(Categorical(list('baabc'), - categories=list('bac'))) - expected = CategoricalIndex(expected) - result = ci.unique() - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique], + ids=['classmethod', 'toplevel']) + def test_datetime64tz_aware(self, method, box): + # GH 15939 - result = pd.unique(ci) - tm.assert_index_equal(result, expected) + ts = Timestamp('20160101', tz='US/Eastern') + obj = box([ts, ts]) - def test_datetime64tz_aware(self): - # GH 15939 + if box == Series: + expected = np.array([Timestamp('2016-01-01 00:00:00-0500', + tz='US/Eastern')], dtype=object) + else: 
# Index + expected = Index([ts]) - result = Series( - Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() - expected = np.array([Timestamp('2016-01-01 00:00:00-0500', - tz='US/Eastern')], dtype=object) - tm.assert_numpy_array_equal(result, expected) + result = method(obj) + assert_series_or_index_or_array_or_categorical_equal(result, expected) - result = Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]).unique() - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) - tm.assert_index_equal(result, expected) - - result = pd.unique( - Series(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]))) - expected = np.array([Timestamp('2016-01-01 00:00:00-0500', - tz='US/Eastern')], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if method == pd.unique: + # [Series/Index].unique do not yet support return_inverse=True + + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + result_uniques, result_inverse = method(obj, return_inverse=True) - result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) - tm.assert_index_equal(result, expected) + assert_series_or_index_or_array_or_categorical_equal( + result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_order_of_appearance(self): # 9346 @@ -491,28 +514,10 @@ def test_order_of_appearance(self): tm.assert_numpy_array_equal(result, np.array([2, 1], dtype='int64')) - result = pd.unique(Series([Timestamp('20160101'), - Timestamp('20160101')])) - expected = 
np.array(['2016-01-01T00:00:00.000000000'], - dtype='datetime64[ns]') - tm.assert_numpy_array_equal(result, expected) - - result = pd.unique(Index( - [Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) - tm.assert_index_equal(result, expected) - result = pd.unique(list('aabc')) expected = np.array(['a', 'b', 'c'], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Series(Categorical(list('aabc')))) - expected = Categorical(list('abc')) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize("arg ,expected", [ (('1', '1', '2'), np.array(['1', '2'], dtype=object)), (('foo',), np.array(['foo'], dtype=object))