diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 6dc730cae37f7c..a9f35fb4827926 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -24,6 +24,7 @@ Other Enhancements
 
 - Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`)
 - :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`)
+- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`)
 - :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`)
 - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 121bf6a66dee5e..058a94e00cb8f8 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3757,8 +3757,32 @@ def drop(self, labels, errors='raise'):
             indexer = indexer[~mask]
         return self.delete(indexer)
 
-    @Appender(base._shared_docs['unique'] % _index_doc_kwargs)
-    def unique(self):
+    _index_shared_docs['index_unique'] = (
+        """
+        Return unique values in the index. Uniques are returned in order
+        of appearance, this does NOT sort.
+
+        Parameters
+        ----------
+        level : int or str, optional, default None
+            Only return values from specified level (for MultiIndex)
+
+            .. versionadded:: 0.22.0
+
+        Returns
+        -------
+        Index without duplicates
+
+        See Also
+        --------
+        unique
+        Series.unique
+        """)
+
+    @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+    def unique(self, level=None):
+        if level is not None:
+            self._validate_index_level(level)
         result = super(Index, self).unique()
         return self._shallow_copy(result)
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 3812ed96b6c36b..778907abab681a 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -378,8 +378,11 @@ def is_monotonic_increasing(self):
     def is_monotonic_decreasing(self):
         return Index(self.codes).is_monotonic_decreasing
 
-    @Appender(base._shared_docs['unique'] % _index_doc_kwargs)
-    def unique(self):
+    @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+    def unique(self, level=None):
+        # reuse Index's level check so all flat indexes fail the same way
+        if level is not None:
+            self._validate_index_level(level)
         result = base.IndexOpsMixin.unique(self)
         # CategoricalIndex._shallow_copy uses keeps original categories
         # and ordered if not otherwise specified
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 50e1a9d2fc68bd..ec506d7a671181 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -908,7 +908,7 @@ def _try_mi(k):
 
             raise InvalidIndexError(key)
 
-    def _get_level_values(self, level):
+    def _get_level_values(self, level, unique=False):
         """
         Return vector of label values for requested level,
         equal to the length of the index
@@ -918,17 +918,21 @@ def _get_level_values(self, level):
         Parameters
         ----------
         level : int level
+        unique : bool, default False
+            if True, drop duplicated values
 
         Returns
         -------
         values : ndarray
         """
 
-        unique = self.levels[level]
+        values = self.levels[level]
         labels = self.labels[level]
-        filled = algos.take_1d(unique._values, labels,
-                               fill_value=unique._na_value)
-        values = unique._shallow_copy(filled)
+        if unique:
+            labels = algos.unique(labels)
+        filled = algos.take_1d(values._values, labels,
+                               fill_value=values._na_value)
+        values = values._shallow_copy(filled)
         return values
 
     def get_level_values(self, level):
@@ -967,6 +971,15 @@ def get_level_values(self, level):
         values = self._get_level_values(level)
         return values
 
+    @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+    def unique(self, level=None):
+
+        if level is None:
+            return super(MultiIndex, self).unique()
+        else:
+            level = self._get_level_number(level)
+            return self._get_level_values(level=level, unique=True)
+
     def format(self, space=2, sparsify=None, adjoin=True, names=False,
                na_rep=None, formatter=None):
         if len(self) == 0:
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 456e5a9bd6439d..81360bc0c13f9c 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -329,6 +329,27 @@ def test_duplicates(self, indices):
         assert not idx.is_unique
         assert idx.has_duplicates
 
+    def test_unique(self, indices):
+        # don't test a MultiIndex here (as it's tested separately)
+        # don't test a CategoricalIndex because categories change (GH 18291)
+        if isinstance(indices, (MultiIndex, CategoricalIndex)):
+            return
+
+        # GH 17896
+        expected = indices.drop_duplicates()
+        for level in 0, indices.name, None:
+            result = indices.unique(level=level)
+            tm.assert_index_equal(result, expected)
+
+        for level in 3, 'wrong':
+            pytest.raises((IndexError, KeyError), indices.unique, level=level)
+
+    def test_unique_na(self):
+        idx = pd.Index([2, np.nan, 2, 1], name='my_index')
+        expected = pd.Index([2, np.nan, 1], name='my_index')
+        result = idx.unique()
+        tm.assert_index_equal(result, expected)
+
     def test_get_unique_index(self, indices):
         # MultiIndex tested separately
         if not len(indices) or isinstance(indices, MultiIndex):
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index dbd18de16cebde..9d81cfef04e87f 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -963,19 +963,21 @@ def test_get_level_values(self):
         exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
         tm.assert_index_equal(index.get_level_values(1), exp)
 
-    def test_get_level_values_na(self):
+    @pytest.mark.xfail(reason='GH 17924 (returns Int64Index with float data)')
+    def test_get_level_values_int_with_na(self):
         arrays = [['a', 'b', 'b'], [1, np.nan, 2]]
         index = pd.MultiIndex.from_arrays(arrays)
-        values = index.get_level_values(1)
-        expected = np.array([1, np.nan, 2])
-        tm.assert_numpy_array_equal(values.values.astype(float), expected)
+        result = index.get_level_values(1)
+        expected = Index([1, np.nan, 2])
+        tm.assert_index_equal(result, expected)
 
         arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]]
         index = pd.MultiIndex.from_arrays(arrays)
-        values = index.get_level_values(1)
-        expected = np.array([np.nan, np.nan, 2])
-        tm.assert_numpy_array_equal(values.values.astype(float), expected)
+        result = index.get_level_values(1)
+        expected = Index([np.nan, np.nan, 2])
+        tm.assert_index_equal(result, expected)
 
+    def test_get_level_values_na(self):
         arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
         index = pd.MultiIndex.from_arrays(arrays)
         result = index.get_level_values(0)
@@ -990,7 +992,7 @@ def test_get_level_values_na(self):
         index = pd.MultiIndex.from_arrays(arrays)
         values = index.get_level_values(1)
         expected = pd.DatetimeIndex([0, 1, pd.NaT])
-        tm.assert_numpy_array_equal(values.values, expected.values)
+        tm.assert_index_equal(values, expected)
 
         arrays = [[], []]
         index = pd.MultiIndex.from_arrays(arrays)
@@ -2277,6 +2279,20 @@ def test_unique(self):
         exp = pd.MultiIndex.from_arrays([['a'], ['a']])
         tm.assert_index_equal(res, exp)
 
+    @pytest.mark.parametrize('level', [0, 'first', 1, 'second'])
+    def test_unique_level(self, level):
+        # GH #17896 - with level= argument
+        result = self.index.unique(level=level)
+        expected = self.index.get_level_values(level).unique()
+        tm.assert_index_equal(result, expected)
+
+        # With already unique level
+        mi = pd.MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
+                                       names=['first', 'second'])
+        result = mi.unique(level=level)
+        expected = mi.get_level_values(level)
+        tm.assert_index_equal(result, expected)
+
     def test_unique_datetimelike(self):
         idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
                                  '2015-01-01', 'NaT', 'NaT'])