From 9852c6f955b7144cd234aea05518926e40bb9e88 Mon Sep 17 00:00:00 2001 From: Ryan Hendrickson Date: Thu, 1 Jun 2017 13:35:11 -0400 Subject: [PATCH] BUG: reimplement MultiIndex.remove_unused_levels * Add a large random test case for remove_unused_levels that failed the previous implementation * Fix #16556, a performance issue with the previous implementation * Always return at least a view instead of the original index --- doc/source/whatsnew/v0.20.2.txt | 2 ++ pandas/core/indexes/multi.py | 34 +++++++++++++----------------- pandas/tests/indexes/test_multi.py | 29 ++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 379249b6e55d67..fae80812797a7c 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -32,6 +32,7 @@ Performance Improvements - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) - Improved performance of groupby with categorical groupers (:issue:`16413`) +- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`) .. _whatsnew_0202.bug_fixes: @@ -61,6 +62,7 @@ Indexing - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) - Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`) +- Bug in ``MultiIndex.remove_unused_levels()`` (:issue:`16556`) I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 981a6a696a6185..f30da5b05f8ae0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1290,8 +1290,8 @@ def remove_unused_levels(self): new_levels = [] new_labels = [] - changed = np.ones(self.nlevels, dtype=bool) - for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): + changed = False + for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) @@ -1299,33 +1299,29 @@ def remove_unused_levels(self): if len(uniques) == len(lev): new_levels.append(lev) new_labels.append(lab) - changed[i] = False continue - # set difference, then reverse sort - diff = Index(np.arange(len(lev))).difference(uniques) - unused = diff.sort_values(ascending=False) + changed = True + + # labels get mapped from uniques to 0:len(uniques) + label_mapping = np.zeros(len(lev)) + label_mapping[uniques] = np.arange(len(uniques)) + lab = label_mapping[lab] # new levels are simple lev = lev.take(uniques) - # new labels, we remove the unsued - # by decrementing the labels for that value - # prob a better way - for u in unused: - - lab = np.where(lab > u, lab - 1, lab) - new_levels.append(lev) new_labels.append(lab) - # nothing changed - if not changed.any(): - return self + result = self._shallow_copy() - return MultiIndex(new_levels, new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_labels(new_labels, validate=False) + + return result @property def nlevels(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 388a49d25cb820..242a9d63eac63f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2515,7 +2515,34 @@ def test_reconstruct_remove_unused(self): # idempotent result2 = result.remove_unused_levels() tm.assert_index_equal(result2, expected) - assert result2 is result + assert result2.is_(result) + + @pytest.mark.parametrize('first_type,second_type', [ + ('int64', 'int64'), + ('datetime64[D]', 'str')]) + def test_remove_unused_levels_large(self, first_type, second_type): + # GH16556 + + # because tests should be deterministic (and this test in particular + # checks that levels are removed, which is not the case for every + # random input): + rng = np.random.RandomState(4) # seed is arbitrary value that works + + size = 1 << 16 + df = DataFrame(dict( + first=rng.randint(0, 1 << 13, size).astype(first_type), + second=rng.randint(0, 1 << 10, size).astype(second_type), + third=rng.rand(size))) + df = df.groupby(['first', 'second']).sum() + df = df[df.third < 0.1] + + result = df.index.remove_unused_levels() + assert len(result.levels[0]) < len(df.index.levels[0]) + assert len(result.levels[1]) < len(df.index.levels[1]) + assert result.equals(df.index) + + expected = df.reset_index().set_index(['first', 'second']).index + tm.assert_index_equal(result, expected) def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)]