Skip to content

Commit

Permalink
BUG: reimplement MultiIndex.remove_unused_levels
Browse files Browse the repository at this point in the history
* Add a large random test case for remove_unused_levels that the previous
implementation failed

* Fix #16556, a performance issue with the previous implementation

* Always return at least a view instead of the original index
  • Loading branch information
rhendric committed Jun 1, 2017
1 parent ee8346d commit 8a9fe43
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 20 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Performance Improvements
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
- Improved performance of groupby with categorical groupers (:issue:`16413`)
- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)

.. _whatsnew_0202.bug_fixes:

Expand Down Expand Up @@ -62,6 +63,7 @@ Indexing
^^^^^^^^

- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
- Bug in ``MultiIndex.remove_unused_levels()`` that would not return a ``MultiIndex`` equal to the original (:issue:`16556`)


I/O
Expand Down
34 changes: 15 additions & 19 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,42 +1290,38 @@ def remove_unused_levels(self):
new_levels = []
new_labels = []

changed = np.ones(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
changed = False
for lev, lab in zip(self.levels, self.labels):

uniques = algos.unique(lab)

# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = False
continue

# set difference, then reverse sort
diff = Index(np.arange(len(lev))).difference(uniques)
unused = diff.sort_values(ascending=False)
changed = True

# labels get mapped from uniques to 0:len(uniques)
label_mapping = np.zeros(len(lev))
label_mapping[uniques] = np.arange(len(uniques))
lab = label_mapping[lab]

# new levels are simple
lev = lev.take(uniques)

# new labels, we remove the unused
# by decrementing the labels for that value
# prob a better way
for u in unused:

lab = np.where(lab > u, lab - 1, lab)

new_levels.append(lev)
new_labels.append(lab)

# nothing changed
if not changed.any():
return self
result = self._shallow_copy()

return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
verify_integrity=False)
if changed:
result._reset_identity()
result._set_levels(new_levels, validate=False)
result._set_labels(new_labels, validate=False)

return result

@property
def nlevels(self):
Expand Down
29 changes: 28 additions & 1 deletion pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2489,7 +2489,34 @@ def test_reconstruct_remove_unused(self):
# idempotent
result2 = result.remove_unused_levels()
tm.assert_index_equal(result2, expected)
assert result2 is result
assert result2.is_(result)

def test_remove_unused_levels_large(self):
# GH16556

def check(first_type=None, second_type=None):
size = 1 << 16
first = np.random.randint(0, 1 << 13, size)
if first_type is not None:
first = first.astype(first_type)
second = np.random.randint(0, 1 << 10, size)
if second_type is not None:
second = second.astype(second_type)
third = np.random.rand(size)
df = DataFrame(dict(first=first, second=second, third=third))
df = df.groupby(['first', 'second']).sum()
df = df[df.third < 0.1]

result = df.index.remove_unused_levels()
assert len(result.levels[0]) < len(df.index.levels[0])
assert len(result.levels[1]) < len(df.index.levels[1])
assert result.equals(df.index)

expected = df.reset_index().set_index(['first', 'second']).index
assert result.equals(expected)

check()
check('datetime64[D]', 'str')

def test_isin(self):
values = [('foo', 2), ('bar', 3), ('quux', 4)]
Expand Down

0 comments on commit 8a9fe43

Please sign in to comment.