Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF/BUG: reimplement MultiIndex.remove_unused_levels #16565

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,12 @@ def setup(self):
[np.arange(100), list('A'), list('A')],
names=['one', 'two', 'three'])

rng = np.random.RandomState(4)
size = 1 << 16
self.mi_unused_levels = pd.MultiIndex.from_arrays([
rng.randint(0, 1 << 13, size),
rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1]

def time_series_xs_mi_ix(self):
self.s.ix[999]

Expand Down Expand Up @@ -248,6 +254,9 @@ def time_multiindex_small_get_loc_warm(self):
def time_is_monotonic(self):
self.miint.is_monotonic

def time_remove_unused_levels(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep!

self.mi_unused_levels.remove_unused_levels()


class IntervalIndexing(object):
goal_time = 0.2
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Performance Improvements
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
- Improved performance of groupby with categorical groupers (:issue:`16413`)
- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)

.. _whatsnew_0202.bug_fixes:

Expand Down Expand Up @@ -61,6 +62,7 @@ Indexing

- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
- Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`)
- Bug in ``MultiIndex.remove_unused_levels()`` that would not return a ``MultiIndex`` equal to the original (:issue:`16556`)

I/O
^^^
Expand Down
34 changes: 15 additions & 19 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,42 +1290,38 @@ def remove_unused_levels(self):
new_levels = []
new_labels = []

changed = np.ones(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
changed = False
for lev, lab in zip(self.levels, self.labels):

uniques = algos.unique(lab)

# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = False
continue

# set difference, then reverse sort
diff = Index(np.arange(len(lev))).difference(uniques)
unused = diff.sort_values(ascending=False)
changed = True

# labels get mapped from uniques to 0:len(uniques)
label_mapping = np.zeros(len(lev))
label_mapping[uniques] = np.arange(len(uniques))
lab = label_mapping[lab]

# new levels are simple
lev = lev.take(uniques)

# new labels, we remove the unused
# by decrementing the labels for that value
# prob a better way
for u in unused:

lab = np.where(lab > u, lab - 1, lab)

new_levels.append(lev)
new_labels.append(lab)

# nothing changed
if not changed.any():
return self
result = self._shallow_copy()

return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
verify_integrity=False)
if changed:
result._reset_identity()
result._set_levels(new_levels, validate=False)
result._set_labels(new_labels, validate=False)

return result

@property
def nlevels(self):
Expand Down
29 changes: 28 additions & 1 deletion pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2515,7 +2515,34 @@ def test_reconstruct_remove_unused(self):
# idempotent
result2 = result.remove_unused_levels()
tm.assert_index_equal(result2, expected)
assert result2 is result
assert result2.is_(result)

@pytest.mark.parametrize('first_type,second_type', [
('int64', 'int64'),
('datetime64[D]', 'str')])
def test_remove_unused_levels_large(self, first_type, second_type):
# GH16556

# because tests should be deterministic (and this test in particular
# checks that levels are removed, which is not the case for every
# random input):
rng = np.random.RandomState(4) # seed is arbitrary value that works

size = 1 << 16
df = DataFrame(dict(
first=rng.randint(0, 1 << 13, size).astype(first_type),
second=rng.randint(0, 1 << 10, size).astype(second_type),
third=rng.rand(size)))
df = df.groupby(['first', 'second']).sum()
df = df[df.third < 0.1]

result = df.index.remove_unused_levels()
assert len(result.levels[0]) < len(df.index.levels[0])
assert len(result.levels[1]) < len(df.index.levels[1])
assert result.equals(df.index)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test with different kinds of levels, e.g. add another example with dates & strings.


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also check, for each example, that this IS equal to .reset_index(...).set_index(...).index.

expected = df.reset_index().set_index(['first', 'second']).index
tm.assert_index_equal(result, expected)

def test_isin(self):
values = [('foo', 2), ('bar', 3), ('quux', 4)]
Expand Down