PERF: improved performance of small multiindexes (pandas-dev#16324)
jreback authored May 11, 2017
1 parent 1c0b632 commit 94ef7b6
Showing 6 changed files with 69 additions and 22 deletions.
20 changes: 16 additions & 4 deletions asv_bench/benchmarks/indexing.py
@@ -193,9 +193,15 @@ def setup(self):
np.arange(1000)], names=['one', 'two'])

import string
self.mistring = MultiIndex.from_product(
[np.arange(1000),
np.arange(20), list(string.ascii_letters)],

self.mi_large = MultiIndex.from_product(
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
names=['one', 'two', 'three'])
self.mi_med = MultiIndex.from_product(
[np.arange(1000), np.arange(10), list('A')],
names=['one', 'two', 'three'])
self.mi_small = MultiIndex.from_product(
[np.arange(100), list('A'), list('A')],
names=['one', 'two', 'three'])

def time_series_xs_mi_ix(self):
@@ -218,8 +224,14 @@ def time_multiindex_get_indexer(self):
(0, 16), (0, 17), (0, 18),
(0, 19)], dtype=object))

def time_multiindex_large_get_loc(self):
self.mi_large.get_loc((999, 19, 'Z'))

def time_multiindex_med_get_loc(self):
self.mi_med.get_loc((999, 9, 'A'))

def time_multiindex_string_get_loc(self):
self.mistring.get_loc((999, 19, 'Z'))
self.mi_small.get_loc((99, 'A', 'A'))

def time_is_monotonic(self):
self.miint.is_monotonic
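The same measurement can be reproduced outside asv with a plain timeit loop; the sketch below mirrors the small benchmark case above, but the sizes and repeat count are illustrative only and timings will vary by machine.

import timeit
import numpy as np
from pandas import MultiIndex

# Mirror the "small" case from the benchmark setup above.
mi_small = MultiIndex.from_product(
    [np.arange(100), list('A'), list('A')],
    names=['one', 'two', 'three'])

# Time repeated scalar lookups, the operation the benchmark exercises.
elapsed = timeit.timeit(lambda: mi_small.get_loc((99, 'A', 'A')), number=10000)
print('10000 get_loc calls on a small MultiIndex: %.3fs' % elapsed)
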
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.2.txt
@@ -27,7 +27,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance regression fix when indexing with a list-like (:issue:`16285`)

- Performance regression fix for small MultiIndexes (:issue:`16319`)

.. _whatsnew_0202.bug_fixes:

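For context, the user-facing operation affected by the regression noted above is an ordinary label lookup against a small MultiIndex; a minimal reproduction sketch (the data here is made up, not taken from the issue):

import numpy as np
import pandas as pd

# A small MultiIndexed Series; scalar lookups like these had regressed.
idx = pd.MultiIndex.from_product([range(100), list('AB')], names=['one', 'two'])
s = pd.Series(np.arange(len(idx)), index=idx)

print(s.loc[(99, 'B')])        # label-based scalar lookup
print(idx.get_loc((99, 'B')))  # the underlying engine call (returns 199)
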
33 changes: 32 additions & 1 deletion pandas/_libs/index.pyx
@@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
return tz is UTC or isinstance(tz, _du_utc)


cdef class MultiIndexEngine(IndexEngine):
cdef class MultiIndexObjectEngine(ObjectEngine):
"""
provide the same interface as the MultiIndexEngine
but use the IndexEngine for computation
This provides good performance with smaller MI's
"""
def get_indexer(self, values):
# convert a MI to an ndarray
if hasattr(values, 'values'):
values = values.values
return super(MultiIndexObjectEngine, self).get_indexer(values)

cpdef get_loc(self, object val):

# convert a MI to an ndarray
if hasattr(val, 'values'):
val = val.values
return super(MultiIndexObjectEngine, self).get_loc(val)


cdef class MultiIndexHashEngine(ObjectEngine):
"""
Use a hashing based MultiIndex impl
but use the IndexEngine for computation
This provides good performance with larger MI's
"""

def _call_monotonic(self, object mi):
# defer these back to the mi itself
@@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
except TypeError:
raise KeyError(val)

def get_indexer(self, values):
self._ensure_mapping_populated()
return self.mapping.lookup(values)

cdef _make_hash_table(self, n):
return _hash.MultiIndexHashTable(n)

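The two engines trade setup cost against lookup cost: the object engine keeps the index's tuples as plain Python objects, while the hash engine maps every tuple to a uint64 hash first. A rough pure-Python analogue of that trade-off (only an illustration, not the Cython implementation, and it ignores the collision checking the real engine performs):

from pandas import MultiIndex
from pandas.core.util.hashing import hash_tuples

mi = MultiIndex.from_product([range(5), list('AB')])

# "Object" strategy: map each tuple directly to its position.
# Cheap to set up, which is what helps small indexes.
object_map = {key: i for i, key in enumerate(mi.values)}

# "Hash" strategy: hash every tuple to a uint64, then map hashes to
# positions; the up-front hashing pass pays off once the index is large.
hashes = hash_tuples(list(mi.values))
hash_map = {h: i for i, h in enumerate(hashes)}

key = (4, 'B')
print(object_map[key])                  # lookup by the tuple itself
print(hash_map[hash_tuples([key])[0]])  # lookup by the tuple's hash
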
6 changes: 3 additions & 3 deletions pandas/core/dtypes/dtypes.py
@@ -88,12 +88,12 @@ def is_dtype(cls, dtype):
"""
if hasattr(dtype, 'dtype'):
dtype = dtype.dtype
if isinstance(dtype, cls):
return True
elif isinstance(dtype, np.dtype):
if isinstance(dtype, np.dtype):
return False
elif dtype is None:
return False
elif isinstance(dtype, cls):
return True
try:
return cls.construct_from_string(dtype) is not None
except:
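The reordering here makes the cheap early exits explicit: a plain NumPy dtype or None can never be an extension dtype, so those cases now return False before the isinstance check and the construct_from_string fallback are tried. A few illustrative calls, assuming CategoricalDtype as the concrete class (any dtype class using this is_dtype behaves the same way):

import numpy as np
from pandas.core.dtypes.dtypes import CategoricalDtype

print(CategoricalDtype.is_dtype(np.dtype('int64')))   # False via the np.dtype branch
print(CategoricalDtype.is_dtype(None))                # False via the None branch
print(CategoricalDtype.is_dtype(CategoricalDtype()))  # True via the isinstance branch
print(CategoricalDtype.is_dtype('category'))          # True via construct_from_string
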
12 changes: 10 additions & 2 deletions pandas/core/indexes/multi.py
@@ -75,7 +75,6 @@ class MultiIndex(Index):
_levels = FrozenList()
_labels = FrozenList()
_comparables = ['names']
_engine_type = libindex.MultiIndexEngine
rename = Index.set_names

def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -629,7 +628,16 @@ def _get_level_number(self, level):

@cache_readonly
def _engine(self):
return self._engine_type(lambda: self, len(self))

# choose our engine based on our size
# the hashing based MultiIndexHashEngine for larger
# sizes, and the MultiIndexObjectEngine for smaller
# xref: https://github.com/pandas-dev/pandas/pull/16324
l = len(self)
if l > 10000:
return libindex.MultiIndexHashEngine(lambda: self, l)

return libindex.MultiIndexObjectEngine(lambda: self.values, l)

@property
def values(self):
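With this in place, the engine backing a given MultiIndex depends only on its length, with the cutoff at 10,000 entries as coded above. A quick way to see the dispatch, poking at the private _engine attribute purely for illustration:

import numpy as np
from pandas import MultiIndex

small = MultiIndex.from_product([range(10), list('AB')])            # 20 entries
large = MultiIndex.from_product([np.arange(1000), np.arange(20)])   # 20,000 entries

print(type(small._engine).__name__)  # MultiIndexObjectEngine on this branch
print(type(large._engine).__name__)  # MultiIndexHashEngine once len > 10000
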
18 changes: 7 additions & 11 deletions pandas/core/util/hashing.py
@@ -5,16 +5,13 @@

import numpy as np
from pandas._libs import hashing
from pandas._libs.lib import is_bool_array
from pandas.core.dtypes.generic import (
ABCMultiIndex,
ABCIndexClass,
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_numeric_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_list_like)
is_categorical_dtype, is_list_like)

# 16 byte long hashing key
_default_hash_key = '0123456789123456'
@@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
-------
ndarray of hashed values array
"""

is_tuple = False
if isinstance(vals, tuple):
vals = [vals]
@@ -231,29 +227,29 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

if not hasattr(vals, 'dtype'):
raise TypeError("must pass a ndarray-like")
dtype = vals.dtype

if hash_key is None:
hash_key = _default_hash_key

# For categoricals, we hash the categories, then remap the codes to the
# hash values. (This check is above the complex check so that we don't ask
# numpy if categorical is a subdtype of complex, as it will choke.)
if is_categorical_dtype(vals.dtype):
if is_categorical_dtype(dtype):
return _hash_categorical(vals, encoding, hash_key)

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
elif np.issubdtype(dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if is_bool_array(vals):
elif isinstance(dtype, np.bool):
vals = vals.astype('u8')
elif (is_datetime64_dtype(vals) or
is_timedelta64_dtype(vals)):
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view('i8').astype('u8', copy=False)
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:
# With repeated values, it's MUCH faster to categorize object dtypes,
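The reworked branches in hash_array key off the array's dtype object instead of separate helper calls, but the return value is still a uint64 ndarray with one hash per element. A short usage sketch touching a few of the dtype branches (values are arbitrary):

import numpy as np
from pandas.core.util.hashing import hash_array

print(hash_array(np.array([1, 2, 3], dtype='int64')))                 # numeric branch
print(hash_array(np.array(['2017-05-11'], dtype='datetime64[ns]')))   # datetime64 branch
print(hash_array(np.array(['a', 'b'], dtype=object)))                 # object fallback (categorized)
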
