Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: speed up CategoricalIndex.get_loc #23235

Merged
merged 2 commits into from
Oct 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,30 @@
import numpy as np

from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
ObjectEngine)
from pandas._libs import index as libindex


def _get_numeric_engines():
    """
    Collect the numeric index engines that ``libindex`` exposes.

    Returns
    -------
    list of tuple
        ``(engine_class, numpy_dtype)`` pairs, one for every name in
        ``engine_names`` that exists as an attribute of ``libindex``.
        Names that ``libindex`` does not define are silently skipped.
    """
    engine_names = [
        ('Int64Engine', np.int64), ('Int32Engine', np.int32),
        ('Int16Engine', np.int16), ('Int8Engine', np.int8),
        ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
        # The spelling must match the class name exactly: a misspelt name
        # fails the hasattr() filter below and is dropped without any error.
        ('UInt16Engine', np.uint16), ('UInt8Engine', np.uint8),
        ('Float64Engine', np.float64), ('Float32Engine', np.float32),
    ]
    return [(getattr(libindex, engine_name), dtype)
            for engine_name, dtype in engine_names
            if hasattr(libindex, engine_name)]


class NumericEngineIndexing(object):

params = [[Int64Engine, UInt64Engine, Float64Engine],
[np.int64, np.uint64, np.float64],
params = [_get_numeric_engines(),
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
]
param_names = ['engine', 'dtype', 'index_type']
param_names = ['engine_and_dtype', 'index_type']

def setup(self, engine, dtype, index_type):
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
N = 10**5
values = list([1] * N + [2] * N + [3] * N)
arr = {
Expand All @@ -26,7 +38,7 @@ def setup(self, engine, dtype, index_type):
# code below avoids populating the mapping etc. while timing.
self.data.get_loc(2)

def time_get_loc(self, engine, dtype, index_type):
def time_get_loc(self, engine_and_dtype, index_type):
self.data.get_loc(2)


Expand All @@ -44,7 +56,7 @@ def setup(self, index_type):
'non_monotonic': np.array(list('abc') * N, dtype=object),
}[index_type]

self.data = ObjectEngine(lambda: arr, len(arr))
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
# code below avoids populating the mapping etc. while timing.
self.data.get_loc('b')

Expand Down
8 changes: 5 additions & 3 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -935,9 +935,11 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
both when indexing by label (using .loc) and position(.iloc).
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
- Slicing Series and DataFrames with a monotonically increasing :class:`CategoricalIndex`
is now very fast and has speed comparable to slicing with an ``Int64Index``.
The speed increase applies both when indexing by label (using ``.loc``) and by position (using ``.iloc``) (:issue:`20395`).
Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. ``ci[1000:2000]``)
shows similar speed improvements (:issue:`21659`).
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`)
Expand Down
24 changes: 22 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ from libc.math cimport fabs, sqrt
import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
NPY_INT64, NPY_UINT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all of these used?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NPY_UINT32, NPY_UINT16, uint32_t and uint16_t are not used. This needs a discussion, see below.

NPY_FLOAT32, NPY_FLOAT64,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
Expand Down Expand Up @@ -359,9 +360,13 @@ ctypedef fused algos_t:
float64_t
float32_t
object
int32_t
int64_t
int32_t
int16_t
int8_t
uint64_t
uint32_t
uint16_t
uint8_t


Expand Down Expand Up @@ -459,7 +464,12 @@ pad_float32 = pad["float32_t"]
pad_object = pad["object"]
pad_int64 = pad["int64_t"]
pad_int32 = pad["int32_t"]
pad_int16 = pad["int16_t"]
pad_int8 = pad["int8_t"]
pad_uint64 = pad["uint64_t"]
pad_uint32 = pad["uint32_t"]
pad_uint16 = pad["uint16_t"]
pad_uint8 = pad["uint8_t"]
pad_bool = pad["uint8_t"]


Expand Down Expand Up @@ -653,7 +663,12 @@ backfill_float32 = backfill["float32_t"]
backfill_object = backfill["object"]
backfill_int64 = backfill["int64_t"]
backfill_int32 = backfill["int32_t"]
backfill_int16 = backfill["int16_t"]
backfill_int8 = backfill["int8_t"]
backfill_uint64 = backfill["uint64_t"]
backfill_uint32 = backfill["uint32_t"]
backfill_uint16 = backfill["uint16_t"]
backfill_uint8 = backfill["uint8_t"]
backfill_bool = backfill["uint8_t"]


Expand Down Expand Up @@ -866,7 +881,12 @@ is_monotonic_float32 = is_monotonic["float32_t"]
is_monotonic_object = is_monotonic["object"]
is_monotonic_int64 = is_monotonic["int64_t"]
is_monotonic_int32 = is_monotonic["int32_t"]
is_monotonic_int16 = is_monotonic["int16_t"]
is_monotonic_int8 = is_monotonic["int8_t"]
is_monotonic_uint64 = is_monotonic["uint64_t"]
is_monotonic_uint32 = is_monotonic["uint32_t"]
is_monotonic_uint16 = is_monotonic["uint16_t"]
is_monotonic_uint8 = is_monotonic["uint8_t"]
is_monotonic_bool = is_monotonic["uint8_t"]


Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint8', 'UINT8', 'uint8'),
('uint16', 'UINT16', 'uint16'),
('uint32', 'UINT32', 'uint32'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import cython

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, float64_t, int32_t,
int64_t, uint8_t, uint64_t, intp_t,
from numpy cimport (ndarray, intp_t,
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
float64_t, float32_t,
int64_t, int32_t, int16_t, int8_t,
uint64_t, uint32_t, uint16_t, uint8_t,
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
# for cimport in cython>=0.27.3
NPY_DATETIME, NPY_TIMEDELTA)
Expand Down
35 changes: 22 additions & 13 deletions pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ctype
dtypes = [('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Object', 'object', 'object')]
# name, dtype, ctype, hashtable_name, hashtable_dtype
dtypes = [('Float64', 'float64', 'float64_t', 'Float64', 'float64'),
('Float32', 'float32', 'float32_t', 'Float64', 'float64'),
('Int64', 'int64', 'int64_t', 'Int64', 'int64'),
('Int32', 'int32', 'int32_t', 'Int64', 'int64'),
('Int16', 'int16', 'int16_t', 'Int64', 'int64'),
('Int8', 'int8', 'int8_t', 'Int64', 'int64'),
('UInt64', 'uint64', 'uint64_t', 'UInt64', 'uint64'),
('UInt32', 'uint32', 'uint32_t', 'UInt64', 'uint64'),
('UInt16', 'uint16', 'uint16_t', 'UInt64', 'uint64'),
('UInt8', 'uint8', 'uint8_t', 'UInt64', 'uint64'),
('Object', 'object', 'object', 'PyObject', 'object'),
]
}}

{{for name, dtype, ctype in dtypes}}
{{for name, dtype, ctype, hashtable_name, hashtable_dtype in dtypes}}


cdef class {{name}}Engine(IndexEngine):
Expand All @@ -34,13 +42,9 @@ cdef class {{name}}Engine(IndexEngine):
other, limit=limit)

cdef _make_hash_table(self, n):
{{if name == 'Object'}}
return _hash.PyObjectHashTable(n)
{{else}}
return _hash.{{name}}HashTable(n)
{{endif}}
return _hash.{{hashtable_name}}HashTable(n)

{{if name != 'Float64' and name != 'Object'}}
{{if name not in {'Float64', 'Float32', 'Object'} }}
cdef _check_type(self, object val):
hash(val)
if util.is_bool_object(val):
Expand All @@ -50,6 +54,11 @@ cdef class {{name}}Engine(IndexEngine):
{{endif}}

{{if name != 'Object'}}
cpdef _call_map_locations(self, values):
# self.mapping is of type {{hashtable_name}}HashTable,
# so convert dtype of values
self.mapping.map_locations(algos.ensure_{{hashtable_dtype}}(values))

cdef _get_index_values(self):
return algos.ensure_{{dtype}}(self.vgetter())

Expand All @@ -60,7 +69,7 @@ cdef class {{name}}Engine(IndexEngine):
ndarray[{{ctype}}] values
int count = 0

{{if name != 'Float64'}}
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,17 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
"""

_typ = 'categoricalindex'
_engine_type = libindex.Int64Engine

@property
def _engine_type(self):
    """
    Engine class matching the dtype of ``self.codes``.

    The scalar type of ``self.codes.dtype`` is looked up in a table that
    maps ``np.int8``/``np.int16``/``np.int32``/``np.int64`` to the
    corresponding ``libindex`` engine (``Int8Engine`` ... ``Int64Engine``).
    Any other scalar type raises ``KeyError``.
    """
    engine_by_codes_type = {
        np.int8: libindex.Int8Engine,
        np.int16: libindex.Int16Engine,
        np.int32: libindex.Int32Engine,
        np.int64: libindex.Int64Engine,
    }
    codes_type = self.codes.dtype.type
    return engine_by_codes_type[codes_type]

_attributes = ['name']

def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
Expand Down Expand Up @@ -382,7 +392,7 @@ def argsort(self, *args, **kwargs):
def _engine(self):

# we are going to look things up with the codes themselves
return self._engine_type(lambda: self.codes.astype('i8'), len(self))
return self._engine_type(lambda: self.codes, len(self))

# introspection
@cache_readonly
Expand Down Expand Up @@ -450,6 +460,7 @@ def get_loc(self, key, method=None):
array([False, True, False, True], dtype=bool)
"""
code = self.categories.get_loc(key)
code = self.codes.dtype.type(code)
try:
return self._engine.get_loc(code)
except KeyError:
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-

import pytest
import numpy as np

import pandas.util.testing as tm
from pandas.core.indexes.api import Index, CategoricalIndex
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas._libs import index as libindex
from .common import Base

from pandas.compat import range, PY3

import numpy as np

from pandas import Categorical, IntervalIndex, compat
from pandas.util.testing import assert_almost_equal
import pandas.core.config as cf
Expand Down Expand Up @@ -1117,3 +1117,23 @@ def test_take_invalid_kwargs(self):
msg = "the 'mode' parameter is not supported"
tm.assert_raises_regex(ValueError, msg, idx.take,
indices, mode='clip')

@pytest.mark.parametrize('dtype, engine_type', [
    (np.int8, libindex.Int8Engine),
    (np.int16, libindex.Int16Engine),
    (np.int32, libindex.Int32Engine),
    (np.int64, libindex.Int64Engine),
])
def test_engine_type(self, dtype, engine_type):
    """
    Check that ``CategoricalIndex._engine`` is an instance of the engine
    class paired with the dtype of ``CategoricalIndex.codes``.
    """
    if dtype == np.int64:
        # An index with enough categories to need int64 codes would be
        # very memory-intensive, so build a 32768-category index
        # (== 2**16 - 2**(16 - 1)) and cast its codes to int64 by hand.
        ci = pd.CategoricalIndex(range(32768))
        ci.values._codes = ci.values._codes.astype('int64')
    else:
        # Number of unique values used to push ``.codes`` to each dtype
        # (e.g. 128 categories for int16 codes).
        uniques_for_dtype = {np.int8: 1, np.int16: 128, np.int32: 32768}
        ci = pd.CategoricalIndex(range(uniques_for_dtype[dtype]))

    codes_dtype = ci.codes.dtype
    assert np.issubdtype(codes_dtype, dtype)

    engine = ci._engine
    assert isinstance(engine, engine_type)
20 changes: 20 additions & 0 deletions pandas/tests/indexing/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
import pytest

from pandas._libs import index as libindex


def _engine_class_name(engine_and_dtype):
    """Return the engine class name of an ``(engine, dtype)`` pair."""
    return engine_and_dtype[0].__name__


@pytest.fixture(
    params=[
        (libindex.Int64Engine, np.int64),
        (libindex.Int32Engine, np.int32),
        (libindex.Int16Engine, np.int16),
        (libindex.Int8Engine, np.int8),
        (libindex.UInt64Engine, np.uint64),
        (libindex.UInt32Engine, np.uint32),
        (libindex.UInt16Engine, np.uint16),
        (libindex.UInt8Engine, np.uint8),
        (libindex.Float64Engine, np.float64),
        (libindex.Float32Engine, np.float32),
    ],
    ids=_engine_class_name)
def numeric_indexing_engine_type_and_dtype(request):
    """
    Fixture yielding each ``(libindex engine class, numpy dtype)`` pair.

    The test id of every parameter is the engine class name.
    """
    engine_and_dtype = request.param
    return engine_and_dtype
Loading