Skip to content

Commit

Permalink
PERF: don't call RangeIndex._data unnecessarily (#26565)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored Jun 1, 2019
1 parent 3db9dc3 commit 437efa6
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 2 deletions.
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ def time_min(self):
def time_min_trivial(self):
    """Benchmark ``min()`` on the index stored in ``self.idx_inc``."""
    index = self.idx_inc
    index.min()

def time_get_loc_inc(self):
    """Benchmark ``get_loc`` for label 900000 on ``self.idx_inc``."""
    index = self.idx_inc
    index.get_loc(900000)

def time_get_loc_dec(self):
    """Benchmark ``get_loc`` for label 100000 on ``self.idx_dec``."""
    index = self.idx_dec
    index.get_loc(100000)


class IndexAppend:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,7 @@ Performance Improvements
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
- Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
Expand Down
32 changes: 30 additions & 2 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.indexes.numeric import Int64Index

from pandas.io.formats.printing import pprint_thing


class RangeIndex(Int64Index):
"""
Expand Down Expand Up @@ -64,6 +66,8 @@ class RangeIndex(Int64Index):
_typ = 'rangeindex'
_engine_type = libindex.Int64Engine

# check whether self._data has been called
_cached_data = None # type: np.ndarray
# --------------------------------------------------------------------
# Constructors

Expand Down Expand Up @@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None,
for k, v in kwargs.items():
setattr(result, k, v)

result._range = range(result._start, result._stop, result._step)

result._reset_identity()
return result

Expand All @@ -180,9 +186,19 @@ def _constructor(self):
""" return the class to use for construction """
return Int64Index

@property
def _data(self):
    """
    Return the index values as an int64 array, building it lazily.

    For performance reasons the array is created only when it is first
    needed. The constructed array is saved in ``_cached_data``, which
    allows callers to check whether the array has been created without
    accessing ``_data`` (and thereby triggering the construction).
    """
    # ``_cached_data`` stays None until the first access; afterwards the
    # same array object is returned on every call.
    if self._cached_data is None:
        self._cached_data = np.arange(self._start, self._stop, self._step,
                                      dtype=np.int64)
    return self._cached_data

@cache_readonly
def _int64index(self):
Expand Down Expand Up @@ -215,6 +231,9 @@ def _format_data(self, name=None):
# we are formatting thru the attributes
return None

def _format_with_header(self, header, na_rep='NaN', **kwargs):
    """
    Return ``header`` followed by the pretty-printed values of the index.

    The values are read from ``self._range`` rather than from ``_data``.
    ``na_rep`` and ``**kwargs`` are accepted but not used here.
    """
    formatted_values = [pprint_thing(value) for value in self._range]
    return header + formatted_values

# --------------------------------------------------------------------
@property
def start(self):
Expand Down Expand Up @@ -296,6 +315,15 @@ def is_monotonic_decreasing(self):
def has_duplicates(self):
return False

@Appender(_index_shared_docs['get_loc'])
def get_loc(self, key, method=None, tolerance=None):
    # Fast path: for a plain integer key with no fill method or tolerance,
    # the position can be looked up directly on the underlying ``range``
    # object stored in ``self._range``.
    if is_integer(key) and method is None and tolerance is None:
        try:
            return self._range.index(key)
        except ValueError as err:
            # ``range.index`` signals a missing value with ValueError;
            # re-raise as KeyError and chain the original exception so
            # the cause stays visible in the traceback.
            raise KeyError(key) from err
    # Everything else (non-integer keys, ``method`` or ``tolerance`` given)
    # is delegated to the parent implementation.
    return super().get_loc(key, method=method, tolerance=tolerance)

def tolist(self):
    """Return the index values as a plain Python list of ints."""
    values = range(self._start, self._stop, self._step)
    return list(values)

Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/indexes/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,42 @@ def test_view(self):
def test_dtype(self):
    """The index under test must report an int64 dtype."""
    observed = self.index.dtype
    assert observed == np.int64

def test_cached_data(self):
# GH 26565
# Accessing RangeIndex._data builds an int64 array and stores it in
# self._cached_data. This test checks that _cached_data stays None for
# the operations below, and is only set once _data is accessed explicitly.
idx = RangeIndex(0, 100, 10)

# a freshly constructed index has not built its array yet
assert idx._cached_data is None

# repr must not trigger creation of the array
repr(idx)
assert idx._cached_data is None

# str must not trigger creation of the array
str(idx)
assert idx._cached_data is None

# scalar label lookup directly on the index
idx.get_loc(20)
assert idx._cached_data is None

df = pd.DataFrame({'a': range(10)}, index=idx)

# label-based scalar lookup through a DataFrame
df.loc[50]
assert idx._cached_data is None

# a missing label raises KeyError, still without building the array
with pytest.raises(KeyError):
df.loc[51]
assert idx._cached_data is None

# label-based slicing
df.loc[10:50]
assert idx._cached_data is None

# positional slicing
df.iloc[5:10]
assert idx._cached_data is None

# actually accessing idx._data builds the array and populates the cache
assert isinstance(idx._data, np.ndarray)
assert isinstance(idx._cached_data, np.ndarray)

def test_is_monotonic(self):
assert self.index.is_monotonic is True
assert self.index.is_monotonic_increasing is True
Expand Down

0 comments on commit 437efa6

Please sign in to comment.