Skip to content

Commit

Permalink
ENH: tolerance now takes list-like argument for reindex and get_index…
Browse files Browse the repository at this point in the history
  • Loading branch information
buntwo authored and Krzysztof Chomski committed Oct 16, 2017
1 parent c0aacb7 commit fcc91de
Show file tree
Hide file tree
Showing 17 changed files with 222 additions and 56 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ Other Enhancements
- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`)
- Improved the import time of pandas by about 2.25x. (:issue:`16764`)
- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
- :func:`Series.reindex`, :func:`DataFrame.reindex`, :func:`Index.get_indexer` now support a list-like argument for ``tolerance``. (:issue:`17367`)

.. _whatsnew_0210.api_breaking:

Expand Down
17 changes: 16 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2470,9 +2470,10 @@ def reindex_like(self, other, method=None, copy=True, limit=None,
Maximum number of consecutive labels to fill for inexact matches.
tolerance : optional
Maximum distance between labels of the other object and this
object for inexact matches.
object for inexact matches. Can be list-like.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Notes
-----
Expand Down Expand Up @@ -2860,7 +2861,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the index and its dtype must exactly match the
index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Examples
--------
Expand Down Expand Up @@ -3120,7 +3128,14 @@ def _reindex_multi(self, axes, copy, fill_value):
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the index and its dtype must exactly match the
index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Examples
--------
Expand Down
23 changes: 21 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2484,7 +2484,14 @@ def _get_unique_index(self, dropna=False):
the index at the matching location must satisfy the equation
``abs(index[loc] - key) <= tolerance``.
Tolerance may be a scalar
value, which applies the same tolerance to all values, or
list-like, which applies variable tolerance per element. List-like
includes list, tuple, array, Series, and must be the same size as
the index and its dtype must exactly match the index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Returns
-------
Expand Down Expand Up @@ -2627,7 +2634,14 @@ def _get_level_values(self, level):
matches. The values of the index at the matching locations must
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
Tolerance may be a scalar value, which applies the same tolerance
to all values, or list-like, which applies variable tolerance per
element. List-like includes list, tuple, array, Series, and must be
the same size as the index and its dtype must exactly match the
index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Examples
--------
Expand All @@ -2647,7 +2661,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
method = missing.clean_reindex_fill_method(method)
target = _ensure_index(target)
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, target)

# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
Expand Down Expand Up @@ -2683,10 +2697,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
'backfill or nearest reindexing')

indexer = self._engine.get_indexer(target._values)

return _ensure_platform_int(indexer)

def _convert_tolerance(self, tolerance):
def _convert_tolerance(self, tolerance, target):
# override this method on subclasses
tolerance = np.asarray(tolerance)
if target.size != tolerance.size and tolerance.size > 1:
raise ValueError('list-like tolerance size must match '
'target index size')
return tolerance

def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas import compat
from pandas.compat.numpy import function as nv
from pandas.core.tools.timedeltas import to_timedelta

import numpy as np
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -431,13 +432,12 @@ def asobject(self):
from pandas.core.index import Index
return Index(self._box_values(self.asi8), name=self.name, dtype=object)

def _convert_tolerance(self, tolerance):
try:
return Timedelta(tolerance).to_timedelta64()
except ValueError:
raise ValueError('tolerance argument for %s must be convertible '
'to Timedelta: %r'
% (type(self).__name__, tolerance))
def _convert_tolerance(self, tolerance, target):
    """
    Convert ``tolerance`` to unboxed timedelta values and validate its size.

    Parameters
    ----------
    tolerance : scalar or list-like
        Anything ``to_timedelta`` accepts; conversion errors raised by
        ``to_timedelta`` propagate unchanged.
    target : object exposing ``.size``
        The labels being looked up; a list-like tolerance is checked
        against its size.

    Returns
    -------
    numpy.ndarray
        Result of ``to_timedelta(tolerance, box=False)`` wrapped with
        ``np.asarray``.

    Raises
    ------
    ValueError
        If ``tolerance`` holds neither exactly one value nor as many
        values as ``target``.
    """
    tolerance = np.asarray(to_timedelta(tolerance, box=False))
    # A single value is accepted for any target size. Every other size,
    # including an empty list-like, has to line up with ``target``;
    # ``size > 1`` alone would let an empty tolerance through.
    if tolerance.size != 1 and tolerance.size != target.size:
        raise ValueError('list-like tolerance size must match '
                         'target index size')
    return tolerance

def _maybe_mask_results(self, result, fill_value=None, convert=None):
"""
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,7 @@ def get_loc(self, key, method=None, tolerance=None):
if tolerance is not None:
# try converting tolerance now, so errors don't get swallowed by
# the try/except clauses below
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, np.asarray(key))

if isinstance(key, datetime):
# needed to localize naive datetimes
Expand All @@ -1447,7 +1447,12 @@ def get_loc(self, key, method=None, tolerance=None):
try:
stamp = Timestamp(key, tz=self.tz)
return Index.get_loc(self, stamp, method, tolerance)
except (KeyError, ValueError):
except KeyError:
raise KeyError(key)
except ValueError as e:
# list-like tolerance size must match target index size
if 'list-like' in str(e):
raise e
raise KeyError(key)

def _maybe_cast_slice_bound(self, label, side, kind):
Expand Down
21 changes: 15 additions & 6 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,21 @@ def _convert_for_op(self, value):

return value

def _convert_tolerance(self, tolerance):
try:
return float(tolerance)
except ValueError:
raise ValueError('tolerance argument for %s must be numeric: %r' %
(type(self).__name__, tolerance))
def _convert_tolerance(self, tolerance, target):
tolerance = np.asarray(tolerance)
if target.size != tolerance.size and tolerance.size > 1:
raise ValueError('list-like tolerance size must match '
'target index size')
if not np.issubdtype(tolerance.dtype, np.number):
if tolerance.ndim > 0:
raise ValueError(('tolerance argument for %s must contain '
'numeric elements if it is list type') %
(type(self).__name__,))
else:
raise ValueError(('tolerance argument for %s must be numeric '
'if it is a scalar: %r') %
(type(self).__name__, tolerance))
return tolerance

@classmethod
def _assert_safe_casting(cls, data, subarr):
Expand Down
24 changes: 17 additions & 7 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,12 +641,17 @@ def to_timestamp(self, freq=None, how='start'):
return DatetimeIndex(new_data, freq='infer', name=self.name)

def _maybe_convert_timedelta(self, other):
if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
if isinstance(
other, (timedelta, np.timedelta64, offsets.Tick, np.ndarray)):
offset = frequencies.to_offset(self.freq.rule_code)
if isinstance(offset, offsets.Tick):
nanos = tslib._delta_to_nanoseconds(other)
if isinstance(other, np.ndarray):
nanos = np.vectorize(tslib._delta_to_nanoseconds)(other)
else:
nanos = tslib._delta_to_nanoseconds(other)
offset_nanos = tslib._delta_to_nanoseconds(offset)
if nanos % offset_nanos == 0:
check = np.all(nanos % offset_nanos == 0)
if check:
return nanos // offset_nanos
elif isinstance(other, offsets.DateOffset):
freqstr = other.rule_code
Expand Down Expand Up @@ -782,7 +787,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = target.asi8

if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, target)
return Index.get_indexer(self._int64index, target, method,
limit, tolerance)

Expand Down Expand Up @@ -825,7 +830,8 @@ def get_loc(self, key, method=None, tolerance=None):
try:
ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance,
np.asarray(key))
return self._int64index.get_loc(ordinal, method, tolerance)

except KeyError:
Expand Down Expand Up @@ -908,8 +914,12 @@ def _get_string_slice(self, key):
return slice(self.searchsorted(t1.ordinal, side='left'),
self.searchsorted(t2.ordinal, side='right'))

def _convert_tolerance(self, tolerance):
tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance)
def _convert_tolerance(self, tolerance, target):
    """
    Convert ``tolerance`` for use against this index's integer ordinals.

    Parameters
    ----------
    tolerance : scalar or list-like
        Passed to ``DatetimeIndexOpsMixin._convert_tolerance``.
    target : object exposing ``.size``
        The labels being looked up.

    Returns
    -------
    Whatever ``self._maybe_convert_timedelta`` returns for the converted
    tolerance.
    NOTE(review): that helper is only partly visible in this diff; confirm
    its failure behaviour (the tests expect ``IncompatibleFrequency``).

    Raises
    ------
    ValueError
        Propagated from the mixin when a list-like ``tolerance`` does not
        match the size of ``target``.
    """
    # The mixin already validates the tolerance size against ``target`` and
    # returns the array it validated, so no second size check is needed.
    tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance,
                                                         target)
    return self._maybe_convert_timedelta(tolerance)

def insert(self, loc, item):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def get_loc(self, key, method=None, tolerance=None):
if tolerance is not None:
# try converting tolerance now, so errors don't get swallowed by
# the try/except clauses below
tolerance = self._convert_tolerance(tolerance)
tolerance = self._convert_tolerance(tolerance, np.asarray(key))

if _is_convertible_to_td(key):
key = Timedelta(key)
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/tools/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'):
elif isinstance(arg, ABCIndexClass):
return _convert_listlike(arg, unit=unit, box=box,
errors=errors, name=arg.name)
elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 0:
# extract array scalar and process below
arg = arg.item()
elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:
return _convert_listlike(arg, unit=unit, box=box, errors=errors)
elif getattr(arg, 'ndim', 1) > 1:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1935,9 +1935,13 @@ def test_reindex_methods(self):

actual = df.reindex_like(df, method=method, tolerance=0)
assert_frame_equal(df, actual)
actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0])
assert_frame_equal(df, actual)

actual = df.reindex(target, method=method, tolerance=1)
assert_frame_equal(expected, actual)
actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
assert_frame_equal(expected, actual)

e2 = expected[::-1]
actual = df.reindex(target[::-1], method=method)
Expand All @@ -1958,6 +1962,11 @@ def test_reindex_methods(self):
actual = df.reindex(target, method='nearest', tolerance=0.2)
assert_frame_equal(expected, actual)

expected = pd.DataFrame({'x': [0, np.nan, 1, np.nan]}, index=target)
actual = df.reindex(target, method='nearest',
tolerance=[0.5, 0.01, 0.4, 0.1])
assert_frame_equal(expected, actual)

def test_reindex_frame_add_nat(self):
rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,17 @@ def test_get_loc(self):
tolerance=np.timedelta64(1, 'D')) == 1
assert idx.get_loc('2000-01-01T12', method='nearest',
tolerance=timedelta(1)) == 1
with tm.assert_raises_regex(ValueError, 'must be convertible'):
with tm.assert_raises_regex(ValueError,
'unit abbreviation w/o a number'):
idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
with pytest.raises(KeyError):
idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')
with pytest.raises(
ValueError,
match='tolerance size must match target index size'):
idx.get_loc('2000-01-01', method='nearest',
tolerance=[pd.Timedelta('1day').to_timedelta64(),
pd.Timedelta('1day').to_timedelta64()])

assert idx.get_loc('2000', method='nearest') == slice(0, 3)
assert idx.get_loc('2000-01', method='nearest') == slice(0, 3)
Expand Down Expand Up @@ -93,6 +100,19 @@ def test_get_indexer(self):
idx.get_indexer(target, 'nearest',
tolerance=pd.Timedelta('1 hour')),
np.array([0, -1, 1], dtype=np.intp))
tol_raw = [pd.Timedelta('1 hour'),
pd.Timedelta('1 hour'),
pd.Timedelta('1 hour').to_timedelta64(), ]
tm.assert_numpy_array_equal(
idx.get_indexer(target, 'nearest',
tolerance=[np.timedelta64(x) for x in tol_raw]),
np.array([0, -1, 1], dtype=np.intp))
tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
pd.Timedelta('1 hour').to_timedelta64(),
'foo', ]
with pytest.raises(
ValueError, match='abbreviation w/o a number'):
idx.get_indexer(target, 'nearest', tolerance=tol_bad)
with pytest.raises(ValueError):
idx.get_indexer(idx[[0]], method='nearest', tolerance='foo')

Expand Down
24 changes: 23 additions & 1 deletion pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas import (PeriodIndex, period_range, notna, DatetimeIndex, NaT,
Index, Period, Int64Index, Series, DataFrame, date_range,
offsets, compat)
from pandas.core.indexes.period import IncompatibleFrequency

from ..datetimelike import DatetimeLike

Expand Down Expand Up @@ -83,14 +84,21 @@ def test_get_loc(self):
tolerance=np.timedelta64(1, 'D')) == 1
assert idx.get_loc('2000-01-02T12', method='nearest',
tolerance=timedelta(1)) == 1
with tm.assert_raises_regex(ValueError, 'must be convertible'):
with tm.assert_raises_regex(ValueError,
'unit abbreviation w/o a number'):
idx.get_loc('2000-01-10', method='nearest', tolerance='foo')

msg = 'Input has different freq from PeriodIndex\\(freq=D\\)'
with tm.assert_raises_regex(ValueError, msg):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
with pytest.raises(KeyError):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
with pytest.raises(
ValueError,
match='list-like tolerance size must match target index size'):
idx.get_loc('2000-01-10', method='nearest',
tolerance=[pd.Timedelta('1 day').to_timedelta64(),
pd.Timedelta('1 day').to_timedelta64()])

def test_where(self):
i = self.create_index()
Expand Down Expand Up @@ -158,6 +166,20 @@ def test_get_indexer(self):
tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
tolerance='1 day'),
np.array([0, 1, 1], dtype=np.intp))
tol_raw = [pd.Timedelta('1 hour'),
pd.Timedelta('1 hour'),
np.timedelta64(1, 'D'), ]
tm.assert_numpy_array_equal(
idx.get_indexer(target, 'nearest',
tolerance=[np.timedelta64(x) for x in tol_raw]),
np.array([0, -1, 1], dtype=np.intp))
tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
pd.Timedelta('1 hour').to_timedelta64(),
np.timedelta64(1, 'M'), ]
with pytest.raises(
IncompatibleFrequency,
match='Input has different freq from'):
idx.get_indexer(target, 'nearest', tolerance=tol_bad)

def test_repeat(self):
# GH10183
Expand Down
Loading

0 comments on commit fcc91de

Please sign in to comment.