ENH: tolerance now takes list-like argument for reindex and get_indexer. (pandas-dev#17367)
reef-technologies · Oct 16, 2017 · fcc91de · fcc91de
1 parent c0aacb7
commit fcc91de
Show file tree

Hide file tree

Showing 17 changed files with 222 additions and 56 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -234,6 +234,7 @@ Other Enhancements
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`)
 - Improved the import time of pandas by about 2.25x.  (:issue:`16764`)
 - :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
+- :func:`Series.reindex`, :func:`DataFrame.reindex` and :func:`Index.get_indexer` now support a list-like argument for ``tolerance``. (:issue:`17367`)
 
 .. _whatsnew_0210.api_breaking:
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2470,9 +2470,10 @@ def reindex_like(self, other, method=None, copy=True, limit=None,
             Maximum number of consecutive labels to fill for inexact matches.
         tolerance : optional
             Maximum distance between labels of the other object and this
-            object for inexact matches.
+            object for inexact matches. Can be list-like.
 
             .. versionadded:: 0.17.0
+            .. versionadded:: 0.21.0 (list-like tolerance)
 
         Notes
         -----
@@ -2860,7 +2861,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
 
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies a variable tolerance
+            per element. List-likes include list, tuple, array and Series; a
+            list-like must be the same size as the index, and its dtype must
+            exactly match the index's type.
+
             .. versionadded:: 0.17.0
+            .. versionadded:: 0.21.0 (list-like tolerance)
 
         Examples
         --------
@@ -3120,7 +3128,14 @@ def _reindex_multi(self, axes, copy, fill_value):
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
 
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies a variable tolerance
+            per element. List-likes include list, tuple, array and Series; a
+            list-like must be the same size as the index, and its dtype must
+            exactly match the index's type.
+
             .. versionadded:: 0.17.0
+            .. versionadded:: 0.21.0 (list-like tolerance)
 
         Examples
         --------

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2484,7 +2484,14 @@ def _get_unique_index(self, dropna=False):
             the index at the matching location most satisfy the equation
             ``abs(index[loc] - key) <= tolerance``.
 
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies a variable tolerance
+            per element. List-likes include list, tuple, array and Series; a
+            list-like must be the same size as the index, and its dtype must
+            exactly match the index's type.
+
             .. versionadded:: 0.17.0
+            .. versionadded:: 0.21.0 (list-like tolerance)
 
         Returns
         -------
@@ -2627,7 +2634,14 @@ def _get_level_values(self, level):
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
 
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies a variable tolerance
+            per element. List-likes include list, tuple, array and Series; a
+            list-like must be the same size as the target, and its dtype
+            must exactly match the index's type.
+
             .. versionadded:: 0.17.0
+            .. versionadded:: 0.21.0 (list-like tolerance)
 
         Examples
         --------
@@ -2647,7 +2661,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         method = missing.clean_reindex_fill_method(method)
         target = _ensure_index(target)
         if tolerance is not None:
-            tolerance = self._convert_tolerance(tolerance)
+            tolerance = self._convert_tolerance(tolerance, target)
 
         # Treat boolean labels passed to a numeric index as not found. Without
         # this fix False and True would be treated as 0 and 1 respectively.
@@ -2683,10 +2697,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                                  'backfill or nearest reindexing')
 
             indexer = self._engine.get_indexer(target._values)
+
         return _ensure_platform_int(indexer)
 
-    def _convert_tolerance(self, tolerance):
+    def _convert_tolerance(self, tolerance, target):
         # override this method on subclasses
+        tolerance = np.asarray(tolerance)
+        if target.size != tolerance.size and tolerance.size > 1:
+            raise ValueError('list-like tolerance size must match '
+                             'target index size')
         return tolerance
 
     def _get_fill_indexer(self, target, method, limit=None, tolerance=None):

diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -7,6 +7,7 @@
 
 from pandas import compat
 from pandas.compat.numpy import function as nv
+from pandas.core.tools.timedeltas import to_timedelta
 
 import numpy as np
 from pandas.core.dtypes.common import (
@@ -431,13 +432,12 @@ def asobject(self):
         from pandas.core.index import Index
         return Index(self._box_values(self.asi8), name=self.name, dtype=object)
 
-    def _convert_tolerance(self, tolerance):
-        try:
-            return Timedelta(tolerance).to_timedelta64()
-        except ValueError:
-            raise ValueError('tolerance argument for %s must be convertible '
-                             'to Timedelta: %r'
-                             % (type(self).__name__, tolerance))
+    def _convert_tolerance(self, tolerance, target):
+        tolerance = np.asarray(to_timedelta(tolerance, box=False))
+        if target.size != tolerance.size and tolerance.size > 1:
+            raise ValueError('list-like tolerance size must match '
+                             'target index size')
+        return tolerance
 
     def _maybe_mask_results(self, result, fill_value=None, convert=None):
         """

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -1423,7 +1423,7 @@ def get_loc(self, key, method=None, tolerance=None):
         if tolerance is not None:
             # try converting tolerance now, so errors don't get swallowed by
             # the try/except clauses below
-            tolerance = self._convert_tolerance(tolerance)
+            tolerance = self._convert_tolerance(tolerance, np.asarray(key))
 
         if isinstance(key, datetime):
             # needed to localize naive datetimes
@@ -1447,7 +1447,12 @@ def get_loc(self, key, method=None, tolerance=None):
             try:
                 stamp = Timestamp(key, tz=self.tz)
                 return Index.get_loc(self, stamp, method, tolerance)
-            except (KeyError, ValueError):
+            except KeyError:
+                raise KeyError(key)
+            except ValueError as e:
+                # list-like tolerance size must match target index size
+                if 'list-like' in str(e):
+                    raise e
                 raise KeyError(key)
 
     def _maybe_cast_slice_bound(self, label, side, kind):

diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -71,12 +71,21 @@ def _convert_for_op(self, value):
 
         return value
 
-    def _convert_tolerance(self, tolerance):
-        try:
-            return float(tolerance)
-        except ValueError:
-            raise ValueError('tolerance argument for %s must be numeric: %r' %
-                             (type(self).__name__, tolerance))
+    def _convert_tolerance(self, tolerance, target):
+        tolerance = np.asarray(tolerance)
+        if target.size != tolerance.size and tolerance.size > 1:
+            raise ValueError('list-like tolerance size must match '
+                             'target index size')
+        if not np.issubdtype(tolerance.dtype, np.number):
+            if tolerance.ndim > 0:
+                raise ValueError(('tolerance argument for %s must contain '
+                                  'numeric elements if it is list type') %
+                                 (type(self).__name__,))
+            else:
+                raise ValueError(('tolerance argument for %s must be numeric '
+                                  'if it is a scalar: %r') %
+                                 (type(self).__name__, tolerance))
+        return tolerance
 
     @classmethod
     def _assert_safe_casting(cls, data, subarr):

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -641,12 +641,17 @@ def to_timestamp(self, freq=None, how='start'):
         return DatetimeIndex(new_data, freq='infer', name=self.name)
 
     def _maybe_convert_timedelta(self, other):
-        if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
+        if isinstance(
+                other, (timedelta, np.timedelta64, offsets.Tick, np.ndarray)):
             offset = frequencies.to_offset(self.freq.rule_code)
             if isinstance(offset, offsets.Tick):
-                nanos = tslib._delta_to_nanoseconds(other)
+                if isinstance(other, np.ndarray):
+                    nanos = np.vectorize(tslib._delta_to_nanoseconds)(other)
+                else:
+                    nanos = tslib._delta_to_nanoseconds(other)
                 offset_nanos = tslib._delta_to_nanoseconds(offset)
-                if nanos % offset_nanos == 0:
+                check = np.all(nanos % offset_nanos == 0)
+                if check:
                     return nanos // offset_nanos
         elif isinstance(other, offsets.DateOffset):
             freqstr = other.rule_code
@@ -782,7 +787,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             target = target.asi8
 
         if tolerance is not None:
-            tolerance = self._convert_tolerance(tolerance)
+            tolerance = self._convert_tolerance(tolerance, target)
         return Index.get_indexer(self._int64index, target, method,
                                  limit, tolerance)
 
@@ -825,7 +830,8 @@ def get_loc(self, key, method=None, tolerance=None):
             try:
                 ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal
                 if tolerance is not None:
-                    tolerance = self._convert_tolerance(tolerance)
+                    tolerance = self._convert_tolerance(tolerance,
+                                                        np.asarray(key))
                 return self._int64index.get_loc(ordinal, method, tolerance)
 
             except KeyError:
@@ -908,8 +914,12 @@ def _get_string_slice(self, key):
         return slice(self.searchsorted(t1.ordinal, side='left'),
                      self.searchsorted(t2.ordinal, side='right'))
 
-    def _convert_tolerance(self, tolerance):
-        tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance)
+    def _convert_tolerance(self, tolerance, target):
+        tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance,
+                                                             target)
+        if target.size != tolerance.size and tolerance.size > 1:
+            raise ValueError('list-like tolerance size must match '
+                             'target index size')
         return self._maybe_convert_timedelta(tolerance)
 
     def insert(self, loc, item):

diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -699,7 +699,7 @@ def get_loc(self, key, method=None, tolerance=None):
         if tolerance is not None:
             # try converting tolerance now, so errors don't get swallowed by
             # the try/except clauses below
-            tolerance = self._convert_tolerance(tolerance)
+            tolerance = self._convert_tolerance(tolerance, np.asarray(key))
 
         if _is_convertible_to_td(key):
             key = Timedelta(key)

diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
@@ -83,6 +83,9 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'):
     elif isinstance(arg, ABCIndexClass):
         return _convert_listlike(arg, unit=unit, box=box,
                                  errors=errors, name=arg.name)
+    elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 0:
+        # extract array scalar and process below
+        arg = arg.item()
     elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:
         return _convert_listlike(arg, unit=unit, box=box, errors=errors)
     elif getattr(arg, 'ndim', 1) > 1:

diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
@@ -1935,9 +1935,13 @@ def test_reindex_methods(self):
 
             actual = df.reindex_like(df, method=method, tolerance=0)
             assert_frame_equal(df, actual)
+            actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0])
+            assert_frame_equal(df, actual)
 
             actual = df.reindex(target, method=method, tolerance=1)
             assert_frame_equal(expected, actual)
+            actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
+            assert_frame_equal(expected, actual)
 
             e2 = expected[::-1]
             actual = df.reindex(target[::-1], method=method)
@@ -1958,6 +1962,11 @@ def test_reindex_methods(self):
         actual = df.reindex(target, method='nearest', tolerance=0.2)
         assert_frame_equal(expected, actual)
 
+        expected = pd.DataFrame({'x': [0, np.nan, 1, np.nan]}, index=target)
+        actual = df.reindex(target, method='nearest',
+                            tolerance=[0.5, 0.01, 0.4, 0.1])
+        assert_frame_equal(expected, actual)
+
     def test_reindex_frame_add_nat(self):
         rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
         df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})

diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -41,10 +41,17 @@ def test_get_loc(self):
                            tolerance=np.timedelta64(1, 'D')) == 1
         assert idx.get_loc('2000-01-01T12', method='nearest',
                            tolerance=timedelta(1)) == 1
-        with tm.assert_raises_regex(ValueError, 'must be convertible'):
+        with tm.assert_raises_regex(ValueError,
+                                    'unit abbreviation w/o a number'):
             idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
         with pytest.raises(KeyError):
             idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')
+        with pytest.raises(
+                ValueError,
+                match='tolerance size must match target index size'):
+            idx.get_loc('2000-01-01', method='nearest',
+                        tolerance=[pd.Timedelta('1day').to_timedelta64(),
+                                   pd.Timedelta('1day').to_timedelta64()])
 
         assert idx.get_loc('2000', method='nearest') == slice(0, 3)
         assert idx.get_loc('2000-01', method='nearest') == slice(0, 3)
@@ -93,6 +100,19 @@ def test_get_indexer(self):
             idx.get_indexer(target, 'nearest',
                             tolerance=pd.Timedelta('1 hour')),
             np.array([0, -1, 1], dtype=np.intp))
+        tol_raw = [pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour').to_timedelta64(), ]
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, 'nearest',
+                            tolerance=[np.timedelta64(x) for x in tol_raw]),
+            np.array([0, -1, 1], dtype=np.intp))
+        tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+                   pd.Timedelta('1 hour').to_timedelta64(),
+                   'foo', ]
+        with pytest.raises(
+                ValueError, match='abbreviation w/o a number'):
+            idx.get_indexer(target, 'nearest', tolerance=tol_bad)
         with pytest.raises(ValueError):
             idx.get_indexer(idx[[0]], method='nearest', tolerance='foo')
 

diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
@@ -9,6 +9,7 @@
 from pandas import (PeriodIndex, period_range, notna, DatetimeIndex, NaT,
                     Index, Period, Int64Index, Series, DataFrame, date_range,
                     offsets, compat)
+from pandas.core.indexes.period import IncompatibleFrequency
 
 from ..datetimelike import DatetimeLike
 
@@ -83,14 +84,21 @@ def test_get_loc(self):
                            tolerance=np.timedelta64(1, 'D')) == 1
         assert idx.get_loc('2000-01-02T12', method='nearest',
                            tolerance=timedelta(1)) == 1
-        with tm.assert_raises_regex(ValueError, 'must be convertible'):
+        with tm.assert_raises_regex(ValueError,
+                                    'unit abbreviation w/o a number'):
             idx.get_loc('2000-01-10', method='nearest', tolerance='foo')
 
         msg = 'Input has different freq from PeriodIndex\\(freq=D\\)'
         with tm.assert_raises_regex(ValueError, msg):
             idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
         with pytest.raises(KeyError):
             idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
+        with pytest.raises(
+                ValueError,
+                match='list-like tolerance size must match target index size'):
+            idx.get_loc('2000-01-10', method='nearest',
+                        tolerance=[pd.Timedelta('1 day').to_timedelta64(),
+                                   pd.Timedelta('1 day').to_timedelta64()])
 
     def test_where(self):
         i = self.create_index()
@@ -158,6 +166,20 @@ def test_get_indexer(self):
         tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
                                                     tolerance='1 day'),
                                     np.array([0, 1, 1], dtype=np.intp))
+        tol_raw = [pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour'),
+                   np.timedelta64(1, 'D'), ]
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, 'nearest',
+                            tolerance=[np.timedelta64(x) for x in tol_raw]),
+            np.array([0, -1, 1], dtype=np.intp))
+        tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+                   pd.Timedelta('1 hour').to_timedelta64(),
+                   np.timedelta64(1, 'M'), ]
+        with pytest.raises(
+                IncompatibleFrequency,
+                match='Input has different freq from'):
+            idx.get_indexer(target, 'nearest', tolerance=tol_bad)
 
     def test_repeat(self):
         # GH10183