From 8b50d8c854196e3025c9a881cafeedc5f509aaef Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 15 Aug 2016 18:22:40 -0400 Subject: [PATCH] BUG: Don't error in pd.to_timedelta when errors=ignore Title is self-explanatory. Closes #13613. Author: gfyoung Closes #13832 from gfyoung/to-timedelta-error-bug and squashes the following commits: dc39205 [gfyoung] BUG: Don't error in pd.to_timedelta when errors=ignore --- asv_bench/benchmarks/timedelta.py | 17 +++- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/src/inference.pyx | 2 +- pandas/tseries/tests/test_timedeltas.py | 23 +++++ pandas/tseries/timedeltas.py | 89 ++++++++++++------ pandas/tslib.pxd | 2 +- pandas/tslib.pyx | 120 +++++++++--------------- 7 files changed, 146 insertions(+), 108 deletions(-) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 2f252a4d3e1dc..9719fd87dfb2e 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -31,4 +31,19 @@ def setup(self): self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) \ No newline at end of file + to_timedelta(self.arr) + + +class timedelta_convert_bad_parse(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 1000, size=10000) + self.arr = ['{0} days'.format(i) for i in self.arr] + self.arr[-1] = 'apple' + + def time_timedelta_convert_coerce(self): + to_timedelta(self.arr, errors='coerce') + + def time_timedelta_convert_ignore(self): + to_timedelta(self.arr, errors='ignore') diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index d436d4dd3703b..e3cdefd36b0be 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -858,6 +858,7 @@ Bug Fixes - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. (:issue:`13994`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) +- Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (issue:`9161`, :issue:`13544`) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 039e0df4193b3..62555dc7f178c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -780,7 +780,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, 'ns', False) + itimedeltas[i] = convert_to_timedelta64(val, 'ns') seen_timedelta = 1 else: seen_object = 1 diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 0bdf8590ec487..159d2b4f52f2a 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -845,6 +845,11 @@ def testit(unit, transform): def test_to_timedelta_invalid(self): + # bad value for errors parameter + msg = "errors must be one of" + tm.assertRaisesRegexp(ValueError, msg, to_timedelta, + ['foo'], errors='never') + # these will error self.assertRaises(ValueError, lambda: to_timedelta([1, 2], unit='foo')) self.assertRaises(ValueError, lambda: to_timedelta(1, unit='foo')) @@ -862,6 +867,24 @@ def test_to_timedelta_invalid(self): to_timedelta(['1 day', 'bar', '1 min'], errors='coerce')) + # gh-13613: these should not error because errors='ignore' + invalid_data = 'apple' + self.assertEqual(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = ['apple', '1 days'] + tm.assert_numpy_array_equal( + np.array(invalid_data, dtype=object), + to_timedelta(invalid_data, errors='ignore')) + + invalid_data = pd.Index(['apple', '1 days']) + tm.assert_index_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = Series(['apple', '1 days']) + tm.assert_series_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + def test_to_timedelta_via_apply(self): # GH 5458 expected = Series([np.timedelta64(1, 's')]) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 7f28ec86ec40d..2ca3fcea8005b 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -3,7 +3,9 @@ """ import numpy as np +import pandas as pd import pandas.tslib as tslib + from pandas.types.common import (_ensure_object, is_integer_dtype, is_timedelta64_dtype, @@ -64,37 +66,22 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise', coerce=None): """ unit = _validate_timedelta_unit(unit) - def _convert_listlike(arg, box, unit, name=None): - - if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): - arg = np.array(list(arg), dtype='O') - - # these are shortcutable - if is_timedelta64_dtype(arg): - value = arg.astype('timedelta64[ns]') - elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{0}]'.format( - unit)).astype('timedelta64[ns]', copy=False) - else: - value = tslib.array_to_timedelta64(_ensure_object(arg), - unit=unit, errors=errors) - value = value.astype('timedelta64[ns]', copy=False) - - if box: - from pandas import TimedeltaIndex - value = TimedeltaIndex(value, unit='ns', name=name) - return value + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") if arg is None: return arg elif isinstance(arg, ABCSeries): from pandas import Series - values = _convert_listlike(arg._values, box=False, unit=unit) - return Series(values, index=arg.index, name=arg.name, dtype='m8[ns]') + values = _convert_listlike(arg._values, unit=unit, + box=False, errors=errors) + return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, box=box, unit=unit, name=arg.name) + return _convert_listlike(arg, unit=unit, box=box, + errors=errors, name=arg.name) elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: - return _convert_listlike(arg, box=box, unit=unit) + return _convert_listlike(arg, unit=unit, box=box, errors=errors) elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, timedelta, list, tuple, ' '1-d array, or Series') @@ -142,13 +129,55 @@ def _validate_timedelta_unit(arg): def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): - """ - convert strings to timedelta; coerce to Timedelta (if box), else - np.timedelta64 - """ + """Convert string 'r' to a timedelta object.""" + + try: + result = tslib.convert_to_timedelta64(r, unit) + except ValueError: + if errors == 'raise': + raise + elif errors == 'ignore': + return r + + # coerce + result = pd.NaT - result = tslib.convert_to_timedelta(r, unit, errors) if box: result = tslib.Timedelta(result) - return result + + +def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): + """Convert a list of objects to a timedelta index object.""" + + if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): + arg = np.array(list(arg), dtype='O') + + # these are shortcut-able + if is_timedelta64_dtype(arg): + value = arg.astype('timedelta64[ns]') + elif is_integer_dtype(arg): + value = arg.astype('timedelta64[{0}]'.format( + unit)).astype('timedelta64[ns]', copy=False) + else: + try: + value = tslib.array_to_timedelta64(_ensure_object(arg), + unit=unit, errors=errors) + value = value.astype('timedelta64[ns]', copy=False) + except ValueError: + if errors == 'ignore': + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. + raise + + if box: + from pandas import TimedeltaIndex + value = TimedeltaIndex(value, unit='ns', name=name) + return value diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd index d6c5810e1d713..aa8cbcb2cedc7 100644 --- a/pandas/tslib.pxd +++ b/pandas/tslib.pxd @@ -1,7 +1,7 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) -cdef convert_to_timedelta64(object, object, object) +cpdef convert_to_timedelta64(object, object) cpdef object maybe_get_tz(object) cdef bint _is_utc(object) cdef bint _is_tzlocal(object) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 3c07cfd2446ed..53c77b2d8f9d7 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2619,7 +2619,7 @@ class Timedelta(_Timedelta): try: nano = kwargs.pop('nanoseconds',0) - value = convert_to_timedelta64(timedelta(**kwargs),'ns',False) + nano + value = convert_to_timedelta64(timedelta(**kwargs),'ns') + nano except TypeError as e: raise ValueError("cannot construct a Timedelta from the passed arguments, allowed keywords are " "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]") @@ -2627,9 +2627,9 @@ class Timedelta(_Timedelta): if isinstance(value, Timedelta): value = value.value elif util.is_string_object(value): - value = np.timedelta64(parse_timedelta_string(value, False)) + value = np.timedelta64(parse_timedelta_string(value)) elif isinstance(value, timedelta): - value = convert_to_timedelta64(value,'ns',False) + value = convert_to_timedelta64(value,'ns') elif isinstance(value, np.timedelta64): if unit is not None: value = value.astype('timedelta64[{0}]'.format(unit)) @@ -2638,7 +2638,7 @@ class Timedelta(_Timedelta): value = np.timedelta64(_delta_to_nanoseconds(value.delta),'ns') elif is_integer_object(value) or util.is_float_object(value): # unit=None is de-facto 'ns' - value = convert_to_timedelta64(value,unit,False) + value = convert_to_timedelta64(value,unit) elif _checknull_with_nat(value): return NaT else: @@ -3001,37 +3001,41 @@ cdef PyTypeObject* td_type = Timedelta cdef inline bint is_timedelta(object o): return Py_TYPE(o) == td_type # isinstance(o, Timedelta) -def array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): - """ convert an ndarray to an array of ints that are timedeltas - force conversion if errors = 'coerce', - else will raise if cannot convert """ +cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): + """ + Convert an ndarray to an array of timedeltas. If errors == 'coerce', + coerce non-convertible objects to NaT. Otherwise, raise. + """ + cdef: Py_ssize_t i, n ndarray[int64_t] iresult - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' - assert is_raise or is_ignore or is_coerce + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") n = values.shape[0] result = np.empty(n, dtype='m8[ns]') iresult = result.view('i8') - # usually we have all strings - # if so then we hit the fast path + # Usually, we have all strings. If so, we hit the fast path. + # If this path fails, we try conversion a different way, and + # this is where all of the error handling will take place. try: for i in range(n): - result[i] = parse_timedelta_string(values[i], is_coerce) + result[i] = parse_timedelta_string(values[i]) except: for i in range(n): - result[i] = convert_to_timedelta64(values[i], unit, is_coerce) - return iresult - - -def convert_to_timedelta(object ts, object unit='ns', errors='raise'): - cdef bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + try: + result[i] = convert_to_timedelta64(values[i], unit) + except ValueError: + if errors == 'coerce': + result[i] = NPY_NAT + else: + raise - assert is_raise or is_ignore or is_coerce - return convert_to_timedelta64(ts, unit, is_coerce) + return iresult cdef dict timedelta_abbrevs = { 'D' : 'd', 'd' : 'd', @@ -3099,15 +3103,10 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) -cdef inline parse_timedelta_string(object ts, coerce=False): +cdef inline parse_timedelta_string(object ts): """ - Parse an regular format timedelta string - - Return an int64_t or raise a ValueError on an invalid parse - - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Parse a regular format timedelta string. Return an int64_t (in ns) + or raise a ValueError on an invalid parse. """ cdef: @@ -3163,13 +3162,7 @@ cdef inline parse_timedelta_string(object ts, coerce=False): number.append(c) else: - - try: - r = timedelta_from_spec(number, frac, unit) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) unit, number, frac = [], [c], [] result += timedelta_as_neg(r, neg) @@ -3196,9 +3189,9 @@ cdef inline parse_timedelta_string(object ts, coerce=False): result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - if coerce: - return NPY_NAT - raise ValueError("expecting hh:mm:ss format, received: {0}".format(ts)) + raise ValueError("expecting hh:mm:ss format, " + "received: {0}".format(ts)) + unit, number = [], [] # after the decimal point @@ -3228,21 +3221,15 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we had a dot, but we have a fractional # value since we have an unit if have_dot and len(unit): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) # we have a dot as part of a regular format # e.g. hh:mm:ss.fffffff elif have_dot: - if (len(number) or len(frac)) and not len(unit) and current_unit is None: - if coerce: - return NPY_NAT + if ((len(number) or len(frac)) and not len(unit) + and current_unit is None): raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: @@ -3266,38 +3253,24 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we have a last abbreviation elif len(unit): - if len(number): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) else: - if coerce: - return NPY_NAT raise ValueError("unit abbreviation w/o a number") # treat as nanoseconds # but only if we don't have anything else else: - if have_value: raise ValueError("have leftover units") if len(number): - try: - r = timedelta_from_spec(number, frac, 'ns') - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, 'ns') + result += timedelta_as_neg(r, neg) return result -cdef inline convert_to_timedelta64(object ts, object unit, object coerce): +cpdef convert_to_timedelta64(object ts, object unit): """ Convert an incoming object to a timedelta64 if possible @@ -3308,9 +3281,7 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): - np.int64 (with unit providing a possible modifier) - None/NaT - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Return an ns based int64 # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case @@ -3346,16 +3317,15 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): ts = cast_from_unit(ts, unit) ts = np.timedelta64(ts) elif util.is_string_object(ts): - ts = np.timedelta64(parse_timedelta_string(ts, coerce)) + ts = np.timedelta64(parse_timedelta_string(ts)) elif hasattr(ts,'delta'): ts = np.timedelta64(_delta_to_nanoseconds(ts),'ns') if isinstance(ts, timedelta): ts = np.timedelta64(ts) elif not isinstance(ts, np.timedelta64): - if coerce: - return np.timedelta64(NPY_NAT) - raise ValueError("Invalid type for timedelta scalar: %s" % type(ts)) + raise ValueError("Invalid type for timedelta " + "scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'):