diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6bb1ddfe2324d..5532771b38a0e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -654,6 +654,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) - Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) - Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index e985a519c3046..0c2753dbc6f28 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -22,6 +22,7 @@ cimport ccalendar from conversion import tz_localize_to_utc, normalize_i8_timestamps from conversion cimport (tz_convert_single, _TSObject, convert_to_tsobject, convert_datetime_to_tsobject) +import enum from fields import get_start_end_field, get_date_name_field from nattype import NaT from nattype cimport NPY_NAT @@ -57,50 +58,114 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base -def round_ns(values, rounder, freq): +@enum.unique +class RoundTo(enum.Enum): """ - Applies rounding function at given frequency + enumeration defining the available rounding modes + + Attributes + ---------- + MINUS_INFTY + round towards -∞, or floor [2]_ + PLUS_INFTY + round towards +∞, or ceil [3]_ + NEAREST_HALF_EVEN + round to nearest, tie-break half to even [6]_ + NEAREST_HALF_MINUS_INFTY + round to nearest, tie-break half to -∞ [5]_ + 
NEAREST_HALF_PLUS_INFTY + round to nearest, tie-break half to +∞ [4]_ + + + References + ---------- + .. [1] "Rounding - Wikipedia" + https://en.wikipedia.org/wiki/Rounding + .. [2] "Rounding down" + https://en.wikipedia.org/wiki/Rounding#Rounding_down + .. [3] "Rounding up" + https://en.wikipedia.org/wiki/Rounding#Rounding_up + .. [4] "Round half up" + https://en.wikipedia.org/wiki/Rounding#Round_half_up + .. [5] "Round half down" + https://en.wikipedia.org/wiki/Rounding#Round_half_down + .. [6] "Round half to even" + https://en.wikipedia.org/wiki/Rounding#Round_half_to_even + """ + MINUS_INFTY = 0 + PLUS_INFTY = 1 + NEAREST_HALF_EVEN = 2 + NEAREST_HALF_PLUS_INFTY = 3 + NEAREST_HALF_MINUS_INFTY = 4 + + +cdef inline _npdivmod(x1, x2): + """implement divmod for numpy < 1.13""" + return np.floor_divide(x1, x2), np.remainder(x1, x2) + + +try: + from numpy import divmod as npdivmod +except ImportError: + npdivmod = _npdivmod + + +cdef inline _floor_int64(values, unit): + return values - np.remainder(values, unit) + +cdef inline _ceil_int64(values, unit): + return values + np.remainder(-values, unit) + +cdef inline _rounddown_int64(values, unit): + return _ceil_int64(values - unit//2, unit) + +cdef inline _roundup_int64(values, unit): + return _floor_int64(values + unit//2, unit) + + +def round_nsint64(values, mode, freq): + """ + Applies rounding mode at given frequency Parameters ---------- values : :obj:`ndarray` - rounder : function, eg. 
'ceil', 'floor', 'round' + mode : instance of `RoundTo` enumeration freq : str, obj Returns ------- :obj:`ndarray` """ + + if not isinstance(mode, RoundTo): + raise ValueError('mode should be a RoundTo member') + unit = to_offset(freq).nanos - # GH21262 If the Timestamp is multiple of the freq str - # don't apply any rounding - mask = values % unit == 0 - if mask.all(): - return values - r = values.copy() - - if unit < 1000: - # for nano rounding, work with the last 6 digits separately - # due to float precision - buff = 1000000 - r[~mask] = (buff * (values[~mask] // buff) + - unit * (rounder((values[~mask] % buff) * - (1 / float(unit)))).astype('i8')) - else: - if unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - # GH19206 - # to deal with round-off when unit is large - if unit >= 1e9: - divisor = 10 ** int(np.log10(unit / 1e7)) - else: - divisor = 10 - r[~mask] = (unit * rounder((values[~mask] * - (divisor / float(unit))) / divisor) - .astype('i8')) - return r + if mode is RoundTo.MINUS_INFTY: + return _floor_int64(values, unit) + elif mode is RoundTo.PLUS_INFTY: + return _ceil_int64(values, unit) + elif mode is RoundTo.NEAREST_HALF_MINUS_INFTY: + return _rounddown_int64(values, unit) + elif mode is RoundTo.NEAREST_HALF_PLUS_INFTY: + return _roundup_int64(values, unit) + elif mode is RoundTo.NEAREST_HALF_EVEN: + # for odd unit there is no need of a tie break + if unit % 2: + return _rounddown_int64(values, unit) + quotient, remainder = npdivmod(values, unit) + mask = np.logical_or( + remainder > (unit // 2), + np.logical_and(remainder == (unit // 2), quotient % 2) + ) + quotient[mask] += 1 + return quotient * unit + + # if/elif above should catch all rounding modes defined in enum 'RoundTo': + # if flow of control arrives here, it is a bug + assert False, "round_nsint64 called with an unrecognized rounding mode" # This is PITA. 
Because we inherit from datetime, which has very specific @@ -656,7 +721,7 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) - def _round(self, freq, rounder, ambiguous='raise'): + def _round(self, freq, mode, ambiguous='raise'): if self.tz is not None: value = self.tz_localize(None).value else: @@ -665,7 +730,7 @@ class Timestamp(_Timestamp): value = np.array([value], dtype=np.int64) # Will only ever contain 1 element for timestamp - r = round_ns(value, rounder, freq)[0] + r = round_nsint64(value, mode, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz, ambiguous=ambiguous) @@ -694,7 +759,7 @@ class Timestamp(_Timestamp): ------ ValueError if the freq cannot be converted """ - return self._round(freq, np.round, ambiguous) + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous) def floor(self, freq, ambiguous='raise'): """ @@ -715,7 +780,7 @@ class Timestamp(_Timestamp): ------ ValueError if the freq cannot be converted """ - return self._round(freq, np.floor, ambiguous) + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous) def ceil(self, freq, ambiguous='raise'): """ @@ -736,7 +801,7 @@ class Timestamp(_Timestamp): ------ ValueError if the freq cannot be converted """ - return self._round(freq, np.ceil, ambiguous) + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous) @property def tz(self): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 578167a7db500..f7f4f187f6202 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -11,7 +11,7 @@ import numpy as np from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs.timestamps import round_ns +from pandas._libs.tslibs.timestamps import round_nsint64, RoundTo from pandas.core.dtypes.common import ( ensure_int64, @@ -180,10 +180,10 @@ class TimelikeOps(object): """ ) - def _round(self, freq, rounder, ambiguous): + 
def _round(self, freq, mode, ambiguous): # round the local times values = _ensure_datetimelike_to_i8(self) - result = round_ns(values, rounder, freq) + result = round_nsint64(values, mode, freq) result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() @@ -197,15 +197,15 @@ def _round(self, freq, rounder, ambiguous): @Appender((_round_doc + _round_example).format(op="round")) def round(self, freq, ambiguous='raise'): - return self._round(freq, np.round, ambiguous) + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous) @Appender((_round_doc + _floor_example).format(op="floor")) def floor(self, freq, ambiguous='raise'): - return self._round(freq, np.floor, ambiguous) + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous) @Appender((_round_doc + _ceil_example).format(op="ceil")) def ceil(self, freq, ambiguous='raise'): - return self._round(freq, np.ceil, ambiguous) + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous) class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 6f6f4eb8d24e3..d054121c6dfab 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import date_range, Timestamp, DatetimeIndex +from pandas.tseries.frequencies import to_offset class TestDatetimeIndexOps(object): @@ -124,7 +125,7 @@ def test_round(self, tz_naive_fixture): expected = DatetimeIndex(['2016-10-17 12:00:00.001501030']) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(False): ts = '2016-10-17 12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') @@ -169,6 +170,46 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = DatetimeIndex(list(expected)) assert expected.equals(result) + 
@pytest.mark.parametrize('start, index_freq, periods', [ + ('2018-01-01', '12H', 25), + ('2018-01-01 0:0:0.124999', '1ns', 1000), + ]) + @pytest.mark.parametrize('round_freq', [ + '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', + '250ns', '500ns', '750ns', + '1us', '19us', '250us', '500us', '750us', + '1s', '2s', '3s', + '12H', '1D', + ]) + def test_round_int64(self, start, index_freq, periods, round_freq): + dt = DatetimeIndex(start=start, freq=index_freq, periods=periods) + unit = to_offset(round_freq).nanos + + # test floor + result = dt.floor(round_freq) + diff = dt.asi8 - result.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), "floor not a {} multiple".format(round_freq) + assert (0 <= diff).all() and (diff < unit).all(), "floor error" + + # test ceil + result = dt.ceil(round_freq) + diff = result.asi8 - dt.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), "ceil not a {} multiple".format(round_freq) + assert (0 <= diff).all() and (diff < unit).all(), "ceil error" + + # test round + result = dt.round(round_freq) + diff = abs(result.asi8 - dt.asi8) + mod = result.asi8 % unit + assert (mod == 0).all(), "round not a {} multiple".format(round_freq) + assert (diff <= unit // 2).all(), "round error" + if unit % 2 == 0: + assert ( + result.asi8[diff == unit // 2] // unit % 2 == 0 + ).all(), "round half to even error" + # ---------------------------------------------------------------- # DatetimeIndex.normalize diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index f83aa31edf95a..b6c783dc07aec 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -13,6 +13,7 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas import Timestamp, NaT +from pandas.tseries.frequencies import to_offset class TestTimestampUnaryOps(object): @@ -70,7 +71,7 @@ def test_round_subsecond(self): 
assert result == expected def test_round_nonstandard_freq(self): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(False): Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') def test_round_invalid_arg(self): @@ -154,6 +155,46 @@ def test_round_dst_border(self, method): with pytest.raises(pytz.AmbiguousTimeError): getattr(ts, method)('H', ambiguous='raise') + @pytest.mark.parametrize('timestamp', [ + '2018-01-01 0:0:0.124999360', + '2018-01-01 0:0:0.125000367', + '2018-01-01 0:0:0.125500', + '2018-01-01 0:0:0.126500', + '2018-01-01 12:00:00', + '2019-01-01 12:00:00', + ]) + @pytest.mark.parametrize('freq', [ + '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', + '250ns', '500ns', '750ns', + '1us', '19us', '250us', '500us', '750us', + '1s', '2s', '3s', + '1D', + ]) + def test_round_int64(self, timestamp, freq): + """check that all rounding modes are accurate to int64 precision + see GH#22591 + """ + dt = Timestamp(timestamp) + unit = to_offset(freq).nanos + + # test floor + result = dt.floor(freq) + assert result.value % unit == 0, "floor not a {} multiple".format(freq) + assert 0 <= dt.value - result.value < unit, "floor error" + + # test ceil + result = dt.ceil(freq) + assert result.value % unit == 0, "ceil not a {} multiple".format(freq) + assert 0 <= result.value - dt.value < unit, "ceil error" + + # test round + result = dt.round(freq) + assert result.value % unit == 0, "round not a {} multiple".format(freq) + assert abs(result.value - dt.value) <= unit // 2, "round error" + if unit % 2 == 0 and abs(result.value - dt.value) == unit // 2: + # round half to even + assert result.value // unit % 2 == 0, "round half to even error" + # -------------------------------------------------------------- # Timestamp.replace