From 040f06f731a09fc6e0663cada6697f6602b36f1d Mon Sep 17 00:00:00 2001 From: ArtinSarraf Date: Fri, 14 Dec 2018 09:57:28 -0500 Subject: [PATCH] BUG/ENH - base argument no longer ignored in period resample (#23941) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/resample.py | 97 +++++++++++++++++++- pandas/tests/resample/test_datetime_index.py | 27 +++++- pandas/tests/resample/test_period_index.py | 53 +++++++++++ 4 files changed, 172 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0d3123bc92f74..610463ec3422b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -353,6 +353,7 @@ Other Enhancements - :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). - :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). +- :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`) - :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6d80d747f21b3..7b842d141e839 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1389,9 +1389,10 @@ def _get_time_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_range_edges(ax.min(), ax.max(), self.freq, - closed=self.closed, - base=self.base) + first, last = _get_timestamp_range_edges(ax.min(), ax.max(), + self.freq, + closed=self.closed, + base=self.base) tz = ax.tz # GH #12037 # use first/last directly instead of call replace() on them @@ -1540,20 +1541,39 @@ def _get_period_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels + freq_mult = self.freq.n + start = ax.min().asfreq(self.freq, how=self.convention) end = ax.max().asfreq(self.freq, how='end') + bin_shift = 0 + + # GH 23882 + if self.base: + # get base adjusted bin edge labels + p_start, end = _get_period_range_edges(start, + end, + self.freq, + closed=self.closed, + base=self.base) + + # Get offset for bin edge (not label edge) adjustment + start_offset = (pd.Period(start, self.freq) + - pd.Period(p_start, self.freq)) + bin_shift = start_offset.n % freq_mult + start = p_start labels = binner = PeriodIndex(start=start, end=end, freq=self.freq, name=ax.name) i8 = memb.asi8 - freq_mult = self.freq.n # when upsampling to subperiods, we need to generate enough bins expected_bins_count = len(binner) * freq_mult i8_extend = expected_bins_count - (i8[-1] - i8[0]) rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult + # adjust bin edge indexes to account for base + rng -= bin_shift bins = memb.searchsorted(rng, side='left') if nat_count > 0: @@ -1582,7 +1602,35 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_range_edges(first, last, offset, closed='left', base=0): +def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): + """ + Adjust the `first` Timestamp to the preceeding Timestamp that resides on + the provided offset. Adjust the `last` Timestamp to the following + Timestamp that resides on the provided offset. Input Timestamps that + already reside on the offset will be adjusted depeding on the type of + offset and the `closed` parameter. + + Parameters + ---------- + first : pd.Timestamp + The beginning Timestamp of the range to be adjusted. + last : pd.Timestamp + The ending Timestamp of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Timestamps will be adjusted. + closed : {'right', 'left'}, default None + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Timestamps. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Timestamp objects. + """ + if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type " + "Timestamp") + if isinstance(offset, Tick): is_day = isinstance(offset, Day) day_nanos = delta_to_nanoseconds(timedelta(1)) @@ -1606,6 +1654,45 @@ def _get_range_edges(first, last, offset, closed='left', base=0): return first, last +def _get_period_range_edges(first, last, offset, closed='left', base=0): + """ + Adjust the provided `first` and `last` Periods to the respective Period of + the given offset that encompasses them. + + Parameters + ---------- + first : pd.Period + The beginning Period of the range to be adjusted. + last : pd.Period + The ending Period of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Periods will be adjusted. + closed : {'right', 'left'}, default None + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Periods. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Period objects. + """ + if not all(isinstance(obj, pd.Period) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type Period") + + # GH 23882 + first = first.to_timestamp() + last = last.to_timestamp() + adjust_first = not offset.onOffset(first) + adjust_last = offset.onOffset(last) + + first, last = _get_timestamp_range_edges(first, last, offset, + closed=closed, base=base) + + first = (first + adjust_first * offset).to_period(offset) + last = (last - adjust_last * offset).to_period(offset) + return first, last + + def _adjust_dates_anchored(first, last, offset, closed='right', base=0): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 71f94f9398758..00c86a919a288 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -15,7 +15,8 @@ from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import timedelta_range -from pandas.core.resample import DatetimeIndex, TimeGrouper +from pandas.core.resample import ( + DatetimeIndex, TimeGrouper, _get_timestamp_range_edges) import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -1481,3 +1482,27 @@ def test_resample_equivalent_offsets(self, n1, freq1, n2, freq2, k): result1 = s.resample(str(n1_) + freq1).mean() result2 = s.resample(str(n2_) + freq2).mean() assert_series_equal(result1, result2) + + @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ + ('19910905', '19920406', 'D', '19910905', '19920407'), + ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'), + ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', + '19920406 07:00'), + ('19910906', '19920406', 'M', '19910831', '19920430'), + ('19910831', '19920430', 'M', '19910831', '19920531'), + ('1991-08', '1992-04', 'M', '19910831', '19920531'), + ]) + def test_get_timestamp_range_edges(self, first, last, offset, + exp_first, exp_last): + first = pd.Period(first) + first = first.to_timestamp(first.freq) + last = pd.Period(last) + last = last.to_timestamp(last.freq) + + exp_first = pd.Timestamp(exp_first, freq=offset) + exp_last = pd.Timestamp(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_timestamp_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 7cb3185ccbbaf..3e3a89de5086c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -13,6 +13,7 @@ from pandas import DataFrame, Series, Timestamp from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range +from pandas.core.resample import _get_period_range_edges import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -701,3 +702,55 @@ def test_resample_with_only_nat(self): expected = DataFrame([], index=expected_index) result = frame.resample('1s').mean() assert_frame_equal(result, expected) + + @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [ + ('19910905', '19910909 03:00', 'H', '24H', 10), + ('19910905', '19910909 12:00', 'H', '24H', 10), + ('19910905', '19910909 23:00', 'H', '24H', 10), + ('19910905 10:00', '19910909', 'H', '24H', 10), + ('19910905 10:00', '19910909 10:00', 'H', '24H', 10), + ('19910905', '19910909 10:00', 'H', '24H', 10), + ('19910905 12:00', '19910909', 'H', '24H', 10), + ('19910905 12:00', '19910909 03:00', 'H', '24H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '24H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '24H', 34), + ('19910905 12:00', '19910909 12:00', 'H', '17H', 10), + ('19910905 12:00', '19910909 12:00', 'H', '17H', 3), + ('19910905 12:00', '19910909 1:00', 'H', 'M', 3), + ('19910905', '19910913 06:00', '2H', '24H', 10), + ('19910905', '19910905 01:39', 'Min', '5Min', 3), + ('19910905', '19910905 03:18', '2Min', '5Min', 3), + ]) + def test_resample_with_non_zero_base(self, start, end, start_freq, + end_freq, base): + # GH 23882 + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) + result = s.resample(end_freq, base=base).mean() + result = result.to_timestamp(end_freq) + # to_timestamp casts 24H -> D + result = result.asfreq(end_freq) if end_freq == '24H' else result + expected = s.to_timestamp().resample(end_freq, base=base).mean() + assert_series_equal(result, expected) + + @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ + ('19910905', '19920406', 'D', '19910905', '19920406'), + ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'), + ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', + '19920406 06:00'), + ('19910906', '19920406', 'M', '1991-09', '1992-04'), + ('19910831', '19920430', 'M', '1991-08', '1992-04'), + ('1991-08', '1992-04', 'M', '1991-08', '1992-04'), + ]) + def test_get_period_range_edges(self, first, last, offset, + exp_first, exp_last): + first = pd.Period(first) + last = pd.Period(last) + + exp_first = pd.Period(exp_first, freq=offset) + exp_last = pd.Period(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_period_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected