Skip to content

Commit

Permalink
BUG/ENH - base argument no longer ignored in period resample (#23941)
Browse files Browse the repository at this point in the history
  • Loading branch information
ms7463 authored and jreback committed Dec 14, 2018
1 parent a1cf3f6 commit 040f06f
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ Other Enhancements
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
- :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`)
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
Expand Down
97 changes: 92 additions & 5 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1389,9 +1389,10 @@ def _get_time_bins(self, ax):
data=[], freq=self.freq, name=ax.name)
return binner, [], labels

first, last = _get_range_edges(ax.min(), ax.max(), self.freq,
closed=self.closed,
base=self.base)
first, last = _get_timestamp_range_edges(ax.min(), ax.max(),
self.freq,
closed=self.closed,
base=self.base)
tz = ax.tz
# GH #12037
# use first/last directly instead of call replace() on them
Expand Down Expand Up @@ -1540,20 +1541,39 @@ def _get_period_bins(self, ax):
data=[], freq=self.freq, name=ax.name)
return binner, [], labels

freq_mult = self.freq.n

start = ax.min().asfreq(self.freq, how=self.convention)
end = ax.max().asfreq(self.freq, how='end')
bin_shift = 0

# GH 23882
if self.base:
# get base adjusted bin edge labels
p_start, end = _get_period_range_edges(start,
end,
self.freq,
closed=self.closed,
base=self.base)

# Get offset for bin edge (not label edge) adjustment
start_offset = (pd.Period(start, self.freq)
- pd.Period(p_start, self.freq))
bin_shift = start_offset.n % freq_mult
start = p_start

labels = binner = PeriodIndex(start=start, end=end,
freq=self.freq, name=ax.name)

i8 = memb.asi8
freq_mult = self.freq.n

# when upsampling to subperiods, we need to generate enough bins
expected_bins_count = len(binner) * freq_mult
i8_extend = expected_bins_count - (i8[-1] - i8[0])
rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
rng += freq_mult
# adjust bin edge indexes to account for base
rng -= bin_shift
bins = memb.searchsorted(rng, side='left')

if nat_count > 0:
Expand Down Expand Up @@ -1582,7 +1602,35 @@ def _take_new_index(obj, indexer, new_index, axis=0):
raise ValueError("'obj' should be either a Series or a DataFrame")


def _get_range_edges(first, last, offset, closed='left', base=0):
def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
"""
Adjust the `first` Timestamp to the preceding Timestamp that resides on
the provided offset. Adjust the `last` Timestamp to the following
Timestamp that resides on the provided offset. Input Timestamps that
already reside on the offset will be adjusted depending on the type of
offset and the `closed` parameter.
Parameters
----------
first : pd.Timestamp
The beginning Timestamp of the range to be adjusted.
last : pd.Timestamp
The ending Timestamp of the range to be adjusted.
offset : pd.DateOffset
The dateoffset to which the Timestamps will be adjusted.
closed : {'right', 'left'}, default 'left'
Which side of bin interval is closed.
base : int, default 0
The "origin" of the adjusted Timestamps.
Returns
-------
A tuple of length 2, containing the adjusted pd.Timestamp objects.
"""
if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]):
raise TypeError("'first' and 'last' must be instances of type "
"Timestamp")

if isinstance(offset, Tick):
is_day = isinstance(offset, Day)
day_nanos = delta_to_nanoseconds(timedelta(1))
Expand All @@ -1606,6 +1654,45 @@ def _get_range_edges(first, last, offset, closed='left', base=0):
return first, last


def _get_period_range_edges(first, last, offset, closed='left', base=0):
    """
    Snap a pair of Periods onto the bins described by `offset`.

    Both Periods are converted to Timestamps, passed through
    `_get_timestamp_range_edges`, optionally shifted by one `offset`, and
    converted back to Periods with `offset` as their frequency.

    Parameters
    ----------
    first : pd.Period
        The beginning Period of the range to be adjusted.
    last : pd.Period
        The ending Period of the range to be adjusted.
    offset : pd.DateOffset
        The dateoffset to which the Periods will be adjusted.
    closed : {'right', 'left'}, default 'left'
        Which side of bin interval is closed; forwarded unchanged to
        `_get_timestamp_range_edges`.
    base : int, default 0
        The "origin" of the adjusted Periods; forwarded unchanged to
        `_get_timestamp_range_edges`.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Period objects.

    Raises
    ------
    TypeError
        If either `first` or `last` is not a pd.Period.
    """
    if not (isinstance(first, pd.Period) and isinstance(last, pd.Period)):
        raise TypeError("'first' and 'last' must be instances of type Period")

    # GH 23882
    first_ts = first.to_timestamp()
    last_ts = last.to_timestamp()

    # The on-offset status of the *unadjusted* timestamps decides whether
    # the timestamp edges computed below need a one-offset correction.
    shift_first = not offset.onOffset(first_ts)
    shift_last = offset.onOffset(last_ts)

    first_edge, last_edge = _get_timestamp_range_edges(
        first_ts, last_ts, offset, closed=closed, base=base)

    # The boolean flags are multiplied into the offset (True -> one step)
    # rather than branching, so the arithmetic is always performed.
    first_edge = first_edge + shift_first * offset
    last_edge = last_edge - shift_last * offset
    return first_edge.to_period(offset), last_edge.to_period(offset)


def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
# First and last offsets should be calculated from the start day to fix an
# error caused by resampling across multiple days when a one day period is
Expand Down
27 changes: 26 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from pandas.core.indexes.datetimes import date_range
from pandas.core.indexes.period import Period, period_range
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.resample import DatetimeIndex, TimeGrouper
from pandas.core.resample import (
DatetimeIndex, TimeGrouper, _get_timestamp_range_edges)
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
Expand Down Expand Up @@ -1481,3 +1482,27 @@ def test_resample_equivalent_offsets(self, n1, freq1, n2, freq2, k):
result1 = s.resample(str(n1_) + freq1).mean()
result2 = s.resample(str(n2_) + freq2).mean()
assert_series_equal(result1, result2)

@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [
    ('19910905', '19920406', 'D', '19910905', '19920407'),
    ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'),
    ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
     '19920406 07:00'),
    ('19910906', '19920406', 'M', '19910831', '19920430'),
    ('19910831', '19920430', 'M', '19910831', '19920531'),
    ('1991-08', '1992-04', 'M', '19910831', '19920531'),
])
def test_get_timestamp_range_edges(self, first, last, offset,
                                   exp_first, exp_last):
    """
    Check the (first, last) tuple returned by `_get_timestamp_range_edges`.

    The `first`/`last` strings are parsed as Periods and converted to
    Timestamps using each Period's own frequency; the expected edges are
    Timestamps built with `freq=offset`.
    """
    start_period = pd.Period(first)
    end_period = pd.Period(last)
    start_ts = start_period.to_timestamp(start_period.freq)
    end_ts = end_period.to_timestamp(end_period.freq)

    # Build the expected Timestamps while `offset` is still the raw string.
    expected = (pd.Timestamp(exp_first, freq=offset),
                pd.Timestamp(exp_last, freq=offset))

    date_offset = pd.tseries.frequencies.to_offset(offset)
    result = _get_timestamp_range_edges(start_ts, end_ts, date_offset)
    assert result == expected
53 changes: 53 additions & 0 deletions pandas/tests/resample/test_period_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas import DataFrame, Series, Timestamp
from pandas.core.indexes.datetimes import date_range
from pandas.core.indexes.period import Period, PeriodIndex, period_range
from pandas.core.resample import _get_period_range_edges
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
Expand Down Expand Up @@ -701,3 +702,55 @@ def test_resample_with_only_nat(self):
expected = DataFrame([], index=expected_index)
result = frame.resample('1s').mean()
assert_frame_equal(result, expected)

@pytest.mark.parametrize('start,end,start_freq,end_freq,base', [
    ('19910905', '19910909 03:00', 'H', '24H', 10),
    ('19910905', '19910909 12:00', 'H', '24H', 10),
    ('19910905', '19910909 23:00', 'H', '24H', 10),
    ('19910905 10:00', '19910909', 'H', '24H', 10),
    ('19910905 10:00', '19910909 10:00', 'H', '24H', 10),
    ('19910905', '19910909 10:00', 'H', '24H', 10),
    ('19910905 12:00', '19910909', 'H', '24H', 10),
    ('19910905 12:00', '19910909 03:00', 'H', '24H', 10),
    ('19910905 12:00', '19910909 12:00', 'H', '24H', 10),
    ('19910905 12:00', '19910909 12:00', 'H', '24H', 34),
    ('19910905 12:00', '19910909 12:00', 'H', '17H', 10),
    ('19910905 12:00', '19910909 12:00', 'H', '17H', 3),
    ('19910905 12:00', '19910909 1:00', 'H', 'M', 3),
    ('19910905', '19910913 06:00', '2H', '24H', 10),
    ('19910905', '19910905 01:39', 'Min', '5Min', 3),
    ('19910905', '19910905 03:18', '2Min', '5Min', 3),
])
def test_resample_with_non_zero_base(self, start, end, start_freq,
                                     end_freq, base):
    """
    Compare a period-indexed resample with `base` against the same
    resample performed on the timestamp-indexed version of the series.
    """
    # GH 23882
    period_index = pd.period_range(start, end, freq=start_freq)
    s = pd.Series(0, index=period_index)
    s = s + np.arange(len(s))

    result = s.resample(end_freq, base=base).mean()
    result = result.to_timestamp(end_freq)
    if end_freq == '24H':
        # to_timestamp casts 24H -> D
        result = result.asfreq(end_freq)

    expected = s.to_timestamp().resample(end_freq, base=base).mean()
    assert_series_equal(result, expected)

@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [
    ('19910905', '19920406', 'D', '19910905', '19920406'),
    ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'),
    ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
     '19920406 06:00'),
    ('19910906', '19920406', 'M', '1991-09', '1992-04'),
    ('19910831', '19920430', 'M', '1991-08', '1992-04'),
    ('1991-08', '1992-04', 'M', '1991-08', '1992-04'),
])
def test_get_period_range_edges(self, first, last, offset,
                                exp_first, exp_last):
    """
    Check the (first, last) tuple returned by `_get_period_range_edges`.

    The `first`/`last` strings are parsed as Periods; the expected edges
    are Periods built with `freq=offset`.
    """
    start_period = pd.Period(first)
    end_period = pd.Period(last)

    # Build the expected Periods while `offset` is still the raw string.
    expected = (pd.Period(exp_first, freq=offset),
                pd.Period(exp_last, freq=offset))

    date_offset = pd.tseries.frequencies.to_offset(offset)
    result = _get_period_range_edges(start_period, end_period, date_offset)
    assert result == expected

0 comments on commit 040f06f

Please sign in to comment.