-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[BUG] Fix interpolation for datetimelike dtypes #21915
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
|
||
from pandas.core.base import PandasObject | ||
|
||
import pandas.core.dtypes.common as ct | ||
from pandas.core.dtypes.dtypes import ( | ||
ExtensionDtype, DatetimeTZDtype, | ||
PandasExtensionDtype, | ||
|
@@ -1158,20 +1159,19 @@ def check_int_bool(self, inplace): | |
try: | ||
m = missing.clean_interp_method(method, **kwargs) | ||
except: | ||
m = None | ||
raise ValueError("invalid method '{0}' to interpolate." | ||
.format(method)) | ||
|
||
if m is not None: | ||
r = check_int_bool(self, inplace) | ||
if r is not None: | ||
return r | ||
return self._interpolate(method=m, index=index, values=values, | ||
axis=axis, limit=limit, | ||
limit_direction=limit_direction, | ||
limit_area=limit_area, | ||
fill_value=fill_value, inplace=inplace, | ||
downcast=downcast, mgr=mgr, **kwargs) | ||
r = check_int_bool(self, inplace) | ||
if r is not None: | ||
return r | ||
return self._interpolate(method=m, index=index, values=values, | ||
axis=axis, limit=limit, | ||
limit_direction=limit_direction, | ||
limit_area=limit_area, | ||
fill_value=fill_value, inplace=inplace, | ||
downcast=downcast, mgr=mgr, **kwargs) | ||
|
||
raise ValueError("invalid method '{0}' to interpolate.".format(method)) | ||
|
||
def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, | ||
limit=None, fill_value=None, coerce=False, | ||
|
@@ -1199,20 +1199,35 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, | |
blocks = [self.make_block_same_class(values, ndim=self.ndim)] | ||
return self._maybe_downcast(blocks, downcast) | ||
|
||
# TODO: ignoring `values`? | ||
def _interpolate(self, method=None, index=None, values=None, | ||
fill_value=None, axis=0, limit=None, | ||
limit_direction='forward', limit_area=None, | ||
inplace=False, downcast=None, mgr=None, **kwargs): | ||
""" interpolate using scipy wrappers """ | ||
|
||
inplace = validate_bool_kwarg(inplace, 'inplace') | ||
data = self.values if inplace else self.values.copy() | ||
|
||
# only deal with floats | ||
if not self.is_float: | ||
if ct.needs_i8_conversion(self.dtype): | ||
if ct.is_period_dtype(self.dtype): | ||
raise NotImplementedError("PeriodDtype columns/Series don't " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let’s move code to subclasses as appropriate |
||
"exist yet, but will soon. " | ||
"When they do, test them!") | ||
mask = isna(self.values) | ||
values = self.values | ||
|
||
# DatetimeTZBlock.values is DatetimeIndex, need to cast/shape | ||
values = getattr(values, 'values', values).reshape(self.shape) | ||
data = values.astype(np.float64) | ||
data[mask.reshape(self.shape)] = np.nan | ||
elif not self.is_float: | ||
if not self.is_integer: | ||
return self | ||
data = data.astype(np.float64) | ||
data = self.values.astype(np.float64) | ||
else: | ||
# avoid making a copy if possible | ||
data = self.values if inplace else self.values.copy() | ||
|
||
if fill_value is None: | ||
fill_value = self.fill_value | ||
|
@@ -1224,7 +1239,6 @@ def _interpolate(self, method=None, index=None, values=None, | |
# process 1-d slices in the axis direction | ||
|
||
def func(x): | ||
|
||
# process a 1-d slice, returning it | ||
# should the axis argument be handled below in apply_along_axis? | ||
# i.e. not an arg to missing.interpolate_1d | ||
|
@@ -1236,6 +1250,20 @@ def func(x): | |
|
||
# interp each column independently | ||
interp_values = np.apply_along_axis(func, axis, data) | ||
if ct.needs_i8_conversion(self.dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
# convert remaining NaNs back to NaT and cast back to own dtype | ||
mask = isna(interp_values) | ||
interp_values[mask] = fill_value # TODO: or self.fill_value? | ||
|
||
# Note: we need to get to a numpy dtype (M8[ns] or m8[ns]) and | ||
# not a pandas tz-aware dtype (for now) | ||
dtype = self.dtype.base | ||
assert isinstance(dtype, np.dtype) | ||
interp_values = interp_values.astype(dtype) | ||
if is_datetimetz(self): | ||
# squeeze() since we expanded dimension above | ||
held = self._holder(interp_values.squeeze(), tz='UTC') | ||
interp_values = held.tz_convert(self.dtype.tz) | ||
|
||
blocks = [self.make_block_same_class(interp_values)] | ||
return self._maybe_downcast(blocks, downcast) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -360,7 +360,10 @@ def test_fillna_categorical_nan(self): | |
cat = Categorical([np.nan, 2, np.nan]) | ||
val = Categorical([np.nan, np.nan, np.nan]) | ||
df = DataFrame({"cats": cat, "vals": val}) | ||
res = df.fillna(df.median()) | ||
with tm.assert_produces_warning(RuntimeWarning): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated, but nice to catch. |
||
# RuntimeWarning: All-NaN slice encountered | ||
res = df.fillna(df.median()) | ||
|
||
v_exp = [np.nan, np.nan, np.nan] | ||
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, | ||
dtype='category') | ||
|
@@ -855,3 +858,71 @@ def test_interp_ignore_all_good(self): | |
# all good | ||
result = df[['B', 'D']].interpolate(downcast=None) | ||
assert_frame_equal(result, df[['B', 'D']]) | ||
|
||
@pytest.mark.parametrize('use_idx', [True, False]) | ||
@pytest.mark.parametrize('tz', [None, 'US/Central']) | ||
def test_interpolate_dt64_values(self, tz, use_idx): | ||
# GH#21915 | ||
dti = pd.date_range('2016-01-01', periods=10, tz=tz) | ||
index = dti if use_idx else None | ||
|
||
# Copy to avoid corrupting dti, see GH#21907 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can u make this more informative ; this is very crytptix |
||
ser = pd.Series(dti, index=index).copy() | ||
ser[::3] = pd.NaT | ||
|
||
expected = pd.Series(dti, index=index) | ||
expected.iloc[0] = pd.NaT | ||
expected.iloc[-1] = expected.iloc[-2] | ||
|
||
df = ser.to_frame() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you converting to frames? can u just start with frames? |
||
expected = expected.to_frame() | ||
|
||
result = df.interpolate(method='linear') | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('use_idx', [True, False]) | ||
def test_interpolate_td64_values(self, use_idx): | ||
# GH#21915 | ||
tdi = pd.timedelta_range('1D', periods=10) | ||
index = tdi if use_idx else None | ||
|
||
ser = pd.Series(tdi, index=index) | ||
ser[::3] = pd.NaT | ||
|
||
expected = pd.Series(tdi, index=index) | ||
expected.iloc[0] = pd.NaT | ||
expected.iloc[-1] = expected.iloc[-2] | ||
|
||
df = ser.to_frame() | ||
expected = expected.to_frame() | ||
|
||
result = df.interpolate(method='linear') | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('use_idx', [True, False]) | ||
def test_interpolate_datetimelike_and_object(self, use_idx): | ||
# GH#21915 | ||
# Check that dt64/td64 with more than one column doesn't get | ||
# screwed up by .transpose() with an object column present. | ||
dti_tz = pd.date_range('2016-01-01', periods=10, tz='US/Central') | ||
dti_naive = pd.date_range('2016-01-01', periods=10, tz=None) | ||
tdi = pd.timedelta_range('1D', periods=10) | ||
objcol = list('ABCDEFGHIJ') | ||
|
||
index = tdi if use_idx else None | ||
|
||
df = pd.DataFrame({'aware': dti_tz, | ||
'naive': dti_naive, | ||
'tdi': tdi, | ||
'obj': objcol}, | ||
columns=['naive', 'aware', 'tdi', 'obj'], | ||
index=index) | ||
|
||
expected = df.copy() | ||
expected.iloc[0, :-1] = pd.NaT | ||
expected.iloc[-1, :-1] = df.iloc[-2, :-1] | ||
|
||
df.iloc[::3, :-1] = pd.NaT | ||
|
||
result = df.interpolate(method='linear') | ||
tm.assert_frame_equal(result, expected) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1317,3 +1317,38 @@ def test_series_interpolate_intraday(self): | |
result = ts.reindex(new_index).interpolate(method='time') | ||
|
||
tm.assert_numpy_array_equal(result.values, exp.values) | ||
|
||
# TODO: De-duplicate with similar tests in test.frame.test_missing? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is much more complicated we have generic tests that already do a lot of this you can also move these tests there There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll take a look for a better place. This is also the answer to the previous question about why not starting with DataFrame: bc the test originally tested Series and DataFrame in the same test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Both for this PR and the upcoming slew of Datetime/Timedelta/PeriodArray tests, there are a bunch of for FooArray/FooIndex/Series[foo]/DataFrame[foo-column] that are highly duplicative and I think things are falling through the cracks. Think we can find a place to put them so they can be parametrized? |
||
@pytest.mark.parametrize('use_idx', [True, False]) | ||
@pytest.mark.parametrize('tz', [None, 'US/Central']) | ||
def test_interpolate_dt64_values(self, tz, use_idx): | ||
# GH#21915 | ||
dti = pd.date_range('2016-01-01', periods=10, tz=tz) | ||
index = dti if use_idx else None | ||
|
||
# Copy to avoid corrupting dti, see GH#21907 | ||
ser = pd.Series(dti, index=index).copy() | ||
ser[::3] = pd.NaT | ||
|
||
expected = pd.Series(dti, index=index) | ||
expected.iloc[0] = pd.NaT | ||
expected.iloc[-1] = expected.iloc[-2] | ||
|
||
result = ser.interpolate(method='linear') | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('use_idx', [True, False]) | ||
def test_interpolate_td64_values(self, use_idx): | ||
# GH#21915 | ||
tdi = pd.timedelta_range('1D', periods=10) | ||
index = tdi if use_idx else None | ||
|
||
ser = pd.Series(tdi, index=index) | ||
ser[::3] = pd.NaT | ||
|
||
expected = pd.Series(tdi, index=index) | ||
expected.iloc[0] = pd.NaT | ||
expected.iloc[-1] = expected.iloc[-2] | ||
|
||
result = ser.interpolate(method='linear') | ||
tm.assert_series_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i prefer to import these rather than use ct
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Darn, I was hoping that would catch on (I don't like the giant namespaces). Will change.