Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] Fix interpolation for datetimelike dtypes #21915

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ Datetimelike

- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`)
- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`,:issue:`21365`)
- Fixed bug in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` where null values were not filled for dtypes of ``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64[ns]`` (:issue:`21915`)

Timedelta
^^^^^^^^^
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6097,8 +6097,11 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
raise ValueError("Only `method=linear` interpolation is supported "
"on MultiIndexes.")

if _maybe_transposed_self._data.get_dtype_counts().get(
'object') == len(_maybe_transposed_self.T):
dtype_counts = _maybe_transposed_self._data.get_dtype_counts()
if ('object' in dtype_counts and
dtype_counts.get('object') == len(_maybe_transposed_self.T)):
# Try to short-circuit tranposing to avoid superfluous dimension
# errors GH#13287, GH#17539, GH#19197
raise TypeError("Cannot interpolate with all NaNs.")

# create/use the index
Expand Down
60 changes: 44 additions & 16 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from pandas.core.base import PandasObject

import pandas.core.dtypes.common as ct
from pandas.core.dtypes.dtypes import (
ExtensionDtype, DatetimeTZDtype,
PandasExtensionDtype,
Expand Down Expand Up @@ -1158,20 +1159,19 @@ def check_int_bool(self, inplace):
try:
m = missing.clean_interp_method(method, **kwargs)
except:
m = None
raise ValueError("invalid method '{0}' to interpolate."
.format(method))

if m is not None:
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate(method=m, index=index, values=values,
axis=axis, limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value, inplace=inplace,
downcast=downcast, mgr=mgr, **kwargs)
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate(method=m, index=index, values=values,
axis=axis, limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value, inplace=inplace,
downcast=downcast, mgr=mgr, **kwargs)

raise ValueError("invalid method '{0}' to interpolate.".format(method))

def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
limit=None, fill_value=None, coerce=False,
Expand Down Expand Up @@ -1199,20 +1199,35 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
blocks = [self.make_block_same_class(values, ndim=self.ndim)]
return self._maybe_downcast(blocks, downcast)

# TODO: ignoring `values`?
def _interpolate(self, method=None, index=None, values=None,
fill_value=None, axis=0, limit=None,
limit_direction='forward', limit_area=None,
inplace=False, downcast=None, mgr=None, **kwargs):
""" interpolate using scipy wrappers """

inplace = validate_bool_kwarg(inplace, 'inplace')
data = self.values if inplace else self.values.copy()

# only deal with floats
if not self.is_float:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i prefer to import these rather than use ct

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Darn, I was hoping that would catch on (I don't like the giant namespaces). Will change.

if ct.needs_i8_conversion(self.dtype):
if ct.is_period_dtype(self.dtype):
raise NotImplementedError("PeriodDtype columns/Series don't "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let’s move code to subclasses as appropriate
this gets messy really fast otherwise

"exist yet, but will soon. "
"When they do, test them!")
mask = isna(self.values)
values = self.values

# DatetimeTZBlock.values is DatetimeIndex, need to cast/shape
values = getattr(values, 'values', values).reshape(self.shape)
data = values.astype(np.float64)
data[mask.reshape(self.shape)] = np.nan
elif not self.is_float:
if not self.is_integer:
return self
data = data.astype(np.float64)
data = self.values.astype(np.float64)
else:
# avoid making a copy if possible
data = self.values if inplace else self.values.copy()

if fill_value is None:
fill_value = self.fill_value
Expand All @@ -1224,7 +1239,6 @@ def _interpolate(self, method=None, index=None, values=None,
# process 1-d slices in the axis direction

def func(x):

# process a 1-d slice, returning it
# should the axis argument be handled below in apply_along_axis?
# i.e. not an arg to missing.interpolate_1d
Expand All @@ -1236,6 +1250,20 @@ def func(x):

# interp each column independently
interp_values = np.apply_along_axis(func, axis, data)
if ct.needs_i8_conversion(self.dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

# convert remaining NaNs back to NaT and cast back to own dtype
mask = isna(interp_values)
interp_values[mask] = fill_value # TODO: or self.fill_value?

# Note: we need to get to a numpy dtype (M8[ns] or m8[ns]) and
# not a pandas tz-aware dtype (for now)
dtype = self.dtype.base
assert isinstance(dtype, np.dtype)
interp_values = interp_values.astype(dtype)
if is_datetimetz(self):
# squeeze() since we expanded dimension above
held = self._holder(interp_values.squeeze(), tz='UTC')
interp_values = held.tz_convert(self.dtype.tz)

blocks = [self.make_block_same_class(interp_values)]
return self._maybe_downcast(blocks, downcast)
Expand Down
73 changes: 72 additions & 1 deletion pandas/tests/frame/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,10 @@ def test_fillna_categorical_nan(self):
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
with tm.assert_produces_warning(RuntimeWarning):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated, but nice to catch.

# RuntimeWarning: All-NaN slice encountered
res = df.fillna(df.median())

v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
dtype='category')
Expand Down Expand Up @@ -855,3 +858,71 @@ def test_interp_ignore_all_good(self):
# all good
result = df[['B', 'D']].interpolate(downcast=None)
assert_frame_equal(result, df[['B', 'D']])

@pytest.mark.parametrize('use_idx', [True, False])
@pytest.mark.parametrize('tz', [None, 'US/Central'])
def test_interpolate_dt64_values(self, tz, use_idx):
# GH#21915
dti = pd.date_range('2016-01-01', periods=10, tz=tz)
index = dti if use_idx else None

# Copy to avoid corrupting dti, see GH#21907
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can u make this more informative ; this is very crytptix

ser = pd.Series(dti, index=index).copy()
ser[::3] = pd.NaT

expected = pd.Series(dti, index=index)
expected.iloc[0] = pd.NaT
expected.iloc[-1] = expected.iloc[-2]

df = ser.to_frame()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are you converting to frames? can u just start with frames?

expected = expected.to_frame()

result = df.interpolate(method='linear')
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('use_idx', [True, False])
def test_interpolate_td64_values(self, use_idx):
# GH#21915
tdi = pd.timedelta_range('1D', periods=10)
index = tdi if use_idx else None

ser = pd.Series(tdi, index=index)
ser[::3] = pd.NaT

expected = pd.Series(tdi, index=index)
expected.iloc[0] = pd.NaT
expected.iloc[-1] = expected.iloc[-2]

df = ser.to_frame()
expected = expected.to_frame()

result = df.interpolate(method='linear')
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('use_idx', [True, False])
def test_interpolate_datetimelike_and_object(self, use_idx):
# GH#21915
# Check that dt64/td64 with more than one column doesn't get
# screwed up by .transpose() with an object column present.
dti_tz = pd.date_range('2016-01-01', periods=10, tz='US/Central')
dti_naive = pd.date_range('2016-01-01', periods=10, tz=None)
tdi = pd.timedelta_range('1D', periods=10)
objcol = list('ABCDEFGHIJ')

index = tdi if use_idx else None

df = pd.DataFrame({'aware': dti_tz,
'naive': dti_naive,
'tdi': tdi,
'obj': objcol},
columns=['naive', 'aware', 'tdi', 'obj'],
index=index)

expected = df.copy()
expected.iloc[0, :-1] = pd.NaT
expected.iloc[-1, :-1] = df.iloc[-2, :-1]

df.iloc[::3, :-1] = pd.NaT

result = df.interpolate(method='linear')
tm.assert_frame_equal(result, expected)
35 changes: 35 additions & 0 deletions pandas/tests/series/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1317,3 +1317,38 @@ def test_series_interpolate_intraday(self):
result = ts.reindex(new_index).interpolate(method='time')

tm.assert_numpy_array_equal(result.values, exp.values)

# TODO: De-duplicate with similar tests in test.frame.test_missing?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is much more complicated we have generic tests that already do a lot of this

you can also move these tests there

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll take a look for a better place.

This is also the answer to the previous question about why not starting with DataFrame: bc the test originally tested Series and DataFrame in the same test.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is much more complicated we have generic tests that already do a lot of this

Both for this PR and the upcoming slew of Datetime/Timedelta/PeriodArray tests, there are a bunch of for FooArray/FooIndex/Series[foo]/DataFrame[foo-column] that are highly duplicative and I think things are falling through the cracks. Think we can find a place to put them so they can be parametrized?

@pytest.mark.parametrize('use_idx', [True, False])
@pytest.mark.parametrize('tz', [None, 'US/Central'])
def test_interpolate_dt64_values(self, tz, use_idx):
# GH#21915
dti = pd.date_range('2016-01-01', periods=10, tz=tz)
index = dti if use_idx else None

# Copy to avoid corrupting dti, see GH#21907
ser = pd.Series(dti, index=index).copy()
ser[::3] = pd.NaT

expected = pd.Series(dti, index=index)
expected.iloc[0] = pd.NaT
expected.iloc[-1] = expected.iloc[-2]

result = ser.interpolate(method='linear')
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('use_idx', [True, False])
def test_interpolate_td64_values(self, use_idx):
# GH#21915
tdi = pd.timedelta_range('1D', periods=10)
index = tdi if use_idx else None

ser = pd.Series(tdi, index=index)
ser[::3] = pd.NaT

expected = pd.Series(tdi, index=index)
expected.iloc[0] = pd.NaT
expected.iloc[-1] = expected.iloc[-2]

result = ser.interpolate(method='linear')
tm.assert_series_equal(result, expected)