Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

interpolate returns new column if no values are interpolated #16158

Merged
merged 4 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 21 additions & 40 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import RangeIndex, ensure_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column

if TYPE_CHECKING:
from cudf.core.column.column import ColumnBase
from cudf.core.index import BaseIndex


def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
"""Encode the input values as integer labels
Expand Down Expand Up @@ -110,55 +115,31 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
return labels, cats.values if return_cupy_array else ensure_index(cats)


def _linear_interpolation(column, index=None):
    """
    Linearly interpolate a float column, treating its rows as evenly
    spaced along the x-axis.

    For example the data [1.0, NaN, 3.0] is filled as if the NaN sat half
    way between its two valid neighbours, yielding [1.0, 2.0, 3.0].

    The ``index`` argument is accepted so that this function can be called
    the same way as the index/values interpolator, but its value is never
    used: it is replaced by a positional ``RangeIndex`` over the column.
    """
    # Evenly spaced x positions 0, 1, ..., len(column) - 1.
    positions = RangeIndex(start=0, stop=len(column), step=1)
    return _index_or_values_interpolation(column, index=positions)


def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
    """
    Interpolate over a float column. Assumes a linear interpolation
    strategy using the index of the data to denote spacing of the x
    values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
    would result in [1.0, 3.0, 4.0].

    Parameters
    ----------
    column : ColumnBase
        Column whose missing entries (as reported by ``column.isnull()``)
        are to be filled.
    index : BaseIndex
        x-coordinates of the rows. When this is a ``RangeIndex`` the rows
        are evenly spaced, so row positions are used as x-coordinates.

    Returns
    -------
    ColumnBase
        A new column. When there is nothing to interpolate (no missing
        values, or only missing values) a copy of the input is returned
        rather than the input object itself.
    """
    # figure out where the missing values are
    mask = column.isnull()

    # trivial cases, everything missing or nothing missing: there is
    # nothing to compute, but still hand back a new column object
    if not mask.any() or mask.all():
        return column.copy()

    valid_locs = ~mask
    if isinstance(index, RangeIndex):
        # Each point is evenly spaced, so the index values don't matter.
        # Use row positions for BOTH the known and the queried x values;
        # mixing positions with the index values would be wrong for a
        # RangeIndex that does not start at 0 with step 1.
        known_x = cp.flatnonzero(valid_locs.values)
        query_x = cp.arange(len(column))
    else:
        known_x = index._column.apply_boolean_mask(valid_locs).values  # type: ignore[attr-defined]
        query_x = index.to_cupy()
    known_y = column.apply_boolean_mask(valid_locs).values

    result = cp.interp(query_x, known_x, known_y)

    # Rows before the first valid value have no left neighbour to
    # interpolate from, so they are forced back to NaN.
    first_valid_idx = valid_locs.values.argmax().item()
    result[:first_valid_idx] = np.nan
    return as_column(result)


def _index_or_values_interpolation(column, index=None):
    """
    Interpolate ``column`` using ``index`` as the x-coordinates.

    Thin wrapper that forwards to ``_interpolation``; kept so existing
    references to this name keep working.
    """
    return _interpolation(column, index)


def get_column_interpolator(method):
    """
    Return the column interpolation function registered for ``method``.

    Raises
    ------
    ValueError
        If ``method`` is not one of "linear", "index" or "values".
    """
    interpolator = {
        "linear": _linear_interpolation,
        "index": _index_or_values_interpolation,
        "values": _index_or_values_interpolation,
    }.get(method)
    if interpolator is None:
        raise ValueError(f"Interpolation method `{method}` not found")
    return interpolator
14 changes: 11 additions & 3 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

import cudf
import cudf._lib as libcudf
import cudf.core
import cudf.core.algorithms
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
Expand Down Expand Up @@ -1987,6 +1989,8 @@ def interpolate(
"Use obj.ffill() or obj.bfill() instead.",
FutureWarning,
)
elif method not in {"linear", "values", "index"}:
raise ValueError(f"Interpolation method `{method}` not found")

data = self

Expand All @@ -2000,7 +2004,10 @@ def interpolate(
)
)

interpolator = cudf.core.algorithms.get_column_interpolator(method)
if method == "linear":
interp_index = RangeIndex(self._num_rows)
else:
interp_index = data.index
columns = []
for col in data._columns:
if isinstance(col, cudf.core.column.StringColumn):
Expand All @@ -2012,8 +2019,9 @@ def interpolate(
if col.nullable:
col = col.astype("float64").fillna(np.nan)

# Interpolation methods may or may not need the index
columns.append(interpolator(col, index=data.index))
columns.append(
cudf.core.algorithms._interpolation(col, index=interp_index)
)

result = self._from_data_like_self(
self._data._from_columns_like_self(columns)
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._base_index import _return_get_indexer_result
from cudf.core.algorithms import factorize
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.index import (
Expand Down Expand Up @@ -1373,9 +1374,6 @@ def from_arrays(
(2, 'blue')],
names=['number', 'color'])
"""
# Imported here due to circular import
from cudf.core.algorithms import factorize

error_msg = "Input must be a list / sequence of array-likes."
if not is_list_like(arrays):
raise TypeError(error_msg)
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,9 @@ def test_interpolate_dataframe_error_cases(data, kwargs):
lfunc_args_and_kwargs=([], kwargs),
rfunc_args_and_kwargs=([], kwargs),
)


def test_interpolate_noop_new_column():
    """Interpolating a Series with nothing to fill must not reuse its column."""
    original = cudf.Series([1.0, 2.0, 3.0])
    interpolated = original.interpolate()
    # Identity check: the result has to be backed by a different column object.
    assert original._column is not interpolated._column
Loading