Reviewer notes (text ahead of the first "diff --git" header is not part of
any hunk):
  * The line structure of this patch was restored from a whitespace-collapsed
    copy. Indentation and blank lines of unchanged context lines are
    reconstructed and should be confirmed against the target tree.
  * _interpolation: for a RangeIndex the known x values are row positions, so
    the points being evaluated must be row positions as well. Evaluating at
    the index labels gave wrong results for method="index"/"values" on a
    RangeIndex whose start is not 0 or whose step is not 1.
  * The blob ids on the "index" lines are those of the original patch and do
    not reflect the reviewer changes to algorithms.py and test_interpolate.py.

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index e8b82ff60c2..6c69fbd2637 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -1,17 +1,22 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
 import warnings
+from typing import TYPE_CHECKING
 
 import cupy as cp
 import numpy as np
 
 from cudf.core.column import as_column
-from cudf.core.copy_types import BooleanMask
 from cudf.core.index import RangeIndex, ensure_index
-from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
 from cudf.utils.dtypes import can_convert_to_column
 
+if TYPE_CHECKING:
+    from cudf.core.column.column import ColumnBase
+    from cudf.core.index import BaseIndex
+
 
 def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
     """Encode the input values as integer labels
@@ -110,55 +115,41 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
     return labels, cats.values if return_cupy_array else ensure_index(cats)
 
 
-def _linear_interpolation(column, index=None):
-    """
-    Interpolate over a float column. Implicitly assumes that values are
-    evenly spaced with respect to the x-axis, for example the data
-    [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way
-    between the two valid values, yielding [1.0, 2.0, 3.0]
-    """
-
-    index = RangeIndex(start=0, stop=len(column), step=1)
-    return _index_or_values_interpolation(column, index=index)
-
-
-def _index_or_values_interpolation(column, index=None):
+def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
     """
-    Interpolate over a float column. assumes a linear interpolation
-    strategy using the index of the data to denote spacing of the x
-    values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
-    would result in [1.0, 3.0, 4.0]
+    Linearly interpolate the missing values of a float column.
+
+    The values of ``index`` denote the spacing of the x values. For
+    example the data and index [1.0, NaN, 4.0], [1, 3, 4] would result
+    in [1.0, 3.0, 4.0]. When ``index`` is a RangeIndex the rows are
+    evenly spaced, so row positions are used as the x values instead.
+
+    Missing values ahead of the first valid value are left as NaN. A
+    new column is returned even when there is nothing to interpolate.
     """
     # figure out where the nans are
-    mask = cp.isnan(column)
+    mask = column.isnull()
 
     # trivial cases, all nan or no nans
-    num_nan = mask.sum()
-    if num_nan == 0 or num_nan == len(column):
-        return column
+    if not mask.any() or mask.all():
+        return column.copy()
 
-    to_interp = IndexedFrame(data={None: column}, index=index)
-    known_x_and_y = to_interp._apply_boolean_mask(
-        BooleanMask(~mask, len(to_interp))
-    )
-
-    known_x = known_x_and_y.index.to_cupy()
-    known_y = known_x_and_y._data.columns[0].values
+    valid_locs = ~mask
+    if isinstance(index, RangeIndex):
+        # Each point is evenly spaced, so interpolate over row positions.
+        # The index values must not be used as x here: known_x below is
+        # positional, and a RangeIndex whose start is not 0 or whose step
+        # is not 1 would not line up with it.
+        x = cp.arange(len(column))
+        known_x = cp.flatnonzero(valid_locs.values)
+    else:
+        x = index.to_cupy()
+        known_x = index._column.apply_boolean_mask(valid_locs).values  # type: ignore[attr-defined]
+    known_y = column.apply_boolean_mask(valid_locs).values
 
-    result = cp.interp(index.to_cupy(), known_x, known_y)
+    result = cp.interp(x, known_x, known_y)
 
-    # find the first nan
-    first_nan_idx = (mask == 0).argmax().item()
+    # nothing precedes the leading missing values, so they stay NaN
+    first_nan_idx = valid_locs.values.argmax().item()
     result[:first_nan_idx] = np.nan
-    return result
-
-
-def get_column_interpolator(method):
-    interpolator = {
-        "linear": _linear_interpolation,
-        "index": _index_or_values_interpolation,
-        "values": _index_or_values_interpolation,
-    }.get(method, None)
-    if not interpolator:
-        raise ValueError(f"Interpolation method `{method}` not found")
-    return interpolator
+    return as_column(result)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ff10051c52d..63fa96d0db0 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -26,6 +26,8 @@
 
 import cudf
 import cudf._lib as libcudf
+import cudf.core
+import cudf.core.algorithms
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -1987,6 +1989,8 @@ def interpolate(
                 "Use obj.ffill() or obj.bfill() instead.",
                 FutureWarning,
             )
+        elif method not in {"linear", "values", "index"}:
+            raise ValueError(f"Interpolation method `{method}` not found")
 
         data = self
 
@@ -2000,7 +2004,10 @@ def interpolate(
                 )
             )
 
-        interpolator = cudf.core.algorithms.get_column_interpolator(method)
+        if method == "linear":
+            interp_index = RangeIndex(self._num_rows)
+        else:
+            interp_index = data.index
         columns = []
         for col in data._columns:
             if isinstance(col, cudf.core.column.StringColumn):
@@ -2012,8 +2019,9 @@ def interpolate(
             if col.nullable:
                 col = col.astype("float64").fillna(np.nan)
 
-            # Interpolation methods may or may not need the index
-            columns.append(interpolator(col, index=data.index))
+            columns.append(
+                cudf.core.algorithms._interpolation(col, index=interp_index)
+            )
 
         result = self._from_data_like_self(
             self._data._from_columns_like_self(columns)
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 9cbe863142b..dbbd1eab6c8 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core.algorithms import factorize
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import (
@@ -1373,9 +1374,6 @@ def from_arrays(
                     (2, 'blue')],
                    names=['number', 'color'])
         """
-        # Imported here due to circular import
-        from cudf.core.algorithms import factorize
-
         error_msg = "Input must be a list / sequence of array-likes."
         if not is_list_like(arrays):
             raise TypeError(error_msg)
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
index 4a0dc331e1a..a4f0b9fc97e 100644
--- a/python/cudf/cudf/tests/test_interpolate.py
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -135,3 +135,17 @@ def test_interpolate_dataframe_error_cases(data, kwargs):
         lfunc_args_and_kwargs=([], kwargs),
         rfunc_args_and_kwargs=([], kwargs),
     )
+
+
+def test_interpolate_noop_new_column():
+    ser = cudf.Series([1.0, 2.0, 3.0])
+    result = ser.interpolate()
+    assert ser._column is not result._column
+
+
+def test_interpolate_index_method_offset_rangeindex():
+    # Index labels that do not start at 0 must not shift the x values.
+    ser = cudf.Series([1.0, None, 3.0], index=cudf.RangeIndex(10, 13))
+    result = ser.interpolate(method="index")
+    assert result.to_pandas().tolist() == [1.0, 2.0, 3.0]