Ensure column.fillna signatures are consistent (#14724)

Aligns the definitions of `Columns.fillna` among all subclasses. `dtype` looks to only needed in certain instances to cast the fill value so can do that separately. A `fill_nan` can be avoided with its single usage in a `can_cast` routine by checking for nan first Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: #14724
rapidsai · Jan 10, 2024 · 6a23775 · 6a23775
1 parent 8df33ee
commit 6a23775
Show file tree

Hide file tree

Showing 8 changed files with 50 additions and 53 deletions.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase:
     def fillna(
         self,
         fill_value: Any = None,
-        method: Any = None,
-        dtype: Optional[Dtype] = None,
-    ) -> CategoricalColumn:
+        method: Optional[str] = None,
+    ) -> Self:
         """
         Fill null values with *fill_value*
         """
@@ -1276,7 +1275,7 @@ def fillna(
                     self.codes.dtype
                 )
 
-        return super().fillna(value=fill_value, method=method)
+        return super().fillna(fill_value, method=method)
 
     def indices_of(
         self, value: ScalarLike

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -710,16 +710,15 @@ def _check_scatter_key_length(
 
     def fillna(
         self,
-        value: Any = None,
+        fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
     ) -> Self:
         """Fill null values with ``value``.
 
         Returns a copy with null filled.
         """
         return libcudf.replace.replace_nulls(
-            input_col=self, replacement=value, method=method, dtype=dtype
+            input_col=self, replacement=fill_value, method=method
         )._with_type_metadata(self.dtype)
 
     def isnull(self) -> ColumnBase:
@@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
             # https://github.com/rapidsai/cudf/issues/14515 by
             # providing a mode in which cudf::contains does not mask
             # the result.
-            result = result.fillna(rhs.null_count > 0, dtype=bool)
+            result = result.fillna(cudf.Scalar(rhs.null_count > 0))
         return result
 
     def as_mask(self) -> Buffer:

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -598,12 +599,12 @@ def fillna(
         self,
         fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
-    ) -> DatetimeColumn:
+    ) -> Self:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
                 return self.copy(deep=True)
             if is_scalar(fill_value):
+                # TODO: Add cast checking like TimedeltaColumn.fillna
                 if not isinstance(fill_value, cudf.Scalar):
                     fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
             else:

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import warnings
 from decimal import Decimal
@@ -7,6 +7,7 @@
 import cupy as cp
 import numpy as np
 import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
 
     def fillna(
         self,
-        value: Any = None,
+        fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
-    ):
+    ) -> Self:
         """Fill null values with ``value``.
 
         Returns a copy with null filled.
         """
-        if isinstance(value, (int, Decimal)):
-            value = cudf.Scalar(value, dtype=self.dtype)
+        if isinstance(fill_value, (int, Decimal)):
+            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
         elif (
-            isinstance(value, DecimalBaseColumn)
-            or isinstance(value, cudf.core.column.NumericalColumn)
-            and is_integer_dtype(value.dtype)
+            isinstance(fill_value, DecimalBaseColumn)
+            or isinstance(fill_value, cudf.core.column.NumericalColumn)
+            and is_integer_dtype(fill_value.dtype)
         ):
-            value = value.astype(self.dtype)
+            fill_value = fill_value.astype(self.dtype)
         else:
             raise TypeError(
                 "Decimal columns only support using fillna with decimal and "
                 "integer values"
             )
 
-        return super().fillna(value=value, method=method)
+        return super().fillna(fill_value, method=method)
 
     def normalize_binop_value(self, other):
         if isinstance(other, ColumnBase):

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -16,6 +16,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
 
         return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
-    def nans_to_nulls(self: NumericalColumn) -> NumericalColumn:
+    def nans_to_nulls(self: Self) -> Self:
         # Only floats can contain nan.
         if self.dtype.kind != "f" or self.nan_count == 0:
             return self
@@ -533,13 +534,11 @@ def fillna(
         self,
         fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
-        fill_nan: bool = True,
-    ) -> NumericalColumn:
+    ) -> Self:
         """
         Fill null values with *fill_value*
         """
-        col = self.nans_to_nulls() if fill_nan else self
+        col = self.nans_to_nulls()
 
         if col.null_count == 0:
             return col
@@ -574,8 +573,8 @@ def fillna(
                     if not (new_fill_value == fill_value).all():
                         raise TypeError(
                             f"Cannot safely cast non-equivalent "
-                            f"{col.dtype.type.__name__} to "
-                            f"{cudf.dtype(dtype).type.__name__}"
+                            f"{fill_value.dtype.type.__name__} to "
+                            f"{col.dtype.type.__name__}"
                         )
                     fill_value = new_fill_value
             else:
@@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
         # want to cast float to int:
         elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}:
+            if self.nan_count > 0:
+                return False
             iinfo = np.iinfo(to_dtype)
             min_, max_ = iinfo.min, iinfo.max
 
             # best we can do is hope to catch it here and avoid compare
             if (self.min() >= min_) and (self.max() <= max_):
-                filled = self.fillna(0, fill_nan=False)
+                filled = self.fillna(0)
                 return (cudf.Series(filled) % 1 == 0).all()
             else:
                 return False

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -21,6 +21,7 @@
 import pandas as pd
 import pyarrow as pa
 from numba import cuda
+from typing_extensions import Self
 
 import cudf
 import cudf.api.types
@@ -5824,17 +5825,16 @@ def fillna(
         self,
         fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
-    ) -> StringColumn:
+    ) -> Self:
         if fill_value is not None:
             if not is_scalar(fill_value):
                 fill_value = column.as_column(fill_value, dtype=self.dtype)
             elif cudf._lib.scalar._is_null_host_scalar(fill_value):
                 # Trying to fill <NA> with <NA> value? Return copy.
                 return self.copy(deep=True)
-            return super().fillna(value=fill_value, dtype="object")
-        else:
-            return super().fillna(method=method)
+            else:
+                fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
+        return super().fillna(fill_value, method=method)
 
     def normalize_binop_value(
         self, other

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -281,24 +282,21 @@ def fillna(
         self,
         fill_value: Any = None,
         method: Optional[str] = None,
-        dtype: Optional[Dtype] = None,
-    ) -> TimeDeltaColumn:
+    ) -> Self:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
                 return self.copy(deep=True)
-            col: ColumnBase = self
             if is_scalar(fill_value):
-                if isinstance(fill_value, np.timedelta64):
-                    dtype = determine_out_dtype(self.dtype, fill_value.dtype)
-                    fill_value = fill_value.astype(dtype)
-                    col = col.astype(dtype)
-                if not isinstance(fill_value, cudf.Scalar):
-                    fill_value = cudf.Scalar(fill_value, dtype=dtype)
+                fill_value = cudf.Scalar(fill_value)
+                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
+                fill_value = fill_value.astype(dtype)
+                if self.dtype != dtype:
+                    return cast(
+                        Self, self.astype(dtype).fillna(fill_value, method)
+                    )
             else:
                 fill_value = column.as_column(fill_value, nan_as_null=False)
-            return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value))
-        else:
-            return super().fillna(method=method)
+        return super().fillna(fill_value, method)
 
     def as_numerical_column(
         self, dtype: Dtype, **kwargs

diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import datetime
 import operator
@@ -1024,7 +1024,6 @@ def local_assert(expected, actual):
     [
         np.timedelta64(4, "s"),
         np.timedelta64(456, "D"),
-        np.timedelta64(46, "h"),
         np.timedelta64("nat"),
         np.timedelta64(1, "s"),
         np.timedelta64(1, "ms"),