Drop na_sentinel from factorize (#12924)

This PR drops support for `na_sentinel` in factorize APIs, to match with pandas-2.0
rapidsai · Mar 13, 2023 · 4a87cbd · 4a87cbd
1 parent e115ba5
commit 4a87cbd
Show file tree

Hide file tree

Showing 4 changed files with 7 additions and 87 deletions.
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -11,9 +11,7 @@
 from cudf.core.series import Series
 
 
-def factorize(
-    values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
-):
+def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
     """Encode the input values as integer labels
 
     Parameters
@@ -22,14 +20,6 @@ def factorize(
         The data to be factorized.
     sort : bool, default True
         Sort uniques and shuffle codes to maintain the relationship.
-    na_sentinel : number, default -1
-        Value to indicate missing category.
-
-        .. deprecated:: 23.04
-
-           The na_sentinel argument is deprecated and will be removed in
-           a future version of cudf. Specify use_na_sentinel as
-           either True or False.
     use_na_sentinel : bool, default True
         If True, the sentinel -1 will be used for NA values.
         If False, NA values will be encoded as non-negative
@@ -83,51 +73,19 @@ def factorize(
     >>> uniques
     Float64Index([<NA>, 1.0, 2.0], dtype='float64')
     """
-    # TODO: Drop `na_sentinel` in the next release immediately after
-    # pandas 2.0 upgrade.
-    if na_sentinel is not None and use_na_sentinel is not None:
-        raise ValueError(
-            "Cannot specify both `na_sentinel` and `use_na_sentile`; "
-            f"got `na_sentinel={na_sentinel}` and "
-            f"`use_na_sentinel={use_na_sentinel}`"
-        )
 
     return_cupy_array = isinstance(values, cp.ndarray)
 
     values = Series(values)
 
-    if na_sentinel is None:
-        na_sentinel = (
-            -1
-            if use_na_sentinel is None or use_na_sentinel
-            else Scalar(None, dtype=values.dtype)
-        )
-    else:
-        if na_sentinel is None:
-            msg = (
-                "Specifying `na_sentinel=None` is deprecated, specify "
-                "`use_na_sentinel=False` instead."
-            )
-        elif na_sentinel == -1:
-            msg = (
-                "Specifying `na_sentinel=-1` is deprecated, specify "
-                "`use_na_sentinel=True` instead."
-            )
-        else:
-            msg = (
-                "Specifying the specific value to use for `na_sentinel` is "
-                "deprecated and will be removed in a future version of cudf. "
-                "Specify `use_na_sentinel=True` to use the sentinel value -1, "
-                "and `use_na_sentinel=False` to encode NA values.",
-            )
-        warnings.warn(msg, FutureWarning)
-
     if size_hint:
         warnings.warn("size_hint is not applicable for cudf.factorize")
 
-    if use_na_sentinel is None or use_na_sentinel:
+    if use_na_sentinel:
+        na_sentinel = Scalar(-1)
         cats = values._column.dropna()
     else:
+        na_sentinel = Scalar(None, dtype=values.dtype)
         cats = values._column
 
     cats = cats.unique().astype(values.dtype)
@@ -136,7 +94,7 @@ def factorize(
         cats, _ = cats.sort_by_values()
 
     labels = values._column._label_encoding(
-        cats=cats, na_sentinel=Scalar(na_sentinel)
+        cats=cats, na_sentinel=na_sentinel
     ).values
 
     return labels, cats.values if return_cupy_array else Index(cats)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -671,13 +671,7 @@ def _compute_levels_and_codes(self):
 
         codes = {}
         for name, col in self._data.items():
-            with warnings.catch_warnings():
-                # TODO: Remove this filter when
-                # `na_sentinel` is removed from `factorize`.
-                # This is a filter to not let the warnings from
-                # `factorize` show up in other parts of public APIs.
-                warnings.simplefilter("ignore")
-                code, cats = cudf.Series._from_data({None: col}).factorize()
+            code, cats = cudf.Series._from_data({None: col}).factorize()
             codes[name] = code.astype(np.int64)
             levels.append(cudf.Series(cats, name=None))
 

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
@@ -249,21 +249,13 @@ def __cuda_array_interface__(self):
         return self._column.__cuda_array_interface__
 
     @_cudf_nvtx_annotate
-    def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
+    def factorize(self, sort=False, use_na_sentinel=True):
         """Encode the input values as integer labels.
 
         Parameters
         ----------
         sort : bool, default True
             Sort uniques and shuffle codes to maintain the relationship.
-        na_sentinel : number, default -1
-            Value to indicate missing category.
-
-            .. deprecated:: 23.04
-
-               The na_sentinel argument is deprecated and will be removed in
-               a future version of cudf. Specify use_na_sentinel as
-               either True or False.
         use_na_sentinel : bool, default True
             If True, the sentinel -1 will be used for NA values.
             If False, NA values will be encoded as non-negative
@@ -290,7 +282,6 @@ def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
         return cudf.core.algorithms.factorize(
             self,
             sort=sort,
-            na_sentinel=na_sentinel,
             use_na_sentinel=use_na_sentinel,
         )
 

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -459,29 +459,6 @@ def test_series_describe_other_types(ps):
         assert_eq(expected.astype("str"), actual)
 
 
-@pytest.mark.parametrize(
-    "data",
-    [
-        [1, 2, 3, 2, 1],
-        [1, 2, None, 3, 1, 1],
-        [],
-        ["a", "b", "c", None, "z", "a"],
-    ],
-)
-@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0])
-def test_series_factorize(data, na_sentinel):
-    gsr = cudf.Series(data)
-    psr = gsr.to_pandas()
-
-    with pytest.warns(FutureWarning):
-        expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
-    with pytest.warns(FutureWarning):
-        actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
-
-    assert_eq(expected_labels, actual_labels.get())
-    assert_eq(expected_cats.values, actual_cats.to_pandas().values)
-
-
 @pytest.mark.parametrize(
     "data",
     [