Skip to content

Commit

Permalink
Drop na_sentinel from factorize (#12924)
Browse files Browse the repository at this point in the history
This PR drops support for `na_sentinel` in factorize APIs, to match with pandas-2.0
  • Loading branch information
galipremsagar authored Mar 13, 2023
1 parent e115ba5 commit 4a87cbd
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 87 deletions.
52 changes: 5 additions & 47 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@
from cudf.core.series import Series


def factorize(
values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
):
def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
"""Encode the input values as integer labels
Parameters
Expand All @@ -22,14 +20,6 @@ def factorize(
The data to be factorized.
sort : bool, default True
Sort uniques and shuffle codes to maintain the relationship.
na_sentinel : number, default -1
Value to indicate missing category.
.. deprecated:: 23.04
The na_sentinel argument is deprecated and will be removed in
a future version of cudf. Specify use_na_sentinel as
either True or False.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NA values.
If False, NA values will be encoded as non-negative
Expand Down Expand Up @@ -83,51 +73,19 @@ def factorize(
>>> uniques
Float64Index([<NA>, 1.0, 2.0], dtype='float64')
"""
# TODO: Drop `na_sentinel` in the next release immediately after
# pandas 2.0 upgrade.
if na_sentinel is not None and use_na_sentinel is not None:
raise ValueError(
"Cannot specify both `na_sentinel` and `use_na_sentile`; "
f"got `na_sentinel={na_sentinel}` and "
f"`use_na_sentinel={use_na_sentinel}`"
)

return_cupy_array = isinstance(values, cp.ndarray)

values = Series(values)

if na_sentinel is None:
na_sentinel = (
-1
if use_na_sentinel is None or use_na_sentinel
else Scalar(None, dtype=values.dtype)
)
else:
if na_sentinel is None:
msg = (
"Specifying `na_sentinel=None` is deprecated, specify "
"`use_na_sentinel=False` instead."
)
elif na_sentinel == -1:
msg = (
"Specifying `na_sentinel=-1` is deprecated, specify "
"`use_na_sentinel=True` instead."
)
else:
msg = (
"Specifying the specific value to use for `na_sentinel` is "
"deprecated and will be removed in a future version of cudf. "
"Specify `use_na_sentinel=True` to use the sentinel value -1, "
"and `use_na_sentinel=False` to encode NA values.",
)
warnings.warn(msg, FutureWarning)

if size_hint:
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel is None or use_na_sentinel:
if use_na_sentinel:
na_sentinel = Scalar(-1)
cats = values._column.dropna()
else:
na_sentinel = Scalar(None, dtype=values.dtype)
cats = values._column

cats = cats.unique().astype(values.dtype)
Expand All @@ -136,7 +94,7 @@ def factorize(
cats, _ = cats.sort_by_values()

labels = values._column._label_encoding(
cats=cats, na_sentinel=Scalar(na_sentinel)
cats=cats, na_sentinel=na_sentinel
).values

return labels, cats.values if return_cupy_array else Index(cats)
Expand Down
8 changes: 1 addition & 7 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,13 +671,7 @@ def _compute_levels_and_codes(self):

codes = {}
for name, col in self._data.items():
with warnings.catch_warnings():
# TODO: Remove this filter when
# `na_sentinel` is removed from `factorize`.
# This is a filter to not let the warnings from
# `factorize` show up in other parts of public APIs.
warnings.simplefilter("ignore")
code, cats = cudf.Series._from_data({None: col}).factorize()
code, cats = cudf.Series._from_data({None: col}).factorize()
codes[name] = code.astype(np.int64)
levels.append(cudf.Series(cats, name=None))

Expand Down
11 changes: 1 addition & 10 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,21 +249,13 @@ def __cuda_array_interface__(self):
return self._column.__cuda_array_interface__

@_cudf_nvtx_annotate
def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
def factorize(self, sort=False, use_na_sentinel=True):
"""Encode the input values as integer labels.
Parameters
----------
sort : bool, default True
Sort uniques and shuffle codes to maintain the relationship.
na_sentinel : number, default -1
Value to indicate missing category.
.. deprecated:: 23.04
The na_sentinel argument is deprecated and will be removed in
a future version of cudf. Specify use_na_sentinel as
either True or False.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NA values.
If False, NA values will be encoded as non-negative
Expand All @@ -290,7 +282,6 @@ def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
return cudf.core.algorithms.factorize(
self,
sort=sort,
na_sentinel=na_sentinel,
use_na_sentinel=use_na_sentinel,
)

Expand Down
23 changes: 0 additions & 23 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,29 +459,6 @@ def test_series_describe_other_types(ps):
assert_eq(expected.astype("str"), actual)


@pytest.mark.parametrize(
"data",
[
[1, 2, 3, 2, 1],
[1, 2, None, 3, 1, 1],
[],
["a", "b", "c", None, "z", "a"],
],
)
@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0])
def test_series_factorize(data, na_sentinel):
gsr = cudf.Series(data)
psr = gsr.to_pandas()

with pytest.warns(FutureWarning):
expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
with pytest.warns(FutureWarning):
actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)

assert_eq(expected_labels, actual_labels.get())
assert_eq(expected_cats.values, actual_cats.to_pandas().values)


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit 4a87cbd

Please sign in to comment.