Skip to content

Commit

Permalink
Update value_counts with new behavior (#12835)
Browse files Browse the repository at this point in the history
This PR updates `value_counts` behavior to match pandas 2.x: the result is now named `count` (or `proportion` if `normalize=True` is passed), and the index is named after the original object. This PR also fixes two dtype APIs that were affected by breaking changes on the pandas side.
  • Loading branch information
galipremsagar authored Feb 23, 2023
1 parent 430d91e commit 14f54ac
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 14 deletions.
5 changes: 2 additions & 3 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

"""Define common type operations."""

Expand Down Expand Up @@ -244,7 +244,6 @@ def _union_categoricals(
is_datetime64_dtype = pd_types.is_datetime64_dtype
is_datetime64_ns_dtype = pd_types.is_datetime64_ns_dtype
is_datetime64tz_dtype = pd_types.is_datetime64tz_dtype
is_extension_type = pd_types.is_extension_type
is_extension_array_dtype = pd_types.is_extension_array_dtype
is_float_dtype = _wrap_pandas_is_dtype_api(pd_types.is_float_dtype)
is_int64_dtype = pd_types.is_int64_dtype
Expand All @@ -263,7 +262,7 @@ def _union_categoricals(
is_named_tuple = pd_types.is_named_tuple
is_iterator = pd_types.is_iterator
is_bool = pd_types.is_bool
is_categorical = pd_types.is_categorical
is_categorical = pd_types.is_categorical_dtype
is_complex = pd_types.is_complex
is_float = pd_types.is_float
is_hashable = pd_types.is_hashable
Expand Down
9 changes: 8 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7211,12 +7211,18 @@ def value_counts(
>>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6],
... 'num_wings': [2, 0, 0, 0]},
... index=['falcon', 'dog', 'cat', 'ant'])
>>> df
num_legs num_wings
falcon 2 2
dog 4 0
cat 4 0
ant 6 0
>>> df.value_counts()
num_legs num_wings
4 0 2
2 2 1
6 0 1
dtype: int64
Name: count, dtype: int64
"""
if subset:
diff = set(subset) - set(self._data)
Expand All @@ -7238,6 +7244,7 @@ def value_counts(
# Pandas always returns MultiIndex even if only one column.
if not isinstance(result.index, MultiIndex):
result.index = MultiIndex._from_data(result._index._data)
result.name = "proportion" if normalize else "count"
return result


Expand Down
20 changes: 10 additions & 10 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2946,15 +2946,15 @@ def value_counts(
3.0 3
2.0 2
1.0 1
dtype: int32
Name: count, dtype: int32
The order of the counts can be changed by passing ``ascending=True``:
>>> sr.value_counts(ascending=True)
1.0 1
2.0 2
3.0 3
dtype: int32
Name: count, dtype: int32
With ``normalize`` set to True, returns the relative frequency
by dividing all values by the sum of values.
Expand All @@ -2963,7 +2963,7 @@ def value_counts(
3.0 0.500000
2.0 0.333333
1.0 0.166667
dtype: float32
Name: proportion, dtype: float32
To include ``NA`` value counts, pass ``dropna=False``:
Expand All @@ -2983,24 +2983,24 @@ def value_counts(
2.0 2
<NA> 2
1.0 1
dtype: int32
Name: count, dtype: int32
>>> s = cudf.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(bins=3)
(2.0, 3.0] 2
(0.996, 2.0] 2
(3.0, 4.0] 1
dtype: int32
Name: count, dtype: int32
"""
if bins is not None:
series_bins = cudf.cut(self, bins, include_lowest=True)

result_name = "proportion" if normalize else "count"
if dropna and self.null_count == len(self):
return Series(
[],
dtype=np.int32,
name=self.name,
index=cudf.Index([], dtype=self.dtype),
name=result_name,
index=cudf.Index([], dtype=self.dtype, name=self.name),
)

if bins is not None:
Expand All @@ -3009,7 +3009,7 @@ def value_counts(
else:
res = self.groupby(self, dropna=dropna).count(dropna=dropna)

res.index.name = None
res.index.name = self.name

if sort:
res = res.sort_values(ascending=ascending)
Expand All @@ -3024,7 +3024,7 @@ def value_counts(
res.index._column, res.index.categories.dtype
)
res.index = int_index

res.name = result_name
return res

@_cudf_nvtx_annotate
Expand Down

0 comments on commit 14f54ac

Please sign in to comment.