Update value_counts with new behavior (#12835)

This PR updates the behavior of `value_counts` to match pandas 2.x: the result is now named `count` (or `proportion` if `normalize=True` is passed), and the index is named after the original object's name. This PR also updates two dtype APIs in `cudf.api.types` that were affected by breaking changes on the pandas side.
rapidsai · Feb 23, 2023 · 14f54ac · 14f54ac
1 parent 430d91e
commit 14f54ac
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 14 deletions.
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 
 """Define common type operations."""
 
@@ -244,7 +244,6 @@ def _union_categoricals(
 is_datetime64_dtype = pd_types.is_datetime64_dtype
 is_datetime64_ns_dtype = pd_types.is_datetime64_ns_dtype
 is_datetime64tz_dtype = pd_types.is_datetime64tz_dtype
-is_extension_type = pd_types.is_extension_type
 is_extension_array_dtype = pd_types.is_extension_array_dtype
 is_float_dtype = _wrap_pandas_is_dtype_api(pd_types.is_float_dtype)
 is_int64_dtype = pd_types.is_int64_dtype
@@ -263,7 +262,7 @@ def _union_categoricals(
 is_named_tuple = pd_types.is_named_tuple
 is_iterator = pd_types.is_iterator
 is_bool = pd_types.is_bool
-is_categorical = pd_types.is_categorical
+is_categorical = pd_types.is_categorical_dtype
 is_complex = pd_types.is_complex
 is_float = pd_types.is_float
 is_hashable = pd_types.is_hashable

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -7211,12 +7211,18 @@ def value_counts(
         >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6],
         ...                    'num_wings': [2, 0, 0, 0]},
         ...                    index=['falcon', 'dog', 'cat', 'ant'])
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+        cat            4          0
+        ant            6          0
         >>> df.value_counts()
         num_legs  num_wings
         4         0            2
         2         2            1
         6         0            1
-        dtype: int64
+        Name: count, dtype: int64
         """
         if subset:
             diff = set(subset) - set(self._data)
@@ -7238,6 +7244,7 @@ def value_counts(
         # Pandas always returns MultiIndex even if only one column.
         if not isinstance(result.index, MultiIndex):
             result.index = MultiIndex._from_data(result._index._data)
+        result.name = "proportion" if normalize else "count"
         return result
 
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -2946,15 +2946,15 @@ def value_counts(
         3.0    3
         2.0    2
         1.0    1
-        dtype: int32
+        Name: count, dtype: int32
 
         The order of the counts can be changed by passing ``ascending=True``:
 
         >>> sr.value_counts(ascending=True)
         1.0    1
         2.0    2
         3.0    3
-        dtype: int32
+        Name: count, dtype: int32
 
         With ``normalize`` set to True, returns the relative frequency
         by dividing all values by the sum of values.
@@ -2963,7 +2963,7 @@ def value_counts(
         3.0    0.500000
         2.0    0.333333
         1.0    0.166667
-        dtype: float32
+        Name: proportion, dtype: float32
 
         To include ``NA`` value counts, pass ``dropna=False``:
 
@@ -2983,24 +2983,24 @@ def value_counts(
         2.0     2
         <NA>    2
         1.0     1
-        dtype: int32
+        Name: count, dtype: int32
 
         >>> s = cudf.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(bins=3)
         (2.0, 3.0]      2
         (0.996, 2.0]    2
         (3.0, 4.0]      1
-        dtype: int32
+        Name: count, dtype: int32
         """
         if bins is not None:
             series_bins = cudf.cut(self, bins, include_lowest=True)
-
+        result_name = "proportion" if normalize else "count"
         if dropna and self.null_count == len(self):
             return Series(
                 [],
                 dtype=np.int32,
-                name=self.name,
-                index=cudf.Index([], dtype=self.dtype),
+                name=result_name,
+                index=cudf.Index([], dtype=self.dtype, name=self.name),
             )
 
         if bins is not None:
@@ -3009,7 +3009,7 @@ def value_counts(
         else:
             res = self.groupby(self, dropna=dropna).count(dropna=dropna)
 
-        res.index.name = None
+        res.index.name = self.name
 
         if sort:
             res = res.sort_values(ascending=ascending)
@@ -3024,7 +3024,7 @@ def value_counts(
                 res.index._column, res.index.categories.dtype
             )
             res.index = int_index
-
+        res.name = result_name
         return res
 
     @_cudf_nvtx_annotate