Skip to content

Commit

Permalink
[BUG] don't mangle null-objects in value_counts (#42743)
Browse files Browse the repository at this point in the history
  • Loading branch information
realead authored Nov 12, 2021
1 parent 6fa73d2 commit 2d3644c
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 18 deletions.
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype):
self.s.value_counts()


class ValueCountsObjectDropNAFalse:
    """Benchmark ``Series.value_counts(dropna=False)`` on an object-dtype Series.

    Exercises the object-dtype hash-table path where null values are kept
    (``dropna=False``) rather than dropped.
    """

    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random ints in [0, N), cast to object dtype to hit the
        # object hash-table implementation.
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype("object")

    def time_value_counts(self, N):
        self.s.value_counts(dropna=False)


class Mode:

params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
Expand All @@ -164,6 +176,18 @@ def time_mode(self, N, dtype):
self.s.mode()


class ModeObjectDropNAFalse:
    """Benchmark ``Series.mode(dropna=False)`` on an object-dtype Series.

    Exercises the object-dtype hash-table path where null values are kept
    (``dropna=False``) rather than dropped.
    """

    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random ints in [0, N), cast to object dtype to hit the
        # object hash-table implementation.
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype("object")

    def time_mode(self, N):
        self.s.mode(dropna=False)


class Dir:
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
Expand Down
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

*New behavior*:

.. ipython:: python
res
.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan:

Null-values are no longer coerced to NaN-value in value_counts and mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).

.. ipython:: python
s = pd.Series([True, None, pd.NaT, None, pd.NaT, None])
res = s.value_counts(dropna=False)
Previously, all null-values were replaced by a NaN-value.

*Previous behavior*:

.. code-block:: ipython
In [3]: res
Out[3]:
NaN 5
True 1
dtype: int64
Now null-values are no longer mangled.

*New behavior*:

.. ipython:: python
res
Expand Down
9 changes: 2 additions & 7 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128',
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
Expand All @@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k
bint is_null

{{c_type}} val

Expand All @@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

for i in range(n):
val = values[i]
is_null = checknull(val)
if not is_null or not dropna:
# all nas become the same representative:
if is_null:
val = navalue
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, <PyObject*>val)
if k != table.n_buckets:
table.vals[k] += 1
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series):
if dropna is True:
expected = Series([1], index=[True])
else:
expected = Series([2, 1], index=[pd.NA, True])
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
tm.assert_series_equal(res, expected)
8 changes: 4 additions & 4 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,12 +786,12 @@ def test_no_reference_cycle(self):
del df
assert wr() is None

def test_label_indexing_on_nan(self):
def test_label_indexing_on_nan(self, nulls_fixture):
# GH 32431
df = Series([1, "{1,2}", 1, None])
df = Series([1, "{1,2}", 1, nulls_fixture])
vc = df.value_counts(dropna=False)
result1 = vc.loc[np.nan]
result2 = vc[np.nan]
result1 = vc.loc[nulls_fixture]
result2 = vc[nulls_fixture]

expected = 1
assert result1 == expected
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable):


def test_modes_with_nans():
# GH39007
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
# pd.Na and np.nan will have the same representative: np.nan
# thus we have 2 nans and 1 True
# GH42688, nans aren't mangled
nulls = [pd.NA, np.nan, pd.NaT, None]
values = np.array([True] + nulls * 2, dtype=np.object_)
modes = ht.mode(values, False)
assert modes.size == 1
assert np.isnan(modes[0])
assert modes.size == len(nulls)


def test_unique_label_indices_intp(writable):
Expand Down

0 comments on commit 2d3644c

Please sign in to comment.