Skip to content

Commit

Permalink
[BUG] don't mangle null-objects in value_counts (#42743)
Browse files Browse the repository at this point in the history
  • Loading branch information
realead authored Nov 12, 2021
1 parent 6fa73d2 commit 2d3644c
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 18 deletions.
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype):
self.s.value_counts()


class ValueCountsObjectDropNAFalse:
    """Benchmark ``Series.value_counts(dropna=False)`` on an object-dtype Series.

    Exercises the object-dtype hash-table path where null values are kept
    (``dropna=False``) rather than dropped.
    """

    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random ints in [0, N), cast to object dtype to hit the
        # object hash-table implementation.
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype("object")

    def time_value_counts(self, N):
        self.s.value_counts(dropna=False)


class Mode:

params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
Expand All @@ -164,6 +176,18 @@ def time_mode(self, N, dtype):
self.s.mode()


class ModeObjectDropNAFalse:
    """Benchmark ``Series.mode(dropna=False)`` on an object-dtype Series.

    Exercises the object-dtype hash-table path where null values are kept
    (``dropna=False``) rather than dropped.
    """

    params = [10**3, 10**4, 10**5]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random ints in [0, N), cast to object dtype to hit the
        # object hash-table implementation.
        raw = np.random.randint(0, N, size=10 * N)
        self.s = Series(raw).astype("object")

    def time_mode(self, N):
        self.s.mode(dropna=False)


class Dir:
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
Expand Down
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

*New behavior*:

.. ipython:: python
res
.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan:

Null-values are no longer coerced to NaN-value in value_counts and mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).

.. ipython:: python
s = pd.Series([True, None, pd.NaT, None, pd.NaT, None])
res = s.value_counts(dropna=False)
Previously, all null-values were replaced by a NaN-value.

*Previous behavior*:

.. code-block:: ipython
In [3]: res
Out[3]:
NaN 5
True 1
dtype: int64
Now null-values are no longer mangled.

*New behavior*:

.. ipython:: python
res
Expand Down
9 changes: 2 additions & 7 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128',
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
Expand All @@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k
bint is_null

{{c_type}} val

Expand All @@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

for i in range(n):
val = values[i]
is_null = checknull(val)
if not is_null or not dropna:
# all nas become the same representative:
if is_null:
val = navalue
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, <PyObject*>val)
if k != table.n_buckets:
table.vals[k] += 1
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series):
if dropna is True:
expected = Series([1], index=[True])
else:
expected = Series([2, 1], index=[pd.NA, True])
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
tm.assert_series_equal(res, expected)
8 changes: 4 additions & 4 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,12 +786,12 @@ def test_no_reference_cycle(self):
del df
assert wr() is None

def test_label_indexing_on_nan(self):
def test_label_indexing_on_nan(self, nulls_fixture):
# GH 32431
df = Series([1, "{1,2}", 1, None])
df = Series([1, "{1,2}", 1, nulls_fixture])
vc = df.value_counts(dropna=False)
result1 = vc.loc[np.nan]
result2 = vc[np.nan]
result1 = vc.loc[nulls_fixture]
result2 = vc[nulls_fixture]

expected = 1
assert result1 == expected
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable):


def test_modes_with_nans():
# GH39007
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
# pd.Na and np.nan will have the same representative: np.nan
# thus we have 2 nans and 1 True
# GH42688, nans aren't mangled
nulls = [pd.NA, np.nan, pd.NaT, None]
values = np.array([True] + nulls * 2, dtype=np.object_)
modes = ht.mode(values, False)
assert modes.size == 1
assert np.isnan(modes[0])
assert modes.size == len(nulls)


def test_unique_label_indices_intp(writable):
Expand Down

0 comments on commit 2d3644c

Please sign in to comment.