Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] don't mangle null-objects in value_counts #42743

Merged
merged 11 commits into from
Nov 12, 2021
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype):
self.s.value_counts()


class ValueCountsObjectDropNAFalse:
    """Benchmark ``Series.value_counts(dropna=False)`` on an object-dtype Series."""

    params = [1_000, 10_000, 100_000]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random integers drawn from [0, N), cast to object dtype so the
        # object hashtable path is exercised.
        data = np.random.randint(0, N, size=10 * N)
        self.s = Series(data).astype("object")

    def time_value_counts(self, N):
        self.s.value_counts(dropna=False)


class Mode:

params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
Expand All @@ -164,6 +176,18 @@ def time_mode(self, N, dtype):
self.s.mode()


class ModeObjectDropNAFalse:
    """Benchmark ``Series.mode(dropna=False)`` on an object-dtype Series."""

    params = [10 ** 3, 10 ** 4, 10 ** 5]
    param_names = ["N"]

    def setup(self, N):
        # 10*N random integers drawn from [0, N), cast to object dtype so the
        # object hashtable path is exercised.
        self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object")

    def time_mode(self, N):
        self.s.mode(dropna=False)


class Dir:
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
Expand Down
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

*New behavior*:

.. ipython:: python

res

.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerce_to_nan:

Null-values are no longer coerced to NaN-value in value_counts and mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).

.. ipython:: python

s = pd.Series([True, None, pd.NaT, None, pd.NaT, None])
res = s.value_counts(dropna=False)

Previously, all null-values were replaced by a NaN-value.

*Previous behavior*:

.. code-block:: ipython

In [3]: res
Out[3]:
NaN 5
True 1
dtype: int64

Now null-values are no longer mangled.

*New behavior*:

.. ipython:: python

res
Expand Down
9 changes: 2 additions & 7 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128',
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
Expand All @@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k
bint is_null

{{c_type}} val

Expand All @@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):

for i in range(n):
val = values[i]
is_null = checknull(val)
if not is_null or not dropna:
# all nas become the same representative:
if is_null:
val = navalue
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, <PyObject*>val)
if k != table.n_buckets:
table.vals[k] += 1
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series):
if dropna is True:
expected = Series([1], index=[True])
else:
expected = Series([2, 1], index=[pd.NA, True])
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
tm.assert_series_equal(res, expected)
8 changes: 4 additions & 4 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,12 +786,12 @@ def test_no_reference_cycle(self):
del df
assert wr() is None

def test_label_indexing_on_nan(self):
def test_label_indexing_on_nan(self, nulls_fixture):
# GH 32431
df = Series([1, "{1,2}", 1, None])
df = Series([1, "{1,2}", 1, nulls_fixture])
vc = df.value_counts(dropna=False)
result1 = vc.loc[np.nan]
result2 = vc[np.nan]
result1 = vc.loc[nulls_fixture]
result2 = vc[nulls_fixture]

expected = 1
assert result1 == expected
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable):


def test_modes_with_nans():
# GH39007
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
# pd.Na and np.nan will have the same representative: np.nan
# thus we have 2 nans and 1 True
# GH42688, nans aren't mangled
nulls = [pd.NA, np.nan, pd.NaT, None]
values = np.array([True] + nulls * 2, dtype=np.object_)
modes = ht.mode(values, False)
assert modes.size == 1
assert np.isnan(modes[0])
assert modes.size == len(nulls)


def test_unique_label_indices_intp(writable):
Expand Down