Skip to content

Commit

Permalink
BUG: don't mangle NaN-float-values and pd.NaT (GH 22295) (pandas-dev…
Browse files Browse the repository at this point in the history
  • Loading branch information
realead authored and victor committed Sep 30, 2018
1 parent ce3e7fd commit 9c88223
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 53 deletions.
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -721,13 +721,16 @@ Indexing
- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`)
- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`)
- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`)
- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`)

Missing
^^^^^^^

- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
- :func:`Series.isin` now treats all nans as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`)


MultiIndex
^^^^^^^^^^
Expand Down
52 changes: 7 additions & 45 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,6 @@ cdef class {{name}}HashTable(HashTable):
int ret = 0
{{dtype}}_t val
khiter_t k
bint seen_na = 0
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud

Expand All @@ -479,30 +478,13 @@ cdef class {{name}}HashTable(HashTable):
with nogil:
for i in range(n):
val = values[i]
{{if float_group}}
if val == val:
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
{{endif}}
return uniques.to_array()

{{endfor}}
Expand Down Expand Up @@ -747,9 +729,6 @@ cdef class StringHashTable(HashTable):
return np.asarray(labels)


na_sentinel = object


cdef class PyObjectHashTable(HashTable):

def __init__(self, size_hint=1):
Expand All @@ -767,8 +746,7 @@ cdef class PyObjectHashTable(HashTable):
def __contains__(self, object key):
cdef khiter_t k
hash(key)
if key != key or key is None:
key = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets

Expand All @@ -780,8 +758,7 @@ cdef class PyObjectHashTable(HashTable):

cpdef get_item(self, object val):
cdef khiter_t k
if val != val or val is None:
val = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
Expand All @@ -795,8 +772,7 @@ cdef class PyObjectHashTable(HashTable):
char* buf

hash(key)
if key != key or key is None:
key = na_sentinel

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
# self.table.keys[k] = key
if kh_exist_pymap(self.table, k):
Expand All @@ -814,8 +790,6 @@ cdef class PyObjectHashTable(HashTable):
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i
Expand All @@ -831,8 +805,6 @@ cdef class PyObjectHashTable(HashTable):
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
Expand All @@ -849,24 +821,14 @@ cdef class PyObjectHashTable(HashTable):
object val
khiter_t k
ObjectVector uniques = ObjectVector()
bint seen_na = 0

for i in range(n):
val = values[i]
hash(val)

# `val is None` below is exception to prevent mangling of None and
# other NA values; note however that other NA values (ex: pd.NaT
# and np.nan) will still get mangled, so many not be a permanent
# solution; see GH 20866
if not checknull(val) or val is None:
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
elif not seen_na:
seen_na = 1
uniques.append(nan)
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)

return uniques.to_array()

Expand Down
12 changes: 12 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,18 @@ def nulls_fixture(request):
nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture


@pytest.fixture(params=[None, np.nan, pd.NaT])
def unique_nulls_fixture(request):
"""
Fixture for each null type in pandas, each null type exactly once
"""
return request.param


# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture


TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
'dateutil/Asia/Singapore']

Expand Down
5 changes: 0 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3109,7 +3109,6 @@ def get_loc(self, key, method=None, tolerance=None):
return self._engine.get_loc(key)
except KeyError:
return self._engine.get_loc(self._maybe_cast_indexer(key))

indexer = self.get_indexer([key], method=method, tolerance=tolerance)
if indexer.ndim > 1 or indexer.size > 1:
raise TypeError('get_loc requires scalar valued input')
Expand Down Expand Up @@ -4475,10 +4474,6 @@ def insert(self, loc, item):
-------
new_index : Index
"""
if is_scalar(item) and isna(item):
# GH 18295
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._ndarray_values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
is_bool,
is_bool_dtype,
is_scalar)
from pandas.core.dtypes.missing import isna

from pandas import compat
from pandas.core import algorithms
Expand Down Expand Up @@ -114,6 +115,13 @@ def is_all_dates(self):
"""
return False

@Appender(Index.insert.__doc__)
def insert(self, loc, item):
# treat NA values as nans:
if is_scalar(item) and isna(item):
item = self._na_value
return super(NumericIndex, self).insert(loc, item)


_num_index_shared_docs['class_descr'] = """
Immutable ndarray implementing an ordered, sliceable set. The basic object
Expand Down
20 changes: 18 additions & 2 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,8 +560,9 @@ def test_insert(self):
tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))

def test_insert_missing(self, nulls_fixture):
# GH 18295 (test missing)
expected = Index(['a', np.nan, 'b', 'c'])
# GH 22295
# test there is no mangling of NA values
expected = Index(['a', nulls_fixture, 'b', 'c'])
result = Index(list('abc')).insert(1, nulls_fixture)
tm.assert_index_equal(result, expected)

Expand Down Expand Up @@ -1364,6 +1365,21 @@ def test_get_indexer_numeric_index_boolean_target(self):
expected = np.array([-1, -1, -1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

def test_get_indexer_with_NA_values(self, unique_nulls_fixture,
unique_nulls_fixture2):
# GH 22332
# check pairwise, that no pair of na values
# is mangled
if unique_nulls_fixture is unique_nulls_fixture2:
return # skip it, values are not unique
arr = np.array([unique_nulls_fixture,
unique_nulls_fixture2], dtype=np.object)
index = pd.Index(arr, dtype=np.object)
result = index.get_indexer([unique_nulls_fixture,
unique_nulls_fixture2, 'Unknown'])
expected = np.array([0, 1, -1], dtype=np.int64)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
def test_get_loc(self, method):
index = pd.Index([0, 1, 2])
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,36 @@ def test_different_nans(self):
expected = np.array([np.nan])
tm.assert_numpy_array_equal(result, expected)

def test_first_nan_kept(self):
# GH 22295
# create different nans from bit-patterns:
bits_for_nan1 = 0xfff8000000000001
bits_for_nan2 = 0x7ff8000000000001
NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
assert NAN1 != NAN1
assert NAN2 != NAN2
for el_type in [np.float64, np.object]:
a = np.array([NAN1, NAN2], dtype=el_type)
result = pd.unique(a)
assert result.size == 1
# use bit patterns to identify which nan was kept:
result_nan_bits = struct.unpack("=Q",
struct.pack("d", result[0]))[0]
assert result_nan_bits == bits_for_nan1

def test_do_not_mangle_na_values(self, unique_nulls_fixture,
unique_nulls_fixture2):
# GH 22295
if unique_nulls_fixture is unique_nulls_fixture2:
return # skip it, values not unique
a = np.array([unique_nulls_fixture,
unique_nulls_fixture2], dtype=np.object)
result = pd.unique(a)
assert result.size == 2
assert a[0] is unique_nulls_fixture
assert a[1] is unique_nulls_fixture2


class TestIsin(object):

Expand Down

0 comments on commit 9c88223

Please sign in to comment.