BUG: don't mangle NaN-float-values and pd.NaT (GH 22295) (pandas-dev…

…#22296)
Sup3rGeo · Sep 30, 2018 · 9c88223 · 9c88223
1 parent ce3e7fd
commit 9c88223
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 53 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -721,13 +721,16 @@ Indexing
 - Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`)
 - ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`)
 - Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`)
+- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`)
 
 Missing
 ^^^^^^^
 
 - Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
 - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
-- :func:`Series.isin` now treats all nans as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
+- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
+- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`)
+
 
 MultiIndex
 ^^^^^^^^^^

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -470,7 +470,6 @@ cdef class {{name}}HashTable(HashTable):
            int ret = 0
            {{dtype}}_t val
            khiter_t k
-           bint seen_na = 0
            {{name}}Vector uniques = {{name}}Vector()
            {{name}}VectorData *ud
 
@@ -479,30 +478,13 @@ cdef class {{name}}HashTable(HashTable):
         with nogil:
             for i in range(n):
                 val = values[i]
-                {{if float_group}}
-                if val == val:
-                    k = kh_get_{{dtype}}(self.table, val)
-                    if k == self.table.n_buckets:
-                        kh_put_{{dtype}}(self.table, val, &ret)
-                        if needs_resize(ud):
-                            with gil:
-                                uniques.resize()
-                        append_data_{{dtype}}(ud, val)
-                elif not seen_na:
-                    seen_na = 1
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, NAN)
-                {{else}}
                 k = kh_get_{{dtype}}(self.table, val)
                 if k == self.table.n_buckets:
                     kh_put_{{dtype}}(self.table, val, &ret)
                     if needs_resize(ud):
                         with gil:
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                {{endif}}
         return uniques.to_array()
 
 {{endfor}}
@@ -747,9 +729,6 @@ cdef class StringHashTable(HashTable):
         return np.asarray(labels)
 
 
-na_sentinel = object
-
-
 cdef class PyObjectHashTable(HashTable):
 
     def __init__(self, size_hint=1):
@@ -767,8 +746,7 @@ cdef class PyObjectHashTable(HashTable):
     def __contains__(self, object key):
         cdef khiter_t k
         hash(key)
-        if key != key or key is None:
-            key = na_sentinel
+
         k = kh_get_pymap(self.table, <PyObject*>key)
         return k != self.table.n_buckets
 
@@ -780,8 +758,7 @@ cdef class PyObjectHashTable(HashTable):
 
     cpdef get_item(self, object val):
         cdef khiter_t k
-        if val != val or val is None:
-            val = na_sentinel
+
         k = kh_get_pymap(self.table, <PyObject*>val)
         if k != self.table.n_buckets:
             return self.table.vals[k]
@@ -795,8 +772,7 @@ cdef class PyObjectHashTable(HashTable):
             char* buf
 
         hash(key)
-        if key != key or key is None:
-            key = na_sentinel
+
         k = kh_put_pymap(self.table, <PyObject*>key, &ret)
         # self.table.keys[k] = key
         if kh_exist_pymap(self.table, k):
@@ -814,8 +790,6 @@ cdef class PyObjectHashTable(HashTable):
         for i in range(n):
             val = values[i]
             hash(val)
-            if val != val or val is None:
-                val = na_sentinel
 
             k = kh_put_pymap(self.table, <PyObject*>val, &ret)
             self.table.vals[k] = i
@@ -831,8 +805,6 @@ cdef class PyObjectHashTable(HashTable):
         for i in range(n):
             val = values[i]
             hash(val)
-            if val != val or val is None:
-                val = na_sentinel
 
             k = kh_get_pymap(self.table, <PyObject*>val)
             if k != self.table.n_buckets:
@@ -849,24 +821,14 @@ cdef class PyObjectHashTable(HashTable):
             object val
             khiter_t k
             ObjectVector uniques = ObjectVector()
-            bint seen_na = 0
 
         for i in range(n):
             val = values[i]
             hash(val)
-
-            # `val is None` below is exception to prevent mangling of None and
-            # other NA values; note however that other NA values (ex: pd.NaT
-            # and np.nan) will still get mangled, so many not be a permanent
-            # solution; see GH 20866
-            if not checknull(val) or val is None:
-                k = kh_get_pymap(self.table, <PyObject*>val)
-                if k == self.table.n_buckets:
-                    kh_put_pymap(self.table, <PyObject*>val, &ret)
-                    uniques.append(val)
-            elif not seen_na:
-                seen_na = 1
-                uniques.append(nan)
+            k = kh_get_pymap(self.table, <PyObject*>val)
+            if k == self.table.n_buckets:
+                kh_put_pymap(self.table, <PyObject*>val, &ret)
+                uniques.append(val)
 
         return uniques.to_array()
 

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -286,6 +286,18 @@ def nulls_fixture(request):
 nulls_fixture2 = nulls_fixture  # Generate cartesian product of nulls_fixture
 
 
+@pytest.fixture(params=[None, np.nan, pd.NaT])
+def unique_nulls_fixture(request):
+    """
+    Fixture for each null type in pandas, each null type exactly once
+    """
+    return request.param
+
+
+# Generate cartesian product of unique_nulls_fixture:
+unique_nulls_fixture2 = unique_nulls_fixture
+
+
 TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
              'dateutil/Asia/Singapore']
 

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3109,7 +3109,6 @@ def get_loc(self, key, method=None, tolerance=None):
                 return self._engine.get_loc(key)
             except KeyError:
                 return self._engine.get_loc(self._maybe_cast_indexer(key))
-
         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
         if indexer.ndim > 1 or indexer.size > 1:
             raise TypeError('get_loc requires scalar valued input')
@@ -4475,10 +4474,6 @@ def insert(self, loc, item):
         -------
         new_index : Index
         """
-        if is_scalar(item) and isna(item):
-            # GH 18295
-            item = self._na_value
-
         _self = np.asarray(self)
         item = self._coerce_scalar_to_index(item)._ndarray_values
         idx = np.concatenate((_self[:loc], item, _self[loc:]))

diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -9,6 +9,7 @@
     is_bool,
     is_bool_dtype,
     is_scalar)
+from pandas.core.dtypes.missing import isna
 
 from pandas import compat
 from pandas.core import algorithms
@@ -114,6 +115,13 @@ def is_all_dates(self):
         """
         return False
 
+    @Appender(Index.insert.__doc__)
+    def insert(self, loc, item):
+        # treat NA values as nans:
+        if is_scalar(item) and isna(item):
+            item = self._na_value
+        return super(NumericIndex, self).insert(loc, item)
+
 
 _num_index_shared_docs['class_descr'] = """
     Immutable ndarray implementing an ordered, sliceable set. The basic object

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -560,8 +560,9 @@ def test_insert(self):
         tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))
 
     def test_insert_missing(self, nulls_fixture):
-        # GH 18295 (test missing)
-        expected = Index(['a', np.nan, 'b', 'c'])
+        # GH 22295
+        # test there is no mangling of NA values
+        expected = Index(['a', nulls_fixture, 'b', 'c'])
         result = Index(list('abc')).insert(1, nulls_fixture)
         tm.assert_index_equal(result, expected)
 
@@ -1364,6 +1365,21 @@ def test_get_indexer_numeric_index_boolean_target(self):
         expected = np.array([-1, -1, -1], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_get_indexer_with_NA_values(self, unique_nulls_fixture,
+                                        unique_nulls_fixture2):
+        # GH 22332
+        # check pairwise, that no pair of na values
+        # is mangled
+        if unique_nulls_fixture is unique_nulls_fixture2:
+            return  # skip it, values are not unique
+        arr = np.array([unique_nulls_fixture,
+                        unique_nulls_fixture2], dtype=np.object)
+        index = pd.Index(arr, dtype=np.object)
+        result = index.get_indexer([unique_nulls_fixture,
+                                    unique_nulls_fixture2, 'Unknown'])
+        expected = np.array([0, 1, -1], dtype=np.int64)
+        tm.assert_numpy_array_equal(result, expected)
+
     @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
     def test_get_loc(self, method):
         index = pd.Index([0, 1, 2])

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -520,6 +520,36 @@ def test_different_nans(self):
         expected = np.array([np.nan])
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_first_nan_kept(self):
+        # GH 22295
+        # create different nans from bit-patterns:
+        bits_for_nan1 = 0xfff8000000000001
+        bits_for_nan2 = 0x7ff8000000000001
+        NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        for el_type in [np.float64, np.object]:
+            a = np.array([NAN1, NAN2], dtype=el_type)
+            result = pd.unique(a)
+            assert result.size == 1
+            # use bit patterns to identify which nan was kept:
+            result_nan_bits = struct.unpack("=Q",
+                                            struct.pack("d", result[0]))[0]
+            assert result_nan_bits == bits_for_nan1
+
+    def test_do_not_mangle_na_values(self, unique_nulls_fixture,
+                                     unique_nulls_fixture2):
+        # GH 22295
+        if unique_nulls_fixture is unique_nulls_fixture2:
+            return  # skip it, values not unique
+        a = np.array([unique_nulls_fixture,
+                      unique_nulls_fixture2], dtype=np.object)
+        result = pd.unique(a)
+        assert result.size == 2
+        assert a[0] is unique_nulls_fixture
+        assert a[1] is unique_nulls_fixture2
+
 
 class TestIsin(object):