From 8986439c269a1e0bf08fc3d00986af6b7480beb1 Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 16 Nov 2017 18:41:59 -0700 Subject: [PATCH] Generalize NA Compat --- doc/source/whatsnew/v0.21.1.txt | 1 - doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/base.py | 4 ++ pandas/core/indexes/category.py | 4 +- pandas/core/indexes/datetimes.py | 5 +- pandas/core/indexes/interval.py | 8 +-- pandas/core/indexes/timedeltas.py | 6 +- .../tests/indexes/datetimes/test_indexing.py | 7 ++ pandas/tests/indexes/period/test_period.py | 8 +++ pandas/tests/indexes/test_base.py | 6 ++ pandas/tests/indexes/test_category.py | 6 ++ pandas/tests/indexes/test_interval.py | 66 +++++++++++-------- pandas/tests/indexes/test_numeric.py | 14 ++++ pandas/tests/indexes/test_range.py | 6 ++ .../tests/indexes/timedeltas/test_indexing.py | 6 ++ 15 files changed, 109 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 8362a49f4c14b6..0ab536f2898c75 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -62,7 +62,6 @@ Bug Fixes - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) -- Bug in ``IntervalIndex.insert`` when attempting to insert ``NaN`` (:issue:`18295`) Conversion ^^^^^^^^^^ diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2c0ccd377492a9..d47ef3abfdf944 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -47,7 +47,7 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) - +- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 57454e6fce118c..37bf8f980384ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3728,6 +3728,10 @@ def insert(self, loc, item): ------- new_index : Index """ + if lib.checknull(item): + # GH 18295 + item = self._na_value + _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._values idx = np.concatenate((_self[:loc], item, _self[loc:])) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3812ed96b6c36b..bd4cfcb6097199 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,5 @@ import numpy as np -from pandas._libs import index as libindex +from pandas._libs import index as libindex, lib from pandas import compat from pandas.compat.numpy import function as nv @@ -688,7 +688,7 @@ def insert(self, loc, item): """ code = self.categories.get_indexer([item]) - if (code == -1): + if (code == -1) and not lib.checknull(item): raise TypeError("cannot insert an item into a CategoricalIndex " "that is not already an existing category") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ba96979435f813..1df9d766f3a44b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1751,6 +1751,9 @@ def insert(self, loc, item): ------- new_index : Index """ + if lib.checknull(item): + # GH 18295 + item = self._na_value freq = None @@ -1767,6 +1770,7 @@ def insert(self, loc, item): elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = _to_m8(item, tz=self.tz) + try: new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) @@ -1774,7 +1778,6 @@ def insert(self, loc, item): new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) - except (AttributeError, TypeError): # fall back to object index diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c4db8f5eca0c8e..0ad2d5d6cd438e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -22,7 +22,7 @@ Index, _ensure_index, default_pprint, _index_shared_docs) -from pandas._libs import Timestamp, Timedelta +from pandas._libs import lib, Timestamp, Timedelta from pandas._libs.interval import ( Interval, IntervalMixin, IntervalTree, intervals_to_interval_bounds) @@ -985,12 +985,8 @@ def insert(self, loc, item): 'side as the index') left_insert = item.left right_insert = item.right - elif is_scalar(item) and isna(item): + elif lib.checknull(item): # GH 18295 - if item is not self.left._na_value: - raise TypeError('cannot insert with incompatible NA value: ' - 'got {item}, expected {na}' - .format(item=item, na=self.left._na_value)) left_insert = right_insert = item else: raise ValueError('can only insert Interval objects and NA into ' diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c592aa9608d970..aa15798050f6de 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -849,16 +849,18 @@ def insert(self, loc, item): ------- new_index : Index """ - # try to convert if possible if _is_convertible_to_td(item): try: item = Timedelta(item) except Exception: pass + elif lib.checknull(item): + # GH 18295 + item = self._na_value freq = None - if isinstance(item, Timedelta) or item is NaT: + if isinstance(item, Timedelta) or (item is self._na_value): # check freq can be preserved on edge cases if self.freq is not None: diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 4ce9441d87970e..b3ce22962d5d49 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -145,6 +145,13 @@ def test_insert(self): assert result.tz == expected.tz assert result.freq is None + # GH 18295 (test missing) + expected = DatetimeIndex( + ['20170101', pd.NaT, '20170102', '20170103', '20170104']) + for na in (np.nan, pd.NaT, None): + result = date_range('20170101', periods=4).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e5ee078d3558dc..bd2ece271eec48 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -706,3 +706,11 @@ def test_join_self(self, how): index = period_range('1/1/2000', periods=10) joined = index.join(index, how=how) assert index is joined + + def test_insert(self): + # GH 18295 (test missing) + expected = PeriodIndex( + ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q') + for na in (np.nan, pd.NaT, None): + result = period_range('2017Q1', periods=4, freq='Q').insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 307cda7f2d1cb7..d9eea1868346e6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -442,6 +442,12 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) + # GH 18295 (test missing) + expected = Index(['a', np.nan, 'b', 'c']) + for na in (np.nan, pd.NaT, None): + result = Index(list('abc')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = Index(['a', 'b', 'c', 'd'], name='idx') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5e40e06d574135..638fe2c1818114 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -353,6 +353,12 @@ def test_insert(self): # invalid pytest.raises(TypeError, lambda: ci.insert(0, 'd')) + # GH 18295 (test missing) + expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) + for na in (np.nan, pd.NaT, None): + result = CategoricalIndex(list('aabcb')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index d7447d82632a99..4bde1f63edeb4b 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -246,38 +246,50 @@ def test_delete(self): actual = self.index.delete(0) assert expected.equals(actual) - def test_insert(self): - expected = IntervalIndex.from_breaks(range(4)) - actual = self.index.insert(2, Interval(2, 3)) - assert expected.equals(actual) - - pytest.raises(ValueError, self.index.insert, 0, 1) - pytest.raises(ValueError, self.index.insert, 0, - Interval(2, 3, closed='left')) - @pytest.mark.parametrize('data', [ - interval_range(0, periods=10), - interval_range(1.7, periods=8, freq=2.5), - interval_range(Timestamp('20170101'), periods=12), - interval_range(Timedelta('1 day'), periods=6), + interval_range(0, periods=10, closed='neither'), + interval_range(1.7, periods=8, freq=2.5, closed='both'), + interval_range(Timestamp('20170101'), periods=12, closed='left'), + interval_range(Timedelta('1 day'), periods=6, closed='right'), IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]), IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])]) - def test_insert_na(self, data): - # GH 18295 - valid_na, invalid_na = np.nan, pd.NaT - if data.left._na_value is pd.NaT: - valid_na, invalid_na = invalid_na, valid_na - - # valid insertion - expected = IntervalIndex([data[0], np.nan]).append(data[1:]) - result = data.insert(1, valid_na) + def test_insert(self, data): + item = data[0] + idx_item = IntervalIndex([item], closed=data.closed) + + # start + expected = idx_item.append(data) + result = data.insert(0, item) tm.assert_index_equal(result, expected) - # invalid insertion - msg = ('cannot insert with incompatible NA value: got {invalid}, ' - 'expected {valid}').format(invalid=invalid_na, valid=valid_na) - with tm.assert_raises_regex(TypeError, msg): - data.insert(1, invalid_na) + # end + expected = data.append(idx_item) + result = data.insert(len(data), item) + tm.assert_index_equal(result, expected) + + # mid + expected = data[:3].append(idx_item).append(data[3:]) + result = data.insert(3, item) + tm.assert_index_equal(result, expected) + + # invalid type + msg = 'can only insert Interval objects and NA into an IntervalIndex' + with tm.assert_raises_regex(ValueError, msg): + data.insert(1, 'foo') + + # invalid closed + msg = 'inserted item must be closed on the same side as the index' + for closed in {'left', 'right', 'both', 'neither'} - {item.closed}: + with tm.assert_raises_regex(ValueError, msg): + bad_item = Interval(item.left, item.right, closed=closed) + data.insert(1, bad_item) + + # GH 18295 (test missing) + na_idx = IntervalIndex([np.nan], closed=data.closed) + for na in (np.nan, pd.NaT, None): + expected = data[:1].append(na_idx).append(data[1:]) + result = data.insert(1, na) + tm.assert_index_equal(result, expected) def test_take(self): actual = self.index.take([0, 1]) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a96c677852339d..7b82591174b65f 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -175,6 +175,13 @@ def test_modulo(self): expected = Index(index.values % 2) tm.assert_index_equal(index % 2, expected) + def test_insert(self): + # GH 18295 (test missing) + expected = Float64Index([0, np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = self.create_index().insert(1, na) + tm.assert_index_equal(result, expected) + class TestFloat64Index(Numeric): _holder = Float64Index @@ -1206,3 +1213,10 @@ def test_join_outer(self): tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) + + def test_insert(self): + # GH 18295 (test missing) + expected = UInt64Index([0, 0, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = self.create_index().insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7d88b547746f64..71ea90649c6920 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -295,6 +295,12 @@ def test_insert(self): # test 0th element tm.assert_index_equal(idx[0:4], result.insert(0, idx[0])) + # GH 18295 (test missing) + expected = Float64Index([0, np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = RangeIndex(5).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = RangeIndex(5, name='Foo') diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index cb88bac6386f71..e64c4e6ac54a53 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -57,6 +57,12 @@ def test_insert(self): assert result.name == expected.name assert result.freq == expected.freq + # GH 18295 (test missing) + expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + for na in (np.nan, pd.NaT, None): + result = timedelta_range('1day', '3day').insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx')