Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix IntervalIndex.insert to allow inserting NaN #18300

Merged
merged 6 commits into from
Nov 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Other API Changes
- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`)
- :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`)
- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`)
- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`)


.. _whatsnew_0220.deprecations:
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3734,6 +3734,10 @@ def insert(self, loc, item):
-------
new_index : Index
"""
if is_scalar(item) and isna(item):
# GH 18295
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
is_scalar)
from pandas.core.common import (_asarray_tuplesafe,
_values_from_object)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core.dtypes.missing import array_equivalent, isna
from pandas.core.algorithms import take_1d


Expand Down Expand Up @@ -690,7 +690,7 @@ def insert(self, loc, item):

"""
code = self.categories.get_indexer([item])
if (code == -1):
if (code == -1) and not (is_scalar(item) and isna(item)):
raise TypeError("cannot insert an item into a CategoricalIndex "
"that is not already an existing category")

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1757,6 +1757,9 @@ def insert(self, loc, item):
-------
new_index : Index
"""
if is_scalar(item) and isna(item):
# GH 18295
item = self._na_value

freq = None

Expand All @@ -1773,14 +1776,14 @@ def insert(self, loc, item):
elif (loc == len(self)) and item - self.freq == self[-1]:
freq = self.freq
item = _to_m8(item, tz=self.tz)

try:
new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
self[loc:].asi8))
if self.tz is not None:
new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz)
return DatetimeIndex(new_dates, name=self.name, freq=freq,
tz=self.tz)

except (AttributeError, TypeError):

# fall back to object index
Expand Down
23 changes: 15 additions & 8 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,14 +1001,21 @@ def delete(self, loc):
return self._shallow_copy(new_left, new_right)

def insert(self, loc, item):
if not isinstance(item, Interval):
raise ValueError('can only insert Interval objects into an '
'IntervalIndex')
if not item.closed == self.closed:
raise ValueError('inserted item must be closed on the same side '
'as the index')
new_left = self.left.insert(loc, item.left)
new_right = self.right.insert(loc, item.right)
if isinstance(item, Interval):
if item.closed != self.closed:
raise ValueError('inserted item must be closed on the same '
'side as the index')
left_insert = item.left
right_insert = item.right
elif is_scalar(item) and isna(item):
# GH 18295
left_insert = right_insert = item
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes u need to use a nan compat with left iow this could be a NaT

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we have a left.na_value iirc (might be spelled differently)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or maybe the underlying already handles this in the insert

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated. Procedure I'm following is "check if any type of NA value is passed -> raise if the wrong type of NA is passed". I suppose I could just bypass this and only check if the right type of NA is passed, if that would be preferred.

else:
raise ValueError('can only insert Interval objects and NA into '
'an IntervalIndex')

new_left = self.left.insert(loc, left_insert)
new_right = self.right.insert(loc, right_insert)
return self._shallow_copy(new_left, new_right)

def _as_like_interval_index(self, other, error_msg):
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,16 +852,18 @@ def insert(self, loc, item):
-------
new_index : Index
"""

# try to convert if possible
if _is_convertible_to_td(item):
try:
item = Timedelta(item)
except Exception:
pass
elif is_scalar(item) and isna(item):
# GH 18295
item = self._na_value

freq = None
if isinstance(item, Timedelta) or item is NaT:
if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)):

# check freq can be preserved on edge cases
if self.freq is not None:
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexes/datetimes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ def test_insert(self):
assert result.tz == expected.tz
assert result.freq is None

# GH 18295 (test missing)
expected = DatetimeIndex(
['20170101', pd.NaT, '20170102', '20170103', '20170104'])
for na in (np.nan, pd.NaT, None):
result = date_range('20170101', periods=4).insert(1, na)
tm.assert_index_equal(result, expected)

def test_delete(self):
idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx')

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/indexes/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,3 +697,11 @@ def test_join_self(self, how):
index = period_range('1/1/2000', periods=10)
joined = index.join(index, how=how)
assert index is joined

def test_insert(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in another PR, can come back and move test_insert to datetimelike.py where this can be tested generically (instead of having code inside each datetimelike index type)

# GH 18295 (test missing)
expected = PeriodIndex(
['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q')
for na in (np.nan, pd.NaT, None):
result = period_range('2017Q1', periods=4, freq='Q').insert(1, na)
tm.assert_index_equal(result, expected)
6 changes: 6 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,12 @@ def test_insert(self):
null_index = Index([])
tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))

# GH 18295 (test missing)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can also be done generically

expected = Index(['a', np.nan, 'b', 'c'])
for na in (np.nan, pd.NaT, None):
result = Index(list('abc')).insert(1, na)
tm.assert_index_equal(result, expected)

def test_delete(self):
idx = Index(['a', 'b', 'c', 'd'], name='idx')

Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,12 @@ def test_insert(self):
# invalid
pytest.raises(TypeError, lambda: ci.insert(0, 'd'))

# GH 18295 (test missing)
expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b'])
for na in (np.nan, pd.NaT, None):
result = CategoricalIndex(list('aabcb')).insert(1, na)
tm.assert_index_equal(result, expected)

def test_delete(self):

ci = self.create_index()
Expand Down
50 changes: 43 additions & 7 deletions pandas/tests/indexes/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,14 +366,50 @@ def test_delete(self, closed):
result = self.create_index(closed=closed).delete(0)
tm.assert_index_equal(result, expected)

def test_insert(self):
expected = IntervalIndex.from_breaks(range(4))
actual = self.index.insert(2, Interval(2, 3))
assert expected.equals(actual)
@pytest.mark.parametrize('data', [
interval_range(0, periods=10, closed='neither'),
interval_range(1.7, periods=8, freq=2.5, closed='both'),
interval_range(Timestamp('20170101'), periods=12, closed='left'),
interval_range(Timedelta('1 day'), periods=6, closed='right'),
IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]),
IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])])
def test_insert(self, data):
item = data[0]
idx_item = IntervalIndex([item])

# start
expected = idx_item.append(data)
result = data.insert(0, item)
tm.assert_index_equal(result, expected)

# end
expected = data.append(idx_item)
result = data.insert(len(data), item)
tm.assert_index_equal(result, expected)

# mid
expected = data[:3].append(idx_item).append(data[3:])
result = data.insert(3, item)
tm.assert_index_equal(result, expected)

# invalid type
msg = 'can only insert Interval objects and NA into an IntervalIndex'
with tm.assert_raises_regex(ValueError, msg):
data.insert(1, 'foo')

pytest.raises(ValueError, self.index.insert, 0, 1)
pytest.raises(ValueError, self.index.insert, 0,
Interval(2, 3, closed='left'))
# invalid closed
msg = 'inserted item must be closed on the same side as the index'
for closed in {'left', 'right', 'both', 'neither'} - {item.closed}:
with tm.assert_raises_regex(ValueError, msg):
bad_item = Interval(item.left, item.right, closed=closed)
data.insert(1, bad_item)

# GH 18295 (test missing)
na_idx = IntervalIndex([np.nan], closed=data.closed)
for na in (np.nan, pd.NaT, None):
expected = data[:1].append(na_idx).append(data[1:])
result = data.insert(1, na)
tm.assert_index_equal(result, expected)

def test_take(self, closed):
index = self.create_index(closed=closed)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexes/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,13 @@ def test_where(self, klass):
result = i.where(klass(cond))
tm.assert_index_equal(result, expected)

def test_insert(self):
# GH 18295 (test missing)
expected = Float64Index([0, np.nan, 1, 2, 3, 4])
for na in (np.nan, pd.NaT, None):
result = self.create_index().insert(1, na)
tm.assert_index_equal(result, expected)


class TestFloat64Index(Numeric):
_holder = Float64Index
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,12 @@ def test_insert(self):
# test 0th element
tm.assert_index_equal(idx[0:4], result.insert(0, idx[0]))

# GH 18295 (test missing)
expected = Float64Index([0, np.nan, 1, 2, 3, 4])
for na in (np.nan, pd.NaT, None):
result = RangeIndex(5).insert(1, na)
tm.assert_index_equal(result, expected)

def test_delete(self):

idx = RangeIndex(5, name='Foo')
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/timedeltas/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ def test_insert(self):
assert result.name == expected.name
assert result.freq == expected.freq

# GH 18295 (test missing)
expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

side issue, we have lots of duplication on these test_insert/delete and such in the hierarchy and most can simply be tested via fixture / in superclass (but will require some refactoring in the tests). I think we have an issue about this.

for na in (np.nan, pd.NaT, None):
result = timedelta_range('1day', '3day').insert(1, na)
tm.assert_index_equal(result, expected)

def test_delete(self):
idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx')

Expand Down