From de4b384e5b4c1231825cba4f080fa756a2d601d0 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 24 Nov 2017 09:55:13 -0700 Subject: [PATCH] BUG: Fix IntervalIndex constructor inconsistencies (#18424) --- doc/source/whatsnew/v0.22.0.txt | 2 + pandas/_libs/interval.pyx | 4 +- pandas/core/indexes/interval.py | 57 +++++++---- pandas/tests/indexes/test_interval.py | 126 ++++++++++++++++++------- pandas/tests/indexing/test_interval.py | 2 +- 5 files changed, 134 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 90032a692fd15..92d9123d2cf4c 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -74,6 +74,7 @@ Other API Changes - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) +- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) .. _whatsnew_0220.deprecations: @@ -137,6 +138,7 @@ Indexing - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`MultiIndex.remove_unused_levels`` which would fill nan values (:issue:`18417`) +- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - I/O diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index c09642511207a..39b26c61172ed 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -211,8 +211,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals): int64_t n = len(intervals) ndarray left, right - left = np.empty(n, dtype=object) - right = np.empty(n, dtype=object) + left = np.empty(n, dtype=intervals.dtype) + right = np.empty(n, dtype=intervals.dtype) for i in range(len(intervals)): interval = intervals[i] diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c9bb8748abe7b..cca7a06a2d44b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.missing import notna, isna from pandas.core.dtypes.generic import ABCPeriodIndex from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, @@ -31,7 +32,9 @@ from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex from pandas.compat.numpy import function as nv -from pandas.core import common as com +from pandas.core.common import ( + _all_not_none, _any_none, _asarray_tuplesafe, _count_not_none, + is_bool_indexer, _maybe_box_datetimelike, _not_none) from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset @@ -176,7 +179,7 @@ class IntervalIndex(IntervalMixin, Index): _mask = None - def __new__(cls, data, closed='right', + def __new__(cls, data, closed=None, name=None, copy=False, dtype=None, fastpath=False, verify_integrity=True): @@ -197,8 +200,17 @@ def __new__(cls, data, closed='right', if is_scalar(data): cls._scalar_data_error(data) - data = IntervalIndex.from_intervals(data, name=name) - left, right, closed = data.left, data.right, data.closed + data = maybe_convert_platform(data) + left, right, infer_closed = intervals_to_interval_bounds(data) + + if _all_not_none(closed, infer_closed) and closed != infer_closed: + # GH 18421 + msg = ("conflicting values for closed: constructor got " + "'{closed}', inferred from data '{infer_closed}'" + .format(closed=closed, infer_closed=infer_closed)) + raise ValueError(msg) + + closed = closed or infer_closed return cls._simple_new(left, right, closed, name, copy=copy, verify_integrity=verify_integrity) @@ -376,7 +388,8 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - breaks = np.asarray(breaks) + breaks = maybe_convert_platform(breaks) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, name=name, copy=copy) @@ -416,8 +429,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - left = np.asarray(left) - right = np.asarray(right) + left = maybe_convert_platform(left) + right = maybe_convert_platform(right) + return cls._simple_new(left, right, closed, name=name, copy=copy, verify_integrity=True) @@ -460,8 +474,12 @@ def from_intervals(cls, data, name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) + if isinstance(data, IntervalIndex): + left, right, closed = data.left, data.right, data.closed + name = name or data.name + else: + data = maybe_convert_platform(data) + left, right, closed = intervals_to_interval_bounds(data) return cls.from_arrays(left, right, closed, name=name, copy=False) @classmethod @@ -497,8 +515,11 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): IntervalIndex.from_intervals : Construct an IntervalIndex from an array of Interval objects """ - left = [] - right = [] + if len(data): + left, right = [], [] + else: + left = right = data + for d in data: if isna(d): @@ -517,7 +538,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): return cls.from_arrays(left, right, closed, name=name, copy=False) def to_tuples(self): - return Index(com._asarray_tuplesafe(zip(self.left, self.right))) + return Index(_asarray_tuplesafe(zip(self.left, self.right))) @cache_readonly def _multiindex(self): @@ -838,7 +859,7 @@ def get_loc(self, key, method=None): return self._engine.get_loc(key) def get_value(self, series, key): - if com.is_bool_indexer(key): + if is_bool_indexer(key): loc = key elif is_list_like(key): loc = self.get_indexer(key) @@ -1166,7 +1187,7 @@ def _is_type_compatible(a, b): return ((is_number(a) and is_number(b)) or (is_ts_compat(a) and is_ts_compat(b)) or (is_td_compat(a) and is_td_compat(b)) or - com._any_none(a, b)) + _any_none(a, b)) def interval_range(start=None, end=None, periods=None, freq=None, @@ -1244,13 +1265,13 @@ def interval_range(start=None, end=None, periods=None, freq=None, -------- IntervalIndex : an Index of intervals that are all closed on the same side. """ - if com._count_not_none(start, end, periods) != 2: + if _count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and periods, ' 'exactly two must be specified') - start = com._maybe_box_datetimelike(start) - end = com._maybe_box_datetimelike(end) - endpoint = next(com._not_none(start, end)) + start = _maybe_box_datetimelike(start) + end = _maybe_box_datetimelike(end) + endpoint = next(_not_none(start, end)) if not _is_valid_endpoint(start): msg = 'start must be numeric or datetime-like, got {start}' diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 399d88309072e..b98359ea0ec4d 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,7 +6,7 @@ from pandas import (Interval, IntervalIndex, Index, isna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) -from pandas.compat import zip +from pandas.compat import lzip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -38,7 +38,7 @@ def create_index_with_nan(self, closed='right'): @pytest.mark.parametrize('name', [None, 'foo']) def test_constructors(self, closed, name): left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4]) - ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)] + ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)] expected = IntervalIndex._simple_new( left=left, right=right, closed=closed, name=name) @@ -57,7 +57,7 @@ def test_constructors(self, closed, name): tm.assert_index_equal(result, expected) result = IntervalIndex.from_tuples( - zip(left, right), closed=closed, name=name) + lzip(left, right), closed=closed, name=name) tm.assert_index_equal(result, expected) result = Index(ivs, name=name) @@ -68,6 +68,9 @@ def test_constructors(self, closed, name): tm.assert_index_equal(Index(expected), expected) tm.assert_index_equal(IntervalIndex(expected), expected) + result = IntervalIndex.from_intervals(expected) + tm.assert_index_equal(result, expected) + result = IntervalIndex.from_intervals( expected.values, name=expected.name) tm.assert_index_equal(result, expected) @@ -86,63 +89,118 @@ def test_constructors(self, closed, name): breaks, closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) - def test_constructors_other(self): - - # all-nan - result = IntervalIndex.from_intervals([np.nan]) - expected = np.array([np.nan], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) - - # empty - result = IntervalIndex.from_intervals([]) - expected = np.array([], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) + @pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50]) + def test_constructors_nan(self, closed, data): + # GH 18421 + expected_values = np.array(data, dtype=object) + expected_idx = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_idx.closed == closed + tm.assert_numpy_array_equal(expected_idx.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks([np.nan] + data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + @pytest.mark.parametrize('data', [ + [], + np.array([], dtype='int64'), + np.array([], dtype='float64'), + np.array([], dtype=object)]) + def test_constructors_empty(self, data, closed): + # GH 18421 + expected_dtype = data.dtype if isinstance(data, np.ndarray) else object + expected_values = np.array([], dtype=object) + expected_index = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_index.empty + assert expected_index.closed == closed + assert expected_index.dtype.subtype == expected_dtype + tm.assert_numpy_array_equal(expected_index.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) def test_constructors_errors(self): # scalar - msg = ('IntervalIndex(...) must be called with a collection of ' + msg = ('IntervalIndex\(...\) must be called with a collection of ' 'some kind, 5 was passed') - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex(5) # not an interval - msg = "type with value 0 is not an interval" - with pytest.raises(TypeError, message=msg): + msg = ("type <(class|type) 'numpy.int64'> with value 0 " + "is not an interval") + with tm.assert_raises_regex(TypeError, msg): IntervalIndex([0, 1]) - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex.from_intervals([0, 1]) # invalid closed msg = "invalid options for 'closed': invalid" - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') - # mismatched closed + # mismatched closed within intervals msg = 'intervals must all be closed on the same side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2, closed='left')]) - with pytest.raises(ValueError, message=msg): - IntervalIndex.from_arrays([0, 10], [3, 5]) - - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) + # mismatched closed inferred from intervals vs constructor. + msg = 'conflicting values for closed' + with tm.assert_raises_regex(ValueError, msg): + iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')] + IntervalIndex(iv, closed='neither') + # no point in nesting periods in an IntervalIndex msg = 'Period dtypes are not supported, use a PeriodIndex instead' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks( pd.period_range('2000-01-01', periods=3)) # decreasing breaks/arrays msg = 'left side of interval must be <= right side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks(range(10, -1, -1)) - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) def test_constructors_datetimelike(self, closed): @@ -865,7 +923,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is True - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) @@ -873,7 +931,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) @@ -881,7 +939,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', overwise True (GH16560) @@ -1054,10 +1112,6 @@ def test_constructor_coverage(self): end=end.to_pydatetime()) tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.tz_localize('UTC'), - end=end.tz_localize('UTC')) - tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py index 3792293f48b99..e29dc627a5d94 100644 --- a/pandas/tests/indexing/test_interval.py +++ b/pandas/tests/indexing/test_interval.py @@ -54,7 +54,7 @@ def test_getitem_with_scalar(self): def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] if direction == 'decreasing': - tpls = reversed(tpls) + tpls = tpls[::-1] idx = IntervalIndex.from_tuples(tpls, closed=closed) s = Series(list('abc'), idx)