diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a34b9feb2b2fad..e397365dc8d50c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -944,6 +944,7 @@ Indexing ^^^^^^^^ - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) +- Bug in sparse array indexing in which indices were not being validated (:issue:`15863`) - Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) diff --git a/pandas/sparse/sparse.pyx b/pandas/sparse/sparse.pyx index 00d317c42b18d4..45015b6719ce73 100644 --- a/pandas/sparse/sparse.pyx +++ b/pandas/sparse/sparse.pyx @@ -34,8 +34,9 @@ cdef inline int int_min(int a, int b): return a if a <= b else b cdef class SparseIndex: """ - Abstract superclass for sparse index types + Abstract superclass for sparse index types. """ + def __init__(self): raise NotImplementedError @@ -48,8 +49,9 @@ cdef class IntIndex(SparseIndex): ---------- length : integer indices : array-like - Contains integers corresponding to + Contains integers corresponding to the indices. """ + cdef readonly: Py_ssize_t length, npoints ndarray indices @@ -59,9 +61,11 @@ cdef class IntIndex(SparseIndex): self.indices = np.ascontiguousarray(indices, dtype=np.int32) self.npoints = len(self.indices) + self.check_integrity() + def __reduce__(self): args = (self.length, self.indices) - return (IntIndex, args) + return IntIndex, args def __repr__(self): output = 'IntIndex\n' @@ -70,10 +74,40 @@ cdef class IntIndex(SparseIndex): def check_integrity(self): """ - Only need be strictly ascending and nothing less than 0 or greater than - total length + Checks the following: + + - Indices are strictly ascending + - Number of indices is at most self.length + - Indices are at least 0 and at most the total length less one + + A ValueError is raised if any of these conditions is violated. """ - pass + + cdef: + int32_t index, prev = -1 + + if self.npoints > self.length: + msg = ("Too many indices. Expected " + "{exp} but found {act}").format( + exp=self.length, act=self.npoints) + raise ValueError(msg) + + # Indices are vacuously ordered and non-negative + # if the sequence of indices is empty. + if self.npoints == 0: + return + + if min(self.indices) < 0: + raise ValueError("No index can be less than zero") + + if max(self.indices) >= self.length: + raise ValueError("All indices must be less than the length") + + for index in self.indices: + if prev != -1 and index <= prev: + raise ValueError("Indices must be strictly increasing") + + prev = index def equals(self, other): if not isinstance(other, IntIndex): @@ -320,7 +354,7 @@ cdef class BlockIndex(SparseIndex): def __reduce__(self): args = (self.length, self.blocs, self.blengths) - return (BlockIndex, args) + return BlockIndex, args def __repr__(self): output = 'BlockIndex\n' diff --git a/pandas/tests/sparse/test_libsparse.py b/pandas/tests/sparse/test_libsparse.py index b6ab99dc66cda3..696d2cf47f4c0d 100644 --- a/pandas/tests/sparse/test_libsparse.py +++ b/pandas/tests/sparse/test_libsparse.py @@ -474,6 +474,44 @@ def test_to_block_index(self): class TestIntIndex(tm.TestCase): + def test_check_integrity(self): + + # Too many indices than specified in self.length + msg = "Too many indices" + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=1, indices=[1, 2, 3]) + + # No index can be negative. + msg = "No index can be less than zero" + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, -2, 3]) + + # No index can be negative. + msg = "No index can be less than zero" + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, -2, 3]) + + # All indices must be less than the length. + msg = "All indices must be less than the length" + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, 2, 5]) + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, 2, 6]) + + # Indices must be strictly ascending. + msg = "Indices must be strictly increasing" + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, 3, 2]) + + with tm.assertRaisesRegexp(ValueError, msg): + IntIndex(length=5, indices=[1, 3, 3]) + def test_int_internal(self): idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') self.assertIsInstance(idx, IntIndex)