Skip to content

Commit

Permalink
PERF: Faster SparseArray.__get_item__ for boolean masks (pandas-dev#2…
Browse files Browse the repository at this point in the history
  • Loading branch information
bdrum committed Dec 18, 2021
1 parent 097322f commit 7973c8b
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 9 deletions.
27 changes: 18 additions & 9 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
)
from pandas.core.dtypes.common import (
is_array_like,
is_bool,
is_bool_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
Expand Down Expand Up @@ -181,9 +182,8 @@ def _sparse_array_op(
ltype = SparseDtype(subtype, left.fill_value)
rtype = SparseDtype(subtype, right.fill_value)

# TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
left = left.astype(ltype)
right = right.astype(rtype)
left = left.astype(ltype, copy=False)
right = right.astype(rtype, copy=False)
dtype = ltype.subtype
else:
dtype = ltype
Expand Down Expand Up @@ -701,7 +701,11 @@ def isna(self):
# If null fill value, we want SparseDtype[bool, true]
# to preserve the same memory usage.
dtype = SparseDtype(bool, self._null_fill_value)
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
if self._null_fill_value:
return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
mask = np.full(len(self), False, dtype=np.bool8)
mask[self.sp_index.indices] = isna(self.sp_values)
return type(self)(mask, fill_value=False, dtype=dtype)

def fillna(
self: SparseArrayT,
Expand Down Expand Up @@ -945,13 +949,18 @@ def __getitem__(
)

else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
# key's fill_value for True / False, and then do an intersection
# on the indices of the sp_values.
if isinstance(key, SparseArray):
if is_bool_dtype(key):
key = key.to_dense()
if is_bool(key.fill_value):
msk = np.full(
shape=len(self),
fill_value=key.fill_value,
dtype=np.bool8,
)
msk[key.sp_index.indices] = not key.fill_value
return self.take(np.arange(len(self), dtype=np.int32)[msk])
else:
key = key.to_dense()
else:
key = np.asarray(key)

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,16 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
assert arr.dtype == dtype
assert exp.dtype == dtype

# GH 23122
def test_get_item_bool_sparse_array(self):
    """Index ``self.arr`` with boolean ``SparseArray`` masks (GH 23122).

    Exercises both mask fill values: first a mask whose ``fill_value`` is
    ``True``, then its element-wise inverse with ``fill_value=False``.
    """
    # ``np.bool_`` replaces the ``np.bool8`` alias, which was deprecated in
    # NumPy 1.24 and removed in NumPy 2.0; both name the same scalar type.

    # Mask that is True at the odd positions, stored with fill_value=True.
    spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
    exp = SparseArray([np.nan, 2, np.nan, 5, 6])
    tm.assert_sp_array_equal(self.arr[spar_bool], exp)

    # Inverted mask (True at the even positions), stored with fill_value=False.
    spar_bool = SparseArray(~spar_bool.to_dense(), dtype=np.bool_, fill_value=False)
    exp = SparseArray([np.nan, 1, 3, 4, np.nan])
    tm.assert_sp_array_equal(self.arr[spar_bool], exp)

def test_get_item(self):

assert np.isnan(self.arr[1])
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@ def test_isna(self, data_missing):
expected = pd.Series([], dtype=expected_dtype)
self.assert_series_equal(result, expected)

# test isna for arr without na
data_missing = data_missing.fillna(0)
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
self.assert_equal(pd.isna(data_missing), expected)

def test_fillna_limit_pad(self, data_missing):
with tm.assert_produces_warning(PerformanceWarning):
super().test_fillna_limit_pad(data_missing)
Expand Down

0 comments on commit 7973c8b

Please sign in to comment.