diff --git a/doc/whats-new.rst b/doc/whats-new.rst index fcb14a7c29a..34f002086da 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,8 +39,8 @@ New functions/methods `NEP18 `_ compliant numpy-like library (important: read notes about NUMPY_EXPERIMENTAL_ARRAY_FUNCTION in the above link). Added explicit test coverage for - `sparse `_. (:issue:`3117`, :issue:`3202`) - By `Nezar Abdennur `_ + `sparse `_. (:issue:`3117`, :issue:`3202`). + This requires `sparse>=0.8.0`. By `Nezar Abdennur `_ and `Guido Imperiale `_. - The xarray package is now discoverable by mypy (although typing hints coverage is not diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d5cd5eb9e8f..f6570149484 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -10,7 +10,7 @@ from . import duck_array_ops, nputils, utils from .npcompat import DTypeLike -from .pycompat import dask_array_type, integer_types +from .pycompat import dask_array_type, integer_types, sparse_array_type from .utils import is_dict_like, maybe_cast_to_coords_dtype @@ -1076,19 +1076,30 @@ def _logical_any(args): return functools.reduce(operator.or_, args) -def _masked_result_drop_slice(key, chunks_hint=None): +def _masked_result_drop_slice(key, data=None): + key = (k for k in key if not isinstance(k, slice)) - if chunks_hint is not None: - key = [ - _dask_array_with_chunks_hint(k, chunks_hint) - if isinstance(k, np.ndarray) - else k - for k in key - ] - return _logical_any(k == -1 for k in key) + chunks_hint = getattr(data, "chunks", None) + + new_keys = [] + for k in key: + if isinstance(k, np.ndarray): + if isinstance(data, dask_array_type): + new_keys.append(_dask_array_with_chunks_hint(k, chunks_hint)) + elif isinstance(data, sparse_array_type): + import sparse + + new_keys.append(sparse.COO.from_numpy(k)) + else: + new_keys.append(k) + else: + new_keys.append(k) + + mask = _logical_any(k == -1 for k in new_keys) + return mask -def create_mask(indexer, shape, chunks_hint=None): +def 
create_mask(indexer, shape, data=None): """Create a mask for indexing with a fill-value. Parameters @@ -1098,25 +1109,24 @@ def create_mask(indexer, shape, chunks_hint=None): the result that should be masked. shape : tuple Shape of the array being indexed. - chunks_hint : tuple, optional - Optional tuple indicating desired chunks for the result. If provided, - used as a hint for chunks on the resulting dask. Must have a hint for - each dimension on the result array. + data : optional + Data for which mask is being created. If data is a dask array, its chunks + are used as a hint for chunks on the resulting mask. If data is a sparse + array, the returned mask is also a sparse array. Returns ------- - mask : bool, np.ndarray or dask.array.Array with dtype=bool - Dask array if chunks_hint is provided, otherwise a NumPy array. Has the - same shape as the indexing result. + mask : bool, np.ndarray, SparseArray or dask.array.Array with dtype=bool + Same type as data. Has the same shape as the indexing result. 
""" if isinstance(indexer, OuterIndexer): key = _outer_to_vectorized_indexer(indexer, shape).tuple assert not any(isinstance(k, slice) for k in key) - mask = _masked_result_drop_slice(key, chunks_hint) + mask = _masked_result_drop_slice(key, data) elif isinstance(indexer, VectorizedIndexer): key = indexer.tuple - base_mask = _masked_result_drop_slice(key, chunks_hint) + base_mask = _masked_result_drop_slice(key, data) slice_shape = tuple( np.arange(*k.indices(size)).size for k, size in zip(key, shape) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bc8da10dd0c..c64dd8af6c6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -710,8 +710,7 @@ def _getitem_with_mask(self, key, fill_value=dtypes.NA): actual_indexer = indexer data = as_indexable(self._data)[actual_indexer] - chunks_hint = getattr(data, "chunks", None) - mask = indexing.create_mask(indexer, self.shape, chunks_hint) + mask = indexing.create_mask(indexer, self.shape, data) data = duck_array_ops.where(mask, fill_value, data) else: # array cannot be indexed along dimensions of size 0, so just diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index f37f8d98ca8..82ee9b63f9d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -708,7 +708,9 @@ def test_create_mask_dask(): indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]))) expected = np.array(2 * [[False, True, False]]) - actual = indexing.create_mask(indexer, (5, 5, 5), chunks_hint=((1, 1), (2, 1))) + actual = indexing.create_mask( + indexer, (5, 5, 5), da.empty((2, 3), chunks=((1, 1), (2, 1))) + ) assert actual.chunks == ((1, 1), (2, 1)) np.testing.assert_array_equal(expected, actual) @@ -716,12 +718,14 @@ def test_create_mask_dask(): (np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])) ) expected = np.array([[False, True, True]] * 2).T - actual = indexing.create_mask(indexer, (5, 2), chunks_hint=((3,), (2,))) + actual = indexing.create_mask( + 
indexer, (5, 2), da.empty((3, 2), chunks=((3,), (2,))) + ) assert isinstance(actual, da.Array) np.testing.assert_array_equal(expected, actual) with pytest.raises(ValueError): - indexing.create_mask(indexer, (5, 2), chunks_hint=()) + indexing.create_mask(indexer, (5, 2), da.empty((5,), chunks=(1,))) def test_create_mask_error(): diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 36decf49713..80f80a93a1c 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -9,6 +9,7 @@ import xarray.ufuncs as xu from xarray import DataArray, Variable from xarray.core.npcompat import IS_NEP18_ACTIVE +from xarray.core.pycompat import sparse_array_type from . import assert_equal, assert_identical @@ -23,6 +24,12 @@ sparse = pytest.importorskip("sparse") +def assert_sparse_equal(a, b): + assert isinstance(a, sparse_array_type) + assert isinstance(b, sparse_array_type) + np.testing.assert_equal(a.todense(), b.todense()) + + def make_ndarray(shape): return np.arange(np.prod(shape)).reshape(shape) @@ -105,21 +112,9 @@ def test_variable_property(prop): (do("to_base_variable"), True), (do("transpose"), True), (do("unstack", dimensions={"x": {"x1": 5, "x2": 2}}), True), - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("identical", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), + (do("identical", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -161,11 +156,7 @@ def test_variable_property(prop): True, marks=xfail(reason="Missing implementation for np.nancumsum"), ), - param( - do("fillna", 
0), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("fillna", 0), True), param( do("item", (1, 1)), False, @@ -188,11 +179,7 @@ def test_variable_property(prop): True, # noqa marks=xfail(reason="Missing implementation for np.pad"), ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("prod"), False), param( do("quantile", q=0.5), True, @@ -219,20 +206,12 @@ def test_variable_property(prop): param( do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") ), - param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("sum"), False), param( do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") ), param(do("to_dict"), False, marks=xfail(reason="Coercion to dense")), - param( - do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), - True, - marks=xfail(reason="Coercion of dense to sparse when using sparse mask"), - ), # noqa + (do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), True), ], ids=repr, ) @@ -282,16 +261,18 @@ def setUp(self): self.var = xr.Variable(("x", "y"), self.data) def test_unary_op(self): - sparse.utils.assert_eq(-self.var.data, -self.data) - sparse.utils.assert_eq(abs(self.var).data, abs(self.data)) - sparse.utils.assert_eq(self.var.round().data, self.data.round()) + assert_sparse_equal(-self.var.data, -self.data) + assert_sparse_equal(abs(self.var).data, abs(self.data)) + assert_sparse_equal(self.var.round().data, self.data.round()) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_univariate_ufunc(self): - sparse.utils.assert_eq(np.sin(self.data), xu.sin(self.var).data) + assert_sparse_equal(np.sin(self.data), xu.sin(self.var).data) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_bivariate_ufunc(self): - sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) - 
sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(0, self.var).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(0, self.var).data) def test_repr(self): expected = dedent( @@ -304,9 +285,8 @@ def test_repr(self): def test_pickle(self): v1 = self.var v2 = pickle.loads(pickle.dumps(v1)) - sparse.utils.assert_eq(v1.data, v2.data) + assert_sparse_equal(v1.data, v2.data) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_missing_values(self): a = np.array([0, 1, np.nan, 3]) s = sparse.COO.from_numpy(a) @@ -384,16 +364,8 @@ def test_dataarray_property(prop): # TODO # set_index # swap_dims - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -414,11 +386,7 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.flip"), ), - param( - do("combine_first", make_xrarray({"x": 10, "y": 5})), - True, - marks=xfail(reason="mixed sparse-dense operation"), - ), + (do("combine_first", make_xrarray({"x": 10, "y": 5})), True), param( do("conjugate"), False, @@ -445,16 +413,8 @@ def test_dataarray_property(prop): marks=xfail(reason="Missing implementation for np.einsum"), ), param(do("dropna", "x"), False, marks=xfail(reason="Coercion to dense")), - param( - do("ffill", "x"), - False, - marks=xfail(reason="Coercion to dense via bottleneck.push"), - ), - param( - do("fillna", 0), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + param(do("ffill", "x"), False, marks=xfail(reason="Coercion to dense")), + 
(do("fillna", 0), True), param( do("interp", coords={"x": np.arange(10) + 0.5}), True, @@ -489,17 +449,9 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.nanmedian"), ), - param(do("notnull"), True), - param( - do("pipe", np.sum, axis=1), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("notnull"), True), + (do("pipe", np.sum, axis=1), True), + (do("prod"), False), param( do("quantile", q=0.5), False, @@ -526,11 +478,7 @@ def test_dataarray_property(prop): True, marks=xfail(reason="Indexing COO with more than one iterable index"), ), # noqa - param( - do("roll", x=2), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("roll", x=2, roll_coords=True), True), param( do("sel", x=[0, 1, 2], y=[2, 3]), True, @@ -539,11 +487,7 @@ def test_dataarray_property(prop): param( do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") ), - param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("sum"), False), param( do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") ), @@ -606,7 +550,6 @@ def setUp(self): self.ds_ar, coords={"x": range(4)}, dims=("x", "y"), name="foo" ) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_to_dataset_roundtrip(self): x = self.sp_xr assert_equal(x, x.to_dataset("x").to_array("x")) @@ -657,7 +600,6 @@ def test_align_2d(self): assert np.all(B1.coords["x"] == B2.coords["x"]) assert np.all(B1.coords["y"] == B2.coords["y"]) - @pytest.mark.xfail(reason="fill value leads to sparse-dense operation") def test_align_outer(self): a1 = xr.DataArray( sparse.COO.from_numpy(np.arange(4)), @@ -672,22 +614,21 @@ def test_align_outer(self): a2, b2 = xr.align(a1, b1, join="outer") assert isinstance(a2.data, 
sparse.SparseArray) assert isinstance(b2.data, sparse.SparseArray) - assert np.all(a2.coords["x"].data == ["a", "b", "c", "d"]) - assert np.all(b2.coords["x"].data == ["a", "b", "c", "d"]) + assert np.all(a2.coords["x"].data == ["a", "b", "c", "d", "e"]) + assert np.all(b2.coords["x"].data == ["a", "b", "c", "d", "e"]) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_concat(self): ds1 = xr.Dataset(data_vars={"d": self.sp_xr}) ds2 = xr.Dataset(data_vars={"d": self.sp_xr}) ds3 = xr.Dataset(data_vars={"d": self.sp_xr}) out = xr.concat([ds1, ds2, ds3], dim="x") - sparse.utils.assert_eq( + assert_sparse_equal( out["d"].data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=0), ) out = xr.concat([self.sp_xr, self.sp_xr, self.sp_xr], dim="y") - sparse.utils.assert_eq( + assert_sparse_equal( out.data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=1) ) @@ -706,6 +647,7 @@ def test_stack(self): roundtripped = stacked.unstack() assert arr.identical(roundtripped) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_ufuncs(self): x = self.sp_xr assert_equal(np.sin(x), xu.sin(x)) @@ -830,8 +772,8 @@ def test_groupby_first(self): def test_groupby_bins(self): x1 = self.ds_xr x2 = self.sp_xr - m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum() - m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum() + m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) + m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) assert isinstance(m2.data, sparse.SparseArray) assert np.allclose(m1.data, m2.data.todense())