Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster unstacking to sparse #5577

Merged
merged 17 commits into from
Dec 3, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
"bottleneck": ["", null],
"dask": [""],
"distributed": [""],
"sparse": [""]
},


Expand Down
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def requires_dask():
raise NotImplementedError()


def requires_sparse():
    """Abort benchmark setup when the ``sparse`` package is unavailable.

    Raises NotImplementedError (the ASV convention for skipping a
    benchmark from ``setup``) if ``sparse`` cannot be imported.
    """
    try:
        import sparse  # noqa: F401
    except ImportError:
        raise NotImplementedError()


def randn(shape, frac_nan=None, chunks=None, seed=0):
rng = np.random.RandomState(seed)
if chunks is None:
Expand Down
37 changes: 36 additions & 1 deletion asv_bench/benchmarks/unstacking.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import numpy as np
import pandas as pd

import xarray as xr

from . import requires_dask
from . import requires_dask, requires_sparse


class Unstacking:
Expand All @@ -27,3 +28,37 @@ def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.da_full = self.da_full.chunk({"flat_dim": 25})


class UnstackingSparse(Unstacking):
    # ASV benchmarks for ``unstack(sparse=True)``; inherits the dense
    # unstacking benchmarks from ``Unstacking``.

    def setup(self, *args, **kwargs):
        # ``requires_sparse`` raises NotImplementedError when ``sparse`` is
        # not installed, which ASV interprets as "skip this benchmark".
        requires_sparse()

        import sparse

        # Random sparse 2-D input, stacked into a single flat dimension
        # (``[...]`` stacks all remaining dimensions).
        data = sparse.random((500, 1000), random_state=0, fill_value=0)
        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
        self.da_missing = self.da_full[:-1]

        # MultiIndex whose two levels are both arange(500): each stacked
        # entry sits at (i, i), so unstacking yields a diagonal ("eye")
        # pattern — a very sparse result.
        mindex = pd.MultiIndex.from_arrays([np.arange(500), np.arange(500)])
        self.da_eye_2d = xr.DataArray(np.ones((500,)), dims="z", coords={"z": mindex})
        self.da_eye_3d = xr.DataArray(
            np.ones((500, 50)),
            dims=("z", "foo"),
            coords={"z": mindex, "foo": np.arange(50)},
        )

    def time_unstack_to_sparse_2d(self):
        # Time: 1-D stacked array unstacked to a sparse 2-D result.
        self.da_eye_2d.unstack(sparse=True)

    def time_unstack_to_sparse_3d(self):
        # Time: same, with an extra dense dimension carried along.
        self.da_eye_3d.unstack(sparse=True)

    def peakmem_unstack_to_sparse_2d(self):
        # Peak-memory counterpart of the 2-D timing benchmark.
        self.da_eye_2d.unstack(sparse=True)

    def peakmem_unstack_to_sparse_3d(self):
        # Peak-memory counterpart of the 3-D timing benchmark.
        self.da_eye_3d.unstack(sparse=True)

    def time_unstack_pandas_slow(self):
        # No-op: presumably overrides an inherited pandas-path benchmark
        # that is not applicable to sparse input — confirm against the
        # ``Unstacking`` base class.
        pass
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ Deprecations
passed alongside ``combine='by_coords'``.
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Performance
~~~~~~~~~~~

- Significantly faster unstacking to a ``sparse`` array. :pull:`5577`
By `Deepak Cherian <https://github.com/dcherian>`_.


dcherian marked this conversation as resolved.
Show resolved Hide resolved
Bug fixes
~~~~~~~~~

Expand Down
25 changes: 12 additions & 13 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4022,7 +4022,9 @@ def ensure_stackable(val):

return data_array

def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
def _unstack_once(
self, dim: Hashable, fill_value, sparse: bool = False
) -> "Dataset":
index = self.get_index(dim)
index = remove_unused_levels_categories(index)

Expand All @@ -4038,7 +4040,7 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
fill_value_ = fill_value

variables[name] = var._unstack_once(
index=index, dim=dim, fill_value=fill_value_
index=index, dim=dim, fill_value=fill_value_, sparse=sparse
)
else:
variables[name] = var
Expand Down Expand Up @@ -4160,30 +4162,27 @@ def unstack(
# function requires.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
any(is_duck_dask_array(v.data) for v in self.variables.values())
# Sparse doesn't currently support (though we could special-case
# it)
# https://github.com/pydata/sparse/issues/422
# sparse.COO doesn't currently support assignment
or any(
isinstance(v.data, sparse_array_type)
for v in self.variables.values()
)
or sparse
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
# we check explicitly whether it's a numpy array. Once that is
# resolved, explicitly exclude pint arrays.
# # pint doesn't implement `np.full_like` in a way that's
# # currently compatible.
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# # or any(
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
# # )
# pint doesn't implement `np.full_like` in a way that's
# currently compatible.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# or any(
# isinstance(v.data, pint_array_type) for v in self.variables.values()
# )
or any(
not isinstance(v.data, np.ndarray) for v in self.variables.values()
)
):
result = result._unstack_full_reindex(dim, fill_value, sparse)
else:
result = result._unstack_once(dim, fill_value)
result = result._unstack_once(dim, fill_value, sparse)
return result

def update(self, other: "CoercibleMapping") -> "Dataset":
Expand Down
45 changes: 33 additions & 12 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,7 @@ def _unstack_once(
index: pd.MultiIndex,
dim: Hashable,
fill_value=dtypes.NA,
sparse: bool = False,
) -> "Variable":
"""
Unstacks this variable given an index to unstack and the name of the
Expand Down Expand Up @@ -1629,19 +1630,39 @@ def _unstack_once(
else:
dtype = self.dtype

data = np.full_like(
self.data,
fill_value=fill_value,
shape=new_shape,
dtype=dtype,
)
if sparse:
# unstacking a dense multi-indexed array to a sparse array
from sparse import COO

codes = zip(*index.codes)
if reordered.ndim == 1:
indexes = codes
else:
sizes = itertools.product(*[range(s) for s in reordered.shape[:-1]])
tuple_indexes = itertools.product(sizes, codes)
indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes) # type: ignore

data = COO(
coords=np.array(list(indexes)).T,
dcherian marked this conversation as resolved.
Show resolved Hide resolved
data=self.data.astype(dtype).ravel(),
fill_value=fill_value,
shape=new_shape,
has_duplicates=False,
sorted=index.is_monotonic_increasing,
)

else:
data = np.full_like(
self.data,
fill_value=fill_value,
shape=new_shape,
dtype=dtype,
)

# Indexer is a list of lists of locations. Each list is the locations
# on the new dimension. This is robust to the data being sparse; in that
# case the destinations will be NaN / zero.
# sparse doesn't support item assignment,
# https://github.com/pydata/sparse/issues/114
data[(..., *indexer)] = reordered
# Indexer is a list of lists of locations. Each list is the locations
# on the new dimension. This is robust to the data being sparse; in that
# case the destinations will be NaN / zero.
data[(..., *indexer)] = reordered

return self._replace(dims=new_dims, data=data)

Expand Down
30 changes: 29 additions & 1 deletion xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from xarray.core import dtypes, indexing, utils
from xarray.core.common import duck_array_ops, full_like
from xarray.core.indexes import Index
from xarray.core.pycompat import integer_types
from xarray.core.pycompat import integer_types, sparse_array_type
from xarray.core.utils import is_scalar

from . import (
Expand Down Expand Up @@ -3085,14 +3085,42 @@ def test_unstack_sparse(self):
# test fill_value
actual = ds.unstack("index", sparse=True)
expected = ds.unstack("index")
assert isinstance(actual["var"].data, sparse_array_type)
assert actual["var"].variable._to_dense().equals(expected["var"].variable)
assert actual["var"].data.density < 1.0

actual = ds["var"].unstack("index", sparse=True)
expected = ds["var"].unstack("index")
assert isinstance(actual.data, sparse_array_type)
assert actual.variable._to_dense().equals(expected.variable)
assert actual.data.density < 1.0

mindex = pd.MultiIndex.from_arrays(
[np.arange(3), np.arange(3)], names=["a", "b"]
)
ds_eye = Dataset(
{"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))},
coords={"z": mindex, "foo": np.arange(4), "bar": np.arange(5)},
)
actual = ds_eye.unstack(sparse=True, fill_value=0)
assert isinstance(actual["var"].data, sparse_array_type)
expected = xr.Dataset(
{
"var": (
("foo", "bar", "a", "b"),
np.broadcast_to(np.eye(3, 3), (4, 5, 3, 3)),
)
},
coords={
"foo": np.arange(4),
"bar": np.arange(5),
"a": np.arange(3),
"b": np.arange(3),
},
)
actual["var"].data = actual["var"].data.todense()
assert_equal(expected, actual)

def test_stack_unstack_fast(self):
ds = Dataset(
{
Expand Down