Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster unstacking to sparse #5577

Merged
merged 17 commits into from
Dec 3, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
"bottleneck": ["", null],
"dask": [""],
"distributed": [""],
"sparse": [""]
},


Expand Down
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def requires_dask():
raise NotImplementedError()


def requires_sparse():
    """Abort benchmark setup when the ``sparse`` package is unavailable.

    Raises NotImplementedError (the ASV convention for skipping a
    benchmark from ``setup``) if ``sparse`` cannot be imported.
    """
    try:
        import sparse  # noqa: F401
    except ImportError:
        raise NotImplementedError()


def randn(shape, frac_nan=None, chunks=None, seed=0):
rng = np.random.RandomState(seed)
if chunks is None:
Expand Down
37 changes: 36 additions & 1 deletion asv_bench/benchmarks/unstacking.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import numpy as np
import pandas as pd

import xarray as xr

from . import requires_dask
from . import requires_dask, requires_sparse


class Unstacking:
Expand All @@ -27,3 +28,37 @@ def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.da_full = self.da_full.chunk({"flat_dim": 25})


class UnstackingSparse(Unstacking):
    # ASV benchmarks for ``unstack(sparse=True)``; inherits the dense
    # unstacking benchmarks from ``Unstacking``.

    def setup(self, *args, **kwargs):
        # ``requires_sparse`` raises NotImplementedError when ``sparse`` is
        # not installed, which ASV interprets as "skip this benchmark".
        requires_sparse()

        import sparse

        # Random sparse 2-D input, stacked into a single flat dimension
        # (``[...]`` stacks all remaining dimensions).
        data = sparse.random((500, 1000), random_state=0, fill_value=0)
        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
        self.da_missing = self.da_full[:-1]

        # MultiIndex whose two levels are both arange(500): each stacked
        # entry sits at (i, i), so unstacking yields a diagonal ("eye")
        # pattern — a very sparse result.
        mindex = pd.MultiIndex.from_arrays([np.arange(500), np.arange(500)])
        self.da_eye_2d = xr.DataArray(np.ones((500,)), dims="z", coords={"z": mindex})
        self.da_eye_3d = xr.DataArray(
            np.ones((500, 50)),
            dims=("z", "foo"),
            coords={"z": mindex, "foo": np.arange(50)},
        )

    def time_unstack_to_sparse_2d(self):
        # Time: 1-D stacked array unstacked to a sparse 2-D result.
        self.da_eye_2d.unstack(sparse=True)

    def time_unstack_to_sparse_3d(self):
        # Time: same, with an extra dense dimension carried along.
        self.da_eye_3d.unstack(sparse=True)

    def peakmem_unstack_to_sparse_2d(self):
        # Peak-memory counterpart of the 2-D timing benchmark.
        self.da_eye_2d.unstack(sparse=True)

    def peakmem_unstack_to_sparse_3d(self):
        # Peak-memory counterpart of the 3-D timing benchmark.
        self.da_eye_3d.unstack(sparse=True)

    def time_unstack_pandas_slow(self):
        # No-op: presumably overrides an inherited pandas-path benchmark
        # that is not applicable to sparse input — confirm against the
        # ``Unstacking`` base class.
        pass
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ Deprecations
passed alongside ``combine='by_coords'``.
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Performance
~~~~~~~~~~~

- Significantly faster unstacking to a ``sparse`` array. :pull:`5577`
By `Deepak Cherian <https://github.com/dcherian>`_.


dcherian marked this conversation as resolved.
Show resolved Hide resolved
Bug fixes
~~~~~~~~~

Expand Down
25 changes: 12 additions & 13 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4022,7 +4022,9 @@ def ensure_stackable(val):

return data_array

def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
def _unstack_once(
self, dim: Hashable, fill_value, sparse: bool = False
) -> "Dataset":
index = self.get_index(dim)
index = remove_unused_levels_categories(index)

Expand All @@ -4038,7 +4040,7 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
fill_value_ = fill_value

variables[name] = var._unstack_once(
index=index, dim=dim, fill_value=fill_value_
index=index, dim=dim, fill_value=fill_value_, sparse=sparse
)
else:
variables[name] = var
Expand Down Expand Up @@ -4160,30 +4162,27 @@ def unstack(
# function requires.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
any(is_duck_dask_array(v.data) for v in self.variables.values())
# Sparse doesn't currently support (though we could special-case
# it)
# https://github.com/pydata/sparse/issues/422
# sparse.COO doesn't currently support assignment
or any(
isinstance(v.data, sparse_array_type)
for v in self.variables.values()
)
or sparse
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
# we check explicitly whether it's a numpy array. Once that is
# resolved, explicitly exclude pint arrays.
# # pint doesn't implement `np.full_like` in a way that's
# # currently compatible.
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# # or any(
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
# # )
# pint doesn't implement `np.full_like` in a way that's
# currently compatible.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# or any(
# isinstance(v.data, pint_array_type) for v in self.variables.values()
# )
or any(
not isinstance(v.data, np.ndarray) for v in self.variables.values()
)
):
result = result._unstack_full_reindex(dim, fill_value, sparse)
else:
result = result._unstack_once(dim, fill_value)
result = result._unstack_once(dim, fill_value, sparse)
return result

def update(self, other: "CoercibleMapping") -> "Dataset":
Expand Down
45 changes: 33 additions & 12 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,7 @@ def _unstack_once(
index: pd.MultiIndex,
dim: Hashable,
fill_value=dtypes.NA,
sparse: bool = False,
) -> "Variable":
"""
Unstacks this variable given an index to unstack and the name of the
Expand Down Expand Up @@ -1629,19 +1630,39 @@ def _unstack_once(
else:
dtype = self.dtype

data = np.full_like(
self.data,
fill_value=fill_value,
shape=new_shape,
dtype=dtype,
)
if sparse:
# unstacking a dense multi-indexed array to a sparse array
from sparse import COO

codes = zip(*index.codes)
if reordered.ndim == 1:
indexes = codes
else:
sizes = itertools.product(*[range(s) for s in reordered.shape[:-1]])
tuple_indexes = itertools.product(sizes, codes)
indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes) # type: ignore

data = COO(
coords=np.array(list(indexes)).T,
dcherian marked this conversation as resolved.
Show resolved Hide resolved
data=self.data.astype(dtype).ravel(),
fill_value=fill_value,
shape=new_shape,
has_duplicates=False,
sorted=index.is_monotonic_increasing,
)

else:
data = np.full_like(
self.data,
fill_value=fill_value,
shape=new_shape,
dtype=dtype,
)

# Indexer is a list of lists of locations. Each list is the locations
# on the new dimension. This is robust to the data being sparse; in that
# case the destinations will be NaN / zero.
# sparse doesn't support item assignment,
# https://github.com/pydata/sparse/issues/114
data[(..., *indexer)] = reordered
# Indexer is a list of lists of locations. Each list is the locations
# on the new dimension. This is robust to the data being sparse; in that
# case the destinations will be NaN / zero.
data[(..., *indexer)] = reordered

return self._replace(dims=new_dims, data=data)

Expand Down
30 changes: 29 additions & 1 deletion xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from xarray.core import dtypes, indexing, utils
from xarray.core.common import duck_array_ops, full_like
from xarray.core.indexes import Index
from xarray.core.pycompat import integer_types
from xarray.core.pycompat import integer_types, sparse_array_type
from xarray.core.utils import is_scalar

from . import (
Expand Down Expand Up @@ -3085,14 +3085,42 @@ def test_unstack_sparse(self):
# test fill_value
actual = ds.unstack("index", sparse=True)
expected = ds.unstack("index")
assert isinstance(actual["var"].data, sparse_array_type)
assert actual["var"].variable._to_dense().equals(expected["var"].variable)
assert actual["var"].data.density < 1.0

actual = ds["var"].unstack("index", sparse=True)
expected = ds["var"].unstack("index")
assert isinstance(actual.data, sparse_array_type)
assert actual.variable._to_dense().equals(expected.variable)
assert actual.data.density < 1.0

mindex = pd.MultiIndex.from_arrays(
[np.arange(3), np.arange(3)], names=["a", "b"]
)
ds_eye = Dataset(
{"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))},
coords={"z": mindex, "foo": np.arange(4), "bar": np.arange(5)},
)
actual = ds_eye.unstack(sparse=True, fill_value=0)
assert isinstance(actual["var"].data, sparse_array_type)
expected = xr.Dataset(
{
"var": (
("foo", "bar", "a", "b"),
np.broadcast_to(np.eye(3, 3), (4, 5, 3, 3)),
)
},
coords={
"foo": np.arange(4),
"bar": np.arange(5),
"a": np.arange(3),
"b": np.arange(3),
},
)
actual["var"].data = actual["var"].data.todense()
assert_equal(expected, actual)

def test_stack_unstack_fast(self):
ds = Dataset(
{
Expand Down