chunk sparse arrays (pydata#3202)

* chunk sparse arrays * Deprecated API * Don't wrap plain numpy arrays with ImplicitToExplicitIndexingAdapter * typo * What's New * Version bump and annotations What's New polish
shoyer · Aug 12, 2019 · c782637 · c782637
1 parent 14f1a97
commit c782637
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 43 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -13,31 +13,42 @@ What's New
     import xarray as xr
     np.random.seed(123456)
 
-.. _whats-new.0.12.4:
+.. _whats-new.0.13.0:
 
-v0.12.4 (unreleased)
--------------------
+v0.13.0 (unreleased)
+--------------------
 
 This release increases the minimum required Python version from 3.5.0 to 3.5.3
 (:issue:`3089`). By `Guido Imperiale <https://github.com/crusaderky>`_.
 
 New functions/methods
 ~~~~~~~~~~~~~~~~~~~~~
 
-- Added :py:meth:`DataArray.broadcast_like` and :py:meth:`Dataset.broadcast_like`.
-  By `Deepak Cherian <https://github.com/dcherian>`_ and `David Mertz 
-  <http://github.com/DavidMertz>`_.
+- xarray can now wrap around any
+  `NEP18 <https://www.numpy.org/neps/nep-0018-array-function-protocol.html>`_ compliant
+  numpy-like library (important: read notes about NUMPY_EXPERIMENTAL_ARRAY_FUNCTION in
+  the above link). Added explicit test coverage for
+  `sparse <https://github.com/pydata/sparse>`_. (:issue:`3117`, :issue:`3202`)
+  By `Nezar Abdennur <https://github.com/nvictus>`_
+  and `Guido Imperiale <https://github.com/crusaderky>`_.
 
-- The xarray package is now discoverably by mypy (although typing hints
-  coverage is not complete yet). mypy type checking is now enforced by CI.
-  Libraries that depend on xarray and use mypy can now remove from their setup.cfg the lines::
+- The xarray package is now discoverable by mypy (although typing hints coverage is not
+  complete yet). mypy type checking is now enforced by CI. Libraries that depend on
+  xarray and use mypy can now remove from their setup.cfg the lines::
 
     [mypy-xarray]
     ignore_missing_imports = True
 
-   By `Guido Imperiale <https://github.com/crusaderky>`_
+  (:issue:`2877`, :issue:`3088`, :issue:`3090`, :issue:`3112`, :issue:`3117`,
+  :issue:`3207`)
+  By `Guido Imperiale <https://github.com/crusaderky>`_
+  and `Maximilian Roos <https://github.com/max-sixty>`_.
+
+- Added :py:meth:`DataArray.broadcast_like` and :py:meth:`Dataset.broadcast_like`.
+  By `Deepak Cherian <https://github.com/dcherian>`_ and `David Mertz 
+  <http://github.com/DavidMertz>`_.
 
-- Dataset plotting API for visualizing dependences between two `DataArray`s!
+- Dataset plotting API for visualizing dependencies between two ``DataArray`` objects!
   Currently only :py:meth:`Dataset.plot.scatter` is implemented.
   By `Yohai Bar Sinai <https://github.com/yohai>`_ and `Deepak Cherian <https://github.com/dcherian>`_
 

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -926,23 +926,29 @@ def chunk(self, chunks=None, name=None, lock=False):
         if isinstance(data, da.Array):
             data = data.rechunk(chunks)
         else:
+            if isinstance(data, indexing.ExplicitlyIndexed):
+                # Unambiguously handle array storage backends (like NetCDF4 and h5py)
+                # that can't handle general array indexing. For example, in netCDF4 you
+                # can do "outer" indexing along two dimensions independently, which
+                # works differently from how NumPy handles it.
+                # da.from_array works by using lazy indexing with a tuple of slices.
+                # Using OuterIndexer is a pragmatic choice: dask does not yet handle
+                # different indexing types in an explicit way:
+                # https://github.com/dask/dask/issues/2883
+                data = indexing.ImplicitToExplicitIndexingAdapter(
+                    data, indexing.OuterIndexer
+                )
+                if LooseVersion(dask.__version__) < "2.0.0":
+                    kwargs = {}
+                else:
+                    # All of our lazily loaded backend array classes should use NumPy
+                    # array operations.
+                    kwargs = {"meta": np.ndarray}
+            else:
+                kwargs = {}
+
             if utils.is_dict_like(chunks):
                 chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape))
-            # da.from_array works by using lazily indexing with a tuple of
-            # slices. Using OuterIndexer is a pragmatic choice: dask does not
-            # yet handle different indexing types in an explicit way:
-            # https://github.com/dask/dask/issues/2883
-            data = indexing.ImplicitToExplicitIndexingAdapter(
-                data, indexing.OuterIndexer
-            )
-
-            # For now, assume that all arrays that we wrap with dask (including
-            # our lazily loaded backend array classes) should use NumPy array
-            # operations.
-            if LooseVersion(dask.__version__) > "1.2.2":
-                kwargs = dict(meta=np.ndarray)
-            else:
-                kwargs = dict()
 
             data = da.from_array(data, chunks, name=name, lock=lock, **kwargs)
 

diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
@@ -1,25 +1,14 @@
-from collections import OrderedDict
-from contextlib import suppress
-from distutils.version import LooseVersion
 from textwrap import dedent
 import pickle
 import numpy as np
 import pandas as pd
 
-from xarray import DataArray, Dataset, Variable
-from xarray.tests import mock
+from xarray import DataArray, Variable
 from xarray.core.npcompat import IS_NEP18_ACTIVE
 import xarray as xr
 import xarray.ufuncs as xu
 
-from . import (
-    assert_allclose,
-    assert_array_equal,
-    assert_equal,
-    assert_frame_equal,
-    assert_identical,
-    raises_regex,
-)
+from . import assert_equal, assert_identical
 
 import pytest
 
@@ -148,7 +137,6 @@ def test_variable_property(prop):
             True,
             marks=xfail(reason="'COO' object has no attribute 'argsort'"),
         ),
-        param(do("chunk", chunks=(5, 5)), True, marks=xfail),
         param(
             do(
                 "concat",
@@ -422,9 +410,6 @@ def test_dataarray_property(prop):
             False,
             marks=xfail(reason="Missing implementation for np.flip"),
         ),
-        param(
-            do("chunk", chunks=(5, 5)), False, marks=xfail(reason="Coercion to dense")
-        ),
         param(
             do("combine_first", make_xrarray({"x": 10, "y": 5})),
             True,
@@ -879,3 +864,17 @@ def test_sparse_coords(self):
             dims=["x"],
             coords={"x": COO.from_numpy([1, 2, 3, 4])},
         )
+
+
+def test_chunk():  # chunking a sparse-backed DataArray/Dataset must not change its contents
+    s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
+    a = DataArray(s)
+    ac = a.chunk(2)  # 4 elements requested in blocks of 2
+    assert ac.chunks == ((2, 2),)
+    assert isinstance(ac.data._meta, sparse.COO)  # chunked data's _meta is still sparse.COO
+    assert_identical(ac, a)
+
+    ds = a.to_dataset(name="a")  # repeat the checks through the Dataset API
+    dsc = ds.chunk(2)
+    assert dsc.chunks == {"dim_0": (2, 2)}  # keyed by "dim_0" (presumably the default dim name)
+    assert_identical(dsc, ds)