From ee7a8da058be3c21903f376208b3d036dbc1ed4c Mon Sep 17 00:00:00 2001 From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com> Date: Fri, 29 Oct 2021 14:12:21 -0400 Subject: [PATCH] Add .chunksizes property (#5900) * added chunksizes property * fix typing via Hashable->Any * add chunksizes to API doc * whatsnew * grammar * Update doc/whats-new.rst Co-authored-by: Deepak Cherian * Update doc/whats-new.rst Co-authored-by: Deepak Cherian * removed the word consistent * test .chunksizes Co-authored-by: Deepak Cherian --- doc/api.rst | 2 ++ doc/whats-new.rst | 4 +++ xarray/core/common.py | 17 +++++++++++++ xarray/core/dataarray.py | 32 +++++++++++++++++++++--- xarray/core/dataset.py | 51 ++++++++++++++++++++++++++++----------- xarray/core/variable.py | 37 +++++++++++++++++++++++++--- xarray/tests/test_dask.py | 37 ++++++++++++++++++++++++++++ 7 files changed, 159 insertions(+), 21 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 83015cb3993..183e9d1425e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -65,6 +65,7 @@ Attributes Dataset.indexes Dataset.get_index Dataset.chunks + Dataset.chunksizes Dataset.nbytes Dictionary interface @@ -271,6 +272,7 @@ Attributes DataArray.encoding DataArray.indexes DataArray.get_index + DataArray.chunksizes **ndarray attributes**: :py:attr:`~DataArray.ndim` diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8dfecdd2aa8..cd0d4461aa2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,10 @@ New Features `Nathan Lis `_. - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`). By `Maxime Liquet `_. +- Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes` + property, which will always return a mapping from dimension names to chunking pattern along that dimension, + regardless of whether the object is a Dataset, DataArray, or Variable. (:issue:`5846`, :pull:`5900`) + By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/common.py b/xarray/core/common.py index 2c5d7900ef8..b5dc3bf0e20 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None): return full_like(other, 1, dtype) +def get_chunksizes( + variables: Iterable[Variable], +) -> Mapping[Any, Tuple[int, ...]]: + + chunks: Dict[Any, Tuple[int, ...]] = {} + for v in variables: + if hasattr(v.data, "chunks"): + for dim, c in v.chunksizes.items(): + if dim in chunks and c != chunks[dim]: + raise ValueError( + f"Object has inconsistent chunks along dimension {dim}. " + "This can be fixed by calling unify_chunks()." + ) + chunks[dim] = c + return Frozen(chunks) + + def is_np_datetime_like(dtype: DTypeLike) -> bool: """Check if a dtype is a subclass of the numpy datetime types""" return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index ed8b393628d..c6f534796d1 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -43,7 +43,7 @@ reindex_like_indexers, ) from .arithmetic import DataArrayArithmetic -from .common import AbstractArray, DataWithCoords +from .common import AbstractArray, DataWithCoords, get_chunksizes from .computation import unify_chunks from .coordinates import ( DataArrayCoordinates, @@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray": @property def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]: - """Block dimensions for this array's data or None if it's not a dask - array. + """ + Tuple of block lengths for this dataarray's data, in order of dimensions, or None if + the underlying data is not a dask array. + + See Also + -------- + DataArray.chunk + DataArray.chunksizes + xarray.unify_chunks """ return self.variable.chunks + @property + def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]: + """ + Mapping from dimension names to block lengths for this dataarray's data, or None if + the underlying data is not a dask array. + Cannot be modified directly, but can be modified by calling .chunk(). + + Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes + instead of a tuple of chunk shapes. + + See Also + -------- + DataArray.chunk + DataArray.chunks + xarray.unify_chunks + """ + all_variables = [self.variable] + [c.variable for c in self.coords.values()] + return get_chunksizes(all_variables) + def chunk( self, chunks: Union[ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0e6ae905aa8..e882495dce5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -52,7 +52,7 @@ ) from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align from .arithmetic import DatasetArithmetic -from .common import DataWithCoords, _contains_datetime_like_objects +from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes from .computation import unify_chunks from .coordinates import ( DatasetCoordinates, @@ -2095,20 +2095,37 @@ def info(self, buf=None) -> None: @property def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]: - """Block dimensions for this dataset's data or None if it's not a dask - array. """ - chunks: Dict[Hashable, Tuple[int, ...]] = {} - for v in self.variables.values(): - if v.chunks is not None: - for dim, c in zip(v.dims, v.chunks): - if dim in chunks and c != chunks[dim]: - raise ValueError( - f"Object has inconsistent chunks along dimension {dim}. " - "This can be fixed by calling unify_chunks()." - ) - chunks[dim] = c - return Frozen(chunks) + Mapping from dimension names to block lengths for this dataset's data, or None if + the underlying data is not a dask array. + Cannot be modified directly, but can be modified by calling .chunk(). + + Same as Dataset.chunksizes, but maintained for backwards compatibility. + + See Also + -------- + Dataset.chunk + Dataset.chunksizes + xarray.unify_chunks + """ + return get_chunksizes(self.variables.values()) + + @property + def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]: + """ + Mapping from dimension names to block lengths for this dataset's data, or None if + the underlying data is not a dask array. + Cannot be modified directly, but can be modified by calling .chunk(). + + Same as Dataset.chunks. + + See Also + -------- + Dataset.chunk + Dataset.chunks + xarray.unify_chunks + """ + return get_chunksizes(self.variables.values()) def chunk( self, @@ -2147,6 +2164,12 @@ def chunk( Returns ------- chunked : xarray.Dataset + + See Also + -------- + Dataset.chunks + Dataset.chunksizes + xarray.unify_chunks """ if chunks is None: warnings.warn( diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 191bb4059f5..a96adb31e64 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -45,6 +45,7 @@ sparse_array_type, ) from .utils import ( + Frozen, NdimSizeLenMixin, OrderedSet, _default, @@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None): __hash__ = None # type: ignore[assignment] @property - def chunks(self): - """Block dimensions for this array's data or None if it's not a dask - array. + def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]: + """ + Tuple of block lengths for this dataarray's data, in order of dimensions, or None if + the underlying data is not a dask array. + + See Also + -------- + Variable.chunk + Variable.chunksizes + xarray.unify_chunks """ return getattr(self._data, "chunks", None) + @property + def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]: + """ + Mapping from dimension names to block lengths for this variable's data, or None if + the underlying data is not a dask array. + Cannot be modified directly, but can be modified by calling .chunk(). + + Differs from variable.chunks because it returns a mapping of dimensions to chunk shapes + instead of a tuple of chunk shapes. + + See Also + -------- + Variable.chunk + Variable.chunks + xarray.unify_chunks + """ + if hasattr(self._data, "chunks"): + return Frozen({dim: c for dim, c in zip(self.dims, self.data.chunks)}) + else: + return {} + _array_counter = itertools.count() def chunk(self, chunks={}, name=None, lock=False): - """Coerce this array's data into a dask arrays with the given chunks. + """Coerce this array's data into a dask array with the given chunks. If this variable is a non-dask array, it will be converted to dask array. If it's a dask array, it will be rechunked to the given chunk diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index de69c972fc6..9eda6ccdf19 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -104,6 +104,11 @@ def test_chunk(self): assert rechunked.chunks == expected self.assertLazyAndIdentical(self.eager_var, rechunked) + expected_chunksizes = { + dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected) + } + assert rechunked.chunksizes == expected_chunksizes + def test_indexing(self): u = self.eager_var v = self.lazy_var @@ -330,6 +335,38 @@ def setUp(self): self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo" ) + def test_chunk(self): + for chunks, expected in [ + ({}, ((2, 2), (2, 2, 2))), + (3, ((3, 1), (3, 3))), + ({"x": 3, "y": 3}, ((3, 1), (3, 3))), + ({"x": 3}, ((3, 1), (2, 2, 2))), + ({"x": (3, 1)}, ((3, 1), (2, 2, 2))), + ]: + # Test DataArray + rechunked = self.lazy_array.chunk(chunks) + assert rechunked.chunks == expected + self.assertLazyAndIdentical(self.eager_array, rechunked) + + expected_chunksizes = { + dim: chunks for dim, chunks in zip(self.lazy_array.dims, expected) + } + assert rechunked.chunksizes == expected_chunksizes + + # Test Dataset + lazy_dataset = self.lazy_array.to_dataset() + eager_dataset = self.eager_array.to_dataset() + expected_chunksizes = { + dim: chunks for dim, chunks in zip(lazy_dataset.dims, expected) + } + rechunked = lazy_dataset.chunk(chunks) + + # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843 + assert rechunked.chunks == expected_chunksizes + self.assertLazyAndIdentical(eager_dataset, rechunked) + + assert rechunked.chunksizes == expected_chunksizes + def test_rechunk(self): chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2}) assert chunked.chunks == ((2,) * 2, (2,) * 3)