From 3f3111ed924aa78c16ebbc568dff5fd936860c7a Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Tue, 26 Oct 2021 11:48:57 -0400
Subject: [PATCH 1/9] added chunksizes property

---
 xarray/core/common.py    | 17 ++++++++++++++
 xarray/core/dataarray.py | 32 ++++++++++++++++++++++---
 xarray/core/dataset.py   | 51 +++++++++++++++++++++++++++++-----------
 xarray/core/variable.py  | 37 +++++++++++++++++++++++++----
 4 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/xarray/core/common.py b/xarray/core/common.py
index 2c5d7900ef8..39875a5a7d8 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None):
     return full_like(other, 1, dtype)


+def get_chunksizes(
+    variables: Iterable[Variable],
+) -> Mapping[Hashable, Tuple[int, ...]]:
+
+    chunks: Dict[Hashable, Tuple[int, ...]] = {}
+    for v in variables:
+        if hasattr(v.data, "chunks"):
+            for dim, c in v.chunksizes.items():
+                if dim in chunks and c != chunks[dim]:
+                    raise ValueError(
+                        f"Object has inconsistent chunks along dimension {dim}. "
+                        "This can be fixed by calling unify_chunks()."
+                    )
+                chunks[dim] = c
+    return Frozen(chunks)
+
+
 def is_np_datetime_like(dtype: DTypeLike) -> bool:
     """Check if a dtype is a subclass of the numpy datetime types"""
     return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index ed8b393628d..01fd43e5946 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -43,7 +43,7 @@
     reindex_like_indexers,
 )
 from .arithmetic import DataArrayArithmetic
-from .common import AbstractArray, DataWithCoords
+from .common import AbstractArray, DataWithCoords, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DataArrayCoordinates,
@@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray":

     @property
     def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+        """
+        Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunksizes
+        xarray.unify_chunks
         """
         return self.variable.chunks

+    @property
+    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataarray's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunks
+        xarray.unify_chunks
+        """
+        all_variables = [self.variable] + [c.variable for c in self.coords.values()]
+        return get_chunksizes(all_variables)
+
     def chunk(
         self,
         chunks: Union[
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 4b1b1de222d..a38996776d3 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -52,7 +52,7 @@
 )
 from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align
 from .arithmetic import DatasetArithmetic
-from .common import DataWithCoords, _contains_datetime_like_objects
+from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DatasetCoordinates,
@@ -2090,20 +2090,37 @@ def info(self, buf=None) -> None:

     @property
     def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
-        """Block dimensions for this dataset's data or None if it's not a dask
-        array.
         """
-        chunks: Dict[Hashable, Tuple[int, ...]] = {}
-        for v in self.variables.values():
-            if v.chunks is not None:
-                for dim, c in zip(v.dims, v.chunks):
-                    if dim in chunks and c != chunks[dim]:
-                        raise ValueError(
-                            f"Object has inconsistent chunks along dimension {dim}. "
-                            "This can be fixed by calling unify_chunks()."
-                        )
-                    chunks[dim] = c
-        return Frozen(chunks)
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunksizes, but maintained for backwards compatibility.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunksizes
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())
+
+    @property
+    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunks.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunks
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())

     def chunk(
         self,
@@ -2142,6 +2159,12 @@ def chunk(
         Returns
         -------
         chunked : xarray.Dataset
+
+        See Also
+        --------
+        Dataset.chunks
+        Dataset.chunksizes
+        xarray.unify_chunks
         """
         if chunks is None:
             warnings.warn(
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 191bb4059f5..2b5f466c56c 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -45,6 +45,7 @@
     sparse_array_type,
 )
 from .utils import (
+    Frozen,
     NdimSizeLenMixin,
     OrderedSet,
     _default,
@@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None):

     __hash__ = None  # type: ignore[assignment]

     @property
-    def chunks(self):
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+    def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
+        """
+        Tuple of block lengths for this variable's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunksizes
+        xarray.unify_chunks
         """
         return getattr(self._data, "chunks", None)

+    @property
+    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this variable's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from variable.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunks
+        xarray.unify_chunks
+        """
+        if hasattr(self._data, "chunks"):
+            return Frozen({dim: c for dim, c in zip(self.dims, self.data.chunks)})
+        else:
+            return {}
+
     _array_counter = itertools.count()

     def chunk(self, chunks={}, name=None, lock=False):
-        """Coerce this array's data into a dask arrays with the given chunks.
+        """Coerce this array's data into dask array with the given chunks.

         If this variable is a non-dask array, it will be converted to
         dask array. If it's a dask array, it will be rechunked to the given chunk

From 05a7d452c3a6dc70f979bc991c250ac5e0a3aaff Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 12:30:33 -0400
Subject: [PATCH 2/9] fix typing via Hashable->Any

---
 xarray/core/common.py    | 4 ++--
 xarray/core/dataarray.py | 2 +-
 xarray/core/dataset.py   | 2 +-
 xarray/core/variable.py  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/xarray/core/common.py b/xarray/core/common.py
index 39875a5a7d8..b5dc3bf0e20 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1815,9 +1815,9 @@ def ones_like(other, dtype: DTypeLike = None):

 def get_chunksizes(
     variables: Iterable[Variable],
-) -> Mapping[Hashable, Tuple[int, ...]]:
+) -> Mapping[Any, Tuple[int, ...]]:

-    chunks: Dict[Hashable, Tuple[int, ...]] = {}
+    chunks: Dict[Any, Tuple[int, ...]] = {}
     for v in variables:
         if hasattr(v.data, "chunks"):
             for dim, c in v.chunksizes.items():
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 01fd43e5946..c6f534796d1 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -1071,7 +1071,7 @@ def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
         return self.variable.chunks

     @property
-    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
         """
         Mapping from dimension names to block lengths for this dataarray's data, or None if
         the underlying data is not a dask array.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index a38996776d3..a756924c6f3 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -2106,7 +2106,7 @@ def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
         return get_chunksizes(self.variables.values())

     @property
-    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
         """
         Mapping from dimension names to block lengths for this dataset's data, or None if
         the underlying data is not a dask array.
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 2b5f466c56c..97163ae4f1d 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1011,7 +1011,7 @@ def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
         return getattr(self._data, "chunks", None)

     @property
-    def chunksizes(self) -> Mapping[Hashable, Tuple[int, ...]]:
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
         """
         Mapping from dimension names to block lengths for this variable's data, or None if
         the underlying data is not a dask array.
From c3ea7ec1516651b746ea2c07e5ebabb4a44ca1eb Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 12:35:22 -0400
Subject: [PATCH 3/9] add chunksizes to API doc

---
 doc/api.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/api.rst b/doc/api.rst
index 4686751c536..e3e1a02fcf4 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -65,6 +65,7 @@ Attributes
    Dataset.indexes
    Dataset.get_index
    Dataset.chunks
+   Dataset.chunksizes
    Dataset.nbytes

 Dictionary interface
@@ -271,6 +272,7 @@ Attributes
    DataArray.encoding
    DataArray.indexes
    DataArray.get_index
+   DataArray.chunksizes

 **ndarray attributes**:
 :py:attr:`~DataArray.ndim`

From 71d5b17d079dc9d7c17aa9225eaccc241f3c6fe1 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 12:42:35 -0400
Subject: [PATCH 4/9] whatsnew

---
 doc/whats-new.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 87c85d45454..52fbbbc7626 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -36,6 +36,10 @@ New Features
   `Nathan Lis `_.
 - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
   By `Maxime Liquet `_.
+- Added a new :py:meth:`Dataset.chunksizes`, :py:meth:`DataArray.chunksizes`, and :py:meth:`Variable.chunksizes`
+  property, which will always return a mapping from dimension names to chunking pattern along that dimension, guaranteed
+  to be consistent between `Dataset`, `DataArray`, and `Variable` objects. (:issue:`5846`, :pull:`5900`)
+  By `Tom Nicholas `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

From aac21f07e3c1e3af7df364c2d7ae25ec1f11b23f Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 12:43:51 -0400
Subject: [PATCH 5/9] grammar

---
 xarray/core/variable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 97163ae4f1d..a96adb31e64 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1034,7 +1034,7 @@ def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
     _array_counter = itertools.count()

     def chunk(self, chunks={}, name=None, lock=False):
-        """Coerce this array's data into dask array with the given chunks.
+        """Coerce this array's data into a dask array with the given chunks.

         If this variable is a non-dask array, it will be converted to
         dask array. If it's a dask array, it will be rechunked to the given chunk

From 9b2889f113e94bcdf3fcd3fd78751e9b5a2de95f Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Thu, 28 Oct 2021 14:42:40 -0400
Subject: [PATCH 6/9] Update doc/whats-new.rst

Co-authored-by: Deepak Cherian
---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 52fbbbc7626..2eb19585908 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -36,7 +36,7 @@ New Features
   `Nathan Lis `_.
 - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
   By `Maxime Liquet `_.
-- Added a new :py:meth:`Dataset.chunksizes`, :py:meth:`DataArray.chunksizes`, and :py:meth:`Variable.chunksizes`
+- Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
   property, which will always return a mapping from dimension names to chunking pattern along that dimension, guaranteed
   to be consistent between `Dataset`, `DataArray`, and `Variable` objects. (:issue:`5846`, :pull:`5900`)
   By `Tom Nicholas `_.

 Breaking changes

From 32a4b2cf9f9aa13d9cf72ac6d50f693322e92890 Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Thu, 28 Oct 2021 14:42:47 -0400
Subject: [PATCH 7/9] Update doc/whats-new.rst

Co-authored-by: Deepak Cherian
---
 doc/whats-new.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 2eb19585908..df5be17d33e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -38,7 +38,7 @@ New Features
   By `Maxime Liquet `_.
 - Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
   property, which will always return a mapping from dimension names to chunking pattern along that dimension, guaranteed
-  to be consistent between `Dataset`, `DataArray`, and `Variable` objects. (:issue:`5846`, :pull:`5900`)
+  to be consistent between Dataset, DataArray, and Variable objects. (:issue:`5846`, :pull:`5900`)
   By `Tom Nicholas `_.

 Breaking changes

From f892ef7d1ed54b5973e9923e45a89fdb4699654d Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 14:44:46 -0400
Subject: [PATCH 8/9] removed the word consistent

---
 doc/whats-new.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index df5be17d33e..e88744b7df1 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -37,8 +37,8 @@ New Features
 - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
   By `Maxime Liquet `_.
 - Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
-  property, which will always return a mapping from dimension names to chunking pattern along that dimension, guaranteed
-  to be consistent between Dataset, DataArray, and Variable objects. (:issue:`5846`, :pull:`5900`)
+  property, which will always return a mapping from dimension names to chunking pattern along that dimension,
+  regardless of whether the object is a Dataset, DataArray, or Variable. (:issue:`5846`, :pull:`5900`)
   By `Tom Nicholas `_.
 Breaking changes

From 419025a9e24608d7deb24c9832d0467627c41945 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas
Date: Thu, 28 Oct 2021 15:11:06 -0400
Subject: [PATCH 9/9] test .chunksizes

---
 xarray/tests/test_dask.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index d5d460056aa..fa49d253e26 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -104,6 +104,11 @@ def test_chunk(self):
             assert rechunked.chunks == expected
             self.assertLazyAndIdentical(self.eager_var, rechunked)

+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected)
+            }
+            assert rechunked.chunksizes == expected_chunksizes
+
     def test_indexing(self):
         u = self.eager_var
         v = self.lazy_var
@@ -330,6 +335,38 @@ def setUp(self):
             self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
         )

+    def test_chunk(self):
+        for chunks, expected in [
+            ({}, ((2, 2), (2, 2, 2))),
+            (3, ((3, 1), (3, 3))),
+            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
+            ({"x": 3}, ((3, 1), (2, 2, 2))),
+            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
+        ]:
+            # Test DataArray
+            rechunked = self.lazy_array.chunk(chunks)
+            assert rechunked.chunks == expected
+            self.assertLazyAndIdentical(self.eager_array, rechunked)
+
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(self.lazy_array.dims, expected)
+            }
+            assert rechunked.chunksizes == expected_chunksizes
+
+            # Test Dataset
+            lazy_dataset = self.lazy_array.to_dataset()
+            eager_dataset = self.eager_array.to_dataset()
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(lazy_dataset.dims, expected)
+            }
+            rechunked = lazy_dataset.chunk(chunks)
+
+            # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843
+            assert rechunked.chunks == expected_chunksizes
+            self.assertLazyAndIdentical(eager_dataset, rechunked)
+
+            assert rechunked.chunksizes == expected_chunksizes
+
     def test_rechunk(self):
         chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
         assert chunked.chunks == ((2,) * 2, (2,) * 3)
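
For reference, a minimal sketch of how the behaviour described by this series looks in practice; this snippet is not part of the patch. It assumes dask is installed and xarray includes these commits, and the dataset, dimension names, and chunk sizes below are illustrative only.

    import numpy as np
    import xarray as xr

    # A small dask-backed dataset; dimension names and chunk sizes are arbitrary.
    ds = xr.Dataset({"foo": (("x", "y"), np.zeros((4, 6)))}).chunk({"x": 2, "y": 3})

    # .chunksizes maps dimension names to chunk shapes and is the same mapping
    # whether accessed from the Dataset, the DataArray, or the underlying Variable.
    print(dict(ds.chunksizes))                  # {'x': (2, 2), 'y': (3, 3)}
    print(dict(ds["foo"].chunksizes))           # {'x': (2, 2), 'y': (3, 3)}
    print(dict(ds["foo"].variable.chunksizes))  # {'x': (2, 2), 'y': (3, 3)}

    # DataArray.chunks keeps its existing tuple-of-tuples form, in dimension order.
    print(ds["foo"].chunks)                     # ((2, 2), (3, 3))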