Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add .chunksizes property #5900

Merged
merged 9 commits into from
Oct 29, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Attributes
Dataset.indexes
Dataset.get_index
Dataset.chunks
Dataset.chunksizes
Dataset.nbytes

Dictionary interface
Expand Down Expand Up @@ -271,6 +272,7 @@ Attributes
DataArray.encoding
DataArray.indexes
DataArray.get_index
DataArray.chunksizes

**ndarray attributes**:
:py:attr:`~DataArray.ndim`
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ New Features
`Nathan Lis <https://github.com/wxman22>`_.
- Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
By `Maxime Liquet <https://github.com/maximlt>`_.
- Added a new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
TomNicholas marked this conversation as resolved.
Show resolved Hide resolved
property, which will always return a mapping from dimension names to the chunking pattern along that dimension, guaranteed
to be consistent between ``Dataset``, ``DataArray``, and ``Variable`` objects. (:issue:`5846`, :pull:`5900`)
TomNicholas marked this conversation as resolved.
Show resolved Hide resolved
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
17 changes: 17 additions & 0 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None):
return full_like(other, 1, dtype)


def get_chunksizes(
    variables: Iterable[Variable],
) -> Mapping[Any, Tuple[int, ...]]:
    """Collect the per-dimension chunk sizes shared by ``variables``.

    Parameters
    ----------
    variables : iterable of Variable
        Variables to inspect. A variable whose wrapped array has no
        ``chunks`` attribute is ignored.

    Returns
    -------
    Mapping
        Mapping (wrapped in ``Frozen``) from dimension name to the tuple of
        block lengths along that dimension. Empty when no variable
        contributes any chunks.

    Raises
    ------
    ValueError
        If two variables report different chunks along the same dimension.
    """
    chunks: Dict[Any, Tuple[int, ...]] = {}
    for v in variables:
        # Probe the wrapped array (``_data``) directly, the same attribute
        # Variable.chunks and Variable.chunksizes test, instead of going
        # through the public ``data`` accessor.
        # NOTE(review): ``.data`` may do extra work (possibly loading values)
        # for lazily-backed variables -- confirm against Variable.data.
        if hasattr(v._data, "chunks"):
            for dim, c in v.chunksizes.items():
                if dim in chunks and c != chunks[dim]:
                    raise ValueError(
                        f"Object has inconsistent chunks along dimension {dim}. "
                        "This can be fixed by calling unify_chunks()."
                    )
                chunks[dim] = c
    return Frozen(chunks)


def is_np_datetime_like(dtype: DTypeLike) -> bool:
    """Return True if ``dtype`` is a NumPy ``datetime64`` or ``timedelta64`` dtype."""
    datetime_kinds = (np.datetime64, np.timedelta64)
    return any(np.issubdtype(dtype, kind) for kind in datetime_kinds)
Expand Down
32 changes: 29 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
reindex_like_indexers,
)
from .arithmetic import DataArrayArithmetic
from .common import AbstractArray, DataWithCoords
from .common import AbstractArray, DataWithCoords, get_chunksizes
from .computation import unify_chunks
from .coordinates import (
DataArrayCoordinates,
Expand Down Expand Up @@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray":

@property
def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
"""Block dimensions for this array's data or None if it's not a dask
array.
"""
Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
the underlying data is not a dask array.

See Also
--------
DataArray.chunk
DataArray.chunksizes
xarray.unify_chunks
"""
return self.variable.chunks

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths, gathered from this
    dataarray's variable and the variables of all of its coordinates.
    Variables that are not chunked contribute nothing, so the mapping can be empty.
    Cannot be modified directly, but can be modified by calling .chunk().

    Unlike DataArray.chunks, which is a tuple of chunk shapes in dimension
    order, this is keyed by dimension name.

    Raises
    ------
    ValueError
        If the chunks along some dimension disagree between variables.

    See Also
    --------
    DataArray.chunk
    DataArray.chunks
    xarray.unify_chunks
    """
    # The data variable goes first, followed by every coordinate's variable.
    return get_chunksizes(
        [self.variable, *(coord.variable for coord in self.coords.values())]
    )

def chunk(
self,
chunks: Union[
Expand Down
51 changes: 37 additions & 14 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
)
from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align
from .arithmetic import DatasetArithmetic
from .common import DataWithCoords, _contains_datetime_like_objects
from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes
from .computation import unify_chunks
from .coordinates import (
DatasetCoordinates,
Expand Down Expand Up @@ -2090,20 +2090,37 @@ def info(self, buf=None) -> None:

@property
def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
"""Block dimensions for this dataset's data or None if it's not a dask
array.
"""
chunks: Dict[Hashable, Tuple[int, ...]] = {}
for v in self.variables.values():
if v.chunks is not None:
for dim, c in zip(v.dims, v.chunks):
if dim in chunks and c != chunks[dim]:
raise ValueError(
f"Object has inconsistent chunks along dimension {dim}. "
"This can be fixed by calling unify_chunks()."
)
chunks[dim] = c
return Frozen(chunks)
Mapping from dimension names to block lengths for this dataset's data. Empty if none of
the variables hold chunked (e.g. dask) data.
Cannot be modified directly, but can be modified by calling .chunk().

Same as Dataset.chunksizes, but maintained for backwards compatibility.

See Also
--------
Dataset.chunk
Dataset.chunksizes
xarray.unify_chunks
"""
return get_chunksizes(self.variables.values())

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths, gathered from every
    variable in this dataset. Variables that are not chunked contribute
    nothing, so the mapping can be empty.
    Cannot be modified directly, but can be modified by calling .chunk().

    Same as Dataset.chunks.

    Raises
    ------
    ValueError
        If the chunks along some dimension disagree between variables.

    See Also
    --------
    Dataset.chunk
    Dataset.chunks
    xarray.unify_chunks
    """
    every_variable = self.variables.values()
    return get_chunksizes(every_variable)

def chunk(
self,
Expand Down Expand Up @@ -2142,6 +2159,12 @@ def chunk(
Returns
-------
chunked : xarray.Dataset

See Also
--------
Dataset.chunks
Dataset.chunksizes
xarray.unify_chunks
"""
if chunks is None:
warnings.warn(
Expand Down
37 changes: 33 additions & 4 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
sparse_array_type,
)
from .utils import (
Frozen,
NdimSizeLenMixin,
OrderedSet,
_default,
Expand Down Expand Up @@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None):
__hash__ = None # type: ignore[assignment]

@property
def chunks(self):
"""Block dimensions for this array's data or None if it's not a dask
array.
def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
"""
Tuple of block lengths for this variable's data, in order of dimensions, or None if
the underlying data is not a dask array.

See Also
--------
Variable.chunk
Variable.chunksizes
xarray.unify_chunks
"""
return getattr(self._data, "chunks", None)

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths for this variable's data.
    Empty if the underlying data has no ``chunks`` attribute (e.g. it is not
    a dask array).
    Cannot be modified directly, but can be modified by calling .chunk().

    Differs from Variable.chunks because it returns a mapping of dimension
    names to chunk shapes instead of a tuple of chunk shapes.

    See Also
    --------
    Variable.chunk
    Variable.chunks
    xarray.unify_chunks
    """
    # Return the same read-only wrapper in both branches so callers always
    # get an immutable mapping, chunked or not.
    if not hasattr(self._data, "chunks"):
        return Frozen({})
    # Read the chunks from the same object that was just probed (``_data``).
    return Frozen(dict(zip(self.dims, self._data.chunks)))

_array_counter = itertools.count()

def chunk(self, chunks={}, name=None, lock=False):
"""Coerce this array's data into a dask arrays with the given chunks.
"""Coerce this array's data into a dask array with the given chunks.

If this variable is a non-dask array, it will be converted to dask
array. If it's a dask array, it will be rechunked to the given chunk
Expand Down