Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset.chunk() and DataArray.chunk() now set encoding attribute #8069

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@ def _maybe_chunk(
)

if overwrite_encoded_chunks and var.chunks is not None:
var.encoding["chunks"] = tuple(x[0] for x in var.chunks)
# Use chunk size 1 for dimensions of length 0
var.encoding["chunks"] = tuple(x[0] if x[0] != 0 else 1 for x in var.chunks)
return var
else:
return var
Expand Down Expand Up @@ -2612,7 +2613,7 @@ def chunk(
already as dask array.
chunked_array_type: str, optional
Which chunked array type to coerce this dataset's arrays to.
Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system.
Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system.
Experimental API that should not be relied upon.
from_array_kwargs: dict, optional
Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
Expand Down Expand Up @@ -2665,6 +2666,7 @@ def chunk(
token,
lock,
name_prefix,
overwrite_encoded_chunks=True,
inline_array=inline_array,
chunked_array_type=chunkmanager,
from_array_kwargs=from_array_kwargs.copy(),
Expand Down
5 changes: 5 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,14 +877,17 @@ def test_chunk(self) -> None:

blocked = unblocked.chunk()
assert blocked.chunks == ((3,), (4,))
assert blocked.encoding.get("chunks", None) == (3, 4)
first_dask_name = blocked.data.name

blocked = unblocked.chunk(chunks=((2, 1), (2, 2)))
assert blocked.chunks == ((2, 1), (2, 2))
assert blocked.encoding.get("chunks", None) == (2, 2)
assert blocked.data.name != first_dask_name

blocked = unblocked.chunk(chunks=(3, 3))
assert blocked.chunks == ((3,), (3, 1))
assert blocked.encoding.get("chunks", None) == (3, 3)
assert blocked.data.name != first_dask_name

# name doesn't change when rechunking by same amount
Expand Down Expand Up @@ -3514,6 +3517,8 @@ def test_to_and_from_dict(

if has_dask:
da = array.chunk()
if encoding:
expected["encoding"]["chunks"] = da.shape
else:
da = array

Expand Down
13 changes: 11 additions & 2 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,17 +1142,26 @@ def test_chunk(self) -> None:

# test kwargs form of chunks
assert data.chunk(expected_chunks).chunks == expected_chunks
# Verify the encoding attributes have been set
for da_reblocked in reblocked.values():
assert da_reblocked.encoding.get("chunks", None) == da_reblocked.shape

def get_dask_names(ds):
    """Map each key of ``ds`` to the ``.data.name`` of its value.

    The test uses this to snapshot the names of the underlying
    (presumably dask-backed) arrays so the snapshots can be compared
    before and after rechunking.
    """
    return {k: v.data.name for k, v in ds.items()}

orig_dask_names = get_dask_names(reblocked)

reblocked = data.chunk({"time": 5, "dim1": 5, "dim2": 5, "dim3": 5})
desired_chunks = {"time": 6, "dim1": 5, "dim2": 4, "dim3": 3}
reblocked = data.chunk(desired_chunks)
# time is not a dim in any of the data_vars, so it
# doesn't get chunked
expected_chunks = {"dim1": (5, 3), "dim2": (5, 4), "dim3": (5, 5)}
expected_chunks = {"dim1": (5, 3), "dim2": (4, 4, 1), "dim3": (3, 3, 3, 1)}
assert reblocked.chunks == expected_chunks
# Verify the encoding attributes have been set
for da_reblocked in reblocked.values():
assert da_reblocked.encoding.get("chunks", None) == tuple(
desired_chunks[str(d)] for d in da_reblocked.dims
)

# make sure dask names change when rechunking by different amounts
# regression test for GH3350
Expand Down
Loading