From 85c49d3d6995a78f2cb337bf017307d5050d19d8 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 18 Sep 2024 08:19:59 -0400 Subject: [PATCH 01/13] fix safe chunks validation --- xarray/backends/zarr.py | 111 ++++++++++++++++++++-------------- xarray/tests/test_backends.py | 68 ++++++++++++++++++--- 2 files changed, 128 insertions(+), 51 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 52d2175621f..52de392e85d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +163,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks) + return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -189,20 +189,36 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): # TODO: incorporate synchronizer to allow writes from multiple dask # threads if var_chunks and enc_chunks_tuple: - for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks, strict=True): - for dchunk in dchunks[:-1]: + base_error = ( + f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + f"Writing this array in parallel with dask could lead to corrupted data." + f"Consider either rechunking using `chunk()`, deleting " + f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." + ) + + for zchunk, dchunks, interval in zip(enc_chunks_tuple, var_chunks, region, strict=True): + if not safe_chunks or len(dchunks) <= 1: + # It is not necessary to perform any additional validation if the + # safe_chunks is False, or there are less than two dchunks + continue + + start = 0 + if interval.start: + # If the start of the interval is not None or 0, it means that the data + # is being appended or updated, and in both cases it is mandatory that + # the residue of the division between the first dchunk and the zchunk + # being equal to the border size + border_size = zchunk - interval.start % zchunk + if dchunks[0] % zchunk != border_size: + raise ValueError(base_error) + # Avoid validating the first chunk inside the loop + start = 1 + + for dchunk in dchunks[start:-1]: if dchunk % zchunk: - base_error = ( - f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " - f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " - f"Writing this array in parallel with dask could lead to corrupted data." - ) - if safe_chunks: - raise ValueError( - base_error - + " Consider either rechunking using `chunk()`, deleting " - "or modifying `encoding['chunks']`, or specify `safe_chunks=False`." - ) + raise ValueError(base_error) + return enc_chunks_tuple raise AssertionError("We should never get here. Function logic must be wrong.") @@ -243,7 +259,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, raise_on_invalid=False, name=None, safe_chunks=True + variable, region, raise_on_invalid=False, name=None, safe_chunks=True ): """ Extract zarr encoding dictionary from xarray Variable @@ -251,6 +267,7 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable + region: tuple[slice] raise_on_invalid : bool, optional Returns @@ -285,7 +302,7 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks + encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks, region ) encoding["chunks"] = chunks return encoding @@ -762,16 +779,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} - # We need to do this for both new and existing variables to ensure we're not - # writing to a partial chunk, even though we don't use the `encoding` value - # when writing to an existing variable. See - # https://github.com/pydata/xarray/issues/8371 for details. - encoding = extract_zarr_variable_encoding( - v, - raise_on_invalid=vn in check_encoding_set, - name=vn, - safe_chunks=self._safe_chunks, - ) + zarr_array = None + write_region = self._write_region if self._write_region is not None else {} + write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} if name in existing_keys: # existing variable @@ -801,7 +811,36 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) else: zarr_array = self.zarr_group[name] - else: + + if self._append_dim is not None and self._append_dim in dims: + # resize existing variable + append_axis = dims.index(self._append_dim) + assert write_region[self._append_dim] == slice(None) + write_region[self._append_dim] = slice( + zarr_array.shape[append_axis], None + ) + + new_shape = list(zarr_array.shape) + new_shape[append_axis] += v.shape[append_axis] + zarr_array.resize(new_shape) + + region = tuple(write_region[dim] for dim in dims) + + # We need to do this for both new and existing variables to ensure we're not + # writing to a partial chunk, even though we don't use the `encoding` value + # when writing to an existing variable. See + # https://github.com/pydata/xarray/issues/8371 for details. + # Note: Ideally there should be two functions, one for validating the chunks and + # another one for extracting the encoding. + encoding = extract_zarr_variable_encoding( + v, + region=region, + raise_on_invalid=vn in check_encoding_set, + name=vn, + safe_chunks=self._safe_chunks, + ) + + if name not in existing_keys: # new variable encoded_attrs = {} # the magic for storing the hidden dimension data @@ -833,22 +872,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) zarr_array = _put_attrs(zarr_array, encoded_attrs) - write_region = self._write_region if self._write_region is not None else {} - write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} - - if self._append_dim is not None and self._append_dim in dims: - # resize existing variable - append_axis = dims.index(self._append_dim) - assert write_region[self._append_dim] == slice(None) - write_region[self._append_dim] = slice( - zarr_array.shape[append_axis], None - ) - - new_shape = list(zarr_array.shape) - new_shape[append_axis] += v.shape[append_axis] - zarr_array.resize(new_shape) - - region = tuple(write_region[dim] for dim in dims) writer.add(v.data, zarr_array, region) def close(self) -> None: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 13258fcf6ea..a78b583598b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5496,24 +5496,26 @@ def test_encode_zarr_attr_value() -> None: @requires_zarr def test_extract_zarr_variable_encoding() -> None: + # The region is not useful in these cases, but I still think that it must be mandatory + # because the validation of the chunks is in the same function var = xr.Variable("x", [1, 2]) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) assert "chunks" in actual assert actual["chunks"] is None var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) assert actual["chunks"] == (1,) # does not raise on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var) + actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) # raises on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) with pytest.raises(ValueError, match=r"unexpected encoding parameters"): actual = backends.zarr.extract_zarr_variable_encoding( - var, raise_on_invalid=True + var, raise_on_invalid=True, region=tuple() ) @@ -6096,6 +6098,58 @@ def test_zarr_region_chunk_partial_offset(tmp_path): store, safe_chunks=False, region="auto" ) - # This write is unsafe, and should raise an error, but does not. - # with pytest.raises(ValueError): - # da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + with pytest.raises(ValueError): + da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk(tmp_path): + # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 + store = tmp_path / "foo.zarr" + data = np.ones((20,)) + da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is smaller than the border size then raise an error + da.isel(x=slice(7, 11)).chunk(x=(2, 2)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size then it is valid + da.isel(x=slice(7, 11)).chunk(x=(3, 1)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 11))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size + N * zchunk then it is valid + da.isel(x=slice(7, 17)).chunk(x=(8, 2)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 17))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is valid but the other are not then raise an error + da.isel(x=slice(7, 14)).chunk(x=(3, 3, 1)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk have a size bigger than the border size but not enough + # to complete the size of the next chunk then an error must be raised + da.isel(x=slice(7, 14)).chunk(x=(4, 3)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # Append with a single chunk it's totally valid, + # and it does not matter the size of the chunk + da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr( + store, append_dim="x", safe_chunks=True + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) From 0160d48ee35153f26e96515f887affca61a89348 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:25:29 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 11 +++++++++-- xarray/tests/test_backends.py | 4 +--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 52de392e85d..c4099f1f5fe 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -197,7 +197,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." ) - for zchunk, dchunks, interval in zip(enc_chunks_tuple, var_chunks, region, strict=True): + for zchunk, dchunks, interval in zip( + enc_chunks_tuple, var_chunks, region, strict=True + ): if not safe_chunks or len(dchunks) <= 1: # It is not necessary to perform any additional validation if the # safe_chunks is False, or there are less than two dchunks @@ -302,7 +304,12 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks, region + encoding.get("chunks"), + variable.chunks, + variable.ndim, + name, + safe_chunks, + region, ) encoding["chunks"] = chunks return encoding diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a78b583598b..06646e6ec4a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6149,7 +6149,5 @@ def test_zarr_safe_chunk(tmp_path): da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") # Append with a single chunk it's totally valid, # and it does not matter the size of the chunk - da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr( - store, append_dim="x", safe_chunks=True - ) + da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) From 60a7a3f18e2b450590d311141ca2ee4b79df6dc8 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 18 Sep 2024 08:55:26 -0400 Subject: [PATCH 03/13] fix safe chunks validation --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 264c07f562b..56f4dda4cca 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -51,7 +51,9 @@ Bug fixes the non-missing times could in theory be encoded with integers (:issue:`9488`, :pull:`9497`). By `Spencer Clark `_. - +- Fix the safe_chunks validation option on the to_zarr method + (:issue:`5511`, :pull:`9513`). By `Joseph Nowak + `_. Documentation ~~~~~~~~~~~~~ From 6c41f4beb059d4ac0a8c04cda117177284e3fd62 Mon Sep 17 00:00:00 2001 From: joseph nowak Date: Wed, 18 Sep 2024 15:26:00 -0400 Subject: [PATCH 04/13] Update xarray/tests/test_backends.py Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 06646e6ec4a..a2419cf9145 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6104,7 +6104,7 @@ def test_zarr_region_chunk_partial_offset(tmp_path): @requires_zarr @requires_dask -def test_zarr_safe_chunk(tmp_path): +def test_zarr_safe_chunk_append_dim(tmp_path): # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((20,)) From a2a786bcbf0bd0692dcbab2e9196cb0379c70d0a Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 16:15:50 -0400 Subject: [PATCH 05/13] The validation of the chunks now is able to detect full or partial chunk and raise a proper error based on the mode selected, it is also possible to use the auto region detection with the mode "a" --- xarray/backends/zarr.py | 76 ++++++++++++++++++++------------ xarray/core/dataarray.py | 8 ++++ xarray/core/dataset.py | 8 ++++ xarray/tests/test_backends.py | 83 +++++++++++++++++++++++++++++++++-- 4 files changed, 143 insertions(+), 32 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index af289d2ea7b..98936aae31a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +163,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region) + return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region, mode) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -189,9 +189,19 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi # TODO: incorporate synchronizer to allow writes from multiple dask # threads if var_chunks and enc_chunks_tuple: + # If it is possible to write on partial chunks then it is not necessary to check + # the last one contained on the region + allow_partial_chunks = True + end = -1 + if mode == "r+": + # This mode forces to write only on full chunks, even on the last one + allow_partial_chunks = False + end = None + base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " - f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r} " + f"on the region {region}. " f"Writing this array in parallel with dask could lead to corrupted data." f"Consider either rechunking using `chunk()`, deleting " f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." @@ -200,27 +210,27 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi for zchunk, dchunks, interval in zip( enc_chunks_tuple, var_chunks, region, strict=True ): - if not safe_chunks or len(dchunks) <= 1: - # It is not necessary to perform any additional validation if the - # safe_chunks is False, or there are less than two dchunks + if not safe_chunks: continue - start = 0 + # The first border size is the amount of data that needs to be updated on the + # first chunk taking into account the region slice. + first_border_size = zchunk if interval.start: - # If the start of the interval is not None or 0, it means that the data - # is being appended or updated, and in both cases it is mandatory that - # the residue of the division between the first dchunk and the zchunk - # being equal to the border size - border_size = zchunk - interval.start % zchunk - if dchunks[0] % zchunk != border_size: - raise ValueError(base_error) - # Avoid validating the first chunk inside the loop - start = 1 + first_border_size = zchunk - interval.start % zchunk - for dchunk in dchunks[start:-1]: - if dchunk % zchunk: + if not allow_partial_chunks and first_border_size < zchunk: + # If the border is smaller than zchunk, then it is a partial chunk write + raise ValueError(first_border_size) + + for dchunk in dchunks[:end]: + if (dchunk - first_border_size) % zchunk: raise ValueError(base_error) + # The first border is only useful during the first iteration, + # so ignore it in the next validations + first_border_size = 0 + return enc_chunks_tuple raise AssertionError("We should never get here. Function logic must be wrong.") @@ -261,7 +271,12 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, region, raise_on_invalid=False, name=None, safe_chunks=True + variable, + raise_on_invalid=False, + name=None, + safe_chunks=True, + region=None, + mode=None ): """ Extract zarr encoding dictionary from xarray Variable @@ -269,8 +284,11 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - region: tuple[slice] + region: tuple[slice], optional raise_on_invalid : bool, optional + safe_chunks: bool, optional + name: str | Hashable, optional + mode: str, optional Returns ------- @@ -304,12 +322,13 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), - variable.chunks, - variable.ndim, - name, - safe_chunks, - region, + enc_chunks=encoding.get("chunks"), + var_chunks=variable.chunks, + ndim=variable.ndim, + name=name, + safe_chunks=safe_chunks, + region=region, + mode=mode ) encoding["chunks"] = chunks return encoding @@ -845,6 +864,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, + mode=self._mode ) if name not in existing_keys: @@ -927,9 +947,9 @@ def _validate_and_autodetect_region(self, ds) -> None: if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") if any(v == "auto" for v in region.values()): - if self._mode != "r+": + if self._mode not in ["r+", "a"]: raise ValueError( - f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}" + f"``mode`` must be 'r+' or 'a' when using ``region='auto'``, got {self._mode!r}" ) region = self._auto_detect_regions(ds, region) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 37369afbf96..1a308213ab3 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4304,6 +4304,14 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr store from an existing DataArray with arbitrary chunk structure. + In addition to the many-to-one relationship validation, it also detects partial + chunks writes when using the region parameter, + these partial chunks are considered unsafe in the mode "r+" but safe in + the mode "a". + Note: Even with these validations it can still be unsafe to write + two or more chunked arrays in the same location in parallel if they are + not writing in independent regions, for those cases it is better to use + a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7b9b4819245..b1ce264cbc8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2509,6 +2509,14 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr from an existing Dataset with arbitrary chunk structure. + In addition to the many-to-one relationship validation, it also detects partial + chunks writes when using the region parameter, + these partial chunks are considered unsafe in the mode "r+" but safe in + the mode "a". + Note: Even with these validations it can still be unsafe to write + two or more chunked arrays in the same location in parallel if they are + not writing in independent regions, for those cases it is better to use + a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index beaf22826ec..a7f13c12f8a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5991,9 +5991,10 @@ def test_zarr_region_append(self, tmp_path): } ) - # Don't allow auto region detection in append mode due to complexities in - # implementing the overlap logic and lack of safety with parallel writes - with pytest.raises(ValueError): + # Now it is valid to use auto region detection with the append mode, + # but it is still unsafe to modify dimensions or metadata using the region + # parameter. + with pytest.raises(KeyError): ds_new.to_zarr( tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" ) @@ -6105,7 +6106,6 @@ def test_zarr_region_chunk_partial_offset(tmp_path): @requires_zarr @requires_dask def test_zarr_safe_chunk_append_dim(tmp_path): - # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((20,)) da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) @@ -6151,3 +6151,78 @@ def test_zarr_safe_chunk_append_dim(tmp_path): # and it does not matter the size of the chunk da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk_region(tmp_path): + store = tmp_path / "foo.zarr" + + arr = xr.DataArray( + list(range(10)), + dims=["a"], + coords={"a": list(range(10))}, + name="foo" + ).chunk(a=3) + arr.to_zarr(store, mode="w") + + for mode in ["r+", "a"]: + with pytest.raises(ValueError): + # There are two Dask chunks on the same Zarr chunk, + # which means that it is unsafe in any mode + arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode=mode) + + with pytest.raises(ValueError): + # the first chunk is covering the border size, but it is not + # completely covering the second chunk, which means that it is + # unsafe in any mode + arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr(store, region="auto", mode=mode) + + with pytest.raises(ValueError): + # The first chunk is safe but the other two chunks are overlapping with + # the same Zarr chunk + arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr(store, region="auto", mode=mode) + + # Fully update two contiguous chunks is safe in any mode + arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) + + # Write the last chunk partially is safe in "a" mode + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # with "r+" mode it is invalid to write partial chunk even on the last one + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") + + # This is safe with mode "a", the border size is covered by the first chunk of Dask + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # This is considered unsafe in mode "r+" because it is writing in a partial chunk + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") + + # This is safe on mode "a" because there is a single dask chunk + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # This is unsafe on mode "r+", because there is a single dask + # chunk smaller than the Zarr chunk + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is completely covering the first Zarr chunk + # and the last chunk is a partial chunk + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The last chunk is partial, so it is considered unsafe on mode "r+" + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is covering the border size (2 elements) + # and also the second chunk (3 elements), so it is valid + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The first chunk is not fully covering the first zarr chunk + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # Validate that the border condition is not affecting the "r+" mode + arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") From 604b8e16bcdc1f8565bf561b10e19185183e6efd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 20:20:55 +0000 Subject: [PATCH 06/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 14 +++++++++----- xarray/tests/test_backends.py | 17 ++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 98936aae31a..c66cd65e4ad 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,9 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode): +def _determine_zarr_chunks( + enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode +): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). @@ -163,7 +165,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks, regi if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks, region, mode) + return _determine_zarr_chunks( + None, var_chunks, ndim, name, safe_chunks, region, mode + ) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -276,7 +280,7 @@ def extract_zarr_variable_encoding( name=None, safe_chunks=True, region=None, - mode=None + mode=None, ): """ Extract zarr encoding dictionary from xarray Variable @@ -328,7 +332,7 @@ def extract_zarr_variable_encoding( name=name, safe_chunks=safe_chunks, region=region, - mode=mode + mode=mode, ) encoding["chunks"] = chunks return encoding @@ -864,7 +868,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, - mode=self._mode + mode=self._mode, ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a7f13c12f8a..3a3e16afe93 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6159,10 +6159,7 @@ def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(10)), - dims=["a"], - coords={"a": list(range(10))}, - name="foo" + list(range(10)), dims=["a"], coords={"a": list(range(10))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") @@ -6170,18 +6167,24 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # There are two Dask chunks on the same Zarr chunk, # which means that it is unsafe in any mode - arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr( + store, region="auto", mode=mode + ) with pytest.raises(ValueError): # the first chunk is covering the border size, but it is not # completely covering the second chunk, which means that it is # unsafe in any mode - arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr( + store, region="auto", mode=mode + ) with pytest.raises(ValueError): # The first chunk is safe but the other two chunks are overlapping with # the same Zarr chunk - arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr( + store, region="auto", mode=mode + ) # Fully update two contiguous chunks is safe in any mode arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) From a30b1e07df9fff306c247e69e421b6ac4de1598c Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 16:22:54 -0400 Subject: [PATCH 07/13] The test_extract_zarr_variable_encoding does not need to use the region parameter --- xarray/tests/test_backends.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a7f13c12f8a..032a24c037c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5496,26 +5496,24 @@ def test_encode_zarr_attr_value() -> None: @requires_zarr def test_extract_zarr_variable_encoding() -> None: - # The region is not useful in these cases, but I still think that it must be mandatory - # because the validation of the chunks is in the same function var = xr.Variable("x", [1, 2]) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) assert "chunks" in actual assert actual["chunks"] is None var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) assert actual["chunks"] == (1,) # does not raise on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) - actual = backends.zarr.extract_zarr_variable_encoding(var, region=tuple()) + actual = backends.zarr.extract_zarr_variable_encoding(var) # raises on invalid var = xr.Variable("x", [1, 2], encoding={"foo": (1,)}) with pytest.raises(ValueError, match=r"unexpected encoding parameters"): actual = backends.zarr.extract_zarr_variable_encoding( - var, raise_on_invalid=True, region=tuple() + var, raise_on_invalid=True ) From c781042a1250731ed26e3a674075813f2def4091 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 17:15:06 -0400 Subject: [PATCH 08/13] Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ --- xarray/backends/zarr.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c66cd65e4ad..756ce21bc9b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -195,12 +195,9 @@ def _determine_zarr_chunks( if var_chunks and enc_chunks_tuple: # If it is possible to write on partial chunks then it is not necessary to check # the last one contained on the region - allow_partial_chunks = True - end = -1 - if mode == "r+": - # This mode forces to write only on full chunks, even on the last one - allow_partial_chunks = False - end = None + allow_partial_chunks = mode != "r+" + # The r+ mode force to write only on full chunks, even on the last one + end = None if mode == "r+" else -1 base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " @@ -225,7 +222,7 @@ def _determine_zarr_chunks( if not allow_partial_chunks and first_border_size < zchunk: # If the border is smaller than zchunk, then it is a partial chunk write - raise ValueError(first_border_size) + raise ValueError(base_error) for dchunk in dchunks[:end]: if (dchunk - first_border_size) % zchunk: @@ -278,6 +275,7 @@ def extract_zarr_variable_encoding( variable, raise_on_invalid=False, name=None, + *, safe_chunks=True, region=None, mode=None, @@ -288,10 +286,10 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - region: tuple[slice], optional + name: str | Hashable, optional raise_on_invalid : bool, optional safe_chunks: bool, optional - name: str | Hashable, optional + region: tuple[slice], optional mode: str, optional Returns From c454cfef842f9f795460c5994f8bd0ccf0ad3cf4 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Fri, 20 Sep 2024 17:17:18 -0400 Subject: [PATCH 09/13] Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 756ce21bc9b..b10f3c8da94 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -286,8 +286,8 @@ def extract_zarr_variable_encoding( Parameters ---------- variable : Variable - name: str | Hashable, optional raise_on_invalid : bool, optional + name: str | Hashable, optional safe_chunks: bool, optional region: tuple[slice], optional mode: str, optional From cc585d0fb4b7003822d75c8199b30a0afb47b278 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 18:14:24 -0400 Subject: [PATCH 10/13] Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" --- xarray/backends/zarr.py | 55 ++++++++++++++++++++++------------- xarray/tests/test_backends.py | 41 ++++++++++++++++++++++---- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b10f3c8da94..e6fe93a398a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -113,7 +113,7 @@ def __getitem__(self, key): def _determine_zarr_chunks( - enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode + enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape ): """ Given encoding chunks (possibly None or []) and variable chunks @@ -166,7 +166,7 @@ def _determine_zarr_chunks( if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over return _determine_zarr_chunks( - None, var_chunks, ndim, name, safe_chunks, region, mode + None, var_chunks, ndim, name, safe_chunks, region, mode, shape ) for x in enc_chunks_tuple: @@ -208,29 +208,38 @@ def _determine_zarr_chunks( f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." ) - for zchunk, dchunks, interval in zip( - enc_chunks_tuple, var_chunks, region, strict=True + for zchunk, dchunks, interval, size in zip( + enc_chunks_tuple, var_chunks, region, shape, strict=True ): if not safe_chunks: continue - # The first border size is the amount of data that needs to be updated on the - # first chunk taking into account the region slice. - first_border_size = zchunk - if interval.start: - first_border_size = zchunk - interval.start % zchunk + for dchunk in dchunks[1:-1]: + if dchunk % zchunk: + raise ValueError(base_error) + + region_start = interval.start if interval.start else 0 - if not allow_partial_chunks and first_border_size < zchunk: - # If the border is smaller than zchunk, then it is a partial chunk write - raise ValueError(base_error) + if len(dchunks) > 1: + # The first border size is the amount of data that needs to be updated on the + # first chunk taking into account the region slice. + first_border_size = zchunk + if allow_partial_chunks: + first_border_size = zchunk - region_start % zchunk - for dchunk in dchunks[:end]: - if (dchunk - first_border_size) % zchunk: + if (dchunks[0] - first_border_size) % zchunk: raise ValueError(base_error) - # The first border is only useful during the first iteration, - # so ignore it in the next validations - first_border_size = 0 + if not allow_partial_chunks: + region_stop = interval.stop if interval.stop else size + cover_last_chunk = region_stop > size - size % zchunk + + if not cover_last_chunk: + if dchunks[-1] % zchunk: + raise ValueError(base_error) + elif dchunks[-1] % zchunk != size % zchunk: + # The remainder must be equal to the size of the last Zarr chunk + raise ValueError(base_error) return enc_chunks_tuple @@ -279,6 +288,7 @@ def extract_zarr_variable_encoding( safe_chunks=True, region=None, mode=None, + shape=None ): """ Extract zarr encoding dictionary from xarray Variable @@ -289,9 +299,9 @@ def extract_zarr_variable_encoding( raise_on_invalid : bool, optional name: str | Hashable, optional safe_chunks: bool, optional - region: tuple[slice], optional + region: tuple[slice, ...], optional mode: str, optional - + shape: tuple[int, ...], optional Returns ------- encoding : dict @@ -331,6 +341,7 @@ def extract_zarr_variable_encoding( safe_chunks=safe_chunks, region=region, mode=mode, + shape=shape ) encoding["chunks"] = chunks return encoding @@ -808,6 +819,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No v.encoding = {} zarr_array = None + zarr_shape = None write_region = self._write_region if self._write_region is not None else {} write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} @@ -852,6 +864,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No new_shape[append_axis] += v.shape[append_axis] zarr_array.resize(new_shape) + zarr_shape = zarr_array.shape + region = tuple(write_region[dim] for dim in dims) # We need to do this for both new and existing variables to ensure we're not @@ -862,11 +876,12 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # another one for extracting the encoding. encoding = extract_zarr_variable_encoding( v, - region=region, raise_on_invalid=vn in check_encoding_set, name=vn, safe_chunks=self._safe_chunks, + region=region, mode=self._mode, + shape=zarr_shape ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 78d50fcbdac..c04f71ae61c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6157,7 +6157,7 @@ def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(10)), dims=["a"], coords={"a": list(range(10))}, name="foo" + list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") @@ -6187,10 +6187,14 @@ def test_zarr_safe_chunk_region(tmp_path): # Fully update two contiguous chunks is safe in any mode arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) - # Write the last chunk partially is safe in "a" mode + # The last chunk is considered full based on their current size (2) + arr.isel(a=slice(9, 11)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(6, None)).chunk(a=-1).to_zarr(store, region="auto", mode=mode) + + # Write the last chunk of a region partially is safe in "a" mode arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): - # with "r+" mode it is invalid to write partial chunk even on the last one + # with "r+" mode it is invalid to write partial chunk arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") # This is safe with mode "a", the border size is covered by the first chunk of Dask @@ -6204,12 +6208,12 @@ def test_zarr_safe_chunk_region(tmp_path): arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): - # This is unsafe on mode "r+", because there is a single dask - # chunk smaller than the Zarr chunk + # This is unsafe on mode "r+", because the Dask chunk is partially writing + # in the first chunk of Zarr arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") # The first chunk is completely covering the first Zarr chunk - # and the last chunk is a partial chunk + # and the last chunk is a partial one arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): @@ -6227,3 +6231,28 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # Validate that the border condition is not affecting the "r+" mode arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Validate that even if we write with a single Dask chunk on the last Zarr + # chunk it is still unsafe if it is not fully covering it + # (the last Zarr chunk has size 2) + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") + + # Validate the same than the above test but in the beginning of the last chunk + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Test that even a Dask chunk that covers the last Zarr chunk can be unsafe + # if it is partial covering other Zarr chunks + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # If the chunk is of size equal to the one in the Zarr encoding, but + # it is partially writing in the last chunk then raise an error + arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") + + From 9302036426847f3fbde31915660e23e826684633 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Sep 2024 22:15:01 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 6 +++--- xarray/tests/test_backends.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e6fe93a398a..775bd1e6d80 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -288,7 +288,7 @@ def extract_zarr_variable_encoding( safe_chunks=True, region=None, mode=None, - shape=None + shape=None, ): """ Extract zarr encoding dictionary from xarray Variable @@ -341,7 +341,7 @@ def extract_zarr_variable_encoding( safe_chunks=safe_chunks, region=region, mode=mode, - shape=shape + shape=shape, ) encoding["chunks"] = chunks return encoding @@ -881,7 +881,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No safe_chunks=self._safe_chunks, region=region, mode=self._mode, - shape=zarr_shape + shape=zarr_shape, ) if name not in existing_keys: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c04f71ae61c..6529dd74c21 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6254,5 +6254,3 @@ def test_zarr_safe_chunk_region(tmp_path): # If the chunk is of size equal to the one in the Zarr encoding, but # it is partially writing in the last chunk then raise an error arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") - - From 0b4b9b1f9bb61becc39ad3bea6da9775cb82a72d Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 19:54:22 -0400 Subject: [PATCH 12/13] Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" --- xarray/backends/zarr.py | 21 ++++++++++++++------- xarray/tests/test_backends.py | 9 ++++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e6fe93a398a..197f735f950 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -196,8 +196,6 @@ def _determine_zarr_chunks( # If it is possible to write on partial chunks then it is not necessary to check # the last one contained on the region allow_partial_chunks = mode != "r+" - # The r+ mode force to write only on full chunks, even on the last one - end = None if mode == "r+" else -1 base_error = ( f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " @@ -231,14 +229,21 @@ def _determine_zarr_chunks( raise ValueError(base_error) if not allow_partial_chunks: + chunk_start = sum(dchunks[:-1]) + region_start + if chunk_start % zchunk: + # The last chunk which can also be the only one is a partial chunk + # if it is not aligned at the beginning + raise ValueError(base_error) + region_stop = interval.stop if interval.stop else size - cover_last_chunk = region_stop > size - size % zchunk - if not cover_last_chunk: - if dchunks[-1] % zchunk: + if size - region_stop + 1 < zchunk: + # If the region is covering the last chunk then check + # if the reminder with the default chunk size + # is equal to the size of the last chunk + if dchunks[-1] % zchunk != size % zchunk: raise ValueError(base_error) - elif dchunks[-1] % zchunk != size % zchunk: - # The remainder must be equal to the size of the last Zarr chunk + elif dchunks[-1] % zchunk: raise ValueError(base_error) return enc_chunks_tuple @@ -307,6 +312,8 @@ def extract_zarr_variable_encoding( encoding : dict Zarr encoding for `variable` """ + + shape = shape if shape else variable.shape encoding = variable.encoding.copy() safe_to_drop = {"source", "original_shape"} diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c04f71ae61c..919317fb0d0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6199,14 +6199,12 @@ def test_zarr_safe_chunk_region(tmp_path): # This is safe with mode "a", the border size is covered by the first chunk of Dask arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): # This is considered unsafe in mode "r+" because it is writing in a partial chunk arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") # This is safe on mode "a" because there is a single dask chunk arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): # This is unsafe on mode "r+", because the Dask chunk is partially writing # in the first chunk of Zarr @@ -6239,7 +6237,7 @@ def test_zarr_safe_chunk_region(tmp_path): # (the last Zarr chunk has size 2) arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") - # Validate the same than the above test but in the beginning of the last chunk + # Validate the same as the above test but in the beginning of the last chunk arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") with pytest.raises(ValueError): arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") @@ -6252,7 +6250,8 @@ def test_zarr_safe_chunk_region(tmp_path): with pytest.raises(ValueError): # If the chunk is of size equal to the one in the Zarr encoding, but - # it is partially writing in the last chunk then raise an error + # it is partially writing in the first chunk then raise an error arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") - + with pytest.raises(ValueError): + arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") From 23a864aa9b1ce298be58506203c31abed6499d76 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Sat, 21 Sep 2024 20:19:55 -0400 Subject: [PATCH 13/13] Add a typehint to the modes to avoid issues with mypy --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 919317fb0d0..ccf1bc73dd6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6161,7 +6161,8 @@ def test_zarr_safe_chunk_region(tmp_path): ).chunk(a=3) arr.to_zarr(store, mode="w") - for mode in ["r+", "a"]: + modes: list[Literal["r+", "a"]] = ["r+", "a"] + for mode in modes: with pytest.raises(ValueError): # There are two Dask chunks on the same Zarr chunk, # which means that it is unsafe in any mode