diff --git a/docs/releases.rst b/docs/releases.rst index d41ae206..d3088147 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -29,6 +29,8 @@ Bug fixes By `Tom Nicholas `_. - Ensure that `.attrs` on coordinate variables are preserved during round-tripping. (:issue:`155`, :pull:`154`) By `Tom Nicholas `_. +- Ensure that non-dimension coordinate variables described via the CF conventions are preserved during round-tripping. (:issue:`105`, :pull:`156`) + By `Tom Nicholas `_. Documentation ~~~~~~~~~~~~~ diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index ded76e3e..f6683690 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -217,11 +217,18 @@ def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: all_arr_refs.update(prepended_with_var_name) + zattrs = ds.attrs + if ds.coords: + coord_names = list(ds.coords) + # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored + # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 + zattrs["coordinates"] = " ".join(coord_names) + ds_refs = { "version": 1, "refs": { ".zgroup": '{"zarr_format":2}', - ".zattrs": ujson.dumps(ds.attrs), + ".zattrs": ujson.dumps(zattrs), **all_arr_refs, }, } diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 50c0d647..9023feed 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import xarray as xr import xarray.testing as xrt @@ -95,6 +96,31 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): # assert identical to original dataset xrt.assert_identical(roundtrip, ds) + def test_non_dimension_coordinates(self, tmpdir, format): + # regression test for GH issue #105 + + # set up example xarray dataset containing non-dimension coordinate variables + ds = 
xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))}) + + # save it to disk as netCDF (in temporary directory) + ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") + + vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={}) + + assert "lat" in vds.coords + assert "coordinates" not in vds.attrs + + # write those references to disk as kerchunk references format + vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format) + + # use fsspec to read the dataset from disk via the kerchunk references + roundtrip = xr.open_dataset( + f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False + ) + + # assert identical to original dataset + xrt.assert_identical(roundtrip, ds) + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100 diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 85f27c0d..d8bf8609 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -117,6 +117,7 @@ def open_virtual_dataset( virtual_array_class=virtual_array_class, ) ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) if indexes is None or len(loadable_variables) > 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... 
@@ -152,7 +153,7 @@ def open_virtual_dataset( vars = {**virtual_vars, **loadable_vars} - data_vars, coords = separate_coords(vars, indexes) + data_vars, coords = separate_coords(vars, indexes, coord_names) vds = xr.Dataset( data_vars, @@ -177,6 +178,7 @@ def open_virtual_dataset_from_v3_store( _storepath = Path(storepath) ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) # TODO recursive glob to create a datatree # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it @@ -205,7 +207,7 @@ def open_virtual_dataset_from_v3_store( else: indexes = dict(**indexes) # for type hinting: to allow mutation - data_vars, coords = separate_coords(vars, indexes) + data_vars, coords = separate_coords(vars, indexes, coord_names) ds = xr.Dataset( data_vars, @@ -223,8 +225,10 @@ def virtual_vars_from_kerchunk_refs( virtual_array_class=ManifestArray, ) -> Mapping[str, xr.Variable]: """ - Translate a store-level kerchunk reference dict into aa set of xarray Variables containing virtualized arrays. + Translate a store-level kerchunk reference dict into a set of xarray Variables containing virtualized arrays. + Parameters + ---------- drop_variables: list[str], default is None Variables in the file to drop before returning. 
virtual_array_class @@ -263,12 +267,12 @@ def dataset_from_kerchunk_refs( """ vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) if indexes is None: indexes = {} - data_vars, coords = separate_coords(vars, indexes) - - ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + data_vars, coords = separate_coords(vars, indexes, coord_names) vds = xr.Dataset( data_vars, @@ -301,6 +305,7 @@ def variable_from_kerchunk_refs( def separate_coords( vars: Mapping[str, xr.Variable], indexes: MutableMapping[str, Index], + coord_names: Iterable[str] | None = None, ) -> tuple[Mapping[str, xr.Variable], xr.Coordinates]: """ Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. @@ -310,8 +315,8 @@ def separate_coords( Will also preserve any loaded variables and indexes it is passed. """ - # this would normally come from CF decoding, let's hope the fact we're skipping that doesn't cause any problems... - coord_names: list[str] = [] + if coord_names is None: + coord_names = [] # split data and coordinate variables (promote dimension coordinates) data_vars = {}