From d973f71fb78ac11d27ee35d9f35e17b73feb803f Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 30 Oct 2023 14:40:41 -0400 Subject: [PATCH 1/4] Allow kerchunk attrs to be passed in the data-cube ext -following pattern in xstac --- .gitignore | 3 +- README.md | 15 + environment.yaml | 4 +- tests/conftest.py | 6 + .../data-cube-kerchunk-item-collection.json | 16021 ++++++++++++++++ tests/test_core.py | 8 + xpystac/core.py | 50 +- 7 files changed, 16100 insertions(+), 7 deletions(-) create mode 100644 tests/data/data-cube-kerchunk-item-collection.json diff --git a/.gitignore b/.gitignore index 4a0c389..61f3202 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ dist .direnv .mypy_cache .pytest_cache -.ruff_cache \ No newline at end of file +.ruff_cache +*.egg-info/ \ No newline at end of file diff --git a/README.md b/README.md index dbb94c1..ca3bd67 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,21 @@ xr.open_dataset(asset) ``` ref: https://planetarycomputer.microsoft.com/docs/quickstarts/reading-zarr-data/ + +Here is an example using the new approach of storing kerchunked metadata within the data-cube extension: + +```python +import pystac +import xarray as xr + +path = "https://raw.githubusercontent.com/stac-utils/xpystac/main/tests/data/data-cube-kerchunk-item-collection.json" +item_collection = pystac.ItemCollection.from_file(path) + +ds = xr.open_dataset(item_collection) +ds +``` + + ## Install ```bash diff --git a/environment.yaml b/environment.yaml index c32d731..ca2c72f 100644 --- a/environment.yaml +++ b/environment.yaml @@ -1,4 +1,4 @@ -name: xpystac-broken +name: xpystac channels: - conda-forge - nodefaults @@ -11,6 +11,7 @@ dependencies: - adlfs - aiohttp - fsspec + - kerchunk - odc-stac - planetary-computer - pystac-client @@ -18,6 +19,7 @@ dependencies: - rioxarray - stackstac - urllib3<2 # temporary pin https://github.com/stac-utils/pystac-client/issues/509 + - xstac - zarr # testing - pytest diff --git a/tests/conftest.py b/tests/conftest.py index c32568d..6e55f05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,3 +57,9 @@ def simple_zarr() -> pystac.Asset: def complex_zarr(simple_zarr) -> pystac.Asset: simple_zarr.extra_fields["xarray:open_kwargs"]["engine"] = "zarr" return simple_zarr + + +@pytest.fixture(scope="module") +def data_cube_kerchunk() -> pystac.ItemCollection: + path = "tests/data/data-cube-kerchunk-item-collection.json" + return pystac.ItemCollection.from_file(path) diff --git a/tests/data/data-cube-kerchunk-item-collection.json b/tests/data/data-cube-kerchunk-item-collection.json new file mode 100644 index 0000000..0ff2811 --- /dev/null +++ b/tests/data/data-cube-kerchunk-item-collection.json @@ -0,0 +1,16021 @@ +{ + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "20230920-short_range-conus-channel_rt-0-1", + "properties": { + "nwm:model_configuration": "short_range", + "nwm:model_domain": "conus", + "nwm:file_output_type": "channel_rt", + "nwm:forecast_hour": 1, + "cube:dimensions": { + "time": { + "extent": [ + "2023-09-20T01:00:00Z", + "2023-09-20T01:00:00Z" + ], + "description": "valid output time", + "type": "temporal", + "kerchunk:zarray": { + "chunks": [ + 1024 + ], + "compressor": { + "id": "zlib", + "level": 2 + }, + "dtype": " xarray.Dataset: """Given a pystac object return an xarray dataset @@ -18,24 +19,60 @@ def to_xarray( When stacking multiple items, an optional ``stacking_library`` argument is accepted. It defaults to ``odc.stac`` if available and otherwise ``stackstac``. Control the behavior by setting ``stacking_library`` + + User ``allow_kerchunk`` (True by default) to control whether this reader tries to + interpret kerchunk attributes if provided (either in the data-cube extension or + as a full asset with ``references`` or ``index`` as the role). """ if _is_item_search(obj): item_collection = obj.item_collection() - return to_xarray(item_collection, stacking_library=stacking_library, **kwargs) + return to_xarray( + item_collection, + stacking_library=stacking_library, + allow_kerchunk=allow_kerchunk, + **kwargs, + ) raise TypeError @to_xarray.register(pystac.Item) @to_xarray.register(pystac.ItemCollection) +@to_xarray.register(list) def _( - obj: Union[pystac.Item, pystac.ItemCollection], + obj: Union[pystac.Item, pystac.ItemCollection, List[pystac.Item]], drop_variables: Union[str, List[str], None] = None, stacking_library: Union[Literal["odc.stac", "stackstac"], None] = None, + allow_kerchunk: bool = True, **kwargs, ) -> xarray.Dataset: if drop_variables is not None: raise KeyError("``drop_variables`` not implemented for pystac items") + if allow_kerchunk: + first_obj = obj if isinstance(obj, pystac.Item) else next(i for i in obj) + is_kerchunked = any("kerchunk:" in k for k in first_obj.properties.keys()) + if is_kerchunked: + kerchunk_combine = _import_optional_dependency("kerchunk.combine") + xstac = _import_optional_dependency("xstac") + fsspec = _import_optional_dependency("fsspec") + + if isinstance(obj, (list, pystac.ItemCollection)): + refs = kerchunk_combine.MultiZarrToZarr( + [xstac.kerchunk.stac_to_kerchunk(item) for item in obj], + concat_dims=kwargs.get("concat_dims", "time"), + ).translate() + else: + refs = xstac.kerchunk.stac_to_kerchunk(obj) + + mapper = fsspec.filesystem("reference", fo=refs).get_mapper() + default_kwargs = { + "chunks": {}, + "engine": "zarr", + "consolidated": False, + } + + return xarray.open_dataset(mapper, **{**default_kwargs, **kwargs}) + if stacking_library is None: try: _import_optional_dependency("odc.stac") @@ -62,6 +99,7 @@ def _( def _( obj: pystac.Asset, stacking_library: Union[Literal["odc.stac", "stackstac"], None] = None, + allow_kerchunk: bool = True, **kwargs, ) -> xarray.Dataset: default_kwargs: Mapping = {"chunks": {}} @@ -71,8 +109,10 @@ def _( if storage_options: open_kwargs["storage_options"] = storage_options - if obj.media_type == pystac.MediaType.JSON and {"index", "references"}.intersection( - set(obj.roles) if obj.roles else set() + if ( + allow_kerchunk + and obj.media_type == pystac.MediaType.JSON + and {"index", "references"}.intersection(set(obj.roles) if obj.roles else set()) ): requests = _import_optional_dependency("requests") fsspec = _import_optional_dependency("fsspec") @@ -85,7 +125,7 @@ def _( except ImportError: refs = r.json() - mapper = fsspec.get_mapper("reference://", fo=refs) + mapper = fsspec.filesystem("reference", fo=refs).get_mapper() default_kwargs = { **default_kwargs, "engine": "zarr", From e73543d9ccd5ad7449107f2f181c47d4c83f328e Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 30 Oct 2023 15:06:08 -0400 Subject: [PATCH 2/4] Vendor in xstac.kerchunk.stac_to_kerchunk --- environment.yaml | 1 - xpystac/_xstac_kerchunk.py | 28 ++++++++++++++++++++++++++++ xpystac/core.py | 6 +++--- 3 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 xpystac/_xstac_kerchunk.py diff --git a/environment.yaml b/environment.yaml index ca2c72f..a92ec15 100644 --- a/environment.yaml +++ b/environment.yaml @@ -19,7 +19,6 @@ dependencies: - rioxarray - stackstac - urllib3<2 # temporary pin https://github.com/stac-utils/pystac-client/issues/509 - - xstac - zarr # testing - pytest diff --git a/xpystac/_xstac_kerchunk.py b/xpystac/_xstac_kerchunk.py new file mode 100644 index 0000000..e095639 --- /dev/null +++ b/xpystac/_xstac_kerchunk.py @@ -0,0 +1,28 @@ +import json +from typing import Any + +import pystac + + +def _stac_to_kerchunk(item: pystac.Item, kerchunk_version: int = 1) -> dict[str, Any]: + """ + Copied from xstac to support python < 3.10 + ref: https://github.com/stac-utils/xstac/blob/1.2.0/xstac/_kerchunk.py + + Derive Kerchunk indices from a STAC item. + """ + refs = {} + refs[".zgroup"] = json.dumps(item.properties["kerchunk:zgroup"]) + refs[".zattrs"] = json.dumps(item.properties["kerchunk:zattrs"]) + + for attr in ["cube:dimensions", "cube:variables"]: + cd = item.properties[attr] + for k in cd: + refs[f"{k}/.zarray"] = json.dumps(cd[k]["kerchunk:zarray"]) + # TODO: derive from datacube stuff, ARRAY_DIMENSIONS + refs[f"{k}/.zattrs"] = json.dumps(cd[k]["kerchunk:zattrs"]) + for i in cd[k]["kerchunk:value"]: + refs[f"{k}/{i}"] = cd[k]["kerchunk:value"][i] + + d = {"version": kerchunk_version, "refs": refs} + return d diff --git a/xpystac/core.py b/xpystac/core.py index 5ccf663..29ee73a 100644 --- a/xpystac/core.py +++ b/xpystac/core.py @@ -4,6 +4,7 @@ import pystac import xarray +from xpystac._xstac_kerchunk import _stac_to_kerchunk from xpystac.utils import _import_optional_dependency, _is_item_search @@ -53,16 +54,15 @@ def _( is_kerchunked = any("kerchunk:" in k for k in first_obj.properties.keys()) if is_kerchunked: kerchunk_combine = _import_optional_dependency("kerchunk.combine") - xstac = _import_optional_dependency("xstac") fsspec = _import_optional_dependency("fsspec") if isinstance(obj, (list, pystac.ItemCollection)): refs = kerchunk_combine.MultiZarrToZarr( - [xstac.kerchunk.stac_to_kerchunk(item) for item in obj], + [_stac_to_kerchunk(item) for item in obj], concat_dims=kwargs.get("concat_dims", "time"), ).translate() else: - refs = xstac.kerchunk.stac_to_kerchunk(obj) + refs = _stac_to_kerchunk(obj) mapper = fsspec.filesystem("reference", fo=refs).get_mapper() default_kwargs = { From b0dc84fc0ebcf93d7e9af285f66a299be4a21419 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 30 Oct 2023 15:12:11 -0400 Subject: [PATCH 3/4] Fix dict for python 3.8 --- xpystac/_xstac_kerchunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xpystac/_xstac_kerchunk.py b/xpystac/_xstac_kerchunk.py index e095639..bad1116 100644 --- a/xpystac/_xstac_kerchunk.py +++ b/xpystac/_xstac_kerchunk.py @@ -1,10 +1,10 @@ import json -from typing import Any +from typing import Any, Dict import pystac -def _stac_to_kerchunk(item: pystac.Item, kerchunk_version: int = 1) -> dict[str, Any]: +def _stac_to_kerchunk(item: pystac.Item, kerchunk_version: int = 1) -> Dict[str, Any]: """ Copied from xstac to support python < 3.10 ref: https://github.com/stac-utils/xstac/blob/1.2.0/xstac/_kerchunk.py From b170dff1591d7de24fbdd7707aaf2c520c4f3830 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 30 Oct 2023 15:15:37 -0400 Subject: [PATCH 4/4] Be sure to test just one item --- tests/test_core.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 428bf93..a97220f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -63,9 +63,18 @@ def test_to_xarray_zarr_with_open_kwargs_engine(complex_zarr): assert ds -def test_to_xarray_with_kerchunk_attrs_in_data_cube(data_cube_kerchunk): +def test_to_xarray_with_item_collection_with_kerchunk_attrs_in_data_cube( + data_cube_kerchunk, +): ds = to_xarray(data_cube_kerchunk) assert ds + +def test_to_xarray_with_list_with_kerchunk_attrs_in_data_cube(data_cube_kerchunk): ds = to_xarray([i for i in data_cube_kerchunk]) assert ds + + +def test_to_xarray_with_item_with_kerchunk_attrs_in_data_cube(data_cube_kerchunk): + ds = to_xarray([i for i in data_cube_kerchunk][-1]) + assert ds