diff --git a/.gitignore b/.gitignore index 4a0c389..61f3202 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ dist .direnv .mypy_cache .pytest_cache -.ruff_cache \ No newline at end of file +.ruff_cache +*.egg-info/ \ No newline at end of file diff --git a/README.md b/README.md index dbb94c1..ca3bd67 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,21 @@ xr.open_dataset(asset) ``` ref: https://planetarycomputer.microsoft.com/docs/quickstarts/reading-zarr-data/ + +Here is an example using the new approach of storing kerchunked metadata within the data-cube extension: + +```python +import pystac +import xarray as xr + +path = "https://raw.githubusercontent.com/stac-utils/xpystac/main/tests/data/data-cube-kerchunk-item-collection.json" +item_collection = pystac.ItemCollection.from_file(path) + +ds = xr.open_dataset(item_collection) +ds +``` + + ## Install ```bash diff --git a/environment.yaml b/environment.yaml index c32d731..a92ec15 100644 --- a/environment.yaml +++ b/environment.yaml @@ -1,4 +1,4 @@ -name: xpystac-broken +name: xpystac channels: - conda-forge - nodefaults @@ -11,6 +11,7 @@ dependencies: - adlfs - aiohttp - fsspec + - kerchunk - odc-stac - planetary-computer - pystac-client diff --git a/tests/conftest.py b/tests/conftest.py index c32568d..6e55f05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,3 +57,9 @@ def simple_zarr() -> pystac.Asset: def complex_zarr(simple_zarr) -> pystac.Asset: simple_zarr.extra_fields["xarray:open_kwargs"]["engine"] = "zarr" return simple_zarr + + +@pytest.fixture(scope="module") +def data_cube_kerchunk() -> pystac.ItemCollection: + path = "tests/data/data-cube-kerchunk-item-collection.json" + return pystac.ItemCollection.from_file(path) diff --git a/tests/data/data-cube-kerchunk-item-collection.json b/tests/data/data-cube-kerchunk-item-collection.json new file mode 100644 index 0000000..0ff2811 --- /dev/null +++ b/tests/data/data-cube-kerchunk-item-collection.json @@ -0,0 +1,16021 @@ +{ + "type": 
"FeatureCollection", + "features": [ + { + "type": "Feature", + "stac_version": "1.0.0", + "id": "20230920-short_range-conus-channel_rt-0-1", + "properties": { + "nwm:model_configuration": "short_range", + "nwm:model_domain": "conus", + "nwm:file_output_type": "channel_rt", + "nwm:forecast_hour": 1, + "cube:dimensions": { + "time": { + "extent": [ + "2023-09-20T01:00:00Z", + "2023-09-20T01:00:00Z" + ], + "description": "valid output time", + "type": "temporal", + "kerchunk:zarray": { + "chunks": [ + 1024 + ], + "compressor": { + "id": "zlib", + "level": 2 + }, + "dtype": " Dict[str, Any]: + """ + Copied from xstac to support python < 3.10 + ref: https://github.com/stac-utils/xstac/blob/1.2.0/xstac/_kerchunk.py + + Derive Kerchunk indices from a STAC item. + """ + refs = {} + refs[".zgroup"] = json.dumps(item.properties["kerchunk:zgroup"]) + refs[".zattrs"] = json.dumps(item.properties["kerchunk:zattrs"]) + + for attr in ["cube:dimensions", "cube:variables"]: + cd = item.properties[attr] + for k in cd: + refs[f"{k}/.zarray"] = json.dumps(cd[k]["kerchunk:zarray"]) + # TODO: derive from datacube stuff, ARRAY_DIMENSIONS + refs[f"{k}/.zattrs"] = json.dumps(cd[k]["kerchunk:zattrs"]) + for i in cd[k]["kerchunk:value"]: + refs[f"{k}/{i}"] = cd[k]["kerchunk:value"][i] + + d = {"version": kerchunk_version, "refs": refs} + return d diff --git a/xpystac/core.py b/xpystac/core.py index aacdf5c..29ee73a 100644 --- a/xpystac/core.py +++ b/xpystac/core.py @@ -4,6 +4,7 @@ import pystac import xarray +from xpystac._xstac_kerchunk import _stac_to_kerchunk from xpystac.utils import _import_optional_dependency, _is_item_search @@ -11,6 +12,7 @@ def to_xarray( obj, stacking_library: Union[Literal["odc.stac", "stackstac"], None] = None, + allow_kerchunk: bool = True, **kwargs, ) -> xarray.Dataset: """Given a pystac object return an xarray dataset @@ -18,24 +20,59 @@ def to_xarray( When stacking multiple items, an optional ``stacking_library`` argument is accepted. 
It defaults to ``odc.stac`` if available and otherwise ``stackstac``. Control the behavior by setting ``stacking_library`` + + Use ``allow_kerchunk`` (True by default) to control whether this reader tries to + interpret kerchunk attributes if provided (either in the data-cube extension or + as a full asset with ``references`` or ``index`` as the role). """ if _is_item_search(obj): item_collection = obj.item_collection() - return to_xarray(item_collection, stacking_library=stacking_library, **kwargs) + return to_xarray( + item_collection, + stacking_library=stacking_library, + allow_kerchunk=allow_kerchunk, + **kwargs, + ) raise TypeError @to_xarray.register(pystac.Item) @to_xarray.register(pystac.ItemCollection) +@to_xarray.register(list) def _( - obj: Union[pystac.Item, pystac.ItemCollection], + obj: Union[pystac.Item, pystac.ItemCollection, List[pystac.Item]], drop_variables: Union[str, List[str], None] = None, stacking_library: Union[Literal["odc.stac", "stackstac"], None] = None, + allow_kerchunk: bool = True, **kwargs, ) -> xarray.Dataset: if drop_variables is not None: raise KeyError("``drop_variables`` not implemented for pystac items") + if allow_kerchunk: + first_obj = obj if isinstance(obj, pystac.Item) else next(i for i in obj) + is_kerchunked = any("kerchunk:" in k for k in first_obj.properties.keys()) + if is_kerchunked: + kerchunk_combine = _import_optional_dependency("kerchunk.combine") + fsspec = _import_optional_dependency("fsspec") + + if isinstance(obj, (list, pystac.ItemCollection)): + refs = kerchunk_combine.MultiZarrToZarr( + [_stac_to_kerchunk(item) for item in obj], + concat_dims=kwargs.get("concat_dims", "time"), + ).translate() + else: + refs = _stac_to_kerchunk(obj) + + mapper = fsspec.filesystem("reference", fo=refs).get_mapper() + default_kwargs = { + "chunks": {}, + "engine": "zarr", + "consolidated": False, + } + + return xarray.open_dataset(mapper, **{**default_kwargs, **kwargs}) + if stacking_library is None: try: 
_import_optional_dependency("odc.stac") @@ -62,6 +99,7 @@ def _( def _( obj: pystac.Asset, stacking_library: Union[Literal["odc.stac", "stackstac"], None] = None, + allow_kerchunk: bool = True, **kwargs, ) -> xarray.Dataset: default_kwargs: Mapping = {"chunks": {}} @@ -71,8 +109,10 @@ def _( if storage_options: open_kwargs["storage_options"] = storage_options - if obj.media_type == pystac.MediaType.JSON and {"index", "references"}.intersection( - set(obj.roles) if obj.roles else set() + if ( + allow_kerchunk + and obj.media_type == pystac.MediaType.JSON + and {"index", "references"}.intersection(set(obj.roles) if obj.roles else set()) ): requests = _import_optional_dependency("requests") fsspec = _import_optional_dependency("fsspec") @@ -85,7 +125,7 @@ def _( except ImportError: refs = r.json() - mapper = fsspec.get_mapper("reference://", fo=refs) + mapper = fsspec.filesystem("reference", fo=refs).get_mapper() default_kwargs = { **default_kwargs, "engine": "zarr",