diff --git a/docs/source/explanation/esm-catalog-spec.md b/docs/source/explanation/esm-catalog-spec.md index 76a991a0..2245e7f5 100644 --- a/docs/source/explanation/esm-catalog-spec.md +++ b/docs/source/explanation/esm-catalog-spec.md @@ -96,11 +96,11 @@ The column names can optionally be associated with a controlled vocabulary, such An assets object describes the columns in the CSV file relevant for opening the actual data files. -| Element | Type | Description | -| ------------------ | ------ | ---------------------------------------------------------------------------------------------------------------------------------- | -| column_name | string | **REQUIRED.** The name of the column containing the path to the asset. Must be in the header of the CSV file. | -| format | string | The data format. Valid values are `netcdf` and `zarr`. If specified, it means that all data in the catalog is the same type. | -| format_column_name | string | The column name which contains the data format, allowing for variable data types in one catalog. Mutually exclusive with `format`. | +| Element | Type | Description | +| ------------------ | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| column_name | string | **REQUIRED.** The name of the column containing the path to the asset. Must be in the header of the CSV file. | +| format | string | The data format. Valid values are `netcdf`, `zarr`, or `reference` ([`kerchunk`](https://github.com/fsspec/kerchunk) reference files). If specified, it means that all data in the catalog is the same type. | +| format_column_name | string | The column name which contains the data format, allowing for variable data types in one catalog. Mutually exclusive with `format`. | ### Aggregation Control Object diff --git a/intake_esm/source.py b/intake_esm/source.py index aa801e94..2e2419d7 100644 --- a/intake_esm/source.py +++ b/intake_esm/source.py @@ -44,10 +44,19 @@ def _open_dataset( requested_variables=None, additional_attrs=None, expand_dims=None, + data_format=None, ): _can_be_local = fsspec.utils.can_be_local(urlpath) storage_options = xarray_open_kwargs.get('backend_kwargs', {}).get('storage_options', {}) + + # Support kerchunk datasets, setting the file object (fo) and urlpath + if data_format == 'reference': + if 'storage_options' not in xarray_open_kwargs.keys(): + xarray_open_kwargs['storage_options'] = {} + xarray_open_kwargs['storage_options']['fo'] = urlpath + urlpath = 'reference://' + if xarray_open_kwargs['engine'] == 'zarr': url = urlpath elif _can_be_local: @@ -220,6 +229,7 @@ def _open_dataset(self): if agg.type.value == 'join_new' }, requested_variables=self.requested_variables, + data_format=record['_data_format_'], additional_attrs=record.to_dict(), ) for _, record in self.df.iterrows() diff --git a/tests/sample_data/kerchunk-files/noaa-nwm-test-reference.json b/tests/sample_data/kerchunk-files/noaa-nwm-test-reference.json new file mode 100644 index 00000000..65064c1c --- /dev/null +++ b/tests/sample_data/kerchunk-files/noaa-nwm-test-reference.json @@ -0,0 +1,1562 @@ +{ + "version": 1, + "refs": { + ".zgroup": "{\"zarr_format\":2}", + "time/.zarray": "{\n \"chunks\": [\n 10\n ],\n \"compressor\": null,\n \"dtype\": \"