diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 8dd431c5f62..de4231669e4 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -434,9 +434,9 @@ def open_dataset(
     """
     if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2":
         kwargs = locals().copy()
-        from . import apiv2
+        from . import apiv2, plugins
 
-        if engine in apiv2.ENGINES:
+        if engine in plugins.ENGINES:
             return apiv2.open_dataset(**kwargs)
 
     if autoclose is not None:
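The hunk above only reroutes the engine lookup from ``apiv2.ENGINES`` to the new ``plugins.ENGINES`` registry; the v2 code path itself stays opt-in via the environment variable. A minimal sketch of how a user would exercise the dispatch (the file name is a placeholder, and this assumes an engine that is registered in ``plugins.ENGINES``, e.g. "h5netcdf"):

    import os

    os.environ["XARRAY_BACKEND_API"] = "v2"  # opt in to the experimental v2 path

    import xarray as xr

    # The check happens at call time: engines present in plugins.ENGINES
    # ("h5netcdf", "zarr", "cfgrib") are routed to apiv2.open_dataset(); any
    # other engine falls through to the stable v1 implementation.
    ds = xr.open_dataset("example.nc", engine="h5netcdf")

Since ``os.environ.get`` is evaluated inside ``open_dataset``, the variable can also be set after importing xarray.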
diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py
index 2f34cc285ff..7e4605c42ce 100644
--- a/xarray/backends/apiv2.py
+++ b/xarray/backends/apiv2.py
@@ -1,7 +1,7 @@
 import os
 
 from ..core.utils import is_remote_uri
-from . import cfgrib_, h5netcdf_, zarr
+from . import plugins, zarr
 from .api import (
     _autodetect_engine,
     _get_backend_cls,
@@ -9,12 +9,6 @@
     _protect_dataset_variables_inplace,
 )
 
-ENGINES = {
-    "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf,
-    "zarr": zarr.open_backend_dataset_zarr,
-    "cfgrib": cfgrib_.open_backend_dataset_cfgrib,
-}
-
 
 def dataset_from_backend_dataset(
     ds,
@@ -23,7 +17,7 @@ def dataset_from_backend_dataset(
     chunks,
     cache,
     overwrite_encoded_chunks,
-    extra_tokens,
+    **extra_tokens,
 ):
     if not (isinstance(chunks, (int, dict)) or chunks is None):
         if chunks != "auto":
@@ -73,17 +67,34 @@ def dataset_from_backend_dataset(
     # Ensure source filename always stored in dataset object (GH issue #2550)
     if "source" not in ds.encoding:
         if isinstance(filename_or_obj, str):
-            ds.encoding["source"] = filename_or_obj
+            ds2.encoding["source"] = filename_or_obj
 
     return ds2
 
 
+def resolve_decoders_kwargs(decode_cf, engine, **decoders):
+    signature = plugins.ENGINES[engine]["signature"]
+    if decode_cf is False:
+        for d in decoders:
+            if d in signature:
+                decoders[d] = False
+    return {k: v for k, v in decoders.items() if v is not None}
+
+
 def open_dataset(
     filename_or_obj,
     *,
     engine=None,
     chunks=None,
     cache=None,
+    decode_cf=None,
+    mask_and_scale=None,
+    decode_times=None,
+    decode_timedelta=None,
+    use_cftime=None,
+    concat_characters=None,
+    decode_coords=None,
+    drop_variables=None,
     backend_kwargs=None,
     **kwargs,
 ):
@@ -94,70 +105,50 @@ def open_dataset(
     filename_or_obj : str, Path, file-like or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
-        ends with .gz, in which case the file is gunzipped and opened with
+        ends with .gz, in which case the file is unzipped and opened with
         scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
         objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
-    group : str, optional
-        Path to the netCDF4 group in the given file to open (only works for
-        netCDF4 files).
-    decode_cf : bool, optional
-        Whether to decode these variables, assuming they were saved according
-        to CF conventions.
-    mask_and_scale : bool, optional
-        If True, replace array values equal to `_FillValue` with NA and scale
-        values according to the formula `original_values * scale_factor +
-        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
-        taken from variable attributes (if they exist). If the `_FillValue` or
-        `missing_value` attribute contains multiple values a warning will be
-        issued and all array values matching one of the multiple values will
-        be replaced by NA. mask_and_scale defaults to True except for the
-        pseudonetcdf backend.
-    decode_times : bool, optional
-        If True, decode times encoded in the standard NetCDF datetime format
-        into datetime objects. Otherwise, leave them encoded as numbers.
-    autoclose : bool, optional
-        If True, automatically close files to avoid OS Error of too many files
-        being open. However, this option doesn't work with streams, e.g.,
-        BytesIO.
-    concat_characters : bool, optional
-        If True, concatenate along the last dimension of character arrays to
-        form string arrays. Dimensions will only be concatenated over (and
-        removed) if they have no corresponding variable and if they are only
-        used as the last dimension of character arrays.
-    decode_coords : bool, optional
-        If True, decode the 'coordinates' attribute to identify coordinates in
-        the resulting dataset.
-    engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \
-        "pseudonetcdf", "zarr"}, optional
+    engine : str, optional
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
-        "netcdf4".
+        "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",
+        "pynio", "cfgrib", "pseudonetcdf", "zarr"}.
     chunks : int or dict, optional
         If chunks is provided, it is used to load the new dataset into dask
         arrays. ``chunks={}`` loads the dataset with dask using a single
         chunk for all arrays. When using ``engine="zarr"``, setting
         ``chunks='auto'`` will create dask chunks based on the variable's zarr
         chunks.
-    lock : False or lock-like, optional
-        Resource lock to use when reading data from disk. Only relevant when
-        using dask or another form of parallelism. By default, appropriate
-        locks are chosen to safely read and write files with the currently
-        active dask scheduler.
     cache : bool, optional
-        If True, cache data loaded from the underlying datastore in memory as
+        If True, data loaded from the underlying datastore is cached in memory as
         NumPy arrays when accessed to avoid reading from the underlying data-
         store multiple times. Defaults to True unless you specify the `chunks`
         argument to use dask, in which case it defaults to False. Does not
         change the behavior of coordinates corresponding to dimensions, which
         always load their data from disk into a ``pandas.Index``.
-    drop_variables: str or iterable, optional
-        A variable or list of variables to exclude from being parsed from the
-        dataset. This may be useful to drop variables with problems or
-        inconsistent values.
-    backend_kwargs: dict, optional
-        A dictionary of keyword arguments to pass on to the backend. This
-        may be useful when backend options would improve performance or
-        allow user control of dataset processing.
+    decode_cf : bool, optional
+        Setting ``decode_cf=False`` will disable ``mask_and_scale``,
+        ``decode_times``, ``decode_timedelta``, ``concat_characters`` and
+        ``decode_coords``.
+    mask_and_scale : bool, optional
+        If True, array values equal to `_FillValue` are replaced with NA and other
+        values are scaled according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values, a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. mask_and_scale defaults to True except for the
+        pseudonetcdf backend. This keyword may not be supported by all backends.
+    decode_times : bool, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+        This keyword may not be supported by all backends.
+    decode_timedelta : bool, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, they remain encoded as numbers.
+        If None (default), assume the same value as decode_times.
+        This keyword may not be supported by all backends.
     use_cftime: bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -167,12 +158,38 @@
         ``cftime.datetime`` objects, regardless of whether or not they can be
         represented using ``np.datetime64[ns]`` objects. If False, always
         decode times to ``np.datetime64[ns]`` objects; if this is not possible
-        raise an error.
-    decode_timedelta : bool, optional
-        If True, decode variables and coordinates with time units in
-        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
-        into timedelta objects. If False, leave them encoded as numbers.
-        If None (default), assume the same value of decode_time.
+        raise an error. This keyword may not be supported by all backends.
+    concat_characters : bool, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+        This keyword may not be supported by all backends.
+    decode_coords : bool, optional
+        If True, decode the 'coordinates' attribute to identify coordinates in
+        the resulting dataset. This keyword may not be supported by all
+        backends.
+    drop_variables: str or iterable, optional
+        A variable or list of variables to exclude when parsing the dataset.
+        This may be useful to drop variables with problems or
+        inconsistent values.
+    backend_kwargs: dict, optional
+        Additional keyword arguments passed on to the engine open function.
+    **kwargs: dict
+        Additional keyword arguments passed on to the engine open function.
+        For example:
+
+        - 'group': path to the netCDF4 group in the given file to open, given
+          as a str. Supported by "netcdf4", "h5netcdf", "zarr".
+
+        - 'lock': resource lock to use when reading data from disk. Only
+          relevant when using dask or another form of parallelism. By default,
+          appropriate locks are chosen to safely read and write files with the
+          currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
+          "pynio", "pseudonetcdf", "cfgrib".
+
+        See the open function of each specific engine for the kwargs it accepts.
+
 
     Returns
     -------
@@ -202,12 +219,27 @@
     if engine is None:
         engine = _autodetect_engine(filename_or_obj)
 
+    decoders = resolve_decoders_kwargs(
+        decode_cf,
+        engine=engine,
+        mask_and_scale=mask_and_scale,
+        decode_times=decode_times,
+        decode_timedelta=decode_timedelta,
+        concat_characters=concat_characters,
+        use_cftime=use_cftime,
+        decode_coords=decode_coords,
+    )
+
     backend_kwargs = backend_kwargs.copy()
     overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None)
 
-    open_backend_dataset = _get_backend_cls(engine, engines=ENGINES)
+    open_backend_dataset = _get_backend_cls(engine, engines=plugins.ENGINES)[
+        "open_dataset"
+    ]
 
     backend_ds = open_backend_dataset(
         filename_or_obj,
+        drop_variables=drop_variables,
+        **decoders,
         **backend_kwargs,
         **{k: v for k, v in kwargs.items() if v is not None},
     )
@@ -218,7 +250,10 @@
         chunks,
         cache,
         overwrite_encoded_chunks,
-        {**backend_kwargs, **kwargs},
+        drop_variables=drop_variables,
+        **decoders,
+        **backend_kwargs,
+        **kwargs,
     )
 
     return ds
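``resolve_decoders_kwargs`` is the piece that replaces the per-backend ``decode_cf`` boilerplate: when ``decode_cf=False`` it forces every decoder that the selected engine actually accepts to ``False``, and it drops ``None`` entries so each backend keeps its own defaults. A standalone sketch of that behavior, with a toy registry standing in for ``plugins.ENGINES`` (the "toy" engine and its signature are illustrative, not part of the diff):

    # Toy stand-in for plugins.ENGINES: only the "signature" entry matters here.
    ENGINES = {"toy": {"signature": {"mask_and_scale", "decode_times"}}}

    def resolve_decoders_kwargs(decode_cf, engine, **decoders):
        signature = ENGINES[engine]["signature"]
        if decode_cf is False:
            for d in decoders:
                if d in signature:
                    decoders[d] = False
        # None means "let the backend use its default", so it is not forwarded.
        return {k: v for k, v in decoders.items() if v is not None}

    print(resolve_decoders_kwargs(False, "toy", mask_and_scale=None, decode_times=None))
    # {'mask_and_scale': False, 'decode_times': False}
    print(resolve_decoders_kwargs(None, "toy", mask_and_scale=True, decode_times=None))
    # {'mask_and_scale': True}

Note that ``use_cftime`` is also routed through this helper, so ``decode_cf=False`` disables it as well for engines whose signature includes it.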
diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py
index cfff1b146e3..80b3183f197 100644
--- a/xarray/backends/cfgrib_.py
+++ b/xarray/backends/cfgrib_.py
@@ -76,7 +76,6 @@ def get_encoding(self):
 def open_backend_dataset_cfgrib(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -93,13 +92,6 @@ def open_backend_dataset_cfgrib(
     time_dims=("time", "step"),
 ):
 
-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = CfGribDataStore(
         filename_or_obj,
         indexpath=indexpath,
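This removal is safe because the expansion of ``decode_cf=False`` now happens once in ``apiv2.open_dataset`` before the backend is called; the identical deletion is applied to the h5netcdf and zarr openers below. A self-contained sketch of the equivalence (``old_style``/``new_style`` are illustrative names, not code from the diff):

    # Old: each backend expanded decode_cf itself.
    def old_style(decode_cf=True, mask_and_scale=True, decode_times=True):
        if not decode_cf:
            mask_and_scale = False
            decode_times = False
        return {"mask_and_scale": mask_and_scale, "decode_times": decode_times}

    # New: the caller expands it once and forwards only explicit flags.
    def new_style(decode_cf, **decoders):
        if decode_cf is False:
            decoders = {k: False for k in decoders}
        return {k: v for k, v in decoders.items() if v is not None}

    assert old_style(decode_cf=False) == new_style(
        False, mask_and_scale=None, decode_times=None
    )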
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 5cfa80bafa1..9d013e3e46e 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -328,7 +328,6 @@ def close(self, **kwargs):
 def open_backend_dataset_h5necdf(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -343,13 +342,6 @@ def open_backend_dataset_h5necdf(
     phony_dims=None,
 ):
 
-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = H5NetCDFStore.open(
         filename_or_obj,
         format=format,
diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py
new file mode 100644
index 00000000000..a0b1a825600
--- /dev/null
+++ b/xarray/backends/plugins.py
@@ -0,0 +1,31 @@
+import inspect
+import typing as T
+
+from . import cfgrib_, h5netcdf_, zarr
+
+ENGINES: T.Dict[str, T.Dict[str, T.Any]] = {
+    "h5netcdf": {
+        "open_dataset": h5netcdf_.open_backend_dataset_h5necdf,
+    },
+    "zarr": {
+        "open_dataset": zarr.open_backend_dataset_zarr,
+    },
+    "cfgrib": {
+        "open_dataset": cfgrib_.open_backend_dataset_cfgrib,
+    },
+}
+
+
+for engine in ENGINES.values():
+    if "signature" not in engine:
+        parameters = inspect.signature(engine["open_dataset"]).parameters
+        for name, param in parameters.items():
+            if param.kind in (
+                inspect.Parameter.VAR_KEYWORD,
+                inspect.Parameter.VAR_POSITIONAL,
+            ):
+                raise TypeError(
+                    f'All the parameters in the {engine["open_dataset"]!r} signature should be explicit. '
+                    "*args and **kwargs are not supported"
+                )
+        engine["signature"] = set(parameters)
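The ``signature`` entry this loop records is what ``resolve_decoders_kwargs`` consults over in apiv2.py. A quick, self-contained illustration of the inspection logic (both openers here are hypothetical, not registered backends):

    import inspect

    def open_backend_dataset_demo(filename_or_obj, *, mask_and_scale=True, decode_times=None):
        """Hypothetical backend opener with an explicit signature."""

    print(set(inspect.signature(open_backend_dataset_demo).parameters))
    # e.g. {'filename_or_obj', 'mask_and_scale', 'decode_times'} (set order varies)

    # An opener using *args/**kwargs would be rejected at import time, since a
    # catch-all signature hides which decoder keywords it really accepts:
    def bad_opener(filename_or_obj, **kwargs):
        """Hypothetical opener that would trigger the TypeError above."""

    kinds = {p.kind for p in inspect.signature(bad_opener).parameters.values()}
    assert inspect.Parameter.VAR_KEYWORD in kinds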
' + "*args and **kwargs is not supported" + ) + engine["signature"] = set(parameters) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b54be09e749..8c42f062d12 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -684,7 +684,6 @@ def open_zarr( def open_backend_dataset_zarr( filename_or_obj, - decode_cf=True, mask_and_scale=True, decode_times=None, concat_characters=None, @@ -700,13 +699,6 @@ def open_backend_dataset_zarr( chunk_store=None, ): - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False - store = ZarrStore.open_group( filename_or_obj, group=group,