diff --git a/docs/api.rst b/docs/api.rst
index 60716898..b712cb97 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -19,6 +19,7 @@ Reading
     :toctree: generated/
 
     open_virtual_dataset
+    open_virtual_mfdataset
 
 Serialization
 -------------
diff --git a/pyproject.toml b/pyproject.toml
index ecda9a16..38d622d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,7 @@ test = [
     "ruff",
     "s3fs",
     "scipy",
+    "lithops",
     "virtualizarr[hdf_reader]"
 ]
 
diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py
index bd70f834..9d7ef92e 100644
--- a/virtualizarr/__init__.py
+++ b/virtualizarr/__init__.py
@@ -1,6 +1,6 @@
 from virtualizarr.manifests import ChunkManifest, ManifestArray  # type: ignore # noqa
 from virtualizarr.accessor import VirtualiZarrDatasetAccessor  # type: ignore # noqa
-from virtualizarr.backend import open_virtual_dataset  # noqa: F401
+from virtualizarr.backend import open_virtual_dataset, open_virtual_mfdataset  # noqa: F401
 
 from importlib.metadata import version as _version
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index a8e3b66a..09640b24 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -1,13 +1,21 @@
+import os
 import warnings
 from collections.abc import Iterable, Mapping
 from enum import Enum, auto
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Any,
+    Callable,
+    Literal,
     Optional,
+    Sequence,
+    cast,
 )
 
-from xarray import Dataset, Index
+from xarray import DataArray, Dataset, Index, combine_by_coords
+from xarray.backends.common import _find_absolute_paths
+from xarray.core.combine import _infer_concat_order_from_positions, _nested_combine
 
 from virtualizarr.manifests import ManifestArray
 from virtualizarr.readers import (
@@ -22,6 +30,15 @@
 from virtualizarr.readers.common import VirtualBackend
 from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
 
+if TYPE_CHECKING:
+    from xarray.core.types import (
+        CombineAttrsOptions,
+        CompatOptions,
+        JoinOptions,
+        NestedSequence,
+    )
+
+
 # TODO add entrypoint to allow external libraries to add to this mapping
 VIRTUAL_BACKENDS = {
     "kerchunk": KerchunkVirtualBackend,
@@ -209,3 +226,215 @@ def open_virtual_dataset(
     )
 
     return vds
+
+
+def open_virtual_mfdataset(
+    paths: str
+    | os.PathLike
+    | Sequence[str | os.PathLike]
+    | "NestedSequence[str | os.PathLike]",
+    concat_dim: (
+        str
+        | DataArray
+        | Index
+        | Sequence[str]
+        | Sequence[DataArray]
+        | Sequence[Index]
+        | None
+    ) = None,
+    compat: "CompatOptions" = "no_conflicts",
+    preprocess: Callable[[Dataset], Dataset] | None = None,
+    data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
+    coords="different",
+    combine: Literal["by_coords", "nested"] = "by_coords",
+    parallel: Literal["lithops", "dask", False] = False,
+    join: "JoinOptions" = "outer",
+    attrs_file: str | os.PathLike | None = None,
+    combine_attrs: "CombineAttrsOptions" = "override",
+    **kwargs,
+) -> Dataset:
+    """Open multiple files as a single virtual dataset.
+
+    If combine='by_coords' then the function ``combine_by_coords`` is used to combine
+    the datasets into one before returning the result, and if combine='nested' then
+    ``combine_nested`` is used. The filepaths must be structured according to which
+    combining function is used, the details of which are given in the documentation for
+    ``combine_by_coords`` and ``combine_nested``. By default ``combine='by_coords'``
+    will be used. Global attributes from the ``attrs_file`` are used
+    for the combined dataset.
+
+    Parameters
+    ----------
+    paths
+        Same as in xarray.open_mfdataset
+    concat_dim
+        Same as in xarray.open_mfdataset
+    compat
+        Same as in xarray.open_mfdataset
+    preprocess
+        Same as in xarray.open_mfdataset
+    data_vars
+        Same as in xarray.open_mfdataset
+    coords
+        Same as in xarray.open_mfdataset
+    combine
+        Same as in xarray.open_mfdataset
+    parallel : 'dask', 'lithops', or False
+        Specify whether the open and preprocess steps of this function will be
+        performed in parallel using ``dask.delayed``, in parallel using ``lithops.map``, or in serial.
+        Default is False.
+    join
+        Same as in xarray.open_mfdataset
+    attrs_file
+        Same as in xarray.open_mfdataset
+    combine_attrs
+        Same as in xarray.open_mfdataset
+    **kwargs : optional
+        Additional arguments passed on to :py:func:`virtualizarr.open_virtual_dataset`. For an
+        overview of some of the possible options, see the documentation of
+        :py:func:`virtualizarr.open_virtual_dataset`.
+
+    Returns
+    -------
+    xarray.Dataset
+
+    Notes
+    -----
+    The results of opening each virtual dataset in parallel are sent back to the client process, so they must not be too large.
+    """
+
+    # TODO this is practically all just copied from xarray.open_mfdataset - an argument for writing a virtualizarr engine for xarray?
+
+    # TODO list kwargs passed to open_virtual_dataset explicitly?
+
+    paths = _find_absolute_paths(paths)
+
+    if not paths:
+        raise OSError("no files to open")
+
+    paths1d: list[str]
+    if combine == "nested":
+        if isinstance(concat_dim, str | DataArray) or concat_dim is None:
+            concat_dim = [concat_dim]  # type: ignore[assignment]
+
+        # This creates a flat list which is easier to iterate over, whilst
+        # encoding the originally-supplied structure as "ids".
+        # The "ids" are not used at all if combine='by_coords'.
+        combined_ids_paths = _infer_concat_order_from_positions(paths)
+        ids, paths1d = (
+            list(combined_ids_paths.keys()),
+            list(combined_ids_paths.values()),
+        )
+    elif concat_dim is not None:
+        raise ValueError(
+            "When combine='by_coords', passing a value for `concat_dim` has no "
+            "effect. To manually combine along a specific dimension you should "
+            "instead specify combine='nested' along with a value for `concat_dim`.",
+        )
+    else:
+        paths1d = paths  # type: ignore[assignment]
+
+    if parallel == "dask":
+        import dask
+
+        # wrap open_virtual_dataset, getattr, and preprocess with delayed
+        open_ = dask.delayed(open_virtual_dataset)
+        getattr_ = dask.delayed(getattr)
+        if preprocess is not None:
+            preprocess = dask.delayed(preprocess)
+    elif parallel == "lithops":
+        import lithops
+
+        # TODO use RetryingFunctionExecutor instead?
+        # TODO what's the easiest way to pass the lithops config in?
+        fn_exec = lithops.FunctionExecutor()
+
+        # lithops doesn't have a delayed primitive
+        open_ = open_virtual_dataset
+        # TODO I don't know how best to chain this with the getattr, or if that closing stuff is even necessary for virtual datasets
+        # getattr_ = getattr
+    elif parallel is not False:
+        raise ValueError(
+            f"{parallel} is an invalid option for the keyword argument ``parallel``"
+        )
+    else:
+        open_ = open_virtual_dataset
+        getattr_ = getattr
+
+    if parallel == "dask":
+        virtual_datasets = [open_(p, **kwargs) for p in paths1d]
+        closers = [getattr_(ds, "_close") for ds in virtual_datasets]
+        if preprocess is not None:
+            virtual_datasets = [preprocess(ds) for ds in virtual_datasets]
+
+        # calling compute here will return the datasets/file_objs lists,
+        # the underlying datasets will still be stored as dask arrays
+        virtual_datasets, closers = dask.compute(virtual_datasets, closers)
+    elif parallel == "lithops":
+
+        def generate_refs(path):
+            # allows passing the open_virtual_dataset function to lithops without evaluating it
+            vds = open_(path, **kwargs)
+            # TODO perhaps we should just load the loadable_vars here and close before returning?
+            return vds
+
+        futures = fn_exec.map(generate_refs, paths1d)
+
+        # wait for all the serverless workers to finish, and send their resulting virtual datasets back to the client
+        # TODO do we need download_results?
+        completed_futures, _ = fn_exec.wait(futures, download_results=True)
+        virtual_datasets = completed_futures.get_result()
+    elif parallel is False:
+        virtual_datasets = [open_(p, **kwargs) for p in paths1d]
+        closers = [getattr_(ds, "_close") for ds in virtual_datasets]
+        if preprocess is not None:
+            virtual_datasets = [preprocess(ds) for ds in virtual_datasets]
+
+    # Combine all datasets, closing them in case of a ValueError
+    try:
+        if combine == "nested":
+            # Combined nested list by successive concat and merge operations
+            # along each dimension, using structure given by "ids"
+            combined_vds = _nested_combine(
+                virtual_datasets,
+                concat_dims=concat_dim,
+                compat=compat,
+                data_vars=data_vars,
+                coords=coords,
+                ids=ids,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+        elif combine == "by_coords":
+            # Redo ordering from coordinates, ignoring how they were ordered
+            # previously
+            combined_vds = combine_by_coords(
+                virtual_datasets,
+                compat=compat,
+                data_vars=data_vars,
+                coords=coords,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+        else:
+            raise ValueError(
+                f"{combine} is an invalid option for the keyword argument"
+                " ``combine``"
+            )
+    except ValueError:
+        for vds in virtual_datasets:
+            vds.close()
+        raise
+
+    # combined_vds.set_close(partial(_multi_file_closer, closers))
+
+    # read global attributes from the attrs_file or from the first dataset
+    if attrs_file is not None:
+        if isinstance(attrs_file, os.PathLike):
+            attrs_file = cast(str, os.fspath(attrs_file))
+        combined_vds.attrs = virtual_datasets[paths1d.index(attrs_file)].attrs
+
+    # TODO should we just immediately close everything?
+    # TODO We should have already read everything we're ever going to read into memory at this point
+
+    return combined_vds
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index 658cf640..ecc0ab43 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -35,6 +35,7 @@ def _importorskip(
 has_astropy, requires_astropy = _importorskip("astropy")
 has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
 has_s3fs, requires_s3fs = _importorskip("s3fs")
+has_lithops, requires_lithops = _importorskip("lithops")
 has_scipy, requires_scipy = _importorskip("scipy")
 has_tifffile, requires_tifffile = _importorskip("tifffile")
 has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs")
diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index 932fe7df..2e4b527b 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -1,4 +1,5 @@
 from collections.abc import Mapping
+from pathlib import Path
 from unittest.mock import patch
 
 import numpy as np
@@ -8,7 +9,7 @@
 from xarray import Dataset, open_dataset
 from xarray.core.indexes import Index
 
-from virtualizarr import open_virtual_dataset
+from virtualizarr import open_virtual_dataset, open_virtual_mfdataset
 from virtualizarr.backend import FileType, automatically_determine_filetype
 from virtualizarr.manifests import ManifestArray
 from virtualizarr.readers import HDF5VirtualBackend
@@ -17,6 +18,7 @@
     has_astropy,
     network,
     requires_kerchunk,
+    requires_lithops,
     requires_s3fs,
     requires_scipy,
 )
@@ -446,3 +448,114 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
         vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
         assert vds.scalar.attrs == {"scalar": "true"}
+
+
+# TODO consolidate these by parameterizing over parallel kwarg once they all work
+@requires_kerchunk
+class TestOpenVirtualMFDataset:
+    def test_serial(self, netcdf4_files_factory):
+        filepath1, filepath2 = netcdf4_files_factory()
+
+        # test combine nested without in-memory indexes
+        combined_vds = open_virtual_mfdataset(
+            [filepath1, filepath2],
+            combine="nested",
+            concat_dim="time",
+            coords="minimal",
+            compat="override",
+            indexes={},
+        )
+        vds1 = open_virtual_dataset(filepath1, indexes={})
+        vds2 = open_virtual_dataset(filepath2, indexes={})
+        expected_vds = xr.concat(
+            [vds1, vds2], dim="time", coords="minimal", compat="override"
+        )
+        xrt.assert_identical(combined_vds, expected_vds)
+
+        # test combine by coords using in-memory indexes
+        combined_vds = open_virtual_mfdataset(
+            [filepath1, filepath2], combine="by_coords", loadable_variables=["time"]
+        )
+        vds1 = open_virtual_dataset(filepath1, loadable_variables=["time"])
+        vds2 = open_virtual_dataset(filepath2, loadable_variables=["time"])
+        expected_vds = xr.concat(
+            [vds1, vds2], dim="time", coords="minimal", compat="override"
+        )
+        xrt.assert_identical(combined_vds, expected_vds)
+
+        # test combine by coords again using in-memory indexes but for a glob
+        file_glob = Path(filepath1).parent.glob("air*.nc")
+        combined_vds = open_virtual_mfdataset(
+            file_glob, combine="by_coords", loadable_variables=["time"]
+        )
+        xrt.assert_identical(combined_vds, expected_vds)
+
+    # @requires_dask
+    def test_dask(self, netcdf4_files_factory): ...
+
+    @requires_lithops
+    def test_lithops(self, netcdf4_files_factory):
+        # by default this will use the lithops LocalHost executor
+
+        filepath1, filepath2 = netcdf4_files_factory()
+
+        # test combine nested without in-memory indexes
+        combined_vds = open_virtual_mfdataset(
+            [filepath1, filepath2],
+            combine="nested",
+            concat_dim="time",
+            coords="minimal",
+            compat="override",
+            indexes={},
+            parallel="lithops",
+        )
+        vds1 = open_virtual_dataset(filepath1, indexes={})
+        vds2 = open_virtual_dataset(filepath2, indexes={})
+        expected_vds = xr.concat(
+            [vds1, vds2], dim="time", coords="minimal", compat="override"
+        )
+
+        print(combined_vds)
+        print(expected_vds)
+        print(combined_vds.indexes)
+        print(expected_vds.indexes)
+        print(combined_vds["lat"].attrs)
+        print(expected_vds["lat"].attrs)
+        print(combined_vds["lat"].encoding)
+        print(expected_vds["lat"].encoding)
+        print(combined_vds["lat"].data)
+        print(expected_vds["lat"].data)
+        print(combined_vds["lat"].data.zarray)
+        print(expected_vds["lat"].data.zarray)
+        print(combined_vds["lat"].data.manifest.dict())
+        print(expected_vds["lat"].data.manifest.dict())
+
+        # TODO this assertion unintentionally triggers loading, see issue #354
+        # xrt.assert_identical(combined_vds.coords.variables['lat'], expected_vds.coords.variables['lat'])
+
+        # TODO I have no idea why this assertion fails for all the coords - everything about the coords looks identical
+        # xrt.assert_identical(combined_vds, expected_vds)
+
+        # test combine by coords using in-memory indexes
+        combined_vds = open_virtual_mfdataset(
+            [filepath1, filepath2],
+            combine="by_coords",
+            loadable_variables=["time"],
+            parallel="lithops",
+        )
+        vds1 = open_virtual_dataset(filepath1, loadable_variables=["time"])
+        vds2 = open_virtual_dataset(filepath2, loadable_variables=["time"])
+        expected_vds = xr.concat(
+            [vds1, vds2], dim="time", coords="minimal", compat="override"
+        )
+        xrt.assert_identical(combined_vds, expected_vds)
+
+        # test combine by coords again using in-memory indexes but for a glob
+        file_glob = Path(filepath1).parent.glob("air*.nc")
+        combined_vds = open_virtual_mfdataset(
+            file_glob,
+            combine="by_coords",
+            loadable_variables=["time"],
+            parallel="lithops",
+        )
+        xrt.assert_identical(combined_vds, expected_vds)
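
For reference, below is a minimal usage sketch of the API added by this patch. It is not part of the diff itself; the glob pattern, file names, and output path are hypothetical, and ``loadable_variables`` is simply one of the keyword arguments forwarded to ``open_virtual_dataset`` via ``**kwargs``, as the docstring above describes. The lithops call assumes whatever executor is configured locally (the localhost executor by default).

    import glob

    from virtualizarr import open_virtual_mfdataset

    # hypothetical collection of NetCDF files sharing a "time" dimension
    paths = sorted(glob.glob("data/air_temperature_*.nc"))

    # open each file serially and combine the virtual datasets by their coordinates
    vds = open_virtual_mfdataset(
        paths,
        combine="by_coords",
        loadable_variables=["time"],  # forwarded to open_virtual_dataset
    )

    # the same call can fan out over serverless workers; each worker opens one
    # file and sends its virtual dataset back to the client for combining
    vds_parallel = open_virtual_mfdataset(
        paths,
        combine="by_coords",
        loadable_variables=["time"],
        parallel="lithops",
    )

    # the combined virtual dataset can then be serialized as usual, e.g. to
    # kerchunk references via the existing accessor (hypothetical output path)
    vds.virtualize.to_kerchunk("combined.json", format="json")

As the Notes section of the docstring warns, every per-file virtual dataset is returned to the client process, so this pattern suits many files with modest reference metadata rather than workloads that would ship large in-memory payloads back from the workers.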