diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e0adb53150..02554cc7a45 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,9 +14,9 @@ repos: - id: absolufy-imports name: absolufy-imports files: ^xarray/ - - repo: https://github.com/charliermarsh/ruff-pre-commit + - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.0.275' + rev: 'v0.0.277' hooks: - id: ruff args: ["--fix"] @@ -47,7 +47,7 @@ repos: types-pkg_resources, types-PyYAML, types-pytz, - typing-extensions==3.10.0.0, + typing-extensions>=4.1.0, numpy, ] - repo: https://github.com/citation-file-format/cff-converter-python diff --git a/.readthedocs.yaml b/.readthedocs.yaml index db2e1cd0b9a..55fea717f71 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,7 @@ build: jobs: post_checkout: - (git --no-pager log --pretty="tformat:%s" -1 | grep -vqF "[skip-rtd]") || exit 183 + - git fetch --unshallow || true pre_install: - git update-index --assume-unchanged doc/conf.py ci/requirements/doc.yml diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 39e04d04d47..41507fce13e 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -23,7 +23,7 @@ conda uninstall -y --force \ xarray # to limit the runtime of Upstream CI python -m pip install \ - -i https://pypi.anaconda.org/scipy-wheels-nightly/simple \ + -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ --no-deps \ --pre \ --upgrade \ diff --git a/doc/conf.py b/doc/conf.py index f201af859b9..6c6efb47f6b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -323,6 +323,7 @@ "dask": ("https://docs.dask.org/en/latest", None), "cftime": ("https://unidata.github.io/cftime", None), "sparse": ("https://sparse.pydata.org/en/latest/", None), + "cubed": ("https://tom-e-white.com/cubed/", None), } diff --git a/doc/examples/multidimensional-coords.ipynb b/doc/examples/multidimensional-coords.ipynb index f7471f05e5d..ce8a091a5da 100644 --- a/doc/examples/multidimensional-coords.ipynb +++ b/doc/examples/multidimensional-coords.ipynb @@ -56,7 +56,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this example, the _logical coordinates_ are `x` and `y`, while the _physical coordinates_ are `xc` and `yc`, which represent the latitudes and longitude of the data." + "In this example, the _logical coordinates_ are `x` and `y`, while the _physical coordinates_ are `xc` and `yc`, which represent the longitudes and latitudes of the data." ] }, { diff --git a/doc/internals/chunked-arrays.rst b/doc/internals/chunked-arrays.rst new file mode 100644 index 00000000000..7192c3f0bc5 --- /dev/null +++ b/doc/internals/chunked-arrays.rst @@ -0,0 +1,102 @@ +.. currentmodule:: xarray + +.. _internals.chunkedarrays: + +Alternative chunked array types +=============================== + +.. warning:: + + This is a *highly* experimental feature. Please report any bugs or other difficulties on `xarray's issue tracker `_. + In particular see discussion on `xarray issue #6807 `_ + +Xarray can wrap chunked dask arrays (see :ref:`dask`), but can also wrap any other chunked array type that exposes the correct interface. +This allows us to support using other frameworks for distributed and out-of-core processing, with user code still written as xarray commands. +In particular xarray also supports wrapping :py:class:`cubed.Array` objects +(see `Cubed's documentation `_ and the `cubed-xarray package `_). 
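+
+For example, with the ``cubed-xarray`` package installed, a dataset can be opened with its arrays
+backed by Cubed rather than Dask (a minimal sketch with a hypothetical file name; the
+``chunked_array_type`` and ``from_array_kwargs`` keyword arguments are described in the user
+interface section below)::
+
+    import xarray as xr
+
+    # assumes "cubed" has been registered as a chunkmanager entrypoint (done by cubed-xarray)
+    ds = xr.open_dataset("example-data.nc", chunks={}, chunked_array_type="cubed")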
+ +The basic idea is that by wrapping an array that has an explicit notion of ``.chunks``, xarray can expose control over +the choice of chunking scheme to users via methods like :py:meth:`DataArray.chunk` whilst the wrapped array actually +implements the handling of processing all of the chunks. + +Chunked array methods and "core operations" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A chunked array needs to meet all the :ref:`requirements for normal duck arrays `, but must also +implement additional features. + +Chunked arrays have additional attributes and methods, such as ``.chunks`` and ``.rechunk``. +Furthermore, Xarray dispatches chunk-aware computations across one or more chunked arrays using special functions known +as "core operations". Examples include ``map_blocks``, ``blockwise``, and ``apply_gufunc``. + +The core operations are generalizations of functions first implemented in :py:mod:`dask.array`. +The implementation of these functions is specific to the type of arrays passed to them. For example, when applying the +``map_blocks`` core operation, :py:class:`dask.array.Array` objects must be processed by :py:func:`dask.array.map_blocks`, +whereas :py:class:`cubed.Array` objects must be processed by :py:func:`cubed.map_blocks`. + +In order to use the correct implementation of a core operation for the array type encountered, xarray dispatches to the +corresponding subclass of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`, +also known as a "Chunk Manager". Therefore **a full list of the operations that need to be defined is set by the +API of the** :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` **abstract base class**. Note that chunked array +methods are also currently dispatched using this class. + +Chunked array creation is also handled by this class. As chunked array objects have a one-to-one correspondence with +in-memory numpy arrays, it should be possible to create a chunked array from a numpy array by passing the desired +chunking pattern to an implementation of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint.from_array``. + +.. note:: + + The :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` abstract base class is mostly just acting as a + namespace for containing the chunked-aware function primitives. Ideally in the future we would have an API standard + for chunked array types which codified this structure, making the entrypoint system unnecessary. + +.. currentmodule:: xarray.core.parallelcompat + +.. autoclass:: xarray.core.parallelcompat.ChunkManagerEntrypoint + :members: + +Registering a new ChunkManagerEntrypoint subclass +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Rather than hard-coding various chunk managers to deal with specific chunked array implementations, xarray uses an +entrypoint system to allow developers of new chunked array implementations to register their corresponding subclass of +:py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`. + + +To register a new entrypoint you need to add an entry to the ``setup.cfg`` like this:: + + [options.entry_points] + xarray.chunkmanagers = + dask = xarray.core.daskmanager:DaskManager + +See also `cubed-xarray `_ for another example. + +To check that the entrypoint has worked correctly, you may find it useful to display the available chunkmanagers using +the internal function :py:func:`~xarray.core.parallelcompat.list_chunkmanagers`. + +.. 
autofunction:: list_chunkmanagers + + +User interface +~~~~~~~~~~~~~~ + +Once the chunkmanager subclass has been registered, xarray objects wrapping the desired array type can be created in 3 ways: + +#. By manually passing the array type to the :py:class:`~xarray.DataArray` constructor, see the examples for :ref:`numpy-like arrays `, + +#. Calling :py:meth:`~xarray.DataArray.chunk`, passing the keyword arguments ``chunked_array_type`` and ``from_array_kwargs``, + +#. Calling :py:func:`~xarray.open_dataset`, passing the keyword arguments ``chunked_array_type`` and ``from_array_kwargs``. + +The latter two methods ultimately call the chunkmanager's implementation of ``.from_array``, to which they pass the ``from_array_kwargs`` dict. +The ``chunked_array_type`` kwarg selects which registered chunkmanager subclass to dispatch to. It defaults to ``'dask'`` +if Dask is installed, otherwise it defaults to whichever chunkmanager is registered if only one is registered. +If multiple chunkmanagers are registered it will raise an error by default. + +Parallel processing without chunks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To use a parallel array type that does not expose a concept of chunks explicitly, none of the information on this page +is theoretically required. Such an array type (e.g. `Ramba `_ or +`Arkouda `_) could be wrapped using xarray's existing support for +:ref:`numpy-like "duck" arrays `. diff --git a/doc/internals/duck-arrays-integration.rst b/doc/internals/duck-arrays-integration.rst index 3b6313dbf2f..1f1f57974df 100644 --- a/doc/internals/duck-arrays-integration.rst +++ b/doc/internals/duck-arrays-integration.rst @@ -11,6 +11,8 @@ Integrating with duck arrays Xarray can wrap custom numpy-like arrays (":term:`duck array`\s") - see the :ref:`user guide documentation `. This page is intended for developers who are interested in wrapping a new custom array type with xarray. +.. _internals.duckarrays.requirements: + Duck array requirements ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/internals/index.rst b/doc/internals/index.rst index 132f6c40ede..666f8fd2343 100644 --- a/doc/internals/index.rst +++ b/doc/internals/index.rst @@ -21,6 +21,7 @@ The pages in this section are intended for: variable-objects duck-arrays-integration + chunked-arrays extending-xarray zarr-encoding-spec how-to-add-new-backend diff --git a/doc/user-guide/duckarrays.rst b/doc/user-guide/duckarrays.rst index dc1d2d1cb8a..f0650ac61b5 100644 --- a/doc/user-guide/duckarrays.rst +++ b/doc/user-guide/duckarrays.rst @@ -27,7 +27,7 @@ Some numpy-like array types that xarray already has some support for: For information on wrapping dask arrays see :ref:`dask`. Whilst xarray wraps dask arrays in a similar way to that described on this page, chunked array types like :py:class:`dask.array.Array` implement additional methods that require - slightly different user code (e.g. calling ``.chunk`` or ``.compute``). + slightly different user code (e.g. calling ``.chunk`` or ``.compute``). See the docs on :ref:`wrapping chunked arrays `. Why "duck"? ----------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ce2c0a698ac..3740a5a44f1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,9 +14,9 @@ What's New np.random.seed(123456) -.. _whats-new.2023.06.1: +.. _whats-new.2023.07.1: -v2023.06.1 (unreleased) +v2023.07.1 (unreleased) ----------------------- New Features @@ -29,17 +29,45 @@ Breaking changes Deprecations ~~~~~~~~~~~~ +- `hue_style` is being deprecated for scatter plots. (:issue:`7907`, :pull:`7925`). 
+ By `Jimmy Westling `_. + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + +v2023.07.0 (July 11, 2023) +-------------------------- +This release brings improvements to the documentation on wrapping numpy-like arrays, improved docstrings, and bug fixes. Bug fixes ~~~~~~~~~ +- Ensure no forward slashes in variable and dimension names for HDF5-based engines. + (:issue:`7943`, :pull:`7953`) By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ +- Added examples to docstrings of :py:meth:`Dataset.tail`, :py:meth:`Dataset.head`, :py:meth:`Dataset.dropna`, + :py:meth:`Dataset.ffill`, :py:meth:`Dataset.bfill`, :py:meth:`Dataset.set_coords`, :py:meth:`Dataset.reset_coords` + (:issue:`6793`, :pull:`7936`) By `Harshitha `_ . +- Added page on wrapping chunked numpy-like arrays as alternatives to dask arrays. + (:pull:`7951`) By `Tom Nicholas `_. - Expanded the page on wrapping numpy-like "duck" arrays. (:pull:`7911`) By `Tom Nicholas `_. +- Added examples to docstrings of :py:meth:`Dataset.isel`, :py:meth:`Dataset.reduce`, :py:meth:`Dataset.argmin`, + :py:meth:`Dataset.argmax` (:issue:`6793`, :pull:`7881`) + By `Harshitha `_ . Internal Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0157e0d9d66..d992d3999a3 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -3,7 +3,6 @@ import os from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence from functools import partial -from glob import glob from io import BytesIO from numbers import Number from typing import ( @@ -21,7 +20,12 @@ from xarray import backends, conventions from xarray.backends import plugins -from xarray.backends.common import AbstractDataStore, ArrayWriter, _normalize_path +from xarray.backends.common import ( + AbstractDataStore, + ArrayWriter, + _find_absolute_paths, + _normalize_path, +) from xarray.backends.locks import _get_scheduler from xarray.core import indexing from xarray.core.combine import ( @@ -967,37 +971,7 @@ def open_mfdataset( .. [1] https://docs.xarray.dev/en/stable/dask.html .. [2] https://docs.xarray.dev/en/stable/dask.html#chunking-and-performance """ - if isinstance(paths, str): - if is_remote_uri(paths) and engine == "zarr": - try: - from fsspec.core import get_fs_token_paths - except ImportError as e: - raise ImportError( - "The use of remote URLs for opening zarr requires the package fsspec" - ) from e - - fs, _, _ = get_fs_token_paths( - paths, - mode="rb", - storage_options=kwargs.get("backend_kwargs", {}).get( - "storage_options", {} - ), - expand=False, - ) - tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories - paths = [fs.get_mapper(path) for path in tmp_paths] - elif is_remote_uri(paths): - raise ValueError( - "cannot do wild-card matching for paths that are remote URLs " - f"unless engine='zarr' is specified. Got paths: {paths}. " - "Instead, supply paths as an explicit list of strings." 
- ) - else: - paths = sorted(glob(_normalize_path(paths))) - elif isinstance(paths, os.PathLike): - paths = [os.fspath(paths)] - else: - paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + paths = _find_absolute_paths(paths, engine=engine, **kwargs) if not paths: raise OSError("no files to open") diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 50ac606a83e..1ac988c6b4f 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -5,6 +5,7 @@ import time import traceback from collections.abc import Iterable +from glob import glob from typing import TYPE_CHECKING, Any, ClassVar import numpy as np @@ -19,6 +20,7 @@ from io import BufferedIOBase from xarray.core.dataset import Dataset + from xarray.core.types import NestedSequence # Create a logger object, but don't add any handlers. Leave that to user code. logger = logging.getLogger(__name__) @@ -28,6 +30,24 @@ def _normalize_path(path): + """ + Normalize pathlikes to string. + + Parameters + ---------- + path : + Path to file. + + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths_path = Path(directory).joinpath("comm*n.py") + >>> paths_str = xr.backends.common._normalize_path(paths_path) + >>> print([type(p) for p in (paths_str,)]) + [] + """ if isinstance(path, os.PathLike): path = os.fspath(path) @@ -37,6 +57,64 @@ def _normalize_path(path): return path +def _find_absolute_paths( + paths: str | os.PathLike | NestedSequence[str | os.PathLike], **kwargs +) -> list[str]: + """ + Find absolute paths from the pattern. + + Parameters + ---------- + paths : + Path(s) to file(s). Can include wildcards like * . + **kwargs : + Extra kwargs. Mainly for fsspec. + + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths = str(Path(directory).joinpath("comm*n.py")) # Find common with wildcard + >>> paths = xr.backends.common._find_absolute_paths(paths) + >>> [Path(p).name for p in paths] + ['common.py'] + """ + if isinstance(paths, str): + if is_remote_uri(paths) and kwargs.get("engine", None) == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in tmp_paths] + elif is_remote_uri(paths): + raise ValueError( + "cannot do wild-card matching for paths that are remote URLs " + f"unless engine='zarr' is specified. Got paths: {paths}. " + "Instead, supply paths as an explicit list of strings." 
+ ) + else: + paths = sorted(glob(_normalize_path(paths))) + elif isinstance(paths, os.PathLike): + paths = [os.fspath(paths)] + else: + paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + + return paths + + def _encode_variable_name(name): if name is None: name = NONE_VAR_NAME diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 7389f6a2862..697ebb8ab92 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -20,6 +20,7 @@ from xarray.backends.netCDF4_ import ( BaseNetCDF4Array, _encode_nc4_variable, + _ensure_no_forward_slash_in_name, _extract_nc4_variable_encoding, _get_datatype, _nc4_require_group, @@ -256,6 +257,7 @@ def get_encoding(self): } def set_dimension(self, name, length, is_unlimited=False): + _ensure_no_forward_slash_in_name(name) if is_unlimited: self.ds.dimensions[name] = None self.ds.resize_dimension(name, length) @@ -273,6 +275,7 @@ def prepare_variable( ): import h5py + _ensure_no_forward_slash_in_name(name) attrs = variable.attrs.copy() dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 8a5d48c8c1e..b5c3413e7f8 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -194,6 +194,15 @@ def _nc4_require_group(ds, group, mode, create_group=_netcdf4_create_group): return ds +def _ensure_no_forward_slash_in_name(name): + if "/" in name: + raise ValueError( + f"Forward slashes '/' are not allowed in variable and dimension names (got {name!r}). " + "Forward slashes are used as hierarchy-separators for " + "HDF5-based files ('netcdf4'/'h5netcdf')." + ) + + def _ensure_fill_value_valid(data, attributes): # work around for netCDF4/scipy issue where _FillValue has the wrong type: # https://github.com/Unidata/netcdf4-python/issues/271 @@ -447,6 +456,7 @@ def get_encoding(self): } def set_dimension(self, name, length, is_unlimited=False): + _ensure_no_forward_slash_in_name(name) dim_length = length if not is_unlimited else None self.ds.createDimension(name, size=dim_length) @@ -470,6 +480,8 @@ def encode_variable(self, variable): def prepare_variable( self, name, variable, check_encoding=False, unlimited_dims=None ): + _ensure_no_forward_slash_in_name(name) + datatype = _get_datatype( variable, self.format, raise_on_invalid_encoding=check_encoding ) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5ff1cf5866e..214581bba27 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -812,7 +812,7 @@ def open_zarr( possible, otherwise defaulting to 2. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict, optional Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f4ba9d4f9fe..7bd92ea32a0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1742,6 +1742,33 @@ def set_coords(self: T_Dataset, names: Hashable | Iterable[Hashable]) -> T_Datas names : hashable or iterable of hashable Name(s) of variables in this dataset to convert into coordinates. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... 
"pressure": ("time", [1.013, 1.2, 3.5]), + ... "time": pd.date_range("2023-01-01", periods=3), + ... } + ... ) + >>> dataset + + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + pressure (time) float64 1.013 1.2 3.5 + + >>> dataset.set_coords("pressure") + + Dimensions: (time: 3) + Coordinates: + pressure (time) float64 1.013 1.2 3.5 + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + *empty* + + On calling ``set_coords`` , these data variables are converted to coordinates, as shown in the final dataset. + Returns ------- Dataset @@ -1780,9 +1807,66 @@ def reset_coords( If True, remove coordinates instead of converting them into variables. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "lat", "lon"], + ... [[[25, 26], [27, 28]], [[29, 30], [31, 32]]], + ... ), + ... "precipitation": ( + ... ["time", "lat", "lon"], + ... [[[0.5, 0.8], [0.2, 0.4]], [[0.3, 0.6], [0.7, 0.9]]], + ... ), + ... }, + ... coords={ + ... "time": pd.date_range(start="2023-01-01", periods=2), + ... "lat": [40, 41], + ... "lon": [-80, -79], + ... "altitude": 1000, + ... }, + ... ) + + # Dataset before resetting coordinates + + >>> dataset + + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + altitude int64 1000 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + + # Reset the 'altitude' coordinate + + >>> dataset_reset = dataset.reset_coords("altitude") + + # Dataset after resetting coordinates + + >>> dataset_reset + + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + altitude int64 1000 + Returns ------- Dataset + + See Also + -------- + Dataset.set_coords """ if names is None: names = self._coord_names - set(self._indexes) @@ -2497,6 +2581,63 @@ def isel( in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # A specific element from the dataset is selected + + >>> dataset.isel(student=1, test=0) + + Dimensions: () + Coordinates: + student >> slice_of_data = dataset.isel(student=slice(0, 2), test=slice(0, 2)) + >>> slice_of_data + + Dimensions: (student: 2, test: 2) + Coordinates: + * student (student) >> index_array = xr.DataArray([0, 2], dims="student") + >>> indexed_data = dataset.isel(student=index_array) + >>> indexed_data + + Dimensions: (student: 2, test: 3) + Coordinates: + * student (student) >> dates = pd.date_range(start="2023-01-01", periods=5) + >>> pageviews = [1200, 1500, 900, 1800, 2000] + >>> visitors = [800, 1000, 600, 1200, 1500] + >>> dataset = xr.Dataset( + ... { + ... 
"pageviews": (("date"), pageviews), + ... "visitors": (("date"), visitors), + ... }, + ... coords={"date": dates}, + ... ) + >>> busiest_days = dataset.sortby("pageviews", ascending=False) + >>> busiest_days.head() + + Dimensions: (date: 5) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 ... 2023-01-03 + Data variables: + pageviews (date) int64 2000 1800 1500 1200 900 + visitors (date) int64 1500 1200 1000 800 600 + + # Retrieve the 3 most busiest days in terms of pageviews + + >>> busiest_days.head(3) + + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + + # Using a dictionary to specify the number of elements for specific dimensions + + >>> busiest_days.head({"date": 3}) + + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + See Also -------- Dataset.tail @@ -2731,6 +2916,48 @@ def tail( The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided. + Examples + -------- + >>> activity_names = ["Walking", "Running", "Cycling", "Swimming", "Yoga"] + >>> durations = [30, 45, 60, 45, 60] # in minutes + >>> energies = [150, 300, 250, 400, 100] # in calories + >>> dataset = xr.Dataset( + ... { + ... "duration": (["activity"], durations), + ... "energy_expenditure": (["activity"], energies), + ... }, + ... coords={"activity": activity_names}, + ... ) + >>> sorted_dataset = dataset.sortby("energy_expenditure", ascending=False) + >>> sorted_dataset + + Dimensions: (activity: 5) + Coordinates: + * activity (activity) >> sorted_dataset.tail(3) + + Dimensions: (activity: 3) + Coordinates: + * activity (activity) >> sorted_dataset.tail({"activity": 3}) + + Dimensions: (activity: 3) + Coordinates: + * activity (activity) >> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "location"], + ... [[23.4, 24.1], [np.nan, 22.1], [21.8, 24.2], [20.5, 25.3]], + ... ) + ... }, + ... coords={"time": [1, 2, 3, 4], "location": ["A", "B"]}, + ... ) + >>> dataset + + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) >> dataset.dropna(dim="time") + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) >> dataset.dropna(dim="time", how="any") + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) >> dataset.dropna(dim="time", how="all") + + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) >> dataset.dropna(dim="time", thresh=2) + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) T_Dataset Parameters ---------- dim : Hashable - Specifies the dimension along which to propagate values when - filling. + Specifies the dimension along which to propagate values when filling. limit : int or None, optional The maximum number of consecutive NaN values to forward fill. In other words, if there is a gap with more than this number of @@ -5829,9 +6119,48 @@ def ffill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). 
+ Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # Perform forward fill (ffill) on the dataset + + >>> dataset.ffill(dim="time") + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 1.0 5.0 5.0 5.0 8.0 8.0 10.0 + + # Limit the forward filling to a maximum of 2 consecutive NaN values + + >>> dataset.ffill(dim="time", limit=2) + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 nan 5.0 5.0 5.0 8.0 8.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.bfill """ from xarray.core.missing import _apply_over_vars_with_dim, ffill @@ -5855,9 +6184,48 @@ def bfill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # filled dataset, fills NaN values by propagating values backward + + >>> dataset.bfill(dim="time") + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 5.0 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + + # Limit the backward filling to a maximum of 2 consecutive NaN values + + >>> dataset.bfill(dim="time", limit=2) + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.ffill """ from xarray.core.missing import _apply_over_vars_with_dim, bfill @@ -5922,6 +6290,38 @@ def reduce( reduced : Dataset Dataset with this object's DataArrays replaced with new DataArrays of summarized data and the indicated dimension(s) removed. + + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... 
) + + # Calculate the 75th percentile of math scores for each student using np.percentile + + >>> percentile_scores = dataset.reduce(np.percentile, q=75, dim="test") + >>> percentile_scores + + Dimensions: (student: 3) + Coordinates: + * student (student) T_Dataset: ------- result : Dataset + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 79], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [39, 96, 78]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # Indices of the minimum values along the 'student' dimension are calculated + + >>> argmin_indices = dataset.argmin(dim="student") + + >>> min_score_in_math = dataset["student"].isel( + ... student=argmin_indices["math_scores"] + ... ) + >>> min_score_in_math + + array(['Bob', 'Bob', 'Alice'], dtype='>> min_score_in_english = dataset["student"].isel( + ... student=argmin_indices["english_scores"] + ... ) + >>> min_score_in_english + + array(['Charlie', 'Bob', 'Charlie'], dtype=' T_Dataset: ------- result : Dataset + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # Indices of the maximum values along the 'student' dimension are calculated + + >>> argmax_indices = dataset.argmax(dim="test") + + >>> argmax_indices + + Dimensions: (student: 3) + Coordinates: + * student (student) dict[str, ChunkManagerEntrypoint]: """ - Return a dictionary of available chunk managers and their ChunkManagerEntrypoint objects. + Return a dictionary of available chunk managers and their ChunkManagerEntrypoint subclass objects. + + Returns + ------- + chunkmanagers : dict + Dictionary whose values are registered ChunkManagerEntrypoint subclass instances, and whose values + are the strings under which they are registered. Notes ----- @@ -143,7 +149,13 @@ def get_chunked_array_type(*args) -> ChunkManagerEntrypoint: class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): """ - Adapter between a particular parallel computing framework and xarray. + Interface between a particular parallel computing framework and xarray. + + This abstract base class must be subclassed by libraries implementing chunked array types, and + registered via the ``chunkmanagers`` entrypoint. + + Abstract methods on this class must be implemented, whereas non-abstract methods are only required in order to + enable a subset of xarray functionality, and by default will raise a ``NotImplementedError`` if called. Attributes ---------- @@ -151,7 +163,7 @@ class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): Type of the array class this parallel computing framework provides. Parallel frameworks need to provide an array class that supports the array API standard. - Used for type checking. + This attribute is used for array instance type checking at runtime. 
""" array_cls: type[T_ChunkedArray] @@ -159,13 +171,51 @@ class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): @abstractmethod def __init__(self) -> None: + """Used to set the array_cls attribute at import time.""" raise NotImplementedError() def is_chunked_array(self, data: Any) -> bool: + """ + Check if the given object is an instance of this type of chunked array. + + Compares against the type stored in the array_cls attribute by default. + + Parameters + ---------- + data : Any + + Returns + ------- + is_chunked : bool + + See Also + -------- + dask.is_dask_collection + """ return isinstance(data, self.array_cls) @abstractmethod def chunks(self, data: T_ChunkedArray) -> T_NormalizedChunks: + """ + Return the current chunks of the given array. + + Returns chunks explicitly as a tuple of tuple of ints. + + Used internally by xarray objects' .chunks and .chunksizes properties. + + Parameters + ---------- + data : chunked array + + Returns + ------- + chunks : tuple[tuple[int, ...], ...] + + See Also + -------- + dask.array.Array.chunks + cubed.Array.chunks + """ raise NotImplementedError() @abstractmethod @@ -177,14 +227,58 @@ def normalize_chunks( dtype: np.dtype | None = None, previous_chunks: T_NormalizedChunks | None = None, ) -> T_NormalizedChunks: - """Called by open_dataset""" + """ + Normalize given chunking pattern into an explicit tuple of tuples representation. + + Exposed primarily because different chunking backends may want to make different decisions about how to + automatically chunk along dimensions not given explicitly in the input chunks. + + Called internally by xarray.open_dataset. + + Parameters + ---------- + chunks : tuple, int, dict, or string + The chunks to be normalized. + shape : Tuple[int] + The shape of the array + limit : int (optional) + The maximum block size to target in bytes, + if freedom is given to choose + dtype : np.dtype + previous_chunks : Tuple[Tuple[int]], optional + Chunks from a previous array that we should use for inspiration when + rechunking dimensions automatically. + + See Also + -------- + dask.array.core.normalize_chunks + """ raise NotImplementedError() @abstractmethod def from_array( self, data: np.ndarray, chunks: T_Chunks, **kwargs ) -> T_ChunkedArray: - """Called when .chunk is called on an xarray object that is not already chunked.""" + """ + Create a chunked array from a non-chunked numpy-like array. + + Generally input should have a ``.shape``, ``.ndim``, ``.dtype`` and support numpy-style slicing. + + Called when the .chunk method is called on an xarray object that is not already chunked. + Also called within open_dataset (when chunks is not None) to create a chunked array from + an xarray lazily indexed array. + + Parameters + ---------- + data : array_like + chunks : int, tuple + How to chunk the array. + + See Also + -------- + dask.array.from_array + cubed.from_array + """ raise NotImplementedError() def rechunk( @@ -193,17 +287,71 @@ def rechunk( chunks: T_NormalizedChunks | tuple[int, ...] | T_Chunks, **kwargs, ) -> T_ChunkedArray: - """Called when .chunk is called on an xarray object that is already chunked.""" + """ + Changes the chunking pattern of the given array. + + Called when the .chunk method is called on an xarray object that is already chunked. + + Parameters + ---------- + data : dask array + Array to be rechunked. + chunks : int, tuple, dict or str, optional + The new block dimensions to create. -1 indicates the full size of the + corresponding dimension. 
Default is "auto" which automatically + determines chunk sizes. + + Returns + ------- + chunked array + + See Also + -------- + dask.array.Array.rechunk + cubed.Array.rechunk + """ return data.rechunk(chunks, **kwargs) # type: ignore[attr-defined] @abstractmethod - def compute(self, *data: T_ChunkedArray, **kwargs) -> tuple[np.ndarray, ...]: - """Used anytime something needs to computed, including multiple arrays at once.""" + def compute(self, *data: T_ChunkedArray | Any, **kwargs) -> tuple[np.ndarray, ...]: + """ + Computes one or more chunked arrays, returning them as eager numpy arrays. + + Called anytime something needs to computed, including multiple arrays at once. + Used by `.compute`, `.persist`, `.values`. + + Parameters + ---------- + *data : object + Any number of objects. If an object is an instance of the chunked array type, it is computed + and the in-memory result returned as a numpy array. All other types should be passed through unchanged. + + Returns + ------- + objs + The input, but with all chunked arrays now computed. + + See Also + -------- + dask.compute + cubed.compute + """ raise NotImplementedError() @property def array_api(self) -> Any: - """Return the array_api namespace following the python array API standard.""" + """ + Return the array_api namespace following the python array API standard. + + See https://data-apis.org/array-api/latest/ . Currently used to access the array API function + ``full_like``, which is called within the xarray constructors ``xarray.full_like``, ``xarray.ones_like``, + ``xarray.zeros_like``, etc. + + See Also + -------- + dask.array + cubed.array_api + """ raise NotImplementedError() def reduction( @@ -216,7 +364,43 @@ def reduction( dtype: np.dtype | None = None, keepdims: bool = False, ) -> T_ChunkedArray: - """Used in some reductions like nanfirst, which is used by groupby.first""" + """ + A general version of array reductions along one or more axes. + + Used inside some reductions like nanfirst, which is used by ``groupby.first``. + + Parameters + ---------- + arr : chunked array + Data to be reduced along one or more axes. + func : Callable(x_chunk, axis, keepdims) + First function to be executed when resolving the dask graph. + This function is applied in parallel to all original chunks of x. + See below for function parameters. + combine_func : Callable(x_chunk, axis, keepdims), optional + Function used for intermediate recursive aggregation (see + split_every below). If omitted, it defaults to aggregate_func. + aggregate_func : Callable(x_chunk, axis, keepdims) + Last function to be executed, producing the final output. It is always invoked, even when the reduced + Array counts a single chunk along the reduced axes. + axis : int or sequence of ints, optional + Axis or axes to aggregate upon. If omitted, aggregate along all axes. + dtype : np.dtype + data type of output. This argument was previously optional, but + leaving as ``None`` will now raise an exception. + keepdims : boolean, optional + Whether the reduction function should preserve the reduced axes, + leaving them at size ``output_size``, or remove them. + + Returns + ------- + chunked array + + See Also + -------- + dask.array.reduction + cubed.core.reduction + """ raise NotImplementedError() @abstractmethod @@ -232,7 +416,77 @@ def apply_gufunc( **kwargs, ): """ - Called inside xarray.apply_ufunc, so must be supplied for vast majority of xarray computations to be supported. + Apply a generalized ufunc or similar python function to arrays. 
+ + ``signature`` determines if the function consumes or produces core + dimensions. The remaining dimensions in given input arrays (``*args``) + are considered loop dimensions and are required to broadcast + naturally against each other. + + In other terms, this function is like ``np.vectorize``, but for + the blocks of chunked arrays. If the function itself shall also + be vectorized use ``vectorize=True`` for convenience. + + Called inside ``xarray.apply_ufunc``, which is called internally for most xarray operations. + Therefore this method must be implemented for the vast majority of xarray computations to be supported. + + Parameters + ---------- + func : callable + Function to call like ``func(*args, **kwargs)`` on input arrays + (``*args``) that returns an array or tuple of arrays. If multiple + arguments with non-matching dimensions are supplied, this function is + expected to vectorize (broadcast) over axes of positional arguments in + the style of NumPy universal functions [1]_ (if this is not the case, + set ``vectorize=True``). If this function returns multiple outputs, + ``output_core_dims`` has to be set as well. + signature: string + Specifies what core dimensions are consumed and produced by ``func``. + According to the specification of numpy.gufunc signature [2]_ + *args : numeric + Input arrays or scalars to the callable function. + axes: List of tuples, optional, keyword only + A list of tuples with indices of axes a generalized ufunc should operate on. + For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for + matrix multiplication, the base elements are two-dimensional matrices + and these are taken to be stored in the two last axes of each argument. The + corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``. + For simplicity, for generalized ufuncs that operate on 1-dimensional arrays + (vectors), a single integer is accepted instead of a single-element tuple, + and for generalized ufuncs for which all outputs are scalars, the output + tuples can be omitted. + keepdims: bool, optional, keyword only + If this is set to True, axes which are reduced over will be left in the result as + a dimension with size one, so that the result will broadcast correctly against the + inputs. This option can only be used for generalized ufuncs that operate on inputs + that all have the same number of core dimensions and with outputs that have no core + dimensions , i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``. + If used, the location of the dimensions in the output can be controlled with axes + and axis. + output_dtypes : Optional, dtype or list of dtypes, keyword only + Valid numpy dtype specification or list thereof. + If not given, a call of ``func`` with a small set of data + is performed in order to try to automatically determine the + output dtypes. + vectorize: bool, keyword only + If set to ``True``, ``np.vectorize`` is applied to ``func`` for + convenience. Defaults to ``False``. + **kwargs : dict + Extra keyword arguments to pass to `func` + + Returns + ------- + Single chunked array or tuple of chunked arrays + + See Also + -------- + dask.array.gufunc.apply_gufunc + cubed.apply_gufunc + + References + ---------- + .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html + .. 
[2] https://docs.scipy.org/doc/numpy/reference/c-api/generalized-ufuncs.html """ raise NotImplementedError() @@ -246,7 +500,42 @@ def map_blocks( new_axis: int | Sequence[int] | None = None, **kwargs, ): - """Called in elementwise operations, but notably not called in xarray.map_blocks.""" + """ + Map a function across all blocks of a chunked array. + + Called in elementwise operations, but notably not (currently) called within xarray.map_blocks. + + Parameters + ---------- + func : callable + Function to apply to every block in the array. + If ``func`` accepts ``block_info=`` or ``block_id=`` + as keyword arguments, these will be passed dictionaries + containing information about input and output chunks/arrays + during computation. See examples for details. + args : dask arrays or other objects + dtype : np.dtype, optional + The ``dtype`` of the output array. It is recommended to provide this. + If not provided, will be inferred by applying the function to a small + set of fake data. + chunks : tuple, optional + Chunk shape of resulting blocks if the function does not preserve + shape. If not provided, the resulting array is assumed to have the same + block structure as the first input array. + drop_axis : number or iterable, optional + Dimensions lost by the function. + new_axis : number or iterable, optional + New dimensions created by the function. Note that these are applied + after ``drop_axis`` (if present). + **kwargs : + Other keyword arguments to pass to function. Values must be constants + (not dask.arrays) + + See Also + -------- + dask.array.map_blocks + cubed.map_blocks + """ raise NotImplementedError() def blockwise( @@ -259,7 +548,45 @@ def blockwise( align_arrays: bool = True, **kwargs, ): - """Called by some niche functions in xarray.""" + """ + Tensor operation: Generalized inner and outer products. + + A broad class of blocked algorithms and patterns can be specified with a + concise multi-index notation. The ``blockwise`` function applies an in-memory + function across multiple blocks of multiple inputs in a variety of ways. + Many chunked array operations are special cases of blockwise including + elementwise, broadcasting, reductions, tensordot, and transpose. + + Currently only called explicitly in xarray when performing multidimensional interpolation. + + Parameters + ---------- + func : callable + Function to apply to individual tuples of blocks + out_ind : iterable + Block pattern of the output, something like 'ijk' or (1, 2, 3) + *args : sequence of Array, index pairs + You may also pass literal arguments, accompanied by None index + e.g. (x, 'ij', y, 'jk', z, 'i', some_literal, None) + **kwargs : dict + Extra keyword arguments to pass to function + adjust_chunks : dict + Dictionary mapping index to function to be applied to chunk sizes + new_axes : dict, keyword only + New indexes and their dimension lengths + align_arrays: bool + Whether or not to align chunks along equally sized dimensions when + multiple arrays are provided. This allows for larger chunks in some + arrays to be broken into smaller ones that match chunk sizes in other + arrays such that they are compatible for block function mapping. If + this is false, then an error will be thrown if arrays do not already + have the same number of blocks in each dimension. 
+ + See Also + -------- + dask.array.blockwise + cubed.core.blockwise + """ raise NotImplementedError() def unify_chunks( @@ -267,7 +594,21 @@ def unify_chunks( *args: Any, # can't type this as mypy assumes args are all same type, but dask unify_chunks args alternate types **kwargs, ) -> tuple[dict[str, T_NormalizedChunks], list[T_ChunkedArray]]: - """Called by xr.unify_chunks.""" + """ + Unify chunks across a sequence of arrays. + + Called by xarray.unify_chunks. + + Parameters + ---------- + *args: sequence of Array, index pairs + Sequence like (x, 'ij', y, 'jk', z, 'i') + + See Also + -------- + dask.array.core.unify_chunks + cubed.core.unify_chunks + """ raise NotImplementedError() def store( @@ -276,5 +617,29 @@ def store( targets: Any, **kwargs: dict[str, Any], ): - """Used when writing to any backend.""" + """ + Store chunked arrays in array-like objects, overwriting data in target. + + This stores chunked arrays into object that supports numpy-style setitem + indexing (e.g. a Zarr Store). Allows storing values chunk by chunk so that it does not have to + fill up memory. For best performance you likely want to align the block size of + the storage target with the block size of your array. + + Used when writing to any registered xarray I/O backend. + + Parameters + ---------- + sources: Array or collection of Arrays + targets: array-like or collection of array-likes + These should support setitem syntax ``target[10:20] = ...``. + If sources is a single item, targets must be a single item; if sources is a + collection of arrays, targets must be a matching collection. + kwargs: + Parameters passed to compute/persist (only used if compute=True) + + See Also + -------- + dask.array.store + cubed.store + """ raise NotImplementedError() diff --git a/xarray/plot/dataarray_plot.py b/xarray/plot/dataarray_plot.py index 7f11ddac0a6..d2c0a8e2af6 100644 --- a/xarray/plot/dataarray_plot.py +++ b/xarray/plot/dataarray_plot.py @@ -733,15 +733,6 @@ def _plot1d(plotfunc): If specified plot 3D and use this coordinate for *z* axis. hue : Hashable or None, optional Dimension or coordinate for which you want multiple lines plotted. - hue_style: {'discrete', 'continuous'} or None, optional - How to use the ``hue`` variable: - - - ``'continuous'`` -- continuous color scale - (default for numeric ``hue`` variables) - - ``'discrete'`` -- a color for each unique value, - using the default color cycle - (default for non-numeric ``hue`` variables) - markersize: Hashable or None, optional scatter only. Variable by which to vary size of scattered points. linewidth: Hashable or None, optional @@ -935,6 +926,19 @@ def newplotfunc( warnings.warn(msg, DeprecationWarning, stacklevel=2) del args + if hue_style is not None: + # TODO: Not used since 2022.10. Deprecated since 2023.07. + warnings.warn( + ( + "hue_style is no longer used for plot1d plots " + "and the argument will eventually be removed. " + "Convert numbers to string for a discrete hue " + "and use add_legend or add_colorbar to control which guide to display." + ), + DeprecationWarning, + stacklevel=2, + ) + _is_facetgrid = kwargs.pop("_is_facetgrid", False) if plotfunc.__name__ == "scatter": diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index e807081f838..2c58fe83cef 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -1438,6 +1438,16 @@ def data_is_numeric(self) -> bool: >>> a = xr.DataArray([0.5, 0, 0, 0.5, 2, 3]) >>> _Normalize(a).data_is_numeric True + + >>> # TODO: Datetime should be numeric right? 
+ >>> a = xr.DataArray(pd.date_range("2000-1-1", periods=4)) + >>> _Normalize(a).data_is_numeric + False + + # TODO: Timedelta should be numeric right? + >>> a = xr.DataArray(pd.timedelta_range("-1D", periods=4, freq="D")) + >>> _Normalize(a).data_is_numeric + True """ return self._data_is_numeric diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0450e769e7b..dad2d668ff8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1611,6 +1611,20 @@ def test_encoding_unlimited_dims(self) -> None: assert actual.encoding["unlimited_dims"] == set("y") assert_equal(ds, actual) + def test_raise_on_forward_slashes_in_names(self) -> None: + # test for forward slash in variable names and dimensions + # see GH 7943 + data_vars: list[dict[str, Any]] = [ + {"PASS/FAIL": (["PASSFAIL"], np.array([0]))}, + {"PASS/FAIL": np.array([0])}, + {"PASSFAIL": (["PASS/FAIL"], np.array([0]))}, + ] + for dv in data_vars: + ds = Dataset(data_vars=dv) + with pytest.raises(ValueError, match="Forward slashes '/' are not allowed"): + with self.roundtrip(ds): + pass + @requires_netCDF4 class TestNetCDF4Data(NetCDF4Base): diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 18ca49670ba..8b2dfbdec41 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2708,23 +2708,32 @@ def test_bad_args( x=x, y=y, hue=hue, add_legend=add_legend, add_colorbar=add_colorbar ) - @pytest.mark.xfail(reason="datetime,timedelta hue variable not supported.") - @pytest.mark.parametrize("hue_style", ["discrete", "continuous"]) - def test_datetime_hue(self, hue_style: Literal["discrete", "continuous"]) -> None: + def test_datetime_hue(self) -> None: ds2 = self.ds.copy() + + # TODO: Currently plots as categorical, should it behave as numerical? ds2["hue"] = pd.date_range("2000-1-1", periods=4) - ds2.plot.scatter(x="A", y="B", hue="hue", hue_style=hue_style) + ds2.plot.scatter(x="A", y="B", hue="hue") ds2["hue"] = pd.timedelta_range("-1D", periods=4, freq="D") - ds2.plot.scatter(x="A", y="B", hue="hue", hue_style=hue_style) + ds2.plot.scatter(x="A", y="B", hue="hue") - @pytest.mark.parametrize("hue_style", ["discrete", "continuous"]) - def test_facetgrid_hue_style( - self, hue_style: Literal["discrete", "continuous"] - ) -> None: - g = self.ds.plot.scatter( - x="A", y="B", row="row", col="col", hue="hue", hue_style=hue_style - ) + def test_facetgrid_hue_style(self) -> None: + ds2 = self.ds.copy() + + # Numbers plots as continous: + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") + assert isinstance(g._mappables[-1], mpl.collections.PathCollection) + + # Datetimes plots as categorical: + # TODO: Currently plots as categorical, should it behave as numerical? 
+ ds2["hue"] = pd.date_range("2000-1-1", periods=4) + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") + assert isinstance(g._mappables[-1], mpl.collections.PathCollection) + + # Strings plots as categorical: + ds2["hue"] = ["a", "a", "b", "b"] + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") assert isinstance(g._mappables[-1], mpl.collections.PathCollection) @pytest.mark.parametrize( diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index ad9da0fad33..addd7587544 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -2,6 +2,7 @@ import functools import operator +import sys import numpy as np import pandas as pd @@ -1508,6 +1509,10 @@ def test_dot_dataarray(dtype): class TestVariable: + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -2339,6 +2344,10 @@ def test_repr(self, func, variant, dtype): # warnings or errors, but does not check the result func(data_array) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -2416,6 +2425,10 @@ def test_aggregation(self, func, dtype): assert_units_equal(expected, actual) assert_allclose(expected, actual) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -4069,6 +4082,10 @@ def test_repr(self, func, variant, dtype): # warnings or errors, but does not check the result func(ds) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", (