diff --git a/doc/api.rst b/doc/api.rst index fbd0a75bb41..e7d1cd495ca 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -32,6 +32,7 @@ Top-level functions cov corr dot + hist polyval map_blocks show_versions @@ -180,6 +181,7 @@ Computation Dataset.map_blocks Dataset.polyfit Dataset.curvefit + Dataset.hist **Aggregation**: :py:attr:`~Dataset.all` @@ -378,6 +380,7 @@ Computation DataArray.polyfit DataArray.map_blocks DataArray.curvefit + DataArray.hist **Aggregation**: :py:attr:`~DataArray.all` diff --git a/properties/test_chunking_patterns.py b/properties/test_chunking_patterns.py new file mode 100644 index 00000000000..eb99736f277 --- /dev/null +++ b/properties/test_chunking_patterns.py @@ -0,0 +1,116 @@ +""" +Property-based tests that dask-powered algorithms will succeed regardless of the +pattern of chunks on the input arrays. +""" + +import uuid + +import pytest + +pytest.importorskip("hypothesis") + +import hypothesis.strategies as st +import numpy as np +from hypothesis import given + +import xarray as xr +from xarray.tests import requires_dask + + +def example_dataarray(shape=(5, 20)): + data = np.random.randn(*shape) + dims = [f"dim_{i}" for i in range(len(shape))] + da = xr.DataArray(data, dims=dims, name="T") + return da + + +def example_dataset(n_dim=2, n_vars=2): + """Random dataset with every variable having the same shape""" + + shape = tuple(range(8, 8 + n_dim)) + dims = [f"dim_{i}" for i in range(len(shape))] + var_names = [uuid.uuid4().hex for _ in range(n_vars)] + ds = xr.Dataset() + for i in range(n_vars): + name = var_names[i] + data = np.random.randn(*shape) + da = xr.DataArray(data, dims=dims, name=name) + ds[name] = da + return ds + + +@st.composite +def chunk_shapes(draw, n_dim=3, max_arr_len=10): + """Generate different chunking patterns for an N-D array of data.""" + chunks = [] + for n in range(n_dim): + shape = draw(st.integers(min_value=1, max_value=max_arr_len)) + chunks.append(shape) + return tuple(chunks) + + +@requires_dask +class TestChunkingHypotheses: + @given(chunk_shapes(n_dim=1, max_arr_len=20)) + def test_all_chunking_patterns_1d(self, chunks): + + data = example_dataarray(shape=(20,)).chunk(chunks) + + nbins_a = 8 + bins = np.linspace(-4, 4, nbins_a + 1) + + h = xr.hist(data, bins=[bins]) + + assert h.shape == (nbins_a,) + + hist, _ = np.histogram( + data.values, + bins=bins, + ) + + np.testing.assert_allclose(hist, h) + + @pytest.mark.slow + @given(chunk_shapes(n_dim=2, max_arr_len=8)) + def test_all_chunking_patterns_2d(self, chunks): + + data_a = example_dataarray(shape=(5, 20)).chunk(chunks) + data_b = example_dataarray(shape=(5, 20)).chunk(chunks) + + nbins_a = 8 + nbins_b = 9 + bins_a = np.linspace(-4, 4, nbins_a + 1) + bins_b = np.linspace(-4, 4, nbins_b + 1) + + h = xr.hist(data_a, data_b, bins=[bins_a, bins_b]) + + assert h.shape == (nbins_a, nbins_b) + + hist, _, _ = np.histogram2d( + data_a.values.ravel(), + data_b.values.ravel(), + bins=[bins_a, bins_b], + ) + + np.testing.assert_allclose(hist, h.values) + + @pytest.mark.slow + @pytest.mark.parametrize("n_vars", [1, 2, 3, 4]) + @given(chunk_shapes(n_dim=2, max_arr_len=7)) + def test_all_chunking_patterns_dd_hist(self, n_vars, chunk_shapes): + ds = example_dataset(n_dim=2, n_vars=n_vars) + ds = ds.chunk({d: c for d, c in zip(ds.dims.keys(), chunk_shapes)}) + + n_bins = (7, 8, 9, 10)[:n_vars] + bins = [np.linspace(-4, 4, n + 1) for n in n_bins] + + h = xr.hist(*[da for name, da in ds.data_vars.items()], bins=bins) + + assert h.shape == n_bins + + input_data = np.stack( + [da.values.ravel() for name, da in ds.data_vars.items()], axis=-1 + ) + hist, _ = np.histogramdd(input_data, bins=bins) + + np.testing.assert_allclose(hist, h.values) diff --git a/xarray/__init__.py b/xarray/__init__.py index 3886edc60e6..54e2f4ec45d 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -18,7 +18,7 @@ from .core.alignment import align, broadcast from .core.combine import combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, corr, cov, dot, polyval, where +from .core.computation import apply_ufunc, corr, cov, dot, hist, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset @@ -57,6 +57,7 @@ "cov", "corr", "full_like", + "hist", "infer_freq", "load_dataarray", "load_dataset", diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 07b7bca27c5..d0d6834f280 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -26,16 +26,17 @@ import numpy as np from . import dtypes, duck_array_ops, utils -from .alignment import align, deep_align +from .alignment import align, broadcast, deep_align from .merge import merge_attrs, merge_coordinates_without_align from .options import OPTIONS, _get_keep_attrs from .pycompat import is_duck_dask_array -from .utils import is_dict_like +from .utils import OrderedSet, is_dict_like from .variable import Variable if TYPE_CHECKING: from .coordinates import Coordinates # noqa from .dataset import Dataset + from .dataarray import DataArray _NO_FILL_VALUE = utils.ReprObject("") _DEFAULT_NAME = utils.ReprObject("") @@ -1724,3 +1725,304 @@ def _calc_idxminmax( res.attrs = indx.attrs return res + + +_ALLOWED_BINS_TYPES = Union[int, str, np.ndarray, DataArray, None] + + +def hist( + *dataarrays : DataArray, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + weights : DataArray = None, + density : bool = False, + keep_attrs : bool = None, +) -> DataArray: + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + Parameters + ---------- + dataarrays : xarray.DataArray objects + Input data. The number of input arguments determines the dimensionality of + the histogram. For example, two arguments produce a 2D histogram. + dim : str or tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int, str, numpy array or DataArray, or a list of ints, strs, arrays and/or DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specifications for each entry: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[var]_bins` + for a given input variable named `var`), and must not have any + dimensions shared with the `dim` argument. If supplied this DataArray + will be present as a coordinate on the output. + * If a list of ints, strs, arrays and/or DataArrays; the bin specification + as above for every argument in ``args``. + * If not supplied (or any elements of the list are `None`) then bins + will be automatically calculated by `np.histogram_bin_edges`. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will also be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly via arrays or DataArrays, because + other bin specifications trigger loading of the entire input data. + range : (float, float) or a list of (float, float), optional + If a list, there should be one entry for each item in ``args``. + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins for all + arguments in ``args``. Values outside the range are ignored. The first + element of the range must be less than or equal to the second. `range` + affects the automatic bin computation as well. In this case, while bin + width is computed to be optimal based on the actual data within `range`, + the bin count will fill the entire range including portions containing + no data. + * If a list of (float, float); the ranges as above for every argument in + ``args``. + * If not provided, range is simply ``(arg.min(), arg.max())`` for each + arg. + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the first original + object passed to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `[var]_bins`, which contains the positions + of the centres of each bin, varing along a new dimension of the same name. + + All other coordinates will be retained, unless they depend on a dimension + which has been reduced along, in which case they will be dropped. + + Examples + -------- + + See Also + -------- + DataArray.hist + Dataset.hist + numpy.histogramdd + numpy.histogram_bin_edges + dask.array.blockwise + """ + + from .dataarray import DataArray, Variable + + # Check inputs + if any(not isinstance(arr, DataArray) for arr in dataarrays): + raise TypeError( + "Only xr.DataArray is supported as input data, but given " + f"{[type(arr) for arr in dataarrays]}." + ) + if weights is not None: + if not isinstance(weights, (Variable, DataArray)): + raise TypeError( + "Only xr.DataArray and xr.Variable are supported as weights, " + "but given {type(weights)}." + ) + + n_args = len(dataarrays) + if n_args == 0: + raise TypeError("At least one input dataarray must be given.") + + for da in dataarrays: + if da.name is None: + raise ValueError("All input dataarrays must have a name.") + + if isinstance(dim, str): + dim = (dim,) + reduce_dims = dim + + # Will broadcast over all dimensions not counted along by histogram + all_input_dims = ordered_set_union([da.dims for da in dataarrays]) + broadcast_dims : Iterable[Hashable] = OrderedSet(*all_input_dims) - OrderedSet(*reduce_dims) + + # Create output dims + new_bin_dims = [da.name + "_bins" for da in dataarrays] + output_dims = [broadcast_dims] + new_bin_dims + + # Check range kwarg + def _check_or_find_range(r, da): + if r is None: + lower, upper = da.min(), da.max() + else: + lower, upper = r + if not all(isinstance(bound, float) for bound in (lower, upper)): + raise TypeError( + "ranges must be provided in form (float, float), " + f"but found ({type(lower)}, {type(upper)})." + ) + if lower > upper: + raise ValueError( + "lower bound of range must be smaller than upper bound" + ) + return lower, upper + + if isinstance(range, list): + if len(range) != n_args: + raise TypeError( + "If `range` is a list then it must have same length " + "as number of input dataarrays passed, but instead has " + f"length {len(range)}." + ) + else: + range = [range] * n_args + ranges = [_check_or_find_range(r, da) for r, da in zip(range, dataarrays)] + + # Check bins kwarg + def _check_and_format_bins_into_coords(b, da, r, bin_dim): + # Check validity of given bins, or create using np.histogram_bin_edges + # Package into a coordinate DataArray before returning + _edges = np.histogram_bin_edges + if isinstance(b, (int, str)) or b is None: + if is_duck_dask_array(da.data): + raise TypeError( + f"Choice of bins as {b} would trigger loading " + f"of entire input array to histogram" + ) + b = "auto" if b is None else b + return DataArray( + _edges(da.values, b, r), dims=bin_dim, name=bin_dim, attrs=da.attrs + ) + elif isinstance(b, np.ndarray): + if b.ndim > 1: + raise ValueError( + "bins specified as numpy arrays can only be 1-dimensional" + ) + return DataArray(b, dims=bin_dim, name=bin_dim, attrs=da.attrs) + elif isinstance(b, DataArray): + if bin_dim not in b.dims: + raise ValueError( + "A bins dataarray does not contain the " + "corresponding output bins dimension - " + f"has dims {b.dims} but not {bin_dim}." + ) + if not (set(b.dims) - set([bin_dim])).issubset(broadcast_dims): + raise ValueError( + "A bins dataarray has dimensions present that " + "will not be broadcast on the output: " + f"{da.dims} vs {tuple(*broadcast_dims)}" + ) + return b + else: + raise TypeError(f"Type {type(b)} is not a valid argument to bins") + + if isinstance(bins, list): + if len(bins) != n_args: + raise TypeError( + "If `bins` is a list then it must have same length " + "as number of input dataarrays passed, but instead has " + f"length {len(bins)}. To manually specify bin edges for " + "a single input pass them as a numpy array instead." + ) + else: + bins = [bins] * n_args + bins_as_coords : List[DataArray] = [ + _check_and_format_bins_into_coords(b, da, r, d) + for b, da, r, d in zip(bins, dataarrays, ranges, new_bin_dims) + ] + + # Align / broadcast all inputs (including weights and bins) + # TODO if this was merely alignment could blockwise handle all the broadcasting? + arrs = broadcast(dataarrays) + weights = weights.broadcast_like(arrs[0]) + # TODO surround with try except? + aligned_bins = [b.broadcast_like(arrs[0]) for b in bins_as_coords] + # TODO bins now already has the output dims included, is that correct? + + # Compute histogram results + reduce_axes = tuple(list(all_input_dims).index(d) for d in reduce_dims) + h_data = _calc_histogram( + [arr.values for arr in arrs], + axis=reduce_axes, + bins=[b.values for b in aligned_bins], + weights=weights.values, + density=density, + ) + + # Reconstruct output dataarray + output_name = "_".join(["histogram"] + [da.name for da in dataarrays]) + + # Adjust bin coords to return positions of bin centres rather than bin edges + def _find_centers(da, dim): + return 0.5 * (da.isel(dim=slice(None, None, -1)) + da.isel(dim=slice(1, None))) + + bin_centers = [ + _find_centers(bin, new_bin_dim) for bin, new_bin_dim in zip(bins, new_bin_dims) + ] + + # Keep all old coords from any input that have not been reduced along + old_coords = { + coord.name: coord + for da in dataarrays + for coord in da.coords + if not any(coord.dims in reduce_dims) + } + + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + attrs = dataarrays[0].attrs if keep_attrs else None + + return DataArray( + h_data, + dims=output_dims, + coords={**old_coords, **bin_centers}, + name=output_name, + attrs=attrs, + ) + + +def _calc_histogram(*arrs, axis, bins, weights, density): + """ + Internal axis-aware histogram calculation. + Will map over dask arrays using blockwise if any are present. + Expects all numpy/dask arrays, already aligned and broadcast against one another. + """ + + # TODO xhistogram code goes here + # Use xhistogram.core.histogram + + # TODO how does xarray normally test for dask arrays? + + return [] + + +def _bincount(): + """ + Axis-aware bincounting function. + Acts on a single numpy array (/ dask chunk). + Works by reshaping input to combine all broadcast dims along one axis, + and all reduce dims along one axis, then reshaping back. + """ + return ... diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2ab60b894e1..5ee95f3ca37 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -73,6 +73,8 @@ T_DataArray = TypeVar("T_DataArray", bound="DataArray") T_DSorDA = TypeVar("T_DSorDA", "DataArray", Dataset) if TYPE_CHECKING: + from .computation import _ALLOWED_BINS_TYPES + try: from dask.delayed import Delayed except ImportError: @@ -4599,6 +4601,107 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Tuple[float, float] = None, + weights : T_DataArray = None, + density : bool = False, + keep_attrs : bool = None, + ) -> DataArray: + """ + Histogram applied along specified dimensions. + + If the supplied dataarray contains a chunked dask array it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int, str, numpy array or DataArray, optional + The bin specifications: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[da.name]_bins`), + and must not have any dimensions shared with the `dim` argument. + If supplied this DataArray will be present as a coordinate on the output. + * If not supplied then bins will be automatically calculated by + `np.histogram_bin_edges`. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will also be raised if the dataarray contains a dask array and + ``bins`` are not specified explicitly via an array or DataArray, because + other bin specifications trigger loading of the entire input data. + range : (float, float), optional + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins. Values + outside the range are ignored. The first element of the range must be + less than or equal to the second. `range` affects the automatic bin + computation as well. In this case, while bin width is computed to be + optimal based on the actual data within `range`, the bin count will + fill the entire range including portions containing no data. + * If not provided, range is simply ``(da.min(), da.max())``. + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate, named as + `[da.name]_bins`, which contains the positions of the centres of each bin. + + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + numpy.histogramdd + numpy.histogram_bin_edges + dask.array.blockwise + """ + + from .computation import hist + + return hist( + self, + dim=dim, + bins=bins, + range=range, + weights=weights, + density=density, + keep_attrs=keep_attrs, + ) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1697a7c67aa..dfe1b2075f4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -106,6 +106,7 @@ from ..backends import AbstractDataStore, ZarrStore from .dataarray import DataArray from .merge import CoercibleMapping + from .computation import _ALLOWED_BINS_TYPES T_DSorDA = TypeVar("T_DSorDA", DataArray, "Dataset") @@ -7627,3 +7628,115 @@ def _wrapper(Y, *coords_, **kwargs): result.attrs = self.attrs.copy() return result + + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + weights : DataArray = None, + density : bool = False, + keep_attrs : bool = None, + ) -> DataArray: + """ + Histogram applied along specified dimensions. Will create a N-dimensional + histogram, where N is the number of variables in the dataset. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int, str, numpy array or DataArray, or a list of ints, strs, arrays and/or DataArrays, optional + If a list, there should be one entry for each variable in the dataset. + The bin specifications for each entry: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[var]_bins` + for a given input variable named `var`), and must not have any + dimensions shared with the `dim` argument. If supplied this DataArray + will be present as a coordinate on the output. + * If a list of ints, strs, arrays and/or DataArrays; the bin specification + as above for every variable in the dataset. + * If not supplied (or any elements of the list are `None`) then bins + will be automatically calculated by `np.histogram_bin_edges`. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will also be raised if the variables contain dask arrays and + ``bins`` are not specified explicitly via arrays or DataArrays, because + other bin specifications trigger loading of the entire input variable. + range : (float, float) or a list of (float, float), optional + If a list, there should be one entry for each variable in the dataset. + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins for all + variables in the dataset. Values outside the range are ignored. The first + element of the range must be less than or equal to the second. `range` + affects the automatic bin computation as well. In this case, while bin + width is computed to be optimal based on the actual data within `range`, + the bin count will fill the entire range including portions containing + no data. + * If a list of (float, float); the ranges as above for every variable in + the dataset. + * If not provided, the ranges are simply ``(ds[var].min(), ds[var].max())`` + for each variable. + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + variable in the input dataset, named as `var_bins`, which contains + the positions of the centres of each bin. + + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + numpy.histogramdd + numpy.histogram_bin_edges + dask.array.blockwise + """ + + from .computation import hist + + return hist( + *[self[var] for var in self], + dim=dim, + bins=bins, + range=range, + weights=weights, + density=density, + keep_attrs=keep_attrs, + ) diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index e8838b07157..86232af8d5c 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Generic, Hashable, Iterable, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Generic, Hashable, Iterable, Tuple, List, Optional, TypeVar, Union from . import duck_array_ops from .computation import dot @@ -7,6 +7,7 @@ if TYPE_CHECKING: from .common import DataWithCoords # noqa: F401 from .dataarray import DataArray, Dataset + from .computation import _ALLOWED_BINS_TYPES T_DataWithCoords = TypeVar("T_DataWithCoords", bound="DataWithCoords") @@ -58,6 +59,71 @@ New {cls} object with the sum of the weights over the given dimension. """ +_WEIGHTED_HIST_DOCSTRING = """ + Weighted histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + Parameters + ---------- + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened {cls}. i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args`` + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. + + Returns + ------- + hist : xarray.DataArray + A xarray.DataArray object which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + variable supplied, named as `var_bins`, which contains the positions + of the centres of each bin. + + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + Dataset.hist + numpy.histogramdd + dask.array.blockwise + """ + class Weighted(Generic[T_DataWithCoords]): """An object that implements weighted operations. @@ -238,6 +304,23 @@ def mean( self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + density : bool = False, + keep_attrs : bool = None, + ) -> T_DataWithCoords: + return self.obj.hist( + dim=dim, + bins=bins, + range=range, + weights=self.weights, + density=density, + keep_attrs=keep_attrs, + ) + def __repr__(self): """provide a nice str repr of our Weighted object""" @@ -268,6 +351,8 @@ def _inject_docstring(cls, cls_name): cls.sum_of_weights.__doc__ = _SUM_OF_WEIGHTS_DOCSTRING.format(cls=cls_name) + cls.hist.__doc__ = _WEIGHTED_HIST_DOCSTRING.format(cls=cls_name) + cls.sum.__doc__ = _WEIGHTED_REDUCE_DOCSTRING_TEMPLATE.format( cls=cls_name, fcn="sum", on_zero="0" ) diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index f530427562a..b13ad21db13 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -9,9 +9,8 @@ import functools import numpy as np -import pandas as pd -from .facetgrid import _easy_facetgrid +from .facetgrid import FacetGrid, _easy_facetgrid from .utils import ( _add_colorbar, _assert_valid_xy, @@ -379,6 +378,9 @@ def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs): def hist( darray, + row=None, + col=None, + col_wrap=None, figsize=None, size=None, aspect=None, @@ -391,6 +393,9 @@ def hist( yticks=None, xlim=None, ylim=None, + bins=None, + weights=None, + density=False, **kwargs, ): """ @@ -398,12 +403,15 @@ def hist( Wraps :py:func:`matplotlib:matplotlib.pyplot.hist`. - Plots *N*-dimensional arrays by first flattening the array. + Plots *N*-dimensional arrays by first flattening the array and calculating + the histogram via :py:func:`DataArray.hist`. Parameters ---------- darray : DataArray - Can have any number of dimensions. + Can have any number of dimensions, but will be reduced to 1 dimension + by the histogram calculation before plotting (faceted plots will retain + more dimensions). figsize : tuple, optional A tuple (width, height) of the figure in inches. Mutually exclusive with ``size`` and ``ax``. @@ -416,16 +424,50 @@ def hist( ax : matplotlib axes object, optional Axes on which to plot. By default, use the current axes. Mutually exclusive with ``size`` and ``figsize``. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + See :py:func:DataArray.hist + weights : array_like, optional + See :py:func:DataArray.hist + density : bool, optional + See :py:func:DataArray.hist **kwargs : optional Additional keyword arguments to :py:func:`matplotlib:matplotlib.pyplot.hist`. """ + + # compute the dims to count over + reduce_dims = set(darray.dims) - set([row, col]) + + # Handle facetgrids first + if row or col: + allargs = locals().copy() + allargs.update(allargs.pop("kwargs")) + allargs.pop("darray") + + g = FacetGrid( + data=darray, + col=col, + row=row, + col_wrap=col_wrap, + sharex=False, + sharey=False, + figsize=figsize, + aspect=aspect, + size=size, + subplot_kws=kwargs, + ) + + return g.map(hist, **kwargs) + ax = get_axis(figsize, size, aspect, ax) - no_nan = np.ravel(darray.values) - no_nan = no_nan[pd.notnull(no_nan)] + h = darray.hist(bins=bins, dim=reduce_dims, weights=weights, density=density) + counts = h.values + bins = h.coords[f"{darray.name}_bins"].values - primitive = ax.hist(no_nan, **kwargs) + # Use the weights kwarg to avoid recomputing histogram in matplotlib + # (see matplotlib.pyplot.hist docstring) + primitive = ax.hist(bins[:-1], weights=counts, **kwargs) ax.set_title("Histogram") ax.set_xlabel(label_from_attrs(darray)) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index b7ae1ca9828..70eb1750101 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -15,6 +15,7 @@ apply_ufunc, broadcast_compat_data, collect_dict_values, + hist, join_dict_keys, ordered_set_intersection, ordered_set_union, @@ -1927,3 +1928,63 @@ def test_polyval(use_dask, use_datetime): da_pv = xr.polyval(da.x, coeffs) xr.testing.assert_allclose(da, da_pv.T) + + +class TestHistInputChecks: + def test_invalid_data(self): + with pytest.raises(TypeError, match="Only xr.DataArray is supported"): + hist("string") + + def test_invalid_weights(self): + with pytest.raises(TypeError, match="supported as weights"): + hist(xr.DataArray([0], name="a"), weights="string") + + def test_no_data(self): + with pytest.raises(TypeError, match="At least one"): + hist() + + def test_nameless_data(self): + with pytest.raises(ValueError, match="must have a name"): + hist(xr.DataArray([0])) + + def test_wrong_number_of_bins(self): + with pytest.raises(TypeError, match="must have same length"): + hist(xr.DataArray([0], name="a"), bins=[2, 3]) + + def test_non_1d_numpy_array(self): + with pytest.raises(ValueError, match="can only be 1-d"): + hist(xr.DataArray([0], name="a"), bins=np.array([[2, 3]])) + + def test_invalid_bins(self): + with pytest.raises(TypeError, match="not a valid argument"): + hist(xr.DataArray([0], name="a"), bins=2.7) + + def test_wrong_number_of_ranges(self): + with pytest.raises(TypeError, match="must have same length"): + hist(xr.DataArray([0], name="a"), range=[(1.0, 2.0), (2.0, 3.0)]) + + def test_unsorted_range(self): + with pytest.raises(ValueError, match="must be smaller"): + hist(xr.DataArray([0], name="a"), range=(6.7, 2.7)) + + def test_invalid_range(self): + with pytest.raises(TypeError, match="float, float"): + hist(xr.DataArray([0], name="a"), range=(2.7, [4.5, 6.7])) + + def test_bin_dataarrays_with_extra_dims(self): + data = xr.DataArray([0], dims=["x"], name="a") + bins = xr.DataArray([[1]], dims=["bad", "a_bins"]) + with pytest.raises(ValueError, match="will not be broadcast"): + hist(data, dim="x", bins=[bins]) + + def test_bin_dataarrays_without_reduce_dim(self): + data = xr.DataArray([0], dims=["x"], name="a") + bins = xr.DataArray(1) + with pytest.raises(ValueError, match="does not contain"): + hist(data, dim="x", bins=[bins]) + + def test_prevent_trigger_loading_dask(self): + with pytest.raises(TypeError, match="would trigger loading"): + hist(xr.DataArray([0], name="a").chunk(1), bins=2) + +