From ad5a3f4ac0a74c6663ad1780903ec7b15084216c Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Sun, 3 May 2020 16:23:51 +0200 Subject: [PATCH 1/5] add tests --- xarray/tests/test_computation.py | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 4eed464d2dc..b45049ae72f 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,6 +817,58 @@ def test_vectorize_dask(): assert_identical(expected, actual) +@requires_dask +def test_vectorize_dask_dtype(): + # ensure output_dtypes is preserved with vectorize=True + # GH4015 + + # integer + data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + expected = xr.DataArray([1, 2], dims=["x"]) + actual = apply_ufunc( + pandas_median, + data_array.chunk({"x": 1}), + input_core_dims=[["y"]], + vectorize=True, + dask="parallelized", + output_dtypes=[int], + ) + assert_identical(expected, actual) + assert expected.dtype == actual.dtype + + # complex + data_array = xr.DataArray([[0 + 0j, 1 + 2j, 2 + 1j]], dims=("x", "y")) + expected = data_array.copy() + actual = apply_ufunc( + identity, + data_array.chunk({"x": 1}), + vectorize=True, + dask="parallelized", + output_dtypes=[complex], + ) + assert_identical(expected, actual) + assert expected.dtype == actual.dtype + + +@requires_dask +def test_vectorize_dask_dtype_meta(): + # meta dtype takes precedence + + data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) + expected = xr.DataArray([1, 2], dims=["x"]) + actual = apply_ufunc( + pandas_median, + data_array.chunk({"x": 1}), + input_core_dims=[["y"]], + vectorize=True, + dask="parallelized", + output_dtypes=[int], + meta=np.ndarray((0, 0), dtype=np.float), + ) + assert_identical(expected, actual) + assert np.float == actual.dtype + + @requires_dask def test_vectorize_dask_new_output_dims(): # regression test for GH3574 From ba73f16346940d36620672a533f2c50019128caa Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Sun, 3 May 2020 17:07:55 +0200 Subject: [PATCH 2/5] update apply_ufunc --- xarray/core/computation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index a3723ea9db9..b62a5ce0372 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -864,7 +864,9 @@ def apply_ufunc( inputs are a dask array. If used, the ``output_dtypes`` argument must also be provided. Multiple output arguments are not yet supported. output_dtypes : list of dtypes, optional - Optional list of output dtypes. Only used if dask='parallelized'. + Optional list of output dtypes. Used in ``np.vectorize`` and required if + dask='parallelized'. Note that the dtype of meta takes precedence over + output_dtypes in ``dask.array.blockwise``. output_sizes : dict, optional Optional mapping from dimension names to sizes for outputs. Only used if dask='parallelized' and new dimensions (not found on inputs) appear @@ -1005,10 +1007,12 @@ def earth_mover_distance(first_samples, func = functools.partial(func, **kwargs) if vectorize: - if meta is None: - # set meta=np.ndarray by default for numpy vectorized functions - # work around dask bug computing meta with vectorized functions: GH5642 - meta = np.ndarray + if dask == "parallelized" and meta is None: + # only basic checks: _apply_blockwise will raise the appropriate errors + if output_dtypes is not None and isinstance(output_dtypes, list): + # set meta=np.ndarray by default for numpy vectorized functions + # work around dask bug computing meta with vectorized functions: GH5642 + meta = np.ndarray((0, 0), dtype=output_dtypes[0]) if signature.all_core_dims: func = np.vectorize( From f154d1c520bafcfc2aa094500b59dd4f5acb18e9 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Sun, 3 May 2020 17:16:19 +0200 Subject: [PATCH 3/5] whats new --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 051a41a57e5..27d8c6a8e5f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -92,6 +92,9 @@ Bug fixes - Fix bug in time parsing failing to fall back to cftime. This was causing time variables with a time unit of `'msecs'` to fail to parse. (:pull:`3998`) By `Ryan May `_. +- Ensure ``output_dtypes`` is preserved when using :py:func:`apply_ufunc` with + ``vectorize=True`` and ``dask="parallelized"`` (:issue:`4015`), by + `Mathias Hauser `_ Documentation ~~~~~~~~~~~~~ From 303cef18a3422de656e1d490d7b624b6c8407c89 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Tue, 5 May 2020 14:33:26 +0200 Subject: [PATCH 4/5] add warning --- xarray/core/computation.py | 9 +++++++++ xarray/tests/test_computation.py | 22 +++++++++++++--------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index b62a5ce0372..037820a8cc3 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -4,6 +4,7 @@ import functools import itertools import operator +import warnings from collections import Counter from typing import ( TYPE_CHECKING, @@ -687,6 +688,14 @@ def _apply_blockwise( ) (dtype,) = output_dtypes + if (meta is not None) and hasattr(meta, "dtype") and (meta.dtype != dtype): + warnings.warn( + f"dtype of meta ({meta.dtype}) takes precedence over" + f" output_dtypes ({dtype})", + UserWarning, + stacklevel=3, + ) + if output_sizes is None: output_sizes = {} diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index b45049ae72f..b57312f7f16 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -856,15 +856,19 @@ def test_vectorize_dask_dtype_meta(): data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y")) expected = xr.DataArray([1, 2], dims=["x"]) - actual = apply_ufunc( - pandas_median, - data_array.chunk({"x": 1}), - input_core_dims=[["y"]], - vectorize=True, - dask="parallelized", - output_dtypes=[int], - meta=np.ndarray((0, 0), dtype=np.float), - ) + + with pytest.warns( + UserWarning, match=r"dtype of meta \(float64\) takes precedence", + ): + actual = apply_ufunc( + pandas_median, + data_array.chunk({"x": 1}), + input_core_dims=[["y"]], + vectorize=True, + dask="parallelized", + output_dtypes=[int], + meta=np.ndarray((0, 0), dtype=np.float), + ) assert_identical(expected, actual) assert np.float == actual.dtype From db5913a794f4b246f5ab4b235eebd0c76d1f7cdf Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Tue, 5 May 2020 21:12:37 +0200 Subject: [PATCH 5/5] combine if statements --- xarray/core/computation.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 037820a8cc3..fef161b54d1 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1016,12 +1016,16 @@ def earth_mover_distance(first_samples, func = functools.partial(func, **kwargs) if vectorize: - if dask == "parallelized" and meta is None: - # only basic checks: _apply_blockwise will raise the appropriate errors - if output_dtypes is not None and isinstance(output_dtypes, list): - # set meta=np.ndarray by default for numpy vectorized functions - # work around dask bug computing meta with vectorized functions: GH5642 - meta = np.ndarray((0, 0), dtype=output_dtypes[0]) + if ( + dask == "parallelized" + and meta is None + and output_dtypes is not None + and isinstance(output_dtypes, list) + ): + # set meta=np.ndarray by default for numpy vectorized functions + # work around dask bug computing meta with vectorized functions: GH5642 + # defer raising errors to _apply_blockwise (e.g. if output_dtypes is None) + meta = np.ndarray((0, 0), dtype=output_dtypes[0]) if signature.all_core_dims: func = np.vectorize(