
Fix/apply ufunc meta dtype #4022

Closed · wants to merge 7 commits
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -94,6 +94,9 @@ Bug fixes
- Fix bug in time parsing failing to fall back to cftime. This was causing time
variables with a time unit of `'msecs'` to fail to parse. (:pull:`3998`)
By `Ryan May <https://github.com/dopplershift>`_.
- Ensure ``output_dtypes`` is preserved when using :py:func:`apply_ufunc` with
``vectorize=True`` and ``dask="parallelized"`` (:issue:`4015`), by
`Mathias Hauser <https://github.com/mathause>`_.

Documentation
~~~~~~~~~~~~~
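The behavior this entry describes can be sketched with a minimal example (assuming xarray and dask are installed; the median reduction and the array values are illustrative, mirroring the tests added in this PR):

```python
import numpy as np
import xarray as xr

# Sketch of the fix (GH4015): with vectorize=True and
# dask="parallelized", the dtype requested via output_dtypes
# should survive through to the computed result.
da = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y"))

result = xr.apply_ufunc(
    np.median,              # scalar reduction over the core dim
    da.chunk({"x": 1}),
    input_core_dims=[["y"]],
    vectorize=True,
    dask="parallelized",
    output_dtypes=[int],    # requested output dtype
)

print(result.compute().dtype)  # expected to match np.dtype(int)
```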
23 changes: 20 additions & 3 deletions xarray/core/computation.py
@@ -4,6 +4,7 @@
import functools
import itertools
import operator
import warnings
from collections import Counter
from typing import (
TYPE_CHECKING,
@@ -687,6 +688,14 @@ def _apply_blockwise(
)
(dtype,) = output_dtypes

if (meta is not None) and hasattr(meta, "dtype") and (meta.dtype != dtype):
warnings.warn(
f"dtype of meta ({meta.dtype}) takes precedence over"
f" output_dtypes ({dtype})",
UserWarning,
stacklevel=3,
)

if output_sizes is None:
output_sizes = {}

@@ -864,7 +873,9 @@ def apply_ufunc(
inputs are a dask array. If used, the ``output_dtypes`` argument must
also be provided. Multiple output arguments are not yet supported.
output_dtypes : list of dtypes, optional
Optional list of output dtypes. Only used if dask='parallelized'.
Optional list of output dtypes. Used in ``np.vectorize`` and required if
dask='parallelized'. Note that the dtype of meta takes precedence over
output_dtypes in ``dask.array.blockwise``.
output_sizes : dict, optional
Optional mapping from dimension names to sizes for outputs. Only used
if dask='parallelized' and new dimensions (not found on inputs) appear
@@ -1005,10 +1016,16 @@ def earth_mover_distance(first_samples,
func = functools.partial(func, **kwargs)

if vectorize:
if meta is None:
if (
dask == "parallelized"
and meta is None
and output_dtypes is not None
and isinstance(output_dtypes, list)
):
# set meta=np.ndarray by default for numpy vectorized functions
# work around dask bug computing meta with vectorized functions: GH5642
meta = np.ndarray
# defer raising errors to _apply_blockwise (e.g. if output_dtypes is None)
meta = np.ndarray((0, 0), dtype=output_dtypes[0])
Member:

shouldn't we still set meta = np.ndarray if no output dtype is specified?

Collaborator (author):

output_dtypes is required for dask="parallelized" and will error if it is missing:

if output_dtypes is None:
raise ValueError(
"output dtypes (output_dtypes) must be supplied to "
"apply_func when using dask='parallelized'"
)

so this won't take effect. I am also not very happy with my approach, but didn't want to copy the checks from _apply_blockwise up here; suggestions?

Contributor:

Maybe the cleaner workaround is to move this down into _apply_blockwise? Would it be enough to pass vectorize down to that level and then set meta as you are doing here?

Also, it seems like we should raise that error about output_dtypes only if meta.dtype has not been set?

Collaborator (author):

Yes, I agree it would be cleaner to thread vectorize through to _apply_blockwise.

> Also, it seems like we should raise that error about output_dtypes only if meta.dtype has not been set?

Depends how important output_dtypes is for np.vectorize.

I am happy to work more on this, but I think it would be good to discuss #4060 first, which might make this obsolete.
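The role of output_dtypes for np.vectorize mentioned in this thread can be illustrated with plain NumPy: otypes pins the output dtype instead of letting np.vectorize infer it from a trial call (a small sketch; the lambda is purely illustrative):

```python
import numpy as np

# Without otypes, np.vectorize infers the output dtype by evaluating
# the function on the first element; with otypes it is pinned up front.
inferred = np.vectorize(lambda x: x + 1)
pinned = np.vectorize(lambda x: x + 1, otypes=[np.float64])

a = np.array([1, 2, 3])
print(inferred(a).dtype)  # inferred from the first call (matches a.dtype here)
print(pinned(a).dtype)    # float64, forced by otypes
```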


if signature.all_core_dims:
func = np.vectorize(
56 changes: 56 additions & 0 deletions xarray/tests/test_computation.py
@@ -817,6 +817,62 @@ def test_vectorize_dask():
assert_identical(expected, actual)


@requires_dask
def test_vectorize_dask_dtype():
# ensure output_dtypes is preserved with vectorize=True
# GH4015

# integer
data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y"))
expected = xr.DataArray([1, 2], dims=["x"])
actual = apply_ufunc(
pandas_median,
data_array.chunk({"x": 1}),
input_core_dims=[["y"]],
vectorize=True,
dask="parallelized",
output_dtypes=[int],
)
assert_identical(expected, actual)
assert expected.dtype == actual.dtype

# complex
data_array = xr.DataArray([[0 + 0j, 1 + 2j, 2 + 1j]], dims=("x", "y"))
expected = data_array.copy()
actual = apply_ufunc(
identity,
data_array.chunk({"x": 1}),
vectorize=True,
dask="parallelized",
output_dtypes=[complex],
)
assert_identical(expected, actual)
assert expected.dtype == actual.dtype


@requires_dask
def test_vectorize_dask_dtype_meta():
# meta dtype takes precedence

data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y"))
expected = xr.DataArray([1, 2], dims=["x"])

with pytest.warns(
UserWarning, match=r"dtype of meta \(float64\) takes precedence",
):
actual = apply_ufunc(
pandas_median,
data_array.chunk({"x": 1}),
input_core_dims=[["y"]],
vectorize=True,
dask="parallelized",
output_dtypes=[int],
meta=np.ndarray((0, 0), dtype=np.float),
)
assert_identical(expected, actual)
assert np.float == actual.dtype


@requires_dask
def test_vectorize_dask_new_output_dims():
# regression test for GH3574