Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/apply ufunc meta dtype #4022

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ Bug fixes
- Fix bug in time parsing failing to fall back to cftime. This was causing time
variables with a time unit of `'msecs'` to fail to parse. (:pull:`3998`)
By `Ryan May <https://github.com/dopplershift>`_.
- Ensure ``output_dtypes`` is preserved when using :py:func:`apply_ufunc` with
``vectorize=True`` and ``dask="parallelized"`` (:issue:`4015`), by
`Mathias Hauser <https://github.com/mathause>`_

Documentation
~~~~~~~~~~~~~
Expand Down
14 changes: 9 additions & 5 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,9 @@ def apply_ufunc(
inputs are a dask array. If used, the ``output_dtypes`` argument must
also be provided. Multiple output arguments are not yet supported.
output_dtypes : list of dtypes, optional
Optional list of output dtypes. Only used if dask='parallelized'.
Optional list of output dtypes. Used in ``np.vectorize`` and required if
dask='parallelized'. Note that the dtype of meta takes precedence over
output_dtypes in ``dask.array.blockwise``.
output_sizes : dict, optional
Optional mapping from dimension names to sizes for outputs. Only used
if dask='parallelized' and new dimensions (not found on inputs) appear
Expand Down Expand Up @@ -1005,10 +1007,12 @@ def earth_mover_distance(first_samples,
func = functools.partial(func, **kwargs)

if vectorize:
if meta is None:
# set meta=np.ndarray by default for numpy vectorized functions
# work around dask bug computing meta with vectorized functions: GH5642
meta = np.ndarray
if dask == "parallelized" and meta is None:
# only basic checks: _apply_blockwise will raise the appropriate errors
if output_dtypes is not None and isinstance(output_dtypes, list):
dcherian marked this conversation as resolved.
Show resolved Hide resolved
dcherian marked this conversation as resolved.
Show resolved Hide resolved
# set meta=np.ndarray by default for numpy vectorized functions
# work around dask bug computing meta with vectorized functions: GH5642
meta = np.ndarray((0, 0), dtype=output_dtypes[0])
dcherian marked this conversation as resolved.
Show resolved Hide resolved

if signature.all_core_dims:
func = np.vectorize(
Expand Down
52 changes: 52 additions & 0 deletions xarray/tests/test_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,58 @@ def test_vectorize_dask():
assert_identical(expected, actual)


@requires_dask
def test_vectorize_dask_dtype():
# ensure output_dtypes is preserved with vectorize=True
# GH4015

# integer
data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y"))
expected = xr.DataArray([1, 2], dims=["x"])
actual = apply_ufunc(
pandas_median,
data_array.chunk({"x": 1}),
input_core_dims=[["y"]],
vectorize=True,
dask="parallelized",
output_dtypes=[int],
)
assert_identical(expected, actual)
assert expected.dtype == actual.dtype

# complex
data_array = xr.DataArray([[0 + 0j, 1 + 2j, 2 + 1j]], dims=("x", "y"))
expected = data_array.copy()
actual = apply_ufunc(
identity,
data_array.chunk({"x": 1}),
vectorize=True,
dask="parallelized",
output_dtypes=[complex],
)
assert_identical(expected, actual)
assert expected.dtype == actual.dtype


@requires_dask
def test_vectorize_dask_dtype_meta():
# meta dtype takes precedence

data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=("x", "y"))
expected = xr.DataArray([1, 2], dims=["x"])
actual = apply_ufunc(
pandas_median,
data_array.chunk({"x": 1}),
input_core_dims=[["y"]],
vectorize=True,
dask="parallelized",
output_dtypes=[int],
dcherian marked this conversation as resolved.
Show resolved Hide resolved
meta=np.ndarray((0, 0), dtype=np.float),
)
assert_identical(expected, actual)
assert np.float == actual.dtype


@requires_dask
def test_vectorize_dask_new_output_dims():
# regression test for GH3574
Expand Down