From 37a0bd9bafccb067c92b0ca857118a4c09730dc1 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Fri, 15 May 2020 09:33:38 +0100 Subject: [PATCH 01/36] Added chunks='auto' option in dataset.py --- xarray/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d93f4044a6d..a0be8fe7215 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1725,7 +1725,7 @@ def chunk( Parameters ---------- - chunks : int or mapping, optional + chunks : int, 'auto' or mapping, optional Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. name_prefix : str, optional @@ -1742,7 +1742,7 @@ def chunk( """ from dask.base import tokenize - if isinstance(chunks, Number): + if is_scalar(chunks): chunks = dict.fromkeys(self.dims, chunks) if chunks is not None: From 45edda18b3548cce8738621be9fb6b7d9f81903a Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Fri, 15 May 2020 16:40:14 +0100 Subject: [PATCH 02/36] reverted accidental changes in dataset.chunk() --- xarray/tests/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 75beb3757ca..506857841c8 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -994,7 +994,7 @@ def make_ds(): map_ds.coords["cx"].attrs["test2"] = "test2" map_ds.attrs["test"] = "test" map_ds.coords["xx"] = map_ds["a"] * map_ds.y - + return map_ds From 500e0b27a86c41de69678b1dac6c9b19375b4af6 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 19:11:14 +0100 Subject: [PATCH 03/36] Added corr and cov to computation.py. Taken from r-beer:xarray/corr --- xarray/core/computation.py | 157 +++++++++++++++++++++++++++++++++++++ xarray/tests/test_dask.py | 4 +- 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 28bf818e4a3..bc8bae6a263 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1069,6 +1069,163 @@ def earth_mover_distance(first_samples, return apply_array_ufunc(func, *args, dask=dask) +def cov(da_a, da_b, dim=None, ddof=1): + """Compute covariance between two DataArray objects along a shared dimension. + Parameters + ---------- + da_a: DataArray (or Variable) object + Array to compute. + da_b: DataArray (or Variable) object + Array to compute. + dim : str, optional + The dimension along which the covariance will be computed + Returns + ------- + covariance: DataArray + See also + -------- + pandas.Series.cov: corresponding pandas function + xr.corr: respective function to calculate correlation + Examples + -------- + >>> da_a = DataArray(np.random.random((3, 5)), + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + >>> da_a + + array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438], + [0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536], + [0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]]) + Coordinates: + * space (space) >> da_b = DataArray(np.random.random((3, 5)), + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... 
('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + >>> da_b + + array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ], + [0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ], + [0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]]) + Coordinates: + * space (space) >> xr.cov(da_a, da_b) + + array(0.03823054) + >>> xr.cov(da_a, da_b, dim='time') + + array([0.00207952, 0.01024296, 0.08214707]) + Coordinates: + * space (space) >> da_a = DataArray(np.random.random((3, 5)), + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + >>> da_a + + array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438], + [0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536], + [0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]]) + Coordinates: + * space (space) >> da_b = DataArray(np.random.random((3, 5)), + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + >>> da_b + + array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ], + [0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ], + [0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]]) + Coordinates: + * space (space) >> xr.corr(da_a, da_b) + + array(0.67407116) + >>> xr.corr(da_a, da_b, dim='time') + + array([0.23150267, 0.24900968, 0.9061562 ]) + Coordinates: + * space (space) Date: Sat, 23 May 2020 19:27:50 +0100 Subject: [PATCH 04/36] Added r-beer's tests to test_computation.py Still issues I think --- xarray/tests/test_computation.py | 65 ++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 4eed464d2dc..28fd8eb72aa 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,6 +817,71 @@ def test_vectorize_dask(): assert_identical(expected, actual) +@pytest.fixture() +def array_tuples(): + da = xr.DataArray(np.random.random((3, 21, 4)), + coords={"time": pd.date_range("2000-01-01", freq="1D", periods=21)}, + dims=("a", "time", "x"),) + + arrays = [ + da.isel(time=range(0, 18)), + da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(dim="time"), + xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"), + xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"), + xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]), + xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]), + ] + + array_tuples = [ + (arrays[0], arrays[0]), + (arrays[0], arrays[1]), + (arrays[1], arrays[1]), + (arrays[2], arrays[2]), + (arrays[2], arrays[3]), + (arrays[3], arrays[3]), + (arrays[4], arrays[4]), + (arrays[4], arrays[5]), + (arrays[5], arrays[5]), + ] + + return array_tuples + +@pytest.mark.parametrize("da_a, da_b", array_tuples) +@pytest.mark.parametrize("dim", [None, "time", "x"]) +def test_cov(da_a, da_b, dim): + def pandas_cov(ts1, ts2): + """Ensure the ts are aligned and missing values ignored""" + ts1, ts2 = xr.align(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() + + ts1 = ts1.where(valid_values, drop=True) + ts2 = ts2.where(valid_values, drop=True) + + return ts1.to_series().cov(ts2.to_series()) + + expected = pandas_cov(da_a, da_b) + actual = xr.cov(da_a, da_b, dim) + + assert_allclose(actual, expected) + + +@pytest.mark.parametrize("da_a, da_b", array_tuples) +@pytest.mark.parametrize("dim", [None, 
"time", "x"]) +def test_corr(da_a, da_b, dim): + def pandas_corr(ts1, ts2): + """Ensure the ts are aligned and missing values ignored""" + ts1, ts2 = xr.align(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() + + ts1 = ts1.where(valid_values, drop=True) + ts2 = ts2.where(valid_values, drop=True) + + return ts1.to_series().corr(ts2.to_series()) + + expected = pandas_corr(da_a, da_b) + actual = xr.corr(da_a, da_b, dim) + assert_allclose(actual, expected) + @requires_dask def test_vectorize_dask_new_output_dims(): # regression test for GH3574 From fdd5c5f6839f3d0edf32e0d06233066531d38201 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 19:31:25 +0100 Subject: [PATCH 05/36] trying to fix github.com/pydata/xarray/pull/3550#discussion_r349935731 --- xarray/tests/test_computation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 28fd8eb72aa..afde4ad9509 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -846,9 +846,10 @@ def array_tuples(): return array_tuples -@pytest.mark.parametrize("da_a, da_b", array_tuples) +# TODO: https://github.com/pydata/xarray/pull/3550#discussion_r349935731 +#@pytest.mark.parametrize("da_a, da_b", array_tuples) @pytest.mark.parametrize("dim", [None, "time", "x"]) -def test_cov(da_a, da_b, dim): +def test_cov(array_tuples, dim): def pandas_cov(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) From aeabf2cf941d38cb0d7b1b013407fbe2c9346a77 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 19:35:31 +0100 Subject: [PATCH 06/36] Removing drop=True from the `.where()` calls in `computation.py`+test.py --- xarray/core/computation.py | 27 +++++++++++++++------------ xarray/tests/test_computation.py | 8 ++++---- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index bc8bae6a263..f6a84fbc44d 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1070,7 +1070,8 @@ def earth_mover_distance(first_samples, def cov(da_a, da_b, dim=None, ddof=1): - """Compute covariance between two DataArray objects along a shared dimension. + """ + Compute covariance between two DataArray objects along a shared dimension. Parameters ---------- da_a: DataArray (or Variable) object @@ -1128,8 +1129,8 @@ def cov(da_a, da_b, dim=None, ddof=1): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() # TODO: avoid drop - da_a = da_a.where(valid_values, drop=True) - da_b = da_b.where(valid_values, drop=True) + da_a = da_a.where(valid_values) + da_b = da_b.where(valid_values) valid_count = valid_values.sum(dim) - ddof # if dim is not None: @@ -1148,7 +1149,9 @@ def cov(da_a, da_b, dim=None, ddof=1): def corr(da_a, da_b, dim=None, ddof=0): - """Compute the Pearson correlation coefficient between two DataArray objects along a shared dimension. + """ + Compute the Pearson correlation coefficient between + two DataArray objects along a shared dimension. Parameters ---------- da_a: DataArray (or Variable) object @@ -1169,7 +1172,7 @@ def corr(da_a, da_b, dim=None, ddof=0): >>> da_a = DataArray(np.random.random((3, 5)), ... dims=("space", "time"), ... coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + ... 
('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) >>> da_a array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438], @@ -1179,9 +1182,9 @@ def corr(da_a, da_b, dim=None, ddof=0): * space (space) >> da_b = DataArray(np.random.random((3, 5)), - ... dims=("space", "time"), - ... coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) >>> da_b array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ], @@ -1212,10 +1215,10 @@ def corr(da_a, da_b, dim=None, ddof=0): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() - da_a = da_a.where(valid_values, drop=True) - da_b = da_b.where( - valid_values, drop=True - ) # TODO: avoid drop as explained in https://github.com/pydata/xarray/pull/2652#discussion_r245492002 + # TODO: avoid drop https://github.com/pydata/xarray/pull/2652#discussion_r245492002 + # FIX: I think @shoyer convinced that you can just remove drop=True from all the + da_a = da_a.where(valid_values) + da_b = da_b.where(valid_values) # 3. Compute correlation based on standard deviations and cov() da_a_std = da_a.std(dim=dim) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index afde4ad9509..7ee40468e0f 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -855,8 +855,8 @@ def pandas_cov(ts1, ts2): ts1, ts2 = xr.align(ts1, ts2) valid_values = ts1.notnull() & ts2.notnull() - ts1 = ts1.where(valid_values, drop=True) - ts2 = ts2.where(valid_values, drop=True) + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) return ts1.to_series().cov(ts2.to_series()) @@ -874,8 +874,8 @@ def pandas_corr(ts1, ts2): ts1, ts2 = xr.align(ts1, ts2) valid_values = ts1.notnull() & ts2.notnull() - ts1 = ts1.where(valid_values, drop=True) - ts2 = ts2.where(valid_values, drop=True) + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) return ts1.to_series().corr(ts2.to_series()) From 1489e0f0672090383063e0d9ef2da792d830cb6e Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 19:50:26 +0100 Subject: [PATCH 07/36] api.rst and whats-new.rst --- doc/api.rst | 2 ++ doc/whats-new.rst | 2 ++ 2 files changed, 4 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 8ec6843d24a..c9f24e8c3f1 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -29,6 +29,8 @@ Top-level functions full_like zeros_like ones_like + cov + corr dot polyval map_blocks diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 447aaf5b0bf..878f9a8ba8f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -36,6 +36,8 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:``). + By `Andrew Williams `_ and `Robin Beer `_. - Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`) By `Pascal Bourgault `_. 
- Control over attributes of result in :py:func:`merge`, :py:func:`concat`, From c121a3dc9e80e17a78a4e2654932639559c71406 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 19:55:26 +0100 Subject: [PATCH 08/36] Updated `xarray/__init__.py` and added `broadcast` import to computation --- xarray/__init__.py | 2 +- xarray/core/computation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 0fead57e5fb..49c83284cc9 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -17,7 +17,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, polyval, where +from .core.computation import apply_ufunc, dot, polyval, where, cov, corr from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset diff --git a/xarray/core/computation.py b/xarray/core/computation.py index f6a84fbc44d..f65a17b87ac 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -24,7 +24,7 @@ import numpy as np from . import dtypes, duck_array_ops, utils -from .alignment import deep_align +from .alignment import broadcast, deep_align from .merge import merge_coordinates_without_align from .options import OPTIONS from .pycompat import dask_array_type From a40d95bc44f0c51301d92bff23d12b29045faf12 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 20:01:52 +0100 Subject: [PATCH 09/36] added DataArray import to corr, cov --- xarray/core/computation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index f65a17b87ac..678839b53de 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1122,7 +1122,7 @@ def cov(da_a, da_b, dim=None, ddof=1): Coordinates: * space (space) Date: Sat, 23 May 2020 20:02:56 +0100 Subject: [PATCH 10/36] assert_allclose added to test_computation.py --- xarray/tests/test_computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 7ee40468e0f..63e5ff4dbf8 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd import pytest -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_allclose import xarray as xr from xarray.core.computation import ( From 2ddcb553c4a48a13a0b737fdcc87662db64e3fdc Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 20:03:47 +0100 Subject: [PATCH 11/36] removed whitespace in test_dask...oops --- xarray/tests/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 37692fe501d..75beb3757ca 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1048,7 +1048,7 @@ def returns_numpy(darray): with raises_regex(TypeError, "Function must return an xarray DataArray"): xr.map_blocks(returns_numpy, map_da) - with raises_regex(TypeError, "args must be"): + with raises_regex(TypeError, "args must be"): xr.map_blocks(operator.add, map_da, args=10) with raises_regex(TypeError, "kwargs must be"): From 2fce1758b146c6be0907c973630b8f9a6fe9e384 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 20:13:31 
+0100 Subject: [PATCH 12/36] Added to init --- xarray/__init__.py | 2 ++ xarray/tests/test_computation.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 49c83284cc9..c1452777bdb 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -54,6 +54,8 @@ "concat", "decode_cf", "dot", + "cov", + "corr", "full_like", "load_dataarray", "load_dataset", diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 63e5ff4dbf8..c6c32917dbb 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -846,10 +846,13 @@ def array_tuples(): return array_tuples + # TODO: https://github.com/pydata/xarray/pull/3550#discussion_r349935731 -#@pytest.mark.parametrize("da_a, da_b", array_tuples) +# @pytest.mark.parametrize("da_a, da_b", array_tuples) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(array_tuples, dim): + da_a, da_b = array_tuples + def pandas_cov(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) @@ -866,9 +869,11 @@ def pandas_cov(ts1, ts2): assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", array_tuples) +#@pytest.mark.parametrize("da_a, da_b", array_tuples) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): + da_a, da_b = array_tuples + def pandas_corr(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) @@ -883,6 +888,7 @@ def pandas_corr(ts1, ts2): actual = xr.corr(da_a, da_b, dim) assert_allclose(actual, expected) + @requires_dask def test_vectorize_dask_new_output_dims(): # regression test for GH3574 From a0ef1c240fef00b78c524d77c3e20b7c3b9ff861 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 21:52:20 +0100 Subject: [PATCH 13/36] format changes --- xarray/core/computation.py | 1 + xarray/core/dataset.py | 4 ++-- xarray/tests/test_computation.py | 18 ++++++++---------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 678839b53de..fca5cbbf7f6 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1123,6 +1123,7 @@ def cov(da_a, da_b, dim=None, ddof=1): * space (space) Date: Sat, 23 May 2020 23:04:54 +0100 Subject: [PATCH 14/36] Fiddling around with cov/corr tests in `test_computation.py` --- xarray/core/computation.py | 5 ----- xarray/tests/test_computation.py | 15 +++++++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index fca5cbbf7f6..8c091c27fa6 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1134,11 +1134,6 @@ def cov(da_a, da_b, dim=None, ddof=1): da_b = da_b.where(valid_values) valid_count = valid_values.sum(dim) - ddof - # if dim is not None: - # valid_count = da_a[dim].size - ddof - # else: - # valid_count = da_a.size - # 3. 
Compute mean and standard deviation along the given dim demeaned_da_a = da_a - da_a.mean(dim=dim) demeaned_da_b = da_b - da_b.mean(dim=dim) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 3f69bfe30d4..cc78c2bbe61 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,7 +817,7 @@ def test_vectorize_dask(): assert_identical(expected, actual) -@pytest.fixture() +#@pytest.fixture() def array_tuples(): da = xr.DataArray( np.random.random((3, 21, 4)), @@ -850,9 +850,15 @@ def array_tuples(): # TODO: https://github.com/pydata/xarray/pull/3550#discussion_r349935731 -@pytest.mark.parametrize("da_a, da_b", array_tuples) +# @pytest.mark.parametrize("da_a, da_b", +# [array_tuples()[0], array_tuples()[1], array_tuples()[2], array_tuples()[3], +# array_tuples()[4], array_tuples()[5], array_tuples()[6], array_tuples()[7], +# array_tuples()[8]]) +@pytest.mark.parametrize("da_a, da_b", +[array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(da_a, da_b, dim): + def pandas_cov(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) @@ -865,13 +871,14 @@ def pandas_cov(ts1, ts2): expected = pandas_cov(da_a, da_b) actual = xr.cov(da_a, da_b, dim) - assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", array_tuples) +@pytest.mark.parametrize("da_a, da_b", +[array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): + def pandas_corr(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) From 523e4fdfeca9d0c452b0110e55c2c413511f6bc8 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 23:12:16 +0100 Subject: [PATCH 15/36] PEP8 changes --- xarray/tests/test_computation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index cc78c2bbe61..93bd08daccc 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,7 +817,7 @@ def test_vectorize_dask(): assert_identical(expected, actual) -#@pytest.fixture() +# @pytest.fixture() def array_tuples(): da = xr.DataArray( np.random.random((3, 21, 4)), @@ -854,8 +854,10 @@ def array_tuples(): # [array_tuples()[0], array_tuples()[1], array_tuples()[2], array_tuples()[3], # array_tuples()[4], array_tuples()[5], array_tuples()[6], array_tuples()[7], # array_tuples()[8]]) -@pytest.mark.parametrize("da_a, da_b", -[array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize( +"da_a, da_b", +[array_tuples()[0], array_tuples()[1]] +) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(da_a, da_b, dim): @@ -874,8 +876,10 @@ def pandas_cov(ts1, ts2): assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", -[array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize( +"da_a, da_b", +[array_tuples()[0], array_tuples()[1]] +) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): From c23cae699e0d06c081b44c8197300ed9cb1102fa Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sat, 23 May 2020 23:14:18 +0100 Subject: [PATCH 16/36] pep --- xarray/tests/test_computation.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 93bd08daccc..72ef2e7a6b6 100644 --- 
a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -854,10 +854,8 @@ def array_tuples(): # [array_tuples()[0], array_tuples()[1], array_tuples()[2], array_tuples()[3], # array_tuples()[4], array_tuples()[5], array_tuples()[6], array_tuples()[7], # array_tuples()[8]]) -@pytest.mark.parametrize( -"da_a, da_b", -[array_tuples()[0], array_tuples()[1]] -) +@pytest.mark.parametrize("da_a, da_b", + [array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(da_a, da_b, dim): @@ -876,10 +874,8 @@ def pandas_cov(ts1, ts2): assert_allclose(actual, expected) -@pytest.mark.parametrize( -"da_a, da_b", -[array_tuples()[0], array_tuples()[1]] -) +@pytest.mark.parametrize("da_a, da_b", + [array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): From 860babca07ae36e2fa9fda49cfdcd30192908ad3 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 11:14:16 +0100 Subject: [PATCH 17/36] remove old todo and comments --- xarray/core/computation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 8c091c27fa6..e072b11513d 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1129,7 +1129,6 @@ def cov(da_a, da_b, dim=None, ddof=1): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() - # TODO: avoid drop da_a = da_a.where(valid_values) da_b = da_b.where(valid_values) valid_count = valid_values.sum(dim) - ddof @@ -1211,8 +1210,6 @@ def corr(da_a, da_b, dim=None, ddof=0): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() - # TODO: avoid drop https://github.com/pydata/xarray/pull/2652#discussion_r245492002 - # FIXED?: I think @shoyer convinced that you can just remove drop=True da_a = da_a.where(valid_values) da_b = da_b.where(valid_values) From 33ded408d5684383d08fe7f94c7431870a09f261 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 11:17:00 +0100 Subject: [PATCH 18/36] isort --- xarray/__init__.py | 2 +- xarray/tests/test_computation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index c1452777bdb..e8274d13ffe 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -17,7 +17,7 @@ from .core.alignment import align, broadcast from .core.combine import auto_combine, combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, dot, polyval, where, cov, corr +from .core.computation import apply_ufunc, corr, cov, dot, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 72ef2e7a6b6..ef5ec4c2708 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd import pytest -from numpy.testing import assert_array_equal, assert_allclose +from numpy.testing import assert_allclose, assert_array_equal import xarray as xr from xarray.core.computation import ( From 2751b1090e4e4af44611277a16690eafb3d670e9 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 17:23:46 +0100 Subject: [PATCH 19/36] Added consistency check between corr() and cov(), ensure they give same --- xarray/tests/test_computation.py | 26 ++++++++++++++++++++------ 1 file changed, 
20 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index ef5ec4c2708..0854f5aafca 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -20,6 +20,7 @@ unified_dim_sizes, ) +from xarray.core.alignment import broadcast from . import has_dask, raises_regex, requires_dask @@ -854,11 +855,9 @@ def array_tuples(): # [array_tuples()[0], array_tuples()[1], array_tuples()[2], array_tuples()[3], # array_tuples()[4], array_tuples()[5], array_tuples()[6], array_tuples()[7], # array_tuples()[8]]) -@pytest.mark.parametrize("da_a, da_b", - [array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(da_a, da_b, dim): - def pandas_cov(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) @@ -874,11 +873,9 @@ def pandas_cov(ts1, ts2): assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", - [array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): - def pandas_corr(ts1, ts2): """Ensure the ts are aligned and missing values ignored""" ts1, ts2 = xr.align(ts1, ts2) @@ -894,6 +891,23 @@ def pandas_corr(ts1, ts2): assert_allclose(actual, expected) +@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize("dim", [None, "time", "x"]) +def test_covcorr_consistency(da_a, da_b, dim): + # Testing that xr.corr and xr.cov are consistent with each other + # 1. Broadcast the two arrays + da_a, da_b = broadcast(da_a, da_b) + + # 2. Ignore the nans + valid_values = da_a.notnull() & da_b.notnull() + da_a = da_a.where(valid_values) + da_b = da_b.where(valid_values) + + expected = xr.cov(da_a, da_b, dim=dim, ddof=0) / (da_a.std(dim=dim) * da_b.std(dim=dim)) + actual = xr.corr(da_a, da_b, dim=dim) + assert_allclose(actual, expected) + + @requires_dask def test_vectorize_dask_new_output_dims(): # regression test for GH3574 From 759c9f49aefbbe2e8d40968df085c422ef52f1a1 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 19:05:31 +0100 Subject: [PATCH 20/36] added `skipna=False` to `computation.py`. made consistency+autocov tests --- xarray/core/computation.py | 7 ++++-- xarray/tests/test_computation.py | 43 ++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index e072b11513d..bf9fc243a4c 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1130,7 +1130,7 @@ def cov(da_a, da_b, dim=None, ddof=1): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() da_a = da_a.where(valid_values) - da_b = da_b.where(valid_values) + da_b = da_a.where(valid_values) valid_count = valid_values.sum(dim) - ddof # 3. Compute mean and standard deviation along the given dim @@ -1138,7 +1138,10 @@ def cov(da_a, da_b, dim=None, ddof=1): demeaned_da_b = da_b - da_b.mean(dim=dim) # 4. Compute covariance along the given dim - cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim) / (valid_count) + # N.B. `skipna=False` is required or there is a bug when computing + # auto-covariance. E.g. 
Try xr.cov(da,da) for + # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]) + cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count) return DataArray(cov) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 0854f5aafca..2efdacf1ae4 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -818,8 +818,7 @@ def test_vectorize_dask(): assert_identical(expected, actual) -# @pytest.fixture() -def array_tuples(): +def arrays_w_tuples(): da = xr.DataArray( np.random.random((3, 21, 4)), coords={"time": pd.date_range("2000-01-01", freq="1D", periods=21)}, @@ -828,7 +827,7 @@ def array_tuples(): arrays = [ da.isel(time=range(0, 18)), - da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(dim="time"), + da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(), xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"), xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"), xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]), @@ -847,19 +846,16 @@ def array_tuples(): (arrays[5], arrays[5]), ] - return array_tuples + return arrays, array_tuples -# TODO: https://github.com/pydata/xarray/pull/3550#discussion_r349935731 -# @pytest.mark.parametrize("da_a, da_b", -# [array_tuples()[0], array_tuples()[1], array_tuples()[2], array_tuples()[3], -# array_tuples()[4], array_tuples()[5], array_tuples()[6], array_tuples()[7], -# array_tuples()[8]]) -@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) +# TODO: Loop over `a` and `x` to test specific values +""" +@pytest.mark.parametrize("da_a, da_b", [array_w_tuples()[1][0], array_w_tuples()[1][1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_cov(da_a, da_b, dim): def pandas_cov(ts1, ts2): - """Ensure the ts are aligned and missing values ignored""" + #Ensure the ts are aligned and missing values ignored ts1, ts2 = xr.align(ts1, ts2) valid_values = ts1.notnull() & ts2.notnull() @@ -873,11 +869,11 @@ def pandas_cov(ts1, ts2): assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize("da_a, da_b", [array_w_tuples()[1][0], array_w_tuples()[1][1]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_corr(da_a, da_b, dim): def pandas_corr(ts1, ts2): - """Ensure the ts are aligned and missing values ignored""" + #Ensure the ts are aligned and missing values ignored ts1, ts2 = xr.align(ts1, ts2) valid_values = ts1.notnull() & ts2.notnull() @@ -889,15 +885,16 @@ def pandas_corr(ts1, ts2): expected = pandas_corr(da_a, da_b) actual = xr.corr(da_a, da_b, dim) assert_allclose(actual, expected) +""" - -@pytest.mark.parametrize("da_a, da_b", [array_tuples()[0], array_tuples()[1]]) +@pytest.mark.parametrize("da_a, da_b", +[arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], + arrays_w_tuples()[1][2], arrays_w_tuples()[1][7], arrays_w_tuples()[1][8]]) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_covcorr_consistency(da_a, da_b, dim): # Testing that xr.corr and xr.cov are consistent with each other # 1. Broadcast the two arrays da_a, da_b = broadcast(da_a, da_b) - # 2. 
Ignore the nans valid_values = da_a.notnull() & da_b.notnull() da_a = da_a.where(valid_values) @@ -907,6 +904,20 @@ def test_covcorr_consistency(da_a, da_b, dim): actual = xr.corr(da_a, da_b, dim=dim) assert_allclose(actual, expected) +@pytest.mark.parametrize("da_a", +[arrays_w_tuples()[0][0], arrays_w_tuples()[0][1], + arrays_w_tuples()[0][4], arrays_w_tuples()[0][5]]) +@pytest.mark.parametrize("dim", [None, "time", "x"]) +def test_autocov(da_a, dim): + # Testing that the autocovariance*(N-1) is ~=~ to the variance matrix + # 1. Ignore the nans + valid_values = da_a.notnull() + da_a = da_a.where(valid_values) + expected = ((da_a - da_a.mean(dim=dim))**2).sum(dim=dim, skipna=False) + actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1) + print(da_a, actual, expected) + assert_allclose(actual, expected) + @requires_dask def test_vectorize_dask_new_output_dims(): From 1accabdc8bd4e977704d09a941615f9b87ed03ee Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 19:10:55 +0100 Subject: [PATCH 21/36] formatting --- xarray/core/computation.py | 2 +- xarray/tests/test_computation.py | 35 ++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index bf9fc243a4c..a0ba981829b 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1138,7 +1138,7 @@ def cov(da_a, da_b, dim=None, ddof=1): demeaned_da_b = da_b - da_b.mean(dim=dim) # 4. Compute covariance along the given dim - # N.B. `skipna=False` is required or there is a bug when computing + # N.B. `skipna=False` is required or there is a bug when computing # auto-covariance. E.g. Try xr.cov(da,da) for # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]) cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 2efdacf1ae4..ea577c9260b 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -8,6 +8,7 @@ from numpy.testing import assert_allclose, assert_array_equal import xarray as xr +from xarray.core.alignment import broadcast from xarray.core.computation import ( _UFuncSignature, apply_ufunc, @@ -20,7 +21,6 @@ unified_dim_sizes, ) -from xarray.core.alignment import broadcast from . 
import has_dask, raises_regex, requires_dask @@ -887,9 +887,17 @@ def pandas_corr(ts1, ts2): assert_allclose(actual, expected) """ -@pytest.mark.parametrize("da_a, da_b", -[arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], - arrays_w_tuples()[1][2], arrays_w_tuples()[1][7], arrays_w_tuples()[1][8]]) + +@pytest.mark.parametrize( + "da_a, da_b", + [ + arrays_w_tuples()[1][0], + arrays_w_tuples()[1][1], + arrays_w_tuples()[1][2], + arrays_w_tuples()[1][7], + arrays_w_tuples()[1][8], + ], +) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_covcorr_consistency(da_a, da_b, dim): # Testing that xr.corr and xr.cov are consistent with each other @@ -900,20 +908,29 @@ def test_covcorr_consistency(da_a, da_b, dim): da_a = da_a.where(valid_values) da_b = da_b.where(valid_values) - expected = xr.cov(da_a, da_b, dim=dim, ddof=0) / (da_a.std(dim=dim) * da_b.std(dim=dim)) + expected = xr.cov(da_a, da_b, dim=dim, ddof=0) / ( + da_a.std(dim=dim) * da_b.std(dim=dim) + ) actual = xr.corr(da_a, da_b, dim=dim) assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a", -[arrays_w_tuples()[0][0], arrays_w_tuples()[0][1], - arrays_w_tuples()[0][4], arrays_w_tuples()[0][5]]) + +@pytest.mark.parametrize( + "da_a", + [ + arrays_w_tuples()[0][0], + arrays_w_tuples()[0][1], + arrays_w_tuples()[0][4], + arrays_w_tuples()[0][5], + ], +) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_autocov(da_a, dim): # Testing that the autocovariance*(N-1) is ~=~ to the variance matrix # 1. Ignore the nans valid_values = da_a.notnull() da_a = da_a.where(valid_values) - expected = ((da_a - da_a.mean(dim=dim))**2).sum(dim=dim, skipna=False) + expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=False) actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1) print(da_a, actual, expected) assert_allclose(actual, expected) From 43a6ad7e63ecaacffd17ef5c7d5e26dbe5775227 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 20:38:52 +0100 Subject: [PATCH 22/36] Added numpy-based tests. 
--- xarray/tests/test_computation.py | 106 +++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 28 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index ea577c9260b..542ec827e54 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -849,43 +849,94 @@ def arrays_w_tuples(): return arrays, array_tuples -# TODO: Loop over `a` and `x` to test specific values -""" -@pytest.mark.parametrize("da_a, da_b", [array_w_tuples()[1][0], array_w_tuples()[1][1]]) -@pytest.mark.parametrize("dim", [None, "time", "x"]) +@pytest.mark.parametrize( + "da_a, da_b", + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2],], +) +@pytest.mark.parametrize("dim", [None, "time"]) def test_cov(da_a, da_b, dim): - def pandas_cov(ts1, ts2): - #Ensure the ts are aligned and missing values ignored - ts1, ts2 = xr.align(ts1, ts2) - valid_values = ts1.notnull() & ts2.notnull() + if dim is not None: - ts1 = ts1.where(valid_values) - ts2 = ts2.where(valid_values) + def np_cov_ind(ts1, ts2, a, x): + # Ensure the ts are aligned and missing values ignored + ts1, ts2 = broadcast(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() - return ts1.to_series().cov(ts2.to_series()) + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) - expected = pandas_cov(da_a, da_b) - actual = xr.cov(da_a, da_b, dim) - assert_allclose(actual, expected) + return np.cov( + ts1.sel(a=a, x=x).data.flatten(), + ts2.sel(a=a, x=x).data.flatten(), + ddof=1, + )[0, 1] + expected = np.zeros((3, 4)) + for a in [0, 1, 2]: + for x in [0, 1, 2, 3]: + expected[a, x] = np_cov_ind(da_a, da_b, a=a, x=x) + actual = xr.cov(da_a, da_b, dim) + assert_allclose(actual, expected) -@pytest.mark.parametrize("da_a, da_b", [array_w_tuples()[1][0], array_w_tuples()[1][1]]) -@pytest.mark.parametrize("dim", [None, "time", "x"]) + else: + + def np_cov(ts1, ts2): + # Ensure the ts are aligned and missing values ignored + ts1, ts2 = broadcast(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() + + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) + + return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=1)[0, 1] + + expected = np_cov(da_a, da_b) + actual = xr.cov(da_a, da_b, dim) + assert_allclose(actual, expected) + + +@pytest.mark.parametrize( + "da_a, da_b", + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2],], +) +@pytest.mark.parametrize("dim", [None, "time"]) def test_corr(da_a, da_b, dim): - def pandas_corr(ts1, ts2): - #Ensure the ts are aligned and missing values ignored - ts1, ts2 = xr.align(ts1, ts2) - valid_values = ts1.notnull() & ts2.notnull() + if dim is not None: - ts1 = ts1.where(valid_values) - ts2 = ts2.where(valid_values) + def np_corr_ind(ts1, ts2, a, x): + # Ensure the ts are aligned and missing values ignored + ts1, ts2 = broadcast(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() - return ts1.to_series().corr(ts2.to_series()) + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) - expected = pandas_corr(da_a, da_b) - actual = xr.corr(da_a, da_b, dim) - assert_allclose(actual, expected) -""" + return np.corrcoef( + ts1.sel(a=a, x=x).data.flatten(), ts2.sel(a=a, x=x).data.flatten() + )[0, 1] + + expected = np.zeros((3, 4)) + for a in [0, 1, 2]: + for x in [0, 1, 2, 3]: + expected[a, x] = np_corr_ind(da_a, da_b, a=a, x=x) + actual = xr.corr(da_a, da_b, dim) + assert_allclose(actual, expected) + + else: + + def np_corr(ts1, ts2): + # Ensure the ts are aligned and missing 
values ignored + ts1, ts2 = broadcast(ts1, ts2) + valid_values = ts1.notnull() & ts2.notnull() + + ts1 = ts1.where(valid_values) + ts2 = ts2.where(valid_values) + + return np.corrcoef(ts1.data.flatten(), ts2.data.flatten())[0, 1] + + expected = np_corr(da_a, da_b) + actual = xr.corr(da_a, da_b, dim) + assert_allclose(actual, expected) @pytest.mark.parametrize( @@ -932,7 +983,6 @@ def test_autocov(da_a, dim): da_a = da_a.where(valid_values) expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=False) actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1) - print(da_a, actual, expected) assert_allclose(actual, expected) From 29bbcfbac49706cb31e593a2fe67e5491b9ba578 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 20:58:37 +0100 Subject: [PATCH 23/36] format --- xarray/tests/test_computation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 542ec827e54..6f51caae767 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -851,7 +851,7 @@ def arrays_w_tuples(): @pytest.mark.parametrize( "da_a, da_b", - [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2],], + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2], ], ) @pytest.mark.parametrize("dim", [None, "time"]) def test_cov(da_a, da_b, dim): @@ -897,7 +897,7 @@ def np_cov(ts1, ts2): @pytest.mark.parametrize( "da_a, da_b", - [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2],], + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2], ], ) @pytest.mark.parametrize("dim", [None, "time"]) def test_corr(da_a, da_b, dim): From a5ce9b3d0f94d0b8bec6343039ae8e99df8bf061 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 24 May 2020 21:01:46 +0100 Subject: [PATCH 24/36] formatting again --- xarray/tests/test_computation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 6f51caae767..7e6a7b97bdb 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -851,7 +851,7 @@ def arrays_w_tuples(): @pytest.mark.parametrize( "da_a, da_b", - [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2], ], + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]], ) @pytest.mark.parametrize("dim", [None, "time"]) def test_cov(da_a, da_b, dim): @@ -897,7 +897,7 @@ def np_cov(ts1, ts2): @pytest.mark.parametrize( "da_a, da_b", - [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2], ], + [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]], ) @pytest.mark.parametrize("dim", [None, "time"]) def test_corr(da_a, da_b, dim): From 5557da99b9674d6b183f69fd5e4e109a742cca01 Mon Sep 17 00:00:00 2001 From: Andrew Williams <56925856+AndrewWilliams3142@users.noreply.github.com> Date: Mon, 25 May 2020 05:02:00 +0100 Subject: [PATCH 25/36] Update doc/whats-new.rst Co-authored-by: keewis --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 878f9a8ba8f..e9f2920b0f7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -36,7 +36,7 @@ Breaking changes New Features ~~~~~~~~~~~~ -- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:``). 
+- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:`3550`, :pull:`4089`). By `Andrew Williams `_ and `Robin Beer `_. - Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`) By `Pascal Bourgault `_. From d395c273f5abd914427b3276a3226cadaf98172e Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 09:40:26 +0100 Subject: [PATCH 26/36] refactored corr/cov so there is one internal method for calculating both --- xarray/core/computation.py | 58 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index a0ba981829b..452ca65d450 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1080,6 +1080,8 @@ def cov(da_a, da_b, dim=None, ddof=1): Array to compute. dim : str, optional The dimension along which the covariance will be computed + ddof: int + If ddof=1, covariance is normalized by N-1, giving an unbiased estimate. Returns ------- covariance: DataArray @@ -1123,27 +1125,13 @@ def cov(da_a, da_b, dim=None, ddof=1): * space (space) Date: Mon, 25 May 2020 09:41:34 +0100 Subject: [PATCH 27/36] formatting --- xarray/core/computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 452ca65d450..1af6ccc0000 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1125,6 +1125,7 @@ def cov(da_a, da_b, dim=None, ddof=1): * space (space) Date: Mon, 25 May 2020 09:59:26 +0100 Subject: [PATCH 28/36] updating docstrings and code suggestions from PR --- xarray/core/computation.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 1af6ccc0000..54f5c8bcee7 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -24,7 +24,7 @@ import numpy as np from . import dtypes, duck_array_ops, utils -from .alignment import broadcast, deep_align +from .alignment import align, deep_align from .merge import merge_coordinates_without_align from .options import OPTIONS from .pycompat import dask_array_type @@ -1072,6 +1072,7 @@ def earth_mover_distance(first_samples, def cov(da_a, da_b, dim=None, ddof=1): """ Compute covariance between two DataArray objects along a shared dimension. + Parameters ---------- da_a: DataArray (or Variable) object @@ -1082,6 +1083,7 @@ def cov(da_a, da_b, dim=None, ddof=1): The dimension along which the covariance will be computed ddof: int If ddof=1, covariance is normalized by N-1, giving an unbiased estimate. + Returns ------- covariance: DataArray @@ -1089,6 +1091,7 @@ def cov(da_a, da_b, dim=None, ddof=1): -------- pandas.Series.cov: corresponding pandas function xr.corr: respective function to calculate correlation + Examples -------- >>> da_a = DataArray(np.random.random((3, 5)), @@ -1126,19 +1129,20 @@ def cov(da_a, da_b, dim=None, ddof=1): """ from .dataarray import DataArray - if any(not isinstance(arr, (Variable, DataArray)) for arr in [da_a, da_b]): + if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]): raise TypeError( - "Only xr.DataArray and xr.Variable are supported." + "Only xr.DataArray is supported." 
"Given {}.".format([type(arr) for arr in [da_a, da_b]]) ) return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov") -def corr(da_a, da_b, dim=None, ddof=0): +def corr(da_a, da_b, dim=None): """ Compute the Pearson correlation coefficient between two DataArray objects along a shared dimension. + Parameters ---------- da_a: DataArray (or Variable) object @@ -1147,6 +1151,7 @@ def corr(da_a, da_b, dim=None, ddof=0): Array to compute. dim: str, optional The dimension along which the correlation will be computed + Returns ------- correlation: DataArray @@ -1154,6 +1159,7 @@ def corr(da_a, da_b, dim=None, ddof=0): -------- pandas.Series.corr: corresponding pandas function xr.cov: underlying covariance function + Examples -------- >>> da_a = DataArray(np.random.random((3, 5)), @@ -1191,9 +1197,9 @@ def corr(da_a, da_b, dim=None, ddof=0): """ from .dataarray import DataArray - if any(not isinstance(arr, (Variable, DataArray)) for arr in [da_a, da_b]): + if any(not isinstance(arr, DataArray) for arr in [da_a, da_b]): raise TypeError( - "Only xr.DataArray and xr.Variable are supported." + "Only xr.DataArray is supported." "Given {}.".format([type(arr) for arr in [da_a, da_b]]) ) @@ -1208,7 +1214,7 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): from .dataarray import DataArray # 1. Broadcast the two arrays - da_a, da_b = broadcast(da_a, da_b) + da_a, da_b = align(da_a, da_b, join="inner", copy=False) # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() @@ -1229,12 +1235,12 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count) if method == "cov": - return DataArray(cov) + return cov else: # compute corr corr = cov / (da_a_std * da_b_std) - return DataArray(corr) + return corr def dot(*arrays, dims=None, **kwargs): From 0e4b68255439f4ef193f3d00bc65826ec459f0a1 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 10:15:19 +0100 Subject: [PATCH 29/36] paramterize ddof in tests --- xarray/core/computation.py | 15 ++++++------- xarray/tests/test_computation.py | 37 +++++++++++--------------------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 54f5c8bcee7..58598a1ca93 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1075,15 +1075,15 @@ def cov(da_a, da_b, dim=None, ddof=1): Parameters ---------- - da_a: DataArray (or Variable) object + da_a: DataArray object Array to compute. - da_b: DataArray (or Variable) object + da_b: DataArray object Array to compute. dim : str, optional The dimension along which the covariance will be computed ddof: int If ddof=1, covariance is normalized by N-1, giving an unbiased estimate. - + Returns ------- covariance: DataArray @@ -1145,9 +1145,9 @@ def corr(da_a, da_b, dim=None): Parameters ---------- - da_a: DataArray (or Variable) object + da_a: DataArray object Array to compute. - da_b: DataArray (or Variable) object + da_b: DataArray object Array to compute. dim: str, optional The dimension along which the correlation will be computed @@ -1203,15 +1203,14 @@ def corr(da_a, da_b, dim=None): "Given {}.".format([type(arr) for arr in [da_a, da_b]]) ) - return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="corr") + return _cov_corr(da_a, da_b, dim=dim, method="corr") def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): """ Internal method for xr.cov() and xr.corr() so only have to - sanitize the input arrays once. 
+ sanitize the input arrays once and we don't repeat code. """ - from .dataarray import DataArray # 1. Broadcast the two arrays da_a, da_b = align(da_a, da_b, join="inner", copy=False) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 7e6a7b97bdb..e4365d65754 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -828,8 +828,8 @@ def arrays_w_tuples(): arrays = [ da.isel(time=range(0, 18)), da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(), - xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"), - xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"), + #xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"), + #xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"), xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]), xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]), ] @@ -841,20 +841,20 @@ def arrays_w_tuples(): (arrays[2], arrays[2]), (arrays[2], arrays[3]), (arrays[3], arrays[3]), - (arrays[4], arrays[4]), - (arrays[4], arrays[5]), - (arrays[5], arrays[5]), + #(arrays[4], arrays[4]), + #(arrays[4], arrays[5]), + #(arrays[5], arrays[5]), ] return arrays, array_tuples - +@pytest.mark.parametrize("ddof", [0, 1]) @pytest.mark.parametrize( "da_a, da_b", [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]], ) @pytest.mark.parametrize("dim", [None, "time"]) -def test_cov(da_a, da_b, dim): +def test_cov(da_a, da_b, dim, ddof): if dim is not None: def np_cov_ind(ts1, ts2, a, x): @@ -868,14 +868,14 @@ def np_cov_ind(ts1, ts2, a, x): return np.cov( ts1.sel(a=a, x=x).data.flatten(), ts2.sel(a=a, x=x).data.flatten(), - ddof=1, + ddof=ddof, )[0, 1] expected = np.zeros((3, 4)) for a in [0, 1, 2]: for x in [0, 1, 2, 3]: expected[a, x] = np_cov_ind(da_a, da_b, a=a, x=x) - actual = xr.cov(da_a, da_b, dim) + actual = xr.cov(da_a, da_b, dim=dim, ddof=ddof) assert_allclose(actual, expected) else: @@ -888,10 +888,10 @@ def np_cov(ts1, ts2): ts1 = ts1.where(valid_values) ts2 = ts2.where(valid_values) - return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=1)[0, 1] + return np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=ddof)[0, 1] expected = np_cov(da_a, da_b) - actual = xr.cov(da_a, da_b, dim) + actual = xr.cov(da_a, da_b, dim=dim, ddof=ddof) assert_allclose(actual, expected) @@ -941,13 +941,7 @@ def np_corr(ts1, ts2): @pytest.mark.parametrize( "da_a, da_b", - [ - arrays_w_tuples()[1][0], - arrays_w_tuples()[1][1], - arrays_w_tuples()[1][2], - arrays_w_tuples()[1][7], - arrays_w_tuples()[1][8], - ], + arrays_w_tuples()[1], ) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_covcorr_consistency(da_a, da_b, dim): @@ -968,12 +962,7 @@ def test_covcorr_consistency(da_a, da_b, dim): @pytest.mark.parametrize( "da_a", - [ - arrays_w_tuples()[0][0], - arrays_w_tuples()[0][1], - arrays_w_tuples()[0][4], - arrays_w_tuples()[0][5], - ], + arrays_w_tuples()[0], ) @pytest.mark.parametrize("dim", [None, "time", "x"]) def test_autocov(da_a, dim): From b23eea8b203740e2125bfab67c1af00c370c5042 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 10:15:53 +0100 Subject: [PATCH 30/36] removed extraneous test arrays --- xarray/tests/test_computation.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index e4365d65754..4878d256bf2 100644 --- a/xarray/tests/test_computation.py +++ 
b/xarray/tests/test_computation.py @@ -828,8 +828,6 @@ def arrays_w_tuples(): arrays = [ da.isel(time=range(0, 18)), da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(), - #xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"), - #xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"), xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]), xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]), ] @@ -841,9 +839,6 @@ def arrays_w_tuples(): (arrays[2], arrays[2]), (arrays[2], arrays[3]), (arrays[3], arrays[3]), - #(arrays[4], arrays[4]), - #(arrays[4], arrays[5]), - #(arrays[5], arrays[5]), ] return arrays, array_tuples From 44c77f024664f20c7dda84d23b6201881532c064 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 10:27:52 +0100 Subject: [PATCH 31/36] formatting + adding deterministic docstring --- xarray/core/computation.py | 80 ++++++++++++++++---------------- xarray/tests/test_computation.py | 7 ++- 2 files changed, 43 insertions(+), 44 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 58598a1ca93..78d2d3e65ef 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1094,38 +1094,38 @@ def cov(da_a, da_b, dim=None, ddof=1): Examples -------- - >>> da_a = DataArray(np.random.random((3, 5)), + >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]), ... dims=("space", "time"), ... coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))]) >>> da_a - - array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438], - [0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536], - [0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]]) + + array([[1. , 2. , 3. ], + [0.1, 0.2, 0.3], + [3.2, 0.6, 1.8]]) Coordinates: - * space (space) >> da_b = DataArray(np.random.random((3, 5)), + * space (space) >> da_a = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]), ... dims=("space", "time"), ... coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))]) >>> da_b - - array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ], - [0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ], - [0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]]) + + array([[ 0.2, 0.4, 0.6], + [15. , 10. , 5. ], + [ 3.2, 0.6, 1.8]]) Coordinates: - * space (space) >> xr.cov(da_a, da_b) - array(0.03823054) + array(-3.53055556) >>> xr.cov(da_a, da_b, dim='time') - array([0.00207952, 0.01024296, 0.08214707]) + array([ 0.2, -0.5, 1.69333333]) Coordinates: - * space (space) >> da_a = DataArray(np.random.random((3, 5)), + >>> da_a = DataArray(np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]), ... dims=("space", "time"), ... coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))]) >>> da_a - - array([[0.04356841, 0.11479286, 0.70359101, 0.59072561, 0.16601438], - [0.81552383, 0.72304926, 0.77644406, 0.05788198, 0.74065536], - [0.96252519, 0.36877741, 0.22248412, 0.55185954, 0.23547536]]) + + array([[1. , 2. , 3. ], + [0.1, 0.2, 0.3], + [3.2, 0.6, 1.8]]) Coordinates: - * space (space) >> da_b = DataArray(np.random.random((3, 5)), - ... dims=("space", "time"), - ... 
coords=[('space', ['IA', 'IL', 'IN']), - ... ('time', pd.date_range("2000-01-01", freq="1D", periods=5))]) + * space (space) >> da_a = DataArray(np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]), + ... dims=("space", "time"), + ... coords=[('space', ['IA', 'IL', 'IN']), + ... ('time', pd.date_range("2000-01-01", freq="1D", periods=3))]) >>> da_b - - array([[0.41505599, 0.43002193, 0.45250454, 0.57701084, 0.5327754 ], - [0.0998048 , 0.67225522, 0.4234324 , 0.13514615, 0.4399088 ], - [0.24675048, 0.58555283, 0.1942955 , 0.86128908, 0.05068975]]) + + array([[ 0.2, 0.4, 0.6], + [15. , 10. , 5. ], + [ 3.2, 0.6, 1.8]]) Coordinates: - * space (space) >> xr.corr(da_a, da_b) - array(0.67407116) + array(-0.57087777) >>> xr.corr(da_a, da_b, dim='time') - array([0.23150267, 0.24900968, 0.9061562 ]) + array([ 1., -1., 1.]) Coordinates: - * space (space) Date: Mon, 25 May 2020 10:30:38 +0100 Subject: [PATCH 32/36] added test for TypeError --- xarray/tests/test_computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 83d867eb9e8..d15d1a2d333 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,6 +817,8 @@ def test_vectorize_dask(): ) assert_identical(expected, actual) +with raises_regex(TypeError, "Only xr.DataArray is supported"): + xr.corr(xr.Dataset(), xr.Dataset()) def arrays_w_tuples(): da = xr.DataArray( From c2ba27b40b94a6f3616f33939d88103ed2536368 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 10:31:36 +0100 Subject: [PATCH 33/36] formatting --- xarray/tests/test_computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index d15d1a2d333..88f500e9b1e 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -817,9 +817,11 @@ def test_vectorize_dask(): ) assert_identical(expected, actual) + with raises_regex(TypeError, "Only xr.DataArray is supported"): xr.corr(xr.Dataset(), xr.Dataset()) + def arrays_w_tuples(): da = xr.DataArray( np.random.random((3, 21, 4)), From bc58708534c5f9675db4ea82ef8b948d91eee270 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 11:55:09 +0100 Subject: [PATCH 34/36] tidying up docstring --- xarray/core/computation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 78d2d3e65ef..7bfe0ac26c4 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1081,8 +1081,9 @@ def cov(da_a, da_b, dim=None, ddof=1): Array to compute. dim : str, optional The dimension along which the covariance will be computed - ddof: int - If ddof=1, covariance is normalized by N-1, giving an unbiased estimate. + ddof: int, optional + If ddof=1, covariance is normalized by N-1, giving an unbiased estimate, + else normalization is by N. Returns ------- @@ -1211,7 +1212,6 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): Internal method for xr.cov() and xr.corr() so only have to sanitize the input arrays once and we don't repeat code. """ - # 1. 
Broadcast the two arrays da_a, da_b = align(da_a, da_b, join="inner", copy=False) From 6bfb3cff0bcd8df924c7412969af7c294c7f8ce5 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 14:53:57 +0100 Subject: [PATCH 35/36] formatting and tidying up `_cov_corr()` so that the logic is more clear --- xarray/core/computation.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 7bfe0ac26c4..ce5b8d27739 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1088,6 +1088,7 @@ def cov(da_a, da_b, dim=None, ddof=1): Returns ------- covariance: DataArray + See also -------- pandas.Series.cov: corresponding pandas function @@ -1156,6 +1157,7 @@ def corr(da_a, da_b, dim=None): Returns ------- correlation: DataArray + See also -------- pandas.Series.corr: corresponding pandas function @@ -1217,15 +1219,16 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() - da_a = da_a.where(valid_values) - da_b = da_b.where(valid_values) + + if not valid_values.all(): + da_a = da_a.where(valid_values) + da_b = da_b.where(valid_values) + valid_count = valid_values.sum(dim) - ddof - # 3. Compute mean and standard deviation along the given dim + # 3. Detrend along the given dim demeaned_da_a = da_a - da_a.mean(dim=dim) demeaned_da_b = da_b - da_b.mean(dim=dim) - da_a_std = da_a.std(dim=dim) - da_b_std = da_b.std(dim=dim) # 4. Compute covariance along the given dim # N.B. `skipna=False` is required or there is a bug when computing @@ -1237,7 +1240,9 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): return cov else: - # compute corr + # compute std + corr + da_a_std = da_a.std(dim=dim) + da_b_std = da_b.std(dim=dim) corr = cov / (da_a_std * da_b_std) return corr From 672c87f207147948853e5d8071e1557d203c6e5e Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Mon, 25 May 2020 14:55:11 +0100 Subject: [PATCH 36/36] flake8 ... --- xarray/core/computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index ce5b8d27739..6ac4f74c3a6 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1088,7 +1088,7 @@ def cov(da_a, da_b, dim=None, ddof=1): Returns ------- covariance: DataArray - + See also -------- pandas.Series.cov: corresponding pandas function
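
A note for readers following the series: the snippet below is a minimal, self-contained sketch of the covariance/correlation logic as it stands after PATCH 35/36 (align the inputs, mask positions where either array is NaN, demean along the requested dimension, normalize the summed product of anomalies by N - ddof, and, for the correlation, divide by the product of the standard deviations). It is written against the public numpy/pandas/xarray APIs only and is not the patched implementation itself: the helper name `cov_corr_sketch` and its `method` argument are invented for illustration, and it uses skipna=True where the patch passes skipna=False for the reasons noted in its own comments. The expected values in the comments are the deterministic ones introduced by the PATCH 31/36 docstrings.

import numpy as np
import pandas as pd
import xarray as xr


def cov_corr_sketch(da_a, da_b, dim=None, ddof=0, method="cov"):
    # 1. Align the two arrays on their shared coordinates (inner join)
    da_a, da_b = xr.align(da_a, da_b, join="inner", copy=False)

    # 2. Mask positions where either array is NaN and count the remaining
    #    valid samples along `dim`, adjusted by ddof
    valid = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(valid)
    da_b = da_b.where(valid)
    valid_count = valid.sum(dim) - ddof

    # 3. Demean along the requested dimension
    dem_a = da_a - da_a.mean(dim=dim)
    dem_b = da_b - da_b.mean(dim=dim)

    # 4. Covariance: summed product of anomalies normalized by N - ddof.
    #    Worked check for the 'IA' row below with ddof=1:
    #    ((1-2)*(0.2-0.4) + (2-2)*(0.4-0.4) + (3-2)*(0.6-0.4)) / (3-1) = 0.2
    cov = (dem_a * dem_b).sum(dim=dim, skipna=True) / valid_count
    if method == "cov":
        return cov

    # 5. Correlation: covariance divided by the product of standard deviations
    #    (DataArray.std defaults to ddof=0, so with the default ddof=0 above
    #    the normalizations cancel and this is the Pearson correlation)
    return cov / (da_a.std(dim=dim) * da_b.std(dim=dim))


da_a = xr.DataArray(
    np.array([[1, 2, 3], [0.1, 0.2, 0.3], [3.2, 0.6, 1.8]]),
    dims=("space", "time"),
    coords=[
        ("space", ["IA", "IL", "IN"]),
        ("time", pd.date_range("2000-01-01", freq="1D", periods=3)),
    ],
)
da_b = xr.DataArray(
    np.array([[0.2, 0.4, 0.6], [15, 10, 5], [3.2, 0.6, 1.8]]),
    dims=("space", "time"),
    coords=[
        ("space", ["IA", "IL", "IN"]),
        ("time", pd.date_range("2000-01-01", freq="1D", periods=3)),
    ],
)

# Unbiased covariance per space point, as in the PATCH 31/36 cov() docstring:
print(cov_corr_sketch(da_a, da_b, dim="time", ddof=1).values)
# -> [0.2, -0.5, 1.69333333]

# Pearson correlation per space point; each row is exactly linear, so +/-1,
# matching the PATCH 31/36 corr() docstring:
print(cov_corr_sketch(da_a, da_b, dim="time", method="corr").values)
# -> [1., -1., 1.]

Reproducing the docstring numbers this way also illustrates why PATCH 31/36 swaps np.random.random for fixed arrays: with deterministic inputs the documented outputs can be checked by hand (and by doctest).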