Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xr.cov() and xr.corr() #4089

Merged
merged 36 commits into from
May 25, 2020
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
37a0bd9
Added chunks='auto' option in dataset.py
AndrewILWilliams May 15, 2020
45edda1
reverted accidental changes in dataset.chunk()
AndrewILWilliams May 15, 2020
500e0b2
Added corr and cov to computation.py. Taken from r-beer:xarray/corr
AndrewILWilliams May 23, 2020
29a1373
Added r-beer's tests to test_computation.py
AndrewILWilliams May 23, 2020
fdd5c5f
trying to fix github.com/pydata/xarray/pull/3550#discussion_r349935731
AndrewILWilliams May 23, 2020
aeabf2c
Removing drop=True from the `.where()` calls in `computation.py`+test.py
AndrewILWilliams May 23, 2020
1489e0f
api.rst and whats-new.rst
AndrewILWilliams May 23, 2020
c121a3d
Updated `xarray/__init__.py` and added `broadcast` import to computation
AndrewILWilliams May 23, 2020
a40d95b
added DataArray import to corr, cov
AndrewILWilliams May 23, 2020
cd19e32
assert_allclose added to test_computation.py
AndrewILWilliams May 23, 2020
2ddcb55
removed whitespace in test_dask...oops
AndrewILWilliams May 23, 2020
2fce175
Added to init
AndrewILWilliams May 23, 2020
a0ef1c2
format changes
AndrewILWilliams May 23, 2020
5d456b5
Fiddling around with cov/corr tests in `test_computation.py`
AndrewILWilliams May 23, 2020
523e4fd
PEP8 changes
AndrewILWilliams May 23, 2020
c23cae6
pep
AndrewILWilliams May 23, 2020
860babc
remove old todo and comments
AndrewILWilliams May 24, 2020
33ded40
isort
AndrewILWilliams May 24, 2020
2751b10
Added consistency check between corr() and cov(), ensure they give same
AndrewILWilliams May 24, 2020
759c9f4
added `skipna=False` to `computation.py`. made consistency+autocov tests
AndrewILWilliams May 24, 2020
1accabd
formatting
AndrewILWilliams May 24, 2020
43a6ad7
Added numpy-based tests.
AndrewILWilliams May 24, 2020
29bbcfb
format
AndrewILWilliams May 24, 2020
a5ce9b3
formatting again
AndrewILWilliams May 24, 2020
5557da9
Update doc/whats-new.rst
AndrewILWilliams May 25, 2020
d395c27
refactored corr/cov so there is one internal method for calculating both
AndrewILWilliams May 25, 2020
21351f5
formatting
AndrewILWilliams May 25, 2020
87c9bea
updating docstrings and code suggestions from PR
AndrewILWilliams May 25, 2020
0e4b682
paramterize ddof in tests
AndrewILWilliams May 25, 2020
b23eea8
removed extraneous test arrays
AndrewILWilliams May 25, 2020
44c77f0
formatting + adding deterministic docstring
AndrewILWilliams May 25, 2020
4bfc1f2
added test for TypeError
AndrewILWilliams May 25, 2020
c2ba27b
formatting
AndrewILWilliams May 25, 2020
bc58708
tidying up docstring
AndrewILWilliams May 25, 2020
6bfb3cf
formatting and tidying up `_cov_corr()` so that the logic is more clear
AndrewILWilliams May 25, 2020
672c87f
flake8 ...
AndrewILWilliams May 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Top-level functions
full_like
zeros_like
ones_like
cov
corr
dot
polyval
map_blocks
Expand Down
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Added :py:func:`xarray.cov` and :py:func:`xarray.corr` (:issue:`3784`, :pull:`4089`).
AndrewILWilliams marked this conversation as resolved.
Show resolved Hide resolved
By `Andrew Williams <https://github.com/AndrewWilliams3142>`_ and `Robin Beer <https://github.com/r-beer>`_.
- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
By `Pascal Bourgault <https://github.com/aulemahal>`_.
- Control over attributes of result in :py:func:`merge`, :py:func:`concat`,
Expand Down
4 changes: 3 additions & 1 deletion xarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .core.alignment import align, broadcast
from .core.combine import auto_combine, combine_by_coords, combine_nested
from .core.common import ALL_DIMS, full_like, ones_like, zeros_like
from .core.computation import apply_ufunc, dot, polyval, where
from .core.computation import apply_ufunc, corr, cov, dot, polyval, where
from .core.concat import concat
from .core.dataarray import DataArray
from .core.dataset import Dataset
Expand Down Expand Up @@ -54,6 +54,8 @@
"concat",
"decode_cf",
"dot",
"cov",
"corr",
"full_like",
"load_dataarray",
"load_dataset",
Expand Down
158 changes: 157 additions & 1 deletion xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import numpy as np

from . import dtypes, duck_array_ops, utils
from .alignment import deep_align
from .alignment import broadcast, deep_align
from .merge import merge_coordinates_without_align
from .options import OPTIONS
from .pycompat import dask_array_type
Expand Down Expand Up @@ -1069,6 +1069,162 @@ def earth_mover_distance(first_samples,
return apply_array_ufunc(func, *args, dask=dask)


def cov(da_a, da_b, dim=None, ddof=1):
    """
    Compute covariance between two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray (or Variable) object
        Array to compute.
    da_b: DataArray (or Variable) object
        Array to compute.
    dim : str, optional
        The dimension along which the covariance will be computed
    ddof : int, optional
        If ddof=1 (default), the covariance is normalized by N-1
        (unbiased estimate); if ddof=0, it is normalized by N.

    Returns
    -------
    covariance: DataArray

    See also
    --------
    pandas.Series.cov: corresponding pandas function
    xr.corr: respective function to calculate correlation

    Examples
    --------
    >>> da_a = DataArray(np.random.random((3, 5)),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
    >>> da_b = DataArray(np.random.random((3, 5)),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
    >>> xr.cov(da_a, da_b)
    <xarray.DataArray ()>
    array(0.03823054)
    >>> xr.cov(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([0.00207952, 0.01024296, 0.08214707])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    from .dataarray import DataArray

    # Mirror the input validation in corr() so both public functions fail
    # consistently on unsupported inputs (e.g. Dataset or raw numpy arrays).
    if any(not isinstance(arr, (Variable, DataArray)) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray and xr.Variable are supported."
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    # 1. Broadcast the two arrays
    da_a, da_b = broadcast(da_a, da_b)

    # 2. Ignore the nans: mask each array wherever EITHER input is missing,
    # so both series are reduced over the same sample.
    valid_values = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(valid_values)
    # BUG FIX: this previously read ``da_b = da_a.where(valid_values)``,
    # which silently replaced da_b with da_a and made cov() return the
    # auto-covariance of da_a for any pair of inputs.
    da_b = da_b.where(valid_values)
    valid_count = valid_values.sum(dim) - ddof

    # 3. Compute mean and standard deviation along the given dim
    demeaned_da_a = da_a - da_a.mean(dim=dim)
    demeaned_da_b = da_b - da_b.mean(dim=dim)

    # 4. Compute covariance along the given dim
    # N.B. `skipna=False` is required or there is a bug when computing
    # auto-covariance. E.g. Try xr.cov(da,da) for
    # da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
    cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=False) / (valid_count)

    return DataArray(cov)


def corr(da_a, da_b, dim=None, ddof=0):
    """
    Compute the Pearson correlation coefficient between
    two DataArray objects along a shared dimension.

    Parameters
    ----------
    da_a: DataArray (or Variable) object
        Array to compute.
    da_b: DataArray (or Variable) object
        Array to compute.
    dim: str, optional
        The dimension along which the correlation will be computed

    Returns
    -------
    correlation: DataArray

    See also
    --------
    pandas.Series.corr: corresponding pandas function
    xr.cov: underlying covariance function

    Examples
    --------
    >>> da_a = DataArray(np.random.random((3, 5)),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
    >>> da_b = DataArray(np.random.random((3, 5)),
    ...                  dims=("space", "time"),
    ...                  coords=[('space', ['IA', 'IL', 'IN']),
    ...                          ('time', pd.date_range("2000-01-01", freq="1D", periods=5))])
    >>> xr.corr(da_a, da_b)
    <xarray.DataArray ()>
    array(0.67407116)
    >>> xr.corr(da_a, da_b, dim='time')
    <xarray.DataArray (space: 3)>
    array([0.23150267, 0.24900968, 0.9061562 ])
    Coordinates:
      * space    (space) <U2 'IA' 'IL' 'IN'
    """
    from .dataarray import DataArray

    # Reject anything that is not an xarray object up front.
    if any(not isinstance(arr, (Variable, DataArray)) for arr in [da_a, da_b]):
        raise TypeError(
            "Only xr.DataArray and xr.Variable are supported."
            "Given {}.".format([type(arr) for arr in [da_a, da_b]])
        )

    # Align both inputs onto a common set of dimensions.
    da_a, da_b = broadcast(da_a, da_b)

    # Mask out every position where either input is missing, so the
    # covariance and the standard deviations all see the same sample.
    mask = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(mask)
    da_b = da_b.where(mask)

    # Pearson correlation: cov(a, b) / (std(a) * std(b)).
    normalization = da_a.std(dim=dim) * da_b.std(dim=dim)
    corr = cov(da_a, da_b, dim=dim, ddof=ddof) / normalization

    return DataArray(corr)


def dot(*arrays, dims=None, **kwargs):
"""Generalized dot product for xarray objects. Like np.einsum, but
provides a simpler interface based on array dimensions.
Expand Down
171 changes: 170 additions & 1 deletion xarray/tests/test_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose, assert_array_equal

import xarray as xr
from xarray.core.alignment import broadcast
from xarray.core.computation import (
_UFuncSignature,
apply_ufunc,
Expand Down Expand Up @@ -817,6 +818,174 @@ def test_vectorize_dask():
assert_identical(expected, actual)


def arrays_w_tuples():
    """Build the shared fixture arrays and the (da_a, da_b) pairings used by
    the cov/corr tests.

    Returns a tuple ``(arrays, array_tuples)`` where ``arrays`` is a list of
    DataArrays (some with NaNs) and ``array_tuples`` lists the input pairs.
    """
    base = xr.DataArray(
        np.random.random((3, 21, 4)),
        coords={"time": pd.date_range("2000-01-01", freq="1D", periods=21)},
        dims=("a", "time", "x"),
    )

    arrays = [
        # Two overlapping slices of the same random field.
        base.isel(time=range(0, 18)),
        base.isel(time=range(2, 20)).rolling(time=3, center=True).mean(),
        # 1-D series with scattered NaNs.
        xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time"),
        xr.DataArray([1, 1, np.nan, 2, np.nan, 3, 5, 4, 6, np.nan, 7], dims="time"),
        # Small 2-D arrays exercising the auto-covariance NaN edge case.
        xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]),
        xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]),
    ]

    # Each array paired with itself, plus selected cross pairings.
    array_tuples = [
        (arrays[0], arrays[0]),
        (arrays[0], arrays[1]),
        (arrays[1], arrays[1]),
        (arrays[2], arrays[2]),
        (arrays[2], arrays[3]),
        (arrays[3], arrays[3]),
        (arrays[4], arrays[4]),
        (arrays[4], arrays[5]),
        (arrays[5], arrays[5]),
    ]

    return arrays, array_tuples


@pytest.mark.parametrize(
    "da_a, da_b",
    [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
)
@pytest.mark.parametrize("dim", [None, "time"])
def test_cov(da_a, da_b, dim):
    # Compare xr.cov against np.cov on aligned, jointly-masked data.
    def _aligned_pair(ts1, ts2):
        # Broadcast the inputs and keep only positions where both are valid.
        ts1, ts2 = broadcast(ts1, ts2)
        valid = ts1.notnull() & ts2.notnull()
        return ts1.where(valid), ts2.where(valid)

    ts1, ts2 = _aligned_pair(da_a, da_b)

    if dim is None:
        # Scalar covariance over every element.
        expected = np.cov(ts1.data.flatten(), ts2.data.flatten(), ddof=1)[0, 1]
    else:
        # Per-(a, x) covariance along "time", computed pointwise with numpy.
        expected = np.zeros((3, 4))
        for ia in range(3):
            for ix in range(4):
                expected[ia, ix] = np.cov(
                    ts1.sel(a=ia, x=ix).data.flatten(),
                    ts2.sel(a=ia, x=ix).data.flatten(),
                    ddof=1,
                )[0, 1]

    actual = xr.cov(da_a, da_b, dim)
    assert_allclose(actual, expected)


@pytest.mark.parametrize(
    "da_a, da_b",
    [arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
)
@pytest.mark.parametrize("dim", [None, "time"])
def test_corr(da_a, da_b, dim):
    # Compare xr.corr against np.corrcoef on aligned, jointly-masked data.
    def _aligned_pair(ts1, ts2):
        # Broadcast the inputs and keep only positions where both are valid.
        ts1, ts2 = broadcast(ts1, ts2)
        valid = ts1.notnull() & ts2.notnull()
        return ts1.where(valid), ts2.where(valid)

    ts1, ts2 = _aligned_pair(da_a, da_b)

    if dim is None:
        # Scalar correlation over every element.
        expected = np.corrcoef(ts1.data.flatten(), ts2.data.flatten())[0, 1]
    else:
        # Per-(a, x) correlation along "time", computed pointwise with numpy.
        expected = np.zeros((3, 4))
        for ia in range(3):
            for ix in range(4):
                expected[ia, ix] = np.corrcoef(
                    ts1.sel(a=ia, x=ix).data.flatten(),
                    ts2.sel(a=ia, x=ix).data.flatten(),
                )[0, 1]

    actual = xr.corr(da_a, da_b, dim)
    assert_allclose(actual, expected)


@pytest.mark.parametrize(
    "da_a, da_b",
    [
        arrays_w_tuples()[1][0],
        arrays_w_tuples()[1][1],
        arrays_w_tuples()[1][2],
        arrays_w_tuples()[1][7],
        arrays_w_tuples()[1][8],
    ],
)
@pytest.mark.parametrize("dim", [None, "time", "x"])
def test_covcorr_consistency(da_a, da_b, dim):
    # xr.corr must equal xr.cov normalized by the product of the (jointly
    # masked) standard deviations.
    da_a, da_b = broadcast(da_a, da_b)
    joint_mask = da_a.notnull() & da_b.notnull()
    da_a = da_a.where(joint_mask)
    da_b = da_b.where(joint_mask)

    normalization = da_a.std(dim=dim) * da_b.std(dim=dim)
    expected = xr.cov(da_a, da_b, dim=dim, ddof=0) / normalization
    actual = xr.corr(da_a, da_b, dim=dim)
    assert_allclose(actual, expected)


@pytest.mark.parametrize(
    "da_a",
    [
        arrays_w_tuples()[0][0],
        arrays_w_tuples()[0][1],
        arrays_w_tuples()[0][4],
        arrays_w_tuples()[0][5],
    ],
)
@pytest.mark.parametrize("dim", [None, "time", "x"])
def test_autocov(da_a, dim):
    # The auto-covariance scaled by (N - 1) must equal the sum of squared
    # anomalies along ``dim``.
    mask = da_a.notnull()
    masked = da_a.where(mask)
    anomalies = masked - masked.mean(dim=dim)
    expected = (anomalies ** 2).sum(dim=dim, skipna=False)
    actual = xr.cov(masked, masked, dim=dim) * (mask.sum(dim) - 1)
    assert_allclose(actual, expected)


@requires_dask
def test_vectorize_dask_new_output_dims():
# regression test for GH3574
Expand Down