add rs.utils.weighted_pearsonr and tests (#189)

* add `rs.utils.weighted_pearsonr` and tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove redundant division * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clarify docstring * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * test against uniform weights other than 1 * test against previous results * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: Kevin Dalton <kmdalton@pop-os.localdomain> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
rs-station · Oct 19, 2022 · 28f6c93 · 28f6c93
1 parent 8d90ddc
commit 28f6c93
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 1 deletion.
diff --git a/reciprocalspaceship/utils/__init__.py b/reciprocalspaceship/utils/__init__.py
@@ -38,7 +38,7 @@
 from reciprocalspaceship.utils.math import angle_between
 from reciprocalspaceship.utils.phases import canonicalize_phases, get_phase_restrictions
 from reciprocalspaceship.utils.rfree import add_rfree, copy_rfree
-from reciprocalspaceship.utils.stats import compute_redundancy
+from reciprocalspaceship.utils.stats import compute_redundancy, weighted_pearsonr
 from reciprocalspaceship.utils.structurefactors import (
     compute_structurefactor_multiplicity,
     from_structurefactor,

diff --git a/reciprocalspaceship/utils/stats.py b/reciprocalspaceship/utils/stats.py
@@ -81,3 +81,41 @@ def compute_redundancy(
 
     mult = mult.sort_index()
     return mult.get_hkls(), mult["Count"].to_numpy(np.int32)
+
+
+def weighted_pearsonr(x, y, w):
+    """
+    Calculate a [weighted Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Weighted_correlation_coefficient).
+
+    Note
+    ----
+    x, y, and w may have arbitrarily shaped leading dimensions. The correlation coefficient will always be computed pairwise along the last axis.
+
+    Parameters
+    ----------
+    x : np.array(float)
+        An array of observations.
+    y : np.array(float)
+        An array of observations the same shape as x.
+    w : np.array(float)
+        An array of weights the same shape as x. These needn't be normalized.
+
+    Returns
+    -------
+    r : float
+        The Pearson correlation coefficient along the last dimension. This has shape {x,y,w}.shape[:-1].
+    """
+    z = np.reciprocal(w.sum(-1))
+
+    mx = z * (w * x).sum(-1)
+    my = z * (w * y).sum(-1)
+
+    dx = x - np.expand_dims(mx, axis=-1)
+    dy = y - np.expand_dims(my, axis=-1)
+
+    cxy = z * (w * dx * dy).sum(-1)
+    cx = z * (w * dx * dx).sum(-1)
+    cy = z * (w * dy * dy).sum(-1)
+
+    r = cxy / np.sqrt(cx * cy)
+    return r
diff --git a/tests/utils/test_weighted_pearsonr.py b/tests/utils/test_weighted_pearsonr.py
@@ -0,0 +1,69 @@
+from io import StringIO
+
+import numpy as np
+import pandas as pd
+import pytest
+from scipy.stats import pearsonr
+
+import reciprocalspaceship as rs
+
+
+def test_against_previous_result():
+    csv = """
+,x,y,w
+0,0.9051822979780747,0.2700784720702334,0.937803097548697
+1,0.7653655064893381,0.9096902688106824,0.9022413175501339
+2,0.6701438010457531,0.7643360435701648,0.8943602840313861
+3,0.1524086489285047,0.9854887367590378,0.23724145891681891
+4,0.8673578229262408,0.1660901563869679,0.6802234818049551
+5,0.04749197327200072,0.4056733186535064,0.41411735570989516
+6,0.5555482004198411,0.4273894191186294,0.36358917098272747
+7,0.5463417645646479,0.5092920447904933,0.29441863366197596
+8,0.31353494110452584,0.7666249814241163,0.7493823577932279
+9,0.3923683608283065,0.18587807020463565,0.9318927399856036
+    """
+    df = pd.read_csv(StringIO(csv))
+    x, y, w = df.x.to_numpy(), df.y.to_numpy(), df.w.to_numpy()
+    expected_r = -0.1478766135438829
+
+    r = rs.utils.stats.weighted_pearsonr(x, y, w)
+    assert np.isclose(r, expected_r)
+
+
+def test_weighted_pearsonr():
+    n = 100
+
+    x, y, sigx, sigy = np.random.random((4, n))
+    w = np.sqrt(sigx * sigx + sigy * sigy)
+
+    # Test execution
+    rs.utils.stats.weighted_pearsonr(x, y, w)
+
+    # Test against scipy pearsonr
+    w = np.ones(n)
+    r = rs.utils.stats.weighted_pearsonr(x, y, w)
+    expected_r = pearsonr(x, y)[0]
+    assert np.isclose(r, expected_r)
+
+    # Test against scipy with another uniform weight value
+    w = np.ones(n) * 42.0
+    r = rs.utils.stats.weighted_pearsonr(x, y, w)
+    expected_r = pearsonr(x, y)[0]
+    assert np.isclose(r, expected_r)
+
+
+def test_weighted_pearsonr_batched():
+    # Test batch execution
+    a, b, n = 2, 3, 100
+    x, y, sigx, sigy = np.random.random((4, a, b, n))
+    w = np.sqrt(sigx * sigx + sigy * sigy)
+    r = rs.utils.stats.weighted_pearsonr(x, y, w)
+    assert np.all(np.array(r.shape) == np.array([a, b]))
+
+    # Test against scipy pearsonr
+    w = np.ones((a, b, n))
+    r = rs.utils.stats.weighted_pearsonr(x, y, w)
+    for i in range(a):
+        for j in range(b):
+            expected_r = pearsonr(x[i, j], y[i, j])[0]
+            assert np.isclose(r[i, j], expected_r)