diff --git a/reciprocalspaceship/utils/__init__.py b/reciprocalspaceship/utils/__init__.py index 2832981e..da7c12c1 100644 --- a/reciprocalspaceship/utils/__init__.py +++ b/reciprocalspaceship/utils/__init__.py @@ -38,7 +38,7 @@ from reciprocalspaceship.utils.math import angle_between from reciprocalspaceship.utils.phases import canonicalize_phases, get_phase_restrictions from reciprocalspaceship.utils.rfree import add_rfree, copy_rfree -from reciprocalspaceship.utils.stats import compute_redundancy +from reciprocalspaceship.utils.stats import compute_redundancy, weighted_pearsonr from reciprocalspaceship.utils.structurefactors import ( compute_structurefactor_multiplicity, from_structurefactor, diff --git a/reciprocalspaceship/utils/stats.py b/reciprocalspaceship/utils/stats.py index 2243270a..c8b7f542 100644 --- a/reciprocalspaceship/utils/stats.py +++ b/reciprocalspaceship/utils/stats.py @@ -81,3 +81,41 @@ def compute_redundancy( mult = mult.sort_index() return mult.get_hkls(), mult["Count"].to_numpy(np.int32) + + +def weighted_pearsonr(x, y, w): + """ + Calculate a [weighted Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Weighted_correlation_coefficient). + + Note + ---- + x, y, and w may have arbitrarily shaped leading dimensions. The correlation coefficient will always be computed pairwise along the last axis. + + Parameters + ---------- + x : np.array(float) + An array of observations. + y : np.array(float) + An array of observations the same shape as x. + w : np.array(float) + An array of weights the same shape as x. These needn't be normalized. + + Returns + ------- + r : float + The Pearson correlation coefficient along the last dimension. This has shape {x,y,w}.shape[:-1]. + """ + z = np.reciprocal(w.sum(-1)) + + mx = z * (w * x).sum(-1) + my = z * (w * y).sum(-1) + + dx = x - np.expand_dims(mx, axis=-1) + dy = y - np.expand_dims(my, axis=-1) + + cxy = z * (w * dx * dy).sum(-1) + cx = z * (w * dx * dx).sum(-1) + cy = z * (w * dy * dy).sum(-1) + + r = cxy / np.sqrt(cx * cy) + return r diff --git a/tests/utils/test_weighted_pearsonr.py b/tests/utils/test_weighted_pearsonr.py new file mode 100644 index 00000000..abcb4122 --- /dev/null +++ b/tests/utils/test_weighted_pearsonr.py @@ -0,0 +1,69 @@ +from io import StringIO + +import numpy as np +import pandas as pd +import pytest +from scipy.stats import pearsonr + +import reciprocalspaceship as rs + + +def test_against_previous_result(): + csv = """ +,x,y,w +0,0.9051822979780747,0.2700784720702334,0.937803097548697 +1,0.7653655064893381,0.9096902688106824,0.9022413175501339 +2,0.6701438010457531,0.7643360435701648,0.8943602840313861 +3,0.1524086489285047,0.9854887367590378,0.23724145891681891 +4,0.8673578229262408,0.1660901563869679,0.6802234818049551 +5,0.04749197327200072,0.4056733186535064,0.41411735570989516 +6,0.5555482004198411,0.4273894191186294,0.36358917098272747 +7,0.5463417645646479,0.5092920447904933,0.29441863366197596 +8,0.31353494110452584,0.7666249814241163,0.7493823577932279 +9,0.3923683608283065,0.18587807020463565,0.9318927399856036 + """ + df = pd.read_csv(StringIO(csv)) + x, y, w = df.x.to_numpy(), df.y.to_numpy(), df.w.to_numpy() + expected_r = -0.1478766135438829 + + r = rs.utils.stats.weighted_pearsonr(x, y, w) + assert np.isclose(r, expected_r) + + +def test_weighted_pearsonr(): + n = 100 + + x, y, sigx, sigy = np.random.random((4, n)) + w = np.sqrt(sigx * sigx + sigy * sigy) + + # Test execution + rs.utils.stats.weighted_pearsonr(x, y, w) + + # Test against scipy pearsonr + w = np.ones(n) + r = rs.utils.stats.weighted_pearsonr(x, y, w) + expected_r = pearsonr(x, y)[0] + assert np.isclose(r, expected_r) + + # Test against scipy with another uniform weight value + w = np.ones(n) * 42.0 + r = rs.utils.stats.weighted_pearsonr(x, y, w) + expected_r = pearsonr(x, y)[0] + assert np.isclose(r, expected_r) + + +def test_weighted_pearsonr_batched(): + # Test batch execution + a, b, n = 2, 3, 100 + x, y, sigx, sigy = np.random.random((4, a, b, n)) + w = np.sqrt(sigx * sigx + sigy * sigy) + r = rs.utils.stats.weighted_pearsonr(x, y, w) + assert np.all(np.array(r.shape) == np.array([a, b])) + + # Test against scipy pearsonr + w = np.ones((a, b, n)) + r = rs.utils.stats.weighted_pearsonr(x, y, w) + for i in range(a): + for j in range(b): + expected_r = pearsonr(x[i, j], y[i, j])[0] + assert np.isclose(r[i, j], expected_r)