diff --git a/haptools/ld.py b/haptools/ld.py index 47bed516..a7d1c799 100644 --- a/haptools/ld.py +++ b/haptools/ld.py @@ -28,22 +28,42 @@ class Haplotype(HaplotypeBase): ) -def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float: +def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float | npt.NDArray: """ - Compute the Pearson correlation coefficient between two vectors (1D numpy arrays) + Compute the Pearson correlation coefficient (LD) between two vectors (1D arrays) + + If 2D array(s) are given instead, the rows will be treated as samples and the columns + as variants Parameters ---------- arrA: npt.NDArray - The first 1D numpy array + The first 1D (or 2D) numpy array arrB: npt.NDArray - The second 1D numpy array + The second 1D (or 2D) numpy array Returns ------- - The LD between the genotypes in arrA and the genotypes in arrB + The signed LD between the genotypes in arrA and the genotypes in arrB + + If either arrA or arrB is 2D, then the return value will be an array instead. If + both are 2D, the shape of the array will correspond with the second dimensions of + arrA and arrB, respectively. Otherwise, it will be 1D. """ - return np.corrcoef(arrA, arrB)[1, 0] + if arrA.ndim == 1: + dim = 1 + elif arrA.ndim == 2: + dim = arrA.shape[1] + else: + raise ValueError("We can only compute LD for 2D genotype arrays") + ld_mat = np.corrcoef(arrA, arrB, rowvar=False)[:dim:, dim:] + if arrA.ndim == 1: + ld_mat = ld_mat[0, :] + if arrB.ndim == 1: + ld_mat = ld_mat[0] + elif arrB.ndim == 1: + ld_mat = ld_mat[:, 0] + return ld_mat def calc_ld( diff --git a/tests/test_ld.py b/tests/test_ld.py index 2426e4ae..1e492849 100644 --- a/tests/test_ld.py +++ b/tests/test_ld.py @@ -1,13 +1,73 @@ +import math from pathlib import Path +import numpy as np from click.testing import CliRunner from haptools.data import Data from haptools.__main__ import main +from haptools.ld import pearson_corr_ld DATADIR = Path(__file__).parent.joinpath("data") +def test_ld(seed=42): + rng = np.random.default_rng(seed) + + # Different input shapes will give different output shapes + # (25,) x (25,) --> float + arrA = rng.choice((0, 1, 2), size=(25,)) + arrB = rng.choice((0, 1, 2), size=(25,)) + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, float) + assert math.isclose(ld, -0.1148198316929615) + + # (25,) x (25,1) --> (1,) + arrB = arrB[:, np.newaxis] + old_ld = ld + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (1,) + assert old_ld == ld[0] + + # (25,1) x (25,1) --> (1,1) + arrA = arrA[:, np.newaxis] + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (1, 1) + assert old_ld == ld[0, 0] + + # (25,3) x (25,) --> (3,) + arrA = np.hstack((np.random.choice((0, 1, 2), size=(25, 2)), arrA)) + arrB = np.squeeze(arrB) + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (3,) + assert old_ld == ld[2] + + # (25,) x (25,3) --> (3,) + arrA = arrB + arrB = np.random.choice((0, 1, 2), size=(25, 3)) + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (3,) + + # (25,1) x (25,3) --> (1,3) + arrA = arrA[:, np.newaxis] + old_ld = ld + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (1, 3) + np.testing.assert_allclose(old_ld[np.newaxis, :], ld) + + # (25,2) x (25,3) --> (2,3) + arrA = np.hstack((arrA, np.random.choice((0, 1, 2), size=(25, 1)))) + ld = pearson_corr_ld(arrA, arrB) + assert isinstance(ld, np.ndarray) + assert ld.shape == (2, 3) + np.testing.assert_allclose(old_ld, ld[0]) + + def test_basic(capfd): expected = """#\torderH\tld #\tversion\t0.2.0