CAST-genomics · aryarm · Nov 16, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/haptools/ld.py b/haptools/ld.py
@@ -28,22 +28,42 @@ class Haplotype(HaplotypeBase):
     )
 
 
-def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float:
+def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float | npt.NDArray:
     """
-    Compute the Pearson correlation coefficient between two vectors (1D numpy arrays)
+    Compute the Pearson correlation coefficient (LD) between two vectors (1D arrays)
+
+    If 2D array(s) are given instead, the rows will be treated as samples and the columns
+    as variants
 
     Parameters
     ----------
     arrA: npt.NDArray
-        The first 1D numpy array
+        The first 1D (or 2D) numpy array
     arrB: npt.NDArray
-        The second 1D numpy array
+        The second 1D (or 2D) numpy array
 
     Returns
     -------
-    The LD between the genotypes in arrA and the genotypes in arrB
+    The signed LD between the genotypes in arrA and the genotypes in arrB
+
+    If either arrA or arrB is 2D, then the return value will be an array instead. If
+    both are 2D, the shape of the array will correspond with the second dimensions of
+    arrA and arrB, respectively. Otherwise, it will be 1D.
     """
-    return np.corrcoef(arrA, arrB)[1, 0]
+    if arrA.ndim == 1:
+        dim = 1
+    elif arrA.ndim == 2:
+        dim = arrA.shape[1]
+    else:
+        raise ValueError("We can only compute LD for 2D genotype arrays")
+    ld_mat = np.corrcoef(arrA, arrB, rowvar=False)[:dim:, dim:]
+    if arrA.ndim == 1:
+        ld_mat = ld_mat[0, :]
+        if arrB.ndim == 1:
+            ld_mat = ld_mat[0]
+    elif arrB.ndim == 1:
+        ld_mat = ld_mat[:, 0]
+    return ld_mat
 
 
 def calc_ld(

diff --git a/tests/test_ld.py b/tests/test_ld.py
@@ -1,13 +1,73 @@
+import math
 from pathlib import Path
 
+import numpy as np
 from click.testing import CliRunner
 
 from haptools.data import Data
 from haptools.__main__ import main
+from haptools.ld import pearson_corr_ld
 
 DATADIR = Path(__file__).parent.joinpath("data")
 
 
+def test_ld(seed=42):
+    rng = np.random.default_rng(seed)
+
+    # Different input shapes will give different output shapes
+    # (25,) x (25,) --> float
+    arrA = rng.choice((0, 1, 2), size=(25,))
+    arrB = rng.choice((0, 1, 2), size=(25,))
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, float)
+    assert math.isclose(ld, -0.1148198316929615)
+
+    # (25,) x (25,1) --> (1,)
+    arrB = arrB[:, np.newaxis]
+    old_ld = ld
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (1,)
+    assert old_ld == ld[0]
+
+    # (25,1) x (25,1) --> (1,1)
+    arrA = arrA[:, np.newaxis]
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (1, 1)
+    assert old_ld == ld[0, 0]
+
+    # (25,3) x (25,) --> (3,)
+    arrA = np.hstack((np.random.choice((0, 1, 2), size=(25, 2)), arrA))
+    arrB = np.squeeze(arrB)
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (3,)
+    assert old_ld == ld[2]
+
+    # (25,) x (25,3) --> (3,)
+    arrA = arrB
+    arrB = np.random.choice((0, 1, 2), size=(25, 3))
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (3,)
+
+    # (25,1) x (25,3) --> (1,3)
+    arrA = arrA[:, np.newaxis]
+    old_ld = ld
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (1, 3)
+    np.testing.assert_allclose(old_ld[np.newaxis, :], ld)
+
+    # (25,2) x (25,3) --> (2,3)
+    arrA = np.hstack((arrA, np.random.choice((0, 1, 2), size=(25, 1))))
+    ld = pearson_corr_ld(arrA, arrB)
+    assert isinstance(ld, np.ndarray)
+    assert ld.shape == (2, 3)
+    np.testing.assert_allclose(old_ld, ld[0])
+
+
 def test_basic(capfd):
     expected = """#\torderH\tld
 #\tversion\t0.2.0