Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow multiple variants in pearson_corr_ld() #258

Merged
merged 11 commits into from
Nov 16, 2024
32 changes: 26 additions & 6 deletions haptools/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,42 @@ class Haplotype(HaplotypeBase):
)


def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float:
def pearson_corr_ld(arrA: npt.NDArray, arrB: npt.NDArray) -> float | npt.NDArray:
"""
Compute the Pearson correlation coefficient between two vectors (1D numpy arrays)
Compute the Pearson correlation coefficient (LD) between two vectors (1D arrays)

If 2D array(s) are given instead, the rows will be treated as samples and the columns
as variants

Parameters
----------
arrA: npt.NDArray
The first 1D numpy array
The first 1D (or 2D) numpy array
arrB: npt.NDArray
The second 1D numpy array
The second 1D (or 2D) numpy array

Returns
-------
The LD between the genotypes in arrA and the genotypes in arrB
The signed LD between the genotypes in arrA and the genotypes in arrB

If either arrA or arrB is 2D, then the return value will be an array instead. If
both are 2D, the shape of the array will correspond with the second dimensions of
arrA and arrB, respectively. Otherwise, it will be 1D.
"""
return np.corrcoef(arrA, arrB)[1, 0]
if arrA.ndim == 1:
dim = 1
elif arrA.ndim == 2:
dim = arrA.shape[1]
else:
raise ValueError("We can only compute LD for 2D genotype arrays")
ld_mat = np.corrcoef(arrA, arrB, rowvar=False)[:dim:, dim:]
if arrA.ndim == 1:
ld_mat = ld_mat[0, :]
if arrB.ndim == 1:
ld_mat = ld_mat[0]
elif arrB.ndim == 1:
ld_mat = ld_mat[:, 0]
return ld_mat


def calc_ld(
Expand Down
60 changes: 60 additions & 0 deletions tests/test_ld.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,73 @@
import math
from pathlib import Path

import numpy as np
from click.testing import CliRunner

from haptools.data import Data
from haptools.__main__ import main
from haptools.ld import pearson_corr_ld

DATADIR = Path(__file__).parent.joinpath("data")


def test_ld(seed=42):
rng = np.random.default_rng(seed)

# Different input shapes will give different output shapes
# (25,) x (25,) --> float
arrA = rng.choice((0, 1, 2), size=(25,))
arrB = rng.choice((0, 1, 2), size=(25,))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, float)
assert math.isclose(ld, -0.1148198316929615)

# (25,) x (25,1) --> (1,)
arrB = arrB[:, np.newaxis]
old_ld = ld
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1,)
assert old_ld == ld[0]

# (25,1) x (25,1) --> (1,1)
arrA = arrA[:, np.newaxis]
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1, 1)
assert old_ld == ld[0, 0]

# (25,3) x (25,) --> (3,)
arrA = np.hstack((np.random.choice((0, 1, 2), size=(25, 2)), arrA))
arrB = np.squeeze(arrB)
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (3,)
assert old_ld == ld[2]

# (25,) x (25,3) --> (3,)
arrA = arrB
arrB = np.random.choice((0, 1, 2), size=(25, 3))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (3,)

# (25,1) x (25,3) --> (1,3)
arrA = arrA[:, np.newaxis]
old_ld = ld
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (1, 3)
np.testing.assert_allclose(old_ld[np.newaxis, :], ld)

# (25,2) x (25,3) --> (2,3)
arrA = np.hstack((arrA, np.random.choice((0, 1, 2), size=(25, 1))))
ld = pearson_corr_ld(arrA, arrB)
assert isinstance(ld, np.ndarray)
assert ld.shape == (2, 3)
np.testing.assert_allclose(old_ld, ld[0])


def test_basic(capfd):
expected = """#\torderH\tld
#\tversion\t0.2.0
Expand Down
Loading