diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..083aa052e --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = + sgkit/tests/* diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cda264383..432d9e5ff 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,7 +21,7 @@ jobs: uses: pre-commit/action@v2.0.0 - name: Test with pytest and coverage run: | - pytest -v --cov=sgkit + pytest -v --cov=sgkit --cov-report term-missing - name: Upload coverage to Codecov env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b114f16f..48b567f70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,5 +29,5 @@ repos: rev: v0.782 hooks: - id: mypy - args: ["--strict"] - additional_dependencies: ["numpy", "xarray"] + args: ["--strict", "--show-error-codes"] + additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy"] diff --git a/README.md b/README.md index 4c3c754fc..44ab49331 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ pytest To check code coverage and get a coverage report, run ```bash -pytest --cov=sgkit +pytest --cov=sgkit --cov-report term-missing ``` ### Code standards diff --git a/requirements-dev.txt b/requirements-dev.txt index 875f397ba..c5a02b2b5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,3 +2,4 @@ codecov pre-commit pytest pytest-cov +statsmodels diff --git a/requirements.txt b/requirements.txt index 6fefda50d..894396a41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ numpy xarray +dask[array] +scipy diff --git a/setup.cfg b/setup.cfg index 403ada3af..c8097812a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ ignore = profile = black default_section = THIRDPARTY known_first_party = sgkit -known_third_party = numpy,pytest,setuptools,xarray +known_third_party = dask,numpy,pandas,pytest,setuptools,xarray multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 @@ -60,8 +60,14 @@ line_length = 88 [mypy-numpy.*] ignore_missing_imports = True +[mypy-pandas.*] +ignore_missing_imports = True +[mypy-dask.*] +ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True +[mypy-statsmodels.*] +ignore_missing_imports = True [mypy-setuptools] ignore_missing_imports = True [mypy-sgkit.tests.*] diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 8b170a470..6417b5646 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -5,6 +5,7 @@ DIM_VARIANT, create_genotype_call_dataset, ) +from .stats.association import gwas_linear_regression __all__ = [ "DIM_ALLELE", @@ -12,4 +13,5 @@ "DIM_SAMPLE", "DIM_VARIANT", "create_genotype_call_dataset", + "gwas_linear_regression", ] diff --git a/sgkit/stats/__init__.py b/sgkit/stats/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py new file mode 100644 index 000000000..65e7061f6 --- /dev/null +++ b/sgkit/stats/association.py @@ -0,0 +1,163 @@ +from dataclasses import dataclass +from typing import Optional, Sequence + +import dask.array as da +import numpy as np +import xarray as xr +from dask.array import Array, stats +from xarray import Dataset + +from ..typing import ArrayLike + + +@dataclass +class LinearRegressionResult: + beta: ArrayLike + t_value: ArrayLike + p_value: ArrayLike + + +def _gwas_linear_regression( + G: ArrayLike, X: ArrayLike, y: ArrayLike +) -> LinearRegressionResult: + """Efficient linear regression estimation for multiple covariate sets + + Parameters + ---------- + G : (M, N) array-like + "Loop" covariates for which a separate regression will be fit to + individual columns + X : (M, P) array-like + "Core" covariates that are included in the regressions along + with each loop covariate + y : (M,) + Continuous outcome + + Returns + ------- + LinearRegressionResult + Regression statistics and coefficient estimates + """ + G, X = da.asarray(G), da.asarray(X) + + # Apply orthogonal projection to eliminate core covariates + # Note: QR factorization or SVD should be used here to find + # what are effectively OLS residuals rather than matrix inverse + # to avoid need for MxM array; additionally, dask.lstsq will not + # work with numpy arrays + Gp = G - X @ da.linalg.lstsq(X, G)[0] + yp = y - X @ da.linalg.lstsq(X, y)[0] + + # Estimate coefficients for each loop covariate + # Note: A key assumption here is that 0-mean residuals + # from projection require no extra terms in variance + # estimate for loop covariates (columns of G), which is + # only true when an intercept is present. + Gps = (Gp ** 2).sum(axis=0) + b = (Gp.T @ yp) / Gps + + # Compute statistics and p values for each regression separately + dof = y.shape[0] - X.shape[1] - 1 + y_resid = yp[:, np.newaxis] - Gp * b + rss = (y_resid ** 2).sum(axis=0) + t_val = b / np.sqrt(rss / dof / Gps) + p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof) + + return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val) + + +def _get_loop_covariates(ds: Dataset, dosage: Optional[str] = None) -> Array: + if dosage is None: + # TODO: This should be (probably gwas-specific) allele + # count with sex chromosome considerations + G = ds["call/genotype"].sum(dim="ploidy") # pragma: no cover + else: + G = ds[dosage] + return da.asarray(G.data) + + +def _get_core_covariates( + ds: Dataset, covariates: Sequence[str], add_intercept: bool = False +) -> Array: + if not add_intercept and not covariates: + raise ValueError( + "At least one covariate must be provided when `add_intercept`=False" + ) + X = da.stack([da.asarray(ds[c].data) for c in covariates]).T + if add_intercept: + X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1) + # Note: dask qr decomp (used by lstsq) requires no chunking in one + # dimension, and because dim 0 will be far greater than the number + # of covariates for the large majority of use cases, chunking + # should be removed from dim 1 + return X.rechunk((None, -1)) + + +def gwas_linear_regression( + ds: Dataset, + covariates: Sequence[str], + dosage: str, + trait: str, + add_intercept: bool = True, +) -> Dataset: + """Run linear regression to identify continuous trait associations with genetic variants + + This method solves OLS regressions for each variant simultaneously and reports + effect statistics as defined in [1]. This is facilitated by the removal of + sample (i.e. person/individual) covariates through orthogonal projection + of both the genetic variant and phenotype data [2]. A consequence of this + rotation is that effect sizes and significances cannot be reported for + covariates, only variants. + + Warning: Regression statistics from this implementation are only valid when an + intercept is present. The `add_intercept` flag is a convenience for adding one + when not already present, but there is currently no parameterization for + intercept-free regression. + + Parameters + ---------- + ds : Dataset + Dataset containing necessary dependent and independent variables + covariates : Sequence[str] + Covariate variable names + dosage : str + Dosage variable name where "dosage" array can contain represent + one of several possible quantities, e.g.: + - Alternate allele counts + - Recessive or dominant allele encodings + - True dosages as computed from imputed or probabilistic variant calls + trait : str + Trait (e.g. phenotype) variable name, must be continuous + add_intercept : bool, optional + Add intercept term to covariate set, by default True + + References + ---------- + - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements + of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition. + Springer Science & Business Media. + - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson, + Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient + Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.” + Nature Genetics 47 (3): 284–90. + + Returns + ------- + Dataset + Regression result containing: + - beta: beta values associated with each independent variant regressed + against the trait + - t_value: T-test statistic for beta estimate + - p_value: P-value for beta estimate (unscaled float in [0, 1]) + """ + G = _get_loop_covariates(ds, dosage=dosage) + Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept) + y = da.asarray(ds[trait].data) + res = _gwas_linear_regression(G.T, Z, y) + return xr.Dataset( + { + "variant/beta": ("variants", res.beta), + "variant/t_value": ("variants", res.t_value), + "variant/p_value": ("variants", res.p_value), + } + ) diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py new file mode 100644 index 000000000..97407a6a5 --- /dev/null +++ b/sgkit/tests/test_association.py @@ -0,0 +1,169 @@ +import warnings +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import pandas as pd +import pytest +import xarray as xr +from pandas import DataFrame +from xarray import Dataset + +from sgkit.stats.association import gwas_linear_regression +from sgkit.typing import ArrayLike + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Ignore: DeprecationWarning: Using or importing the ABCs from 'collections' + # instead of from 'collections.abc' is deprecated since Python 3.3, + # and in 3.9 it will stop working + import statsmodels.api as sm + from statsmodels.regression.linear_model import RegressionResultsWrapper + + +def _generate_test_data( + n: int = 100, + m: int = 10, + p: int = 3, + e_std: float = 0.001, + b_zero_slice: Optional[slice] = None, + seed: Optional[int] = 1, +) -> Tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]: + """Test data simulator for multiple variant associations to a continuous outcome + + Outcomes for each variant are simulated separately based on linear combinations + of randomly generated fixed effect covariates as well as the variant itself. + + This does not add an intercept term in covariates. + + Parameters + ---------- + n : int, optional + Number of samples + m : int, optional + Number of variants + p : int, optional + Number of covariates + e_std : float, optional + Standard deviation for noise term + b_zero_slice : slice + Variant beta values to zero out, defaults to `slice(m // 2)` + meaning that the first half will all be 0. + Set to `slice(0)` to disable. + + Returns + ------- + g : (n, m) array-like + Simulated genotype dosage + x : (n, p) array-like + Simulated covariates + bg : (m,) array-like + Variant betas + ys : (m, n) array-like + Outcomes for each column in genotypes i.e. variant + """ + if b_zero_slice is None: + b_zero_slice = slice(m // 2) + rs = np.random.RandomState(seed) + g = rs.uniform(size=(n, m), low=0, high=2) + x = rs.normal(size=(n, p)) + bg = rs.normal(size=m) + bg[b_zero_slice or slice(m // 2)] = 0 + bx = rs.normal(size=p) + e = rs.normal(size=n, scale=e_std) + + # Simulate y values using each variant independently + ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)]) + return g, x, bg, ys + + +def _generate_test_dataset(**kwargs: Any) -> Dataset: + g, x, bg, ys = _generate_test_data(**kwargs) + data_vars = {} + data_vars["dosage"] = (["variant", "sample"], g.T) + for i in range(x.shape[1]): + data_vars[f"covar_{i}"] = (["sample"], x[:, i]) + for i in range(len(ys)): + data_vars[f"trait_{i}"] = (["sample"], ys[i]) + attrs = dict(beta=bg) + return xr.Dataset(data_vars, attrs=attrs) # type: ignore[arg-type] + + +@pytest.fixture(scope="module") # type: ignore[misc] +def ds() -> Dataset: + return _generate_test_dataset() + + +def _sm_statistics( + ds: Dataset, i: int, add_intercept: bool +) -> RegressionResultsWrapper: + X = [] + # Make sure first independent variable is variant + X.append(ds["dosage"].values[i]) + for v in [c for c in list(ds.keys()) if c.startswith("covar_")]: + X.append(ds[v].values) + if add_intercept: + X.append(np.ones(ds.dims["sample"])) + X = np.stack(X).T + y = ds[f"trait_{i}"].values + + return sm.OLS(y, X, hasconst=True).fit() + + +def _get_statistics( + ds: Dataset, add_intercept: bool, **kwargs: Any +) -> Tuple[DataFrame, DataFrame]: + df_pred: List[Dict[str, Any]] = [] + df_true: List[Dict[str, Any]] = [] + for i in range(ds.dims["variant"]): + dsr = gwas_linear_regression( + ds, + dosage="dosage", + trait=f"trait_{i}", + add_intercept=add_intercept, + **kwargs, + ) + res = _sm_statistics(ds, i, add_intercept) + df_pred.append( + dsr.to_dataframe() # type: ignore[no-untyped-call] + .rename(columns=lambda c: c.replace("variant/", "")) + .iloc[i] + .to_dict() + ) + df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0])) + return pd.DataFrame(df_pred), pd.DataFrame(df_true) + + +def test_linear_regression_statistics(ds): + def validate(dfp: DataFrame, dft: DataFrame) -> None: + print(dfp) + print(dft) + + # Validate results at a higher level, looking only for recapitulation + # of more obvious inferences based on how the data was simulated + np.testing.assert_allclose(dfp["beta"], ds.attrs["beta"], atol=1e-3) + mid_idx = ds.dims["variant"] // 2 + assert np.all(dfp["p_value"].iloc[:mid_idx] > 0.05) + assert np.all(dfp["p_value"].iloc[mid_idx:] < 0.05) + + # Validate more precisely against statsmodels results + np.testing.assert_allclose(dfp["t_value"], dft["t_value"]) + np.testing.assert_allclose(dfp["p_value"], dft["p_value"]) + + dfp, dft = _get_statistics( + ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True + ) + validate(dfp, dft) + + dfp, dft = _get_statistics( + ds.assign(covar_3=("sample", np.ones(ds.dims["sample"]))), + covariates=["covar_0", "covar_1", "covar_2", "covar_3"], + add_intercept=False, + ) + validate(dfp, dft) + + +def test_linear_regression_raise_on_no_covars(ds): + with pytest.raises(ValueError, match="At least one covariate must be provided"): + gwas_linear_regression( + ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False + ) diff --git a/sgkit/typing.py b/sgkit/typing.py index 3dea6b2be..fab53b2a9 100644 --- a/sgkit/typing.py +++ b/sgkit/typing.py @@ -1,3 +1,7 @@ -from typing import Any +from typing import Any, Union + +import dask.array as da +import numpy as np DType = Any +ArrayLike = Union[np.ndarray, da.Array]