sgkit-dev · eric-czech · Jul 17, 2020 · Jun 29, 2020 · Jun 30, 2020 · Jun 30, 2020
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,3 @@
+[run]
+omit =
+    sgkit/tests/*
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -21,7 +21,7 @@ jobs:
       uses: pre-commit/action@v2.0.0
     - name: Test with pytest and coverage
       run: |
-        pytest -v --cov=sgkit
+        pytest -v --cov=sgkit --cov-report term-missing
     - name: Upload coverage to Codecov
       env:
         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -29,5 +29,5 @@ repos:
     rev: v0.782
     hooks:
       - id: mypy
-        args: ["--strict"]
+        args: ["--strict", "--show-error-codes"]
         additional_dependencies: ["numpy", "xarray"]
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ pytest
 To check code coverage and get a coverage report, run
 
 ```bash
-pytest --cov=sgkit
+pytest --cov=sgkit --cov-report term-missing
 ```
 
 ### Code standards

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -2,3 +2,4 @@ codecov
 pre-commit
 pytest
 pytest-cov
+statsmodels
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,4 @@
 numpy
 xarray
+dask[array]
+scipy
diff --git a/setup.cfg b/setup.cfg
@@ -51,7 +51,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = numpy,pytest,setuptools,xarray
+known_third_party = dask,numpy,pandas,pytest,setuptools,xarray
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -60,8 +60,14 @@ line_length = 88
 
 [mypy-numpy.*]
 ignore_missing_imports = True
+[mypy-pandas.*]
+ignore_missing_imports = True
+[mypy-dask.*]
+ignore_missing_imports = True
 [mypy-pytest.*]
 ignore_missing_imports = True
+[mypy-statsmodels.*]
+ignore_missing_imports = True
 [mypy-setuptools]
 ignore_missing_imports = True
 [mypy-sgkit.tests.*]

diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -5,11 +5,13 @@
     DIM_VARIANT,
     create_genotype_call_dataset,
 )
+from .stats.association import gwas_linear_regression
 
 __all__ = [
     "DIM_ALLELE",
     "DIM_PLOIDY",
     "DIM_SAMPLE",
     "DIM_VARIANT",
     "create_genotype_call_dataset",
+    "gwas_linear_regression",
 ]
diff --git a/sgkit/stats/__init__.py b/sgkit/stats/__init__.py
diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py
@@ -0,0 +1,160 @@
+import collections
+from typing import Optional, Sequence
+
+import dask.array as da
+import numpy as np
+import xarray as xr
+from dask.array import Array, stats
+from xarray import Dataset
+
+from ..typing import ArrayLike
+
+LinearRegressionResult = collections.namedtuple(
+    "LinearRegressionResult", ["beta", "t_value", "p_value"]
+)
+
+
+def _gwas_linear_regression(
+    G: ArrayLike, X: ArrayLike, y: ArrayLike
+) -> LinearRegressionResult:
+    """Efficient linear regression estimation for multiple covariate sets
+
+    Parameters
+    ----------
+    G : (M, N) array-like
+        "Loop" covariates for which a separate regression will be fit to
+        individual columns
+    X : (M, P) array-like
+        "Core" covariates that are included in the regressions along
+        with each loop covariate
+    y : (M,)
+        Continuous outcome
+
+    Returns
+    -------
+    LinearRegressionResult
+        Regression statistics and coefficient estimates
+    """
+    G, X = da.asarray(G), da.asarray(X)
+
+    # Apply orthogonal projection to eliminate core covariates
+    # Note: QR factorization or SVD should be used here to find
+    # what are effectively OLS residuals rather than matrix inverse
+    # to avoid need for MxM array; additionally, dask.lstsq will not
+    # work with numpy arrays
+    Gp = G - X @ da.linalg.lstsq(X, G)[0]
+    yp = y - X @ da.linalg.lstsq(X, y)[0]
+
+    # Estimate coefficients for each loop covariate
+    # Note: A key assumption here is that 0-mean residuals
+    # from projection require no extra terms in variance
+    # estimate for loop covariates (columns of G), which is
+    # only true when an intercept is present.
+    Gps = (Gp ** 2).sum(axis=0)
+    b = (Gp.T @ yp) / Gps
+
+    # Compute statistics and p values for each regression separately
+    dof = y.shape[0] - X.shape[1] - 1
+    y_resid = yp[:, np.newaxis] - Gp * b
+    rss = (y_resid ** 2).sum(axis=0)
+    t_val = b / np.sqrt(rss / dof / Gps)
+    p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof)
+
+    return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val)
+
+
+def _get_loop_covariates(ds: Dataset, dosage: Optional[str] = None) -> Array:
+    if dosage is None:
+        # TODO: This should be (probably gwas-specific) allele
+        # count with sex chromosome considerations
+        G = ds["call/genotype"].sum(dim="ploidy")  # pragma: no cover
+    else:
+        G = ds[dosage]
+    return da.asarray(G.data)
+
+
+def _get_core_covariates(
+    ds: Dataset, covariates: Sequence[str], add_intercept: bool = False
+) -> Array:
+    if not add_intercept and not covariates:
+        raise ValueError(
+            "At least one covariate must be provided when `add_intercept`=False"
+        )
+    X = da.stack([da.asarray(ds[c].data) for c in covariates]).T
+    if add_intercept:
+        X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1)
+    # Note: dask qr decomp (used by lstsq) requires no chunking in one
+    # dimension, and because dim 0 will be far greater than the number
+    # of covariates for the large majority of use cases, chunking
+    # should be removed from dim 1
+    return X.rechunk((None, -1))
+
+
+def gwas_linear_regression(
+    ds: Dataset,
+    covariates: Sequence[str],
+    dosage: str,
+    trait: str,
+    add_intercept: bool = True,
+) -> Dataset:
+    """Run linear regression to identify continuous trait associations with genetic variants
+
+    This method solves OLS regressions for each variant simultaneously and reports
+    effect statistics as defined in [1]. This is facilitated by the removal of
+    sample (i.e. person/individual) covariates through orthogonal projection
+    of both the genetic variant and phenotype data [2]. A consequence of this
+    rotation is that effect sizes and significances cannot be reported for
+    covariates, only variants.
+
+    Warning: Regression statistics from this implementation are only valid when an
+    intercept is present. The `add_intercept` flag is a convenience for adding one
+    when not already present, but there is currently no parameterization for
+    intercept-free regression.
+
+    Parameters
+    ----------
+    ds : Dataset
+        Dataset containing necessary dependent and independent variables
+    covariates : Sequence[str]
+        Covariate variable names
+    dosage : str
+        Dosage variable name where "dosage" array can contain represent
+        one of several possible quantities, e.g.:
+        - Alternate allele counts
+        - Recessive or dominant allele encodings
+        - True dosages as computed from imputed or probabilistic variant calls
+    trait : str
+        Trait (e.g. phenotype) variable name, must be continuous
+    add_intercept : bool, optional
+        Add intercept term to covariate set, by default True
+
+    References
+    ----------
+    - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
+        of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition.
+        Springer Science & Business Media.
+    - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson,
+        Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient
+        Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
+        Nature Genetics 47 (3): 284–90.
+
+    Returns
+    -------
+    Dataset
+        Regression result containing:
+        - beta: beta values associated with each independent variant regressed
+            against the trait
+        - t_value: T-test statistic for beta estimate
+        - p_value: P-value for beta estimate (unscaled float in [0, 1])
+    """
+    G = _get_loop_covariates(ds, dosage=dosage)
+    Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept)
+    y = da.asarray(ds[trait].data)
+    res = _gwas_linear_regression(G.T, Z, y)
+    return xr.Dataset(
+        {
+            "variant/beta": ("variants", res.beta),
+            "variant/t_value": ("variants", res.t_value),
+            "variant/p_value": ("variants", res.p_value),
+        }
+    )