diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 000000000..083aa052e
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,3 @@
+[run]
+omit =
+    sgkit/tests/*
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cda264383..432d9e5ff 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,7 +21,7 @@ jobs:
       uses: pre-commit/action@v2.0.0
     - name: Test with pytest and coverage
       run: |
-        pytest -v --cov=sgkit
+        pytest -v --cov=sgkit --cov-report term-missing
     - name: Upload coverage to Codecov
       env:
         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0b114f16f..48b567f70 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,5 +29,5 @@ repos:
     rev: v0.782
     hooks:
       - id: mypy
-        args: ["--strict"]
-        additional_dependencies: ["numpy", "xarray"]
+        args: ["--strict", "--show-error-codes"]
+        additional_dependencies: ["numpy", "xarray", "dask[array]", "scipy"]
diff --git a/README.md b/README.md
index 4c3c754fc..44ab49331 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ pytest
 To check code coverage and get a coverage report, run
 
 ```bash
-pytest --cov=sgkit
+pytest --cov=sgkit --cov-report term-missing
 ```
 
 ### Code standards
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 875f397ba..c5a02b2b5 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,3 +2,4 @@ codecov
 pre-commit
 pytest
 pytest-cov
+statsmodels
diff --git a/requirements.txt b/requirements.txt
index 6fefda50d..894396a41 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 numpy
 xarray
+dask[array]
+scipy
diff --git a/setup.cfg b/setup.cfg
index 403ada3af..c8097812a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,7 +51,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = numpy,pytest,setuptools,xarray
+known_third_party = dask,numpy,pandas,pytest,setuptools,xarray
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -60,8 +60,14 @@ line_length = 88
 
 [mypy-numpy.*]
 ignore_missing_imports = True
+[mypy-pandas.*]
+ignore_missing_imports = True
+[mypy-dask.*]
+ignore_missing_imports = True
 [mypy-pytest.*]
 ignore_missing_imports = True
+[mypy-statsmodels.*]
+ignore_missing_imports = True
 [mypy-setuptools]
 ignore_missing_imports = True
 [mypy-sgkit.tests.*]
diff --git a/sgkit/__init__.py b/sgkit/__init__.py
index 8b170a470..6417b5646 100644
--- a/sgkit/__init__.py
+++ b/sgkit/__init__.py
@@ -5,6 +5,7 @@
     DIM_VARIANT,
     create_genotype_call_dataset,
 )
+from .stats.association import gwas_linear_regression
 
 __all__ = [
     "DIM_ALLELE",
@@ -12,4 +13,5 @@
     "DIM_SAMPLE",
     "DIM_VARIANT",
     "create_genotype_call_dataset",
+    "gwas_linear_regression",
 ]
diff --git a/sgkit/stats/__init__.py b/sgkit/stats/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py
new file mode 100644
index 000000000..65e7061f6
--- /dev/null
+++ b/sgkit/stats/association.py
@@ -0,0 +1,163 @@
+from dataclasses import dataclass
+from typing import Optional, Sequence
+
+import dask.array as da
+import numpy as np
+import xarray as xr
+from dask.array import Array, stats
+from xarray import Dataset
+
+from ..typing import ArrayLike
+
+
+@dataclass
+class LinearRegressionResult:
+    beta: ArrayLike
+    t_value: ArrayLike
+    p_value: ArrayLike
+
+
+def _gwas_linear_regression(
+    G: ArrayLike, X: ArrayLike, y: ArrayLike
+) -> LinearRegressionResult:
+    """Efficient linear regression estimation for multiple covariate sets
+
+    Parameters
+    ----------
+    G : (M, N) array-like
+        "Loop" covariates for which a separate regression will be fit to
+        individual columns
+    X : (M, P) array-like
+        "Core" covariates that are included in the regressions along
+        with each loop covariate
+    y : (M,)
+        Continuous outcome
+
+    Returns
+    -------
+    LinearRegressionResult
+        Regression statistics and coefficient estimates
+    """
+    G, X = da.asarray(G), da.asarray(X)
+
+    # Apply orthogonal projection to eliminate core covariates
+    # Note: QR factorization or SVD should be used here to find
+    # what are effectively OLS residuals rather than matrix inverse
+    # to avoid need for MxM array; additionally, dask.lstsq will not
+    # work with numpy arrays
+    Gp = G - X @ da.linalg.lstsq(X, G)[0]
+    yp = y - X @ da.linalg.lstsq(X, y)[0]
+
+    # Estimate coefficients for each loop covariate
+    # Note: A key assumption here is that 0-mean residuals
+    # from projection require no extra terms in variance
+    # estimate for loop covariates (columns of G), which is
+    # only true when an intercept is present.
+    Gps = (Gp ** 2).sum(axis=0)
+    b = (Gp.T @ yp) / Gps
+
+    # Compute statistics and p values for each regression separately
+    dof = y.shape[0] - X.shape[1] - 1
+    y_resid = yp[:, np.newaxis] - Gp * b
+    rss = (y_resid ** 2).sum(axis=0)
+    t_val = b / np.sqrt(rss / dof / Gps)
+    p_val = 2 * stats.distributions.t.sf(np.abs(t_val), dof)
+
+    return LinearRegressionResult(beta=b, t_value=t_val, p_value=p_val)
+
+
+def _get_loop_covariates(ds: Dataset, dosage: Optional[str] = None) -> Array:
+    if dosage is None:
+        # TODO: This should be (probably gwas-specific) allele
+        # count with sex chromosome considerations
+        G = ds["call/genotype"].sum(dim="ploidy")  # pragma: no cover
+    else:
+        G = ds[dosage]
+    return da.asarray(G.data)
+
+
+def _get_core_covariates(
+    ds: Dataset, covariates: Sequence[str], add_intercept: bool = False
+) -> Array:
+    if not add_intercept and not covariates:
+        raise ValueError(
+            "At least one covariate must be provided when `add_intercept`=False"
+        )
+    X = da.stack([da.asarray(ds[c].data) for c in covariates]).T
+    if add_intercept:
+        X = da.concatenate([da.ones((X.shape[0], 1)), X], axis=1)
+    # Note: dask qr decomp (used by lstsq) requires no chunking in one
+    # dimension, and because dim 0 will be far greater than the number
+    # of covariates for the large majority of use cases, chunking
+    # should be removed from dim 1
+    return X.rechunk((None, -1))
+
+
+def gwas_linear_regression(
+    ds: Dataset,
+    covariates: Sequence[str],
+    dosage: str,
+    trait: str,
+    add_intercept: bool = True,
+) -> Dataset:
+    """Run linear regression to identify continuous trait associations with genetic variants
+
+    This method solves OLS regressions for each variant simultaneously and reports
+    effect statistics as defined in [1]. This is facilitated by the removal of
+    sample (i.e. person/individual) covariates through orthogonal projection
+    of both the genetic variant and phenotype data [2]. A consequence of this
+    rotation is that effect sizes and significances cannot be reported for
+    covariates, only variants.
+
+    Warning: Regression statistics from this implementation are only valid when an
+    intercept is present. The `add_intercept` flag is a convenience for adding one
+    when not already present, but there is currently no parameterization for
+    intercept-free regression.
+
+    Parameters
+    ----------
+    ds : Dataset
+        Dataset containing necessary dependent and independent variables
+    covariates : Sequence[str]
+        Covariate variable names
+    dosage : str
+        Dosage variable name where "dosage" array can contain represent
+        one of several possible quantities, e.g.:
+        - Alternate allele counts
+        - Recessive or dominant allele encodings
+        - True dosages as computed from imputed or probabilistic variant calls
+    trait : str
+        Trait (e.g. phenotype) variable name, must be continuous
+    add_intercept : bool, optional
+        Add intercept term to covariate set, by default True
+
+    References
+    ----------
+    - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
+        of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition.
+        Springer Science & Business Media.
+    - [2] Loh, Po-Ru, George Tucker, Brendan K. Bulik-Sullivan, Bjarni J. Vilhjálmsson,
+        Hilary K. Finucane, Rany M. Salem, Daniel I. Chasman, et al. 2015. “Efficient
+        Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
+        Nature Genetics 47 (3): 284–90.
+
+    Returns
+    -------
+    Dataset
+        Regression result containing:
+        - beta: beta values associated with each independent variant regressed
+            against the trait
+        - t_value: T-test statistic for beta estimate
+        - p_value: P-value for beta estimate (unscaled float in [0, 1])
+    """
+    G = _get_loop_covariates(ds, dosage=dosage)
+    Z = _get_core_covariates(ds, covariates, add_intercept=add_intercept)
+    y = da.asarray(ds[trait].data)
+    res = _gwas_linear_regression(G.T, Z, y)
+    return xr.Dataset(
+        {
+            "variant/beta": ("variants", res.beta),
+            "variant/t_value": ("variants", res.t_value),
+            "variant/p_value": ("variants", res.p_value),
+        }
+    )
diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py
new file mode 100644
index 000000000..97407a6a5
--- /dev/null
+++ b/sgkit/tests/test_association.py
@@ -0,0 +1,169 @@
+import warnings
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import pytest
+import xarray as xr
+from pandas import DataFrame
+from xarray import Dataset
+
+from sgkit.stats.association import gwas_linear_regression
+from sgkit.typing import ArrayLike
+
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore", DeprecationWarning)
+    # Ignore: DeprecationWarning: Using or importing the ABCs from 'collections'
+    # instead of from 'collections.abc' is deprecated since Python 3.3,
+    # and in 3.9 it will stop working
+    import statsmodels.api as sm
+    from statsmodels.regression.linear_model import RegressionResultsWrapper
+
+
+def _generate_test_data(
+    n: int = 100,
+    m: int = 10,
+    p: int = 3,
+    e_std: float = 0.001,
+    b_zero_slice: Optional[slice] = None,
+    seed: Optional[int] = 1,
+) -> Tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]:
+    """Test data simulator for multiple variant associations to a continuous outcome
+
+    Outcomes for each variant are simulated separately based on linear combinations
+    of randomly generated fixed effect covariates as well as the variant itself.
+
+    This does not add an intercept term in covariates.
+
+    Parameters
+    ----------
+    n : int, optional
+        Number of samples
+    m : int, optional
+        Number of variants
+    p : int, optional
+        Number of covariates
+    e_std : float, optional
+        Standard deviation for noise term
+    b_zero_slice : slice
+        Variant beta values to zero out, defaults to `slice(m // 2)`
+        meaning that the first half will all be 0.
+        Set to `slice(0)` to disable.
+
+    Returns
+    -------
+    g : (n, m) array-like
+        Simulated genotype dosage
+    x : (n, p) array-like
+        Simulated covariates
+    bg : (m,) array-like
+        Variant betas
+    ys : (m, n) array-like
+        Outcomes for each column in genotypes i.e. variant
+    """
+    if b_zero_slice is None:
+        b_zero_slice = slice(m // 2)
+    rs = np.random.RandomState(seed)
+    g = rs.uniform(size=(n, m), low=0, high=2)
+    x = rs.normal(size=(n, p))
+    bg = rs.normal(size=m)
+    bg[b_zero_slice or slice(m // 2)] = 0
+    bx = rs.normal(size=p)
+    e = rs.normal(size=n, scale=e_std)
+
+    # Simulate y values using each variant independently
+    ys = np.array([g[:, i] * bg[i] + x @ bx + e for i in range(m)])
+    return g, x, bg, ys
+
+
+def _generate_test_dataset(**kwargs: Any) -> Dataset:
+    g, x, bg, ys = _generate_test_data(**kwargs)
+    data_vars = {}
+    data_vars["dosage"] = (["variant", "sample"], g.T)
+    for i in range(x.shape[1]):
+        data_vars[f"covar_{i}"] = (["sample"], x[:, i])
+    for i in range(len(ys)):
+        data_vars[f"trait_{i}"] = (["sample"], ys[i])
+    attrs = dict(beta=bg)
+    return xr.Dataset(data_vars, attrs=attrs)  # type: ignore[arg-type]
+
+
+@pytest.fixture(scope="module")  # type: ignore[misc]
+def ds() -> Dataset:
+    return _generate_test_dataset()
+
+
+def _sm_statistics(
+    ds: Dataset, i: int, add_intercept: bool
+) -> RegressionResultsWrapper:
+    X = []
+    # Make sure first independent variable is variant
+    X.append(ds["dosage"].values[i])
+    for v in [c for c in list(ds.keys()) if c.startswith("covar_")]:
+        X.append(ds[v].values)
+    if add_intercept:
+        X.append(np.ones(ds.dims["sample"]))
+    X = np.stack(X).T
+    y = ds[f"trait_{i}"].values
+
+    return sm.OLS(y, X, hasconst=True).fit()
+
+
+def _get_statistics(
+    ds: Dataset, add_intercept: bool, **kwargs: Any
+) -> Tuple[DataFrame, DataFrame]:
+    df_pred: List[Dict[str, Any]] = []
+    df_true: List[Dict[str, Any]] = []
+    for i in range(ds.dims["variant"]):
+        dsr = gwas_linear_regression(
+            ds,
+            dosage="dosage",
+            trait=f"trait_{i}",
+            add_intercept=add_intercept,
+            **kwargs,
+        )
+        res = _sm_statistics(ds, i, add_intercept)
+        df_pred.append(
+            dsr.to_dataframe()  # type: ignore[no-untyped-call]
+            .rename(columns=lambda c: c.replace("variant/", ""))
+            .iloc[i]
+            .to_dict()
+        )
+        df_true.append(dict(t_value=res.tvalues[0], p_value=res.pvalues[0]))
+    return pd.DataFrame(df_pred), pd.DataFrame(df_true)
+
+
+def test_linear_regression_statistics(ds):
+    def validate(dfp: DataFrame, dft: DataFrame) -> None:
+        print(dfp)
+        print(dft)
+
+        # Validate results at a higher level, looking only for recapitulation
+        # of more obvious inferences based on how the data was simulated
+        np.testing.assert_allclose(dfp["beta"], ds.attrs["beta"], atol=1e-3)
+        mid_idx = ds.dims["variant"] // 2
+        assert np.all(dfp["p_value"].iloc[:mid_idx] > 0.05)
+        assert np.all(dfp["p_value"].iloc[mid_idx:] < 0.05)
+
+        # Validate more precisely against statsmodels results
+        np.testing.assert_allclose(dfp["t_value"], dft["t_value"])
+        np.testing.assert_allclose(dfp["p_value"], dft["p_value"])
+
+    dfp, dft = _get_statistics(
+        ds, covariates=["covar_0", "covar_1", "covar_2"], add_intercept=True
+    )
+    validate(dfp, dft)
+
+    dfp, dft = _get_statistics(
+        ds.assign(covar_3=("sample", np.ones(ds.dims["sample"]))),
+        covariates=["covar_0", "covar_1", "covar_2", "covar_3"],
+        add_intercept=False,
+    )
+    validate(dfp, dft)
+
+
+def test_linear_regression_raise_on_no_covars(ds):
+    with pytest.raises(ValueError, match="At least one covariate must be provided"):
+        gwas_linear_regression(
+            ds, covariates=[], dosage="dosage", trait="trait_0", add_intercept=False
+        )
diff --git a/sgkit/typing.py b/sgkit/typing.py
index 3dea6b2be..fab53b2a9 100644
--- a/sgkit/typing.py
+++ b/sgkit/typing.py
@@ -1,3 +1,7 @@
-from typing import Any
+from typing import Any, Union
+
+import dask.array as da
+import numpy as np
 
 DType = Any
+ArrayLike = Union[np.ndarray, da.Array]