diff --git a/docs/api.rst b/docs/api.rst index 883328521..3bfdee620 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -34,6 +34,48 @@ Methods Tajimas_D pc_relate +Variables +========= + +.. autosummary:: + :toctree: generated/ + + variables.call_genotype + variables.call_genotype_mask + variables.variant_contig + variables.variant_position + variables.variant_allele + variables.sample_id + variables.call_genotype_phased + variables.variant_id + variables.call_dosage + variables.call_dosage_mask + variables.call_genotype_probability + variables.call_genotype_probability_mask + variables.genotype_counts + variables.call_allele_count + variables.variant_allele_count + variables.variant_hwe_p_value + variables.variant_beta + variables.variant_t_value + variables.variant_p_value + variables.covariates + variables.traits + variables.dosage + variables.sample_pcs + variables.pc_relate_phi + variables.base_prediction + variables.meta_prediction + variables.loco_prediction + variables.variant_n_called + variables.variant_call_rate + variables.variant_n_het + variables.variant_n_hom_ref + variables.variant_n_hom_alt + variables.variant_n_non_ref + variables.variant_allele_total + variables.variant_allele_frequency + Utilities ========= diff --git a/docs/conf.py b/docs/conf.py index 0c2fb4c39..db7539ded 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,6 +6,8 @@ # -- Path setup -------------------------------------------------------------- +import logging as pylogging + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
@@ -15,6 +17,7 @@ from pathlib import Path import xarray +from sphinx.util import logging sys.path.insert(0, os.path.abspath("..")) @@ -50,6 +53,26 @@ *[p.stem for p in (HERE / "extensions").glob("*.py")], ] + +# Workaround https://github.com/agronholm/sphinx-autodoc-typehints/issues/123 +# When this https://github.com/agronholm/sphinx-autodoc-typehints/pull/153 +# gets merged, we can remove this +class FilterForIssue123(pylogging.Filter): + def filter(self, record: pylogging.LogRecord) -> bool: + msg = record.getMessage() + return not ( + msg.startswith("Cannot treat a function") + and any( + s in msg + for s in ["sgkit.variables.Spec", "sgkit.variables.ArrayLikeSpec"] + ) + ) + + +logging.getLogger("sphinx_autodoc_typehints").logger.addFilter(FilterForIssue123()) +# End of workaround + + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/docs/extensions/typed_returns.py b/docs/extensions/typed_returns.py index 86fe58331..6402739b0 100644 --- a/docs/extensions/typed_returns.py +++ b/docs/extensions/typed_returns.py @@ -7,8 +7,8 @@ import re from typing import Iterator, List -from sphinx.application import Sphinx # type: ignore -from sphinx.ext.napoleon import NumpyDocstring # type: ignore +from sphinx.application import Sphinx +from sphinx.ext.napoleon import NumpyDocstring def process_return(lines: List[str]) -> Iterator[str]: diff --git a/setup.cfg b/setup.cfg index 70732bcc7..d183cae7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -92,6 +92,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-sklearn.*] ignore_missing_imports = True +[mypy-sphinx.*] +ignore_missing_imports = True [mypy-sgkit.*] allow_redefinition = True [mypy-sgkit.tests.*] diff --git a/sgkit/__init__.py b/sgkit/__init__.py index ce24c0ea1..24848f7e9 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -35,4 +35,5 @@ "Fst", "Tajimas_D", "pc_relate", + "variables", ] diff --git a/sgkit/variables.py 
b/sgkit/variables.py index aeaf7f2f2..567f2036e 100644 --- a/sgkit/variables.py +++ b/sgkit/variables.py @@ -21,97 +21,101 @@ class ArrayLikeSpec(Spec): call_genotype = ArrayLikeSpec("call_genotype", kind="i", ndim=3) """ -Genotype, encoded as allele values (0 for the reference, 1 for +Call genotype. Encoded as allele values (0 for the reference, 1 for the first allele, 2 for the second allele), or -1 to indicate a missing value. """ call_genotype_mask = ArrayLikeSpec("call_genotype_mask", kind="b", ndim=3) +"""TODO""" variant_contig = ArrayLikeSpec("variant_contig", kind="i", ndim=1) -"""The (index of the) contig for each variant""" +"""The (index of the) contig for each variant.""" variant_position = ArrayLikeSpec("variant_position", kind="i", ndim=1) -"""The reference position of the variant""" +"""The reference position of the variant.""" variant_allele = ArrayLikeSpec("variant_allele", kind={"S", "O"}, ndim=2) -"""The possible alleles for the variant""" +"""The possible alleles for the variant.""" sample_id = ArrayLikeSpec("sample_id", kind={"U", "O"}, ndim=1) -"""The unique identifier of the sample""" +"""The unique identifier of the sample.""" call_genotype_phased = ArrayLikeSpec("call_genotype_phased", kind="b", ndim=2) """ -A flag for each call indicating if it is phased or not. If -omitted all calls are unphased. +A flag for each call indicating if it is phased or not. If omitted +all calls are unphased. 
""" variant_id = ArrayLikeSpec("variant_id", kind="U", ndim=1) -"""The unique identifier of the variant""" +"""The unique identifier of the variant.""" call_dosage = ArrayLikeSpec("call_dosage", kind="f", ndim=2) -"""Dosages, encoded as floats, with NaN indicating a missing value""" +"""Dosages, encoded as floats, with NaN indicating a missing value.""" call_dosage_mask = ArrayLikeSpec("call_dosage_mask", kind="b", ndim=2) +"""TODO""" call_genotype_probability = ArrayLikeSpec("call_genotype_probability", kind="f", ndim=3) +"""TODO""" call_genotype_probability_mask = ArrayLikeSpec( "call_genotype_probability_mask", kind="b", ndim=3 ) +"""TODO""" genotype_counts = ArrayLikeSpec("genotype_counts", ndim=2, kind="i") """ -Genotype counts, must correspond to an (`N`, 3) array where `N` is equal +Genotype counts. Must correspond to an (`N`, 3) array where `N` is equal to the number of variants and the 3 columns contain heterozygous, homozygous reference, and homozygous alternate counts (in that order) across all samples for a variant. """ call_allele_count = ArrayLikeSpec("call_allele_count", ndim=3, kind="u") """ -Allele counts with shape (variants, samples, alleles) and values +Allele counts. With shape (variants, samples, alleles) and values corresponding to the number of non-missing occurrences of each allele. """ variant_allele_count = ArrayLikeSpec("variant_allele_count", ndim=2, kind="u") """ -Variant allele counts with shape (variants, alleles) and values +Variant allele counts. With shape (variants, alleles) and values corresponding to the number of non-missing occurrences of each allele. 
""" variant_hwe_p_value = ArrayLikeSpec("variant_hwe_p_value", kind="f") -"""P values from HWE test for each variant as float in [0, 1]""" +"""P values from HWE test for each variant as float in [0, 1].""" variant_beta = ArrayLikeSpec("variant_beta") -"""Beta values associated with each variant and trait""" +"""Beta values associated with each variant and trait.""" variant_t_value = ArrayLikeSpec("variant_t_value") -"""T statistics for each beta""" +"""T statistics for each beta.""" variant_p_value = ArrayLikeSpec("variant_p_value", kind="f") -"""P values as float in [0, 1]""" +"""P values as float in [0, 1].""" covariates = ArrayLikeSpec("covariates", ndim={1, 2}) """ -Covariate variable names, must correspond to 1 or 2D dataset +Covariate variable names. Must correspond to 1 or 2D dataset variables of shape (samples[, covariates]). All covariate arrays will be concatenated along the second axis (columns). """ traits = ArrayLikeSpec("traits", ndim={1, 2}) """ -Trait (e.g. phenotype) variable names, must all be continuous and +Trait (for example phenotype) variable names. Must all be continuous and correspond to 1 or 2D dataset variables of shape (samples[, traits]). 2D trait arrays will be assumed to contain separate traits within columns and concatenated to any 1D traits along the second axis (columns). """ dosage = ArrayLikeSpec("dosage") """ -Dosage variable name where "dosage" array can contain represent +Dosage variable name. 
Where "dosage" array can represent one of several possible quantities, e.g.: - - Alternate allele counts - - Recessive or dominant allele encodings - - True dosages as computed from imputed or probabilistic variant calls - - Any other custom encoding in a user-defined variable +- Alternate allele counts +- Recessive or dominant allele encodings +- True dosages as computed from imputed or probabilistic variant calls +- Any other custom encoding in a user-defined variable """ sample_pcs = ArrayLikeSpec("sample_pcs", ndim=2, kind="f") -"""Sample PCs. Dimensions: (PCxS)""" +"""Sample PCs (PCxS).""" pc_relate_phi = ArrayLikeSpec("pc_relate_phi", ndim=2, kind="f") -"""PC Relate kinship coefficient matrix""" +"""PC Relate kinship coefficient matrix.""" base_prediction = ArrayLikeSpec("base_prediction", ndim=4, kind="f") """ -REGENIE's base prediction: (blocks, alphas, samples, outcomes): Stage 1 +REGENIE's base prediction (blocks, alphas, samples, outcomes). Stage 1 predictions from ridge regression reduction. """ meta_prediction = ArrayLikeSpec("meta_prediction", ndim=2, kind="f") """ -REGENIE's meta_prediction: (samples, outcomes): Stage 2 predictions from +REGENIE's meta_prediction (samples, outcomes). Stage 2 predictions from the best meta estimator trained on the out-of-sample Stage 1 predictions. """ loco_prediction = ArrayLikeSpec("loco_prediction", ndim=3, kind="f") """ -REGENIE's loco_prediction: (contigs, samples, outcomes): LOCO predictions +REGENIE's loco_prediction (contigs, samples, outcomes). LOCO predictions resulting from Stage 2 predictions ignoring effects for variant blocks on held out contigs. This will be absent if the data provided does not contain at least 2 contigs.
@@ -119,9 +123,9 @@ class ArrayLikeSpec(Spec): variant_n_called = ArrayLikeSpec("variant_n_called", ndim=1, kind="i") """The number of samples with called genotypes.""" variant_call_rate = ArrayLikeSpec("variant_call_rate", ndim=1, kind="f") -"""The number of samples with heterozygous calls""" +"""The fraction of samples with called genotypes.""" variant_n_het = ArrayLikeSpec("variant_n_het", ndim=1, kind="i") -"""The number of samples with heterozygous calls""" +"""The number of samples with heterozygous calls.""" variant_n_hom_ref = ArrayLikeSpec("variant_n_hom_ref", ndim=1, kind="i") """The number of samples with homozygous reference calls.""" variant_n_hom_alt = ArrayLikeSpec("variant_n_hom_alt", ndim=1, kind="i") @@ -150,7 +154,7 @@ def register_variable(cls, spec: ArrayLikeSpec) -> None: @classmethod @overload - def validate( + def _validate( cls, xr_dataset: xr.Dataset, *specs: Mapping[Hashable, ArrayLikeSpec], @@ -163,7 +167,7 @@ def validate( @classmethod @overload - def validate(cls, xr_dataset: xr.Dataset, *specs: ArrayLikeSpec) -> xr.Dataset: + def _validate(cls, xr_dataset: xr.Dataset, *specs: ArrayLikeSpec) -> xr.Dataset: """ Validate that xr_dataset contains array(s) of interest with default variable name(s). @@ -172,7 +176,7 @@ def validate(cls, xr_dataset: xr.Dataset, *specs: ArrayLikeSpec) -> xr.Dataset: @classmethod @overload - def validate(cls, xr_dataset: xr.Dataset, *specs: Hashable) -> xr.Dataset: + def _validate(cls, xr_dataset: xr.Dataset, *specs: Hashable) -> xr.Dataset: """ Validate that xr_dataset contains array(s) of interest with variable name(s). Variable must be registered in `SgkitVariables.registered_variables`. @@ -180,7 +184,7 @@ def validate(cls, xr_dataset: xr.Dataset, *specs: Hashable) -> xr.Dataset: ...
@classmethod - def validate( + def _validate( cls, xr_dataset: xr.Dataset, *specs: Union[ArrayLikeSpec, Mapping[Hashable, ArrayLikeSpec], Hashable], @@ -217,5 +221,5 @@ def _check_field( ) from e -validate = SgkitVariables.validate -"""Shorthand for SgkitVariables.validate""" +validate = SgkitVariables._validate +"""Shortcut for SgkitVariables._validate."""