diff --git a/docs/api.rst b/docs/api.rst index 6a075f813..d61d9e68c 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -27,9 +27,9 @@ Methods gwas_linear_regression hardy_weinberg_test regenie + variant_stats Utilities -========= .. autosummary:: :toctree: generated/ diff --git a/sgkit/__init__.py b/sgkit/__init__.py index 807bf7293..9a90dc1f0 100644 --- a/sgkit/__init__.py +++ b/sgkit/__init__.py @@ -8,7 +8,7 @@ ) from .display import display_genotypes from .io.vcfzarr_reader import read_vcfzarr -from .stats.aggregation import count_call_alleles, count_variant_alleles +from .stats.aggregation import count_call_alleles, count_variant_alleles, variant_stats from .stats.association import gwas_linear_regression from .stats.hwe import hardy_weinberg_test from .stats.regenie import regenie @@ -27,4 +27,5 @@ "read_vcfzarr", "regenie", "hardy_weinberg_test", + "variant_stats", ] diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 72dfa0900..183066df2 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -175,10 +175,11 @@ def call_rate(ds: Dataset, dim: Dimension) -> Dataset: def genotype_count(ds: Dataset, dim: Dimension) -> Dataset: odim = _swap(dim)[:-1] M, G = ds["call_genotype_mask"].any(dim="ploidy"), ds["call_genotype"] - n_het = (G > 0).any(dim="ploidy") & (G == 0).any(dim="ploidy") n_hom_ref = (G == 0).all(dim="ploidy") - n_hom_alt = (G > 0).all(dim="ploidy") + n_hom_alt = ((G > 0) & (G[..., 0] == G)).all(dim="ploidy") n_non_ref = (G > 0).any(dim="ploidy") + n_het = ~(n_hom_alt | n_hom_ref) + # This would 0 out the `het` case with any missing calls agg = lambda x: xr.where(M, False, x).sum(dim=dim) # type: ignore[no-untyped-call] return xr.Dataset( { @@ -191,7 +192,7 @@ def genotype_count(ds: Dataset, dim: Dimension) -> Dataset: def allele_frequency(ds: Dataset) -> Dataset: - AC = count_alleles(ds) + AC = count_variant_alleles(ds) M = ds["call_genotype_mask"].stack(calls=("samples", "ploidy")) AN = (~M).sum(dim="calls") # type: ignore