Skip to content

Commit

Permalink
Sample summary stats sgkit-dev#29
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Dec 3, 2020
1 parent 915a3ba commit 88a616c
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ Methods
gwas_linear_regression
hardy_weinberg_test
regenie
sample_stats
variant_stats
Tajimas_D
pc_relate
Expand Down
2 changes: 2 additions & 0 deletions sgkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
count_call_alleles,
count_cohort_alleles,
count_variant_alleles,
sample_stats,
variant_stats,
)
from .stats.association import gwas_linear_regression
Expand Down Expand Up @@ -48,6 +49,7 @@
"read_vcfzarr",
"regenie",
"hardy_weinberg_test",
"sample_stats",
"variant_stats",
"diversity",
"divergence",
Expand Down
72 changes: 72 additions & 0 deletions sgkit/stats/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,75 @@ def variant_stats(
]
)
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)


def sample_stats(
ds: Dataset,
*,
call_genotype_mask: Hashable = variables.call_genotype_mask,
call_genotype: Hashable = variables.call_genotype,
variant_allele_count: Hashable = variables.variant_allele_count,
merge: bool = True,
) -> Dataset:
"""Compute quality control sample statistics from genotype calls.
Parameters
----------
ds
Dataset containing genotype calls.
call_genotype
Input variable name holding call_genotype.
Defined by :data:`sgkit.variables.call_genotype_spec`.
Must be present in ``ds``.
call_genotype_mask
Input variable name holding call_genotype_mask.
Defined by :data:`sgkit.variables.call_genotype_mask_spec`
Must be present in ``ds``.
variant_allele_count
Input variable name holding variant_allele_count,
as defined by :data:`sgkit.variables.variant_allele_count_spec`.
If the variable is not present in ``ds``, it will be computed
using :func:`count_variant_alleles`.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
the computed output variables.
See :ref:`dataset_merge` for more details.
Returns
-------
A dataset containing the following variables:
- :data:`sgkit.variables.sample_n_called_spec` (samples):
The number of variants with called genotypes.
- :data:`sgkit.variables.sample_call_rate_spec` (samples):
The fraction of variants with called genotypes.
- :data:`sgkit.variables.sample_n_het_spec` (samples):
The number of variants with heterozygous calls.
- :data:`sgkit.variables.sample_n_hom_ref_spec` (samples):
The number of variants with homozygous reference calls.
- :data:`sgkit.variables.sample_n_hom_alt_spec` (samples):
The number of variants with homozygous alternate calls.
- :data:`sgkit.variables.sample_n_non_ref_spec` (samples):
The number of variants that are not homozygous reference calls.
"""
variables.validate(
ds,
{
call_genotype: variables.call_genotype_spec,
call_genotype_mask: variables.call_genotype_mask_spec,
},
)
new_ds = xr.merge(
[
call_rate(ds, dim="variants", call_genotype_mask=call_genotype_mask),
count_genotypes(
ds,
dim="variants",
call_genotype=call_genotype,
call_genotype_mask=call_genotype_mask,
merge=False,
),
]
)
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
19 changes: 18 additions & 1 deletion sgkit/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
count_call_alleles,
count_cohort_alleles,
count_variant_alleles,
sample_stats,
variant_stats,
)
from sgkit.testing import simulate_genotype_call_dataset
Expand Down Expand Up @@ -263,7 +264,6 @@ def test_variant_stats(precompute_variant_allele_count):
np.testing.assert_equal(vs["variant_n_hom_alt"], np.array([0, 1, 0, 0]))
np.testing.assert_equal(vs["variant_n_het"], np.array([1, 1, 2, 0]))
np.testing.assert_equal(vs["variant_n_non_ref"], np.array([1, 2, 2, 0]))
np.testing.assert_equal(vs["variant_n_non_ref"], np.array([1, 2, 2, 0]))
np.testing.assert_equal(
vs["variant_allele_count"], np.array([[1, 1], [1, 3], [2, 2], [2, 0]])
)
Expand All @@ -272,3 +272,20 @@ def test_variant_stats(precompute_variant_allele_count):
vs["variant_allele_frequency"],
np.array([[0.5, 0.5], [0.25, 0.75], [0.5, 0.5], [1, 0]]),
)


@pytest.mark.parametrize("precompute_variant_allele_count", [False, True])
def test_sample_stats(precompute_variant_allele_count):
ds = get_dataset(
[[[1, 0], [-1, -1]], [[1, 0], [1, 1]], [[0, 1], [1, 0]], [[-1, -1], [0, 0]]]
)
if precompute_variant_allele_count:
ds = count_variant_alleles(ds)
ss = sample_stats(ds)

np.testing.assert_equal(ss["sample_n_called"], np.array([3, 3]))
np.testing.assert_equal(ss["sample_call_rate"], np.array([0.75, 0.75]))
np.testing.assert_equal(ss["sample_n_hom_ref"], np.array([0, 1]))
np.testing.assert_equal(ss["sample_n_hom_alt"], np.array([0, 1]))
np.testing.assert_equal(ss["sample_n_het"], np.array([3, 1]))
np.testing.assert_equal(ss["sample_n_non_ref"], np.array([3, 2]))
24 changes: 24 additions & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,34 @@ def _check_field(
ArrayLikeSpec("pc_relate_phi", ndim=2, kind="f")
)
"""PC Relate kinship coefficient matrix."""
sample_call_rate, sample_call_rate_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_call_rate", ndim=1, kind="f")
)
"""The number of variants with heterozygous calls."""
sample_id, sample_id_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_id", kind={"S", "U", "O"}, ndim=1)
)
"""The unique identifier of the sample."""
sample_n_called, sample_n_called_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_called", ndim=1, kind="i")
)
"""The number of variants with called genotypes."""
sample_n_het, sample_n_het_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_het", ndim=1, kind="i")
)
"""The number of variants with heterozygous calls."""
sample_n_hom_alt, sample_n_hom_alt_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_hom_alt", ndim=1, kind="i")
)
"""The number of variants with homozygous alternate calls."""
sample_n_hom_ref, sample_n_hom_ref_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_hom_ref", ndim=1, kind="i")
)
"""The number of variants with homozygous reference calls."""
sample_n_non_ref, sample_n_non_ref_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_non_ref", ndim=1, kind="i")
)
"""The number of variants that are not homozygous reference calls."""
sample_pcs, sample_pcs_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_pcs", ndim=2, kind="f")
)
Expand Down

0 comments on commit 88a616c

Please sign in to comment.