Skip to content

Commit

Permalink
Rename 'call_genotype_non_allele' to 'call_genotype_fill'
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite authored and mergify[bot] committed Dec 2, 2021
1 parent 8f04f0b commit 31dc606
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 33 deletions.
2 changes: 1 addition & 1 deletion docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ By convention, variable names are singular in sgkit. For example, ``genotype_cou
variables.call_genotype_complete_mask_spec
variables.call_genotype_spec
variables.call_genotype_mask_spec
variables.call_genotype_non_allele_spec
variables.call_genotype_fill_spec
variables.call_genotype_phased_spec
variables.call_genotype_probability_spec
variables.call_genotype_probability_mask_spec
Expand Down
8 changes: 4 additions & 4 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,10 @@ def update_dataset(self, ds: xr.Dataset) -> None:
{"comment": variables.call_genotype_mask_spec.__doc__.strip()},
)
if self.mixed_ploidy is True:
ds["call_genotype_non_allele"] = (
ds["call_genotype_fill"] = (
[DIM_VARIANT, DIM_SAMPLE, DIM_PLOIDY],
self.call_genotype < -1,
{"comment": variables.call_genotype_non_allele_spec.__doc__.strip()},
{"comment": variables.call_genotype_fill_spec.__doc__.strip()},
)
ds["call_genotype_phased"] = (
[DIM_VARIANT, DIM_SAMPLE],
Expand Down Expand Up @@ -642,7 +642,7 @@ def vcf_to_zarrs(
The (maximum) ploidy of genotypes in the VCF file.
mixed_ploidy
If True, genotype calls with fewer alleles than the specified ploidy will be padded
with the non-allele sentinel value of -2. If false, calls with fewer alleles than
with the fill (non-allele) sentinel value of -2. If False, calls with fewer alleles than
the specified ploidy will be treated as incomplete and will be padded with the
missing-allele sentinel value of -1.
truncate_calls
Expand Down Expand Up @@ -845,7 +845,7 @@ def vcf_to_zarr(
The (maximum) ploidy of genotypes in the VCF file.
mixed_ploidy
If True, genotype calls with fewer alleles than the specified ploidy will be padded
with the non-allele sentinel value of -2. If false, calls with fewer alleles than
with the fill (non-allele) sentinel value of -2. If False, calls with fewer alleles than
the specified ploidy will be treated as incomplete and will be padded with the
missing-allele sentinel value of -1.
truncate_calls
Expand Down
36 changes: 17 additions & 19 deletions sgkit/stats/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ def sample_stats(
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)


def infer_non_alleles(
def infer_call_genotype_fill(
ds: Dataset,
*,
call_genotype: Hashable = variables.call_genotype,
Expand All @@ -626,24 +626,22 @@ def infer_non_alleles(
variables.validate(ds, {call_genotype: variables.call_genotype_spec})
mixed_ploidy = ds[variables.call_genotype].attrs.get("mixed_ploidy", False)
if mixed_ploidy:
call_genotype_non_allele = ds[call_genotype] < -1
call_genotype_fill = ds[call_genotype] < -1
else:
call_genotype_non_allele = xr.full_like(ds[call_genotype], False, "b1")
new_ds = create_dataset(
{variables.call_genotype_non_allele: call_genotype_non_allele}
)
call_genotype_fill = xr.full_like(ds[call_genotype], False, "b1")
new_ds = create_dataset({variables.call_genotype_fill: call_genotype_fill})
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)


def infer_call_ploidy(
ds: Dataset,
*,
call_genotype: Hashable = variables.call_genotype,
call_genotype_non_allele: Hashable = variables.call_genotype_non_allele,
call_genotype_fill: Hashable = variables.call_genotype_fill,
merge: bool = True,
) -> Dataset:
"""Infer the ploidy of each call genotype based on the number of
non-allele values in each call genotype.
fill (non-allele) values in each call genotype.
Parameters
----------
Expand All @@ -653,12 +651,12 @@ def infer_call_ploidy(
Input variable name holding call_genotype as defined by
:data:`sgkit.variables.call_genotype_spec`.
Must be present in ``ds``.
call_genotype_non_allele
Input variable name holding call_genotype_non_allele as defined by
:data:`sgkit.variables.call_genotype_non_allele_spec`.
call_genotype_fill
Input variable name holding call_genotype_fill as defined by
:data:`sgkit.variables.call_genotype_fill_spec`.
If the variable is not present in ``ds``, it will be computed
assuming that allele values less than -1 are non-alleles in mixed ploidy
datasets, or that no non-alleles are present in fixed ploidy datasets.
assuming that allele values less than -1 are fill (non-allele) values in mixed ploidy
datasets, or that no fill values are present in fixed ploidy datasets.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
Expand All @@ -671,13 +669,13 @@ def infer_call_ploidy(
"""
ds = define_variable_if_absent(
ds,
variables.call_genotype_non_allele,
call_genotype_non_allele,
infer_non_alleles,
variables.call_genotype_fill,
call_genotype_fill,
infer_call_genotype_fill,
)
mixed_ploidy = ds[variables.call_genotype].attrs.get("mixed_ploidy", False)
if mixed_ploidy:
call_ploidy = (~ds[call_genotype_non_allele]).sum(axis=-1)
call_ploidy = (~ds[call_genotype_fill]).sum(axis=-1)
else:
ploidy = ds[variables.call_genotype].shape[-1]
call_ploidy = xr.full_like(ds[variables.call_genotype][..., 0], ploidy)
Expand All @@ -694,7 +692,7 @@ def infer_variant_ploidy(
merge: bool = True,
) -> Dataset:
"""Infer the ploidy at each variant across all samples based on
the number of non-allele values in call genotypes.
the number of fill (non-allele) values in call genotypes.
Parameters
----------
Expand Down Expand Up @@ -743,7 +741,7 @@ def infer_sample_ploidy(
merge: bool = True,
) -> Dataset:
"""Infer the ploidy of each sample across all variants based on
the number of non-allele values in call genotypes.
the number of fill (non-allele) values in call genotypes.
Parameters
----------
Expand Down
6 changes: 3 additions & 3 deletions sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def test_vcf_to_zarr__mixed_ploidy_vcf(
assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"])

assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy
pad = -2 if mixed_ploidy else -1 # -2 indicates a non-allele
pad = -2 if mixed_ploidy else -1 # -2 indicates a fill (non-allele) value
call_genotype = np.array(
[
[[0, 0, 1, 1, pad], [0, 0, pad, pad, pad], [0, 0, 0, 1, pad]],
Expand All @@ -648,7 +648,7 @@ def test_vcf_to_zarr__mixed_ploidy_vcf(
assert_array_equal(ds["call_genotype"], call_genotype)
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
if mixed_ploidy:
assert_array_equal(ds["call_genotype_non_allele"], call_genotype < -1)
assert_array_equal(ds["call_genotype_fill"], call_genotype < -1)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1141,7 +1141,7 @@ def test_spec(shared_datadir, tmp_path):
"call_GQ",
"call_genotype",
"call_genotype_mask",
"call_genotype_non_allele",
"call_genotype_fill",
"call_genotype_phased",
"call_HQ",
"sample_id",
Expand Down
9 changes: 3 additions & 6 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,14 @@ def _check_field(
)
)

(
call_genotype_non_allele,
call_genotype_non_allele_spec,
) = SgkitVariables.register_variable(
(call_genotype_fill, call_genotype_fill_spec,) = SgkitVariables.register_variable(
ArrayLikeSpec(
"call_genotype_non_allele",
"call_genotype_fill",
kind="b",
ndim=3,
__doc__="""
A flag for each allele position within mixed ploidy call genotypes
indicating non-allele values of lower ploidy calls.
indicating fill (non-allele) values of lower ploidy calls.
""",
)
)
Expand Down

0 comments on commit 31dc606

Please sign in to comment.