Skip to content

Commit

Permalink
Merge pull request #249 from broadinstitute/vcf_fix
Browse files Browse the repository at this point in the history
VCF fix
  • Loading branch information
ch-kr authored Aug 14, 2020
2 parents 28b2e6a + 9073d7a commit 8158797
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 37 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241)
* Updated liftover functions to be more generic [(#246)](https://github.com/broadinstitute/gnomad_methods/pull/246)
* Changed quality histograms to label histograms calculated on raw and not adj data [(#247)](https://github.com/broadinstitute/gnomad_methods/pull/247)
* Updated some VCF export constants [(#249)](https://github.com/broadinstitute/gnomad_methods/pull/249)

## Version 0.4.0 - July 9th, 2020

Expand Down
9 changes: 6 additions & 3 deletions gnomad/assessment/sanity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ def sample_sum_check(
:param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
:return: None
"""
if prefix != "":
prefix = f"{prefix}_"

label_combos = make_label_combos(label_groups)
combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos]
combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos]
Expand Down Expand Up @@ -154,12 +157,12 @@ def sample_sum_check(
generic_field_check(
ht,
(
ht.info[f"{prefix}{subfield}_{group}_{subpop}"]
ht.info[f"{prefix}{subfield}_{subpop}_{group}"]
!= ht[f"sum_{subfield}_{group}_{alt_groups}"]
),
f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})",
f"{prefix}{subfield}_{subpop}_{group} = sum({subfield}_{group}_{alt_groups})",
[
f"info.{prefix}{subfield}_{group}_{subpop}",
f"info.{prefix}{subfield}_{subpop}_{group}",
f"sum_{subfield}_{group}_{alt_groups}",
],
verbose,
Expand Down
78 changes: 44 additions & 34 deletions gnomad/utils/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"]
SORT_ORDER = [
"popmax",
"pop",
"subpop",
"sex",
"group",
]
"""
Order to sort subgroupings during VCF export.
Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).
Expand Down Expand Up @@ -89,11 +95,11 @@
"""

RF_FIELDS = [
"fail_hard_filters",
"rf_positive_label",
"rf_negative_label",
"rf_label",
"rf_train",
"rf_tp_probability",
"tp",
]
"""
Annotations specific to the variant QC using a random forest model.
Expand All @@ -117,7 +123,8 @@
"Description": "Phred-scaled p-value of Fisher's exact test for strand bias"
},
"InbreedingCoeff": {
"Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
"Number": "A",
"Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)",
},
"MQ": {
"Description": "Root mean square of the mapping quality of reads across all samples"
Expand All @@ -132,10 +139,10 @@
"Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias"
},
"SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"},
"VQSR_POSITIVE_TRAIN_SITE": {
"POSITIVE_TRAIN_SITE": {
"Description": "Variant was used to build the positive training set of high-quality variants for VQSR"
},
"VQSR_NEGATIVE_TRAIN_SITE": {
"NEGATIVE_TRAIN_SITE": {
"Description": "Variant was used to build the negative training set of low-quality variants for VQSR"
},
"BaseQRankSum": {
Expand All @@ -147,7 +154,7 @@
"VQSLOD": {
"Description": "Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
},
"VQSR_culprit": {
"culprit": {
"Description": "Worst-performing annotation in the VQSR Gaussian mixture model",
},
"decoy": {"Description": "Variant falls within a reference decoy region"},
Expand Down Expand Up @@ -176,7 +183,7 @@
},
"allele_type": {"Description": "Allele type (snv, insertion, deletion, or mixed)",},
"n_alt_alleles": {
"Number": 1,
"Number": "1",
"Description": "Total number of alternate alleles observed at variant locus",
},
"was_mixed": {"Description": "Variant type was mixed"},
Expand All @@ -198,6 +205,7 @@
"""

SPARSE_ENTRIES = [
"END",
"DP",
"GQ",
"LA",
Expand Down Expand Up @@ -372,7 +380,7 @@ def make_combo_header_text(
preposition: str,
combo_dict: Dict[str, str],
prefix: str,
pop_names: Dict[str, str] = POP_NAMES,
pop_names: Dict[str, str],
) -> str:
"""
Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
Expand All @@ -385,7 +393,7 @@ def make_combo_header_text(
Possible grouping types are: "group", "pop", "sex", and "subpop".
Example input: {"pop": "afr", "sex": "female"}
:param prefix: Prefix string indicating sample subset.
:param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES.
:param pop_names: Dict with global population names (keys) and population descriptions (values).
:return: String with automatically generated description text for a given set of combo fields.
"""
header_text = " " + preposition
Expand All @@ -399,12 +407,14 @@ def make_combo_header_text(

header_text = header_text + " samples"

if "subpop" in combo_dict:
header_text = header_text + f" of {pop_names[combo_dict['subpop']]} ancestry"
combo_dict.pop("pop")
if "subpop" in combo_dict or "pop" in combo_dict:
if "subpop" in combo_dict:
header_text = (
header_text + f" of {pop_names[combo_dict['subpop']]} ancestry"
)

if "pop" in combo_dict:
header_text = header_text + f" of {pop_names[combo_dict['pop']]} ancestry"
else:
header_text = header_text + f" of {pop_names[combo_dict['pop']]} ancestry"

if "gnomad" in prefix:
header_text = header_text + " in gnomAD"
Expand All @@ -418,6 +428,7 @@ def make_combo_header_text(

def make_info_dict(
prefix: str = "",
pop_names: Dict[str, str] = POP_NAMES,
label_groups: Dict[str, str] = None,
bin_edges: Dict[str, str] = None,
faf: bool = False,
Expand All @@ -438,12 +449,13 @@ def make_info_dict(
- INFO fields for filtering allele frequency (faf) annotations
:param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string.
:param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES.
:param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
:param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
:param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
:param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
:param description_text: Optional text to append to the end of age and popmax descriptions. Needs to start with a space if specified.
:param description_text: Optional text to append to the end of descriptions. Needs to start with a space if specified.
:param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
:param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
:return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
Expand Down Expand Up @@ -515,37 +527,37 @@ def make_info_dict(
combo_fields = combo.split("_")
group_dict = dict(zip(group_types, combo_fields))

for_combo = make_combo_header_text("for", group_dict, prefix)
in_combo = make_combo_header_text("in", group_dict, prefix)
for_combo = make_combo_header_text("for", group_dict, prefix, pop_names)
in_combo = make_combo_header_text("in", group_dict, prefix, pop_names)

if not faf:
combo_dict = {
f"{prefix}AC_{combo}": {
"Number": "A",
"Description": f"Alternate allele count{for_combo}",
"Description": f"Alternate allele count{for_combo}{description_text}",
},
f"{prefix}AN_{combo}": {
"Number": "1",
"Description": f"Total number of alleles{in_combo}",
"Description": f"Total number of alleles{in_combo}{description_text}",
},
f"{prefix}AF_{combo}": {
"Number": "A",
"Description": f"Alternate allele frequency{in_combo}",
"Description": f"Alternate allele frequency{in_combo}{description_text}",
},
f"{prefix}nhomalt_{combo}": {
"Number": "A",
"Description": f"Count of homozygous individuals{in_combo}",
"Description": f"Count of homozygous individuals{in_combo}{description_text}",
},
}
else:
combo_dict = {
f"{prefix}faf95_{combo}": {
"Number": "A",
"Description": f"Filtering allele frequency (using Poisson 95% CI) {for_combo}",
"Description": f"Filtering allele frequency (using Poisson 95% CI){for_combo}{description_text}",
},
f"{prefix}faf99_{combo}": {
"Number": "A",
"Description": f"Filtering allele frequency (using Poisson 99% CI) {for_combo}",
"Description": f"Filtering allele frequency (using Poisson 99% CI){for_combo}{description_text}",
},
}
info_dict.update(combo_dict)
Expand All @@ -565,7 +577,6 @@ def add_as_info_dict(
:return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
"""
as_dict = {}

for field in as_fields:

try:
Expand All @@ -583,7 +594,7 @@ def add_as_info_dict(
] = f"Allele-specific {first_letter}{rest_of_description}"

except KeyError:
raise ValueError(f"{field} is not present in input info dictionary!")
logger.warning(f"{field} is not present in input info dictionary!")

return as_dict

Expand Down Expand Up @@ -611,7 +622,7 @@ def make_vcf_filter_dict(
},
"InbreedingCoeff": {"Description": f"InbreedingCoeff < {inbreeding_cutoff}"},
"RF": {
"Description": f"Failed random forest filtering thresholds of {snp_cutoff}, {indel_cutoff} (probabilities of being a true positive variant) for SNPs, indels"
"Description": f"Failed random forest filtering thresholds of {snp_cutoff} for SNPs and {indel_cutoff} for indels (probabilities of being a true positive variant)"
},
"PASS": {"Description": "Passed all variant filters"},
}
Expand Down Expand Up @@ -646,13 +657,13 @@ def make_hist_bin_edges_expr(

for hist in hists:

hist_name = hist

# Parse hists calculated on both raw and adj-filtered data
for hist_type in ["adj_qual_hists", "qual_hists"]:
for hist_type in ["raw_qual_hists", "qual_hists"]:

hist_name = hist
if "raw" in hist_type:
hist_name = f"{hist}_raw"

if "adj" in hist_type:
hist_name = f"{hist}_adj"
edges_dict[hist_name] = "|".join(
map(
lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)),
Expand Down Expand Up @@ -724,8 +735,7 @@ def set_female_y_metrics_to_na(
female_metrics_dict.update(
{
f"{metric}": hl.or_missing(
(~t.locus.contig.in_y_nonpar() | ~t.locus.contig.in_y_par()),
t.info[f"{metric}"],
(~t.locus.in_y_nonpar() | ~t.locus.in_y_par()), t.info[f"{metric}"],
)
}
)
Expand Down

0 comments on commit 8158797

Please sign in to comment.