Skip to content

Commit

Permalink
Merge pull request #531 from broadinstitute/jg/compute_info_v4_changes
Browse files Browse the repository at this point in the history
Add `pab_max_expr` function and modify `default_compute_info` to add 'AS_pab_max' annotation
  • Loading branch information
jkgoodrich authored Apr 25, 2023
2 parents 6417f44 + 75c5b2d commit 2e493d0
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 4 deletions.
44 changes: 44 additions & 0 deletions gnomad/utils/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,50 @@ def sor_from_sb(
return sor


def pab_max_expr(
gt_expr: hl.expr.CallExpression,
ad_expr: hl.expr.ArrayExpression,
la_expr: Optional[hl.expr.ArrayExpression] = None,
n_alleles_expr: Optional[hl.expr.Int32Expression] = None,
) -> hl.expr.ArrayExpression:
"""
Compute the maximum p-value of the binomial test for the alternate allele balance (PAB) for each allele.
.. note::
This function can take a `gt_expr` and `ad_expr` that use local or global
alleles. If they use local alleles, `la_expr` and `n_alleles_expr` should be
provided to transform `gt_expr` and `ad_expr` to global alleles.
:param gt_expr: Genotype call expression.
:param ad_expr: Allele depth expression.
:param la_expr: Allele local index expression. When provided `gt_expr` and
`ad_expr` are transformed from using local alleles to global alleles using
`la_expr`.
:param n_alleles_expr: Number of alleles expression. Required when 'la_expr' is
provided.
:return: Array expression of maximum p-values.
"""
if la_expr is not None:
if n_alleles_expr is None:
raise ValueError("Must provide `n_alleles_expr` if `la_expr` is provided!")

ad_expr = hl.vds.local_to_global(
ad_expr, la_expr, n_alleles_expr, fill_value=0, number="R"
)
gt_expr = hl.vds.lgt_to_gt(gt_expr, la_expr)

expr = hl.agg.array_agg(
lambda x: hl.agg.filter(
gt_expr.is_het(),
hl.agg.max(hl.binom_test(x, hl.sum(ad_expr), 0.5, "two-sided")),
),
ad_expr[1:], # Skip ref allele
)

return expr


def bi_allelic_expr(t: Union[hl.Table, hl.MatrixTable]) -> hl.expr.BooleanExpression:
"""
Return a boolean expression selecting bi-allelic sites only, accounting for whether the input MT/HT was split.
Expand Down
29 changes: 25 additions & 4 deletions gnomad/utils/sparse_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
fs_from_sb,
get_adj_expr,
get_lowqual_expr,
pab_max_expr,
sor_from_sb,
)
from gnomad.utils.intervals import interval_length, union_intervals
Expand Down Expand Up @@ -446,7 +447,10 @@ def get_site_info_expr(


def default_compute_info(
mt: hl.MatrixTable, site_annotations: bool = False, n_partitions: int = 5000
mt: hl.MatrixTable,
site_annotations: bool = False,
n_partitions: int = 5000,
lowqual_indel_phred_het_prior: int = 40,
) -> hl.Table:
"""
Compute a HT with the typical GATK allele-specific (AS) info fields as well as ACs and lowqual fields.
Expand All @@ -458,6 +462,10 @@ def default_compute_info(
:param mt: Input MatrixTable. Note that this table should be filtered to nonref sites.
:param site_annotations: Whether to also generate site level info fields. Default is False.
:param n_partitions: Number of desired partitions for output Table. Default is 5000.
:param lowqual_indel_phred_het_prior: Phred-scaled prior for a het genotype at a
site with a low quality indel. Default is 40. We use 1/10k bases (phred=40) to
be more consistent with the filtering used by Broad's Data Sciences Platform
for VQSR.
:return: Table with info fields
:rtype: Table
"""
Expand All @@ -471,6 +479,11 @@ def default_compute_info(
# Compute AS info expr
info_expr = get_as_info_expr(mt)

# Add allele specific pab_max
info_expr = info_expr.annotate(
AS_pab_max=pab_max_expr(mt.LGT, mt.LAD, mt.LA, hl.len(mt.alleles))
)

if site_annotations:
info_expr = info_expr.annotate(**get_site_info_expr(mt))

Expand All @@ -488,7 +501,7 @@ def default_compute_info(
),
),
),
hl.range(1, hl.len(mt.alleles)),
mt.alt_alleles_range_array,
)

# Then, for each non-ref allele, compute
Expand All @@ -503,13 +516,21 @@ def default_compute_info(

# Add AS lowqual flag
info_ht = info_ht.annotate(
AS_lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.AS_QUALapprox)
AS_lowqual=get_lowqual_expr(
info_ht.alleles,
info_ht.info.AS_QUALapprox,
indel_phred_het_prior=lowqual_indel_phred_het_prior,
)
)

if site_annotations:
# Add lowqual flag
info_ht = info_ht.annotate(
lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox)
lowqual=get_lowqual_expr(
info_ht.alleles,
info_ht.info.QUALapprox,
indel_phred_het_prior=lowqual_indel_phred_het_prior,
)
)

return info_ht.naive_coalesce(n_partitions)
Expand Down

0 comments on commit 2e493d0

Please sign in to comment.