From bcc7f505bd2ec252a083dc1d50fd51efb1a262cb Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 08:13:51 -0600 Subject: [PATCH 01/26] Add summary stats filtering expression functions --- gnomad/assessment/summary_stats.py | 264 ++++++++++++++++++++++++++++- 1 file changed, 261 insertions(+), 3 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 6d9383af2..8213098f0 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -1,11 +1,12 @@ # noqa: D100 - +import functools import logging -from typing import Dict, Optional, Set +import operator +from typing import Any, Dict, List, Optional, Set, Tuple, Union import hail as hl -from gnomad.utils.filtering import filter_low_conf_regions +from gnomad.utils.filtering import filter_low_conf_regions, low_conf_regions_expr from gnomad.utils.vep import ( LOF_CSQ_SET, add_most_severe_consequence_to_consequence, @@ -355,6 +356,263 @@ def get_tx_expression_expr( ) +def expr_from_field( + t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.Expression], + field: Optional[str] = None, +) -> hl.expr.Expression: + """ + Return expression from input MatrixTable/Table or Expression. + + :param t_or_expr: Input MatrixTable/Table/Expression. + :param field: Field name to extract from input MT/HT. Default is None. + :return: Expression from input MT/HT or input Expression. + """ + if isinstance(t_or_expr, hl.MatrixTable) or isinstance(t_or_expr, hl.Table): + if field is None: + raise ValueError("Field must be provided when input is MatrixTable/Table.") + t_or_expr = t_or_expr[field] + + return t_or_expr + + +def pass_filter_expr( + t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], + filter_field: str = "filters", +) -> hl.expr.BooleanExpression: + """ + Filter to PASS variants. + + :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. + :param filter_field: Name of field in MT/HT that contains variant filters. Default + is 'filters'. + :return: BooleanExpression for filtering to PASS variants. + """ + return hl.len(expr_from_field(t_or_expr, filter_field)) == 0 + + +def min_af_filter_expr( + t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], + min_af: float, + freq_field: str = "freq", + freq_index: int = 0, +) -> hl.expr.BooleanExpression: + """ + Filter to variants with minimum allele frequency. + + :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. + :param min_af: Minimum allele frequency cutoff. + :param freq_field: Name of field in MT/HT that contains frequency information. + Default is 'freq'. + :param freq_index: Which index of frequency struct to use. Default is 0. + """ + return expr_from_field(t_or_expr, freq_field)[freq_index].AF > min_af + + +def max_af_filter_expr( + t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], + max_af: float, + freq_field: str = "freq", + freq_index: int = 0, +) -> hl.expr.BooleanExpression: + """ + Filter to variants with minimum allele frequency. + + :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. + :param max_af: Maximum allele frequency cutoff. + :param freq_field: Name of field in MT/HT that contains frequency information. + Default is 'freq'. + :param freq_index: Which index of frequency struct to use. Default is 0. + """ + return expr_from_field(t_or_expr, freq_field)[freq_index].AF < max_af + + +def get_summary_stats_variant_filter_expr( + t: Union[hl.Table, hl.MatrixTable], + filter_lcr: bool = False, + filter_expr: hl.expr.SetExpression = None, + freq_expr: hl.expr.SetExpression = None, + max_af: Optional[float] = None, + min_an_proportion: Optional[float] = None, + collapse_filters: bool = False, +) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]: + """ + Generate variant filtering expression for summary stats. + + :param t: Input Table/MatrixTable. + :param filter_lcr: Whether to filter out low confidence regions. Default is False. + :param filter_expr: SetExpression containing variant filters. Default is None. + :param freq_expr: SetExpression containing frequency information. Default is None. + :param max_af: Maximum allele frequency cutoff. Default is None. + :param min_an_proportion: Minimum allele number proportion (used as a proxy for + call rate). Default is None. + :param collapse_filters: Whether to collapse all filters into a single expression. + Default is False. + :return: BooleanExpression or Dict of BooleanExpressions for filtering variants. + """ + if max_af is not None and freq_expr is None: + raise ValueError("") + + ss_filter_expr = {"all_variants": True} + if filter_lcr: + logger.info("Filtering out low confidence regions...") + ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False) + if filter_expr is not None: + logger.info( + "Adding filter expression for variants that pass all variant QC filters..." + ) + ss_filter_expr["pass_filters"] = pass_filter_expr(filter_expr) + if max_af is not None: + logger.info("Filtering to variants with (AF < %.2f)...", max_af) + ss_filter_expr[f"max_{max_af}"] = max_af_filter_expr(freq_expr, max_af) + if min_an_proportion is not None: + logger.info( + "Using AN (as a call rate proxy) to filter to variants that meet a minimum" + " call rate of %.2f...", + min_an_proportion, + ) + ss_filter_expr[f"max_{max_af}"] = get_an_criteria( + t, an_proportion_cutoff=min_an_proportion + ) + + if collapse_filters: + if len(ss_filter_expr) == 1: + logger.warning("No filtering applied to variants for summary stats.") + ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values()) + + return ss_filter_expr + + +def get_summary_stats_csq_filter_expr( + t: Union[hl.Table, hl.MatrixTable, hl.StructExpression], + lof_csq_set: Optional[Set[str]] = None, + lof_label_set: Optional[Set[str]] = None, + lof_flag_set: Optional[Set[str]] = None, + lof_no_flags: bool = False, + lof_any_flags: bool = False, + lof_loftee_combinations: bool = False, + additional_csq_sets: Optional[Dict[str, Set[str]]] = None, + additional_csqs: Optional[Set[str]] = None, + collapse_filters: bool = False, +) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]: + """ + Generate consequence filtering expression for summary stats. + + :param t: Input Table/MatrixTable/StructExpression. + :param lof_csq_set: Set of LoF consequence strings. Default is None. + :param lof_label_set: Set of LoF consequence labels. Default is None. + :param lof_flag_set: Set of LoF consequence flags. Default is None. + :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is + False. + :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is + False. + :param lof_loftee_combinations: Whether to add combinations of LOFTEE and + consequence type filters. Default is False. + :param additional_csq_sets: Dictionary containing additional consequence sets. + Default is None. + :param additional_csqs: Set of additional consequences to keep. Default is None. + :param collapse_filters: Whether to collapse all filters into a single expression. + Default is False. + :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences. + """ + # Set up filters for specific consequences or sets of consequences. + csq_filters = {"lof": lof_csq_set} + if additional_csq_sets is not None: + csq_filters.update(additional_csq_sets) + + if lof_loftee_combinations and lof_csq_set is not None: + additional_csqs = set(lof_csq_set) | set(additional_csqs or []) + + if additional_csqs is not None: + csq_filters.update({c: {c} for c in additional_csqs}) + + def _create_filter_by_csq( + t: Union[hl.Table, hl.MatrixTable], + csq_set: Union[Set, List, hl.expr.CollectionExpression], + ) -> hl.expr.BooleanExpression: + """ + Create filtering expression for a set of consequences. + + :param t: Input Table/MatrixTable. + :param csq_set: Set of consequences to filter. + :return: BooleanExpression for filtering by consequence. + """ + if not isinstance(csq_set, hl.expr.CollectionExpression): + csq_set = hl.set(csq_set) + + return csq_set.contains(t.most_severe_csq) + + # Create filtering expressions for each consequence set. + ss_filter_expr = {} + for filter_name, csq_set in csq_filters.items(): + if csq_set is not None: + ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set) + + # Add filtering expressions for LoF consequence labels. + lof_labels = ( + { + f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False) + for lof_label in lof_label_set + } + if lof_label_set + else {} + ) + + # Add filtering expressions for LoF consequence flags. + lof_flags = ( + { + f"lof_flag_{lof_flag}": hl.or_else( + t.lof_flags.split(",").contains(lof_flag), False + ) + for lof_flag in lof_flag_set + } + if lof_flag_set + else {} + ) + + # Add filtering expressions for HC LoF variants with no flags or any flags. + lof_hc_flags = {} + if lof_no_flags or lof_any_flags: + hc_expr = hl.or_else(t.lof == "HC", False) + if "no_lof_flags" in t.row: + no_lof_flags_expr = t.no_lof_flags + elif "lof_flags" in t.row: + no_lof_flags_expr = hl.is_missing(t.lof_flags) | (t.lof_flags == "") + else: + raise ValueError( + "No LoF flag info found in input Table/MatrixTable/StructExpression." + ) + if lof_no_flags: + lof_hc_flags["lof_HC_no_flags"] = hc_expr & no_lof_flags_expr + if lof_any_flags: + lof_flags["lof_HC_with_flags"] = hc_expr & ~no_lof_flags_expr + + # Update summary stats filter expressions with LoF labels and flags. + ss_filter_expr.update({**lof_labels, **lof_flags, **lof_hc_flags}) + + # Add expressions for LOFTEE and consequence type combinations. + if lof_loftee_combinations: + lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set} + lof_combo = { + { + **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, + **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels if l != "HC"}, + **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()}, + } + for v, v_e in lof_csq.items() + } + ss_filter_expr.update(lof_combo) + + if not ss_filter_expr: + logger.warning("No filtering applied to consequences for summary stats.") + return True if collapse_filters else ss_filter_expr + + # Collapse all filters into a single expression if requested. + if collapse_filters: + ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values()) + + return ss_filter_expr + + def default_generate_gene_lof_matrix( mt: hl.MatrixTable, tx_ht: Optional[hl.Table], From 5233d9c099e7d37b4ea62dfae737a301e2dece56 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 08:18:30 -0600 Subject: [PATCH 02/26] Add `low_conf_regions_expr` function to return `filter_low_conf_regions` as an expression instead --- gnomad/utils/filtering.py | 49 +++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 62049e791..0bcc2ad96 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -128,19 +128,19 @@ def combine_functions( return cond -def filter_low_conf_regions( - mt: Union[hl.MatrixTable, hl.Table], +def low_conf_regions_expr( + locus_expr: hl.expr.LocusExpression, filter_lcr: bool = True, filter_decoy: bool = True, filter_segdup: bool = True, filter_exome_low_coverage_regions: bool = False, filter_telomeres_and_centromeres: bool = False, high_conf_regions: Optional[List[str]] = None, -) -> Union[hl.MatrixTable, hl.Table]: +) -> hl.expr.BooleanExpression: """ Filter low-confidence regions. - :param mt: MatrixTable or Table to filter + :param locus_expr: Locus expression to use for filtering. :param filter_lcr: Whether to filter LCR regions :param filter_decoy: Whether to filter decoy regions :param filter_segdup: Whether to filter Segdup regions @@ -149,28 +149,30 @@ def filter_low_conf_regions( :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions) :return: MatrixTable or Table with low confidence regions removed """ - build = get_reference_genome(mt.locus).name + build = get_reference_genome(locus_expr).name if build == "GRCh37": import gnomad.resources.grch37.reference_data as resources elif build == "GRCh38": import gnomad.resources.grch38.reference_data as resources + else: + raise ValueError(f"Unsupported reference genome build: {build}") criteria = [] if filter_lcr: lcr = resources.lcr_intervals.ht() - criteria.append(hl.is_missing(lcr[mt.locus])) + criteria.append(hl.is_missing(lcr[locus_expr])) if filter_decoy: decoy = resources.decoy_intervals.ht() - criteria.append(hl.is_missing(decoy[mt.locus])) + criteria.append(hl.is_missing(decoy[locus_expr])) if filter_segdup: segdup = resources.seg_dup_intervals.ht() - criteria.append(hl.is_missing(segdup[mt.locus])) + criteria.append(hl.is_missing(segdup[locus_expr])) if filter_exome_low_coverage_regions: high_cov = resources.high_coverage_intervals.ht() - criteria.append(hl.is_missing(high_cov[mt.locus])) + criteria.append(hl.is_missing(high_cov[locus_expr])) if filter_telomeres_and_centromeres: if build != "GRCh38": @@ -179,19 +181,36 @@ def filter_low_conf_regions( ) telomeres_and_centromeres = resources.telomeres_and_centromeres.ht() - criteria.append(hl.is_missing(telomeres_and_centromeres[mt.locus])) + criteria.append(hl.is_missing(telomeres_and_centromeres[locus_expr])) if high_conf_regions is not None: for region in high_conf_regions: region = hl.import_locus_intervals(region) - criteria.append(hl.is_defined(region[mt.locus])) + criteria.append(hl.is_defined(region[locus_expr])) if criteria: filter_criteria = functools.reduce(operator.iand, criteria) - if isinstance(mt, hl.MatrixTable): - mt = mt.filter_rows(filter_criteria) - else: - mt = mt.filter(filter_criteria) + return filter_criteria + else: + raise ValueError("No low confidence regions requested for filtering.") + + +def filter_low_conf_regions( + mt: Union[hl.MatrixTable, hl.Table], + **kwargs, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter low-confidence regions. + + :param mt: MatrixTable or Table to filter. + :param kwargs: Keyword arguments to pass to `low_conf_regions_expr`. + :return: MatrixTable or Table with low confidence regions removed. + """ + filter_criteria = low_conf_regions_expr(mt.locus, **kwargs) + if isinstance(mt, hl.MatrixTable): + mt = mt.filter_rows(filter_criteria) + else: + mt = mt.filter(filter_criteria) return mt From 67a0323e9d3c232721a2f9216f6fa3293a116f78 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 08:20:25 -0600 Subject: [PATCH 03/26] Add a simple function to annotate a HT/MT with another HT --- gnomad/utils/annotations.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 26f23438e..33a999db4 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -90,6 +90,41 @@ } +def annotate_with_ht( + t: Union[hl.MatrixTable, hl.Table], + annotation_ht: hl.Table, + fields: Optional[List] = None, + annotate_cols: bool = False, + filter_missing: bool = False, +) -> hl.MatrixTable: + """ + Annotate a MatrixTable/Table with additional annotations from another Table. + + :param t: MatrixTable/Table to be annotated. + :param annotation_ht: Table containing additional annotations to be joined on `t`. + :param fields: Optional list of fields to select from `annotation_ht` and add to `t` + :param annotate_cols: If True, annotate columns instead of rows. Default is False. + :param filter_missing: If True, filter out missing rows/cols in `t` that are not + present in `annotation_ht`. Default is False. + :return: Annotated MatrixTable. + """ + annotation_ht = annotation_ht.select(*fields) + if filter_missing: + logger.info("Filtering input to variants in the supplied annotation HT...") + + if isinstance(t, hl.Table): + t = t.semi_join(annotation_ht) + t = t.annotate(**annotation_ht[t.key]) + elif annotate_cols: + t = t.semi_join_cols(annotation_ht) + t = t.annotate_cols(**annotation_ht[t.col_key]) + else: + t = t.semi_join_rows(annotation_ht) + t = t.annotate_rows(**annotation_ht[t.row_key]) + + return t + + def pop_max_expr( freq: hl.expr.ArrayExpression, freq_meta: hl.expr.ArrayExpression, From 0a995c26315697538d62bcde07b1d4df952edf17 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 12:36:28 -0600 Subject: [PATCH 04/26] Changes needed while testing --- gnomad/assessment/summary_stats.py | 140 +++++++---------------------- gnomad/utils/annotations.py | 4 +- 2 files changed, 35 insertions(+), 109 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 8213098f0..57eba3bb6 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -356,82 +356,12 @@ def get_tx_expression_expr( ) -def expr_from_field( - t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.Expression], - field: Optional[str] = None, -) -> hl.expr.Expression: - """ - Return expression from input MatrixTable/Table or Expression. - - :param t_or_expr: Input MatrixTable/Table/Expression. - :param field: Field name to extract from input MT/HT. Default is None. - :return: Expression from input MT/HT or input Expression. - """ - if isinstance(t_or_expr, hl.MatrixTable) or isinstance(t_or_expr, hl.Table): - if field is None: - raise ValueError("Field must be provided when input is MatrixTable/Table.") - t_or_expr = t_or_expr[field] - - return t_or_expr - - -def pass_filter_expr( - t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], - filter_field: str = "filters", -) -> hl.expr.BooleanExpression: - """ - Filter to PASS variants. - - :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. - :param filter_field: Name of field in MT/HT that contains variant filters. Default - is 'filters'. - :return: BooleanExpression for filtering to PASS variants. - """ - return hl.len(expr_from_field(t_or_expr, filter_field)) == 0 - - -def min_af_filter_expr( - t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], - min_af: float, - freq_field: str = "freq", - freq_index: int = 0, -) -> hl.expr.BooleanExpression: - """ - Filter to variants with minimum allele frequency. - - :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. - :param min_af: Minimum allele frequency cutoff. - :param freq_field: Name of field in MT/HT that contains frequency information. - Default is 'freq'. - :param freq_index: Which index of frequency struct to use. Default is 0. - """ - return expr_from_field(t_or_expr, freq_field)[freq_index].AF > min_af - - -def max_af_filter_expr( - t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression], - max_af: float, - freq_field: str = "freq", - freq_index: int = 0, -) -> hl.expr.BooleanExpression: - """ - Filter to variants with minimum allele frequency. - - :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering. - :param max_af: Maximum allele frequency cutoff. - :param freq_field: Name of field in MT/HT that contains frequency information. - Default is 'freq'. - :param freq_index: Which index of frequency struct to use. Default is 0. - """ - return expr_from_field(t_or_expr, freq_field)[freq_index].AF < max_af - - def get_summary_stats_variant_filter_expr( t: Union[hl.Table, hl.MatrixTable], filter_lcr: bool = False, filter_expr: hl.expr.SetExpression = None, freq_expr: hl.expr.SetExpression = None, - max_af: Optional[float] = None, + max_af: Optional[Union[float, List[float]]] = None, min_an_proportion: Optional[float] = None, collapse_filters: bool = False, ) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]: @@ -442,7 +372,8 @@ def get_summary_stats_variant_filter_expr( :param filter_lcr: Whether to filter out low confidence regions. Default is False. :param filter_expr: SetExpression containing variant filters. Default is None. :param freq_expr: SetExpression containing frequency information. Default is None. - :param max_af: Maximum allele frequency cutoff. Default is None. + :param max_af: Maximum allele frequency cutoff(s). Can be a single float or a list + of floats. Default is None. :param min_an_proportion: Minimum allele number proportion (used as a proxy for call rate). Default is None. :param collapse_filters: Whether to collapse all filters into a single expression. @@ -450,30 +381,33 @@ def get_summary_stats_variant_filter_expr( :return: BooleanExpression or Dict of BooleanExpressions for filtering variants. """ if max_af is not None and freq_expr is None: - raise ValueError("") + raise ValueError("Frequency expression must be provided when filtering by AF!") + log_list = [] ss_filter_expr = {"all_variants": True} if filter_lcr: - logger.info("Filtering out low confidence regions...") + log_list.append("variants in low confidence regions") ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False) if filter_expr is not None: - logger.info( - "Adding filter expression for variants that pass all variant QC filters..." - ) - ss_filter_expr["pass_filters"] = pass_filter_expr(filter_expr) + log_list.append("variants that pass all variant QC filters") + ss_filter_expr["pass_filters"] = hl.len(filter_expr) == 0 if max_af is not None: - logger.info("Filtering to variants with (AF < %.2f)...", max_af) - ss_filter_expr[f"max_{max_af}"] = max_af_filter_expr(freq_expr, max_af) + if isinstance(max_af, float): + max_af = [max_af] + for af in max_af: + log_list.append(f"variants with (AF < {af:.2e})") + ss_filter_expr[f"max_{af}"] = freq_expr < af if min_an_proportion is not None: - logger.info( - "Using AN (as a call rate proxy) to filter to variants that meet a minimum" - " call rate of %.2f...", - min_an_proportion, + log_list.append( + "variants that meet a minimum call rate of %.2f (using AN as a call rate " + "proxy)" % min_an_proportion, ) ss_filter_expr[f"max_{max_af}"] = get_an_criteria( t, an_proportion_cutoff=min_an_proportion ) + logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list)) + if collapse_filters: if len(ss_filter_expr) == 1: logger.warning("No filtering applied to variants for summary stats.") @@ -548,26 +482,18 @@ def _create_filter_by_csq( ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set) # Add filtering expressions for LoF consequence labels. - lof_labels = ( - { - f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False) - for lof_label in lof_label_set - } - if lof_label_set - else {} - ) + lof_labels = { + f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False) + for lof_label in lof_label_set or [] + } # Add filtering expressions for LoF consequence flags. - lof_flags = ( - { - f"lof_flag_{lof_flag}": hl.or_else( - t.lof_flags.split(",").contains(lof_flag), False - ) - for lof_flag in lof_flag_set - } - if lof_flag_set - else {} - ) + lof_flags = { + f"lof_flag_{lof_flag}": hl.or_else( + t.lof_flags.split(",").contains(lof_flag), False + ) + for lof_flag in lof_flag_set or [] + } # Add filtering expressions for HC LoF variants with no flags or any flags. lof_hc_flags = {} @@ -592,15 +518,13 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set} - lof_combo = { - { + for v, v_e in lof_csq.items(): + lof_combo = { **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, - **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels if l != "HC"}, + **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"}, **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()}, } - for v, v_e in lof_csq.items() - } - ss_filter_expr.update(lof_combo) + ss_filter_expr.update(lof_combo) if not ss_filter_expr: logger.warning("No filtering applied to consequences for summary stats.") diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 33a999db4..9c16a189f 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -108,7 +108,9 @@ def annotate_with_ht( present in `annotation_ht`. Default is False. :return: Annotated MatrixTable. """ - annotation_ht = annotation_ht.select(*fields) + if fields is not None: + annotation_ht = annotation_ht.select(*fields) + if filter_missing: logger.info("Filtering input to variants in the supplied annotation HT...") From 7dcb9e31e3739dfc6052d51e915774706090f2da Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 13:59:57 -0600 Subject: [PATCH 05/26] Remove unused imports --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 57eba3bb6..6d9b2b6a0 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -2,7 +2,7 @@ import functools import logging import operator -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Union import hail as hl From 5071ffad13b61fdd4580c316206a9e643015f89c Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 14:10:17 -0600 Subject: [PATCH 06/26] Small name change --- gnomad/assessment/summary_stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 6d9b2b6a0..e050bfbce 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -396,13 +396,13 @@ def get_summary_stats_variant_filter_expr( max_af = [max_af] for af in max_af: log_list.append(f"variants with (AF < {af:.2e})") - ss_filter_expr[f"max_{af}"] = freq_expr < af + ss_filter_expr[f"max_af_{af}"] = freq_expr < af if min_an_proportion is not None: log_list.append( "variants that meet a minimum call rate of %.2f (using AN as a call rate " "proxy)" % min_an_proportion, ) - ss_filter_expr[f"max_{max_af}"] = get_an_criteria( + ss_filter_expr[f"min_an_{max_af}"] = get_an_criteria( t, an_proportion_cutoff=min_an_proportion ) From e6aa0d9e8412ed823e895d65a75b52ea8dd1eb40 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 14:13:46 -0600 Subject: [PATCH 07/26] remove "HC" from name --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index e050bfbce..a6d8bcee1 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -520,7 +520,7 @@ def _create_filter_by_csq( lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { - **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, + **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"}, **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()}, } From aa070c2b030b827da6e75cc383c984f6bbcbd79e Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 9 May 2024 14:17:02 -0600 Subject: [PATCH 08/26] remove "lof_HC" from combo --- gnomad/assessment/summary_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index a6d8bcee1..a5a996fd6 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -517,11 +517,12 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: + lof_labels.pop("lof_HC") lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, - **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"}, + **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()}, **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()}, } ss_filter_expr.update(lof_combo) From 76fa6d353d85ab647adfddfa29d8564302984163 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 10 May 2024 07:55:05 -0600 Subject: [PATCH 09/26] A little more clean-up --- gnomad/assessment/summary_stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index a5a996fd6..773ac9b0a 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -476,10 +476,10 @@ def _create_filter_by_csq( return csq_set.contains(t.most_severe_csq) # Create filtering expressions for each consequence set. - ss_filter_expr = {} - for filter_name, csq_set in csq_filters.items(): - if csq_set is not None: - ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set) + ss_filter_expr = { + filter_name: _create_filter_by_csq(t, csq_set) + for filter_name, csq_set in csq_filters.items() + } # Add filtering expressions for LoF consequence labels. lof_labels = { From 7a9704556c352e053f54eb67abb50d6afb37681f Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 10 May 2024 08:09:42 -0600 Subject: [PATCH 10/26] fix issues introduced in last commit --- gnomad/assessment/summary_stats.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 773ac9b0a..370d86c1a 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -449,15 +449,12 @@ def get_summary_stats_csq_filter_expr( :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences. """ # Set up filters for specific consequences or sets of consequences. - csq_filters = {"lof": lof_csq_set} - if additional_csq_sets is not None: - csq_filters.update(additional_csq_sets) - - if lof_loftee_combinations and lof_csq_set is not None: - additional_csqs = set(lof_csq_set) | set(additional_csqs or []) - - if additional_csqs is not None: - csq_filters.update({c: {c} for c in additional_csqs}) + csq_filters = { + **({"csq_set_lof": lof_csq_set or {}}), + **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}), + **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}), + **({f"csq_{c}": {c} for c in additional_csqs or []}), + } def _create_filter_by_csq( t: Union[hl.Table, hl.MatrixTable], @@ -518,7 +515,7 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: lof_labels.pop("lof_HC") - lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set} + lof_csq = {v: ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, From 5ceac9704277dd7dbba34752b3c8f1bf39150c0e Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 10 May 2024 08:17:46 -0600 Subject: [PATCH 11/26] forgot "lof_csq_" in combos --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 370d86c1a..447053c04 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -515,7 +515,7 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: lof_labels.pop("lof_HC") - lof_csq = {v: ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set} + lof_csq = {f"lof_csq_{v}": ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, From bc04ca53550c591c7ccedff2ef3a5dccc3b92b69 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 10 May 2024 17:31:34 -0600 Subject: [PATCH 12/26] use variant_qc_pass --- gnomad/assessment/summary_stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 447053c04..467420832 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -384,13 +384,13 @@ def get_summary_stats_variant_filter_expr( raise ValueError("Frequency expression must be provided when filtering by AF!") log_list = [] - ss_filter_expr = {"all_variants": True} + ss_filter_expr = {} if filter_lcr: log_list.append("variants in low confidence regions") ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False) if filter_expr is not None: log_list.append("variants that pass all variant QC filters") - ss_filter_expr["pass_filters"] = hl.len(filter_expr) == 0 + ss_filter_expr["variant_qc_pass"] = hl.len(filter_expr) == 0 if max_af is not None: if isinstance(max_af, float): max_af = [max_af] @@ -409,7 +409,7 @@ def get_summary_stats_variant_filter_expr( logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list)) if collapse_filters: - if len(ss_filter_expr) == 1: + if len(ss_filter_expr) == 0: logger.warning("No filtering applied to variants for summary stats.") ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values()) From 0e7e297a197a95522651fe0546f8064b525167bf Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 13 May 2024 10:41:29 -0600 Subject: [PATCH 13/26] Apply suggestions from code review Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com> --- gnomad/utils/annotations.py | 4 ++-- gnomad/utils/filtering.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 9c16a189f..efa583e71 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -96,7 +96,7 @@ def annotate_with_ht( fields: Optional[List] = None, annotate_cols: bool = False, filter_missing: bool = False, -) -> hl.MatrixTable: +) -> Union[hl.MatrixTable, hl.Table]: """ Annotate a MatrixTable/Table with additional annotations from another Table. @@ -106,7 +106,7 @@ def annotate_with_ht( :param annotate_cols: If True, annotate columns instead of rows. Default is False. :param filter_missing: If True, filter out missing rows/cols in `t` that are not present in `annotation_ht`. Default is False. - :return: Annotated MatrixTable. + :return: Annotated MatrixTable/Table. """ if fields is not None: annotation_ht = annotation_ht.select(*fields) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 0bcc2ad96..8b2aea991 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -147,7 +147,7 @@ def low_conf_regions_expr( :param filter_exome_low_coverage_regions: Whether to filter exome low confidence regions :param filter_telomeres_and_centromeres: Whether to filter telomeres and centromeres :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions) - :return: MatrixTable or Table with low confidence regions removed + :return: Bool expression of whether loci are not low confidence (TRUE) or low confidence (FALSE) """ build = get_reference_genome(locus_expr).name if build == "GRCh37": @@ -196,23 +196,23 @@ def low_conf_regions_expr( def filter_low_conf_regions( - mt: Union[hl.MatrixTable, hl.Table], + t: Union[hl.MatrixTable, hl.Table], **kwargs, ) -> Union[hl.MatrixTable, hl.Table]: """ Filter low-confidence regions. - :param mt: MatrixTable or Table to filter. + :param t: MatrixTable or Table to filter. :param kwargs: Keyword arguments to pass to `low_conf_regions_expr`. :return: MatrixTable or Table with low confidence regions removed. """ - filter_criteria = low_conf_regions_expr(mt.locus, **kwargs) - if isinstance(mt, hl.MatrixTable): - mt = mt.filter_rows(filter_criteria) + filter_criteria = low_conf_regions_expr(t.locus, **kwargs) + if isinstance(t, hl.MatrixTable): + t = t.filter_rows(filter_criteria) else: - mt = mt.filter(filter_criteria) + t = t.filter(filter_criteria) - return mt + return t def filter_to_autosomes( From 1320b20d40a0d944f4c844c1c96a45d7c9175fb9 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 13 May 2024 15:32:20 -0600 Subject: [PATCH 14/26] Apply suggestions from code review Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com> --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 467420832..8ae078c2a 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -402,7 +402,7 @@ def get_summary_stats_variant_filter_expr( "variants that meet a minimum call rate of %.2f (using AN as a call rate " "proxy)" % min_an_proportion, ) - ss_filter_expr[f"min_an_{max_af}"] = get_an_criteria( + ss_filter_expr[f"min_an_{min_an_proportion}"] = get_an_criteria( t, an_proportion_cutoff=min_an_proportion ) From 6b6048084baa9395a4d8e4f57d17a99ab394800a Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:02:00 -0600 Subject: [PATCH 15/26] Remove `collapse_filters` and `lof_flag_set` to simplify PR for current needs --- gnomad/assessment/summary_stats.py | 62 +++++++++--------------------- 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 467420832..6e2852a6c 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -363,8 +363,7 @@ def get_summary_stats_variant_filter_expr( freq_expr: hl.expr.SetExpression = None, max_af: Optional[Union[float, List[float]]] = None, min_an_proportion: Optional[float] = None, - collapse_filters: bool = False, -) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]: +) -> Dict[str, hl.expr.BooleanExpression]: """ Generate variant filtering expression for summary stats. @@ -378,7 +377,7 @@ def get_summary_stats_variant_filter_expr( call rate). Default is None. :param collapse_filters: Whether to collapse all filters into a single expression. Default is False. - :return: BooleanExpression or Dict of BooleanExpressions for filtering variants. + :return: Dict of BooleanExpressions for filtering variants. """ if max_af is not None and freq_expr is None: raise ValueError("Frequency expression must be provided when filtering by AF!") @@ -408,11 +407,6 @@ def get_summary_stats_variant_filter_expr( logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list)) - if collapse_filters: - if len(ss_filter_expr) == 0: - logger.warning("No filtering applied to variants for summary stats.") - ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values()) - return ss_filter_expr @@ -420,21 +414,18 @@ def get_summary_stats_csq_filter_expr( t: Union[hl.Table, hl.MatrixTable, hl.StructExpression], lof_csq_set: Optional[Set[str]] = None, lof_label_set: Optional[Set[str]] = None, - lof_flag_set: Optional[Set[str]] = None, lof_no_flags: bool = False, lof_any_flags: bool = False, lof_loftee_combinations: bool = False, additional_csq_sets: Optional[Dict[str, Set[str]]] = None, additional_csqs: Optional[Set[str]] = None, - collapse_filters: bool = False, -) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]: +) -> Dict[str, hl.expr.BooleanExpression]: """ Generate consequence filtering expression for summary stats. :param t: Input Table/MatrixTable/StructExpression. :param lof_csq_set: Set of LoF consequence strings. Default is None. :param lof_label_set: Set of LoF consequence labels. Default is None. - :param lof_flag_set: Set of LoF consequence flags. Default is None. :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is False. :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is @@ -444,16 +435,26 @@ def get_summary_stats_csq_filter_expr( :param additional_csq_sets: Dictionary containing additional consequence sets. Default is None. :param additional_csqs: Set of additional consequences to keep. Default is None. - :param collapse_filters: Whether to collapse all filters into a single expression. - Default is False. - :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences. + :return: Dict of BooleanExpressions for filtering consequences. """ + if (lof_no_flags or lof_any_flags) and "no_lof_flags" not in t.row: + raise ValueError( + "The required 'no_lof_flags' annotation is not found in input " + "Table/MatrixTable/StructExpression." + ) + if (lof_label_set or lof_no_flags or lof_any_flags) and "lof" not in t.row: + raise ValueError( + "The required 'lof' annotation is not found in input " + "Table/MatrixTable/StructExpression." + ) + # Set up filters for specific consequences or sets of consequences. csq_filters = { **({"csq_set_lof": lof_csq_set or {}}), **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}), **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}), **({f"csq_{c}": {c} for c in additional_csqs or []}), + **({f"csq_{c}": {c} for c in additional_csqs or []}), } def _create_filter_by_csq( @@ -484,33 +485,17 @@ def _create_filter_by_csq( for lof_label in lof_label_set or [] } - # Add filtering expressions for LoF consequence flags. - lof_flags = { - f"lof_flag_{lof_flag}": hl.or_else( - t.lof_flags.split(",").contains(lof_flag), False - ) - for lof_flag in lof_flag_set or [] - } - # Add filtering expressions for HC LoF variants with no flags or any flags. lof_hc_flags = {} if lof_no_flags or lof_any_flags: hc_expr = hl.or_else(t.lof == "HC", False) - if "no_lof_flags" in t.row: - no_lof_flags_expr = t.no_lof_flags - elif "lof_flags" in t.row: - no_lof_flags_expr = hl.is_missing(t.lof_flags) | (t.lof_flags == "") - else: - raise ValueError( - "No LoF flag info found in input Table/MatrixTable/StructExpression." - ) if lof_no_flags: - lof_hc_flags["lof_HC_no_flags"] = hc_expr & no_lof_flags_expr + lof_hc_flags["lof_HC_no_flags"] = hc_expr & t.no_lof_flags if lof_any_flags: - lof_flags["lof_HC_with_flags"] = hc_expr & ~no_lof_flags_expr + lof_hc_flags["lof_HC_with_flags"] = hc_expr & ~t.no_lof_flags # Update summary stats filter expressions with LoF labels and flags. - ss_filter_expr.update({**lof_labels, **lof_flags, **lof_hc_flags}) + ss_filter_expr.update({**lof_labels, **lof_hc_flags}) # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: @@ -520,18 +505,9 @@ def _create_filter_by_csq( lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()}, - **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()}, } ss_filter_expr.update(lof_combo) - if not ss_filter_expr: - logger.warning("No filtering applied to consequences for summary stats.") - return True if collapse_filters else ss_filter_expr - - # Collapse all filters into a single expression if requested. - if collapse_filters: - ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values()) - return ss_filter_expr From 9195c5b44e74dada553bd046c4146ed5156e5848 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:05:00 -0600 Subject: [PATCH 16/26] Update docstrings --- gnomad/assessment/summary_stats.py | 6 +++--- gnomad/utils/filtering.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 6e2852a6c..5dabc015e 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -426,10 +426,10 @@ def get_summary_stats_csq_filter_expr( :param t: Input Table/MatrixTable/StructExpression. :param lof_csq_set: Set of LoF consequence strings. Default is None. :param lof_label_set: Set of LoF consequence labels. Default is None. - :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is - False. - :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is + :param lof_no_flags: Whether to filter to HC LoF variants with no flags. Default is False. + :param lof_any_flags: Whether to filter to HC LoF variants with any flags. Default + is False. :param lof_loftee_combinations: Whether to add combinations of LOFTEE and consequence type filters. Default is False. :param additional_csq_sets: Dictionary containing additional consequence sets. diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 0bcc2ad96..9727bfa18 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -138,7 +138,7 @@ def low_conf_regions_expr( high_conf_regions: Optional[List[str]] = None, ) -> hl.expr.BooleanExpression: """ - Filter low-confidence regions. + Create an expression to filter low confidence regions. :param locus_expr: Locus expression to use for filtering. :param filter_lcr: Whether to filter LCR regions From 595f97a913a18a40ee5b6baebdc969a643259ca1 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:05:51 -0600 Subject: [PATCH 17/26] Update gnomad/assessment/summary_stats.py Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com> --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index c0f2997a2..aa50fd0d7 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -385,7 +385,7 @@ def get_summary_stats_variant_filter_expr( log_list = [] ss_filter_expr = {} if filter_lcr: - log_list.append("variants in low confidence regions") + log_list.append("variants not in low confidence regions") ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False) if filter_expr is not None: log_list.append("variants that pass all variant QC filters") From 524dd5505c7f7540f1b155eef15694f6671d2225 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:09:27 -0600 Subject: [PATCH 18/26] remove unused imports --- gnomad/assessment/summary_stats.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index aa50fd0d7..9f4986aaa 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -1,7 +1,5 @@ # noqa: D100 -import functools import logging -import operator from typing import Dict, List, Optional, Set, Union import hail as hl From 52ff64ff5854049a967032c6bc8fcc4b9c9dfdb3 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:11:01 -0600 Subject: [PATCH 19/26] Update gnomad/assessment/summary_stats.py Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com> --- gnomad/assessment/summary_stats.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 9f4986aaa..862ae3f7b 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -448,7 +448,12 @@ def get_summary_stats_csq_filter_expr( # Set up filters for specific consequences or sets of consequences. csq_filters = { - **({"csq_set_lof": lof_csq_set or {}}), + csq_filters = { + **({"lof": lof_csq_set or {}}), + **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}), + **({f"{c}": {c} for c in lof_csq_set or []}), + **({f"{c}": {c} for c in additional_csqs or []}), + } **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}), **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}), **({f"csq_{c}": {c} for c in additional_csqs or []}), From 652b3840c642ba4ccff555049ccb002c965abc74 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 14:12:43 -0600 Subject: [PATCH 20/26] Fix reviewer's suggestion --- gnomad/assessment/summary_stats.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 862ae3f7b..1471c09d5 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -447,18 +447,12 @@ def get_summary_stats_csq_filter_expr( ) # Set up filters for specific consequences or sets of consequences. - csq_filters = { csq_filters = { **({"lof": lof_csq_set or {}}), **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}), **({f"{c}": {c} for c in lof_csq_set or []}), **({f"{c}": {c} for c in additional_csqs or []}), } - **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}), - **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}), - **({f"csq_{c}": {c} for c in additional_csqs or []}), - **({f"csq_{c}": {c} for c in additional_csqs or []}), - } def _create_filter_by_csq( t: Union[hl.Table, hl.MatrixTable], From f432b59713514d280698310c55f0f08f3fd91c20 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 15:04:43 -0600 Subject: [PATCH 21/26] Fix reviewer's suggestion --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 1471c09d5..273df9aef 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -497,7 +497,7 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: lof_labels.pop("lof_HC") - lof_csq = {f"lof_csq_{v}": ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set} + lof_csq = {k: v for k, v in ss_filter_expr.items() if v in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, From c314780f58dce4de8e5de39a16bd0b3af79b7b7e Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 20 May 2024 17:37:36 -0600 Subject: [PATCH 22/26] check for existence of key instead of value --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 273df9aef..353b6d3b6 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -497,7 +497,7 @@ def _create_filter_by_csq( # Add expressions for LOFTEE and consequence type combinations. if lof_loftee_combinations: lof_labels.pop("lof_HC") - lof_csq = {k: v for k, v in ss_filter_expr.items() if v in lof_csq_set} + lof_csq = {k: v for k, v in ss_filter_expr.items() if k in lof_csq_set} for v, v_e in lof_csq.items(): lof_combo = { **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, From 228adc7f5bfe76d7731b61ebb740097154bb7c78 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 21 May 2024 23:28:04 -0600 Subject: [PATCH 23/26] Remove lof combinations from summary stats filter expressions and update some expressions to be structs of str: boolean --- gnomad/assessment/summary_stats.py | 129 +++++++++++++++++------------ 1 file changed, 75 insertions(+), 54 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 353b6d3b6..b77065363 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -360,11 +360,20 @@ def get_summary_stats_variant_filter_expr( filter_expr: hl.expr.SetExpression = None, freq_expr: hl.expr.SetExpression = None, max_af: Optional[Union[float, List[float]]] = None, - min_an_proportion: Optional[float] = None, -) -> Dict[str, hl.expr.BooleanExpression]: + min_an_proportion: Optional[Union[float, List[float]]] = None, +) -> Dict[str, Union[hl.expr.BooleanExpression, hl.expr.StructExpression]]: """ Generate variant filtering expression for summary stats. + The possible filtering groups are: + + - 'no_lcr' if `filter_lcr` is True. + - 'variant_qc_pass' if `filter_expr` is provided. + - 'max_af' as a struct with a field for each `af` in `max_af` if `max_af` is + provided. + - 'min_an' as a struct with a field for each `an_proportion` in + `min_an_proportion` if `min_an_proportion` is provided. + :param t: Input Table/MatrixTable. :param filter_lcr: Whether to filter out low confidence regions. Default is False. :param filter_expr: SetExpression containing variant filters. Default is None. @@ -373,9 +382,7 @@ def get_summary_stats_variant_filter_expr( of floats. Default is None. :param min_an_proportion: Minimum allele number proportion (used as a proxy for call rate). Default is None. - :param collapse_filters: Whether to collapse all filters into a single expression. - Default is False. - :return: Dict of BooleanExpressions for filtering variants. + :return: Dict of BooleanExpressions or StructExpressions for filtering variants. """ if max_af is not None and freq_expr is None: raise ValueError("Frequency expression must be provided when filtering by AF!") @@ -391,9 +398,10 @@ def get_summary_stats_variant_filter_expr( if max_af is not None: if isinstance(max_af, float): max_af = [max_af] - for af in max_af: - log_list.append(f"variants with (AF < {af:.2e})") - ss_filter_expr[f"max_af_{af}"] = freq_expr < af + log_list.extend([f"variants with (AF < {af:.2e})" for af in max_af]) + ss_filter_expr["max_af"] = hl.struct( + **{f"{af}": freq_expr < af for af in max_af} + ) if min_an_proportion is not None: log_list.append( "variants that meet a minimum call rate of %.2f (using AN as a call rate " @@ -414,46 +422,57 @@ def get_summary_stats_csq_filter_expr( lof_label_set: Optional[Set[str]] = None, lof_no_flags: bool = False, lof_any_flags: bool = False, - lof_loftee_combinations: bool = False, additional_csq_sets: Optional[Dict[str, Set[str]]] = None, additional_csqs: Optional[Set[str]] = None, -) -> Dict[str, hl.expr.BooleanExpression]: +) -> Dict[str, Union[hl.expr.BooleanExpression, hl.expr.StructExpression]]: """ Generate consequence filtering expression for summary stats. + .. note:: + + - Assumes that input Table/MatrixTable/StructExpression contains the required + annotations for the requested filtering groups. + - 'lof' annotation for `lof_csq_set`. + - 'no_lof_flags' annotation for `lof_no_flags` and `lof_any_flags`. + + + The possible filtering groups are: + + - 'lof' if `lof_csq_set` is provided. + - 'loftee_no_flags' if `lof_no_flags` is True. + - 'loftee_with_flags' if `lof_any_flags` is True. + - 'loftee_label' as a struct with a field for each `lof_label` in + `lof_label_set` if provided. + - 'csq' as a struct with a field for each consequence in `additional_csqs` and + `lof_csq_set` if provided. + - 'csq_set' as a struct with a field for each consequence set in + `additional_csq_sets` if provided. This will also have and `lof` field if + `lof_csq_set` is provided. + :param t: Input Table/MatrixTable/StructExpression. :param lof_csq_set: Set of LoF consequence strings. Default is None. :param lof_label_set: Set of LoF consequence labels. Default is None. - :param lof_no_flags: Whether to filter to HC LoF variants with no flags. Default is + :param lof_no_flags: Whether to filter to variants with no flags. Default is False. - :param lof_any_flags: Whether to filter to HC LoF variants with any flags. Default + :param lof_any_flags: Whether to filter to variants with any flags. Default is False. - :param lof_loftee_combinations: Whether to add combinations of LOFTEE and - consequence type filters. Default is False. :param additional_csq_sets: Dictionary containing additional consequence sets. Default is None. :param additional_csqs: Set of additional consequences to keep. Default is None. - :return: Dict of BooleanExpressions for filtering consequences. + :return: Dict of BooleanExpressions or StructExpressions for filtering by + consequence. """ if (lof_no_flags or lof_any_flags) and "no_lof_flags" not in t.row: raise ValueError( "The required 'no_lof_flags' annotation is not found in input " "Table/MatrixTable/StructExpression." ) - if (lof_label_set or lof_no_flags or lof_any_flags) and "lof" not in t.row: + if lof_label_set and "lof" not in t.row: raise ValueError( "The required 'lof' annotation is not found in input " "Table/MatrixTable/StructExpression." ) - # Set up filters for specific consequences or sets of consequences. - csq_filters = { - **({"lof": lof_csq_set or {}}), - **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}), - **({f"{c}": {c} for c in lof_csq_set or []}), - **({f"{c}": {c} for c in additional_csqs or []}), - } - def _create_filter_by_csq( t: Union[hl.Table, hl.MatrixTable], csq_set: Union[Set, List, hl.expr.CollectionExpression], @@ -470,40 +489,42 @@ def _create_filter_by_csq( return csq_set.contains(t.most_severe_csq) - # Create filtering expressions for each consequence set. - ss_filter_expr = { - filter_name: _create_filter_by_csq(t, csq_set) - for filter_name, csq_set in csq_filters.items() + # Set up filters for specific consequences or sets of consequences. + csq_filters = { + "csq": { + **({f"{c}": {c} for c in lof_csq_set or []}), + **({f"{c}": {c} for c in additional_csqs or []}), + }, + "csq_set": { + **({"lof": lof_csq_set or {}}), + **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}), + }, } - # Add filtering expressions for LoF consequence labels. - lof_labels = { - f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False) - for lof_label in lof_label_set or [] + # Create filtering expressions for each consequence/ consequence set. + ss_filter_expr = { + group_name: hl.struct( + **{ + filter_name: _create_filter_by_csq(t, csq_set) + for filter_name, csq_set in group.items() + } + ) + for group_name, group in csq_filters.items() } - # Add filtering expressions for HC LoF variants with no flags or any flags. - lof_hc_flags = {} - if lof_no_flags or lof_any_flags: - hc_expr = hl.or_else(t.lof == "HC", False) - if lof_no_flags: - lof_hc_flags["lof_HC_no_flags"] = hc_expr & t.no_lof_flags - if lof_any_flags: - lof_hc_flags["lof_HC_with_flags"] = hc_expr & ~t.no_lof_flags - - # Update summary stats filter expressions with LoF labels and flags. - ss_filter_expr.update({**lof_labels, **lof_hc_flags}) - - # Add expressions for LOFTEE and consequence type combinations. - if lof_loftee_combinations: - lof_labels.pop("lof_HC") - lof_csq = {k: v for k, v in ss_filter_expr.items() if k in lof_csq_set} - for v, v_e in lof_csq.items(): - lof_combo = { - **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()}, - **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()}, - } - ss_filter_expr.update(lof_combo) + # Add filtering expressions for LOFTEE labels. + ss_filter_expr["loftee_label"] = hl.struct( + **{ + lof_label: hl.or_else(t.lof == lof_label, False) + for lof_label in lof_label_set or [] + } + ) + + # Add filtering expressions variants with no flags or any flags. + if lof_no_flags: + ss_filter_expr["loftee_no_flags"] = t.no_lof_flags + if lof_any_flags: + ss_filter_expr["loftee_with_flags"] = ~t.no_lof_flags return ss_filter_expr From f88a502d19a0af85429f20edc816caf0c028483f Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 22 May 2024 09:04:11 -0600 Subject: [PATCH 24/26] Fix docstring --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index b77065363..876210f48 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -432,10 +432,10 @@ def get_summary_stats_csq_filter_expr( - Assumes that input Table/MatrixTable/StructExpression contains the required annotations for the requested filtering groups. + - 'lof' annotation for `lof_csq_set`. - 'no_lof_flags' annotation for `lof_no_flags` and `lof_any_flags`. - The possible filtering groups are: - 'lof' if `lof_csq_set` is provided. From 192404e6ca4ebfeca15273a5cac1cbf1754eb29a Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 24 May 2024 09:52:30 -0600 Subject: [PATCH 25/26] Update gnomad/assessment/summary_stats.py Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com> --- gnomad/assessment/summary_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 876210f48..a3eb8ab36 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -446,7 +446,7 @@ def get_summary_stats_csq_filter_expr( - 'csq' as a struct with a field for each consequence in `additional_csqs` and `lof_csq_set` if provided. - 'csq_set' as a struct with a field for each consequence set in - `additional_csq_sets` if provided. This will also have and `lof` field if + `additional_csq_sets` if provided. This will also have an `lof` field if `lof_csq_set` is provided. :param t: Input Table/MatrixTable/StructExpression. From ecf79f1017436a3ae1315b0df0ca98fd9c0f288b Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 24 May 2024 10:22:11 -0600 Subject: [PATCH 26/26] Add generate_filter_combinations and get_summary_stats_filter_group_meta --- gnomad/assessment/summary_stats.py | 242 +++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index 876210f48..53dac5e1b 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -1,5 +1,7 @@ # noqa: D100 +import itertools import logging +from copy import deepcopy from typing import Dict, List, Optional, Set, Union import hail as hl @@ -529,6 +531,246 @@ def _create_filter_by_csq( return ss_filter_expr +def generate_filter_combinations( + combos: List[Union[List[str], Dict[str, List[str]]]], + combo_options: Optional[Dict[str, List[str]]] = None, +) -> List[Dict[str, str]]: + """ + Generate list of all possible filter combinations from a list of filter options. + + Example input: + + .. code-block:: python + + [ + {'pass_filters': [False, True]}, + {'pass_filters': [False, True], 'capture': ['ukb', 'broad']} + ] + + Example output: + + .. code-block:: python + + [ + {'pass_filters': False}, + {'pass_filters': True}, + {'pass_filters': False, 'capture': 'ukb'}, + {'pass_filters': False, 'capture': 'broad'}, + {'pass_filters': True, 'capture': 'ukb'}, + {'pass_filters': True, 'capture': 'broad'}, + ] + + :param combos: List of filter groups and their options. + :param combo_options: Dictionary of filter groups and their options that can be + supplied if `combos` is a list of lists. + :return: List of all possible filter combinations for each filter group. + """ + if isinstance(combos[0], list): + if combo_options is None: + raise ValueError( + "If `combos` is a list of lists, `combo_options` must be provided." + ) + combos = [{k: combo_options[k] for k in combo} for combo in combos] + + # Create combinations of filter options. + + def _expand_combinations(filter_dict: Dict[str, List[str]]) -> List[Dict[str, str]]: + """ + Expand filter combinations. + + :param filter_dict: Dictionary of filter options. + :return: List of dicts of expanded filter options. + """ + keys, values = zip(*filter_dict.items()) + return [dict(zip(keys, combo)) for combo in itertools.product(*values)] + + # Flatten list of combinations. + expanded_meta = sum([_expand_combinations(sublist) for sublist in combos], []) + + return expanded_meta + + +def get_summary_stats_filter_group_meta( + all_sum_stat_filters: Dict[str, List[str]], + common_filter_combos: List[List[str]] = None, + common_filter_override: Dict[str, List[str]] = None, + lof_filter_combos: Optional[List[List[str]]] = None, + lof_filter_override: Dict[str, List[str]] = None, + filter_key_rename: Dict[str, str] = None, +) -> List[Dict[str, str]]: + """ + Generate list of filter group combination metadata for summary stats. + + This function combines various filter settings for summary statistics and generates + all possible filter combinations. It ensures that the generated combinations include + both common filters and specific loss-of-function (LOF) filters. + + .. note:: + + - The "variant_qc" filter group is removed if the value is "none", which can + lead to a filter group of {} (no filters). + - The `filter_key_rename` parameter can be used to rename keys in the + `all_sum_stat_filters`, `common_filter_override`, or `lof_filter_override` + after creating all combinations. + + Example: + Given the following input: + + .. code-block:: python + + all_sum_stat_filters = { + "variant_qc": ["none", "pass"], + "capture": ["1", "2"], + "max_af": [0.01], + "lof_csq": ["stop_gained"], + "lof_csq_set": ["lof"], + } + common_filter_combos = [["variant_qc"], ["variant_qc", "capture"]] + common_filter_override = {"variant_qc": ["pass"], "capture": ["1"]} + lof_filter_combos = [ + ["lof_csq_set", "loftee_HC"], + ["lof_csq_set", "loftee_HC", "loftee_flags"], + ["lof_csq", "loftee_HC", "loftee_flags"], + ] + lof_filter_override = {"loftee_HC": ["HC"], "loftee_flags": ["with_flags"]} + filter_key_rename = { + "lof_csq": "csq", + "loftee_HC": "loftee_labels", + "lof_csq_set": "csq_set", + } + + The function will generate the following filter combinations: + + .. code-block:: python + + [ + # Combinations of all common filter keys and their possible values. + {}, + {'capture': '1'}, + {'capture': '2'}, + {'variant_qc': 'pass'}, + {'variant_qc': 'pass', 'capture': '1'}, + {'variant_qc': 'pass', 'capture': '2'}, + + # Combinations of all requested common filter combinations with all + # possible other filter keys and values. + {'variant_qc': 'pass', 'max_af': '0.01'}, + {'variant_qc': 'pass', 'csq': 'stop_gained'}, + {'variant_qc': 'pass', 'csq_set': 'lof'}, + {'variant_qc': 'pass', 'capture': '1', 'max_af': '0.01'}, + {'variant_qc': 'pass', 'capture': '1', 'csq': 'stop_gained'}, + {'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof'}, + + # Combinations of all requested common filter combinations with all + # requested LOF filter combination keys and their requested values. + {'variant_qc': 'pass', 'csq_set': 'lof', 'loftee_labels': 'HC'}, + { + 'variant_qc': 'pass', 'csq_set': 'lof', 'loftee_labels': 'HC', + 'loftee_flags': 'with_flags' + }, + { + 'variant_qc': 'pass', 'csq': 'stop_gained', 'loftee_labels': 'HC', + 'loftee_flags': 'with_flags' + }, + { + 'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof', + 'loftee_labels': 'HC' + }, + { + 'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof', + 'loftee_labels': 'HC', 'loftee_flags': 'with_flags' + }, + { + 'variant_qc': 'pass', 'capture': '1', 'csq': 'stop_gained', + 'loftee_labels': 'HC', 'loftee_flags': 'with_flags' + } + ] + + :param all_sum_stat_filters: Dictionary of all possible filter types. + :param common_filter_combos: Optional list of lists of common filter keys to use + for creating common filter combinations. + :param common_filter_override: Optional dictionary of filter groups and their + options to override the values in `all_sum_stat_filters` for use with values in + `common_filter_combos`. This is only used if `common_filter_combos` is not None. + :param lof_filter_combos: Optional List of loss-of-function keys in all_sum_stat_filters + to use for creating filter combinations. + :param lof_filter_override: Optional Dictionary of filter groups and their options + to override the values in `all_sum_stat_filters` for use with values in + `lof_combos`. This is only used if `lof_filter_combos` is not None. + :param filter_key_rename: Optional dictionary to rename keys in + `all_sum_stat_filters`, `common_filter_override`, or `lof_filter_override` to + final metadata keys. + :return: Dictionary of filter field to metadata. + """ + if common_filter_override is not None and common_filter_combos is None: + raise ValueError( + "If `common_combo_override` is provided, `common_filter_combos` must be " + "provided." + ) + if lof_filter_override is not None and lof_filter_combos is None: + raise ValueError( + "If `lof_filter_override` is provided, `lof_filter_combos` must be " + "provided." + ) + + # Initialize dictionaries and lists to handle cases where the optional parameters + # are not provided. + common_filter_override = common_filter_override or {} + lof_filter_override = lof_filter_override or {} + filter_key_rename = filter_key_rename or {} + + # Generate all possible filter combinations for common filter combinations. + all_sum_stat_filters = deepcopy(all_sum_stat_filters) + filter_combinations = [] + if common_filter_combos is not None: + filter_combinations.extend( + generate_filter_combinations( + [ + {f: all_sum_stat_filters[f] for f in combo} + for combo in common_filter_combos + ] + ) + ) + common_filter_combos = common_filter_combos or [[]] + + # Update the all_sum_stat_filters dictionary with common_filter_override values and + # remove all filters in common_filter_combos from all_sum_stat_filters and + # into a common_filters dictionary. + all_sum_stat_filters.update(common_filter_override) + common_filters = { + k: all_sum_stat_filters.pop(k) for k in set(sum(common_filter_combos, [])) + } + + # Add combinations of common filters with all other filters. + filter_combinations.extend( + generate_filter_combinations( + [c + [f] for c in common_filter_combos for f in all_sum_stat_filters], + {**all_sum_stat_filters, **common_filters}, + ) + ) + + if lof_filter_combos is not None: + # Add combinations of common filters with LOF specific filters. + all_sum_stat_filters.update(lof_filter_override) + filter_combinations.extend( + generate_filter_combinations( + [c + f for c in common_filter_combos for f in lof_filter_combos], + {**all_sum_stat_filters, **common_filters}, + ) + ) + + filter_combinations = [ + { + filter_key_rename.get(str(k), str(k)): str(v) + for k, v in filter_group.items() + if not (k == "variant_qc" and v == "none") + } + for filter_group in filter_combinations + ] + + return filter_combinations + + def default_generate_gene_lof_matrix( mt: hl.MatrixTable, tx_ht: Optional[hl.Table],