From bcc7f505bd2ec252a083dc1d50fd51efb1a262cb Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 08:13:51 -0600
Subject: [PATCH 01/26] Add summary stats filtering expression functions

---
 gnomad/assessment/summary_stats.py | 264 ++++++++++++++++++++++++++++-
 1 file changed, 261 insertions(+), 3 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 6d9383af2..8213098f0 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -1,11 +1,12 @@
 # noqa: D100
-
+import functools
 import logging
-from typing import Dict, Optional, Set
+import operator
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import hail as hl
 
-from gnomad.utils.filtering import filter_low_conf_regions
+from gnomad.utils.filtering import filter_low_conf_regions, low_conf_regions_expr
 from gnomad.utils.vep import (
     LOF_CSQ_SET,
     add_most_severe_consequence_to_consequence,
@@ -355,6 +356,263 @@ def get_tx_expression_expr(
     )
 
 
+def expr_from_field(
+    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.Expression],
+    field: Optional[str] = None,
+) -> hl.expr.Expression:
+    """
+    Return expression from input MatrixTable/Table or Expression.
+
+    :param t_or_expr: Input MatrixTable/Table/Expression.
+    :param field: Field name to extract from input MT/HT. Default is None.
+    :return: Expression from input MT/HT or input Expression.
+    """
+    if isinstance(t_or_expr, hl.MatrixTable) or isinstance(t_or_expr, hl.Table):
+        if field is None:
+            raise ValueError("Field must be provided when input is MatrixTable/Table.")
+        t_or_expr = t_or_expr[field]
+
+    return t_or_expr
+
+
+def pass_filter_expr(
+    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
+    filter_field: str = "filters",
+) -> hl.expr.BooleanExpression:
+    """
+    Filter to PASS variants.
+
+    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
+    :param filter_field: Name of field in MT/HT that contains variant filters. Default
+        is 'filters'.
+    :return: BooleanExpression for filtering to PASS variants.
+    """
+    return hl.len(expr_from_field(t_or_expr, filter_field)) == 0
+
+
+def min_af_filter_expr(
+    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
+    min_af: float,
+    freq_field: str = "freq",
+    freq_index: int = 0,
+) -> hl.expr.BooleanExpression:
+    """
+    Filter to variants with minimum allele frequency.
+
+    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
+    :param min_af: Minimum allele frequency cutoff.
+    :param freq_field: Name of field in MT/HT that contains frequency information.
+        Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    """
+    return expr_from_field(t_or_expr, freq_field)[freq_index].AF > min_af
+
+
+def max_af_filter_expr(
+    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
+    max_af: float,
+    freq_field: str = "freq",
+    freq_index: int = 0,
+) -> hl.expr.BooleanExpression:
+    """
+    Filter to variants with minimum allele frequency.
+
+    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
+    :param max_af: Maximum allele frequency cutoff.
+    :param freq_field: Name of field in MT/HT that contains frequency information.
+        Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    """
+    return expr_from_field(t_or_expr, freq_field)[freq_index].AF < max_af
+
+
+def get_summary_stats_variant_filter_expr(
+    t: Union[hl.Table, hl.MatrixTable],
+    filter_lcr: bool = False,
+    filter_expr: hl.expr.SetExpression = None,
+    freq_expr: hl.expr.SetExpression = None,
+    max_af: Optional[float] = None,
+    min_an_proportion: Optional[float] = None,
+    collapse_filters: bool = False,
+) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]:
+    """
+    Generate variant filtering expression for summary stats.
+
+    :param t: Input Table/MatrixTable.
+    :param filter_lcr: Whether to filter out low confidence regions. Default is False.
+    :param filter_expr: SetExpression containing variant filters. Default is None.
+    :param freq_expr: SetExpression containing frequency information. Default is None.
+    :param max_af: Maximum allele frequency cutoff. Default is None.
+    :param min_an_proportion: Minimum allele number proportion (used as a proxy for
+        call rate). Default is None.
+    :param collapse_filters: Whether to collapse all filters into a single expression.
+        Default is False.
+    :return: BooleanExpression or Dict of BooleanExpressions for filtering variants.
+    """
+    if max_af is not None and freq_expr is None:
+        raise ValueError("")
+
+    ss_filter_expr = {"all_variants": True}
+    if filter_lcr:
+        logger.info("Filtering out low confidence regions...")
+        ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False)
+    if filter_expr is not None:
+        logger.info(
+            "Adding filter expression for variants that pass all variant QC filters..."
+        )
+        ss_filter_expr["pass_filters"] = pass_filter_expr(filter_expr)
+    if max_af is not None:
+        logger.info("Filtering to variants with (AF < %.2f)...", max_af)
+        ss_filter_expr[f"max_{max_af}"] = max_af_filter_expr(freq_expr, max_af)
+    if min_an_proportion is not None:
+        logger.info(
+            "Using AN (as a call rate proxy) to filter to variants that meet a minimum"
+            " call rate of %.2f...",
+            min_an_proportion,
+        )
+        ss_filter_expr[f"max_{max_af}"] = get_an_criteria(
+            t, an_proportion_cutoff=min_an_proportion
+        )
+
+    if collapse_filters:
+        if len(ss_filter_expr) == 1:
+            logger.warning("No filtering applied to variants for summary stats.")
+        ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values())
+
+    return ss_filter_expr
+
+
+def get_summary_stats_csq_filter_expr(
+    t: Union[hl.Table, hl.MatrixTable, hl.StructExpression],
+    lof_csq_set: Optional[Set[str]] = None,
+    lof_label_set: Optional[Set[str]] = None,
+    lof_flag_set: Optional[Set[str]] = None,
+    lof_no_flags: bool = False,
+    lof_any_flags: bool = False,
+    lof_loftee_combinations: bool = False,
+    additional_csq_sets: Optional[Dict[str, Set[str]]] = None,
+    additional_csqs: Optional[Set[str]] = None,
+    collapse_filters: bool = False,
+) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]:
+    """
+    Generate consequence filtering expression for summary stats.
+
+    :param t: Input Table/MatrixTable/StructExpression.
+    :param lof_csq_set: Set of LoF consequence strings. Default is None.
+    :param lof_label_set: Set of LoF consequence labels. Default is None.
+    :param lof_flag_set: Set of LoF consequence flags. Default is None.
+    :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is
+        False.
+    :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is
+        False.
+    :param lof_loftee_combinations: Whether to add combinations of LOFTEE and
+        consequence type filters. Default is False.
+    :param additional_csq_sets: Dictionary containing additional consequence sets.
+        Default is None.
+    :param additional_csqs: Set of additional consequences to keep. Default is None.
+    :param collapse_filters: Whether to collapse all filters into a single expression.
+        Default is False.
+    :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences.
+    """
+    # Set up filters for specific consequences or sets of consequences.
+    csq_filters = {"lof": lof_csq_set}
+    if additional_csq_sets is not None:
+        csq_filters.update(additional_csq_sets)
+
+    if lof_loftee_combinations and lof_csq_set is not None:
+        additional_csqs = set(lof_csq_set) | set(additional_csqs or [])
+
+    if additional_csqs is not None:
+        csq_filters.update({c: {c} for c in additional_csqs})
+
+    def _create_filter_by_csq(
+        t: Union[hl.Table, hl.MatrixTable],
+        csq_set: Union[Set, List, hl.expr.CollectionExpression],
+    ) -> hl.expr.BooleanExpression:
+        """
+        Create filtering expression for a set of consequences.
+
+        :param t: Input Table/MatrixTable.
+        :param csq_set: Set of consequences to filter.
+        :return: BooleanExpression for filtering by consequence.
+        """
+        if not isinstance(csq_set, hl.expr.CollectionExpression):
+            csq_set = hl.set(csq_set)
+
+        return csq_set.contains(t.most_severe_csq)
+
+    # Create filtering expressions for each consequence set.
+    ss_filter_expr = {}
+    for filter_name, csq_set in csq_filters.items():
+        if csq_set is not None:
+            ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set)
+
+    # Add filtering expressions for LoF consequence labels.
+    lof_labels = (
+        {
+            f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False)
+            for lof_label in lof_label_set
+        }
+        if lof_label_set
+        else {}
+    )
+
+    # Add filtering expressions for LoF consequence flags.
+    lof_flags = (
+        {
+            f"lof_flag_{lof_flag}": hl.or_else(
+                t.lof_flags.split(",").contains(lof_flag), False
+            )
+            for lof_flag in lof_flag_set
+        }
+        if lof_flag_set
+        else {}
+    )
+
+    # Add filtering expressions for HC LoF variants with no flags or any flags.
+    lof_hc_flags = {}
+    if lof_no_flags or lof_any_flags:
+        hc_expr = hl.or_else(t.lof == "HC", False)
+        if "no_lof_flags" in t.row:
+            no_lof_flags_expr = t.no_lof_flags
+        elif "lof_flags" in t.row:
+            no_lof_flags_expr = hl.is_missing(t.lof_flags) | (t.lof_flags == "")
+        else:
+            raise ValueError(
+                "No LoF flag info found in input Table/MatrixTable/StructExpression."
+            )
+        if lof_no_flags:
+            lof_hc_flags["lof_HC_no_flags"] = hc_expr & no_lof_flags_expr
+        if lof_any_flags:
+            lof_flags["lof_HC_with_flags"] = hc_expr & ~no_lof_flags_expr
+
+    # Update summary stats filter expressions with LoF labels and flags.
+    ss_filter_expr.update({**lof_labels, **lof_flags, **lof_hc_flags})
+
+    # Add expressions for LOFTEE and consequence type combinations.
+    if lof_loftee_combinations:
+        lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set}
+        lof_combo = {
+            {
+                **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
+                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels if l != "HC"},
+                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()},
+            }
+            for v, v_e in lof_csq.items()
+        }
+        ss_filter_expr.update(lof_combo)
+
+    if not ss_filter_expr:
+        logger.warning("No filtering applied to consequences for summary stats.")
+        return True if collapse_filters else ss_filter_expr
+
+    # Collapse all filters into a single expression if requested.
+    if collapse_filters:
+        ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values())
+
+    return ss_filter_expr
+
+
 def default_generate_gene_lof_matrix(
     mt: hl.MatrixTable,
     tx_ht: Optional[hl.Table],

From 5233d9c099e7d37b4ea62dfae737a301e2dece56 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 08:18:30 -0600
Subject: [PATCH 02/26] Add `low_conf_regions_expr` function to return
 `filter_low_conf_regions` as an expression instead

---
 gnomad/utils/filtering.py | 49 +++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py
index 62049e791..0bcc2ad96 100644
--- a/gnomad/utils/filtering.py
+++ b/gnomad/utils/filtering.py
@@ -128,19 +128,19 @@ def combine_functions(
     return cond
 
 
-def filter_low_conf_regions(
-    mt: Union[hl.MatrixTable, hl.Table],
+def low_conf_regions_expr(
+    locus_expr: hl.expr.LocusExpression,
     filter_lcr: bool = True,
     filter_decoy: bool = True,
     filter_segdup: bool = True,
     filter_exome_low_coverage_regions: bool = False,
     filter_telomeres_and_centromeres: bool = False,
     high_conf_regions: Optional[List[str]] = None,
-) -> Union[hl.MatrixTable, hl.Table]:
+) -> hl.expr.BooleanExpression:
     """
     Filter low-confidence regions.
 
-    :param mt: MatrixTable or Table to filter
+    :param locus_expr: Locus expression to use for filtering.
     :param filter_lcr: Whether to filter LCR regions
     :param filter_decoy: Whether to filter decoy regions
     :param filter_segdup: Whether to filter Segdup regions
@@ -149,28 +149,30 @@ def filter_low_conf_regions(
     :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
     :return: MatrixTable or Table with low confidence regions removed
     """
-    build = get_reference_genome(mt.locus).name
+    build = get_reference_genome(locus_expr).name
     if build == "GRCh37":
         import gnomad.resources.grch37.reference_data as resources
     elif build == "GRCh38":
         import gnomad.resources.grch38.reference_data as resources
+    else:
+        raise ValueError(f"Unsupported reference genome build: {build}")
 
     criteria = []
     if filter_lcr:
         lcr = resources.lcr_intervals.ht()
-        criteria.append(hl.is_missing(lcr[mt.locus]))
+        criteria.append(hl.is_missing(lcr[locus_expr]))
 
     if filter_decoy:
         decoy = resources.decoy_intervals.ht()
-        criteria.append(hl.is_missing(decoy[mt.locus]))
+        criteria.append(hl.is_missing(decoy[locus_expr]))
 
     if filter_segdup:
         segdup = resources.seg_dup_intervals.ht()
-        criteria.append(hl.is_missing(segdup[mt.locus]))
+        criteria.append(hl.is_missing(segdup[locus_expr]))
 
     if filter_exome_low_coverage_regions:
         high_cov = resources.high_coverage_intervals.ht()
-        criteria.append(hl.is_missing(high_cov[mt.locus]))
+        criteria.append(hl.is_missing(high_cov[locus_expr]))
 
     if filter_telomeres_and_centromeres:
         if build != "GRCh38":
@@ -179,19 +181,36 @@ def filter_low_conf_regions(
             )
 
         telomeres_and_centromeres = resources.telomeres_and_centromeres.ht()
-        criteria.append(hl.is_missing(telomeres_and_centromeres[mt.locus]))
+        criteria.append(hl.is_missing(telomeres_and_centromeres[locus_expr]))
 
     if high_conf_regions is not None:
         for region in high_conf_regions:
             region = hl.import_locus_intervals(region)
-            criteria.append(hl.is_defined(region[mt.locus]))
+            criteria.append(hl.is_defined(region[locus_expr]))
 
     if criteria:
         filter_criteria = functools.reduce(operator.iand, criteria)
-        if isinstance(mt, hl.MatrixTable):
-            mt = mt.filter_rows(filter_criteria)
-        else:
-            mt = mt.filter(filter_criteria)
+        return filter_criteria
+    else:
+        raise ValueError("No low confidence regions requested for filtering.")
+
+
+def filter_low_conf_regions(
+    mt: Union[hl.MatrixTable, hl.Table],
+    **kwargs,
+) -> Union[hl.MatrixTable, hl.Table]:
+    """
+    Filter low-confidence regions.
+
+    :param mt: MatrixTable or Table to filter.
+    :param kwargs: Keyword arguments to pass to `low_conf_regions_expr`.
+    :return: MatrixTable or Table with low confidence regions removed.
+    """
+    filter_criteria = low_conf_regions_expr(mt.locus, **kwargs)
+    if isinstance(mt, hl.MatrixTable):
+        mt = mt.filter_rows(filter_criteria)
+    else:
+        mt = mt.filter(filter_criteria)
 
     return mt
 

From 67a0323e9d3c232721a2f9216f6fa3293a116f78 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 08:20:25 -0600
Subject: [PATCH 03/26] Add a simple function to annotate a HT/MT with another
 HT

---
 gnomad/utils/annotations.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py
index 26f23438e..33a999db4 100644
--- a/gnomad/utils/annotations.py
+++ b/gnomad/utils/annotations.py
@@ -90,6 +90,41 @@
 }
 
 
+def annotate_with_ht(
+    t: Union[hl.MatrixTable, hl.Table],
+    annotation_ht: hl.Table,
+    fields: Optional[List] = None,
+    annotate_cols: bool = False,
+    filter_missing: bool = False,
+) -> hl.MatrixTable:
+    """
+    Annotate a MatrixTable/Table with additional annotations from another Table.
+
+    :param t: MatrixTable/Table to be annotated.
+    :param annotation_ht: Table containing additional annotations to be joined on `t`.
+    :param fields: Optional list of fields to select from `annotation_ht` and add to `t`
+    :param annotate_cols: If True, annotate columns instead of rows. Default is False.
+    :param filter_missing: If True, filter out missing rows/cols in `t` that are not
+        present in `annotation_ht`. Default is False.
+    :return: Annotated MatrixTable.
+    """
+    annotation_ht = annotation_ht.select(*fields)
+    if filter_missing:
+        logger.info("Filtering input to variants in the supplied annotation HT...")
+
+    if isinstance(t, hl.Table):
+        t = t.semi_join(annotation_ht)
+        t = t.annotate(**annotation_ht[t.key])
+    elif annotate_cols:
+        t = t.semi_join_cols(annotation_ht)
+        t = t.annotate_cols(**annotation_ht[t.col_key])
+    else:
+        t = t.semi_join_rows(annotation_ht)
+        t = t.annotate_rows(**annotation_ht[t.row_key])
+
+    return t
+
+
 def pop_max_expr(
     freq: hl.expr.ArrayExpression,
     freq_meta: hl.expr.ArrayExpression,

From 0a995c26315697538d62bcde07b1d4df952edf17 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 12:36:28 -0600
Subject: [PATCH 04/26] Changes needed while testing

---
 gnomad/assessment/summary_stats.py | 140 +++++++----------------------
 gnomad/utils/annotations.py        |   4 +-
 2 files changed, 35 insertions(+), 109 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 8213098f0..57eba3bb6 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -356,82 +356,12 @@ def get_tx_expression_expr(
     )
 
 
-def expr_from_field(
-    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.Expression],
-    field: Optional[str] = None,
-) -> hl.expr.Expression:
-    """
-    Return expression from input MatrixTable/Table or Expression.
-
-    :param t_or_expr: Input MatrixTable/Table/Expression.
-    :param field: Field name to extract from input MT/HT. Default is None.
-    :return: Expression from input MT/HT or input Expression.
-    """
-    if isinstance(t_or_expr, hl.MatrixTable) or isinstance(t_or_expr, hl.Table):
-        if field is None:
-            raise ValueError("Field must be provided when input is MatrixTable/Table.")
-        t_or_expr = t_or_expr[field]
-
-    return t_or_expr
-
-
-def pass_filter_expr(
-    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
-    filter_field: str = "filters",
-) -> hl.expr.BooleanExpression:
-    """
-    Filter to PASS variants.
-
-    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
-    :param filter_field: Name of field in MT/HT that contains variant filters. Default
-        is 'filters'.
-    :return: BooleanExpression for filtering to PASS variants.
-    """
-    return hl.len(expr_from_field(t_or_expr, filter_field)) == 0
-
-
-def min_af_filter_expr(
-    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
-    min_af: float,
-    freq_field: str = "freq",
-    freq_index: int = 0,
-) -> hl.expr.BooleanExpression:
-    """
-    Filter to variants with minimum allele frequency.
-
-    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
-    :param min_af: Minimum allele frequency cutoff.
-    :param freq_field: Name of field in MT/HT that contains frequency information.
-        Default is 'freq'.
-    :param freq_index: Which index of frequency struct to use. Default is 0.
-    """
-    return expr_from_field(t_or_expr, freq_field)[freq_index].AF > min_af
-
-
-def max_af_filter_expr(
-    t_or_expr: Union[hl.MatrixTable, hl.Table, hl.expr.CollectionExpression],
-    max_af: float,
-    freq_field: str = "freq",
-    freq_index: int = 0,
-) -> hl.expr.BooleanExpression:
-    """
-    Filter to variants with minimum allele frequency.
-
-    :param t_or_expr: Input MatrixTable/Table/CollectionExpression to use for filtering.
-    :param max_af: Maximum allele frequency cutoff.
-    :param freq_field: Name of field in MT/HT that contains frequency information.
-        Default is 'freq'.
-    :param freq_index: Which index of frequency struct to use. Default is 0.
-    """
-    return expr_from_field(t_or_expr, freq_field)[freq_index].AF < max_af
-
-
 def get_summary_stats_variant_filter_expr(
     t: Union[hl.Table, hl.MatrixTable],
     filter_lcr: bool = False,
     filter_expr: hl.expr.SetExpression = None,
     freq_expr: hl.expr.SetExpression = None,
-    max_af: Optional[float] = None,
+    max_af: Optional[Union[float, List[float]]] = None,
     min_an_proportion: Optional[float] = None,
     collapse_filters: bool = False,
 ) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]:
@@ -442,7 +372,8 @@ def get_summary_stats_variant_filter_expr(
     :param filter_lcr: Whether to filter out low confidence regions. Default is False.
     :param filter_expr: SetExpression containing variant filters. Default is None.
     :param freq_expr: SetExpression containing frequency information. Default is None.
-    :param max_af: Maximum allele frequency cutoff. Default is None.
+    :param max_af: Maximum allele frequency cutoff(s). Can be a single float or a list
+        of floats. Default is None.
     :param min_an_proportion: Minimum allele number proportion (used as a proxy for
         call rate). Default is None.
     :param collapse_filters: Whether to collapse all filters into a single expression.
@@ -450,30 +381,33 @@ def get_summary_stats_variant_filter_expr(
     :return: BooleanExpression or Dict of BooleanExpressions for filtering variants.
     """
     if max_af is not None and freq_expr is None:
-        raise ValueError("")
+        raise ValueError("Frequency expression must be provided when filtering by AF!")
 
+    log_list = []
     ss_filter_expr = {"all_variants": True}
     if filter_lcr:
-        logger.info("Filtering out low confidence regions...")
+        log_list.append("variants in low confidence regions")
         ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False)
     if filter_expr is not None:
-        logger.info(
-            "Adding filter expression for variants that pass all variant QC filters..."
-        )
-        ss_filter_expr["pass_filters"] = pass_filter_expr(filter_expr)
+        log_list.append("variants that pass all variant QC filters")
+        ss_filter_expr["pass_filters"] = hl.len(filter_expr) == 0
     if max_af is not None:
-        logger.info("Filtering to variants with (AF < %.2f)...", max_af)
-        ss_filter_expr[f"max_{max_af}"] = max_af_filter_expr(freq_expr, max_af)
+        if isinstance(max_af, float):
+            max_af = [max_af]
+        for af in max_af:
+            log_list.append(f"variants with (AF < {af:.2e})")
+            ss_filter_expr[f"max_{af}"] = freq_expr < af
     if min_an_proportion is not None:
-        logger.info(
-            "Using AN (as a call rate proxy) to filter to variants that meet a minimum"
-            " call rate of %.2f...",
-            min_an_proportion,
+        log_list.append(
+            "variants that meet a minimum call rate of %.2f (using AN as a call rate "
+            "proxy)" % min_an_proportion,
         )
         ss_filter_expr[f"max_{max_af}"] = get_an_criteria(
             t, an_proportion_cutoff=min_an_proportion
         )
 
+    logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list))
+
     if collapse_filters:
         if len(ss_filter_expr) == 1:
             logger.warning("No filtering applied to variants for summary stats.")
@@ -548,26 +482,18 @@ def _create_filter_by_csq(
             ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set)
 
     # Add filtering expressions for LoF consequence labels.
-    lof_labels = (
-        {
-            f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False)
-            for lof_label in lof_label_set
-        }
-        if lof_label_set
-        else {}
-    )
+    lof_labels = {
+        f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False)
+        for lof_label in lof_label_set or []
+    }
 
     # Add filtering expressions for LoF consequence flags.
-    lof_flags = (
-        {
-            f"lof_flag_{lof_flag}": hl.or_else(
-                t.lof_flags.split(",").contains(lof_flag), False
-            )
-            for lof_flag in lof_flag_set
-        }
-        if lof_flag_set
-        else {}
-    )
+    lof_flags = {
+        f"lof_flag_{lof_flag}": hl.or_else(
+            t.lof_flags.split(",").contains(lof_flag), False
+        )
+        for lof_flag in lof_flag_set or []
+    }
 
     # Add filtering expressions for HC LoF variants with no flags or any flags.
     lof_hc_flags = {}
@@ -592,15 +518,13 @@ def _create_filter_by_csq(
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
         lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set}
-        lof_combo = {
-            {
+        for v, v_e in lof_csq.items():
+            lof_combo = {
                 **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
-                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels if l != "HC"},
+                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"},
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()},
             }
-            for v, v_e in lof_csq.items()
-        }
-        ss_filter_expr.update(lof_combo)
+            ss_filter_expr.update(lof_combo)
 
     if not ss_filter_expr:
         logger.warning("No filtering applied to consequences for summary stats.")
diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py
index 33a999db4..9c16a189f 100644
--- a/gnomad/utils/annotations.py
+++ b/gnomad/utils/annotations.py
@@ -108,7 +108,9 @@ def annotate_with_ht(
         present in `annotation_ht`. Default is False.
     :return: Annotated MatrixTable.
     """
-    annotation_ht = annotation_ht.select(*fields)
+    if fields is not None:
+        annotation_ht = annotation_ht.select(*fields)
+
     if filter_missing:
         logger.info("Filtering input to variants in the supplied annotation HT...")
 

From 7dcb9e31e3739dfc6052d51e915774706090f2da Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 13:59:57 -0600
Subject: [PATCH 05/26] Remove unused imports

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 57eba3bb6..6d9b2b6a0 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -2,7 +2,7 @@
 import functools
 import logging
 import operator
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Dict, List, Optional, Set, Union
 
 import hail as hl
 

From 5071ffad13b61fdd4580c316206a9e643015f89c Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 14:10:17 -0600
Subject: [PATCH 06/26] Small name change

---
 gnomad/assessment/summary_stats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 6d9b2b6a0..e050bfbce 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -396,13 +396,13 @@ def get_summary_stats_variant_filter_expr(
             max_af = [max_af]
         for af in max_af:
             log_list.append(f"variants with (AF < {af:.2e})")
-            ss_filter_expr[f"max_{af}"] = freq_expr < af
+            ss_filter_expr[f"max_af_{af}"] = freq_expr < af
     if min_an_proportion is not None:
         log_list.append(
             "variants that meet a minimum call rate of %.2f (using AN as a call rate "
             "proxy)" % min_an_proportion,
         )
-        ss_filter_expr[f"max_{max_af}"] = get_an_criteria(
+        ss_filter_expr[f"min_an_{max_af}"] = get_an_criteria(
             t, an_proportion_cutoff=min_an_proportion
         )
 

From e6aa0d9e8412ed823e895d65a75b52ea8dd1eb40 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 14:13:46 -0600
Subject: [PATCH 07/26] remove "HC" from name

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index e050bfbce..a6d8bcee1 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -520,7 +520,7 @@ def _create_filter_by_csq(
         lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
-                **{f"{v}_HC_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
+                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"},
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()},
             }

From aa070c2b030b827da6e75cc383c984f6bbcbd79e Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 9 May 2024 14:17:02 -0600
Subject: [PATCH 08/26] remove "lof_HC" from combo

---
 gnomad/assessment/summary_stats.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index a6d8bcee1..a5a996fd6 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -517,11 +517,12 @@ def _create_filter_by_csq(
 
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
+        lof_labels.pop("lof_HC")
         lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
-                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items() if l != "HC"},
+                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()},
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()},
             }
             ss_filter_expr.update(lof_combo)

From 76fa6d353d85ab647adfddfa29d8564302984163 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 10 May 2024 07:55:05 -0600
Subject: [PATCH 09/26] A little more clean-up

---
 gnomad/assessment/summary_stats.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index a5a996fd6..773ac9b0a 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -476,10 +476,10 @@ def _create_filter_by_csq(
         return csq_set.contains(t.most_severe_csq)
 
     # Create filtering expressions for each consequence set.
-    ss_filter_expr = {}
-    for filter_name, csq_set in csq_filters.items():
-        if csq_set is not None:
-            ss_filter_expr[filter_name] = _create_filter_by_csq(t, csq_set)
+    ss_filter_expr = {
+        filter_name: _create_filter_by_csq(t, csq_set)
+        for filter_name, csq_set in csq_filters.items()
+    }
 
     # Add filtering expressions for LoF consequence labels.
     lof_labels = {

From 7a9704556c352e053f54eb67abb50d6afb37681f Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 10 May 2024 08:09:42 -0600
Subject: [PATCH 10/26] fix issues introduced in last commit

---
 gnomad/assessment/summary_stats.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 773ac9b0a..370d86c1a 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -449,15 +449,12 @@ def get_summary_stats_csq_filter_expr(
     :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences.
     """
     # Set up filters for specific consequences or sets of consequences.
-    csq_filters = {"lof": lof_csq_set}
-    if additional_csq_sets is not None:
-        csq_filters.update(additional_csq_sets)
-
-    if lof_loftee_combinations and lof_csq_set is not None:
-        additional_csqs = set(lof_csq_set) | set(additional_csqs or [])
-
-    if additional_csqs is not None:
-        csq_filters.update({c: {c} for c in additional_csqs})
+    csq_filters = {
+        **({"csq_set_lof": lof_csq_set or {}}),
+        **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}),
+        **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}),
+        **({f"csq_{c}": {c} for c in additional_csqs or []}),
+    }
 
     def _create_filter_by_csq(
         t: Union[hl.Table, hl.MatrixTable],
@@ -518,7 +515,7 @@ def _create_filter_by_csq(
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
         lof_labels.pop("lof_HC")
-        lof_csq = {lof_var: ss_filter_expr[lof_var] for lof_var in lof_csq_set}
+        lof_csq = {v: ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},

From 5ceac9704277dd7dbba34752b3c8f1bf39150c0e Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 10 May 2024 08:17:46 -0600
Subject: [PATCH 11/26] forgot "lof_csq_" in combos

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 370d86c1a..447053c04 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -515,7 +515,7 @@ def _create_filter_by_csq(
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
         lof_labels.pop("lof_HC")
-        lof_csq = {v: ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set}
+        lof_csq = {f"lof_csq_{v}": ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},

From bc04ca53550c591c7ccedff2ef3a5dccc3b92b69 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 10 May 2024 17:31:34 -0600
Subject: [PATCH 12/26] use variant_qc_pass

---
 gnomad/assessment/summary_stats.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 447053c04..467420832 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -384,13 +384,13 @@ def get_summary_stats_variant_filter_expr(
         raise ValueError("Frequency expression must be provided when filtering by AF!")
 
     log_list = []
-    ss_filter_expr = {"all_variants": True}
+    ss_filter_expr = {}
     if filter_lcr:
         log_list.append("variants in low confidence regions")
         ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False)
     if filter_expr is not None:
         log_list.append("variants that pass all variant QC filters")
-        ss_filter_expr["pass_filters"] = hl.len(filter_expr) == 0
+        ss_filter_expr["variant_qc_pass"] = hl.len(filter_expr) == 0
     if max_af is not None:
         if isinstance(max_af, float):
             max_af = [max_af]
@@ -409,7 +409,7 @@ def get_summary_stats_variant_filter_expr(
     logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list))
 
     if collapse_filters:
-        if len(ss_filter_expr) == 1:
+        if len(ss_filter_expr) == 0:
             logger.warning("No filtering applied to variants for summary stats.")
         ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values())
 

From 0e7e297a197a95522651fe0546f8064b525167bf Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 13 May 2024 10:41:29 -0600
Subject: [PATCH 13/26] Apply suggestions from code review

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
---
 gnomad/utils/annotations.py |  4 ++--
 gnomad/utils/filtering.py   | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py
index 9c16a189f..efa583e71 100644
--- a/gnomad/utils/annotations.py
+++ b/gnomad/utils/annotations.py
@@ -96,7 +96,7 @@ def annotate_with_ht(
     fields: Optional[List] = None,
     annotate_cols: bool = False,
     filter_missing: bool = False,
-) -> hl.MatrixTable:
+) -> Union[hl.MatrixTable, hl.Table]:
     """
     Annotate a MatrixTable/Table with additional annotations from another Table.
 
@@ -106,7 +106,7 @@ def annotate_with_ht(
     :param annotate_cols: If True, annotate columns instead of rows. Default is False.
     :param filter_missing: If True, filter out missing rows/cols in `t` that are not
         present in `annotation_ht`. Default is False.
-    :return: Annotated MatrixTable.
+    :return: Annotated MatrixTable/Table.
     """
     if fields is not None:
         annotation_ht = annotation_ht.select(*fields)
diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py
index 0bcc2ad96..8b2aea991 100644
--- a/gnomad/utils/filtering.py
+++ b/gnomad/utils/filtering.py
@@ -147,7 +147,7 @@ def low_conf_regions_expr(
     :param filter_exome_low_coverage_regions: Whether to filter exome low confidence regions
     :param filter_telomeres_and_centromeres: Whether to filter telomeres and centromeres
     :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
-    :return: MatrixTable or Table with low confidence regions removed
+    :return: Bool expression of whether loci are not low confidence (TRUE) or low confidence (FALSE)
     """
     build = get_reference_genome(locus_expr).name
     if build == "GRCh37":
@@ -196,23 +196,23 @@ def low_conf_regions_expr(
 
 
 def filter_low_conf_regions(
-    mt: Union[hl.MatrixTable, hl.Table],
+    t: Union[hl.MatrixTable, hl.Table],
     **kwargs,
 ) -> Union[hl.MatrixTable, hl.Table]:
     """
     Filter low-confidence regions.
 
-    :param mt: MatrixTable or Table to filter.
+    :param t: MatrixTable or Table to filter.
     :param kwargs: Keyword arguments to pass to `low_conf_regions_expr`.
     :return: MatrixTable or Table with low confidence regions removed.
     """
-    filter_criteria = low_conf_regions_expr(mt.locus, **kwargs)
-    if isinstance(mt, hl.MatrixTable):
-        mt = mt.filter_rows(filter_criteria)
+    filter_criteria = low_conf_regions_expr(t.locus, **kwargs)
+    if isinstance(t, hl.MatrixTable):
+        t = t.filter_rows(filter_criteria)
     else:
-        mt = mt.filter(filter_criteria)
+        t = t.filter(filter_criteria)
 
-    return mt
+    return t
 
 
 def filter_to_autosomes(

From 1320b20d40a0d944f4c844c1c96a45d7c9175fb9 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 13 May 2024 15:32:20 -0600
Subject: [PATCH 14/26] Apply suggestions from code review

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 467420832..8ae078c2a 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -402,7 +402,7 @@ def get_summary_stats_variant_filter_expr(
             "variants that meet a minimum call rate of %.2f (using AN as a call rate "
             "proxy)" % min_an_proportion,
         )
-        ss_filter_expr[f"min_an_{max_af}"] = get_an_criteria(
+        ss_filter_expr[f"min_an_{min_an_proportion}"] = get_an_criteria(
             t, an_proportion_cutoff=min_an_proportion
         )
 

From 6b6048084baa9395a4d8e4f57d17a99ab394800a Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:02:00 -0600
Subject: [PATCH 15/26] Remove `collapse_filters` and `lof_flag_set` to
 simplify PR for current needs

---
 gnomad/assessment/summary_stats.py | 62 +++++++++---------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 467420832..6e2852a6c 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -363,8 +363,7 @@ def get_summary_stats_variant_filter_expr(
     freq_expr: hl.expr.SetExpression = None,
     max_af: Optional[Union[float, List[float]]] = None,
     min_an_proportion: Optional[float] = None,
-    collapse_filters: bool = False,
-) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]:
+) -> Dict[str, hl.expr.BooleanExpression]:
     """
     Generate variant filtering expression for summary stats.
 
@@ -378,7 +377,7 @@ def get_summary_stats_variant_filter_expr(
         call rate). Default is None.
     :param collapse_filters: Whether to collapse all filters into a single expression.
         Default is False.
-    :return: BooleanExpression or Dict of BooleanExpressions for filtering variants.
+    :return: Dict of BooleanExpressions for filtering variants.
     """
     if max_af is not None and freq_expr is None:
         raise ValueError("Frequency expression must be provided when filtering by AF!")
@@ -408,11 +407,6 @@ def get_summary_stats_variant_filter_expr(
 
     logger.info("Adding filtering for:\n\t%s...", "\n\t".join(log_list))
 
-    if collapse_filters:
-        if len(ss_filter_expr) == 0:
-            logger.warning("No filtering applied to variants for summary stats.")
-        ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values())
-
     return ss_filter_expr
 
 
@@ -420,21 +414,18 @@ def get_summary_stats_csq_filter_expr(
     t: Union[hl.Table, hl.MatrixTable, hl.StructExpression],
     lof_csq_set: Optional[Set[str]] = None,
     lof_label_set: Optional[Set[str]] = None,
-    lof_flag_set: Optional[Set[str]] = None,
     lof_no_flags: bool = False,
     lof_any_flags: bool = False,
     lof_loftee_combinations: bool = False,
     additional_csq_sets: Optional[Dict[str, Set[str]]] = None,
     additional_csqs: Optional[Set[str]] = None,
-    collapse_filters: bool = False,
-) -> Union[hl.expr.BooleanExpression, Dict[str, hl.expr.BooleanExpression]]:
+) -> Dict[str, hl.expr.BooleanExpression]:
     """
     Generate consequence filtering expression for summary stats.
 
     :param t: Input Table/MatrixTable/StructExpression.
     :param lof_csq_set: Set of LoF consequence strings. Default is None.
     :param lof_label_set: Set of LoF consequence labels. Default is None.
-    :param lof_flag_set: Set of LoF consequence flags. Default is None.
     :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is
         False.
     :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is
@@ -444,16 +435,26 @@ def get_summary_stats_csq_filter_expr(
     :param additional_csq_sets: Dictionary containing additional consequence sets.
         Default is None.
     :param additional_csqs: Set of additional consequences to keep. Default is None.
-    :param collapse_filters: Whether to collapse all filters into a single expression.
-        Default is False.
-    :return: BooleanExpression or Dict of BooleanExpressions for filtering consequences.
+    :return: Dict of BooleanExpressions for filtering consequences.
     """
+    if (lof_no_flags or lof_any_flags) and "no_lof_flags" not in t.row:
+        raise ValueError(
+            "The required 'no_lof_flags' annotation is not found in input "
+            "Table/MatrixTable/StructExpression."
+        )
+    if (lof_label_set or lof_no_flags or lof_any_flags) and "lof" not in t.row:
+        raise ValueError(
+            "The required 'lof' annotation is not found in input "
+            "Table/MatrixTable/StructExpression."
+        )
+
     # Set up filters for specific consequences or sets of consequences.
     csq_filters = {
         **({"csq_set_lof": lof_csq_set or {}}),
         **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}),
         **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}),
         **({f"csq_{c}": {c} for c in additional_csqs or []}),
+        **({f"csq_{c}": {c} for c in additional_csqs or []}),
     }
 
     def _create_filter_by_csq(
@@ -484,33 +485,17 @@ def _create_filter_by_csq(
         for lof_label in lof_label_set or []
     }
 
-    # Add filtering expressions for LoF consequence flags.
-    lof_flags = {
-        f"lof_flag_{lof_flag}": hl.or_else(
-            t.lof_flags.split(",").contains(lof_flag), False
-        )
-        for lof_flag in lof_flag_set or []
-    }
-
     # Add filtering expressions for HC LoF variants with no flags or any flags.
     lof_hc_flags = {}
     if lof_no_flags or lof_any_flags:
         hc_expr = hl.or_else(t.lof == "HC", False)
-        if "no_lof_flags" in t.row:
-            no_lof_flags_expr = t.no_lof_flags
-        elif "lof_flags" in t.row:
-            no_lof_flags_expr = hl.is_missing(t.lof_flags) | (t.lof_flags == "")
-        else:
-            raise ValueError(
-                "No LoF flag info found in input Table/MatrixTable/StructExpression."
-            )
         if lof_no_flags:
-            lof_hc_flags["lof_HC_no_flags"] = hc_expr & no_lof_flags_expr
+            lof_hc_flags["lof_HC_no_flags"] = hc_expr & t.no_lof_flags
         if lof_any_flags:
-            lof_flags["lof_HC_with_flags"] = hc_expr & ~no_lof_flags_expr
+            lof_hc_flags["lof_HC_with_flags"] = hc_expr & ~t.no_lof_flags
 
     # Update summary stats filter expressions with LoF labels and flags.
-    ss_filter_expr.update({**lof_labels, **lof_flags, **lof_hc_flags})
+    ss_filter_expr.update({**lof_labels, **lof_hc_flags})
 
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
@@ -520,18 +505,9 @@ def _create_filter_by_csq(
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()},
-                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_flags.items()},
             }
             ss_filter_expr.update(lof_combo)
 
-    if not ss_filter_expr:
-        logger.warning("No filtering applied to consequences for summary stats.")
-        return True if collapse_filters else ss_filter_expr
-
-    # Collapse all filters into a single expression if requested.
-    if collapse_filters:
-        ss_filter_expr = functools.reduce(operator.iand, ss_filter_expr.values())
-
     return ss_filter_expr
 
 

From 9195c5b44e74dada553bd046c4146ed5156e5848 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:05:00 -0600
Subject: [PATCH 16/26] Update docstrings

---
 gnomad/assessment/summary_stats.py | 6 +++---
 gnomad/utils/filtering.py          | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 6e2852a6c..5dabc015e 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -426,10 +426,10 @@ def get_summary_stats_csq_filter_expr(
     :param t: Input Table/MatrixTable/StructExpression.
     :param lof_csq_set: Set of LoF consequence strings. Default is None.
     :param lof_label_set: Set of LoF consequence labels. Default is None.
-    :param lof_no_flags: Whether to filter to LoF variants with no flags. Default is
-        False.
-    :param lof_any_flags: Whether to filter to LoF variants with any flags. Default is
+    :param lof_no_flags: Whether to filter to HC LoF variants with no flags. Default is
         False.
+    :param lof_any_flags: Whether to filter to HC LoF variants with any flags. Default
+        is False.
     :param lof_loftee_combinations: Whether to add combinations of LOFTEE and
         consequence type filters. Default is False.
     :param additional_csq_sets: Dictionary containing additional consequence sets.
diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py
index 0bcc2ad96..9727bfa18 100644
--- a/gnomad/utils/filtering.py
+++ b/gnomad/utils/filtering.py
@@ -138,7 +138,7 @@ def low_conf_regions_expr(
     high_conf_regions: Optional[List[str]] = None,
 ) -> hl.expr.BooleanExpression:
     """
-    Filter low-confidence regions.
+    Create an expression to filter low confidence regions.
 
     :param locus_expr: Locus expression to use for filtering.
     :param filter_lcr: Whether to filter LCR regions

From 595f97a913a18a40ee5b6baebdc969a643259ca1 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:05:51 -0600
Subject: [PATCH 17/26] Update gnomad/assessment/summary_stats.py

Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com>
---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index c0f2997a2..aa50fd0d7 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -385,7 +385,7 @@ def get_summary_stats_variant_filter_expr(
     log_list = []
     ss_filter_expr = {}
     if filter_lcr:
-        log_list.append("variants in low confidence regions")
+        log_list.append("variants not in low confidence regions")
         ss_filter_expr["no_lcr"] = low_conf_regions_expr(t.locus, filter_decoy=False)
     if filter_expr is not None:
         log_list.append("variants that pass all variant QC filters")

From 524dd5505c7f7540f1b155eef15694f6671d2225 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:09:27 -0600
Subject: [PATCH 18/26] remove unused imports

---
 gnomad/assessment/summary_stats.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index aa50fd0d7..9f4986aaa 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -1,7 +1,5 @@
 # noqa: D100
-import functools
 import logging
-import operator
 from typing import Dict, List, Optional, Set, Union
 
 import hail as hl

From 52ff64ff5854049a967032c6bc8fcc4b9c9dfdb3 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:11:01 -0600
Subject: [PATCH 19/26] Update gnomad/assessment/summary_stats.py

Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com>
---
 gnomad/assessment/summary_stats.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 9f4986aaa..862ae3f7b 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -448,7 +448,12 @@ def get_summary_stats_csq_filter_expr(
 
     # Set up filters for specific consequences or sets of consequences.
     csq_filters = {
-        **({"csq_set_lof": lof_csq_set or {}}),
+    csq_filters = {
+        **({"lof": lof_csq_set or {}}),
+        **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}),
+        **({f"{c}": {c} for c in lof_csq_set or []}),
+        **({f"{c}": {c} for c in additional_csqs or []}),
+    }
         **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}),
         **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}),
         **({f"csq_{c}": {c} for c in additional_csqs or []}),

From 652b3840c642ba4ccff555049ccb002c965abc74 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 14:12:43 -0600
Subject: [PATCH 20/26] Fix reviewer's suggestion

---
 gnomad/assessment/summary_stats.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 862ae3f7b..1471c09d5 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -447,18 +447,12 @@ def get_summary_stats_csq_filter_expr(
         )
 
     # Set up filters for specific consequences or sets of consequences.
-    csq_filters = {
     csq_filters = {
         **({"lof": lof_csq_set or {}}),
         **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}),
         **({f"{c}": {c} for c in lof_csq_set or []}),
         **({f"{c}": {c} for c in additional_csqs or []}),
     }
-        **({f"csq_set_{l}": c for l, c in (additional_csq_sets or {}).items()}),
-        **({f"lof_csq_{c}": {c} for c in lof_csq_set or []}),
-        **({f"csq_{c}": {c} for c in additional_csqs or []}),
-        **({f"csq_{c}": {c} for c in additional_csqs or []}),
-    }
 
     def _create_filter_by_csq(
         t: Union[hl.Table, hl.MatrixTable],

From f432b59713514d280698310c55f0f08f3fd91c20 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 15:04:43 -0600
Subject: [PATCH 21/26] Fix reviewer's suggestion

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 1471c09d5..273df9aef 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -497,7 +497,7 @@ def _create_filter_by_csq(
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
         lof_labels.pop("lof_HC")
-        lof_csq = {f"lof_csq_{v}": ss_filter_expr[f"lof_csq_{v}"] for v in lof_csq_set}
+        lof_csq = {k: v for k, v in ss_filter_expr.items() if v in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},

From c314780f58dce4de8e5de39a16bd0b3af79b7b7e Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Mon, 20 May 2024 17:37:36 -0600
Subject: [PATCH 22/26] check for existence of key instead of value

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 273df9aef..353b6d3b6 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -497,7 +497,7 @@ def _create_filter_by_csq(
     # Add expressions for LOFTEE and consequence type combinations.
     if lof_loftee_combinations:
         lof_labels.pop("lof_HC")
-        lof_csq = {k: v for k, v in ss_filter_expr.items() if v in lof_csq_set}
+        lof_csq = {k: v for k, v in ss_filter_expr.items() if k in lof_csq_set}
         for v, v_e in lof_csq.items():
             lof_combo = {
                 **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},

From 228adc7f5bfe76d7731b61ebb740097154bb7c78 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Tue, 21 May 2024 23:28:04 -0600
Subject: [PATCH 23/26] Remove lof combinations from summary stats filter
 expressions and update some expressions to be structs of str: boolean

---
 gnomad/assessment/summary_stats.py | 129 +++++++++++++++++------------
 1 file changed, 75 insertions(+), 54 deletions(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 353b6d3b6..b77065363 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -360,11 +360,20 @@ def get_summary_stats_variant_filter_expr(
     filter_expr: hl.expr.SetExpression = None,
     freq_expr: hl.expr.SetExpression = None,
     max_af: Optional[Union[float, List[float]]] = None,
-    min_an_proportion: Optional[float] = None,
-) -> Dict[str, hl.expr.BooleanExpression]:
+    min_an_proportion: Optional[Union[float, List[float]]] = None,
+) -> Dict[str, Union[hl.expr.BooleanExpression, hl.expr.StructExpression]]:
     """
     Generate variant filtering expression for summary stats.
 
+    The possible filtering groups are:
+
+        - 'no_lcr' if `filter_lcr` is True.
+        - 'variant_qc_pass' if `filter_expr` is provided.
+        - 'max_af' as a struct with a field for each `af` in `max_af` if `max_af` is
+          provided.
+        - 'min_an' as a struct with a field for each `an_proportion` in
+          `min_an_proportion` if `min_an_proportion` is provided.
+
     :param t: Input Table/MatrixTable.
     :param filter_lcr: Whether to filter out low confidence regions. Default is False.
     :param filter_expr: SetExpression containing variant filters. Default is None.
@@ -373,9 +382,7 @@ def get_summary_stats_variant_filter_expr(
         of floats. Default is None.
     :param min_an_proportion: Minimum allele number proportion (used as a proxy for
         call rate). Default is None.
-    :param collapse_filters: Whether to collapse all filters into a single expression.
-        Default is False.
-    :return: Dict of BooleanExpressions for filtering variants.
+    :return: Dict of BooleanExpressions or StructExpressions for filtering variants.
     """
     if max_af is not None and freq_expr is None:
         raise ValueError("Frequency expression must be provided when filtering by AF!")
@@ -391,9 +398,10 @@ def get_summary_stats_variant_filter_expr(
     if max_af is not None:
         if isinstance(max_af, float):
             max_af = [max_af]
-        for af in max_af:
-            log_list.append(f"variants with (AF < {af:.2e})")
-            ss_filter_expr[f"max_af_{af}"] = freq_expr < af
+        log_list.extend([f"variants with (AF < {af:.2e})" for af in max_af])
+        ss_filter_expr["max_af"] = hl.struct(
+            **{f"{af}": freq_expr < af for af in max_af}
+        )
     if min_an_proportion is not None:
         log_list.append(
             "variants that meet a minimum call rate of %.2f (using AN as a call rate "
@@ -414,46 +422,57 @@ def get_summary_stats_csq_filter_expr(
     lof_label_set: Optional[Set[str]] = None,
     lof_no_flags: bool = False,
     lof_any_flags: bool = False,
-    lof_loftee_combinations: bool = False,
     additional_csq_sets: Optional[Dict[str, Set[str]]] = None,
     additional_csqs: Optional[Set[str]] = None,
-) -> Dict[str, hl.expr.BooleanExpression]:
+) -> Dict[str, Union[hl.expr.BooleanExpression, hl.expr.StructExpression]]:
     """
     Generate consequence filtering expression for summary stats.
 
+    .. note::
+
+        - Assumes that input Table/MatrixTable/StructExpression contains the required
+          annotations for the requested filtering groups.
+            - 'lof' annotation for `lof_csq_set`.
+            - 'no_lof_flags' annotation for `lof_no_flags` and `lof_any_flags`.
+
+
+    The possible filtering groups are:
+
+        - 'lof' if `lof_csq_set` is provided.
+        - 'loftee_no_flags' if `lof_no_flags` is True.
+        - 'loftee_with_flags' if `lof_any_flags` is True.
+        - 'loftee_label' as a struct with a field for each `lof_label` in
+          `lof_label_set` if provided.
+        - 'csq' as a struct with a field for each consequence in `additional_csqs` and
+          `lof_csq_set` if provided.
+        - 'csq_set' as a struct with a field for each consequence set in
+          `additional_csq_sets` if provided. This will also have and `lof` field if
+          `lof_csq_set` is provided.
+
     :param t: Input Table/MatrixTable/StructExpression.
     :param lof_csq_set: Set of LoF consequence strings. Default is None.
     :param lof_label_set: Set of LoF consequence labels. Default is None.
-    :param lof_no_flags: Whether to filter to HC LoF variants with no flags. Default is
+    :param lof_no_flags: Whether to filter to variants with no flags. Default is
         False.
-    :param lof_any_flags: Whether to filter to HC LoF variants with any flags. Default
+    :param lof_any_flags: Whether to filter to variants with any flags. Default
         is False.
-    :param lof_loftee_combinations: Whether to add combinations of LOFTEE and
-        consequence type filters. Default is False.
     :param additional_csq_sets: Dictionary containing additional consequence sets.
         Default is None.
     :param additional_csqs: Set of additional consequences to keep. Default is None.
-    :return: Dict of BooleanExpressions for filtering consequences.
+    :return: Dict of BooleanExpressions or StructExpressions for filtering by
+        consequence.
     """
     if (lof_no_flags or lof_any_flags) and "no_lof_flags" not in t.row:
         raise ValueError(
             "The required 'no_lof_flags' annotation is not found in input "
             "Table/MatrixTable/StructExpression."
         )
-    if (lof_label_set or lof_no_flags or lof_any_flags) and "lof" not in t.row:
+    if lof_label_set and "lof" not in t.row:
         raise ValueError(
             "The required 'lof' annotation is not found in input "
             "Table/MatrixTable/StructExpression."
         )
 
-    # Set up filters for specific consequences or sets of consequences.
-    csq_filters = {
-        **({"lof": lof_csq_set or {}}),
-        **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}),
-        **({f"{c}": {c} for c in lof_csq_set or []}),
-        **({f"{c}": {c} for c in additional_csqs or []}),
-    }
-
     def _create_filter_by_csq(
         t: Union[hl.Table, hl.MatrixTable],
         csq_set: Union[Set, List, hl.expr.CollectionExpression],
@@ -470,40 +489,42 @@ def _create_filter_by_csq(
 
         return csq_set.contains(t.most_severe_csq)
 
-    # Create filtering expressions for each consequence set.
-    ss_filter_expr = {
-        filter_name: _create_filter_by_csq(t, csq_set)
-        for filter_name, csq_set in csq_filters.items()
+    # Set up filters for specific consequences or sets of consequences.
+    csq_filters = {
+        "csq": {
+            **({f"{c}": {c} for c in lof_csq_set or []}),
+            **({f"{c}": {c} for c in additional_csqs or []}),
+        },
+        "csq_set": {
+            **({"lof": lof_csq_set or {}}),
+            **({f"{l}": c for l, c in (additional_csq_sets or {}).items()}),
+        },
     }
 
-    # Add filtering expressions for LoF consequence labels.
-    lof_labels = {
-        f"lof_{lof_label}": hl.or_else(t.lof == lof_label, False)
-        for lof_label in lof_label_set or []
+    # Create filtering expressions for each consequence/ consequence set.
+    ss_filter_expr = {
+        group_name: hl.struct(
+            **{
+                filter_name: _create_filter_by_csq(t, csq_set)
+                for filter_name, csq_set in group.items()
+            }
+        )
+        for group_name, group in csq_filters.items()
     }
 
-    # Add filtering expressions for HC LoF variants with no flags or any flags.
-    lof_hc_flags = {}
-    if lof_no_flags or lof_any_flags:
-        hc_expr = hl.or_else(t.lof == "HC", False)
-        if lof_no_flags:
-            lof_hc_flags["lof_HC_no_flags"] = hc_expr & t.no_lof_flags
-        if lof_any_flags:
-            lof_hc_flags["lof_HC_with_flags"] = hc_expr & ~t.no_lof_flags
-
-    # Update summary stats filter expressions with LoF labels and flags.
-    ss_filter_expr.update({**lof_labels, **lof_hc_flags})
-
-    # Add expressions for LOFTEE and consequence type combinations.
-    if lof_loftee_combinations:
-        lof_labels.pop("lof_HC")
-        lof_csq = {k: v for k, v in ss_filter_expr.items() if k in lof_csq_set}
-        for v, v_e in lof_csq.items():
-            lof_combo = {
-                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_hc_flags.items()},
-                **{f"{v}_{l}": v_e & l_e for l, l_e in lof_labels.items()},
-            }
-            ss_filter_expr.update(lof_combo)
+    # Add filtering expressions for LOFTEE labels.
+    ss_filter_expr["loftee_label"] = hl.struct(
+        **{
+            lof_label: hl.or_else(t.lof == lof_label, False)
+            for lof_label in lof_label_set or []
+        }
+    )
+
+    # Add filtering expressions variants with no flags or any flags.
+    if lof_no_flags:
+        ss_filter_expr["loftee_no_flags"] = t.no_lof_flags
+    if lof_any_flags:
+        ss_filter_expr["loftee_with_flags"] = ~t.no_lof_flags
 
     return ss_filter_expr
 

From f88a502d19a0af85429f20edc816caf0c028483f Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Wed, 22 May 2024 09:04:11 -0600
Subject: [PATCH 24/26] Fix docstring

---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index b77065363..876210f48 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -432,10 +432,10 @@ def get_summary_stats_csq_filter_expr(
 
         - Assumes that input Table/MatrixTable/StructExpression contains the required
           annotations for the requested filtering groups.
+
             - 'lof' annotation for `lof_csq_set`.
             - 'no_lof_flags' annotation for `lof_no_flags` and `lof_any_flags`.
 
-
     The possible filtering groups are:
 
         - 'lof' if `lof_csq_set` is provided.

From 192404e6ca4ebfeca15273a5cac1cbf1754eb29a Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 24 May 2024 09:52:30 -0600
Subject: [PATCH 25/26] Update gnomad/assessment/summary_stats.py

Co-authored-by: Qin He <44242118+KoalaQin@users.noreply.github.com>
---
 gnomad/assessment/summary_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 876210f48..a3eb8ab36 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -446,7 +446,7 @@ def get_summary_stats_csq_filter_expr(
         - 'csq' as a struct with a field for each consequence in `additional_csqs` and
           `lof_csq_set` if provided.
         - 'csq_set' as a struct with a field for each consequence set in
-          `additional_csq_sets` if provided. This will also have and `lof` field if
+          `additional_csq_sets` if provided. This will also have an `lof` field if
           `lof_csq_set` is provided.
 
     :param t: Input Table/MatrixTable/StructExpression.

From ecf79f1017436a3ae1315b0df0ca98fd9c0f288b Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 24 May 2024 10:22:11 -0600
Subject: [PATCH 26/26] Add generate_filter_combinations and
 get_summary_stats_filter_group_meta

---
 gnomad/assessment/summary_stats.py | 242 +++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
index 876210f48..53dac5e1b 100644
--- a/gnomad/assessment/summary_stats.py
+++ b/gnomad/assessment/summary_stats.py
@@ -1,5 +1,7 @@
 # noqa: D100
+import itertools
 import logging
+from copy import deepcopy
 from typing import Dict, List, Optional, Set, Union
 
 import hail as hl
@@ -529,6 +531,246 @@ def _create_filter_by_csq(
     return ss_filter_expr
 
 
+def generate_filter_combinations(
+    combos: List[Union[List[str], Dict[str, List[str]]]],
+    combo_options: Optional[Dict[str, List[str]]] = None,
+) -> List[Dict[str, str]]:
+    """
+    Generate list of all possible filter combinations from a list of filter options.
+
+    Example input:
+
+    .. code-block:: python
+
+        [
+            {'pass_filters': [False, True]},
+            {'pass_filters': [False, True], 'capture': ['ukb', 'broad']}
+        ]
+
+    Example output:
+
+    .. code-block:: python
+
+        [
+            {'pass_filters': False},
+            {'pass_filters': True},
+            {'pass_filters': False, 'capture': 'ukb'},
+            {'pass_filters': False, 'capture': 'broad'},
+            {'pass_filters': True, 'capture': 'ukb'},
+            {'pass_filters': True, 'capture': 'broad'},
+        ]
+
+    :param combos: List of filter groups and their options.
+    :param combo_options: Dictionary of filter groups and their options that can be
+        supplied if `combos` is a list of lists.
+    :return: List of all possible filter combinations for each filter group.
+    """
+    if isinstance(combos[0], list):
+        if combo_options is None:
+            raise ValueError(
+                "If `combos` is a list of lists, `combo_options` must be provided."
+            )
+        combos = [{k: combo_options[k] for k in combo} for combo in combos]
+
+    # Create combinations of filter options.
+
+    def _expand_combinations(filter_dict: Dict[str, List[str]]) -> List[Dict[str, str]]:
+        """
+        Expand filter combinations.
+
+        :param filter_dict: Dictionary of filter options.
+        :return: List of dicts of expanded filter options.
+        """
+        keys, values = zip(*filter_dict.items())
+        return [dict(zip(keys, combo)) for combo in itertools.product(*values)]
+
+    # Flatten list of combinations.
+    expanded_meta = sum([_expand_combinations(sublist) for sublist in combos], [])
+
+    return expanded_meta
+
+
+def get_summary_stats_filter_group_meta(
+    all_sum_stat_filters: Dict[str, List[str]],
+    common_filter_combos: List[List[str]] = None,
+    common_filter_override: Dict[str, List[str]] = None,
+    lof_filter_combos: Optional[List[List[str]]] = None,
+    lof_filter_override: Dict[str, List[str]] = None,
+    filter_key_rename: Dict[str, str] = None,
+) -> List[Dict[str, str]]:
+    """
+    Generate list of filter group combination metadata for summary stats.
+
+    This function combines various filter settings for summary statistics and generates
+    all possible filter combinations. It ensures that the generated combinations include
+    both common filters and specific loss-of-function (LOF) filters.
+
+    .. note::
+
+        - The "variant_qc" filter group is removed if the value is "none", which can
+          lead to a filter group of {} (no filters).
+        - The `filter_key_rename` parameter can be used to rename keys in the
+          `all_sum_stat_filters`, `common_filter_override`, or `lof_filter_override`
+          after creating all combinations.
+
+    Example:
+        Given the following input:
+
+        .. code-block:: python
+
+            all_sum_stat_filters = {
+                "variant_qc": ["none", "pass"],
+                "capture": ["1", "2"],
+                "max_af": [0.01],
+                "lof_csq": ["stop_gained"],
+                "lof_csq_set": ["lof"],
+            }
+            common_filter_combos = [["variant_qc"], ["variant_qc", "capture"]]
+            common_filter_override = {"variant_qc": ["pass"], "capture": ["1"]}
+            lof_filter_combos = [
+                ["lof_csq_set", "loftee_HC"],
+                ["lof_csq_set", "loftee_HC", "loftee_flags"],
+                ["lof_csq", "loftee_HC", "loftee_flags"],
+            ]
+            lof_filter_override = {"loftee_HC": ["HC"], "loftee_flags": ["with_flags"]}
+            filter_key_rename = {
+                "lof_csq": "csq",
+                "loftee_HC": "loftee_labels",
+                "lof_csq_set": "csq_set",
+            }
+
+        The function will generate the following filter combinations:
+
+        .. code-block:: python
+
+            [
+               # Combinations of all common filter keys and their possible values.
+                {},
+                {'capture': '1'},
+                {'capture': '2'},
+                {'variant_qc': 'pass'},
+                {'variant_qc': 'pass', 'capture': '1'},
+                {'variant_qc': 'pass', 'capture': '2'},
+
+                # Combinations of all requested common filter combinations with all
+                # possible other filter keys and values.
+                {'variant_qc': 'pass', 'max_af': '0.01'},
+                {'variant_qc': 'pass', 'csq': 'stop_gained'},
+                {'variant_qc': 'pass', 'csq_set': 'lof'},
+                {'variant_qc': 'pass', 'capture': '1', 'max_af': '0.01'},
+                {'variant_qc': 'pass', 'capture': '1', 'csq': 'stop_gained'},
+                {'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof'},
+
+                # Combinations of all requested common filter combinations with all
+                # requested LOF filter combination keys and their requested values.
+                {'variant_qc': 'pass', 'csq_set': 'lof', 'loftee_labels': 'HC'},
+                {
+                    'variant_qc': 'pass', 'csq_set': 'lof', 'loftee_labels': 'HC',
+                    'loftee_flags': 'with_flags'
+                },
+                {
+                    'variant_qc': 'pass', 'csq': 'stop_gained', 'loftee_labels': 'HC',
+                    'loftee_flags': 'with_flags'
+                },
+                {
+                    'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof',
+                    'loftee_labels': 'HC'
+                },
+                {
+                    'variant_qc': 'pass', 'capture': '1', 'csq_set': 'lof',
+                    'loftee_labels': 'HC', 'loftee_flags': 'with_flags'
+                },
+                {
+                    'variant_qc': 'pass', 'capture': '1', 'csq': 'stop_gained',
+                    'loftee_labels': 'HC', 'loftee_flags': 'with_flags'
+                }
+            ]
+
+    :param all_sum_stat_filters: Dictionary of all possible filter types.
+    :param common_filter_combos: Optional list of lists of common filter keys to use
+        for creating common filter combinations.
+    :param common_filter_override: Optional dictionary of filter groups and their
+        options to override the values in `all_sum_stat_filters` for use with values in
+        `common_filter_combos`. This is only used if `common_filter_combos` is not None.
+    :param lof_filter_combos: Optional List of loss-of-function keys in all_sum_stat_filters
+        to use for creating filter combinations.
+    :param lof_filter_override: Optional Dictionary of filter groups and their options
+        to override the values in `all_sum_stat_filters` for use with values in
+        `lof_combos`. This is only used if `lof_filter_combos` is not None.
+    :param filter_key_rename: Optional dictionary to rename keys in
+        `all_sum_stat_filters`, `common_filter_override`, or `lof_filter_override` to
+        final metadata keys.
+    :return: Dictionary of filter field to metadata.
+    """
+    if common_filter_override is not None and common_filter_combos is None:
+        raise ValueError(
+            "If `common_combo_override` is provided, `common_filter_combos` must be "
+            "provided."
+        )
+    if lof_filter_override is not None and lof_filter_combos is None:
+        raise ValueError(
+            "If `lof_filter_override` is provided, `lof_filter_combos` must be "
+            "provided."
+        )
+
+    # Initialize dictionaries and lists to handle cases where the optional parameters
+    # are not provided.
+    common_filter_override = common_filter_override or {}
+    lof_filter_override = lof_filter_override or {}
+    filter_key_rename = filter_key_rename or {}
+
+    # Generate all possible filter combinations for common filter combinations.
+    all_sum_stat_filters = deepcopy(all_sum_stat_filters)
+    filter_combinations = []
+    if common_filter_combos is not None:
+        filter_combinations.extend(
+            generate_filter_combinations(
+                [
+                    {f: all_sum_stat_filters[f] for f in combo}
+                    for combo in common_filter_combos
+                ]
+            )
+        )
+    common_filter_combos = common_filter_combos or [[]]
+
+    # Update the all_sum_stat_filters dictionary with common_filter_override values and
+    # remove all filters in common_filter_combos from all_sum_stat_filters and
+    # into a common_filters dictionary.
+    all_sum_stat_filters.update(common_filter_override)
+    common_filters = {
+        k: all_sum_stat_filters.pop(k) for k in set(sum(common_filter_combos, []))
+    }
+
+    # Add combinations of common filters with all other filters.
+    filter_combinations.extend(
+        generate_filter_combinations(
+            [c + [f] for c in common_filter_combos for f in all_sum_stat_filters],
+            {**all_sum_stat_filters, **common_filters},
+        )
+    )
+
+    if lof_filter_combos is not None:
+        # Add combinations of common filters with LOF specific filters.
+        all_sum_stat_filters.update(lof_filter_override)
+        filter_combinations.extend(
+            generate_filter_combinations(
+                [c + f for c in common_filter_combos for f in lof_filter_combos],
+                {**all_sum_stat_filters, **common_filters},
+            )
+        )
+
+    filter_combinations = [
+        {
+            filter_key_rename.get(str(k), str(k)): str(v)
+            for k, v in filter_group.items()
+            if not (k == "variant_qc" and v == "none")
+        }
+        for filter_group in filter_combinations
+    ]
+
+    return filter_combinations
+
+
 def default_generate_gene_lof_matrix(
     mt: hl.MatrixTable,
     tx_ht: Optional[hl.Table],