Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated liftover functions to be more generic #246

Merged
merged 5 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* Removed assumption of `snv` annotation from `compute_quantile_bin`. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
* Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241)
* Updated liftover functions to be more generic [(#246)](https://github.com/broadinstitute/gnomad_methods/pull/246)

## Version 0.4.0 - July 9th, 2020

Expand Down
247 changes: 101 additions & 146 deletions gnomad/utils/liftover.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import logging
from os.path import basename, dirname
from typing import Union
from typing import Tuple, Union

import hail as hl
from gnomad.utils.reference_genome import get_reference_genome

from gnomad.utils.reference_genome import add_reference_sequence, get_reference_genome

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
Expand All @@ -13,193 +13,148 @@
logger.setLevel(logging.INFO)


def get_checkpoint_path(gnomad: bool, data_type: str, path: str, is_table: bool) -> str:
"""
Creates a checkpoint path for Table

:param gnomad: Whether data is gnomAD data
:param data_type: Data type (exomes or genomes for gnomAD; not used otherwise)
:param path: Path to input Table/MatrixTable (if data is not gnomAD data)
:param is_table: Whether data is a Table
:return: Output checkpoint path
"""
GRCH37_to_GRCH38_CHAIN = "gs://hail-common/references/grch37_to_grch38.over.chain.gz"
"""
Path to chain file required to lift data from GRCh37 to GRCh38.
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
"""

if gnomad:
return f"gs://gnomad/liftover/release/2.1.1/ht/{data_type}/gnomad.{data_type}.r2.1.1.liftover.ht"
else:
out_name = basename(path).split(".")[0]
out = f"{dirname(path)}/{out_name}_lift"
return f"{out}.ht" if is_table else f"{out}.mt"
GRCH38_TO_GRCH37_CHAIN = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
"""
Path to chain file required to lift data from GRCh38 to GRCh37.
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
"""


def get_liftover_genome(t: Union[hl.MatrixTable, hl.Table]) -> list:
def get_liftover_genome(
t: Union[hl.MatrixTable, hl.Table]
) -> Tuple[hl.genetics.ReferenceGenome, hl.genetics.ReferenceGenome]:
"""
Infers genome build of input data and assumes destination build. Prepares to liftover to destination genome build
Infers reference genome build of input data and assumes destination reference genome build.

Adds liftover chain to source reference genome and sequence to destination reference genome.
Returns tuple containing both reference genomes in preparation for liftover.

:param t: Input Table or MatrixTable
:return: List of source build (with liftover chain added) and destination build (with sequence loaded)
:param t: Input Table or MatrixTable.
:return: Tuple of source reference genome (with liftover chain added)
and destination reference genome (with sequence loaded)
"""

logger.info("Inferring build of input")
build = get_reference_genome(t.locus).name
logger.info("Inferring reference genome of input...")
input_build = get_reference_genome(t.locus).name
source = hl.get_reference(input_build)

logger.info(
"Loading reference genomes, adding chain file, and loading fasta sequence for destination build"
)
if build == "GRCh38":
source = hl.get_reference("GRCh38")
logger.info("Loading fasta sequence for destination build...")
if input_build == "GRCh38":
target = hl.get_reference("GRCh37")
chain = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
target.add_sequence(
"gs://hail-common/references/human_g1k_v37.fasta.gz",
"gs://hail-common/references/human_g1k_v37.fasta.fai",
)
chain = GRCH38_TO_GRCH37_CHAIN

else:
source = hl.get_reference("GRCh37")
target = hl.get_reference("GRCh38")
chain = "gs://hail-common/references/grch37_to_grch38.over.chain.gz"
target.add_sequence(
"gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz",
"gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai",
chain = GRCH37_to_GRCH38_CHAIN

logger.info("Adding liftover chain to input build...")
if source.has_liftover():
logger.warning(
f"Source reference build {source.name} already has a chain file!\
Using whichever chain has already been added."
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
)
else:
source.add_liftover(chain, target)

return (source, add_reference_sequence(target))
ch-kr marked this conversation as resolved.
Show resolved Hide resolved


def liftover_expr(
locus: hl.expr.LocusExpression,
alleles: hl.expr.ArrayExpression,
destination_reference: hl.ReferenceGenome,
) -> hl.expr.StructExpression:
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
"""
Generates struct liftover expression.

Struct contains:
- locus: Liftover coordinates
- alleles: Liftover alleles
- original_locus: Locus prior to liftover
- original_alleles: Alleles prior to liftover
- locus_fail_liftover: Whether the locus failed liftover
- ref_allele_mismatch: Whether the allele at index 0 of alleles (lifted over reference allele)
doesn't match the allele at that position in the destination reference

:param locus: Input locus.
:param alleles: Input alleles.
:param destination_reference: Desired reference genome build for liftover.
:return: Struct containing expressions for lifted over locus/alleles as well as original locus/alleles.
"""
lifted_over_locus = hl.liftover(locus, destination_reference, include_strand=True)
lifted_over_alleles = alleles.map(
lambda a: hl.if_else(
lifted_over_locus.is_negative_strand, hl.reverse_complement(a), a
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
)
)

source.add_liftover(chain, target)
return [source, target]
return hl.struct(
new_locus=lifted_over_locus.result,
new_alleles=lifted_over_alleles,
original_locus=locus,
original_alleles=alleles,
locus_fail_liftover=hl.is_missing(lifted_over_locus),
ref_allele_mismatch=lifted_over_locus.sequence_context()
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
!= lifted_over_alleles[0],
)


def lift_data(
def default_lift_data(
t: Union[hl.MatrixTable, hl.Table],
gnomad: bool,
data_type: str,
path: str,
rg: hl.genetics.ReferenceGenome,
overwrite: bool,
) -> Union[hl.MatrixTable, hl.Table]:
"""
Lifts input Table or MatrixTable from one reference build to another

:param t: Table or MatrixTable
:param gnomad: Whether data is gnomAD data
:param data_type: Data type (exomes or genomes for gnomAD; not used otherwise)
:param path: Path to input Table/MatrixTable (if data is not gnomAD data)
:param rg: Reference genome
:param overwrite: Whether to overwrite data
:return: Table or MatrixTablewith liftover annotations
Lifts input Table or MatrixTable from one reference build to another.

:param t: Table or MatrixTable.
:return: Table or MatrixTable with liftover annotations.
"""
logger.info("Inferring input reference and destination reference...")
_, target_genome = get_liftover_genome(t)

logger.info("Annotating input with liftover coordinates")
liftover_expr = {
"new_locus": hl.liftover(t.locus, rg, include_strand=True),
"old_locus": t.locus,
}
logger.info("Annotating input data with liftover coordinates...")
t = (
t.annotate(**liftover_expr)
t.annotate(**liftover_expr(t.locus, t.alleles, target_genome))
if isinstance(t, hl.Table)
else t.annotate_rows(**liftover_expr)
else t.annotate_rows(**liftover_expr(t.locus, t.alleles, target_genome))
)

no_target_expr = hl.agg.count_where(hl.is_missing(t.new_locus))
no_target_expr = hl.agg.count_where(t.locus_fail_liftover)
num_no_target = (
t.aggregate(no_target_expr)
if isinstance(t, hl.Table)
else t.aggregate_rows(no_target_expr)
)

logger.info(f"Filtering out {num_no_target} sites that failed to liftover")
keep_expr = hl.is_defined(t.new_locus)
logger.info(f"Filtering out {num_no_target} sites that failed to liftover...")
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
keep_expr = ~t.locus_fail_liftover
t = t.filter(keep_expr) if isinstance(t, hl.Table) else t.filter_rows(keep_expr)

row_key_expr = {"locus": t.new_locus.result, "alleles": t.alleles}
t = (
row_key_expr = {"locus": t.new_locus.result, "alleles": t.new_alleles}
return (
t.key_by(**row_key_expr)
if isinstance(t, hl.Table)
else t.key_rows_by(**row_key_expr)
)

logger.info("Writing out lifted over data")
t = t.checkpoint(
get_checkpoint_path(gnomad, data_type, path, isinstance(t, hl.Table)),
overwrite=overwrite,
)
return t


def annotate_snp_mismatch(
t: Union[hl.MatrixTable, hl.Table], rg: hl.genetics.ReferenceGenome
) -> Union[hl.MatrixTable, hl.Table]:
"""
Annotates mismatches between reference allele and allele in reference fasta

Assumes input Table/MatrixTable has t.new_locus annotation

:param t: Table/MatrixTable of SNPs to be annotated
:param rg: Reference genome with fasta sequence loaded
:return: Table annotated with mismatches between reference allele and allele in fasta
"""

logger.info("Filtering to SNPs")
snp_expr = hl.is_snp(t.alleles[0], t.alleles[1])
t = t.filter(snp_expr) if isinstance(t, hl.Table) else t.filter_rows(snp_expr)

mismatch_expr = {
"reference_mismatch": hl.cond(
t.new_locus.is_negative_strand,
(
hl.reverse_complement(t.alleles[0])
!= hl.get_sequence(
t.locus.contig, t.locus.position, reference_genome=rg
)
),
(
t.alleles[0]
!= hl.get_sequence(
t.locus.contig, t.locus.position, reference_genome=rg
)
),
)
}
logger.info("Checking if reference allele matches what is in reference fasta")
logger.info(
"For SNPs on the negative strand, make sure the reverse complement of the ref alleles matches what is in the ref fasta"
)
return (
t.annotate(**mismatch_expr)
if isinstance(t, hl.Table)
else t.annotate_rows(**mismatch_expr)
)


def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
"""
Checks for mismatches between reference allele and allele in reference fasta

:param ht: Table to be checked
:return: StructExpression containing counts for mismatches and count for all variants on negative strand
def liftover_using_gnomad_map(ht: hl.Table, data_type: str):
"""
Liftover a gnomAD table using already-established liftover file.
ch-kr marked this conversation as resolved.
Show resolved Hide resolved

mismatch = ht.aggregate(
hl.struct(
total_variants=hl.agg.count(),
total_mismatch=hl.agg.count_where(ht.reference_mismatch),
negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
negative_strand_mismatch=hl.agg.count_where(
ht.new_locus.is_negative_strand & ht.reference_mismatch
),
)
)
return mismatch


def liftover_using_gnomad_map(ht, data_type):
"""
Liftover a gnomAD table using already-established liftover file. Warning: shuffles!
.. note::
This function shuffles!

:param ht: Input Hail table
:param data_type: one of "exomes" or "genomes" which to map across
:return: Lifted over table
:param ht: Input Hail Table.
:param data_type: Which gnomAD data type to map across. One of "exomes" or "genomes".
:return: Lifted over Table
"""
from gnomad.resources.grch37.gnomad import liftover

logger.warning("This function will trigger a shuffle! Pre-emptibles may not work.")
lift_ht = liftover(data_type).ht()
ht = ht.key_by(original_locus=ht.locus, original_alleles=ht.alleles).drop(
"locus", "alleles"
Expand Down