Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: flag MHC credible sets based on lead #767

Merged
merged 11 commits into from
Sep 18, 2024
1 change: 1 addition & 0 deletions docs/python_api/common/_common.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ title: Common

Common utilities used in gentropy package.

- [**Genomic Region**](genomic_region.md): class to represent genomic regions
- [**Version Engine**](version_engine.md): class to extract version from datasource input paths
- [**Types**](types.md): Literal types used in the gentropy
6 changes: 6 additions & 0 deletions docs/python_api/common/genomic_region.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
title: Genomic Region
---

:::gentropy.common.genomic_region.KnownGenomicRegions
:::gentropy.common.genomic_region.GenomicRegion
103 changes: 103 additions & 0 deletions src/gentropy/common/genomic_region.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Genomic Region class."""

from enum import Enum


class KnownGenomicRegions(Enum):
"""Known genomic regions in the human genome in string format."""

MHC = "chr6:25726063-33400556"


class GenomicRegion:
"""Genomic regions of interest.

Attributes:
chromosome (str): Chromosome.
start (int): Start position.
end (int):
"""

def __init__(self, chromosome: str, start: int, end: int) -> None:
"""Class constructor.

Args:
chromosome (str): Chromosome.
start (int): Start position.
end (int): End position.
"""
self.chromosome = chromosome
self.start = start
self.end = end

def __str__(self) -> str:
"""String representation of the genomic region.

Returns:
str: Genomic region in chr:start-end format.
"""
return f"{self.chromosome}:{self.start}-{self.end}"

@classmethod
def from_string(cls: type["GenomicRegion"], region: str) -> "GenomicRegion":
"""Parse region string to chr:start-end.

Args:
region (str): Genomic region expected to follow chr##:#,###-#,### format or ##:####-#####.

Returns:
GenomicRegion: Genomic region object.

Raises:
ValueError: If the end and start positions cannot be casted to integer or not all three values value error is raised.

Examples:
>>> print(GenomicRegion.from_string('chr6:28,510,120-33,480,577'))
6:28510120-33480577
>>> print(GenomicRegion.from_string('6:28510120-33480577'))
6:28510120-33480577
>>> print(GenomicRegion.from_string('6:28510120'))
Traceback (most recent call last):
...
ValueError: Genomic region should follow a ##:####-#### format.
>>> print(GenomicRegion.from_string('6:28510120-foo'))
Traceback (most recent call last):
...
ValueError: Start and the end position of the region has to be integer.
"""
region = region.replace(":", "-").replace(",", "")
try:
chromosome, start_position, end_position = region.split("-")
except ValueError as err:
raise ValueError(
"Genomic region should follow a ##:####-#### format."
) from err

try:
return cls(
chromosome=chromosome.replace("chr", ""),
start=int(start_position),
end=int(end_position),
)
except ValueError as err:
raise ValueError(
"Start and the end position of the region has to be integer."
) from err

@classmethod
def from_known_genomic_region(
cls: type["GenomicRegion"], region: KnownGenomicRegions
) -> "GenomicRegion":
"""Get known genomic region.

Args:
region (KnownGenomicRegions): Known genomic region.

Returns:
GenomicRegion: Genomic region object.

Examples:
>>> print(GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC))
6:25726063-33400556
"""
return GenomicRegion.from_string(region.value)
42 changes: 1 addition & 41 deletions src/gentropy/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import sys
from math import floor, log10
from typing import TYPE_CHECKING, Tuple
from typing import TYPE_CHECKING

import hail as hl
import numpy as np
Expand All @@ -19,46 +19,6 @@
from pyspark.sql import Column


def parse_region(region: str) -> Tuple[str, int, int]:
"""Parse region string to chr:start-end.

Args:
region (str): Genomic region expected to follow chr##:#,###-#,### format or ##:####-#####.

Returns:
Tuple[str, int, int]: Chromosome, start position, end position

Raises:
ValueError: If the end and start positions cannot be casted to integer or not all three values value error is raised.

Examples:
>>> parse_region('chr6:28,510,120-33,480,577')
('6', 28510120, 33480577)
>>> parse_region('6:28510120-33480577')
('6', 28510120, 33480577)
>>> parse_region('6:28510120')
Traceback (most recent call last):
...
ValueError: Genomic region should follow a ##:####-#### format.
>>> parse_region('6:28510120-foo')
Traceback (most recent call last):
...
ValueError: Start and the end position of the region has to be integer.
"""
region = region.replace(":", "-").replace(",", "")
try:
(chromosome, start_position, end_position) = region.split("-")
except ValueError as err:
raise ValueError("Genomic region should follow a ##:####-#### format.") from err

try:
return (chromosome.replace("chr", ""), int(start_position), int(end_position))
except ValueError as err:
raise ValueError(
"Start and the end position of the region has to be integer."
) from err


def calculate_confidence_interval(
pvalue_mantissa: Column,
pvalue_exponent: Column,
Expand Down
45 changes: 35 additions & 10 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, FloatType, StringType

from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import (
calculate_neglog_pvalue,
order_array_of_structs_by_field,
)
from gentropy.common.utils import get_logsum, parse_region
from gentropy.common.utils import get_logsum
from gentropy.dataset.dataset import Dataset
from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
from gentropy.dataset.variant_index import VariantIndex
Expand Down Expand Up @@ -49,6 +50,7 @@ class StudyLocusQualityCheck(Enum):
MISSING_STUDY (str): Flagging study loci if the study is not found in the study index as a reference
DUPLICATED_STUDYLOCUS_ID (str): Study-locus identifier is not unique.
INVALID_VARIANT_IDENTIFIER (str): Flagging study loci where identifier of any tagging variant was not found in the variant index
IN_MHC (str): Flagging study loci in the MHC region
"""

SUBSIGNIFICANT_FLAG = "Subsignificant p-value"
Expand All @@ -70,6 +72,7 @@ class StudyLocusQualityCheck(Enum):
INVALID_VARIANT_IDENTIFIER = (
"Some variant identifiers of this locus were not found in variant index"
)
IN_MHC = "MHC region"


class CredibleInterval(Enum):
Expand Down Expand Up @@ -817,32 +820,31 @@ def clump(self: StudyLocus) -> StudyLocus:
return self

def exclude_region(
self: StudyLocus, region: str, exclude_overlap: bool = False
self: StudyLocus, region: GenomicRegion, exclude_overlap: bool = False
) -> StudyLocus:
"""Exclude a region from the StudyLocus dataset.

Args:
region (str): region given in "chr##:#####-####" format
region (GenomicRegion): genomic region object.
exclude_overlap (bool): If True, excludes StudyLocus windows with any overlap with the region.

Returns:
StudyLocus: filtered StudyLocus object.
"""
(chromosome, start_position, end_position) = parse_region(region)
if exclude_overlap:
filter_condition = ~(
(f.col("chromosome") == chromosome)
(f.col("chromosome") == region.chromosome)
& (
(f.col("locusStart") <= end_position)
& (f.col("locusEnd") >= start_position)
(f.col("locusStart") <= region.end)
& (f.col("locusEnd") >= region.start)
)
)
else:
filter_condition = ~(
(f.col("chromosome") == chromosome)
(f.col("chromosome") == region.chromosome)
& (
(f.col("position") >= start_position)
& (f.col("position") <= end_position)
(f.col("position") >= region.start)
& (f.col("position") <= region.end)
)
)

Expand All @@ -851,6 +853,29 @@ def exclude_region(
_schema=StudyLocus.get_schema(),
)

def qc_MHC_region(self: StudyLocus) -> StudyLocus:
"""Adds qualityControl flag when lead overlaps with MHC region.

Returns:
StudyLocus: including qualityControl flag if in MHC region.
"""
region = GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC)
self.df = self.df.withColumn(
"qualityControls",
self.update_quality_flag(
f.col("qualityControls"),
~(
(f.col("chromosome") == region.chromosome)
& (
(f.col("position") <= region.end)
& (f.col("position") >= region.start)
)
),
StudyLocusQualityCheck.IN_MHC,
),
)
return self

def _qc_no_population(self: StudyLocus) -> StudyLocus:
"""Flag associations where the study doesn't have population information to resolve LD.

Expand Down
17 changes: 9 additions & 8 deletions src/gentropy/dataset/summary_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

import pyspark.sql.functions as f

from gentropy.common.genomic_region import GenomicRegion
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.utils import parse_region, split_pvalue
from gentropy.common.utils import split_pvalue
from gentropy.config import LocusBreakerClumpingConfig, WindowBasedClumpingStepConfig
from gentropy.dataset.dataset import Dataset

Expand Down Expand Up @@ -112,25 +113,25 @@ def locus_breaker_clumping(
flanking_distance,
)

def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
def exclude_region(
self: SummaryStatistics, region: GenomicRegion
) -> SummaryStatistics:
"""Exclude a region from the summary stats dataset.

Args:
region (str): region given in "chr##:#####-####" format
region (GenomicRegion): Genomic region to be excluded.

Returns:
SummaryStatistics: filtered summary statistics.
"""
(chromosome, start_position, end_position) = parse_region(region)

return SummaryStatistics(
_df=(
self.df.filter(
~(
(f.col("chromosome") == chromosome)
(f.col("chromosome") == region.chromosome)
& (
(f.col("position") >= start_position)
& (f.col("position") <= end_position)
(f.col("position") >= region.start)
& (f.col("position") <= region.end)
)
)
)
Expand Down
4 changes: 3 additions & 1 deletion src/gentropy/locus_breaker_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
from gentropy.common.session import Session
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.method.locus_breaker_clumping import LocusBreakerClumping
Expand Down Expand Up @@ -63,7 +64,8 @@ def __init__(
)
if remove_mhc:
clumped_result = clumped_result.exclude_region(
"chr6:25726063-33400556", exclude_overlap=True
GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC),
exclude_overlap=True,
)

if collect_locus:
Expand Down
8 changes: 5 additions & 3 deletions src/gentropy/study_locus_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ def __init__(
# Running validation then writing output:
study_locus_with_qc = (
StudyLocus.from_parquet(session, list(study_locus_path))
.validate_lead_pvalue(
pvalue_cutoff=gwas_significance
) # Flagging study locus with subsignificant p-values
# Flagging study locus with subsignificant p-values
.validate_lead_pvalue(pvalue_cutoff=gwas_significance)
# Add flag for MHC region
.qc_MHC_region()
# Excluding MHC region
d0choa marked this conversation as resolved.
Show resolved Hide resolved
.validate_study(study_index) # Flagging studies not in study index
.validate_unique_study_locus_id() # Flagging duplicated study locus ids
).persist() # we will need this for 2 types of outputs
Expand Down
5 changes: 5 additions & 0 deletions tests/gentropy/dataset/test_study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,11 @@ def test__qc_no_population(mock_study_locus: StudyLocus) -> None:
assert isinstance(mock_study_locus._qc_no_population(), StudyLocus)


def test_qc_MHC_region(mock_study_locus: StudyLocus) -> None:
"""Test qc_MHC_region."""
assert isinstance(mock_study_locus.qc_MHC_region(), StudyLocus)


def test_ldannotate(
mock_study_locus: StudyLocus, mock_study_index: StudyIndex, mock_ld_index: LDIndex
) -> None:
Expand Down
8 changes: 6 additions & 2 deletions tests/gentropy/dataset/test_summary_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pyspark.sql import types as t

from gentropy.common.genomic_region import GenomicRegion
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.summary_statistics import SummaryStatistics

Expand Down Expand Up @@ -44,7 +45,10 @@ def test_summary_statistics__exclude_region__return_type(
) -> None:
"""Testing if the exclude region method returns the right datatype."""
assert isinstance(
mock_summary_statistics.exclude_region("chr12:124-1245"), SummaryStatistics
mock_summary_statistics.exclude_region(
GenomicRegion.from_string("chr12:124-1245")
),
SummaryStatistics,
)


Expand Down Expand Up @@ -85,7 +89,7 @@ def test_summary_statistics__exclude_region__correctness(
df = spark.createDataFrame(data, schema=schema)
filtered_sumstas = SummaryStatistics(
_df=df, _schema=SummaryStatistics.get_schema()
).exclude_region("c1:9-16")
).exclude_region(GenomicRegion.from_string("c1:9-16"))

# Test for the correct number of rows returned:
assert filtered_sumstas.df.count() == 8
Expand Down