opentargets · d0choa · Sep 18, 2024 · Sep 16, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/docs/python_api/common/_common.md b/docs/python_api/common/_common.md
@@ -4,5 +4,6 @@ title: Common
 
 Common utilities used in gentropy package.
 
+- [**Genomic Region**](genomic_region.md): class to represent genomic regions
 - [**Version Engine**](version_engine.md): class to extract version from datasource input paths
 - [**Types**](types.md): Literal types used in the gentropy
diff --git a/docs/python_api/common/genomic_region.md b/docs/python_api/common/genomic_region.md
@@ -0,0 +1,6 @@
+---
+title: Genomic Region
+---
+
+:::gentropy.common.genomic_region.KnownGenomicRegions
+:::gentropy.common.genomic_region.GenomicRegion
diff --git a/src/gentropy/common/genomic_region.py b/src/gentropy/common/genomic_region.py
@@ -0,0 +1,103 @@
+"""Genomic Region class."""
+
+from enum import Enum
+
+
+class KnownGenomicRegions(Enum):
+    """Known genomic regions in the human genome in string format."""
+
+    MHC = "chr6:25726063-33400556"
+
+
+class GenomicRegion:
+    """Genomic regions of interest.
+
+    Attributes:
+        chromosome (str): Chromosome.
+        start (int): Start position.
+        end (int):
+    """
+
+    def __init__(self, chromosome: str, start: int, end: int) -> None:
+        """Class constructor.
+
+        Args:
+            chromosome (str): Chromosome.
+            start (int): Start position.
+            end (int): End position.
+        """
+        self.chromosome = chromosome
+        self.start = start
+        self.end = end
+
+    def __str__(self) -> str:
+        """String representation of the genomic region.
+
+        Returns:
+            str: Genomic region in chr:start-end format.
+        """
+        return f"{self.chromosome}:{self.start}-{self.end}"
+
+    @classmethod
+    def from_string(cls: type["GenomicRegion"], region: str) -> "GenomicRegion":
+        """Parse region string to chr:start-end.
+
+        Args:
+            region (str): Genomic region expected to follow chr##:#,###-#,### format or ##:####-#####.
+
+        Returns:
+            GenomicRegion: Genomic region object.
+
+        Raises:
+            ValueError: If the end and start positions cannot be casted to integer or not all three values value error is raised.
+
+        Examples:
+            >>> print(GenomicRegion.from_string('chr6:28,510,120-33,480,577'))
+            6:28510120-33480577
+            >>> print(GenomicRegion.from_string('6:28510120-33480577'))
+            6:28510120-33480577
+            >>> print(GenomicRegion.from_string('6:28510120'))
+            Traceback (most recent call last):
+                ...
+            ValueError: Genomic region should follow a ##:####-#### format.
+            >>> print(GenomicRegion.from_string('6:28510120-foo'))
+            Traceback (most recent call last):
+                ...
+            ValueError: Start and the end position of the region has to be integer.
+        """
+        region = region.replace(":", "-").replace(",", "")
+        try:
+            chromosome, start_position, end_position = region.split("-")
+        except ValueError as err:
+            raise ValueError(
+                "Genomic region should follow a ##:####-#### format."
+            ) from err
+
+        try:
+            return cls(
+                chromosome=chromosome.replace("chr", ""),
+                start=int(start_position),
+                end=int(end_position),
+            )
+        except ValueError as err:
+            raise ValueError(
+                "Start and the end position of the region has to be integer."
+            ) from err
+
+    @classmethod
+    def from_known_genomic_region(
+        cls: type["GenomicRegion"], region: KnownGenomicRegions
+    ) -> "GenomicRegion":
+        """Get known genomic region.
+
+        Args:
+            region (KnownGenomicRegions): Known genomic region.
+
+        Returns:
+            GenomicRegion: Genomic region object.
+
+        Examples:
+            >>> print(GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC))
+            6:25726063-33400556
+        """
+        return GenomicRegion.from_string(region.value)
diff --git a/src/gentropy/common/utils.py b/src/gentropy/common/utils.py
@@ -4,7 +4,7 @@
 
 import sys
 from math import floor, log10
-from typing import TYPE_CHECKING, Tuple
+from typing import TYPE_CHECKING
 
 import hail as hl
 import numpy as np
@@ -19,46 +19,6 @@
     from pyspark.sql import Column
 
 
-def parse_region(region: str) -> Tuple[str, int, int]:
-    """Parse region string to chr:start-end.
-
-    Args:
-        region (str): Genomic region expected to follow chr##:#,###-#,### format or ##:####-#####.
-
-    Returns:
-        Tuple[str, int, int]: Chromosome, start position, end position
-
-    Raises:
-        ValueError: If the end and start positions cannot be casted to integer or not all three values value error is raised.
-
-    Examples:
-        >>> parse_region('chr6:28,510,120-33,480,577')
-        ('6', 28510120, 33480577)
-        >>> parse_region('6:28510120-33480577')
-        ('6', 28510120, 33480577)
-        >>> parse_region('6:28510120')
-        Traceback (most recent call last):
-            ...
-        ValueError: Genomic region should follow a ##:####-#### format.
-        >>> parse_region('6:28510120-foo')
-        Traceback (most recent call last):
-            ...
-        ValueError: Start and the end position of the region has to be integer.
-    """
-    region = region.replace(":", "-").replace(",", "")
-    try:
-        (chromosome, start_position, end_position) = region.split("-")
-    except ValueError as err:
-        raise ValueError("Genomic region should follow a ##:####-#### format.") from err
-
-    try:
-        return (chromosome.replace("chr", ""), int(start_position), int(end_position))
-    except ValueError as err:
-        raise ValueError(
-            "Start and the end position of the region has to be integer."
-        ) from err
-
-
 def calculate_confidence_interval(
     pvalue_mantissa: Column,
     pvalue_exponent: Column,

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -10,12 +10,13 @@
 import pyspark.sql.functions as f
 from pyspark.sql.types import ArrayType, FloatType, StringType
 
+from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
 from gentropy.common.schemas import parse_spark_schema
 from gentropy.common.spark_helpers import (
     calculate_neglog_pvalue,
     order_array_of_structs_by_field,
 )
-from gentropy.common.utils import get_logsum, parse_region
+from gentropy.common.utils import get_logsum
 from gentropy.dataset.dataset import Dataset
 from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
 from gentropy.dataset.variant_index import VariantIndex
@@ -49,6 +50,7 @@ class StudyLocusQualityCheck(Enum):
         MISSING_STUDY (str): Flagging study loci if the study is not found in the study index as a reference
         DUPLICATED_STUDYLOCUS_ID (str): Study-locus identifier is not unique.
         INVALID_VARIANT_IDENTIFIER (str): Flagging study loci where identifier of any tagging variant was not found in the variant index
+        IN_MHC (str): Flagging study loci in the MHC region
     """
 
     SUBSIGNIFICANT_FLAG = "Subsignificant p-value"
@@ -70,6 +72,7 @@ class StudyLocusQualityCheck(Enum):
     INVALID_VARIANT_IDENTIFIER = (
         "Some variant identifiers of this locus were not found in variant index"
     )
+    IN_MHC = "MHC region"
 
 
 class CredibleInterval(Enum):
@@ -817,32 +820,31 @@ def clump(self: StudyLocus) -> StudyLocus:
         return self
 
     def exclude_region(
-        self: StudyLocus, region: str, exclude_overlap: bool = False
+        self: StudyLocus, region: GenomicRegion, exclude_overlap: bool = False
     ) -> StudyLocus:
         """Exclude a region from the StudyLocus dataset.
 
         Args:
-            region (str): region given in "chr##:#####-####" format
+            region (GenomicRegion): genomic region object.
             exclude_overlap (bool): If True, excludes StudyLocus windows with any overlap with the region.
 
         Returns:
             StudyLocus: filtered StudyLocus object.
         """
-        (chromosome, start_position, end_position) = parse_region(region)
         if exclude_overlap:
             filter_condition = ~(
-                (f.col("chromosome") == chromosome)
+                (f.col("chromosome") == region.chromosome)
                 & (
-                    (f.col("locusStart") <= end_position)
-                    & (f.col("locusEnd") >= start_position)
+                    (f.col("locusStart") <= region.end)
+                    & (f.col("locusEnd") >= region.start)
                 )
             )
         else:
             filter_condition = ~(
-                (f.col("chromosome") == chromosome)
+                (f.col("chromosome") == region.chromosome)
                 & (
-                    (f.col("position") >= start_position)
-                    & (f.col("position") <= end_position)
+                    (f.col("position") >= region.start)
+                    & (f.col("position") <= region.end)
                 )
             )
 
@@ -851,6 +853,29 @@ def exclude_region(
             _schema=StudyLocus.get_schema(),
         )
 
+    def qc_MHC_region(self: StudyLocus) -> StudyLocus:
+        """Adds qualityControl flag when lead overlaps with MHC region.
+
+        Returns:
+            StudyLocus: including qualityControl flag if in MHC region.
+        """
+        region = GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC)
+        self.df = self.df.withColumn(
+            "qualityControls",
+            self.update_quality_flag(
+                f.col("qualityControls"),
+                ~(
+                    (f.col("chromosome") == region.chromosome)
+                    & (
+                        (f.col("position") <= region.end)
+                        & (f.col("position") >= region.start)
+                    )
+                ),
+                StudyLocusQualityCheck.IN_MHC,
+            ),
+        )
+        return self
+
     def _qc_no_population(self: StudyLocus) -> StudyLocus:
         """Flag associations where the study doesn't have population information to resolve LD.
 

diff --git a/src/gentropy/dataset/summary_statistics.py b/src/gentropy/dataset/summary_statistics.py
@@ -7,8 +7,9 @@
 
 import pyspark.sql.functions as f
 
+from gentropy.common.genomic_region import GenomicRegion
 from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.utils import parse_region, split_pvalue
+from gentropy.common.utils import split_pvalue
 from gentropy.config import LocusBreakerClumpingConfig, WindowBasedClumpingStepConfig
 from gentropy.dataset.dataset import Dataset
 
@@ -112,25 +113,25 @@ def locus_breaker_clumping(
             flanking_distance,
         )
 
-    def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
+    def exclude_region(
+        self: SummaryStatistics, region: GenomicRegion
+    ) -> SummaryStatistics:
         """Exclude a region from the summary stats dataset.
 
         Args:
-            region (str): region given in "chr##:#####-####" format
+            region (GenomicRegion): Genomic region to be excluded.
 
         Returns:
             SummaryStatistics: filtered summary statistics.
         """
-        (chromosome, start_position, end_position) = parse_region(region)
-
         return SummaryStatistics(
             _df=(
                 self.df.filter(
                     ~(
-                        (f.col("chromosome") == chromosome)
+                        (f.col("chromosome") == region.chromosome)
                         & (
-                            (f.col("position") >= start_position)
-                            & (f.col("position") <= end_position)
+                            (f.col("position") >= region.start)
+                            & (f.col("position") <= region.end)
                         )
                     )
                 )

diff --git a/src/gentropy/locus_breaker_clumping.py b/src/gentropy/locus_breaker_clumping.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
 from gentropy.common.session import Session
 from gentropy.dataset.summary_statistics import SummaryStatistics
 from gentropy.method.locus_breaker_clumping import LocusBreakerClumping
@@ -63,7 +64,8 @@ def __init__(
         )
         if remove_mhc:
             clumped_result = clumped_result.exclude_region(
-                "chr6:25726063-33400556", exclude_overlap=True
+                GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC),
+                exclude_overlap=True,
             )
 
         if collect_locus:

diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
@@ -41,9 +41,11 @@ def __init__(
         # Running validation then writing output:
         study_locus_with_qc = (
             StudyLocus.from_parquet(session, list(study_locus_path))
-            .validate_lead_pvalue(
-                pvalue_cutoff=gwas_significance
-            )  # Flagging study locus with subsignificant p-values
+            # Flagging study locus with subsignificant p-values
+            .validate_lead_pvalue(pvalue_cutoff=gwas_significance)
+            # Add flag for MHC region
+            .qc_MHC_region()
+            # Excluding MHC region
             .validate_study(study_index)  # Flagging studies not in study index
             .validate_unique_study_locus_id()  # Flagging duplicated study locus ids
         ).persist()  # we will need this for 2 types of outputs

diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
@@ -526,6 +526,11 @@ def test__qc_no_population(mock_study_locus: StudyLocus) -> None:
     assert isinstance(mock_study_locus._qc_no_population(), StudyLocus)
 
 
+def test_qc_MHC_region(mock_study_locus: StudyLocus) -> None:
+    """Test qc_MHC_region."""
+    assert isinstance(mock_study_locus.qc_MHC_region(), StudyLocus)
+
+
 def test_ldannotate(
     mock_study_locus: StudyLocus, mock_study_index: StudyIndex, mock_ld_index: LDIndex
 ) -> None:

diff --git a/tests/gentropy/dataset/test_summary_statistics.py b/tests/gentropy/dataset/test_summary_statistics.py
@@ -6,6 +6,7 @@
 
 from pyspark.sql import types as t
 
+from gentropy.common.genomic_region import GenomicRegion
 from gentropy.dataset.study_locus import StudyLocus
 from gentropy.dataset.summary_statistics import SummaryStatistics
 
@@ -44,7 +45,10 @@ def test_summary_statistics__exclude_region__return_type(
 ) -> None:
     """Testing if the exclude region method returns the right datatype."""
     assert isinstance(
-        mock_summary_statistics.exclude_region("chr12:124-1245"), SummaryStatistics
+        mock_summary_statistics.exclude_region(
+            GenomicRegion.from_string("chr12:124-1245")
+        ),
+        SummaryStatistics,
     )
 
 
@@ -85,7 +89,7 @@ def test_summary_statistics__exclude_region__correctness(
     df = spark.createDataFrame(data, schema=schema)
     filtered_sumstas = SummaryStatistics(
         _df=df, _schema=SummaryStatistics.get_schema()
-    ).exclude_region("c1:9-16")
+    ).exclude_region(GenomicRegion.from_string("c1:9-16"))
 
     # Test for the correct number of rows returned:
     assert filtered_sumstas.df.count() == 8