From 34eca00b87f493689461a0938d05b6b15ee80ccb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?=
 <45119610+ireneisdoomed@users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:41:08 +0100
Subject: [PATCH] feat(ld_annotator): optional r2 threshold (#648)

* feat(ld_annotator): apply r2 threshold

* feat(ld_annotator): apply r2 threshold

* chore(ld_annotator): change threshold to 0.5
---
 src/gentropy/dataset/study_locus.py        | 27 ++++++++++++++++++++--
 src/gentropy/method/ld.py                  | 12 ++++++++++
 tests/gentropy/dataset/test_study_locus.py | 21 ++++++++++++-----
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 9ef536b4d..44778311e 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -326,6 +326,25 @@ def filter_credible_set(
         )
         return self
 
+    @staticmethod
+    def filter_ld_set(ld_set: Column, r2_threshold: float) -> Column:
+        """Keep only the LD set entries whose `r2Overall` is at least `r2_threshold`.
+
+        Args:
+            ld_set (Column): Array column of LD tags, each exposing an `r2Overall` field
+            r2_threshold (float): Inclusive lower bound on `r2Overall` for a tag to be kept
+
+        Returns:
+            Column: Filtered LD set; null when `ld_set` is null, since the `when` has no `otherwise`
+        """
+        return f.when(
+            ld_set.isNotNull(),
+            f.filter(
+                ld_set,
+                lambda tag: tag["r2Overall"] >= r2_threshold,
+            ),
+        )
+
     def find_overlaps(
         self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False
     ) -> StudyLocusOverlap:
@@ -524,20 +543,24 @@ def annotate_locus_statistics(
         return self
 
     def annotate_ld(
-        self: StudyLocus, study_index: StudyIndex, ld_index: LDIndex
+        self: StudyLocus,
+        study_index: StudyIndex,
+        ld_index: LDIndex,
+        r2_threshold: float = 0.0,
     ) -> StudyLocus:
         """Annotate LD information to study-locus.
 
         Args:
             study_index (StudyIndex): Study index to resolve ancestries.
             ld_index (LDIndex): LD index to resolve LD information.
+            r2_threshold (float): R2 threshold passed on to `LDAnnotator.ld_annotate` to filter the LD set. Default is 0.0.
 
         Returns:
             StudyLocus: Study locus annotated with ld information from LD index.
         """
         from gentropy.method.ld import LDAnnotator
 
-        return LDAnnotator.ld_annotate(self, study_index, ld_index)
+        return LDAnnotator.ld_annotate(self, study_index, ld_index, r2_threshold)
 
     def clump(self: StudyLocus) -> StudyLocus:
         """Perform LD clumping of the studyLocus.
diff --git a/src/gentropy/method/ld.py b/src/gentropy/method/ld.py
index f0eab7c4b..68b78b103 100644
--- a/src/gentropy/method/ld.py
+++ b/src/gentropy/method/ld.py
@@ -1,4 +1,5 @@
 """Performing linkage disequilibrium (LD) operations."""
+
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
@@ -120,6 +121,7 @@ def ld_annotate(
         associations: StudyLocus,
         studies: StudyIndex,
         ld_index: LDIndex,
+        r2_threshold: float = 0.5,
     ) -> StudyLocus:
         """Annotate linkage disequilibrium (LD) information to a set of studyLocus.
 
@@ -131,10 +133,14 @@ def ld_annotate(
             5. Flags associations with variants that are not found in the LD reference
             6. Rescues lead variant when no LD information is available but lead variant is available
 
+        !!! note
+            Because the LD index has a pre-set threshold of R2 = 0.5, this is the minimum threshold for the LD information to be included in the ldSet.
+
         Args:
             associations (StudyLocus): Dataset to be LD annotated
             studies (StudyIndex): Dataset with study information
             ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix
+            r2_threshold (float): R2 threshold to filter the LD set on. Default is 0.5.
 
         Returns:
             StudyLocus: including additional column with LD information.
@@ -175,6 +181,12 @@ def ld_annotate(
                     ),
                 )
                 .drop("ldPopulationStructure")
+                # Filter the LD set by the R2 threshold and set to null if no LD information passes the threshold
+                .withColumn(
+                    "ldSet",
+                    StudyLocus.filter_ld_set(f.col("ldSet"), r2_threshold),
+                )
+                .withColumn("ldSet", f.when(f.size("ldSet") > 0, f.col("ldSet")))
                 # QC: Flag associations with variants that are not found in the LD reference
                 .withColumn(
                     "qualityControls",
diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index 1401b9dd3..772e49742 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -11,7 +11,7 @@
 from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
 from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
 from gentropy.dataset.summary_statistics import SummaryStatistics
-from pyspark.sql import Column, SparkSession
+from pyspark.sql import Column, Row, SparkSession
 from pyspark.sql.types import (
     ArrayType,
     BooleanType,
@@ -23,11 +23,6 @@
 )
 
 
-def test_study_locus_creation(mock_study_locus: StudyLocus) -> None:
-    """Test study locus creation with mock data."""
-    assert isinstance(mock_study_locus, StudyLocus)
-
-
 @pytest.mark.parametrize(
     "has_overlap, expected",
     [
@@ -531,3 +526,17 @@ def test_ldannotate(
     assert isinstance(
         mock_study_locus.annotate_ld(mock_study_index, mock_ld_index), StudyLocus
     )
+
+
+def test_filter_ld_set(spark: SparkSession) -> None:
+    """Test that filter_ld_set drops tags whose r2Overall is below the threshold."""
+    observed_data = [
+        Row(studyLocusId="sl1", ldSet=[{"tagVariantId": "tag1", "r2Overall": 0.4}])
+    ]
+    observed_df = spark.createDataFrame(
+        observed_data, ["studyLocusId", "ldSet"]
+    ).withColumn("ldSet", StudyLocus.filter_ld_set(f.col("ldSet"), 0.5))
+    expected_tags_in_ld = 0  # the only tag (r2 0.4) is below the 0.5 threshold
+    assert (
+        observed_df.filter(f.size("ldSet") > 0).count() == expected_tags_in_ld
+    ), "Expected tags in ld set differ from observed."