From c15a3a3d04f99eb793680dd25c5d88c2d10de2ff Mon Sep 17 00:00:00 2001
From: louwenjjr <jorislouwen@hotmail.com>
Date: Fri, 15 Mar 2024 18:30:00 +0100
Subject: [PATCH] fix(sumstats): correct study id for dir of finngen studies

---
 src/gentropy/datasource/finngen/summary_stats.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gentropy/datasource/finngen/summary_stats.py b/src/gentropy/datasource/finngen/summary_stats.py
index 0d77f7d5c..651c9a2d3 100644
--- a/src/gentropy/datasource/finngen/summary_stats.py
+++ b/src/gentropy/datasource/finngen/summary_stats.py
@@ -50,7 +50,6 @@ def from_source(
         Returns:
             SummaryStatistics: Processed summary statistics dataset
         """
-        study_id = raw_file.split("/")[-1].split(".")[0].upper()
         processed_summary_stats_df = (
             spark.read.schema(cls.raw_schema)
             .option("delimiter", "\t")
@@ -59,7 +58,11 @@ def from_source(
             .filter(f.col("pos").cast(t.IntegerType()).isNotNull())
             .select(
                 # From the full path, extracts just the filename, and converts to upper case to get the study ID.
-                f.lit(study_id).alias("studyId"),
+                f.upper(
+                    f.regexp_extract(
+                        f.input_file_name(), r"([^/]+)(\.tsv\.gz|\.gz|\.tsv)", 1
+                    )
+                ).alias("studyId"),
                 # Add variant information.
                 f.concat_ws(
                     "_",
@@ -85,6 +88,7 @@ def from_source(
             .repartitionByRange(30, "chromosome", "position")
             .sortWithinPartitions("chromosome", "position")
         )
+        processed_summary_stats_df.head(4)
 
         # Initializing summary statistics object:
         return SummaryStatistics(