From c15a3a3d04f99eb793680dd25c5d88c2d10de2ff Mon Sep 17 00:00:00 2001 From: louwenjjr Date: Fri, 15 Mar 2024 18:30:00 +0100 Subject: [PATCH] fix(sumstats): correct study id for dir of finngen studies --- src/gentropy/datasource/finngen/summary_stats.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gentropy/datasource/finngen/summary_stats.py b/src/gentropy/datasource/finngen/summary_stats.py index 0d77f7d5c..651c9a2d3 100644 --- a/src/gentropy/datasource/finngen/summary_stats.py +++ b/src/gentropy/datasource/finngen/summary_stats.py @@ -50,7 +50,6 @@ def from_source( Returns: SummaryStatistics: Processed summary statistics dataset """ - study_id = raw_file.split("/")[-1].split(".")[0].upper() processed_summary_stats_df = ( spark.read.schema(cls.raw_schema) .option("delimiter", "\t") @@ -59,7 +58,11 @@ def from_source( .filter(f.col("pos").cast(t.IntegerType()).isNotNull()) .select( # From the full path, extracts just the filename, and converts to upper case to get the study ID. - f.lit(study_id).alias("studyId"), + f.upper( + f.regexp_extract( + f.input_file_name(), r"([^/]+)(\.tsv\.gz|\.gz|\.tsv)", 1 + ) + ).alias("studyId"), # Add variant information. f.concat_ws( "_", @@ -85,6 +88,7 @@ def from_source( .repartitionByRange(30, "chromosome", "position") .sortWithinPartitions("chromosome", "position") ) + processed_summary_stats_df.head(4) # Initializing summary statistics object: return SummaryStatistics(