fix: update cohort parser for the new GWAS Catalog format

opentargets · Feb 28, 2024 · 655a5f3 · 655a5f3
1 parent ba956c7
commit 655a5f3
Showing 1 changed file with 2 additions and 12 deletions.
diff --git a/src/gentropy/datasource/gwas_catalog/study_index.py b/src/gentropy/datasource/gwas_catalog/study_index.py
@@ -305,6 +305,7 @@ def _parse_study_table(
                 parse_efos(f.col("MAPPED BACKGROUND TRAIT URI")).alias(
                     "backgroundTraitFromSourceMappedIds"
                 ),
+                cls.parse_cohorts(f.col("COHORTS")).alias("cohorts"),
             ),
             _schema=StudyIndexGWASCatalog.get_schema(),
         )
@@ -548,14 +549,6 @@ def annotate_ancestries(
             )  # studyId has not been split yet
         )
 
-        # Parsing cohort information:
-        # cohorts = ancestry_lut.select(
-        #     f.col("STUDY ACCESSION").alias("studyId"),
-        #     GWASCatalogStudyIndexParser.parse_cohorts(f.col("COHORT(S)")).alias(
-        #         "cohorts"
-        #     ),
-        # ).distinct()
-
         # Get a high resolution dataset on experimental stage:
         ancestry_stages = (
             ancestry.groupBy("studyId")
@@ -644,10 +637,7 @@ def annotate_ancestries(
         ).select(
             "studyId", "discoverySamples", "ldPopulationStructure", "replicationSamples"
         )
-        self.df = (
-            self.df.join(parsed_ancestry_lut, on="studyId", how="left")
-            # .join(cohorts, on="studyId", how="left")
-        )
+        self.df = self.df.join(parsed_ancestry_lut, on="studyId", how="left")
         return self
 
     def annotate_sumstats_info(