Merge pull request #181 from sanogenetics/feature/parse-plink

Add PLINK test
apriha · Aug 21, 2024 · 813fe86 · 813fe86
2 parents 9af329c + 371fdc2
commit 813fe86
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 1 deletion.
diff --git a/README.rst b/README.rst
@@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
 - `LivingDNA <https://livingdna.com>`_
 - `Mapmygenome <https://mapmygenome.in>`_
 - `MyHeritage <https://www.myheritage.com>`_
+- `PLINK <https://www.cog-genomics.org/plink/>`_
 - `Sano Genetics <https://sanogenetics.com>`_
 - `tellmeGen <https://www.tellmegen.com>`_
 

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -150,6 +150,14 @@ def read(self):
             d = self.read_snps_csv(file, comments, compression)
         elif "rsid\tChromosome\tposition\tgenotype" == first_line.strip():
             d = self.read_tellmegen(file, compression)
+        elif (
+            "# This file was derived from the corresponding VCF" in comments
+            or re.match(
+                r"^\s*rsid\s+chromosome\s+position\s+allele_1\s+allele_2\s*$",
+                first_line,
+            )
+        ):
+            d = self.read_sano_dtc(file, compression)
         elif re.match("^#*[ \t]*rsid[, \t]*chr", first_line):
             d = self.read_generic(file, compression)
         elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line):
@@ -1135,6 +1143,45 @@ def parser():
 
         return self.read_helper("CircleDNA", parser)
 
+    def read_sano_dtc(self, file, compression):
+        """Read and parse Sano Genetics DTC file.
+
+        The file is read as whitespace-delimited text with one header row
+        followed by five columns: rsid, chromosome, position and two
+        allele columns. The two alleles are concatenated into a single
+        ``genotype`` column.
+
+        https://sanogenetics.com
+
+        Parameters
+        ----------
+        file : str
+            path to file
+        compression
+            forwarded unchanged to the ``compression`` argument of
+            ``pandas.read_csv`` (presumably a str or None; confirm against
+            the caller)
+
+        Returns
+        -------
+        dict
+            result of `read_helper`
+        """
+
+        def parser():
+            # header=0 together with names= discards the file's own header
+            # row and substitutes the column names used below; text after
+            # "#" is skipped as a comment and "-" is read as a missing value.
+            # TWO_ALLELE_DTYPES is defined elsewhere in the module.
+            df = pd.read_csv(
+                file,
+                comment="#",
+                header=0,
+                engine="c",
+                sep=r"\s+",
+                na_values="-",
+                names=["rsid", "chrom", "pos", "allele1", "allele2"],
+                index_col=0,
+                dtype=TWO_ALLELE_DTYPES,
+                compression=compression,
+            )
+
+            # create genotype column from allele columns and keep only relevant columns
+            # (a missing allele2 is replaced by "" so the genotype is allele1 alone;
+            # allele1 itself is not filled)
+            df = df.assign(genotype=df["allele1"] + df["allele2"].fillna(""))[
+                ["chrom", "pos", "genotype"]
+            ]
+
+            # one-element tuple, the same shape returned by the parser in read_plink
+            return (df,)
+
+        return self.read_helper("Sano", parser)
+
     def read_plink(self, file, compression):
         """Read and parse plink file.
 
@@ -1217,7 +1264,7 @@ def parser():
             df = df.set_index("rsid")
             return (df,)
 
-        return self.read_helper("plink", parser)
+        return self.read_helper("PLINK", parser)
 
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.

diff --git a/tests/input/plink.txt b/tests/input/plink.txt
@@ -0,0 +1,10 @@
+# Below is a text version of your data.
+rsid	chromosome	position	genotype
+rs1	1	101	AA
+rs2	1	102	CC
+rs3	1	103	GG
+rs4	1	104	TT
+rs5	1	105	--
+rs6	1	106	GC
+rs7	1	107	TC
+rs8	1	108	AT
diff --git a/tests/input/sano_dtc.txt b/tests/input/sano_dtc.txt
@@ -0,0 +1,11 @@
+# This file was derived from the corresponding VCF file to provide genetic
+# information in a more accessible format.
+rsid	chromosome	position	allele_1	allele_2
+rs1	1	101	A	A
+rs2	1	102	C	C
+rs3	1	103	G	G
+rs4	1	104	T	T
+rs5	1	105	-	-
+rs6	1	106	G	C
+rs7	1	107	T	C
+rs8	1	108	A	T
diff --git a/tests/input/sano_dtc_no_comments.txt b/tests/input/sano_dtc_no_comments.txt
@@ -0,0 +1,9 @@
+rsid	chromosome	position	allele_1	allele_2
+rs1	1	101	A	A
+rs2	1	102	C	C
+rs3	1	103	G	G
+rs4	1	104	T	T
+rs5	1	105	-	-
+rs6	1	106	G	C
+rs7	1	107	T	C
+rs8	1	108	A	T
diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py
@@ -154,6 +154,9 @@ def test_read_circledna(self):
         df.drop("rs5", inplace=True)  # only called genotypes
         self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)
 
+    def test_read_plink(self):
+        # https://www.cog-genomics.org/plink/
+        self.run_parsing_tests("tests/input/plink.txt", "PLINK")
+
     def test_read_ftdna(self):
         # https://www.familytreedna.com
         self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")
@@ -303,6 +306,11 @@ def test_read_sano(self):
             self.run_parsing_tests("tests/input/sano.txt", "Sano")
             self._teardown_gsa_test()
 
+    def test_read_sano_dtc(self):
+        # https://sanogenetics.com
+        # Two fixtures with the same rows: the first keeps the leading "#"
+        # comment lines, the second starts directly at the header row.
+        self.run_parsing_tests("tests/input/sano_dtc.txt", "Sano")
+        self.run_parsing_tests("tests/input/sano_dtc_no_comments.txt", "Sano")
+
     def test_read_vcf(self):
         self.run_parsing_tests_vcf("tests/input/testvcf.vcf")