From 02fbc272a4c8a4f5d29e4aea0d05530d24d201f2 Mon Sep 17 00:00:00 2001
From: Andrew Riha <andrew@sanogenetics.com>
Date: Mon, 19 Aug 2024 21:27:38 -0700
Subject: [PATCH 1/3] Parse Sano DTC format

---
 src/snps/io/reader.py                | 47 ++++++++++++++++++++++++++++
 tests/input/sano_dtc.txt             | 11 +++++++
 tests/input/sano_dtc_no_comments.txt |  9 ++++++
 tests/io/test_reader.py              |  5 +++
 4 files changed, 72 insertions(+)
 create mode 100644 tests/input/sano_dtc.txt
 create mode 100644 tests/input/sano_dtc_no_comments.txt

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
index 10dc2ba..692fbce 100644
--- a/src/snps/io/reader.py
+++ b/src/snps/io/reader.py
@@ -150,6 +150,14 @@ def read(self):
             d = self.read_snps_csv(file, comments, compression)
         elif "rsid\tChromosome\tposition\tgenotype" == first_line.strip():
             d = self.read_tellmegen(file, compression)
+        elif (
+            "# This file was derived from the corresponding VCF" in comments
+            or re.match(
+                r"^\s*rsid\s+chromosome\s+position\s+allele_1\s+allele_2\s*$",
+                first_line,
+            )
+        ):
+            d = self.read_sano_dtc(file, compression)
         elif re.match("^#*[ \t]*rsid[, \t]*chr", first_line):
             d = self.read_generic(file, compression)
         elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line):
@@ -1133,6 +1141,45 @@ def parser():
 
         return self.read_helper("CircleDNA", parser)
 
+    def read_sano_dtc(self, file, compression):
+        """Read and parse Sano Genetics DTC file.
+
+        https://sanogenetics.com
+
+        Parameters
+        ----------
+        file : str
+            path to file
+
+        Returns
+        -------
+        dict
+            result of `read_helper`
+        """
+
+        def parser():
+            df = pd.read_csv(
+                file,
+                comment="#",  # ignore "#" comment lines
+                header=0,  # existing header row is replaced by `names`
+                engine="c",
+                sep=r"\s+",  # columns are separated by runs of whitespace
+                na_values="-",  # "-" alleles are read as missing values
+                names=["rsid", "chrom", "pos", "allele1", "allele2"],
+                index_col=0,  # index rows by rsid
+                dtype=TWO_ALLELE_DTYPES,
+                compression=compression,  # forwarded unchanged from the caller
+            )
+
+            # build genotype from alleles (NaN allele2 -> ""); keep only needed columns
+            df = df.assign(genotype=df["allele1"] + df["allele2"].fillna(""))[
+                ["chrom", "pos", "genotype"]
+            ]
+
+            return (df,)  # one-element tuple holding the DataFrame
+
+        return self.read_helper("Sano", parser)
+
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.
 
diff --git a/tests/input/sano_dtc.txt b/tests/input/sano_dtc.txt
new file mode 100644
index 0000000..da6c6e3
--- /dev/null
+++ b/tests/input/sano_dtc.txt
@@ -0,0 +1,11 @@
+# This file was derived from the corresponding VCF file to provide genetic
+# information in a more accessible format.
+rsid	chromosome	position	allele_1	allele_2
+rs1	1	101	A	A
+rs2	1	102	C	C
+rs3	1	103	G	G
+rs4	1	104	T	T
+rs5	1	105	-	-
+rs6	1	106	G	C
+rs7	1	107	T	C
+rs8	1	108	A	T
diff --git a/tests/input/sano_dtc_no_comments.txt b/tests/input/sano_dtc_no_comments.txt
new file mode 100644
index 0000000..a20f2df
--- /dev/null
+++ b/tests/input/sano_dtc_no_comments.txt
@@ -0,0 +1,9 @@
+rsid	chromosome	position	allele_1	allele_2
+rs1	1	101	A	A
+rs2	1	102	C	C
+rs3	1	103	G	G
+rs4	1	104	T	T
+rs5	1	105	-	-
+rs6	1	106	G	C
+rs7	1	107	T	C
+rs8	1	108	A	T
diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py
index 95e887e..a61ee4a 100644
--- a/tests/io/test_reader.py
+++ b/tests/io/test_reader.py
@@ -303,6 +303,11 @@ def test_read_sano(self):
             self.run_parsing_tests("tests/input/sano.txt", "Sano")
             self._teardown_gsa_test()
 
+    def test_read_sano_dtc(self):
+        # https://sanogenetics.com; DTC file with and without "#" header comments
+        self.run_parsing_tests("tests/input/sano_dtc.txt", "Sano")
+        self.run_parsing_tests("tests/input/sano_dtc_no_comments.txt", "Sano")
+
     def test_read_vcf(self):
         self.run_parsing_tests_vcf("tests/input/testvcf.vcf")
 

From ad391f49c5794bfdfc09d38e3017fa79747ae70b Mon Sep 17 00:00:00 2001
From: Andrew Riha <andrew@sanogenetics.com>
Date: Tue, 20 Aug 2024 20:59:49 -0700
Subject: [PATCH 2/3] Add PLINK test

---
 src/snps/io/reader.py   |  2 +-
 tests/input/plink.txt   | 10 ++++++++++
 tests/io/test_reader.py |  3 +++
 3 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 tests/input/plink.txt

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
index a00d54a..83059bf 100644
--- a/src/snps/io/reader.py
+++ b/src/snps/io/reader.py
@@ -1217,7 +1217,7 @@ def parser():
             df = df.set_index("rsid")
             return (df,)
 
-        return self.read_helper("plink", parser)
+        return self.read_helper("PLINK", parser)
 
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.
diff --git a/tests/input/plink.txt b/tests/input/plink.txt
new file mode 100644
index 0000000..a8a797d
--- /dev/null
+++ b/tests/input/plink.txt
@@ -0,0 +1,10 @@
+# Below is a text version of your data.
+rsid	chromosome	position	genotype
+rs1	1	101	AA
+rs2	1	102	CC
+rs3	1	103	GG
+rs4	1	104	TT
+rs5	1	105	--
+rs6	1	106	GC
+rs7	1	107	TC
+rs8	1	108	AT
diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py
index 95e887e..c2c3473 100644
--- a/tests/io/test_reader.py
+++ b/tests/io/test_reader.py
@@ -154,6 +154,9 @@ def test_read_circledna(self):
         df.drop("rs5", inplace=True)  # only called genotypes
         self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)
 
+    def test_read_plink(self):  # https://www.cog-genomics.org/plink/
+        self.run_parsing_tests("tests/input/plink.txt", "PLINK")
+
     def test_read_ftdna(self):
         # https://www.familytreedna.com
         self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")

From 23ca2e4f7e4c06548fa3fab13fafef7245a4baa7 Mon Sep 17 00:00:00 2001
From: Andrew Riha <andrew@sanogenetics.com>
Date: Tue, 20 Aug 2024 21:14:47 -0700
Subject: [PATCH 3/3] Update README

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 7fb8bec..024359b 100644
--- a/README.rst
+++ b/README.rst
@@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
 - `LivingDNA <https://livingdna.com>`_
 - `Mapmygenome <https://mapmygenome.in>`_
 - `MyHeritage <https://www.myheritage.com>`_
+- `PLINK <https://www.cog-genomics.org/plink/>`_
 - `Sano Genetics <https://sanogenetics.com>`_
 - `tellmeGen <https://www.tellmegen.com>`_