From 02fbc272a4c8a4f5d29e4aea0d05530d24d201f2 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Mon, 19 Aug 2024 21:27:38 -0700 Subject: [PATCH 1/3] Parse Sano DTC format --- src/snps/io/reader.py | 47 ++++++++++++++++++++++++++++ tests/input/sano_dtc.txt | 11 +++++++ tests/input/sano_dtc_no_comments.txt | 9 ++++++ tests/io/test_reader.py | 5 +++ 4 files changed, 72 insertions(+) create mode 100644 tests/input/sano_dtc.txt create mode 100644 tests/input/sano_dtc_no_comments.txt diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 10dc2ba..692fbce 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -150,6 +150,14 @@ def read(self): d = self.read_snps_csv(file, comments, compression) elif "rsid\tChromosome\tposition\tgenotype" == first_line.strip(): d = self.read_tellmegen(file, compression) + elif ( + "# This file was derived from the corresponding VCF" in comments + or re.match( + r"^\s*rsid\s+chromosome\s+position\s+allele_1\s+allele_2\s*$", + first_line, + ) + ): + d = self.read_sano_dtc(file, compression) elif re.match("^#*[ \t]*rsid[, \t]*chr", first_line): d = self.read_generic(file, compression) elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line): @@ -1133,6 +1141,45 @@ def parser(): return self.read_helper("CircleDNA", parser) + def read_sano_dtc(self, file, compression): + """Read and parse Sano Genetics DTC file. + + https://sanogenetics.com + + Parameters + ---------- + file : str + path to file + + Returns + ------- + dict + result of `read_helper` + """ + + def parser(): + df = pd.read_csv( + file, + comment="#", + header=0, + engine="c", + sep=r"\s+", + na_values="-", + names=["rsid", "chrom", "pos", "allele1", "allele2"], + index_col=0, + dtype=TWO_ALLELE_DTYPES, + compression=compression, + ) + + # create genotype column from allele columns and keep only relevant columns + df = df.assign(genotype=df["allele1"] + df["allele2"].fillna(""))[ + ["chrom", "pos", "genotype"] + ] + + return (df,) + + return self.read_helper("Sano", parser) + def read_snps_csv(self, file, comments, compression): """Read and parse CSV file generated by ``snps``. diff --git a/tests/input/sano_dtc.txt b/tests/input/sano_dtc.txt new file mode 100644 index 0000000..da6c6e3 --- /dev/null +++ b/tests/input/sano_dtc.txt @@ -0,0 +1,11 @@ +# This file was derived from the corresponding VCF file to provide genetic +# information in a more accessible format. +rsid chromosome position allele_1 allele_2 +rs1 1 101 A A +rs2 1 102 C C +rs3 1 103 G G +rs4 1 104 T T +rs5 1 105 - - +rs6 1 106 G C +rs7 1 107 T C +rs8 1 108 A T diff --git a/tests/input/sano_dtc_no_comments.txt b/tests/input/sano_dtc_no_comments.txt new file mode 100644 index 0000000..a20f2df --- /dev/null +++ b/tests/input/sano_dtc_no_comments.txt @@ -0,0 +1,9 @@ +rsid chromosome position allele_1 allele_2 +rs1 1 101 A A +rs2 1 102 C C +rs3 1 103 G G +rs4 1 104 T T +rs5 1 105 - - +rs6 1 106 G C +rs7 1 107 T C +rs8 1 108 A T diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py index 95e887e..a61ee4a 100644 --- a/tests/io/test_reader.py +++ b/tests/io/test_reader.py @@ -303,6 +303,11 @@ def test_read_sano(self): self.run_parsing_tests("tests/input/sano.txt", "Sano") self._teardown_gsa_test() + def test_read_sano_dtc(self): + # https://sanogenetics.com + self.run_parsing_tests("tests/input/sano_dtc.txt", "Sano") + self.run_parsing_tests("tests/input/sano_dtc_no_comments.txt", "Sano") + def test_read_vcf(self): self.run_parsing_tests_vcf("tests/input/testvcf.vcf") From ad391f49c5794bfdfc09d38e3017fa79747ae70b Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Tue, 20 Aug 2024 20:59:49 -0700 Subject: [PATCH 2/3] Add PLINK test --- src/snps/io/reader.py | 2 +- tests/input/plink.txt | 10 ++++++++++ tests/io/test_reader.py | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 tests/input/plink.txt diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index a00d54a..83059bf 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -1217,7 +1217,7 @@ def parser(): df = df.set_index("rsid") return (df,) - return self.read_helper("plink", parser) + return self.read_helper("PLINK", parser) def read_snps_csv(self, file, comments, compression): """Read and parse CSV file generated by ``snps``. diff --git a/tests/input/plink.txt b/tests/input/plink.txt new file mode 100644 index 0000000..a8a797d --- /dev/null +++ b/tests/input/plink.txt @@ -0,0 +1,10 @@ +# Below is a text version of your data. +rsid chromosome position genotype +rs1 1 101 AA +rs2 1 102 CC +rs3 1 103 GG +rs4 1 104 TT +rs5 1 105 -- +rs6 1 106 GC +rs7 1 107 TC +rs8 1 108 AT diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py index 95e887e..c2c3473 100644 --- a/tests/io/test_reader.py +++ b/tests/io/test_reader.py @@ -154,6 +154,9 @@ def test_read_circledna(self): df.drop("rs5", inplace=True) # only called genotypes self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df) + def test_read_plink(self): + self.run_parsing_tests("tests/input/plink.txt", "PLINK") + def test_read_ftdna(self): # https://www.familytreedna.com self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA") From 23ca2e4f7e4c06548fa3fab13fafef7245a4baa7 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Tue, 20 Aug 2024 21:14:47 -0700 Subject: [PATCH 3/3] Update README --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 7fb8bec..024359b 100644 --- a/README.rst +++ b/README.rst @@ -58,6 +58,7 @@ genotype files from the following DNA testing sources: - `LivingDNA `_ - `Mapmygenome `_ - `MyHeritage `_ +- `PLINK `_ - `Sano Genetics `_ - `tellmeGen `_