Skip to content

Commit

Permalink
Merge pull request #181 from sanogenetics/feature/parse-plink
Browse files Browse the repository at this point in the history
Add PLINK test
  • Loading branch information
apriha authored Aug 21, 2024
2 parents 9af329c + 371fdc2 commit 813fe86
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
- `LivingDNA <https://livingdna.com>`_
- `Mapmygenome <https://mapmygenome.in>`_
- `MyHeritage <https://www.myheritage.com>`_
- `PLINK <https://www.cog-genomics.org/plink/>`_
- `Sano Genetics <https://sanogenetics.com>`_
- `tellmeGen <https://www.tellmegen.com>`_

Expand Down
49 changes: 48 additions & 1 deletion src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,14 @@ def read(self):
d = self.read_snps_csv(file, comments, compression)
elif "rsid\tChromosome\tposition\tgenotype" == first_line.strip():
d = self.read_tellmegen(file, compression)
elif (
"# This file was derived from the corresponding VCF" in comments
or re.match(
r"^\s*rsid\s+chromosome\s+position\s+allele_1\s+allele_2\s*$",
first_line,
)
):
d = self.read_sano_dtc(file, compression)
elif re.match("^#*[ \t]*rsid[, \t]*chr", first_line):
d = self.read_generic(file, compression)
elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line):
Expand Down Expand Up @@ -1135,6 +1143,45 @@ def parser():

return self.read_helper("CircleDNA", parser)

def read_sano_dtc(self, file, compression):
    """Read and parse a Sano Genetics DTC file.

    The file is whitespace-delimited with one row per SNP and the two
    alleles in separate columns; they are joined into a single
    ``genotype`` column here.

    https://sanogenetics.com

    Parameters
    ----------
    file : str
        path to file
    compression
        forwarded unchanged to ``pd.read_csv``

    Returns
    -------
    dict
        result of `read_helper`
    """

    def parser():
        # The file's own header row (header=0) is replaced by `names`;
        # "-" marks a missing allele and lines starting with "#" are skipped.
        table = pd.read_csv(
            file,
            sep=r"\s+",
            engine="c",
            comment="#",
            header=0,
            names=["rsid", "chrom", "pos", "allele1", "allele2"],
            index_col=0,
            na_values="-",
            dtype=TWO_ALLELE_DTYPES,
            compression=compression,
        )

        # A missing second allele contributes an empty string; a missing
        # first allele leaves the whole genotype missing.
        second_allele = table["allele2"].fillna("")
        table = table.assign(genotype=table["allele1"] + second_allele)

        # Keep only the columns relevant downstream.
        return (table[["chrom", "pos", "genotype"]],)

    return self.read_helper("Sano", parser)

def read_plink(self, file, compression):
"""Read and parse plink file.
Expand Down Expand Up @@ -1217,7 +1264,7 @@ def parser():
df = df.set_index("rsid")
return (df,)

return self.read_helper("plink", parser)
return self.read_helper("PLINK", parser)

def read_snps_csv(self, file, comments, compression):
"""Read and parse CSV file generated by ``snps``.
Expand Down
10 changes: 10 additions & 0 deletions tests/input/plink.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Below is a text version of your data.
rsid chromosome position genotype
rs1 1 101 AA
rs2 1 102 CC
rs3 1 103 GG
rs4 1 104 TT
rs5 1 105 --
rs6 1 106 GC
rs7 1 107 TC
rs8 1 108 AT
11 changes: 11 additions & 0 deletions tests/input/sano_dtc.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# This file was derived from the corresponding VCF file to provide genetic
# information in a more accessible format.
rsid chromosome position allele_1 allele_2
rs1 1 101 A A
rs2 1 102 C C
rs3 1 103 G G
rs4 1 104 T T
rs5 1 105 - -
rs6 1 106 G C
rs7 1 107 T C
rs8 1 108 A T
9 changes: 9 additions & 0 deletions tests/input/sano_dtc_no_comments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
rsid chromosome position allele_1 allele_2
rs1 1 101 A A
rs2 1 102 C C
rs3 1 103 G G
rs4 1 104 T T
rs5 1 105 - -
rs6 1 106 G C
rs7 1 107 T C
rs8 1 108 A T
8 changes: 8 additions & 0 deletions tests/io/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def test_read_circledna(self):
df.drop("rs5", inplace=True) # only called genotypes
self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)

def test_read_plink(self):
    # Fixture is a four-column rsid/chromosome/position/genotype text file.
    plink_fixture = "tests/input/plink.txt"
    self.run_parsing_tests(plink_fixture, "PLINK")

def test_read_ftdna(self):
# https://www.familytreedna.com
self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")
Expand Down Expand Up @@ -303,6 +306,11 @@ def test_read_sano(self):
self.run_parsing_tests("tests/input/sano.txt", "Sano")
self._teardown_gsa_test()

def test_read_sano_dtc(self):
    # https://sanogenetics.com
    # Same data with and without the leading "#" comment lines.
    for fixture in (
        "tests/input/sano_dtc.txt",
        "tests/input/sano_dtc_no_comments.txt",
    ):
        self.run_parsing_tests(fixture, "Sano")

def test_read_vcf(self):
self.run_parsing_tests_vcf("tests/input/testvcf.vcf")

Expand Down

0 comments on commit 813fe86

Please sign in to comment.