From dd4f54c369c0d1073da1d41668ae75f647b174e4 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 2 Apr 2020 12:05:59 -0400 Subject: [PATCH 01/17] Make sure all files are bundled correctly --- MANIFEST.in | 3 ++- setup.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 330aebe2..fce7694f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include versioneer.py include lusSTR/_version.py -lusSTR/str_markers.json +include lusSTR/str_markers.json +include lusSTR/tests/data/* diff --git a/setup.py b/setup.py index 3a4de6f6..2792df0d 100755 --- a/setup.py +++ b/setup.py @@ -10,12 +10,13 @@ from setuptools import setup import versioneer +desc = 'Tool for converting NGS sequence data of forensic STR loci to various annotation styles' setup( name='lusSTR', version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), - description='Convert ForenSeq sequence strings to a compact representation', - packages=['lusSTR'], + description=desc, + packages=['lusSTR', 'lusSTR.tests'], package_data={ 'lusSTR': ['lusSTR/str_markers.json', 'lusSTR/tests/data/*'] }, From 40b4f8b3b2f8ccc7ca3f14d6496fe1e80133414e Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Thu, 23 Apr 2020 10:11:03 -0400 Subject: [PATCH 02/17] fixed style issues --- lusSTR/annot.py | 48 ++++++++++++------ lusSTR/cli.py | 9 ++++ lusSTR/str_markers.json | 108 ++++++++++++++++++++++++++++++---------- 3 files changed, 123 insertions(+), 42 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index 8abcc452..2a0def40 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -591,6 +591,16 @@ def PentaD_annotation(sequence, no_of_repeat_bases, repeat_list): return re.sub(" ", " ", final_string) +def full_foren(full_seq, front, back): + if front == 0: + seq_uas = full_seq[:-back] + elif back == 0: + seq_uas = full_seq[front:] + else: + seq_uas = full_seq[front:-back] + return seq_uas + + def main(args): cannot_split = [ "D19S433", "D6S1043", "TH01", "D21S11", "D1S1656", "D7S820", "D5S818", "D12S391", @@ -619,13 +629,20 @@ def main(args): lus = str_dict[locus]['LUS'] sec = str_dict[locus]['Sec'] tert = str_dict[locus]['Tert'] + foren_5 = str_dict[locus]['Foren_5'] + foren_3 = str_dict[locus]['Foren_3'] + if args.uas: + uas_sequence = sequence + else: + if args.kit == "forenseq": + uas_sequence = full_foren(sequence, foren_5, foren_3) str_allele = traditional_str_allele(sequence, no_of_repeat_bases, no_of_sub_bases) if ( locus in cannot_split or - ((len(sequence) % no_of_repeat_bases != 0) and locus not in must_split) + ((len(uas_sequence) % no_of_repeat_bases != 0) and locus not in must_split) ): if str_dict[locus]['ReverseCompNeeded'] == "Yes": - reverse_comp_sequence = rev_complement_anno(sequence) + reverse_comp_sequence = rev_complement_anno(uas_sequence) forward_strand_bracketed_form = rev_comp_forward_strand_bracket( reverse_comp_sequence, no_of_repeat_bases, repeats, locus, cannot_split ) @@ -634,17 +651,17 @@ def main(args): ) elif locus == "D21S11": forward_strand_bracketed_form = D21_bracket( - sequence, no_of_split_bases, repeats + uas_sequence, no_of_split_bases, repeats ) - elif locus == "TH01" and (len(sequence) % no_of_repeat_bases != 0): - forward_strand_bracketed_form = TH01_annotation(sequence, repeats) + elif locus == "TH01" and (len(uas_sequence) % no_of_repeat_bases != 0): + forward_strand_bracketed_form = TH01_annotation(uas_sequence, repeats) elif locus == "PentaD": forward_strand_bracketed_form = PentaD_annotation( - sequence, no_of_repeat_bases, repeats + uas_sequence, no_of_repeat_bases, repeats ) else: forward_strand_bracketed_form = split_string( - sequence, no_of_repeat_bases, repeats + uas_sequence, no_of_repeat_bases, repeats ) lus_final, sec_final, tert_final = lus_anno( forward_strand_bracketed_form, lus, sec, tert, locus, str_allele @@ -653,14 +670,14 @@ def main(args): if locus == "D18S51": if type(str_allele) == str: forward_strand_bracketed_form = split_string( - sequence, no_of_repeat_bases, repeats + uas_sequence, no_of_repeat_bases, repeats ) else: forward_strand_bracketed_form = loci_need_split_anno( - sequence, no_of_repeat_bases + uas_sequence, no_of_repeat_bases ) elif str_dict[locus]['ReverseCompNeeded'] == "Yes": - reverse_comp_sequence = rev_complement_anno(sequence) + reverse_comp_sequence = rev_complement_anno(uas_sequence) forward_strand_bracketed_form = rev_comp_forward_strand_bracket( reverse_comp_sequence, no_of_repeat_bases, repeats, locus, cannot_split ) @@ -669,10 +686,11 @@ def main(args): ) elif locus == "PentaD": forward_strand_bracketed_form = PentaD_annotation( - sequence, no_of_repeat_bases, repeats + uas_sequence, no_of_repeat_bases, repeats ) else: - forward_strand_bracketed_form = loci_need_split_anno(sequence, no_of_repeat_bases) + forward_strand_bracketed_form = + loci_need_split_anno(uas_sequence, no_of_repeat_bases) lus_final, sec_final, tert_final = lus_anno( forward_strand_bracketed_form, lus, sec, tert, locus, str_allele ) @@ -692,9 +710,9 @@ def main(args): lus_plus = f"{str_allele}_{lus_final}_{sec_final}_{tert_final}" if str_dict[locus]['ReverseCompNeeded'] == "Yes": summary = [ - sampleid, project, analysis, locus, sequence, reverse_comp_sequence, str_allele, - forward_strand_bracketed_form, reverse_strand_bracketed_form, lus_final_output, - lus_plus, reads + sampleid, project, analysis, locus, uas_sequence, reverse_comp_sequence, + str_allele, forward_strand_bracketed_form, reverse_strand_bracketed_form, + lus_final_output, lus_plus, reads ] summary = '\t'.join(str(i) for i in summary) else: diff --git a/lusSTR/cli.py b/lusSTR/cli.py index 02e60122..422cbaa4 100644 --- a/lusSTR/cli.py +++ b/lusSTR/cli.py @@ -33,6 +33,15 @@ def annot_subparser(subparsers): 'input', help='sample(s) in CSV format; first four columns must be Locus, NumReads, ' 'Sequence, SampleID; Optional last two columns can be Project and Analysis.' ) + cli.add_argument( + '--kit', choices=['forenseq', 'powerseq'], default='forenseq', + help='Kit used to develop sequences; only forenseq or powerseq accepted;' + 'default = forenseq' + ) + cli.add_argument( + '--uas', action='store_true', + help='Use if sequences have been run through the ForenSeq UAS.' + ) mains = { diff --git a/lusSTR/str_markers.json b/lusSTR/str_markers.json index 85a40968..4c2b8e97 100644 --- a/lusSTR/str_markers.json +++ b/lusSTR/str_markers.json @@ -9,7 +9,9 @@ "ReverseCompNeeded": "Yes", "LUS": "ATCT", "Sec": "ACCT", - "Tert": "" + "Tert": "", + "Foren_5": 6, + "Foren_3": 14 }, "D10S1248": { "BasesToSubtract": 0, @@ -21,7 +23,9 @@ "ReverseCompNeeded": "No", "LUS": "GGAA", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 18, + "Foren_3": 23 }, "D12S391": { "BasesToSubtract": 0, @@ -34,7 +38,9 @@ "ReverseCompNeeded": "No", "LUS": "AGAT", "Sec": "AGAC", - "Tert": "AGGT" + "Tert": "AGGT", + "Foren_5": 39, + "Foren_3": 73 }, "D13S317": { "BasesToSubtract": 31, @@ -50,7 +56,9 @@ "ReverseCompNeeded": "No", "LUS": "TATC", "Sec": "ATCT", - "Tert": "GTCT" + "Tert": "GTCT", + "Foren_5": 39, + "Foren_3": 8 }, "D16S539": { "BasesToSubtract": 0, @@ -62,7 +70,9 @@ "ReverseCompNeeded": "No", "LUS": "GATA", "Sec": "GACA", - "Tert": "" + "Tert": "", + "Foren_5": 38, + "Foren_3": 36 }, "D17S1301": { "BasesToSubtract": 0, @@ -74,7 +84,9 @@ "ReverseCompNeeded": "No", "LUS": "AGAT", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 9, + "Foren_3": 16 }, "D18S51": { "BasesToSubtract": 10, @@ -88,7 +100,9 @@ "ReverseCompNeeded": "No", "LUS": "AGAA", "Sec": "AAAG", - "Tert": "" + "Tert": "", + "Foren_5": 5, + "Foren_3": 48 }, "D19S433": { "BasesToSubtract": 26, @@ -102,7 +116,9 @@ "ReverseCompNeeded": "Yes", "LUS": "CCTT", "Sec": "CTTT", - "Tert": "CCTG" + "Tert": "CCTG", + "Foren_5": 12, + "Foren_3": 54 }, "D1S1656": { "BasesToSubtract": 10, @@ -118,7 +134,9 @@ "ReverseCompNeeded": "Yes", "LUS": "TCTA", "Sec": "CCTA", - "Tert": "TCTG" + "Tert": "TCTG", + "Foren_5": 51, + "Foren_3": 0 }, "D20S482": { "BasesToSubtract": 0, @@ -130,7 +148,9 @@ "ReverseCompNeeded": "No", "LUS": "AGAT", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 18, + "Foren_3": 24 }, "D21S11": { "BasesToSubtract": 11, @@ -143,7 +163,9 @@ "ReverseCompNeeded": "No", "LUS": "TCTA", "Sec": "TCTA", - "Tert": "TCTG" + "Tert": "TCTG", + "Foren_5": 34, + "Foren_3": 12 }, "D22S1045": { "BasesToSubtract": 0, @@ -155,7 +177,9 @@ "ReverseCompNeeded": "No", "LUS": "ATT", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 106, + "Foren_3": 20 }, "D2S1338": { "BasesToSubtract": 0, @@ -168,7 +192,9 @@ "ReverseCompNeeded": "Yes", "LUS": "GGAA", "Sec": "GGAC", - "Tert": "GGCA" + "Tert": "GGCA", + "Foren_5": 3, + "Foren_3": 14 }, "D2S441": { "BasesToSubtract": 0, @@ -181,7 +207,9 @@ "ReverseCompNeeded": "No", "LUS": "TCTA", "Sec": "TGTA", - "Tert": "" + "Tert": "", + "Foren_5": 29, + "Foren_3": 23 }, "D3S1358": { "BasesToSubtract": 0, @@ -194,7 +222,9 @@ "ReverseCompNeeded": "No", "LUS": "TCTA", "Sec": "TCTG", - "Tert": "" + "Tert": "", + "Foren_5": 48, + "Foren_3": 18 }, "D4S2408": { "BasesToSubtract": 0, @@ -206,7 +236,9 @@ "ReverseCompNeeded": "No", "LUS": "ATCT", "Sec": "CTCT", - "Tert": "" + "Tert": "", + "Foren_5": 6, + "Foren_3": 8 }, "D5S818": { "BasesToSubtract": 4, @@ -218,7 +250,9 @@ "ReverseCompNeeded": "Yes", "LUS": "ATCT", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 9, + "Foren_3": 7 }, "D6S1043": { "BasesToSubtract": 0, @@ -231,7 +265,9 @@ "ReverseCompNeeded": "Yes", "LUS": "ATCT", "Sec": "ATGT", - "Tert": "" + "Tert": "", + "Foren_5": 65, + "Foren_3": 20 }, "D7S820": { "BasesToSubtract": 16, @@ -244,7 +280,9 @@ "ReverseCompNeeded": "Yes", "LUS": "TATC", "Sec": "TGTC", - "Tert": "" + "Tert": "", + "Foren_5": 19, + "Foren_3": 20 }, "D8S1179": { "BasesToSubtract": 0, @@ -257,7 +295,9 @@ "ReverseCompNeeded": "No", "LUS": "TCTA", "Sec": "TCTG", - "Tert": "TGTA" + "Tert": "TGTA", + "Foren_5": 0, + "Foren_3": 5 }, "D9S1122": { "BasesToSubtract": 0, @@ -269,7 +309,9 @@ "ReverseCompNeeded": "No", "LUS": "TAGA", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 17, + "Foren_3": 7 }, "FGA": { "BasesToSubtract": 0, @@ -285,7 +327,9 @@ "ReverseCompNeeded": "Yes", "LUS": "AAAG", "Sec": "GAAA", - "Tert": "GAAG" + "Tert": "GAAG", + "Foren_5": 23, + "Foren_3": 0 }, "PentaD": { "BasesToSubtract": 5, @@ -297,7 +341,9 @@ "ReverseCompNeeded": "No", "LUS": "AAAGA", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 100, + "Foren_3": 9 }, "PentaE": { "BasesToSubtract": 0, @@ -309,7 +355,9 @@ "ReverseCompNeeded": "Yes", "LUS": "TCTTT", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 19, + "Foren_3": 75 }, "TH01": { "BasesToSubtract": 0, @@ -321,7 +369,9 @@ "ReverseCompNeeded": "No", "LUS": "AATG", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 32, + "Foren_3": 12 }, "TPOX": { "BasesToSubtract": 0, @@ -333,7 +383,9 @@ "ReverseCompNeeded": "No", "LUS": "AATG", "Sec": "", - "Tert": "" + "Tert": "", + "Foren_5": 2, + "Foren_3": 5 }, "vWA": { "BasesToSubtract": 8, @@ -347,6 +399,8 @@ "ReverseCompNeeded": "Yes", "LUS": "TAGA", "Sec": "CAGA", - "Tert": "TGGA" + "Tert": "TGGA", + "Foren_5": 19, + "Foren_3": 5 } } From fc444f7f2cbc3fc6af3513d34524d87c799a2985 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Thu, 23 Apr 2020 14:35:00 -0400 Subject: [PATCH 03/17] Added full length sequence capability --- lusSTR/annot.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index 2a0def40..a8f3c570 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -116,9 +116,9 @@ def rev_complement_anno(sequence): ''' Function creates reverse complement of sequence - Sequences in which the UAS software output contains the sequence on the reverse strand require - translation of the sequence to the forward strand. This allows for consistency between both - loci and any outside analyses in which comparisons may be made. + Sequences in which the UAS software output contains the sequence on the reverse strand + require translation of the sequence to the forward strand. This allows for consistency + between both loci and any outside analyses in which comparisons may be made. ''' complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} bases = list(sequence) @@ -132,9 +132,9 @@ def rev_comp_forward_strand_bracket(rev_sequence, n, repeat_list, locusid, canno ''' Function creates bracketed annotation for reverse complement sequences - Function is used to create the bracketed annotation for reverse complement sequences (i.e. the - forward strand). It calls additional functions depending on the locus and/or if the sequence - is a microvariant or not. + Function is used to create the bracketed annotation for reverse complement sequences (i.e + the forward strand). It calls additional functions depending on the locus and/or if the + sequence is a microvariant or not. ''' if locusid in cannot_split_list: if locusid == "D19S433": @@ -592,6 +592,13 @@ def PentaD_annotation(sequence, no_of_repeat_bases, repeat_list): def full_foren(full_seq, front, back): + ''' + Function to trim full sequences to the UAS region. + + It identifies the number of bases to remove from the 5' and 3' ends of the sequence to + trim to the UAS region. The downstream annotation, including length-based allele + designations, LUS, LUS+ and bracketed annotation is based on this region in the sequence. + ''' if front == 0: seq_uas = full_seq[:-back] elif back == 0: @@ -635,17 +642,23 @@ def main(args): uas_sequence = sequence else: if args.kit == "forenseq": - uas_sequence = full_foren(sequence, foren_5, foren_3) - str_allele = traditional_str_allele(sequence, no_of_repeat_bases, no_of_sub_bases) + if str_dict[locus]['ReverseCompNeeded'] == "No": + uas_sequence = full_foren(sequence, foren_5, foren_3) + else: + uas_from_full = full_foren(sequence, foren_5, foren_3) + uas_sequence = rev_complement_anno(uas_from_full) + str_allele = traditional_str_allele(uas_sequence, no_of_repeat_bases, no_of_sub_bases) if ( locus in cannot_split or ((len(uas_sequence) % no_of_repeat_bases != 0) and locus not in must_split) ): if str_dict[locus]['ReverseCompNeeded'] == "Yes": reverse_comp_sequence = rev_complement_anno(uas_sequence) + print(reverse_comp_sequence) forward_strand_bracketed_form = rev_comp_forward_strand_bracket( reverse_comp_sequence, no_of_repeat_bases, repeats, locus, cannot_split ) + print(forward_strand_bracketed_form) reverse_strand_bracketed_form = rev_comp_uas_output_bracket( forward_strand_bracketed_form, no_of_repeat_bases ) @@ -689,8 +702,9 @@ def main(args): uas_sequence, no_of_repeat_bases, repeats ) else: - forward_strand_bracketed_form = - loci_need_split_anno(uas_sequence, no_of_repeat_bases) + forward_strand_bracketed_form = loci_need_split_anno( + uas_sequence, no_of_repeat_bases + ) lus_final, sec_final, tert_final = lus_anno( forward_strand_bracketed_form, lus, sec, tert, locus, str_allele ) @@ -717,7 +731,7 @@ def main(args): summary = '\t'.join(str(i) for i in summary) else: summary = [ - sampleid, project, analysis, locus, sequence, sequence, str_allele, + sampleid, project, analysis, locus, uas_sequence, uas_sequence, str_allele, forward_strand_bracketed_form, forward_strand_bracketed_form, lus_final_output, lus_plus, reads ] From 4040bd2aef6ff0fb7d3f5cac64755018e1c22192 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Thu, 23 Apr 2020 16:52:08 -0400 Subject: [PATCH 04/17] Added PowerSeq cut points --- lusSTR/str_markers.json | 109 ++++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/lusSTR/str_markers.json b/lusSTR/str_markers.json index 4c2b8e97..f7cfe976 100644 --- a/lusSTR/str_markers.json +++ b/lusSTR/str_markers.json @@ -11,7 +11,9 @@ "Sec": "ACCT", "Tert": "", "Foren_5": 6, - "Foren_3": 14 + "Foren_3": 14, + "Power_5": 13, + "Power_3": 107 }, "D10S1248": { "BasesToSubtract": 0, @@ -25,7 +27,9 @@ "Sec": "", "Tert": "", "Foren_5": 18, - "Foren_3": 23 + "Foren_3": 23, + "Power_5": 57, + "Power_3": 1 }, "D12S391": { "BasesToSubtract": 0, @@ -40,7 +44,10 @@ "Sec": "AGAC", "Tert": "AGGT", "Foren_5": 39, - "Foren_3": 73 + "Foren_3": 73, + "Power_5": 21, + "Power_3": 11 + }, "D13S317": { "BasesToSubtract": 31, @@ -58,7 +65,9 @@ "Sec": "ATCT", "Tert": "GTCT", "Foren_5": 39, - "Foren_3": 8 + "Foren_3": 8, + "Power_5": 70, + "Power_3": 43 }, "D16S539": { "BasesToSubtract": 0, @@ -72,7 +81,9 @@ "Sec": "GACA", "Tert": "", "Foren_5": 38, - "Foren_3": 36 + "Foren_3": 36, + "Power_5": 133, + "Power_3": 4 }, "D17S1301": { "BasesToSubtract": 0, @@ -86,7 +97,9 @@ "Sec": "", "Tert": "", "Foren_5": 9, - "Foren_3": 16 + "Foren_3": 16, + "Power_5": "", + "Power_3": "" }, "D18S51": { "BasesToSubtract": 10, @@ -102,7 +115,9 @@ "Sec": "AAAG", "Tert": "", "Foren_5": 5, - "Foren_3": 48 + "Foren_3": 48, + "Power_5": 64, + "Power_3": 43 }, "D19S433": { "BasesToSubtract": 26, @@ -118,7 +133,9 @@ "Sec": "CTTT", "Tert": "CCTG", "Foren_5": 12, - "Foren_3": 54 + "Foren_3": 54, + "Power_5": 61, + "Power_3": 39 }, "D1S1656": { "BasesToSubtract": 10, @@ -136,7 +153,9 @@ "Sec": "CCTA", "Tert": "TCTG", "Foren_5": 51, - "Foren_3": 0 + "Foren_3": 0, + "Power_5": 46, + "Power_3": 21 }, "D20S482": { "BasesToSubtract": 0, @@ -150,7 +169,9 @@ "Sec": "", "Tert": "", "Foren_5": 18, - "Foren_3": 24 + "Foren_3": 24, + "Power_5": "", + "Power_3": "" }, "D21S11": { "BasesToSubtract": 11, @@ -165,7 +186,9 @@ "Sec": "TCTA", "Tert": "TCTG", "Foren_5": 34, - "Foren_3": 12 + "Foren_3": 12, + "Power_5": 10, + "Power_3": 41 }, "D22S1045": { "BasesToSubtract": 0, @@ -179,7 +202,9 @@ "Sec": "", "Tert": "", "Foren_5": 106, - "Foren_3": 20 + "Foren_3": 20, + "Power_5": 52, + "Power_3": 11 }, "D2S1338": { "BasesToSubtract": 0, @@ -194,7 +219,9 @@ "Sec": "GGAC", "Tert": "GGCA", "Foren_5": 3, - "Foren_3": 14 + "Foren_3": 14, + "Power_5": 66, + "Power_3": 46 }, "D2S441": { "BasesToSubtract": 0, @@ -209,7 +236,9 @@ "Sec": "TGTA", "Tert": "", "Foren_5": 29, - "Foren_3": 23 + "Foren_3": 23, + "Power_5": 83, + "Power_3": 4 }, "D3S1358": { "BasesToSubtract": 0, @@ -224,7 +253,9 @@ "Sec": "TCTG", "Tert": "", "Foren_5": 48, - "Foren_3": 18 + "Foren_3": 18, + "Power_5": 97, + "Power_3": 18 }, "D4S2408": { "BasesToSubtract": 0, @@ -238,7 +269,9 @@ "Sec": "CTCT", "Tert": "", "Foren_5": 6, - "Foren_3": 8 + "Foren_3": 8, + "Power_5": "", + "Power_3": "" }, "D5S818": { "BasesToSubtract": 4, @@ -252,7 +285,9 @@ "Sec": "", "Tert": "", "Foren_5": 9, - "Foren_3": 7 + "Foren_3": 7, + "Power_5": 36, + "Power_3": 82 }, "D6S1043": { "BasesToSubtract": 0, @@ -267,7 +302,9 @@ "Sec": "ATGT", "Tert": "", "Foren_5": 65, - "Foren_3": 20 + "Foren_3": 20, + "Power_5": "", + "Power_3": "" }, "D7S820": { "BasesToSubtract": 16, @@ -282,7 +319,9 @@ "Sec": "TGTC", "Tert": "", "Foren_5": 19, - "Foren_3": 20 + "Foren_3": 20, + "Power_5": 63, + "Power_3": 67 }, "D8S1179": { "BasesToSubtract": 0, @@ -297,7 +336,9 @@ "Sec": "TCTG", "Tert": "TGTA", "Foren_5": 0, - "Foren_3": 5 + "Foren_3": 5, + "Power_5": 19, + "Power_3": 108 }, "D9S1122": { "BasesToSubtract": 0, @@ -311,7 +352,9 @@ "Sec": "", "Tert": "", "Foren_5": 17, - "Foren_3": 7 + "Foren_3": 7, + "Power_5": "", + "Power_3": "" }, "FGA": { "BasesToSubtract": 0, @@ -329,7 +372,9 @@ "Sec": "GAAA", "Tert": "GAAG", "Foren_5": 23, - "Foren_3": 0 + "Foren_3": 0, + "Power_5": 81, + "Power_3": 0 }, "PentaD": { "BasesToSubtract": 5, @@ -343,7 +388,9 @@ "Sec": "", "Tert": "", "Foren_5": 100, - "Foren_3": 9 + "Foren_3": 9, + "Power_5": 62, + "Power_3": 63 }, "PentaE": { "BasesToSubtract": 0, @@ -357,7 +404,9 @@ "Sec": "", "Tert": "", "Foren_5": 19, - "Foren_3": 75 + "Foren_3": 75, + "Power_5": 108, + "Power_3": 0 }, "TH01": { "BasesToSubtract": 0, @@ -371,7 +420,9 @@ "Sec": "", "Tert": "", "Foren_5": 32, - "Foren_3": 12 + "Foren_3": 12, + "Power_5": 7, + "Power_3": 156 }, "TPOX": { "BasesToSubtract": 0, @@ -385,7 +436,9 @@ "Sec": "", "Tert": "", "Foren_5": 2, - "Foren_3": 5 + "Foren_3": 5, + "Power_5": 121, + "Power_3": 14 }, "vWA": { "BasesToSubtract": 8, @@ -401,6 +454,8 @@ "Sec": "CAGA", "Tert": "TGGA", "Foren_5": 19, - "Foren_3": 5 + "Foren_3": 5, + "Power_5": 4, + "Power_3": 100 } } From 34b18c7593c816f4f82bda88747b278972ab215e Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Fri, 24 Apr 2020 07:57:30 -0400 Subject: [PATCH 05/17] Added PowerSeq option --- lusSTR/annot.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index a8f3c570..df1b97c3 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -591,7 +591,7 @@ def PentaD_annotation(sequence, no_of_repeat_bases, repeat_list): return re.sub(" ", " ", final_string) -def full_foren(full_seq, front, back): +def full_seq_to_uas(full_seq, front, back): ''' Function to trim full sequences to the UAS region. @@ -638,14 +638,22 @@ def main(args): tert = str_dict[locus]['Tert'] foren_5 = str_dict[locus]['Foren_5'] foren_3 = str_dict[locus]['Foren_3'] + power_5 = str_dict[locus]['Power_5'] + power_3 = str_dict[locus]['Power_3'] if args.uas: uas_sequence = sequence else: if args.kit == "forenseq": if str_dict[locus]['ReverseCompNeeded'] == "No": - uas_sequence = full_foren(sequence, foren_5, foren_3) + uas_sequence = full_seq_to_uas(sequence, foren_5, foren_3) else: - uas_from_full = full_foren(sequence, foren_5, foren_3) + uas_from_full = full_seq_to_uas(sequence, foren_5, foren_3) + uas_sequence = rev_complement_anno(uas_from_full) + elif args.kit == "powerseq": + if str_dict[locus]['ReverseCompNeeded'] == "No": + uas_sequence = full_seq_to_uas(sequence, power_5, power_3) + else: + uas_from_full = full_seq_to_uas(sequence, power_5, power_3) uas_sequence = rev_complement_anno(uas_from_full) str_allele = traditional_str_allele(uas_sequence, no_of_repeat_bases, no_of_sub_bases) if ( From 066c45e193b2a2705c02505480207c07ec5da863 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Fri, 24 Apr 2020 08:18:10 -0400 Subject: [PATCH 06/17] added test --- lusSTR/tests/test_suite.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index ef1677f4..7476c8fd 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -249,3 +249,12 @@ def test_D21_lus_sec(): lus_out, sec_out = lusSTR.annot.D21_lus_sec(sequence, repeat, tert) assert str(lus_out) == '10' assert str(sec_out) == '4' + + +@pytest.mark.parametrize('sequence, uas_seq, front, back', [ + ('CTATGCATCTATCTATCTATCTATCTATCTATCTATCTATCTAATGGTTA', 'ATCTATCTATCTATCTATCTATCTATCTATCTATCT', 6, 8), + ('TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTCCC', 'TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA', 0, 5) +]) +def test_full_seq_to_uas(sequence, uas_seq, front, back): + uas_sequence = lusSTR.annot.full_seq_to_uas(sequence, front, back) + assert uas_sequence == uas_seq \ No newline at end of file From 22dca2db6e4e1219ac6b3328298821b1fe2b427e Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 24 Apr 2020 11:45:30 -0400 Subject: [PATCH 07/17] Clean up conditional --- lusSTR/annot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index df1b97c3..fd820c8f 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -656,10 +656,10 @@ def main(args): uas_from_full = full_seq_to_uas(sequence, power_5, power_3) uas_sequence = rev_complement_anno(uas_from_full) str_allele = traditional_str_allele(uas_sequence, no_of_repeat_bases, no_of_sub_bases) - if ( - locus in cannot_split or - ((len(uas_sequence) % no_of_repeat_bases != 0) and locus not in must_split) - ): + cantsplit = locus in cannot_split + havetosplit = locus in must_split + split_incompatible = len(uas_sequence) % no_of_repeat_bases != 0 and not havetosplit + if cantsplit or split_incompatible: if str_dict[locus]['ReverseCompNeeded'] == "Yes": reverse_comp_sequence = rev_complement_anno(uas_sequence) print(reverse_comp_sequence) From 907859003c8c623d995121b2ddcd4f7fc06d4498 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Fri, 24 Apr 2020 11:57:28 -0400 Subject: [PATCH 08/17] minor fixes --- lusSTR/annot.py | 2 -- lusSTR/str_markers.json | 20 +++++--------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index fd820c8f..08fde709 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -662,11 +662,9 @@ def main(args): if cantsplit or split_incompatible: if str_dict[locus]['ReverseCompNeeded'] == "Yes": reverse_comp_sequence = rev_complement_anno(uas_sequence) - print(reverse_comp_sequence) forward_strand_bracketed_form = rev_comp_forward_strand_bracket( reverse_comp_sequence, no_of_repeat_bases, repeats, locus, cannot_split ) - print(forward_strand_bracketed_form) reverse_strand_bracketed_form = rev_comp_uas_output_bracket( forward_strand_bracketed_form, no_of_repeat_bases ) diff --git a/lusSTR/str_markers.json b/lusSTR/str_markers.json index f7cfe976..7936dace 100644 --- a/lusSTR/str_markers.json +++ b/lusSTR/str_markers.json @@ -97,9 +97,7 @@ "Sec": "", "Tert": "", "Foren_5": 9, - "Foren_3": 16, - "Power_5": "", - "Power_3": "" + "Foren_3": 16 }, "D18S51": { "BasesToSubtract": 10, @@ -169,9 +167,7 @@ "Sec": "", "Tert": "", "Foren_5": 18, - "Foren_3": 24, - "Power_5": "", - "Power_3": "" + "Foren_3": 24 }, "D21S11": { "BasesToSubtract": 11, @@ -269,9 +265,7 @@ "Sec": "CTCT", "Tert": "", "Foren_5": 6, - "Foren_3": 8, - "Power_5": "", - "Power_3": "" + "Foren_3": 8 }, "D5S818": { "BasesToSubtract": 4, @@ -302,9 +296,7 @@ "Sec": "ATGT", "Tert": "", "Foren_5": 65, - "Foren_3": 20, - "Power_5": "", - "Power_3": "" + "Foren_3": 20 }, "D7S820": { "BasesToSubtract": 16, @@ -352,9 +344,7 @@ "Sec": "", "Tert": "", "Foren_5": 17, - "Foren_3": 7, - "Power_5": "", - "Power_3": "" + "Foren_3": 7 }, "FGA": { "BasesToSubtract": 0, From 197cccc63db6075357cba7e251eaa02d780f2470 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 24 Apr 2020 12:35:22 -0400 Subject: [PATCH 09/17] Refactor trimming --- .coveragerc | 6 ++++++ lusSTR/annot.py | 48 +++++++++++++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 19 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..01214685 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,6 @@ +[report] +show_missing = True + +[run] +branch = True +omit = lusSTR/_version.py, lusSTR/tests/test_*.py diff --git a/lusSTR/annot.py b/lusSTR/annot.py index 08fde709..4c27c1a1 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -591,6 +591,33 @@ def PentaD_annotation(sequence, no_of_repeat_bases, repeat_list): return re.sub(" ", " ", final_string) +def resolve_uas_sequence(sequence, str_data, uas, kit): + assert kit in ('forenseq', 'powerseq') + + foren_5, foren_3, power_5, power_3 = 0, 0, 0, 0 + if 'Foren_5' in str_data: + foren_5 = str_data['Foren_5'] + foren_3 = str_data['Foren_3'] + if 'Power_5' in str_data: + power_5 = str_data['Power_5'] + power_3 = str_data['Power_3'] + + if uas: + uas_sequence = sequence + else: + if kit == "forenseq": + trim5, trim3 = foren_5, foren_3 + else: + trim5, trim3 = power_5, power_3 + if str_data['ReverseCompNeeded'] == "No": + uas_sequence = full_seq_to_uas(sequence, trim5, trim3) + else: + uas_from_full = full_seq_to_uas(sequence, trim5, trim3) + uas_sequence = rev_complement_anno(uas_from_full) + + return uas_sequence + + def full_seq_to_uas(full_seq, front, back): ''' Function to trim full sequences to the UAS region. @@ -636,25 +663,8 @@ def main(args): lus = str_dict[locus]['LUS'] sec = str_dict[locus]['Sec'] tert = str_dict[locus]['Tert'] - foren_5 = str_dict[locus]['Foren_5'] - foren_3 = str_dict[locus]['Foren_3'] - power_5 = str_dict[locus]['Power_5'] - power_3 = str_dict[locus]['Power_3'] - if args.uas: - uas_sequence = sequence - else: - if args.kit == "forenseq": - if str_dict[locus]['ReverseCompNeeded'] == "No": - uas_sequence = full_seq_to_uas(sequence, foren_5, foren_3) - else: - uas_from_full = full_seq_to_uas(sequence, foren_5, foren_3) - uas_sequence = rev_complement_anno(uas_from_full) - elif args.kit == "powerseq": - if str_dict[locus]['ReverseCompNeeded'] == "No": - uas_sequence = full_seq_to_uas(sequence, power_5, power_3) - else: - uas_from_full = full_seq_to_uas(sequence, power_5, power_3) - uas_sequence = rev_complement_anno(uas_from_full) + + uas_sequence = resolve_uas_sequence(sequence, str_dict[locus], args.uas, args.kit) str_allele = traditional_str_allele(uas_sequence, no_of_repeat_bases, no_of_sub_bases) cantsplit = locus in cannot_split havetosplit = locus in must_split From cc5f28a407bf28e5958da9eae24395fba0d95d74 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 24 Apr 2020 13:52:18 -0400 Subject: [PATCH 10/17] More refactoring --- lusSTR/annot.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/lusSTR/annot.py b/lusSTR/annot.py index 4c27c1a1..859bb7a4 100644 --- a/lusSTR/annot.py +++ b/lusSTR/annot.py @@ -591,29 +591,20 @@ def PentaD_annotation(sequence, no_of_repeat_bases, repeat_list): return re.sub(" ", " ", final_string) -def resolve_uas_sequence(sequence, str_data, uas, kit): +def resolve_uas_sequence(sequence, str_data, kit): assert kit in ('forenseq', 'powerseq') + if kit == 'forenseq': + trim5 = str_data['Foren_5'] + trim3 = str_data['Foren_3'] + else: + trim5 = str_data['Power_5'] + trim3 = str_data['Power_3'] - foren_5, foren_3, power_5, power_3 = 0, 0, 0, 0 - if 'Foren_5' in str_data: - foren_5 = str_data['Foren_5'] - foren_3 = str_data['Foren_3'] - if 'Power_5' in str_data: - power_5 = str_data['Power_5'] - power_3 = str_data['Power_3'] - - if uas: - uas_sequence = sequence + if str_data['ReverseCompNeeded'] == "No": + uas_sequence = full_seq_to_uas(sequence, trim5, trim3) else: - if kit == "forenseq": - trim5, trim3 = foren_5, foren_3 - else: - trim5, trim3 = power_5, power_3 - if str_data['ReverseCompNeeded'] == "No": - uas_sequence = full_seq_to_uas(sequence, trim5, trim3) - else: - uas_from_full = full_seq_to_uas(sequence, trim5, trim3) - uas_sequence = rev_complement_anno(uas_from_full) + uas_from_full = full_seq_to_uas(sequence, trim5, trim3) + uas_sequence = rev_complement_anno(uas_from_full) return uas_sequence @@ -664,7 +655,10 @@ def main(args): sec = str_dict[locus]['Sec'] tert = str_dict[locus]['Tert'] - uas_sequence = resolve_uas_sequence(sequence, str_dict[locus], args.uas, args.kit) + if args.uas: + uas_sequence = sequence + else: + uas_sequence = resolve_uas_sequence(sequence, str_dict[locus], args.kit) str_allele = traditional_str_allele(uas_sequence, no_of_repeat_bases, no_of_sub_bases) cantsplit = locus in cannot_split havetosplit = locus in must_split From 7060c4c72a7b7f3b338a578678111588207023e8 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 24 Apr 2020 13:53:33 -0400 Subject: [PATCH 11/17] Clean up makefile --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 0477f454..b42ffcb7 100755 --- a/Makefile +++ b/Makefile @@ -4,12 +4,8 @@ help: Makefile @sed -n 's/^## //p' Makefile @echo '' -## test: run the automated test suite +## test: run the automated test suite and print coverage information test: - pytest lusSTR/tests/test_suite.py - -## testcov: run the automated test suite and print coverage information -testcov: pytest --cov=lusSTR lusSTR/tests/test_suite.py ## style: check code style against PEP8 From 567c340ff9b64b94b236d90f77c994191ee4f530 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 24 Apr 2020 13:54:00 -0400 Subject: [PATCH 12/17] Fix CI config --- .github/workflows/cibuild.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cibuild.yml b/.github/workflows/cibuild.yml index ef433dda..be072286 100644 --- a/.github/workflows/cibuild.yml +++ b/.github/workflows/cibuild.yml @@ -24,4 +24,4 @@ jobs: - name: Style check run: make style - name: Test with pytest - run: make testcov + run: make test From 10e46890a51b93a6b16e465b6e23bcbcde1a74d8 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Mon, 27 Apr 2020 16:03:23 -0400 Subject: [PATCH 13/17] added UAS annotate test --- lusSTR/tests/data/2800M_formatted_full.csv | 1 + lusSTR/tests/data/2800M_formatted_uas.csv | 1 + lusSTR/tests/data/2800M_full_anno.txt | 50 ++++++++++++++++++++++ lusSTR/tests/data/2800M_uas_anno.txt | 50 ++++++++++++++++++++++ lusSTR/tests/test_suite.py | 18 +++++++- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 lusSTR/tests/data/2800M_formatted_full.csv create mode 100644 lusSTR/tests/data/2800M_formatted_uas.csv create mode 100644 lusSTR/tests/data/2800M_full_anno.txt create mode 100644 lusSTR/tests/data/2800M_uas_anno.txt diff --git a/lusSTR/tests/data/2800M_formatted_full.csv b/lusSTR/tests/data/2800M_formatted_full.csv new file mode 100644 index 00000000..621814d3 --- /dev/null +++ b/lusSTR/tests/data/2800M_formatted_full.csv @@ -0,0 +1 @@ +Locus,Reads,Sequence,Sample CSF1PO,100,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A01 D10S1248,100,TTGAACAAATGAGTGAGTGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAATGAAGACAATACAACCAGAGTT,A01 D10S1248,100,TTGAACAAATGAGTGAGTGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAATGAAGACAATACAACCAGAGTT,A01 D12S391,100,CAGAGAGAAAGAATCAACAGGATCAATGGATGCATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGATGAGAGGGGATTTATTAGAGGAATTAGCTCAAGTGATATGGAGGCTGAAAAATCTCATGACAGTCCATCTGCAA,A01 D12S391,100,CAGAGAGAAAGAATCAACAGGATCAATGGATGCATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGACGAGAGGGGATTTATTAGAGGAATTAGCTCAAGTGATATGGAGGCTGAAAAATCTCATGACAGTCCATCTGCAA,A01 D13S317,100,TCTGACCCATCTAACGCCTATCTGTATTTACAAATACATTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGG,A01 D13S317,100,TCTGACCCATCTAACGCCTATCTGTATTTACAAATACATTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGG,A01 D16S539,100,TCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTGGATAGATAGATAGATAGATAGATAGATAGATAGATATCATTGAAAGACAAACCAGAGATGGATGATAGATAC,A01 D16S539,100,TCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATCATTGAAAGACAAAACAGAGATGGATGATAGATAC,A01 D17S1301,100,ATATGTGTGAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATCCATCATAGGAATTTT,A01 D17S1301,100,ATATGTGTGAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATCCATCATAGGAATTTT,A01 D18S51,100,GTCTCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGA,A01 D18S51,100,GTCTCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGA,A01 D19S433,100,AATAAAAATCTTCTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTTCAACAGAATCTTATTCTGTTGCCCAGGCTGGAGTGCAGTGGTACAATTATAGCT,A01 D19S433,100,AATAAAAATCTTCTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTTCAACAGAATCTTATTCTGTTGCCCAGGCTGGAGTGCAGTGGTACAATTATAGCT,A01 D1S1656,100,TTCAGAGAAATAGAATCACTAGGGAACCAAATATATATACATACAATTAAACACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D1S1656,100,TTCAGAGAAATAGAATCACTAGGGAACCAAATATATATACATACAATTAAACACACACACATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D20S482,100,AGACACCGAACCAATAAGAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAGATTTATTATAGGAATTGATT,A01 D20S482,100,AGACACCGAACCAATAAGAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAGATTTATTATAGGAATTGATT,A01 D21S11,100,AAATATGTGAGTCAATTCCCCAAGTGAATTGCCTTCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGTCTATCTAT,A01 D21S11,100,AAATATGTGAGTCAATTCCCCAAGTGAATTGCCTTCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTATCGTCTATCTAT,A01 D22S1045,100,CGTTGGAATTCCCCAAACTGGCCAGTTCCTCTCCACCCTATAGACCCTGTCCTAGCCTTCTTATAGCTGCTATGGGGGCTAGATTTTCCCCGATGATAGTAGTCTCATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATTGTTATAAAAATATTGCCAAT,A01 D2S1338,100,GAGGGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCAAGGCCAAGCCATTT,A01 D2S1338,100,GAGGGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCAAGGCCAAGCCATTT,A01 D2S441,100,CCAGGAACTGTGGCTCATCTATGAAAACTTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCATAACACCACAGCCACTTA,A01 D2S441,100,CCAGGAACTGTGGCTCATCTATGAAAACTTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTATATCATAACACCACAGCCACTTA,A01 D3S1358,100,TTTGGGGGCATCTCTTATACTCATGAAATCAACAGAGGCTTGCATGTATCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATGAGACAGGGTCTTGCTC,A01 D3S1358,100,TTTGGGGGCATCTCTTATACTCATGAAATCAACAGAGGCTTGCATGTATCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATGAGACAGGGTCTTGCTC,A01 D4S2408,100,CTATGCATCTATCTATCTATCTATCTATCTATCTATCTATCTAATGGTTA,A01 D5S818,100,TATTTATACCTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTTCAAAAT,A01 D6S1043,100,AGATGGCATATTGTGAAATTTCTCAGCTTCCATAATTGTATGAGCCACTTCCCATAATAAATCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGATCTATCAATCTATTGATC,A01 D6S1043,100,AGATGGCATATTGTGAAATTTCTCAGCTTCCATAATTGTATGAGCCACTTCCCATAATAAATCCTATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGATCTATCAATCTATTGATC,A01 D7S820,100,TATTTAGTGAGATAAAAAAAAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCGTTAGTTCGTTCTAAACTAT,A01 D7S820,100,TATTTAGTGAGATAAAAAAAAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGTTAGTTCGTTCTAAACTAT,A01 D8S1179,100,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTCCC,A01 D8S1179,100,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTCCC,A01 D9S1122,100,AGATAACTGTAGATAGGTAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTAAT,A01 D9S1122,100,AGATAACTGTAGATAGGTAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTAAT,A01 FGA,100,CCAGCAAAAAAGAAAGGAAGAAAGGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,A01 FGA,100,CCAGCAAAAAAGAAAGGAAGAAAGGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,A01 PentaD,100,GATCACTTGAGCCTGGAAGGTCGAAGCTGAAGTGAGCCATGATCACACCACTACACTCCAGCCTAGGTGACAGAGCAAGACACCATCTCAAGAAAGAAAAAAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAAACGAA,A01 PentaD,100,GATCACTTGAGCCTGGAAGGTCGAAGCTGAAGTGAGCCATGATCACACCACTACACTCCAGCCTAGGTGACAGAGCAAGACACCATCTCAAGAAAGAAAAAAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAAACGAA,A01 PentaE,100,AGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGACTGAGTCTTGCTCAGTCGCCCAGGCTGGAGTGCAATGGCGTGATCTCGGCTCACTTCAATCTCCACCTCCT,A01 PentaE,100,AGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGACTGAGTCTTGCTCAGTCGCCCAGGCTGGAGTGCAATGGCGTGATCTCGGCTCACTTCAATCTCCACCTCCT,A01 TH01,100,TGCAGGTCACAGGGAACACAGACTCCATGGTGAATGAATGAATGAATGAATGAATGAGGGAAATAAGG,A01 TH01,100,TGCAGGTCACAGGGAACACAGACTCCATGGTGAATGAATGAATGAATGAATGAATGATGAATGAATGAATGAGGGAAATAAGG,A01 TPOX,100,TGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGTTTGG,A01 vWA,100,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A01 vWA,100,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A01 \ No newline at end of file diff --git a/lusSTR/tests/data/2800M_formatted_uas.csv b/lusSTR/tests/data/2800M_formatted_uas.csv new file mode 100644 index 00000000..a1d91fd2 --- /dev/null +++ b/lusSTR/tests/data/2800M_formatted_uas.csv @@ -0,0 +1 @@ +Locus,Reads,sequence,Sample CSF1PO,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D10S1248,100,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,A01 D10S1248,100,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,A01 D12S391,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT,A01 D12S391,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGAC,A01 D13S317,100,TATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTC,A01 D13S317,100,TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTC,A01 D16S539,100,GATAGATAGATAGATAGATAGATAGATAGATAGATA,A01 D16S539,100,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA,A01 D17S1301,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D17S1301,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D18S51,100,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,A01 D18S51,100,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,A01 D19S433,100,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,A01 D19S433,100,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,A01 D1S1656,100,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,A01 D1S1656,100,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,A01 D20S482,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D20S482,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D21S11,100,TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D21S11,100,TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA,A01 D22S1045,100,ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,A01 D2S1338,100,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,A01 D2S1338,100,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,A01 D2S441,100,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D2S441,100,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA,A01 D3S1358,100,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D3S1358,100,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D4S2408,100,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,A01 D5S818,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG,A01 D6S1043,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,A01 D6S1043,100,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT,A01 D7S820,100,GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,A01 D7S820,100,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,A01 D8S1179,100,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D8S1179,100,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,A01 D9S1122,100,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,A01 D9S1122,100,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,A01 FGA,100,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,A01 FGA,100,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,A01 PentaD,100,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,A01 PentaD,100,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,A01 PentaE,100,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,A01 PentaE,100,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,A01 TH01,100,AATGAATGAATGAATGAATGAATG,A01 TH01,100,AATGAATGAATGAATGAATGAATGATGAATGAATGAATG,A01 TPOX,100,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,A01 vWA,100,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,A01 vWA,100,TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,A01 \ No newline at end of file diff --git a/lusSTR/tests/data/2800M_full_anno.txt b/lusSTR/tests/data/2800M_full_anno.txt new file mode 100644 index 00000000..bd62dfe3 --- /dev/null +++ b/lusSTR/tests/data/2800M_full_anno.txt @@ -0,0 +1,50 @@ +SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence Traditional_STR_Allele Forward_Strand_Bracketed_form UAS_Output_Bracketed_Form LUS LUS_Plus Reads +A01 NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 [ATCT]12 [AGAT]12 12_12 12_12_0 100 +A01 NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA 13 [GGAA]13 [GGAA]13 13_13 13_13 100 +A01 NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA 15 [GGAA]15 [GGAA]15 15_15 15_15 100 +A01 NA NA D12S391 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT 18 [AGAT]11 [AGAC]6 AGAT [AGAT]11 [AGAC]6 AGAT 18_11 18_11_6_0 100 +A01 NA NA D12S391 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGAC AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGAC 23 [AGAT]14 [AGAC]9 [AGAT]14 [AGAC]9 23_14 23_14_9_0 100 +A01 NA NA D13S317 TATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTC TATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTC 9 [TATC]9 [AATC]2 [ATCT]3 TTCT GTCT GTC [TATC]9 [AATC]2 [ATCT]3 TTCT GTCT GTC 9_9 9_9_3_1 100 +A01 NA NA D13S317 TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTC TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTC 11 [TATC]12 AATC [ATCT]3 TTCT GTCT GTC [TATC]12 AATC [ATCT]3 TTCT GTCT GTC 11_12 11_12_3_1 100 +A01 NA NA D16S539 GATAGATAGATAGATAGATAGATAGATAGATAGATA GATAGATAGATAGATAGATAGATAGATAGATAGATA 9 [GATA]9 [GATA]9 9_9 9_9_0 100 +A01 NA NA D16S539 GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA 13 [GATA]13 [GATA]13 13_13 13_13_0 100 +A01 NA NA D17S1301 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 11 [AGAT]11 [AGAT]11 11_11 11_11 100 +A01 NA NA D17S1301 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 12 [AGAT]12 [AGAT]12 12_12 12_12 100 +A01 NA NA D18S51 AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG 16 [AGAA]16 AAAG AGAG AG [AGAA]16 AAAG AGAG AG 16_16 16_16_1 100 +A01 NA NA D18S51 AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG 18 [AGAA]18 AAAG AGAG AG [AGAA]18 AAAG AGAG AG 18_18 18_18_1 100 +A01 NA NA D19S433 AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT 13 CT CTCT TTCT TCCT CTCT [CCTT]11 CCTA CCTT CTTT CCTT AAGG AAAG AAGG TAGG [AAGG]11 AGAG AGGA AGAA AGAG AG 13_11 13_11_1_0 100 +A01 NA NA D19S433 AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT 14 CT CTCT TTCT TCCT CTCT [CCTT]12 CCTA CCTT CTTT CCTT AAGG AAAG AAGG TAGG [AAGG]12 AGAG AGGA AGAA AGAG AG 14_12 14_12_1_0 100 +A01 NA NA D1S1656 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 12 CA [CACA]2 CCTA [TCTA]11 [TAGA]11 TAGG [TGTG]2 TG 12_11 12_11_1_0 100 +A01 NA NA D1S1656 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG CACACACACATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 13 CA [CACA]2 [TCTA]13 [TAGA]13 [TGTG]2 TG 13_13 13_13_0_0 100 +A01 NA NA D20S482 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 14 [AGAT]14 [AGAT]14 14_14 14_14 100 +A01 NA NA D20S482 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 15 [AGAT]15 [AGAT]15 15_15 15_15 100 +A01 NA NA D21S11 TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 29 [TCTA]4 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 [TCTA]4 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 29_11 29_11_4_6 100 +A01 NA NA D21S11 TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA 31.2 [TCTA]5 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 TA TCTA [TCTA]5 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 TA TCTA 31.2_11 31.2_11_5_6 100 +A01 NA NA D22S1045 ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT 16 [ATT]13 ACT [ATT]2 [ATT]13 ACT [ATT]2 16_13 16_13 100 +A01 NA NA D2S1338 TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA 22 [GGAA]2 GGAC [GGAA]12 [GGCA]7 [TGCC]7 [TTCC]12 GTCC [TTCC]2 22_12 22_12_1_7 100 +A01 NA NA D2S1338 TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA 25 [GGAA]2 GGAC [GGAA]15 [GGCA]7 [TGCC]7 [TTCC]15 GTCC [TTCC]2 25_15 25_15_1_7 100 +A01 NA NA D2S441 TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 10 [TCTA]10 [TCTA]10 10_10 10_10_0 100 +A01 NA NA D2S441 TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA 14 [TCTA]11 TTTA [TCTA]2 [TCTA]11 TTTA [TCTA]2 14_11 14_11_0 100 +A01 NA NA D3S1358 TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 17 TCTA [TCTG]3 [TCTA]13 TCTA [TCTG]3 [TCTA]13 17_13 17_13_3 100 +A01 NA NA D3S1358 TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 18 TCTA [TCTG]3 [TCTA]14 TCTA [TCTG]3 [TCTA]14 18_14 18_14_3 100 +A01 NA NA D4S2408 ATCTATCTATCTATCTATCTATCTATCTATCTATCT ATCTATCTATCTATCTATCTATCTATCTATCTATCT 9 [ATCT]9 [ATCT]9 9_9 9_9_0 100 +A01 NA NA D5S818 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 CTCT [ATCT]12 [AGAT]12 AGAG 12_12 12_12 100 +A01 NA NA D6S1043 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 [ATCT]12 [AGAT]12 12_12 12_12_0 100 +A01 NA NA D6S1043 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 20 [ATCT]5 ATGT [ATCT]14 [AGAT]14 ACAT [AGAT]5 20_14 20_14_1 100 +A01 NA NA D7S820 GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATC 8 AAAC TATC AATC TGTC [TATC]8 [GATA]8 GACA GATT GATA GTTT 8_8 8_8_1_0 100 +A01 NA NA D7S820 GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC 11 AAAC TATC AATC TGTC [TATC]11 [GATA]11 GACA GATT GATA GTTT 11_11 11_11_1_0 100 +A01 NA NA D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 14 TCTA TCTG [TCTA]12 TCTA TCTG [TCTA]12 14_12 14_12_1_0 100 +A01 NA NA D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 15 [TCTA]2 TCTG [TCTA]12 [TCTA]2 TCTG [TCTA]12 15_12 15_12_1_0 100 +A01 NA NA D9S1122 TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA 12 TAGA TCGA [TAGA]10 TAGA TCGA [TAGA]10 12_10 12_10 100 +A01 NA NA D9S1122 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA 12 [TAGA]12 [TAGA]12 12_12 12_12 100 +A01 NA NA FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA 20 [GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3 [TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2 20_12 20_12_3_0 100 +A01 NA NA FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA 23 [GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3 [TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2 23_15 23_15_3_0 100 +A01 NA NA PentaD AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA 12 AAAAG [AAAGA]12 AAAAG [AAAGA]12 12_12 12_12 100 +A01 NA NA PentaD AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA 13 AAAAG [AAAGA]13 AAAAG [AAAGA]13 13_13 13_13 100 +A01 NA NA PentaE AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT 7 [TCTTT]7 [AAAGA]7 7_7 7_7 100 +A01 NA NA PentaE AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT 14 [TCTTT]14 [AAAGA]14 14_14 14_14 100 +A01 NA NA TH01 AATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATG 6 [AATG]6 [AATG]6 6_6 6_6 100 +A01 NA NA TH01 AATGAATGAATGAATGAATGAATGATGAATGAATGAATG AATGAATGAATGAATGAATGAATGATGAATGAATGAATG 9.3 [AATG]6 ATG [AATG]3 [AATG]6 ATG [AATG]3 9.3_6 9.3_6 100 +A01 NA NA TPOX AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG 11 [AATG]11 [AATG]11 11_11 11_11 100 +A01 NA NA vWA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA 16 TAGA TGGA [TAGA]12 [CAGA]3 TAGA TCTA [TCTG]3 [TCTA]12 TCCA TCTA 16_12 16_12_3_1 100 +A01 NA NA vWA TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA 19 TAGA TGGA [TAGA]14 [CAGA]4 TAGA TCTA [TCTG]4 [TCTA]14 TCCA TCTA 19_14 19_14_4_1 100 diff --git a/lusSTR/tests/data/2800M_uas_anno.txt b/lusSTR/tests/data/2800M_uas_anno.txt new file mode 100644 index 00000000..bd62dfe3 --- /dev/null +++ b/lusSTR/tests/data/2800M_uas_anno.txt @@ -0,0 +1,50 @@ +SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence Traditional_STR_Allele Forward_Strand_Bracketed_form UAS_Output_Bracketed_Form LUS LUS_Plus Reads +A01 NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 [ATCT]12 [AGAT]12 12_12 12_12_0 100 +A01 NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA 13 [GGAA]13 [GGAA]13 13_13 13_13 100 +A01 NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA 15 [GGAA]15 [GGAA]15 15_15 15_15 100 +A01 NA NA D12S391 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT 18 [AGAT]11 [AGAC]6 AGAT [AGAT]11 [AGAC]6 AGAT 18_11 18_11_6_0 100 +A01 NA NA D12S391 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGAC AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGAC 23 [AGAT]14 [AGAC]9 [AGAT]14 [AGAC]9 23_14 23_14_9_0 100 +A01 NA NA D13S317 TATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTC TATCTATCTATCTATCTATCTATCTATCTATCTATCAATCAATCATCTATCTATCTTTCTGTCTGTC 9 [TATC]9 [AATC]2 [ATCT]3 TTCT GTCT GTC [TATC]9 [AATC]2 [ATCT]3 TTCT GTCT GTC 9_9 9_9_3_1 100 +A01 NA NA D13S317 TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTC TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCAATCATCTATCTATCTTTCTGTCTGTC 11 [TATC]12 AATC [ATCT]3 TTCT GTCT GTC [TATC]12 AATC [ATCT]3 TTCT GTCT GTC 11_12 11_12_3_1 100 +A01 NA NA D16S539 GATAGATAGATAGATAGATAGATAGATAGATAGATA GATAGATAGATAGATAGATAGATAGATAGATAGATA 9 [GATA]9 [GATA]9 9_9 9_9_0 100 +A01 NA NA D16S539 GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA 13 [GATA]13 [GATA]13 13_13 13_13_0 100 +A01 NA NA D17S1301 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 11 [AGAT]11 [AGAT]11 11_11 11_11 100 +A01 NA NA D17S1301 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 12 [AGAT]12 [AGAT]12 12_12 12_12 100 +A01 NA NA D18S51 AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG 16 [AGAA]16 AAAG AGAG AG [AGAA]16 AAAG AGAG AG 16_16 16_16_1 100 +A01 NA NA D18S51 AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG 18 [AGAA]18 AAAG AGAG AG [AGAA]18 AAAG AGAG AG 18_18 18_18_1 100 +A01 NA NA D19S433 AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT 13 CT CTCT TTCT TCCT CTCT [CCTT]11 CCTA CCTT CTTT CCTT AAGG AAAG AAGG TAGG [AAGG]11 AGAG AGGA AGAA AGAG AG 13_11 13_11_1_0 100 +A01 NA NA D19S433 AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT 14 CT CTCT TTCT TCCT CTCT [CCTT]12 CCTA CCTT CTTT CCTT AAGG AAAG AAGG TAGG [AAGG]12 AGAG AGGA AGAA AGAG AG 14_12 14_12_1_0 100 +A01 NA NA D1S1656 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 12 CA [CACA]2 CCTA [TCTA]11 [TAGA]11 TAGG [TGTG]2 TG 12_11 12_11_1_0 100 +A01 NA NA D1S1656 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG CACACACACATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 13 CA [CACA]2 [TCTA]13 [TAGA]13 [TGTG]2 TG 13_13 13_13_0_0 100 +A01 NA NA D20S482 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 14 [AGAT]14 [AGAT]14 14_14 14_14 100 +A01 NA NA D20S482 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT 15 [AGAT]15 [AGAT]15 15_15 15_15 100 +A01 NA NA D21S11 TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 29 [TCTA]4 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 [TCTA]4 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 29_11 29_11_4_6 100 +A01 NA NA D21S11 TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA 31.2 [TCTA]5 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 TA TCTA [TCTA]5 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11 TA TCTA 31.2_11 31.2_11_5_6 100 +A01 NA NA D22S1045 ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT 16 [ATT]13 ACT [ATT]2 [ATT]13 ACT [ATT]2 16_13 16_13 100 +A01 NA NA D2S1338 TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA 22 [GGAA]2 GGAC [GGAA]12 [GGCA]7 [TGCC]7 [TTCC]12 GTCC [TTCC]2 22_12 22_12_1_7 100 +A01 NA NA D2S1338 TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA 25 [GGAA]2 GGAC [GGAA]15 [GGCA]7 [TGCC]7 [TTCC]15 GTCC [TTCC]2 25_15 25_15_1_7 100 +A01 NA NA D2S441 TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 10 [TCTA]10 [TCTA]10 10_10 10_10_0 100 +A01 NA NA D2S441 TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA 14 [TCTA]11 TTTA [TCTA]2 [TCTA]11 TTTA [TCTA]2 14_11 14_11_0 100 +A01 NA NA D3S1358 TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 17 TCTA [TCTG]3 [TCTA]13 TCTA [TCTG]3 [TCTA]13 17_13 17_13_3 100 +A01 NA NA D3S1358 TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 18 TCTA [TCTG]3 [TCTA]14 TCTA [TCTG]3 [TCTA]14 18_14 18_14_3 100 +A01 NA NA D4S2408 ATCTATCTATCTATCTATCTATCTATCTATCTATCT ATCTATCTATCTATCTATCTATCTATCTATCTATCT 9 [ATCT]9 [ATCT]9 9_9 9_9_0 100 +A01 NA NA D5S818 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 CTCT [ATCT]12 [AGAT]12 AGAG 12_12 12_12 100 +A01 NA NA D6S1043 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 12 [ATCT]12 [AGAT]12 12_12 12_12_0 100 +A01 NA NA D6S1043 AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT 20 [ATCT]5 ATGT [ATCT]14 [AGAT]14 ACAT [AGAT]5 20_14 20_14_1 100 +A01 NA NA D7S820 GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATC 8 AAAC TATC AATC TGTC [TATC]8 [GATA]8 GACA GATT GATA GTTT 8_8 8_8_1_0 100 +A01 NA NA D7S820 GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC 11 AAAC TATC AATC TGTC [TATC]11 [GATA]11 GACA GATT GATA GTTT 11_11 11_11_1_0 100 +A01 NA NA D8S1179 TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 14 TCTA TCTG [TCTA]12 TCTA TCTG [TCTA]12 14_12 14_12_1_0 100 +A01 NA NA D8S1179 TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA 15 [TCTA]2 TCTG [TCTA]12 [TCTA]2 TCTG [TCTA]12 15_12 15_12_1_0 100 +A01 NA NA D9S1122 TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA 12 TAGA TCGA [TAGA]10 TAGA TCGA [TAGA]10 12_10 12_10 100 +A01 NA NA D9S1122 TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA 12 [TAGA]12 [TAGA]12 12_12 12_12 100 +A01 NA NA FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA 20 [GGAA]2 GGAG [AAAG]12 AGAA AAAA [GAAA]3 [TTTC]3 TTTT TTCT [CTTT]12 CTCC [TTCC]2 20_12 20_12_3_0 100 +A01 NA NA FGA TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA 23 [GGAA]2 GGAG [AAAG]15 AGAA AAAA [GAAA]3 [TTTC]3 TTTT TTCT [CTTT]15 CTCC [TTCC]2 23_15 23_15_3_0 100 +A01 NA NA PentaD AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA 12 AAAAG [AAAGA]12 AAAAG [AAAGA]12 12_12 12_12 100 +A01 NA NA PentaD AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA 13 AAAAG [AAAGA]13 AAAAG [AAAGA]13 13_13 13_13 100 +A01 NA NA PentaE AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT 7 [TCTTT]7 [AAAGA]7 7_7 7_7 100 +A01 NA NA PentaE AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT 14 [TCTTT]14 [AAAGA]14 14_14 14_14 100 +A01 NA NA TH01 AATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATG 6 [AATG]6 [AATG]6 6_6 6_6 100 +A01 NA NA TH01 AATGAATGAATGAATGAATGAATGATGAATGAATGAATG AATGAATGAATGAATGAATGAATGATGAATGAATGAATG 9.3 [AATG]6 ATG [AATG]3 [AATG]6 ATG [AATG]3 9.3_6 9.3_6 100 +A01 NA NA TPOX AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG 11 [AATG]11 [AATG]11 11_11 11_11 100 +A01 NA NA vWA TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA 16 TAGA TGGA [TAGA]12 [CAGA]3 TAGA TCTA [TCTG]3 [TCTA]12 TCCA TCTA 16_12 16_12_3_1 100 +A01 NA NA vWA TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA 19 TAGA TGGA [TAGA]14 [CAGA]4 TAGA TCTA [TCTG]4 [TCTA]14 TCCA TCTA 19_14 19_14_4_1 100 diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index 7476c8fd..9b84b4c4 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -12,7 +12,7 @@ import pytest import lusSTR from lusSTR.tests import data_file -from tempfile import NamedTemporaryFile +from tempfile import NamedTemporaryFile, TemporaryDirectory def test_format(): @@ -257,4 +257,18 @@ def test_D21_lus_sec(): ]) def test_full_seq_to_uas(sequence, uas_seq, front, back): uas_sequence = lusSTR.annot.full_seq_to_uas(sequence, front, back) - assert uas_sequence == uas_seq \ No newline at end of file + assert uas_sequence == uas_seq + + +def test_annotate_uas(): + with NamedTemporaryFile() as outfile: + input = data_file('2800M_formatted_uas.csv') + arglist = ['annotate', input, '-o', outfile.name, '--kit', 'forenseq', '--uas'] + args = lusSTR.cli.get_parser().parse_args(arglist) + lusSTR.annot.main(args) + with open(outfile.name, 'r') as fh: + results = fh.read().strip('\n') + with open(data_file('2800M_uas_anno.txt'), 'r') as fh: + testresults = fh.read().strip('\n') + assert results == testresults + From 556ec5cb7f5b7bb6ccb8379395f44db10a05dc40 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Tue, 28 Apr 2020 08:23:40 -0400 Subject: [PATCH 14/17] revised test --- lusSTR/tests/test_suite.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index 9b84b4c4..7394fb6d 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -12,7 +12,7 @@ import pytest import lusSTR from lusSTR.tests import data_file -from tempfile import NamedTemporaryFile, TemporaryDirectory +from tempfile import NamedTemporaryFile def test_format(): @@ -263,12 +263,9 @@ def test_full_seq_to_uas(sequence, uas_seq, front, back): def test_annotate_uas(): with NamedTemporaryFile() as outfile: input = data_file('2800M_formatted_uas.csv') + testanno = data_file('2800M_uas_anno.txt') arglist = ['annotate', input, '-o', outfile.name, '--kit', 'forenseq', '--uas'] args = lusSTR.cli.get_parser().parse_args(arglist) lusSTR.annot.main(args) - with open(outfile.name, 'r') as fh: - results = fh.read().strip('\n') - with open(data_file('2800M_uas_anno.txt'), 'r') as fh: - testresults = fh.read().strip('\n') - assert results == testresults + assert filecmp.cmp(testanno, outfile.name) is True From 6e7fa724827f9402c2631a6eb6868356cbfe81fd Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Tue, 28 Apr 2020 11:32:55 -0400 Subject: [PATCH 15/17] Fix test by deleting output file before `lusstr annotate` writes to it --- lusSTR/tests/test_suite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index 7394fb6d..bb06f429 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -8,6 +8,7 @@ # ----------------------------------------------------------------------------- import filecmp +import os import pandas as pd import pytest import lusSTR @@ -262,10 +263,10 @@ def test_full_seq_to_uas(sequence, uas_seq, front, back): def test_annotate_uas(): with NamedTemporaryFile() as outfile: + os.unlink(outfile.name) input = data_file('2800M_formatted_uas.csv') testanno = data_file('2800M_uas_anno.txt') arglist = ['annotate', input, '-o', outfile.name, '--kit', 'forenseq', '--uas'] args = lusSTR.cli.get_parser().parse_args(arglist) lusSTR.annot.main(args) assert filecmp.cmp(testanno, outfile.name) is True - From 3558926425e42d7f614e3ac9e9b8d04f88c047cc Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Tue, 28 Apr 2020 11:33:28 -0400 Subject: [PATCH 16/17] Rename variable to avoid conflicts with builtin function https://stackoverflow.com/a/20670757/459780 --- lusSTR/tests/test_suite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index bb06f429..bb5c98f4 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -264,9 +264,9 @@ def test_full_seq_to_uas(sequence, uas_seq, front, back): def test_annotate_uas(): with NamedTemporaryFile() as outfile: os.unlink(outfile.name) - input = data_file('2800M_formatted_uas.csv') + inputfile = data_file('2800M_formatted_uas.csv') testanno = data_file('2800M_uas_anno.txt') - arglist = ['annotate', input, '-o', outfile.name, '--kit', 'forenseq', '--uas'] + arglist = ['annotate', inputfile, '-o', outfile.name, '--kit', 'forenseq', '--uas'] args = lusSTR.cli.get_parser().parse_args(arglist) lusSTR.annot.main(args) assert filecmp.cmp(testanno, outfile.name) is True From 37ce9c26085b21cab6cf905aad9cb00ecf9f5a54 Mon Sep 17 00:00:00 2001 From: Rebecca Mitchell Date: Tue, 28 Apr 2020 13:07:49 -0400 Subject: [PATCH 17/17] added full seq test --- lusSTR/tests/test_suite.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index bb5c98f4..d5192fa6 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -270,3 +270,15 @@ def test_annotate_uas(): args = lusSTR.cli.get_parser().parse_args(arglist) lusSTR.annot.main(args) assert filecmp.cmp(testanno, outfile.name) is True + + +def test_annotate_full(): + with NamedTemporaryFile() as outfile: + os.unlink(outfile.name) + inputfile = data_file('2800M_formatted_full.csv') + testanno = data_file('2800M_full_anno.txt') + arglist = ['annotate', inputfile, '-o', outfile.name, '--kit', 'forenseq'] + args = lusSTR.cli.get_parser().parse_args(arglist) + lusSTR.annot.main(args) + assert filecmp.cmp(testanno, outfile.name) is True +