-
Notifications
You must be signed in to change notification settings - Fork 0
/
DNA Sequence assembly
136 lines (115 loc) · 4.26 KB
/
DNA Sequence assembly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import sys
from TextGrid import TextGrid, Cell
SEQUENCES_FOLDER = 'tiny'
def assemble_sequence(sequence1, sequence2, sequence3):
"""
Given three TextGrid objects, this function creates and returns a solution genome based on the sequences passed in. All the sequences passed in will be the same size.
Input:
three TextGrids to be processed
Returns:
a new solution genome (also a TextGrid)
"""
full_genome = TextGrid.blank(sequence1.width, sequence1.height)
for y in range(full_genome.height):
for x in range(full_genome.width):
nucleotide1 = sequence1.get_cell(x,y)
nucleotide2 = sequence2.get_cell(x,y)
nucleotide3 = sequence3.get_cell(x,y)
best = get_best_nucleotide(nucleotide1, nucleotide2, nucleotide3)
full_genome.set_cell(x,y, best)
return full_genome
def get_best_nucleotide(nucleotide1, nucleotide2, nucleotide3):
"""
Given three nucleotides, returns a nucleotide with the most common *non-blank* value. If multiple nucleotides have the same value, it doesn't matter which specific nucleotide is returned so long as its value is correct. You can assume there will never be ambiguity -- there will always be a clear majority non-blank character.
Input:
three nucleotides (Cells) to be compared
Returns:
best (Cell): nucleotide with most common non-blank value
This doctest creates nucleotides for simple tests.
>>> grid = TextGrid('ATCG_.txt')
>>> A = grid.get_cell(0,0)
>>> T = grid.get_cell(1,0)
>>> C = grid.get_cell(2,0)
>>> G = grid.get_cell(3,0)
>>> blank = grid.get_cell(4,0)
>>> best1 = get_best_nucleotide(A, A, A)
>>> best1.value
'A'
>>> best2 = get_best_nucleotide(A, T, T)
>>> best2.value
'T'
>>> best3 = get_best_nucleotide(A, blank, A)
>>> best3.value
'A'
>>> best4 = get_best_nucleotide(blank, blank, T)
>>> best4.value
'T'
"""
best = ()
if not_blank(nucleotide1) and equal_value(nucleotide1, nucleotide2):
best = nucleotide1.value
elif not_blank(nucleotide1) and equal_value(nucleotide1, nucleotide3):
best = nucleotide1.value
elif not_blank(nucleotide1) and both_blank(nucleotide2, nucleotide3):
best = nucleotide1.value
elif not_blank(nucleotide2) and equal_value(nucleotide2, nucleotide3):
best = nucleotide2.value
elif not_blank(nucleotide2) and both_blank(nucleotide1, nucleotide3):
best = nucleotide2.value
elif not_blank(nucleotide3) and both_blank(nucleotide1, nucleotide2):
best = nucleotide3.value
return Cell(best,0,0)
def not_blank(nucleotide):
if nucleotide.value != '_':
return True
def equal_value(nuc1, nuc2):
if nuc1.value == nuc2.value:
return True
def both_blank(nuc1, nuc2):
if not(not_blank(nuc1) and not_blank(nuc2)):
return True
def txts_in_dir(directory):
"""
Given the name of a directory, returns a list of the .txt filenames
within it.
Input:
dir (string): name of directory
Returns:
filenames(List[string]): names of jpg files in directory
"""
filenames = []
for filename in os.listdir(directory):
if filename.endswith('.txt'):
filenames.append(os.path.join(directory, filename))
return filenames
def load_sequences(directory):
"""
Given a directory name, reads all the .txt files within it into memory and
returns them in a list. Prints to terminal the names of the files it loads.
Input:
dir (string): name of directory
Returns:
sequences (List[TextGrids]): list of sequences in directory
"""
sequences = []
txts = txts_in_dir(directory)
txts.sort()
for filename in txts:
print("Loading", filename)
sequence = TextGrid(filename)
sequences.append(sequence)
return sequences
def main():
sequences = load_sequences(SEQUENCES_FOLDER)
full_genome = assemble_sequence(sequences[0], sequences[1], sequences[2])
if full_genome:
print("Genome assembled as follows:")
print(full_genome)
else:
print("No genome to display!")
if __name__ == '__main__':
args = sys.argv[1:]
if len(args) > 0:
SEQUENCES_FOLDER = args[0]
main()