-
Notifications
You must be signed in to change notification settings - Fork 1
/
needlemanWunschGlobalAlignment.py
137 lines (116 loc) · 4.6 KB
/
needlemanWunschGlobalAlignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
'''An implementation of Needleman-Wunsch algorithm,
not sure if any mistake for now :D'''
# First define the constants
# Scoring matrix, diagonal is for matching
S = [[2, 0, 0, 0],
[0, 2, 0, 0],
[0, 0, 2, 0],
[0, 0, 0, 2]]
# A dictionary for looking up the matrix
ntIdx = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
# Gap penalty
gapP = -1
def printMatrix(m):
'''
print the matrix with each line for better visualization when debugging
'''
for i in range(len(m)):
print(i, m[i])
class align():
"""Needleman-Wunsch algorithm"""
def __init__(self, seq1, seq2, S = S, gapP = gapP):
assert len(seq1) > 0, 'Input sequence is empty'
assert len(seq2) > 0, 'Input sequence is empty'
self.seq1 = seq1.upper()
self.seq2 = seq2.upper()
self.S = S
self.gapP = gapP
self.aln = 'NOT ALIGNED YET'
self.score = None
self.run()
def run(self):
'''Run all the processes'''
self.initializeMatrix()
self.DP()
self.traceback()
def initializeMatrix(self):
'''Initialize the scoring matrix and traceback matrix'''
# First inititalize the scoring matrix, so that it has len(seq1) rows
# and len(seq2) columns.
self.M = [[0 for j in range(len(self.seq2) + 1)]
for i in range(len(self.seq1) + 1)]
# And a matrix for recording the trace back path, with the same size.
self.TB = [[0 for j in range(len(self.seq2) + 1)]
for i in range(len(self.seq1) + 1)]
# Initialize the gap scores of the first row and the first column.
for j in range(1, len(self.seq2) + 1):
self.M[0][j] = self.M[0][j - 1] + self.gapP
for i in range(1, len(self.seq1) + 1):
self.M[i][0] = self.M[i - 1][0] + self.gapP
for j in range(1, len(self.seq2) + 1):
self.TB[0][j] = 1 # for gap in seq1
for i in range(1, len(self.seq1) + 1):
self.TB[i][0] = 2 # for gap in seq2
def DP(self):
'''The DP step for filling in the matrix'''
for i in range(1, len(self.seq1) + 1):
for j in range(1, len(self.seq2) + 1):
# Note that here i - 1 is the i'th index in self.seq1
# get the scores from three directions
g1 = self.M[i][j - 1] + gapP
g2 = self.M[i - 1][j] + gapP
m = self.M[i - 1][j - 1] + \
S[ntIdx[self.seq1[i - 1]]][ntIdx[self.seq2[j - 1]]]
if m == max(g1, g2, m):
self.M[i][j] = m
self.TB[i][j] = 3 # for matching
elif g1 == max(g1, g2, m):
self.M[i][j] = g1
self.TB[i][j] = 1 # for gap in seq1
elif g2 == max(g1, g2, m):
self.M[i][j] = g2
self.TB[i][j] = 2 # for gap in seq2
self.score = self.M[i][j]
printMatrix(self.M)
def traceback(self):
'''The traceback step'''
Pos = (len(self.seq1), len(self.seq2))
Dir = self.TB[Pos[0]][Pos[1]]
seq1out = []
seq2out = []
matching = []
while Dir != 0:
if Dir == 3 and self.seq1[Pos[0] - 1] == self.seq2[Pos[1] - 1]:
matching.insert(0, '|')
else:
matching.insert(0, ' ')
if Dir == 3:
seq1out.insert(0, self.seq1[Pos[0] - 1])
seq2out.insert(0, self.seq2[Pos[1] - 1])
Pos = (Pos[0] - 1, Pos[1] - 1)
elif Dir == 1:
seq1out.insert(0, '-')
seq2out.insert(0, self.seq2[Pos[1] - 1])
Pos = (Pos[0], Pos[1] - 1)
elif Dir == 2:
seq1out.insert(0, self.seq1[Pos[0] - 1])
seq2out.insert(0, '-')
Pos = (Pos[0] - 1, Pos[1])
Dir = self.TB[Pos[0]][Pos[1]]
assert len(seq1out) == len(seq2out)
seq1out = ''.join(seq1out)
seq2out = ''.join(seq2out)
matching = ''.join(matching)
self.aln = seq1out + '\n' + matching + '\n' + seq2out
def __str__(self):
string = 'original input sequences:\nseq1 - %s\nseq2 - %s\
\nFinal alignment: \n%s\nAlignment score: \n%d'\
% (self.seq1, self.seq2, self.aln, self.score)
return string
def main():
seq1 = 'caggt'
seq2 = 'gatggct'
a = align(seq1, seq2)
print(a)
if __name__ == '__main__':
main()