-
Notifications
You must be signed in to change notification settings - Fork 0
/
find-2s-400NT-seqs.py
105 lines (89 loc) · 2.5 KB
/
find-2s-400NT-seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
### script to remove clusters with only one species and sequences less than 400NT
### this section removes sequences that are less than 400NT
### first open original file and loop through, only write long enough sequences to the new file
fout = open("new-length-400NT.fa.clstr","w")
with open("reduced-all.fa.clstr","r") as file:
for line in file:
if line.startswith(">"):
fout.write(line)
continue
else:
right = line.split("\t")[1]
size = right.split("n")[0]
size = int(size)
if size >= 400:
fout.write(line)
continue
else:
continue
fout.close()
### this section removes clusters with only one species
### first find the number of lines in the current file and open new file
fout2 = open("new-both-2s-400NT.fa.clstr","w")
with open("new-length-400NT.fa.clstr","r") as fh:
length = 0
for i in fh:
length += 1
### then loop through the current file and remove the entries that only have one species
with open("new-length-400NT.fa.clstr","r") as fh1:
line = fh1.readline()
lines = 1
while lines <= length:
if line.startswith(">"):
count = 0
while count < 25:
nextline = fh1.readline()
lines += 1
if nextline == "":
if count > 1:
fout2.write(line)
count = 0
break
elif not nextline.startswith(">"):
count += 1
line += nextline
elif nextline.startswith(">"):
if count > 1:
fout2.write(line)
line = nextline
count = 0
break
fout2.close()
### this section will find the corresponding sequences for each of the kept cluster seqs: need this for transdecoder
### first open the reduced file and keep the headers in a list to use in the next step
fout3 = open("new-cluster-seqs-GA08.fasta","w")
headers = []
with open("new-both-2s-400NT.fa.clstr","r") as fh:
for l in fh:
if l.startswith(">"):
continue
else:
l = l.split(", ")[1]
l = l.split(".")[0]
headers.append(l)
### then write all the sequences that are in the chose clusters into the new sequences file
with open("all-species.fa","r") as file:
match = False
wholeLine = ""
for line in file:
if line.startswith(">"):
if match == True:
fout3.write(wholeLine)
wholeLine = ""
if match == False:
wholeLine = ""
header = line.split(" ")[0]
if header in headers:
wholeLine += line
match = True
elif header not in headers:
match = False
elif line == "":
if match == True:
fout3.write(wholeLine)
wholeLine = ""
else:
wholeLine += line
if match == True:
fout3.write(wholeLine)
fout3.close()