-
Notifications
You must be signed in to change notification settings - Fork 0
/
breaks.py
138 lines (124 loc) · 4.81 KB
/
breaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import argparse
import pysam
from Bio import SeqIO
def read_regions(regionfile):
'''
read in regions file as csv
return list of regions [tig, start, end]
'''
with open(regionfile) as f:
content=f.read().split('\n')
f.close()
regions=[]
for i in content:
if len(i)>0:
info=i.split(',')
regions.append([info[1], int(info[2]), int(info[3])])
return regions
def filter_regions(regions, asmdict, end=0):
'''
filter list of lists regions by end
return list of lists that still should be breakpoints
'''
keepregions=[]
for i in regions:
##end filtering: keep region for later deletion if any part is inside the end threshold
lower=0+end
upper=len(asmdict[i[0]])-end
if i[2] > lower and i[1] <= upper:
keepregions.append(i)
return keepregions
def align_span(regions, npbam, minrange=5):
'''
take in list of lists regions and relevant bam
return list of lists regions that includes number of aligns that spans each region
'''
spanregions=[]
for i in regions:
tiglen=npbam.lengths[npbam.references.index(i[0])]
numspans=0
if i[1]-minrange < 0 :
lower=0
else:
lower=i[1]-minrange
if i[2]+minrange > tiglen :
upper=tiglen
else:
upper=i[2]+minrange
##npbam.count() would only tell me reads that touch the region
for read in npbam.fetch(i[0], lower, upper):
start=read.query_alignment_start
end=read.query_alignment_end
alignlower=min(start, end)
alignupper=max(start, end)
if alignlower < lower and alignupper > upper:
numspans+=1
span=[x for x in i]
span.append(numspans)
spanregions.append(span)
return spanregions
def break_tigs(asmdict, spanregions, thresh=1):
'''
take fastadict and regions list of lists [tig, start, end, npnone, illnone]
return fastadict of broken contigs
'''
newseqs={}
for tig in asmdict:
points=[]
for region in spanregions:
if region[0] == tig and region[3]+region[4] < thresh:
points.extend(region[1:3])
if len(points) > 0:
points.sort()
seg_count=0
if points[0]!=1:
name=tig+'_break'+str(seg_count)
newseqs[name]=asmdict[tig][0:points[0]]
newseqs[name].id=name
newseqs[name].name=name
newseqs[name].description=name
seg_count+=1
del points[0]
while len(points)>1:
name=tig+'_break'+str(seg_count)
newseqs[name]=asmdict[tig][points[0]:points[1]]
newseqs[name].id=name
newseqs[name].name=name
newseqs[name].description=name
seg_count+=1
del points[0:2]
if points[0]!=len(asmdict[tig]):
name=tig+'_break'+str(seg_count)
newseqs[name]=asmdict[tig][points[0]:len(asmdict[tig])]
newseqs[name].id=name
newseqs[name].name=name
newseqs[name].description=name
else:
newseqs[tig]=asmdict[tig]
return newseqs
def main(asmfile, npbamfile, illbamfile, regionfile, outfile):
'''
take a fasta and list of zero coverage [tig, start, end]
return seqio records of broken contigs
option to disregard ends of contigs
'''
regions=read_regions(regionfile)
asmdict=SeqIO.to_dict(SeqIO.parse(asmfile, 'fasta'))
keepregions=filter_regions(regions, asmdict, end=100)
npbam=pysam.AlignmentFile(npbamfile, 'rb')
npregions=align_span(keepregions, npbam)
illbam=pysam.AlignmentFile(illbamfile, 'rb')
illregions=align_span(npregions, illbam)
newseqs=break_tigs(asmdict, illregions, thresh=1)
with open(outfile, 'w') as f:
SeqIO.write(newseqs.values(), f, 'fasta')
if __name__ == '__main__':
import argparse
parser=argparse.ArgumentParser(description = 'Break asm contigs based on finding spots of 0 coverage and no spanning reads')
parser.add_argument('--asmfile', '-a', type=str, required=True, help='asm to be broken')
parser.add_argument('--npbamfile', '-n', type=str, required=True, help='nanopore reads aligned to asm')
parser.add_argument('--illbamfile', '-i', type=str, required=True, help='illumina reads aligned to asm')
parser.add_argument('--regionfile', '-r', type=str, required=True, help='zero cov regions as generated by the R script')
parser.add_argument('--outfile', '-o', type=str, required=True, help='output path')
args=parser.parse_args()
main(args.asmfile, args.npbamfile, args.illbamfile, args.regionfile, args.outfile)