forked from BenLangmead/bowtie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ref_read.cpp
298 lines (289 loc) · 8.46 KB
/
ref_read.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#include "ref_read.h"
/**
* Reads past the next ambiguous or unambiguous stretch of sequence
* from the given FASTA file and returns its length. Does not do
* anything with the sequence characters themselves; this is purely for
* measuring lengths.
*/
RefRecord fastaRefReadSize(FileBuf& in,
const RefReadInParams& rparms,
bool first,
BitpairOutFileBuf* bpout)
{
int c;
static int lastc = '>'; // last character seen
// RefRecord params
TIndexOffU len = 0; // 'len' counts toward total length
// 'off' counts number of ambiguous characters before first
// unambiguous character
size_t off = 0;
// Pick off the first carat and any preceding whitespace
if(first) {
assert(!in.eof());
lastc = '>';
c = in.getPastWhitespace();
if(in.eof()) {
// Got eof right away; emit warning
cerr << "Warning: Empty input file" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
assert(c == '>');
}
first = true;
// Skip to the end of the id line; if the next line is either
// another id line or a comment line, keep skipping
if(lastc == '>') {
// Skip to the end of the name line
do {
if((c = in.getPastNewline()) == -1) {
// No more input
cerr << "Warning: Encountered empty reference sequence" << endl;
lastc = -1;
return RefRecord(0, 0, true);
}
if(c == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
// continue until a non-name, non-comment line
} while (c == '>');
} else {
first = false; // not the first in a sequence
off = 1; // The gap has already been consumed, so count it
if((c = in.get()) == -1) {
// Don't emit a warning, since this might legitimately be
// a gap on the end of the final sequence in the file
lastc = -1;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
}
// Now skip to the first DNA character, counting gap characters
// as we go
int lc = -1; // last-DNA char variable for color conversion
while(true) {
int cat = dna4Cat[c];
if(rparms.nsToAs && cat == 2) c = 'A';
if(cat == 1) {
// This is a DNA character
if(rparms.color) {
if(lc != -1) {
// Got two consecutive unambiguous DNAs
break; // to read-in loop
}
// Keep going; we need two consecutive unambiguous DNAs
lc = charToDna5[(int)c];
// The 'if(off > 0)' takes care of the case where
// the reference is entirely unambiguous and we don't
// want to incorrectly increment off.
if(off > 0) off++;
} else {
break; // to read-in loop
}
} else if(cat == 2) {
if(lc != -1 && off == 0) off++;
lc = -1;
off++; // skip over gap character and increment
} else if(c == '>') {
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = '>';
return RefRecord((TIndexOffU)off, 0, first);
}
c = in.get();
if(c == -1) {
// End-of-file
if(off > 0 && lastc == '>') {
cerr << "Warning: Encountered reference sequence with only gaps" << endl;
} else if(lastc == '>') {
cerr << "Warning: Encountered empty reference sequence" << endl;
}
lastc = -1;
return RefRecord((TIndexOffU)off, 0, first);
}
}
assert(!rparms.color || (lc != -1));
assert_eq(1, dna4Cat[c]); // C must be unambiguous base
if(off > 0 && rparms.color && first) {
// Handle the case where the first record has ambiguous
// characters but we're in color space; one of those counts is
// spurious
off--;
}
// in now points just past the first character of a sequence
// line, and c holds the first character
while(c != -1 && c != '>') {
if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
uint8_t cat = dna4Cat[c];
int cc = toupper(c);
if(rparms.bisulfite && cc == 'C') c = cc = 'T';
if(cat == 1) {
// It's a DNA character
assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
// Consume it
len++;
// Output it
if(bpout != NULL) {
if(rparms.color) {
// output color
bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
} else if(!rparms.color) {
// output nucleotide
bpout->write(charToDna5[c]);
}
}
lc = charToDna5[(int)c];
} else if(cat == 2) {
// It's an N or a gap
lastc = c;
assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
} else {
// Not DNA and not a gap, ignore it
#ifndef NDEBUG
if(!isspace(c)) {
cerr << "Unexpected character in sequence: ";
if(isprint(c)) {
cerr << ((char)c) << endl;
} else {
cerr << "(" << c << ")" << endl;
}
}
#endif
}
c = in.get();
}
lastc = c;
return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}
static void
printRecords(ostream& os, const vector<RefRecord>& l) {
for(size_t i = 0; i < l.size(); i++) {
os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
}
}
/**
* Reverse the 'src' list of RefRecords into the 'dst' list. Don't
* modify 'src'.
*/
void reverseRefRecords(const vector<RefRecord>& src,
vector<RefRecord>& dst,
bool recursive,
bool verbose)
{
dst.clear();
{
vector<RefRecord> cur;
for(int64_t i = (int64_t)src.size()-1; i >= 0; i--) {
bool first = (i == (int)src.size()-1 || src[i+1].first);
if(src[i].len) {
cur.push_back(RefRecord(0, src[i].len, first));
first = false;
}
if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
}
for(int64_t i = 0; i < (int64_t)cur.size(); i++) {
assert(cur[i].off == 0 || cur[i].len == 0);
if(i < (int64_t)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
i++;
} else {
dst.push_back(cur[i]);
}
}
}
if(verbose) {
cout << "Source: " << endl;
printRecords(cout, src);
cout << "Dest: " << endl;
printRecords(cout, dst);
}
#ifndef NDEBUG
if(!recursive) {
vector<RefRecord> tmp;
reverseRefRecords(dst, tmp, true);
assert_eq(tmp.size(), src.size());
for(size_t i = 0; i < src.size(); i++) {
assert_eq(src[i].len, tmp[i].len);
assert_eq(src[i].off, tmp[i].off);
assert_eq(src[i].first, tmp[i].first);
}
}
#endif
}
/**
* Calculate a vector containing the sizes of all of the patterns in
* all of the given input files, in order. Returns the total size of
* all references combined. Rewinds each istream before returning.
*/
std::pair<size_t, size_t>
fastaRefReadSizes(vector<FileBuf*>& in,
vector<RefRecord>& recs,
vector<uint32_t>& plens,
const RefReadInParams& rparms,
BitpairOutFileBuf* bpout,
TIndexOff& numSeqs)
{
TIndexOffU unambigTot = 0;
size_t bothTot = 0;
assert_gt(in.size(), 0);
uint32_t both = 0, unambig = 0;
// For each input istream
for(size_t i = 0; i < in.size(); i++) {
bool first = true;
assert(!in[i]->eof());
// For each pattern in this istream
while(!in[i]->eof()) {
RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
// Update plens
if(rec.first) {
if(unambig > 0) {
plens.push_back(both);
}
both = 0;
unambig = 0;
}
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
if(rec.len == 0) rec.first = false;
#endif
if((unambigTot + rec.len) < unambigTot) {
#ifdef BOWTIE_64BIT_INDEX
cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl
<< "reference into smaller chunks and index each independently." << endl;
#else
cerr << "Error: Reference sequence has more than 2^32-1 characters! Please try to" << endl
<< "build a large index instead using the appropiate options." << endl;
#endif
throw 1;
}
// Add the length of this record.
if(rec.first) numSeqs++;
unambigTot += rec.len; unambig += rec.len;
bothTot += rec.len; both += rec.len;
bothTot += rec.off; both += rec.off;
first = false;
if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
recs.push_back(rec);
}
// Reset the input stream
in[i]->reset();
assert(!in[i]->eof());
#ifndef NDEBUG
// Check that it's really reset
int c = in[i]->get();
assert_eq('>', c);
in[i]->reset();
assert(!in[i]->eof());
#endif
}
assert_geq(bothTot, 0);
assert_geq(unambigTot, 0);
if(unambig > 0) {
plens.push_back(both);
}
return make_pair(
unambigTot, // total number of unambiguous DNA characters read
bothTot); // total number of DNA characters read, incl. ambiguous ones
}