Skip to content

Commit

Permalink
alternate alignments
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiasrausch committed Aug 23, 2024
1 parent 2cc3dcc commit d232cfe
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 632 deletions.
15 changes: 0 additions & 15 deletions src/delly.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@
#include "tegua.h"
#include "coral.h"
#include "asmode.h"
#include "dpe.h"
#include "pangenome.h"
#include "chimera.h"

using namespace torali;

Expand All @@ -41,9 +38,6 @@ displayUsage() {
std::cerr << "Long-read SV calling:" << std::endl;
std::cerr << " lr long-read SV discovery" << std::endl;
std::cerr << std::endl;
//std::cerr << "Pan-genome based SV calling (work-in-progress):" << std::endl;
//std::cerr << " pg pan-genome SV discovery" << std::endl;
//std::cerr << std::endl;
//std::cerr << "Assembly-based SV calling (work-in-progress):" << std::endl;
//std::cerr << " asm assembly SV site discovery" << std::endl;
//std::cerr << std::endl;
Expand Down Expand Up @@ -92,15 +86,6 @@ int main(int argc, char **argv) {
else if ((std::string(argv[1]) == "asm")) {
return asmode(argc-1,argv+1);
}
else if ((std::string(argv[1]) == "pg")) {
return pg(argc-1,argv+1);
}
else if ((std::string(argv[1]) == "dpe")) {
return dpe(argc-1,argv+1);
}
else if ((std::string(argv[1]) == "chimera")) {
return chimera(argc-1,argv+1);
}
else if ((std::string(argv[1]) == "cnv")) {
return coral(argc-1,argv+1);
}
Expand Down
4 changes: 2 additions & 2 deletions src/gfa.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ namespace torali
gfaFile.close();

// Graph statistics
std::cerr << "Parsed: " << g.offset.size() << " segments, " << g.links.size() << " links" << std::endl;
std::cerr << "Total sequence size: " << g.sequence.size() << std::endl;
boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
std::cerr << '[' << boost::posix_time::to_simple_string(now) << "] " << "GFA loaded: " << g.offset.size() << " segments, " << g.links.size() << " links, seq.size: " << g.sequence.size() << std::endl;

return true;
}
Expand Down
128 changes: 81 additions & 47 deletions src/junction.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,47 +14,11 @@

#include "util.h"
#include "assemble.h"
#include "pangenome.h"

namespace torali
{

struct SRBamRecord {
int32_t chr;
int32_t pos;
int32_t chr2;
int32_t pos2;
int32_t rstart;
int32_t sstart;
int32_t qual;
int32_t inslen;
int32_t svid;
std::size_t id;

SRBamRecord(int32_t const c, int32_t const p, int32_t const c2, int32_t const p2, int32_t const rst, int32_t const sst, int32_t const qval, int32_t const il, std::size_t const idval) : chr(c), pos(p), chr2(c2), pos2(p2), rstart(rst), sstart(sst), qual(qval), inslen(il), svid(-1), id(idval) {}
};

template<typename TSRBamRecord>
struct SortSRBamRecord : public std::binary_function<TSRBamRecord, TSRBamRecord, bool>
{
inline bool operator()(TSRBamRecord const& sv1, TSRBamRecord const& sv2) {
return ((sv1.chr<sv2.chr) || ((sv1.chr==sv2.chr) && (sv1.pos<sv2.pos)) || ((sv1.chr==sv2.chr) && (sv1.pos==sv2.pos) && (sv1.chr2<sv2.chr2)) || ((sv1.chr==sv2.chr) && (sv1.pos==sv2.pos) && (sv1.chr2==sv2.chr2) && (sv1.pos2 < sv2.pos2)));
}
};


struct Junction {
bool forward;
bool scleft;
int32_t refidx;
int32_t rstart;
int32_t refpos;
int32_t seqpos;
uint16_t qual;

Junction(bool const fw, bool const cl, int32_t const idx, int32_t const rst, int32_t const r, int32_t const s, uint16_t const qval) : forward(fw), scleft(cl), refidx(idx), rstart(rst), refpos(r), seqpos(s), qual(qval) {}
};


template<typename TReadBp>
inline void
_insertJunction(TReadBp& readBp, std::size_t const seed, bam1_t* rec, int32_t const rp, int32_t const sp, bool const scleft) {
Expand All @@ -76,15 +40,6 @@ namespace torali
}
}

template<typename TJunction>
struct SortJunction : public std::binary_function<TJunction, TJunction, bool>
{
inline bool operator()(TJunction const& j1, TJunction const& j2) {
return ((j1.seqpos<j2.seqpos) || ((j1.seqpos==j2.seqpos) && (j1.refidx<j2.refidx)) || ((j1.seqpos==j2.seqpos) && (j1.refidx==j2.refidx) && (j1.refpos<j2.refpos)) || ((j1.seqpos==j2.seqpos) && (j1.refidx==j2.refidx) && (j1.refpos==j2.refpos) && (j1.scleft < j2.scleft)));
}
};


inline int32_t
_selectReadStart(std::vector<Junction> const& jcvec) {
for(uint32_t i = 0; i < jcvec.size(); ++i) {
Expand Down Expand Up @@ -524,8 +479,87 @@ namespace torali
typedef std::vector<TSRBamRecord> TSvtSRBamRecord;
TSvtSRBamRecord srBR(2 * DELLY_SVT_TRANS, TSRBamRecord());
_findSRBreakpoints(c, validRegions, srBR);


// Alternate alignments
if (c.hasAltFile) {
std::set<std::size_t> validSR; // Set of split-reads in ALL alternate alignments
// Parse alternate alignments
std::vector<boost::filesystem::path> align;
std::vector<boost::filesystem::path> genome;
_alternateAlignments(c, align, genome);
for(uint32_t i = 0; i < align.size(); ++i) {
std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "Parsing alternate alignment " << align[i] << std::endl;

TSvtSRBamRecord altSR(2 * DELLY_SVT_TRANS, TSRBamRecord());
TConfig altConfig(c);
altConfig.genome = genome[i];
altConfig.files.clear();
altConfig.files.push_back(align[i]);
if (isBamCram(align[i].string())) {
// Alternate alignment in BAM/CRAM format
samFile* samfile = sam_open(c.files[0].string().c_str(), "r");
bam_hdr_t* hdr = sam_hdr_read(samfile);
TValidRegions vR;
_parseExcludeIntervals(altConfig, hdr, vR);
bam_hdr_destroy(hdr);
sam_close(samfile);
_findSRBreakpoints(altConfig, vR, altSR);

// Debug
//std::cerr << "AltAlign: " << align[i] << std::endl;
//outputSRBamRecords(altConfig, altSR, true);
} else {
Graph g;
parseGfa(altConfig, g);
altConfig.nchr = g.smap.size();

// Alternate alignment in GAF format
_findGraphSRBreakpoints(altConfig, g, altSR);

// Debug
//std::cerr << "AltAlign: " << align[i] << std::endl;
//outputGraphSRBamRecords(altConfig, g, altSR);
}

// Update valid SR set
std::set<std::size_t> newValidSR;
for(uint32_t svt = 0; svt < altSR.size(); ++svt) {
if (altSR[svt].empty()) continue;
for(uint32_t i = 0; i < altSR[svt].size(); ++i) {
if (validSR.empty()) newValidSR.insert(altSR[svt][i].id);
else {
if (validSR.find(altSR[svt][i].id) != validSR.end()) newValidSR.insert(altSR[svt][i].id);
}
}
}
validSR = newValidSR;

// Debug
//std::cerr << validSR.size() << std::endl;
}

// Filter primary SR records
uint32_t origSRsize = 0;
uint32_t newSRsize = 0;
TSvtSRBamRecord copySR(2 * DELLY_SVT_TRANS, TSRBamRecord());
for(uint32_t svt = 0; svt < srBR.size(); ++svt) {
origSRsize += srBR[svt].size();
copySR[svt] = srBR[svt];
srBR[svt].clear();
for(uint32_t i = 0; i < copySR[svt].size(); ++i) {
if (validSR.find(copySR[svt][i].id) != validSR.end()) srBR[svt].push_back(copySR[svt][i]);
}
newSRsize += srBR[svt].size();
}
uint32_t filtSR = origSRsize - newSRsize;
double filtRatio = 0;
if (origSRsize) filtRatio = (double) filtSR / (double) origSRsize;
std::cerr << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "Filtered " << filtSR << " out of " << origSRsize << " split-read records (" << (filtRatio * 100) << "%)" << std::endl;
}


// Debug
//std::cerr << "MainAlign: " << std::endl;
//outputSRBamRecords(c, srBR, true);

// Cluster BAM records
Expand Down
Loading

0 comments on commit d232cfe

Please sign in to comment.