Skip to content

Commit

Permalink
Merge pull request #438 from ksahlin/details
Browse files Browse the repository at this point in the history
Add some diagnostics
  • Loading branch information
marcelm authored Oct 1, 2024
2 parents a01bac4 + 4e7ed23 commit 63e2715
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 83 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ When `--details` is provided, the following additional SAM tags are output for
mapped reads.

`na`: Number of NAMs (seeds) found
`nr`: Whether NAM rescue was needed (1 if yes, 0 if not)
`nr`: The number of NAMs found during NAM rescue or -1 if no rescue was attempted
`mr`: Number of times mate rescue was attempted (local alignment of a mate to
the expected region given its mate)
`al`: Number of times an attempt was made to extend a seed (by gapped or ungapped alignment)
Expand Down
20 changes: 14 additions & 6 deletions src/aln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1040,18 +1040,22 @@ void align_or_map_paired(

// Find NAMs
Timer nam_timer;
auto [nonrepetitive_fraction, nams] = find_nams(query_randstrobes, index);
auto [nonrepetitive_fraction, n_hits, nams] = find_nams(query_randstrobes, index);
statistics.tot_find_nams += nam_timer.duration();
statistics.n_hits += n_hits;
details[is_revcomp].nams = nams.size();

if (map_param.rescue_level > 1) {
Timer rescue_timer;
if (nams.empty() || nonrepetitive_fraction < 0.7) {
nams = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff);
int n_rescue_hits;
std::tie(n_rescue_hits, nams) = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff);
details[is_revcomp].nam_rescue = true;
details[is_revcomp].rescue_nams = nams.size();
statistics.n_rescue_hits += n_rescue_hits;
}
statistics.tot_time_rescue += rescue_timer.duration();
}
details[is_revcomp].nams = nams.size();
Timer nam_sort_timer;
std::sort(nams.begin(), nams.end(), by_score<Nam>);
shuffle_top_nams(nams, random_engine);
Expand Down Expand Up @@ -1179,18 +1183,22 @@ void align_or_map_single(

// Find NAMs
Timer nam_timer;
auto [nonrepetitive_fraction, nams] = find_nams(query_randstrobes, index);
auto [nonrepetitive_fraction, n_hits, nams] = find_nams(query_randstrobes, index);
statistics.tot_find_nams += nam_timer.duration();
statistics.n_hits += n_hits;
details.nams = nams.size();

if (map_param.rescue_level > 1) {
Timer rescue_timer;
if (nams.empty() || nonrepetitive_fraction < 0.7) {
int n_rescue_hits;
std::tie(n_rescue_hits, nams) = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff);
statistics.n_rescue_hits += n_rescue_hits;
details.rescue_nams = nams.size();
details.nam_rescue = true;
nams = find_nams_rescue(query_randstrobes, index, map_param.rescue_cutoff);
}
statistics.tot_time_rescue += rescue_timer.duration();
}
details.nams = nams.size();

Timer nam_sort_timer;
std::sort(nams.begin(), nams.end(), by_score<Nam>);
Expand Down
45 changes: 1 addition & 44 deletions src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,51 +10,8 @@
#include "sam.hpp"
#include "aligner.hpp"
#include "insertsizedistribution.hpp"
#include "statistics.hpp"

struct AlignmentStatistics {
std::chrono::duration<double> tot_read_file{0};
std::chrono::duration<double> tot_construct_strobemers{0};
std::chrono::duration<double> tot_find_nams{0};
std::chrono::duration<double> tot_time_rescue{0};
std::chrono::duration<double> tot_find_nams_alt{0};
std::chrono::duration<double> tot_sort_nams{0};
std::chrono::duration<double> tot_extend{0};
std::chrono::duration<double> tot_write_file{0};

uint64_t n_reads{0};
uint64_t tot_aligner_calls{0};
uint64_t tot_rescued{0};
uint64_t tot_all_tried{0};
uint64_t inconsistent_nams{0};
uint64_t nam_rescue{0};

AlignmentStatistics operator+=(const AlignmentStatistics& other) {
this->tot_read_file += other.tot_read_file;
this->tot_construct_strobemers += other.tot_construct_strobemers;
this->tot_find_nams += other.tot_find_nams;
this->tot_time_rescue += other.tot_time_rescue;
this->tot_find_nams_alt += other.tot_find_nams_alt;
this->tot_sort_nams += other.tot_sort_nams;
this->tot_extend += other.tot_extend;
this->tot_write_file += other.tot_write_file;
this->n_reads += other.n_reads;
this->tot_aligner_calls += other.tot_aligner_calls;
this->tot_rescued += other.tot_rescued;
this->tot_all_tried += other.tot_all_tried;
this->inconsistent_nams += other.inconsistent_nams;
this->nam_rescue += other.nam_rescue;
return *this;
}

AlignmentStatistics operator+=(const Details& details) {
this->nam_rescue += details.nam_rescue;
this->tot_rescued += details.mate_rescue;
this->tot_all_tried += details.tried_alignment;
this->inconsistent_nams += details.nam_inconsistent;

return *this;
}
};

enum class OutputFormat {
SAM,
Expand Down
13 changes: 12 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,18 @@ int run_strobealign(int argc, char **argv) {
output_abundance(out, abundances, references);
}

logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
logger.debug()
<< "Number of reads: " << tot_statistics.n_reads << std::endl
<< "Number of non-rescue hits: " << tot_statistics.n_hits
<< " total. Per read: " << static_cast<float>(tot_statistics.n_hits) / tot_statistics.n_reads << std::endl
<< "Number of non-rescue NAMs: " << tot_statistics.n_nams
<< " total. Per read: " << static_cast<float>(tot_statistics.n_nams) / tot_statistics.n_reads << std::endl
<< "Number of rescue hits: " << tot_statistics.n_rescue_hits
<< " total. Per rescue attempt: " << static_cast<float>(tot_statistics.n_rescue_hits) / tot_statistics.nam_rescue << std::endl
<< "Number of rescue NAMs: " << tot_statistics.n_rescue_nams
<< " total. Per rescue attempt: " << static_cast<float>(tot_statistics.n_rescue_nams) / tot_statistics.nam_rescue << std::endl;
logger.info()
<< "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
<< "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
<< "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl
<< "Tried NAM rescue: " << tot_statistics.nam_rescue << std::endl
Expand Down
14 changes: 9 additions & 5 deletions src/nam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,15 @@ std::vector<Nam> merge_hits_into_nams_forward_and_reverse(
*
* Return the fraction of nonrepetitive hits (those not above the filter_cutoff threshold)
*/
std::pair<float, std::vector<Nam>> find_nams(
std::tuple<float, int, std::vector<Nam>> find_nams(
const QueryRandstrobeVector &query_randstrobes,
const StrobemerIndex& index
) {
std::array<robin_hood::unordered_map<unsigned int, std::vector<Hit>>, 2> hits_per_ref;
hits_per_ref[0].reserve(100);
hits_per_ref[1].reserve(100);
int nr_good_hits = 0, total_hits = 0;
int nr_good_hits = 0;
int total_hits = 0;
for (const auto &q : query_randstrobes) {
size_t position = index.find(q.hash);
if (position != index.end()){
Expand All @@ -176,15 +177,16 @@ std::pair<float, std::vector<Nam>> find_nams(
}
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
auto nams = merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), false);
return make_pair(nonrepetitive_fraction, nams);
return {nonrepetitive_fraction, nr_good_hits, nams};
}

/*
* Find a query’s NAMs, using also some of the randstrobes that occur more often
* than filter_cutoff.
*
* Return the number of hits and the vector of NAMs.
*/
std::vector<Nam> find_nams_rescue(
std::pair<int, std::vector<Nam>> find_nams_rescue(
const QueryRandstrobeVector &query_randstrobes,
const StrobemerIndex& index,
unsigned int rescue_cutoff
Expand Down Expand Up @@ -224,6 +226,7 @@ std::vector<Nam> find_nams_rescue(

std::sort(hits_fw.begin(), hits_fw.end());
std::sort(hits_rc.begin(), hits_rc.end());
int n_hits = 0;
size_t is_revcomp = 0;
for (auto& rescue_hits : {hits_fw, hits_rc}) {
int cnt = 0;
Expand All @@ -233,11 +236,12 @@ std::vector<Nam> find_nams_rescue(
}
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
cnt++;
n_hits++;
}
is_revcomp++;
}

return merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true);
return {n_hits, merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true)};
}

std::ostream& operator<<(std::ostream& os, const Nam& n) {
Expand Down
4 changes: 2 additions & 2 deletions src/nam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ struct Nam {
}
};

std::pair<float, std::vector<Nam>> find_nams(
std::tuple<float, int, std::vector<Nam>> find_nams(
const QueryRandstrobeVector &query_randstrobes,
const StrobemerIndex& index
);

std::vector<Nam> find_nams_rescue(
std::pair<int, std::vector<Nam>> find_nams_rescue(
const QueryRandstrobeVector &query_randstrobes,
const StrobemerIndex& index,
unsigned int rescue_cutoff
Expand Down
3 changes: 2 additions & 1 deletion src/python/strobealign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ NB_MODULE(strobealign_extension, m_) {

m.def("randstrobes_query", &randstrobes_query);
m.def("find_nams", [](const QueryRandstrobeVector &query_randstrobes, const StrobemerIndex& index) -> std::vector<Nam> {
return find_nams(query_randstrobes, index).second;
auto [nonrepetitive_fraction, n_hits, nams] = find_nams(query_randstrobes, index);
return nams;
});
}
2 changes: 1 addition & 1 deletion src/sam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void Sam::append_rg() {
void Sam::append_details(const Details& details) {
std::stringstream s;
s << "\tna:i:" << details.nams
<< "\tnr:i:" << details.nam_rescue
<< "\tnr:i:" << (details.nam_rescue ? static_cast<int>(details.rescue_nams) : -1)
<< "\tal:i:" << details.tried_alignment
<< "\tga:i:" << details.gapped
<< "\tX0:i:" << details.best_alignments;
Expand Down
23 changes: 1 addition & 22 deletions src/sam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "kseq++/kseq++.hpp"
#include "refs.hpp"
#include "cigar.hpp"
#include "statistics.hpp"


struct Alignment {
Expand Down Expand Up @@ -43,28 +44,6 @@ enum struct CigarOps {
M = 1, // use M CIGAR operations
};

/* Details about aligning a single or paired-end read */
struct Details {
bool nam_rescue{false}; // find_nams_rescue() was needed
uint64_t nams{0}; // No. of NAMs found
uint64_t nam_inconsistent{0};
uint64_t mate_rescue{0}; // No. of times rescue by local alignment was attempted
uint64_t tried_alignment{0}; // No. of computed alignments (get_alignment or rescue_mate)
uint64_t gapped{0}; // No. of gapped alignments computed (in get_alignment)
uint64_t best_alignments{0}; // No. of best alignments with same score

Details& operator+=(const Details& other) {
nam_rescue = nam_rescue || other.nam_rescue;
nams += other.nams;
nam_inconsistent += other.nam_inconsistent;
mate_rescue += other.mate_rescue;
tried_alignment += other.tried_alignment;
gapped += other.gapped;
best_alignments += other.best_alignments;
return *this;
}
};

class Sam {

public:
Expand Down
83 changes: 83 additions & 0 deletions src/statistics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#ifndef STROBEALIGN_STATISTICS_HPP
#define STROBEALIGN_STATISTICS_HPP

/* Details about aligning a single or paired-end read */
struct Details {
bool nam_rescue{false}; // find_nams_rescue() was needed
uint32_t nams{0}; // No. of NAMs found
uint32_t rescue_nams{0}; // No. of NAMs found during rescue
uint32_t nam_inconsistent{0};
uint32_t mate_rescue{0}; // No. of times rescue by local alignment was attempted
uint32_t tried_alignment{0}; // No. of computed alignments (get_alignment or rescue_mate)
uint32_t gapped{0}; // No. of gapped alignments computed (in get_alignment)
uint32_t best_alignments{0}; // No. of best alignments with same score

Details& operator+=(const Details& other) {
nam_rescue = nam_rescue || other.nam_rescue;
nams += other.nams;
rescue_nams += other.rescue_nams;
nam_inconsistent += other.nam_inconsistent;
mate_rescue += other.mate_rescue;
tried_alignment += other.tried_alignment;
gapped += other.gapped;
best_alignments += other.best_alignments;
return *this;
}
};

struct AlignmentStatistics {
std::chrono::duration<double> tot_read_file{0};
std::chrono::duration<double> tot_construct_strobemers{0};
std::chrono::duration<double> tot_find_nams{0};
std::chrono::duration<double> tot_time_rescue{0};
std::chrono::duration<double> tot_find_nams_alt{0};
std::chrono::duration<double> tot_sort_nams{0};
std::chrono::duration<double> tot_extend{0};
std::chrono::duration<double> tot_write_file{0};

uint64_t n_reads{0};
uint64_t n_hits{0}; // non-rescue hits
uint64_t n_rescue_hits{0};
uint64_t n_nams{0};
uint64_t n_rescue_nams{0};
uint64_t tot_aligner_calls{0};
uint64_t tot_rescued{0};
uint64_t tot_all_tried{0};
uint64_t inconsistent_nams{0};
uint64_t nam_rescue{0};

AlignmentStatistics operator+=(const AlignmentStatistics& other) {
this->tot_read_file += other.tot_read_file;
this->tot_construct_strobemers += other.tot_construct_strobemers;
this->tot_find_nams += other.tot_find_nams;
this->tot_time_rescue += other.tot_time_rescue;
this->tot_find_nams_alt += other.tot_find_nams_alt;
this->tot_sort_nams += other.tot_sort_nams;
this->tot_extend += other.tot_extend;
this->tot_write_file += other.tot_write_file;
this->n_reads += other.n_reads;
this->n_hits += other.n_hits;
this->n_rescue_hits += other.n_rescue_hits;
this->n_nams += other.n_nams;
this->n_rescue_nams += other.n_rescue_nams;
this->tot_aligner_calls += other.tot_aligner_calls;
this->tot_rescued += other.tot_rescued;
this->tot_all_tried += other.tot_all_tried;
this->inconsistent_nams += other.inconsistent_nams;
this->nam_rescue += other.nam_rescue;
return *this;
}

AlignmentStatistics operator+=(const Details& details) {
this->n_nams += details.nams;
this->n_rescue_nams += details.rescue_nams;
this->nam_rescue += details.nam_rescue;
this->tot_rescued += details.mate_rescue;
this->tot_all_tried += details.tried_alignment;
this->inconsistent_nams += details.nam_inconsistent;

return *this;
}
};

#endif

0 comments on commit 63e2715

Please sign in to comment.