Skip to content

Commit

Permalink
Rename Hit struct to Match
Browse files Browse the repository at this point in the history
We use the term "hit" for two different things:
- a query strobemer successfully looked up in the index
- a locus on the query paired with a locus on the reference (this type of
  hit is merged into NAMs)

I think naming the latter "match" makes sense because the M in NAM stands
for "match". We can then say that we merge multiple overlapping matches
into non-overlapping, approximate matches (= NAMs).

The flow would then be: hit -> match -> NAM.
  • Loading branch information
marcelm committed Oct 1, 2024
1 parent 63e2715 commit 7eccbe4
Showing 1 changed file with 24 additions and 24 deletions.
48 changes: 24 additions & 24 deletions src/nam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

namespace {

struct Hit {
struct Match {
int query_start;
int query_end;
int ref_start;
int ref_end;
};

inline void add_to_hits_per_ref(
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
inline void add_to_matches_map(
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
int query_start,
int query_end,
const StrobemerIndex& index,
Expand All @@ -22,22 +22,22 @@ inline void add_to_hits_per_ref(
int ref_end = ref_start + index.strobe2_offset(position) + index.k();
int diff = std::abs((query_end - query_start) - (ref_end - ref_start));
if (diff <= min_diff) {
hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end});
matches_map[index.reference_index(position)].push_back(Match{query_start, query_end, ref_start, ref_end});
min_diff = diff;
}
}
}

void merge_hits_into_nams(
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
void merge_matches_into_nams(
robin_hood::unordered_map<unsigned int, std::vector<Match>>& matches_map,
int k,
bool sort,
bool is_revcomp,
std::vector<Nam>& nams // inout
) {
for (auto &[ref_id, hits] : hits_per_ref) {
for (auto &[ref_id, matches] : matches_map) {
if (sort) {
std::sort(hits.begin(), hits.end(), [](const Hit& a, const Hit& b) -> bool {
std::sort(matches.begin(), matches.end(), [](const Match& a, const Match& b) -> bool {
// first sort on query starts, then on reference starts
return (a.query_start < b.query_start) || ( (a.query_start == b.query_start) && (a.ref_start < b.ref_start) );
}
Expand All @@ -46,7 +46,7 @@ void merge_hits_into_nams(

std::vector<Nam> open_nams;
int prev_q_start = 0;
for (auto &h : hits) {
for (auto &h : matches) {
bool is_added = false;
for (auto & o : open_nams) {

Expand Down Expand Up @@ -77,7 +77,7 @@ void merge_hits_into_nams(
}

}
// Add the hit to open matches
// Add to open matches
if (!is_added){
Nam n;
n.query_start = h.query_start;
Expand Down Expand Up @@ -134,15 +134,15 @@ void merge_hits_into_nams(
}
}

std::vector<Nam> merge_hits_into_nams_forward_and_reverse(
std::array<robin_hood::unordered_map<unsigned int, std::vector<Hit>>, 2>& hits_per_ref,
std::vector<Nam> merge_matches_into_nams_forward_and_reverse(
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2>& matches_map,
int k,
bool sort
) {
std::vector<Nam> nams;
for (size_t is_revcomp = 0; is_revcomp < 2; ++is_revcomp) {
auto& hits_oriented = hits_per_ref[is_revcomp];
merge_hits_into_nams(hits_oriented, k, sort, is_revcomp, nams);
auto& hits_oriented = matches_map[is_revcomp];
merge_matches_into_nams(hits_oriented, k, sort, is_revcomp, nams);
}
return nams;
}
Expand All @@ -159,9 +159,9 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
const QueryRandstrobeVector &query_randstrobes,
const StrobemerIndex& index
) {
std::array<robin_hood::unordered_map<unsigned int, std::vector<Hit>>, 2> hits_per_ref;
hits_per_ref[0].reserve(100);
hits_per_ref[1].reserve(100);
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
matches_map[0].reserve(100);
matches_map[1].reserve(100);
int nr_good_hits = 0;
int total_hits = 0;
for (const auto &q : query_randstrobes) {
Expand All @@ -172,11 +172,11 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
continue;
}
nr_good_hits++;
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
add_to_matches_map(matches_map[q.is_reverse], q.start, q.end, index, position);
}
}
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
auto nams = merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), false);
auto nams = merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), false);
return {nonrepetitive_fraction, nr_good_hits, nams};
}

Expand All @@ -203,11 +203,11 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
}
};

std::array<robin_hood::unordered_map<unsigned int, std::vector<Hit>>, 2> hits_per_ref;
std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
std::vector<RescueHit> hits_fw;
std::vector<RescueHit> hits_rc;
hits_per_ref[0].reserve(100);
hits_per_ref[1].reserve(100);
matches_map[0].reserve(100);
matches_map[1].reserve(100);
hits_fw.reserve(5000);
hits_rc.reserve(5000);

Expand All @@ -234,14 +234,14 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {
break;
}
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
add_to_matches_map(matches_map[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
cnt++;
n_hits++;
}
is_revcomp++;
}

return {n_hits, merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true)};
return {n_hits, merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), true)};
}

std::ostream& operator<<(std::ostream& os, const Nam& n) {
Expand Down

0 comments on commit 7eccbe4

Please sign in to comment.