-
Notifications
You must be signed in to change notification settings - Fork 0
/
debruijn_hypergraph.hpp
143 lines (125 loc) · 5.8 KB
/
debruijn_hypergraph.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#pragma once
#ifndef _DEBRUIJN_HYPERGRAPH_H
#define _DEBRUIJN_HYPERGRAPH_H
#include <vector>
#include <algorithm>
#include <boost/optional.hpp>
//#include <sdsl/wt_algorithm.hpp>
#include "wt_algorithm.hpp"
#include "debruijn_graph.hpp"
using namespace boost;
template <class t_debruijn_graph = debruijn_graph<>,
class t_lcs_vector = wt_int<rrr_vector<63>> >
class debruijn_hypergraph {
// Nodes are now ranges of edges (could be done for standard de bruijn graph too...
// would save time by saving state?)
// A node is a contiguous range of edges, a hypernode is a contiguous range of nodes (hence a range of edges)
// A hypernode then replaces the defn of node without loss of generality
public:
typedef size_t edge_type;
typedef tuple<edge_type, edge_type, size_t> node_type;
//typedef optional<node_type> hypernode_type; // optional return type
typedef typename t_debruijn_graph::symbol_type symbol_type;
const t_debruijn_graph & m_dbg;
const t_lcs_vector m_lcs;
debruijn_hypergraph(const t_debruijn_graph & dbg, const t_lcs_vector & lcs) : m_dbg(dbg), m_lcs(lcs) {}
// shorter(v, k) - returns the hypernode whose label is the last k characters of v's label (reduce context)
node_type shorter(const node_type & v, size_t k) const {
if (k < 1) return node_type(0, m_dbg.num_edges()-1, k);
// search backward on WT to find the first occurence of a number less than k
size_t i = get<0>(v);
size_t j = get<1>(v);
// find largest i' <= i with L*[i' - 1] < k
size_t i_prime = prev_lte(m_lcs, i+1, k-1)-1;
// find smallest j' >= j with L*[j'] < k
size_t j_prime = next_lte(m_lcs, j+2, k-1)-2;
return node_type(i_prime, std::max(i_prime, j_prime), k);
}
// longer(v, k) - list nodes (new "node") whose labels have length k <= K and end with v's label
// could make iterator instead
vector<node_type> longer(const node_type & v, size_t k) const {
vector<node_type> longer_nodes;
size_t i = get<0>(v);
size_t j = get<1>(v);
auto starts = range_lte(m_lcs, i, j, k-1);
// add code to add first edge?
if (0 == starts.size()) {
longer_nodes.push_back(node_type(get<0>(v), get<1>(v), k));
return longer_nodes;
}
for (size_t idx = 1; idx < starts.size()-1; idx++) {
size_t start = starts[idx-1];
size_t end = starts[idx]-1;
longer_nodes.push_back(node_type(start, end, k));
}
size_t last = starts[starts.size()-1];
longer_nodes.push_back(node_type(last-1, j, k));
return longer_nodes;
}
symbol_type lastchar(const node_type& v) const {
return m_dbg._symbol_access(get<0>(v));
}
optional<node_type> outgoing(const node_type & v, symbol_type x) const {
assert(x < m_dbg.sigma + 1);
if (x == 0) return optional<node_type>();
auto max = maxlen(v, x);
if (!max) return optional<node_type>();
ssize_t i = m_dbg._outgoing_edge_pair(get<0>(*max), get<1>(*max), x);
assert (i != -1); // we do this check in maxlen
size_t j = m_dbg._last_edge_of_node(m_dbg._edge_to_node(i));
auto result = shorter(node_type(i,j,m_dbg.k-1), get<2>(v));
return optional<node_type>(result);
}
// convert top level debruijn graph node to this type
node_type get_node(size_t v) const {
auto r = m_dbg._node_range(v);
return node_type(get<0>(r), get<1>(r), m_dbg.k-1);
}
vector<node_type> backward(const node_type & v) const {
// This could be done lazily, and searched over...
//if (get<2>(v) == m_dbg.k-1) {} // do standard version - not needed?
auto l = longer(v, get<2>(v)+1);
// map maxlen to each element in l
transform(l.begin(), l.end(), l.begin(), [&](const node_type & u){ return maxlen(u); } );
// map standard dbg backward to each element
transform(l.begin(), l.end(), l.begin(), [&](const node_type & u){
size_t start = m_dbg._backward(get<0>(u));
size_t end = m_dbg._last_edge_of_node(m_dbg._edge_to_node(start));
return node_type(start, end, get<2>(u));
});
// map shorter to each element
transform(l.begin(), l.end(), l.begin(), [&](const node_type & u){ return shorter(u, get<2>(v)); });
return l;
}
// maxlen(v, x) - returns some node in the *original* (kmax) graph whose label ends with v's
// label, and that has an outgoing edge labelled x, or NULL otherwise
// should return dbg::node_type
node_type maxlen(const node_type & v) const {
size_t start = get<0>(v); // have to update this to select to the next node
// Range_start must also be a start of a node at the top level context, so find the next top level node to find the range
// TODO: make (or check if it exists) uniform interface for rank/select, so I can easily make next(), prev(), etc
// could easily scan as well - might be faster (but its already super fast... only do this if maxlen is called repeatedly)
size_t end = m_dbg._last_edge_of_node(m_dbg._edge_to_node(start));
return node_type(start, end, m_dbg.k-1);
}
optional<node_type> maxlen(const node_type & v, const symbol_type x) const {
assert(x < m_dbg.sigma + 1);
// For both flagged and nonflagged symbol in W
for (symbol_type flag : {0,1}) {
symbol_type x_with_flag = m_dbg._with_edge_flag(x, flag);
size_t prev_count = m_dbg.m_edges.rank(get<0>(v)+1, x_with_flag);
size_t most_recent = m_dbg.m_edges.select(prev_count, x_with_flag);
// check if edge falls outside our range...
if (get<0>(v) <= most_recent && most_recent <= get<1>(v)) {
// Find node range
size_t node_rank = m_dbg._edge_to_node(most_recent);
auto n_range = m_dbg._node_range(node_rank);
return optional<node_type>(node_type(get<0>(n_range), get<1>(n_range), m_dbg.k-1));
}
}
return optional<node_type>();
}
// function to get standard node by rank (then use shorter)
// fwd, back, lastchar
};
#endif