-
-
Notifications
You must be signed in to change notification settings - Fork 7.3k
/
boyer_moore.cpp
272 lines (239 loc) · 8.43 KB
/
boyer_moore.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/**
* @file
* @brief
* The
* [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
* algorithm searches for occurrences of pattern P in text T by performing
* explicit character comparisons at different alignments. Instead of a
* brute-force search of all alignments (of which there are n - m + 1),
* Boyer–Moore uses information gained by preprocessing P to skip as many
* alignments as possible.
*
* @details
* The key insight in this algorithm is that if the end of the pattern is
* compared to the text, then jumps along the text can be made rather than
* checking every character of the text. The reason that this works is that in
* lining up the pattern against the text, the last character of the pattern is
* compared to the character in the text.
*
* If the characters do not match, there is no need to continue searching
* backwards along the text. This leaves us with two cases.
*
* Case 1:
* If the character in the text does not match any of the characters in the
* pattern, then the next character in the text to check is located m characters
* farther along the text, where m is the length of the pattern.
*
* Case 2:
* If the character in the text is in the pattern, then a partial shift of the
* pattern along the text is done to line up along the matching character and
* the process is repeated.
*
* There are two shift rules:
*
* [The bad character rule]
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule)
*
* [The good suffix rule]
* (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule)
*
* The shift rules are implemented as constant-time table lookups, using tables
* generated during the preprocessing of P.
* @author [Stoycho Kyosev](https://github.com/stoychoX)
*/
#include <cassert> /// for assert
#include <climits> /// for CHAR_MAX macro
#include <cstring> /// for strlen
#include <iostream> /// for IO operations
#include <string> /// for std::string
#include <vector> /// for std::vector
#define APLHABET_SIZE CHAR_MAX ///< number of symbols in the alphabet we use
/**
* @namespace
* @brief String algorithms
*/
namespace strings {
/**
* @namespace
* @brief Functions for the [Boyer
* Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm)
* algorithm implementation
*/
namespace boyer_moore {
/**
* @brief A structure representing all the data we need to search the
* preprocessed pattern in text.
*/
struct pattern {
std::string pat;
std::vector<size_t>
bad_char; ///< bad char table used in [Bad Character
///< Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/)
std::vector<size_t>
good_suffix; ///< good suffix table used for [Good Suffix
///< heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp)
};
/**
* @brief A function that preprocess the good suffix thable
*
* @param str The string being preprocessed
* @param arg The good suffix table
* @returns void
*/
void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
arg.resize(str.size() + 1, 0);
// border_pos[i] - the index of the longest proper suffix of str[i..] which
// is also a proper prefix.
std::vector<size_t> border_pos(str.size() + 1, 0);
size_t current_char = str.length();
size_t border_index = str.length() + 1;
border_pos[current_char] = border_index;
while (current_char > 0) {
while (border_index <= str.length() &&
str[current_char - 1] != str[border_index - 1]) {
if (arg[border_index] == 0) {
arg[border_index] = border_index - current_char;
}
border_index = border_pos[border_index];
}
current_char--;
border_index--;
border_pos[current_char] = border_index;
}
size_t largest_border_index = border_pos[0];
for (size_t i = 0; i < str.size(); i++) {
if (arg[i] == 0) {
arg[i] = largest_border_index;
}
// If we go pass the largest border we find the next one as we iterate
if (i == largest_border_index) {
largest_border_index = border_pos[largest_border_index];
}
}
}
/**
* @brief A function that preprocess the bad char table
*
* @param str The string being preprocessed
* @param arg The bad char table
* @returns void
*/
void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
arg.resize(APLHABET_SIZE, str.length());
for (size_t i = 0; i < str.length(); i++) {
arg[str[i]] = str.length() - i - 1;
}
}
/**
* @brief A function that initializes pattern
*
* @param str Text used for initialization
* @param arg Initialized structure
* @returns void
*/
void init_pattern(const std::string& str, pattern& arg) {
arg.pat = str;
init_bad_char(str, arg.bad_char);
init_good_suffix(str, arg.good_suffix);
}
/**
* @brief A function that implements Boyer-Moore's algorithm.
*
* @param str Text we are seatching in.
* @param arg pattern structure containing the preprocessed pattern
* @return Vector of indexes of the occurrences of pattern in text
*/
std::vector<size_t> search(const std::string& str, const pattern& arg) {
size_t index_position = arg.pat.size() - 1;
std::vector<size_t> index_storage;
while (index_position < str.length()) {
size_t index_string = index_position;
int index_pattern = static_cast<int>(arg.pat.size()) - 1;
while (index_pattern >= 0 &&
str[index_string] == arg.pat[index_pattern]) {
--index_pattern;
--index_string;
}
if (index_pattern < 0) {
index_storage.push_back(index_position - arg.pat.length() + 1);
index_position += arg.good_suffix[0];
} else {
index_position += std::max(arg.bad_char[str[index_string]],
arg.good_suffix[index_pattern + 1]);
}
}
return index_storage;
}
/**
* @brief Check if pat is prefix of str.
*
* @param str pointer to some part of the input text.
* @param pat the searched pattern.
* @param len length of the searched pattern
* @returns `true` if pat IS prefix of str.
* @returns `false` if pat is NOT a prefix of str.
*/
bool is_prefix(const char* str, const char* pat, size_t len) {
if (strlen(str) < len) {
return false;
}
for (size_t i = 0; i < len; i++) {
if (str[i] != pat[i]) {
return false;
}
}
return true;
}
} // namespace boyer_moore
} // namespace strings
/**
* @brief A test case in which we search for every appearance of the word 'and'
* @param text The text in which we search for appearance of the word 'and'
* @returns void
*/
void and_test(const char* text) {
strings::boyer_moore::pattern ands;
strings::boyer_moore::init_pattern("and", ands);
std::vector<size_t> indexes = strings::boyer_moore::search(text, ands);
assert(indexes.size() == 2);
assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3));
assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3));
}
/**
* @brief A test case in which we search for every appearance of the word 'pat'
* @param text The text in which we search for appearance of the word 'pat'
* @returns void
*/
void pat_test(const char* text) {
strings::boyer_moore::pattern pat;
strings::boyer_moore::init_pattern("pat", pat);
std::vector<size_t> indexes = strings::boyer_moore::search(text, pat);
assert(indexes.size() == 6);
for (const auto& currentIndex : indexes) {
assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3));
}
}
/**
* @brief Self-test implementations
* @returns void
*/
static void tests() {
const char* text =
"When pat Mr. and Mrs. pat Dursley woke up on the dull, gray \
Tuesday our story starts, \
there was nothing about pat the cloudy sky outside to pat suggest that\
strange and \
mysterious things would pat soon be happening all pat over the \
country.";
and_test(text);
pat_test(text);
std::cout << "All tests have successfully passed!\n";
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main() {
tests(); // run self-test implementations
return 0;
}