forked from williamfiset/Algorithms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SubstringVerificationSuffixArray.java
125 lines (108 loc) · 4.05 KB
/
SubstringVerificationSuffixArray.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/**
* This file shows you how to use a suffix array to determine if a pattern exists within a text.
* This implementation has the advantage that once the suffix array is built queries can be very
* fast.
*
* <p>Time complexity: O(nlogn) for suffix array construction and O(mlogn) time for individual
* queries (where m is query string length). As noted below, depending on the length of the string
* (if it is very large) it may be faster to use KMP or if you're doing a lot of queries on small
* strings then Rabin-Karp in combination with a bloom filter.
*
* @author William Fiset, william.alexandre.fiset@gmail.com
*/
package com.williamfiset.algorithms.strings;
import java.util.*;
// Example usage
public class SubstringVerificationSuffixArray {
public static void main(String[] args) {
String pattern = "hello world";
String text = "hello lemon Lennon wallet world tree cabbage hello world teapot calculator";
SuffixArray sa = new SuffixArray(text);
System.out.println(sa.contains(pattern));
System.out.println(sa.contains("this pattern does not exist"));
}
public static class SuffixArray {
// ALPHABET_SZ is the default alphabet size, this may need to be much
// larger if you're using the LCS method with multiple sentinels
int ALPHABET_SZ = 256, N;
int[] T, sa, sa2, rank, tmp, c;
public SuffixArray(String str) {
this(toIntArray(str));
}
private static int[] toIntArray(String s) {
int[] text = new int[s.length()];
for (int i = 0; i < s.length(); i++) text[i] = s.charAt(i);
return text;
}
// Designated constructor
public SuffixArray(int[] text) {
T = text;
N = text.length;
sa = new int[N];
sa2 = new int[N];
rank = new int[N];
c = new int[Math.max(ALPHABET_SZ, N)];
construct();
}
private void construct() {
int i, p, r;
for (i = 0; i < N; ++i) c[rank[i] = T[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[T[i]]] = i;
for (p = 1; p < N; p <<= 1) {
for (r = 0, i = N - p; i < N; ++i) sa2[r++] = i;
for (i = 0; i < N; ++i) if (sa[i] >= p) sa2[r++] = sa[i] - p;
Arrays.fill(c, 0, ALPHABET_SZ, 0);
for (i = 0; i < N; ++i) c[rank[i]]++;
for (i = 1; i < ALPHABET_SZ; ++i) c[i] += c[i - 1];
for (i = N - 1; i >= 0; --i) sa[--c[rank[sa2[i]]]] = sa2[i];
for (sa2[sa[0]] = r = 0, i = 1; i < N; ++i) {
if (!(rank[sa[i - 1]] == rank[sa[i]]
&& sa[i - 1] + p < N
&& sa[i] + p < N
&& rank[sa[i - 1] + p] == rank[sa[i] + p])) r++;
sa2[sa[i]] = r;
}
tmp = rank;
rank = sa2;
sa2 = tmp;
if (r == N - 1) break;
ALPHABET_SZ = r + 1;
}
}
// Runs on O(mlog(n)) where m is the length of the substring
// and n is the length of the text. Depending on the length
// of the string (if it is very large) it may be faster to
// use KMP or if you're doing a lot of queries then Rabin-Karp
// in combination with a bloom filter.
//
// NOTE: This is the naive implementation. There exists an
// implementation which runs in O(m + log(n)) time
public boolean contains(String substr) {
if (substr == null) return false;
if (substr.equals("")) return true;
String suffix_str;
int lo = 0, hi = N - 1;
int substr_len = substr.length();
// Do binary search
while (lo <= hi) {
int mid = (lo + hi) >>> 1;
int suffix_index = sa[mid];
int suffix_len = N - suffix_index;
// Extract part of the suffix we need to compare
if (suffix_len <= substr_len) suffix_str = new String(T, suffix_index, suffix_len);
else suffix_str = new String(T, suffix_index, substr_len);
int cmp = suffix_str.compareTo(substr);
// Found a match
if (cmp == 0) {
return true;
} else if (cmp < 0) {
lo = mid + 1;
} else {
hi = mid - 1;
}
}
return false;
}
}
}