diff options
Diffstat (limited to 'string/ahoCorasick.cpp')
| -rw-r--r-- | string/ahoCorasick.cpp | 103 |
1 files changed, 50 insertions, 53 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp index d221fcb..530490e 100644 --- a/string/ahoCorasick.cpp +++ b/string/ahoCorasick.cpp @@ -1,57 +1,54 @@ -// Laufzeit: O(n + m + z), n = #Text, m = Summe #Pattern, z = #Matches -// Findet mehrere Patterns gleichzeitig in einem String. -// 1) Wurzel erstellen: aho.push_back(vertex()); -// 2) Mit addString(0, pattern, idx); Patterns hinzufügen. -// 3) finishAutomaton(0) aufrufen. -// 4) Mit state = go(state, c) in nächsten Zustand wechseln. -// DANACH: Wenn patterns-Vektor nicht leer ist: Hier enden alle -// enthaltenen Patterns. -// ACHTUNG: Die Zahlenwerte der auftretenden Buchstaben müssen -// zusammenhängend sein und bei 0 beginnen! -struct vertex { - int next[ALPHABET_SIZE], failure; - int character; - vector<int> patterns; // Indizes der Patterns, die hier enden. - vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = -1; } -}; -vector<vertex> aho; +constexpr ll ALPHABET_SIZE = 26; +constexpr char OFFSET = 26; +struct AhoCorasick { + struct vert { + int suffix, exit, character, parent; + vector<int> nxt, patterns; + vert(int c, int p) : suffix(-1), exit(-1), + character(c), nxt(ALPHABET_SIZE, -1), parent(p) {} + }; + vector<vert> aho; -void addString(int v, vector<int> &pattern, int patternIdx) { - for (int i = 0; i < (int)pattern.size(); i++) { - if (aho[v].next[pattern[i]] == -1) { - aho[v].next[pattern[i]] = aho.size(); - aho.push_back(vertex()); - aho.back().character = pattern[i]; - } - v = aho[v].next[pattern[i]]; - } - aho[v].patterns.push_back(patternIdx); -} + AhoCorasick() {aho.push_back(vert(-1, 0));} -void finishAutomaton(int v) { - for (int i = 0; i < ALPHABET_SIZE; i++) - if (aho[v].next[i] == -1) aho[v].next[i] = v; + // Call once for each pattern. + void addString(string &s, int patternIdx) { + int v = 0; + for (char c : s) { + int idx = c - OFFSET; + if (aho[v].nxt[idx] == -1) { + aho[v].nxt[idx] = sz(aho); + aho.emplace_back(idx, v); + } + v = aho[v].nxt[idx]; + } + aho[v].patterns.push_back(patternIdx); + } - queue<int> q; - for (int i = 0; i < ALPHABET_SIZE; i++) { - if (aho[v].next[i] != v) { - aho[aho[v].next[i]].failure = v; - q.push(aho[v].next[i]); - }} - while (!q.empty()) { - int r = q.front(); q.pop(); - for (int i = 0; i < ALPHABET_SIZE; i++) { - if (aho[r].next[i] != -1) { - q.push(aho[r].next[i]); - int f = aho[r].failure; - while (aho[f].next[i] == -1) f = aho[f].failure; - aho[aho[r].next[i]].failure = aho[f].next[i]; - for (int j = 0; j < (int)aho[aho[f].next[i]].patterns.size(); j++) { - aho[aho[r].next[i]].patterns.push_back( - aho[aho[f].next[i]].patterns[j]); -}}}}} + int getSuffix(int v) { + if (aho[v].suffix == -1) { + if (v == 0 || aho[v].parent == 0) aho[v].suffix = 0; + else aho[v].suffix = go(getSuffix(aho[v].parent), + aho[v].character); + } + return aho[v].suffix; + } -int go(int v, int c) { - if (aho[v].next[c] != -1) return aho[v].next[c]; - else return go(aho[v].failure, c); -} + int getExit(int v) { + if (aho[v].exit == -1) { + int suffix = getSuffix(v); + if (v == 0) aho[v].exit = 0; + else { + if (aho[suffix].patterns.empty()) { + aho[v].exit = getExit(suffix); + } else { + aho[v].exit = suffix; + }}} + return aho[v].exit; + } + + int go(int v, int idx) { // Root is v=0. + if (aho[v].nxt[idx] != -1) return aho[v].nxt[idx]; + else return v == 0 ? 0 : go(getSuffix(v), idx); + } +};
\ No newline at end of file |
