summaryrefslogtreecommitdiff
path: root/string/ahoCorasick.cpp
diff options
context:
space:
mode:
authormzuenni <michi.zuendorf@gmail.com>2022-06-27 17:19:28 +0200
committermzuenni <michi.zuendorf@gmail.com>2022-06-27 17:19:28 +0200
commit5ab8a5088b729a9953b8dff1b2a985dc8fb2098b (patch)
treeed40d6936c0e9eee40ba62751cbf99ecddbaddc2 /string/ahoCorasick.cpp
parentadabbad9c51cf7cd3874bfde8eac1fbcf84fec10 (diff)
updated tcr
Diffstat (limited to 'string/ahoCorasick.cpp')
-rw-r--r--string/ahoCorasick.cpp103
1 files changed, 50 insertions, 53 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp
index d221fcb..530490e 100644
--- a/string/ahoCorasick.cpp
+++ b/string/ahoCorasick.cpp
@@ -1,57 +1,54 @@
-// Laufzeit: O(n + m + z), n = #Text, m = Summe #Pattern, z = #Matches
-// Findet mehrere Patterns gleichzeitig in einem String.
-// 1) Wurzel erstellen: aho.push_back(vertex());
-// 2) Mit addString(0, pattern, idx); Patterns hinzufügen.
-// 3) finishAutomaton(0) aufrufen.
-// 4) Mit state = go(state, c) in nächsten Zustand wechseln.
-// DANACH: Wenn patterns-Vektor nicht leer ist: Hier enden alle
-// enthaltenen Patterns.
-// ACHTUNG: Die Zahlenwerte der auftretenden Buchstaben müssen
-// zusammenhängend sein und bei 0 beginnen!
-struct vertex {
- int next[ALPHABET_SIZE], failure;
- int character;
- vector<int> patterns; // Indizes der Patterns, die hier enden.
- vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = -1; }
-};
-vector<vertex> aho;
+constexpr ll ALPHABET_SIZE = 26;
+constexpr char OFFSET = 26;
+struct AhoCorasick {
+ struct vert {
+ int suffix, exit, character, parent;
+ vector<int> nxt, patterns;
+ vert(int c, int p) : suffix(-1), exit(-1),
+ character(c), nxt(ALPHABET_SIZE, -1), parent(p) {}
+ };
+ vector<vert> aho;
-void addString(int v, vector<int> &pattern, int patternIdx) {
- for (int i = 0; i < (int)pattern.size(); i++) {
- if (aho[v].next[pattern[i]] == -1) {
- aho[v].next[pattern[i]] = aho.size();
- aho.push_back(vertex());
- aho.back().character = pattern[i];
- }
- v = aho[v].next[pattern[i]];
- }
- aho[v].patterns.push_back(patternIdx);
-}
+ AhoCorasick() {aho.push_back(vert(-1, 0));}
-void finishAutomaton(int v) {
- for (int i = 0; i < ALPHABET_SIZE; i++)
- if (aho[v].next[i] == -1) aho[v].next[i] = v;
+ // Call once for each pattern.
+ void addString(string &s, int patternIdx) {
+ int v = 0;
+ for (char c : s) {
+ int idx = c - OFFSET;
+ if (aho[v].nxt[idx] == -1) {
+ aho[v].nxt[idx] = sz(aho);
+ aho.emplace_back(idx, v);
+ }
+ v = aho[v].nxt[idx];
+ }
+ aho[v].patterns.push_back(patternIdx);
+ }
- queue<int> q;
- for (int i = 0; i < ALPHABET_SIZE; i++) {
- if (aho[v].next[i] != v) {
- aho[aho[v].next[i]].failure = v;
- q.push(aho[v].next[i]);
- }}
- while (!q.empty()) {
- int r = q.front(); q.pop();
- for (int i = 0; i < ALPHABET_SIZE; i++) {
- if (aho[r].next[i] != -1) {
- q.push(aho[r].next[i]);
- int f = aho[r].failure;
- while (aho[f].next[i] == -1) f = aho[f].failure;
- aho[aho[r].next[i]].failure = aho[f].next[i];
- for (int j = 0; j < (int)aho[aho[f].next[i]].patterns.size(); j++) {
- aho[aho[r].next[i]].patterns.push_back(
- aho[aho[f].next[i]].patterns[j]);
-}}}}}
+ int getSuffix(int v) {
+ if (aho[v].suffix == -1) {
+ if (v == 0 || aho[v].parent == 0) aho[v].suffix = 0;
+ else aho[v].suffix = go(getSuffix(aho[v].parent),
+ aho[v].character);
+ }
+ return aho[v].suffix;
+ }
-int go(int v, int c) {
- if (aho[v].next[c] != -1) return aho[v].next[c];
- else return go(aho[v].failure, c);
-}
+ int getExit(int v) {
+ if (aho[v].exit == -1) {
+ int suffix = getSuffix(v);
+ if (v == 0) aho[v].exit = 0;
+ else {
+ if (aho[suffix].patterns.empty()) {
+ aho[v].exit = getExit(suffix);
+ } else {
+ aho[v].exit = suffix;
+ }}}
+ return aho[v].exit;
+ }
+
+ int go(int v, int idx) { // Root is v=0.
+ if (aho[v].nxt[idx] != -1) return aho[v].nxt[idx];
+ else return v == 0 ? 0 : go(getSuffix(v), idx);
+ }
+}; \ No newline at end of file