summaryrefslogtreecommitdiff
path: root/string
diff options
context:
space:
mode:
authorPaul Jungeblut <paul.jungeblut@gmail.com>2016-02-06 12:04:00 +0100
committerPaul Jungeblut <paul.jungeblut@gmail.com>2016-02-06 12:04:00 +0100
commit2f428f1415fcbd3700def0f513b30a4818b6e39d (patch)
tree8ea68f0e4fb6fd8047c64ca28603e6335af6fdc2 /string
parenta66f9b1434c454087019461c23d341786ca5d195 (diff)
Adding an Aho-Corasick automaton for string multimatching.
Diffstat (limited to 'string')
-rw-r--r--string/ahoCorasick.cpp52
-rw-r--r--string/string.tex3
2 files changed, 55 insertions, 0 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp
new file mode 100644
index 0000000..1a6f8ed
--- /dev/null
+++ b/string/ahoCorasick.cpp
@@ -0,0 +1,52 @@
+// Laufzeit: O(n + m + z), n = Suchstringlänge, m = Summe der Patternlängen, z = #Matches
+// Findet mehrere Patterns gleichzeitig in einem String.
+// 1) Wurzel erstellen: vertex *automaton = new vertex();
+// 2) Mit addString(automaton, s, idx); Patterns hinzufügen.
+// 3) finishAutomaton aufrufen.
+// 4) Mit automaton = go(automaton, c) in nächsten Zustand wechseln. Wenn patterns-Vektor nicht leer ist:
+// Hier enden alle anthaltenen Patterns.
+struct vertex {
+ vertex *next[ALPHABET_SIZE], *failure;
+ char character;
+ vector<int> patterns; // Indizes der Patterns, die hier enden.
+ vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = NULL; }
+};
+
+void addString(vertex *v, string &pattern, int patternIdx) {
+ for (int i = 0; i < (int)pattern.length(); i++) {
+ if (!v->next[(int)pattern[i]]) {
+ vertex *w = new vertex();
+ w->character = pattern[i];
+ v->next[(int)pattern[i]] = w;
+ }
+ v = v->next[(int)pattern[i]];
+ }
+ v->patterns.push_back(patternIdx);
+}
+
+void finishAutomaton(vertex *v) {
+ for (int i = 0; i < ALPHABET_SIZE; i++)
+ if (!v->next[i]) v->next[i] = v;
+
+ queue<vertex*> q;
+ for (int i = 0; i < ALPHABET_SIZE; i++) {
+ if (v->next[i] != v) {
+ v->next[i]->failure = v;
+ q.push(v->next[i]);
+ }}
+ while (!q.empty()) {
+ vertex *r = q.front(); q.pop();
+ for (int i = 0; i < ALPHABET_SIZE; i++) {
+ if (r->next[i]) {
+ q.push(r->next[i]);
+ vertex *f = r->failure;
+ while (!f->next[i]) f = f->failure;
+ r->next[i]->failure = f->next[i];
+ for (int j = 0; j < (int)f->next[i]->patterns.size(); j++) {
+ r->next[i]->patterns.push_back(f->next[i]->patterns[j]);
+}}}}}
+
+vertex* go(vertex *v, char c) {
+ if (v->next[(int)c]) return v->next[(int)c];
+ else return go(v->failure, c);
+}
diff --git a/string/string.tex b/string/string.tex
index 0fddb86..d4180c6 100644
--- a/string/string.tex
+++ b/string/string.tex
@@ -3,6 +3,9 @@
\subsection{\textsc{Knuth-Morris-Pratt}-Algorithmus}
\lstinputlisting{string/kmp.cpp}
+\subsection{\textsc{Aho-Corasick}-Automat}
+\lstinputlisting{string/ahoCorasick.cpp}
+
\subsection{\textsc{Levenshtein}-Distanz}
\lstinputlisting{string/levenshtein.cpp}