diff options
| author | Paul Jungeblut <paul.jungeblut@gmail.com> | 2016-02-06 12:04:00 +0100 |
|---|---|---|
| committer | Paul Jungeblut <paul.jungeblut@gmail.com> | 2016-02-06 12:04:00 +0100 |
| commit | 2f428f1415fcbd3700def0f513b30a4818b6e39d (patch) | |
| tree | 8ea68f0e4fb6fd8047c64ca28603e6335af6fdc2 | |
| parent | a66f9b1434c454087019461c23d341786ca5d195 (diff) | |
Adding an Aho-Corasick automaton for string multimatching.
| -rw-r--r-- | string/ahoCorasick.cpp | 52 | ||||
| -rw-r--r-- | string/string.tex | 3 | ||||
| -rw-r--r-- | tcr.pdf | bin | 238838 -> 242708 bytes |
3 files changed, 55 insertions, 0 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp new file mode 100644 index 0000000..1a6f8ed --- /dev/null +++ b/string/ahoCorasick.cpp @@ -0,0 +1,52 @@ +// Laufzeit: O(n + m + z), n = Suchstringlänge, m = Summe der Patternlängen, z = #Matches +// Findet mehrere Patterns gleichzeitig in einem String. +// 1) Wurzel erstellen: vertex *automaton = new vertex(); +// 2) Mit addString(automaton, s, idx); Patterns hinzufügen. +// 3) finishAutomaton aufrufen. +// 4) Mit automaton = go(automaton, c) in nächsten Zustand wechseln. Wenn patterns-Vektor nicht leer ist: +// Hier enden alle anthaltenen Patterns. +struct vertex { + vertex *next[ALPHABET_SIZE], *failure; + char character; + vector<int> patterns; // Indizes der Patterns, die hier enden. + vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = NULL; } +}; + +void addString(vertex *v, string &pattern, int patternIdx) { + for (int i = 0; i < (int)pattern.length(); i++) { + if (!v->next[(int)pattern[i]]) { + vertex *w = new vertex(); + w->character = pattern[i]; + v->next[(int)pattern[i]] = w; + } + v = v->next[(int)pattern[i]]; + } + v->patterns.push_back(patternIdx); +} + +void finishAutomaton(vertex *v) { + for (int i = 0; i < ALPHABET_SIZE; i++) + if (!v->next[i]) v->next[i] = v; + + queue<vertex*> q; + for (int i = 0; i < ALPHABET_SIZE; i++) { + if (v->next[i] != v) { + v->next[i]->failure = v; + q.push(v->next[i]); + }} + while (!q.empty()) { + vertex *r = q.front(); q.pop(); + for (int i = 0; i < ALPHABET_SIZE; i++) { + if (r->next[i]) { + q.push(r->next[i]); + vertex *f = r->failure; + while (!f->next[i]) f = f->failure; + r->next[i]->failure = f->next[i]; + for (int j = 0; j < (int)f->next[i]->patterns.size(); j++) { + r->next[i]->patterns.push_back(f->next[i]->patterns[j]); +}}}}} + +vertex* go(vertex *v, char c) { + if (v->next[(int)c]) return v->next[(int)c]; + else return go(v->failure, c); +} diff --git a/string/string.tex b/string/string.tex index 0fddb86..d4180c6 100644 --- a/string/string.tex +++ b/string/string.tex @@ -3,6 +3,9 @@ \subsection{\textsc{Knuth-Morris-Pratt}-Algorithmus} \lstinputlisting{string/kmp.cpp} +\subsection{\textsc{Aho-Corasick}-Automat} +\lstinputlisting{string/ahoCorasick.cpp} + \subsection{\textsc{Levenshtein}-Distanz} \lstinputlisting{string/levenshtein.cpp} |
