diff options
Diffstat (limited to 'string')
| -rw-r--r-- | string/ahoCorasick.cpp | 66 | ||||
| -rw-r--r-- | string/trie.cpp | 35 |
2 files changed, 54 insertions, 47 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp index c283f6f..d221fcb 100644 --- a/string/ahoCorasick.cpp +++ b/string/ahoCorasick.cpp @@ -1,55 +1,57 @@ // Laufzeit: O(n + m + z), n = #Text, m = Summe #Pattern, z = #Matches // Findet mehrere Patterns gleichzeitig in einem String. -// 1) Wurzel erstellen: vertex *automaton = new vertex(); -// 2) Mit addString(automaton, s, idx); Patterns hinzufügen. -// 3) finishAutomaton(automaton) aufrufen. -// 4) Mit automaton = go(automaton, c) in nächsten Zustand wechseln. +// 1) Wurzel erstellen: aho.push_back(vertex()); +// 2) Mit addString(0, pattern, idx); Patterns hinzufügen. +// 3) finishAutomaton(0) aufrufen. +// 4) Mit state = go(state, c) in nächsten Zustand wechseln. // DANACH: Wenn patterns-Vektor nicht leer ist: Hier enden alle // enthaltenen Patterns. // ACHTUNG: Die Zahlenwerte der auftretenden Buchstaben müssen // zusammenhängend sein und bei 0 beginnen! struct vertex { - vertex *next[ALPHABET_SIZE], *failure; - char character; + int next[ALPHABET_SIZE], failure; + int character; vector<int> patterns; // Indizes der Patterns, die hier enden. - vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = NULL; } + vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = -1; } }; +vector<vertex> aho; -void addString(vertex *v, string &pattern, int patternIdx) { - for (int i = 0; i < (int)pattern.length(); i++) { - if (!v->next[(int)pattern[i]]) { - vertex *w = new vertex(); - w->character = pattern[i]; - v->next[(int)pattern[i]] = w; +void addString(int v, vector<int> &pattern, int patternIdx) { + for (int i = 0; i < (int)pattern.size(); i++) { + if (aho[v].next[pattern[i]] == -1) { + aho[v].next[pattern[i]] = aho.size(); + aho.push_back(vertex()); + aho.back().character = pattern[i]; } - v = v->next[(int)pattern[i]]; + v = aho[v].next[pattern[i]]; } - v->patterns.push_back(patternIdx); + aho[v].patterns.push_back(patternIdx); } -void finishAutomaton(vertex *v) { +void finishAutomaton(int v) { for (int i = 0; i < ALPHABET_SIZE; i++) - if (!v->next[i]) v->next[i] = v; + if (aho[v].next[i] == -1) aho[v].next[i] = v; - queue<vertex*> q; + queue<int> q; for (int i = 0; i < ALPHABET_SIZE; i++) { - if (v->next[i] != v) { - v->next[i]->failure = v; - q.push(v->next[i]); + if (aho[v].next[i] != v) { + aho[aho[v].next[i]].failure = v; + q.push(aho[v].next[i]); }} while (!q.empty()) { - vertex *r = q.front(); q.pop(); + int r = q.front(); q.pop(); for (int i = 0; i < ALPHABET_SIZE; i++) { - if (r->next[i]) { - q.push(r->next[i]); - vertex *f = r->failure; - while (!f->next[i]) f = f->failure; - r->next[i]->failure = f->next[i]; - for (int j = 0; j < (int)f->next[i]->patterns.size(); j++) { - r->next[i]->patterns.push_back(f->next[i]->patterns[j]); + if (aho[r].next[i] != -1) { + q.push(aho[r].next[i]); + int f = aho[r].failure; + while (aho[f].next[i] == -1) f = aho[f].failure; + aho[aho[r].next[i]].failure = aho[f].next[i]; + for (int j = 0; j < (int)aho[aho[f].next[i]].patterns.size(); j++) { + aho[aho[r].next[i]].patterns.push_back( + aho[aho[f].next[i]].patterns[j]); }}}}} -vertex* go(vertex *v, char c) { - if (v->next[(int)c]) return v->next[(int)c]; - else return go(v->failure, c); +int go(int v, int c) { + if (aho[v].next[c] != -1) return aho[v].next[c]; + else return go(aho[v].failure, c); } diff --git a/string/trie.cpp b/string/trie.cpp index 1b5f573..fa9ec49 100644 --- a/string/trie.cpp +++ b/string/trie.cpp @@ -1,20 +1,25 @@ +// Zahlenwerte müssen bei 0 beginnen und zusammenhängend sein. struct node { - node *e[26]; // Implementierung für Kleinbuchstaben. - int c = 0; // Anzahl der Wörter, die an diesem node enden. - node() { for(int i = 0; i < 26; i++) e[i] = NULL; } + int children[ALPHABET_SIZE], c; // c = #Wörter, die hier enden. + node () { + idx = -1; + for (int i = 0; i < ALPHABET_SIZE; i++) children[i] = -1; + } }; +vector<node> trie; // Anlegen mit trie.push_back(node()); -void insert(node *root, string &txt, int s = 0) { // Laufzeit: O(|txt|) - if(s == (int)txt.size()) root->c++; - else { - int idx = (int)(txt[s] - 'a'); - if(root->e[idx] == NULL) root->e[idx] = new node(); - insert(root->e[idx], txt, s+1); -}} +void insert(int vert, vector<int> &txt, int s) { // Laufzeit: O(|txt|) + if (s == (int)txt.size()) { trie[vert].c++; return; } + if (trie[vert].children[txt[s]] == -1) { + trie[vert].children[txt[s]] = trie.size(); + trie.push_back(node()); + } + insert(trie[vert].children[txt[s]], txt, s + 1); +} -int contains(node *root, string &txt, int s = 0) { // Laufzeit: O(|txt|) - if(s == (int)txt.size()) return root->c; - int idx = (int)(txt[s] - 'a'); - if(root->e[idx] != NULL) return contains(root->e[idx], txt, s + 1); - else return 0; +int contains(int vert, vector<int> &txt, int s) { // Laufzeit: O(|txt|) + if (s == (int)txt.size()) return trie[vert].c; + if (trie[vert].children[txt[s]] != -1) { + return contains(trie[vert].children[txt[s]], txt, s + 1); + } else return 0; } |
