summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYidi <noob999noob999@gmail.com>2024-03-20 21:33:18 +0100
committerYidi <noob999noob999@gmail.com>2024-03-20 21:33:18 +0100
commit9906aa7bbf98bee5cdb91e80f6a2311e43129c7d (patch)
tree56461810b2b5dcf43a245f4032386c2535cccebc
parent85a69d5ae4060e31a192cd8c3c81766fc85d6a50 (diff)
improve aho corasick
-rw-r--r--string/ahoCorasick.cpp66
-rw-r--r--string/string.tex10
2 files changed, 38 insertions, 38 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp
index 9ffa6c9..eac312c 100644
--- a/string/ahoCorasick.cpp
+++ b/string/ahoCorasick.cpp
@@ -1,54 +1,52 @@
-constexpr ll ALPHABET_SIZE = 26;
-constexpr char OFFSET = 'a';
+constexpr ll ALPHABET_SIZE = 26, OFFSET = 'a';
struct AhoCorasick {
struct vert {
- int suffix, exit, character, parent;
- vector<int> nxt, patterns;
- vert(int c, int p) : suffix(-1), exit(-1),
- character(c), nxt(ALPHABET_SIZE, -1), parent(p) {}
- };
- vector<vert> aho;
+ int suffix = 0, ch, cnt = 0;
+ array<int, ALPHABET_SIZE> nxt = {};
- AhoCorasick() {aho.push_back(vert(-1, 0));}
+ vert(int p, int c) : suffix(-p), ch(c) {}
+ };
+ vector<vert> aho = {{0, -1}};
- // Call once for each pattern.
- void addString(string &s, int patternIdx) {
+ int addString(string &s) {
int v = 0;
- for (char c : s) {
+ for (auto c : s) {
int idx = c - OFFSET;
- if (aho[v].nxt[idx] == -1) {
+ if (!aho[v].nxt[idx]) {
aho[v].nxt[idx] = sz(aho);
- aho.emplace_back(idx, v);
+ aho.emplace_back(v, idx);
}
v = aho[v].nxt[idx];
}
- aho[v].patterns.push_back(patternIdx);
+ aho[v].cnt++;
+ return v; // trie node index of pattern (pattern state)
}
int getSuffix(int v) {
- if (aho[v].suffix == -1) {
- if (v == 0 || aho[v].parent == 0) aho[v].suffix = 0;
- else aho[v].suffix = go(getSuffix(aho[v].parent),
- aho[v].character);
+ if (aho[v].suffix < 0) {
+ aho[v].suffix = go(getSuffix(-aho[v].suffix), aho[v].ch);
}
return aho[v].suffix;
}
- int getExit(int v) {
- if (aho[v].exit == -1) {
- int suffix = getSuffix(v);
- if (v == 0) aho[v].exit = 0;
- else {
- if (aho[suffix].patterns.empty()) {
- aho[v].exit = getExit(suffix);
- } else {
- aho[v].exit = suffix;
- }}}
- return aho[v].exit;
- }
-
- int go(int v, int idx) { // Root is v=0.
- if (aho[v].nxt[idx] != -1) return aho[v].nxt[idx];
+ int go(int v, int idx) { // Root is v=0, idx is char - OFFSET
+ if (aho[v].nxt[idx]) return aho[v].nxt[idx];
else return v == 0 ? 0 : go(getSuffix(v), idx);
}
+
+ vector<vector<int>> adj;
+ vector<ll> dp;
+ void buildGraph() {
+ adj.resize(sz(aho));
+ dp.assign(sz(aho), 0);
+ for (int i = 1; i < sz(aho); i++) {
+ adj[getSuffix(i)].push_back(i);
+ }}
+
+ void dfs(int v = 0) { // dp on tree
+ for (int u : adj[v]) {
+ //dp[u] = dp[v] + aho[u].cnt; // pattern count
+ dfs(u);
+ dp[v] += dp[u]; // no of matches
+ }}
};
diff --git a/string/string.tex b/string/string.tex
index c36ec69..526faa2 100644
--- a/string/string.tex
+++ b/string/string.tex
@@ -49,16 +49,18 @@
\begin{algorithm}{\textsc{Aho-Corasick}-Automat}
\begin{methods}[ll]
- sucht patterns im Text & \runtime{\abs{Text}+\sum\abs{pattern}+\abs{matches}}
+ sucht patterns im Text & \runtime{\abs{Text}+\sum\abs{pattern}}
\end{methods}
\begin{enumerate}
- \item mit \code{addString(pattern, idx);} Patterns hinzufügen.
+ \item mit \code{addString(pattern, idx)} Patterns hinzufügen.
+ \item rufe \code{buildGraph()} auf
\item mit \code{state = go(state, idx)} in nächsten Zustand wechseln.
- \item mit \code{state = getExit(state)} den exit Kanten folgen bis \code{state == 0}
- \item dabei mit \code{aho[state].patterns} die Matches zählen
+ \item erhöhe dabei \code{dp[state]++}
+ \item rufe \code{dfs()} auf. In dp[pattern state] stehen die Anzahl der Matches
\end{enumerate}
\sourcecode{string/ahoCorasick.cpp}
\end{algorithm}
+\clearpage
\begin{algorithm}{Lyndon und De-Bruijn}
\begin{itemize}