diff options
Diffstat (limited to 'content/string')
| -rw-r--r-- | content/string/ahoCorasick.cpp | 52 | ||||
| -rw-r--r-- | content/string/deBruijn.cpp | 7 | ||||
| -rw-r--r-- | content/string/duval.cpp | 21 | ||||
| -rw-r--r-- | content/string/kmp.cpp | 20 | ||||
| -rw-r--r-- | content/string/longestCommonSubsequence.cpp | 15 | ||||
| -rw-r--r-- | content/string/lyndon.cpp | 11 | ||||
| -rw-r--r-- | content/string/manacher.cpp | 20 | ||||
| -rw-r--r-- | content/string/rollingHash.cpp | 18 | ||||
| -rw-r--r-- | content/string/rollingHashCf.cpp | 17 | ||||
| -rw-r--r-- | content/string/string.tex | 132 | ||||
| -rw-r--r-- | content/string/suffixArray.cpp | 38 | ||||
| -rw-r--r-- | content/string/suffixAutomaton.cpp | 63 | ||||
| -rw-r--r-- | content/string/suffixTree.cpp | 72 | ||||
| -rw-r--r-- | content/string/trie.cpp | 35 | ||||
| -rw-r--r-- | content/string/z.cpp | 10 |
15 files changed, 531 insertions, 0 deletions
diff --git a/content/string/ahoCorasick.cpp b/content/string/ahoCorasick.cpp new file mode 100644 index 0000000..eac312c --- /dev/null +++ b/content/string/ahoCorasick.cpp @@ -0,0 +1,52 @@ +constexpr ll ALPHABET_SIZE = 26, OFFSET = 'a'; +struct AhoCorasick { + struct vert { + int suffix = 0, ch, cnt = 0; + array<int, ALPHABET_SIZE> nxt = {}; + + vert(int p, int c) : suffix(-p), ch(c) {} + }; + vector<vert> aho = {{0, -1}}; + + int addString(string &s) { + int v = 0; + for (auto c : s) { + int idx = c - OFFSET; + if (!aho[v].nxt[idx]) { + aho[v].nxt[idx] = sz(aho); + aho.emplace_back(v, idx); + } + v = aho[v].nxt[idx]; + } + aho[v].cnt++; + return v; // trie node index of pattern (pattern state) + } + + int getSuffix(int v) { + if (aho[v].suffix < 0) { + aho[v].suffix = go(getSuffix(-aho[v].suffix), aho[v].ch); + } + return aho[v].suffix; + } + + int go(int v, int idx) { // Root is v=0, idx is char - OFFSET + if (aho[v].nxt[idx]) return aho[v].nxt[idx]; + else return v == 0 ? 0 : go(getSuffix(v), idx); + } + + vector<vector<int>> adj; + vector<ll> dp; + void buildGraph() { + adj.resize(sz(aho)); + dp.assign(sz(aho), 0); + for (int i = 1; i < sz(aho); i++) { + adj[getSuffix(i)].push_back(i); + }} + + void dfs(int v = 0) { // dp on tree + for (int u : adj[v]) { + //dp[u] = dp[v] + aho[u].cnt; // pattern count + dfs(u); + dp[v] += dp[u]; // no of matches + }} +}; diff --git a/content/string/deBruijn.cpp b/content/string/deBruijn.cpp new file mode 100644 index 0000000..e829137 --- /dev/null +++ b/content/string/deBruijn.cpp @@ -0,0 +1,7 @@ +string deBruijn(int n, char mi = '0', char ma = '1') { + string res, c(1, mi); + do { + if (n % sz(c) == 0) res += c; + } while(next(c, n, mi, ma)); + return res; +} diff --git a/content/string/duval.cpp b/content/string/duval.cpp new file mode 100644 index 0000000..bf36cce --- /dev/null +++ b/content/string/duval.cpp @@ -0,0 +1,21 @@ +vector<pair<int, int>> duval(const string& s) { + vector<pair<int, int>> res; + for (int i = 0; i < sz(s);) { + int j = i + 1, k = i; + for (; j < sz(s) && s[k] <= s[j]; j++) { + if (s[k] < s[j]) k = i; + else k++; + } + while (i <= k) { + res.push_back({i, i + j - k}); + i += j - k; + }} + return res; +} + +int minrotation(const string& s) { + auto parts = duval(s+s); + for (auto [l, r] : parts) { + if (l < sz(s) && r >= sz(s)) { + return l; +}}} diff --git a/content/string/kmp.cpp b/content/string/kmp.cpp new file mode 100644 index 0000000..421479e --- /dev/null +++ b/content/string/kmp.cpp @@ -0,0 +1,20 @@ +vector<int> kmpPreprocessing(const string& sub) { + vector<int> b(sz(sub) + 1); + b[0] = -1; + for (int i = 0, j = -1; i < sz(sub);) { + while (j >= 0 && sub[i] != sub[j]) j = b[j]; + b[++i] = ++j; + } + return b; +} +vector<int> kmpSearch(const string& s, const string& sub) { + vector<int> result, pre = kmpPreprocessing(sub); + for (int i = 0, j = 0; i < sz(s);) { + while (j >= 0 && s[i] != sub[j]) j = pre[j]; + i++; j++; + if (j == sz(sub)) { + result.push_back(i - j); + j = pre[j]; + }} + return result; +} diff --git a/content/string/longestCommonSubsequence.cpp b/content/string/longestCommonSubsequence.cpp new file mode 100644 index 0000000..6c9ea44 --- /dev/null +++ b/content/string/longestCommonSubsequence.cpp @@ -0,0 +1,15 @@ +string lcss(const string& a, const string& b) { + vector<vector<int>> m(sz(a) + 1, vector<int>(sz(b) + 1)); + for (int i = sz(a) - 1; i >= 0; i--) { + for (int j = sz(b) - 1; j >= 0; j--) { + if (a[i] == b[j]) m[i][j] = 1 + m[i+1][j+1]; + else m[i][j] = max(m[i+1][j], m[i][j+1]); + }} // Für die Länge: return m[0][0]; + string res; + for (int j = 0, i = 0; j < sz(b) && i < sz(a);) { + if (a[i] == b[j]) res += a[i++], j++; + else if (m[i][j+1] > m[i+1][j]) j++; + else i++; + } + return res; +} diff --git a/content/string/lyndon.cpp b/content/string/lyndon.cpp new file mode 100644 index 0000000..e44379b --- /dev/null +++ b/content/string/lyndon.cpp @@ -0,0 +1,11 @@ +bool next(string& s, int maxLen, char mi = '0', char ma = '1') { + for (int i = sz(s), j = sz(s); i < maxLen; i++) + s.push_back(s[i % j]); + while(!s.empty() && s.back() == ma) s.pop_back(); + if (s.empty()) { + s = mi; + return false; + } else { + s.back()++; + return true; +}} diff --git a/content/string/manacher.cpp b/content/string/manacher.cpp new file mode 100644 index 0000000..112bd55 --- /dev/null +++ b/content/string/manacher.cpp @@ -0,0 +1,20 @@ +vector<int> manacher(const string& t) { + //transforms "aa" to ".a.a." to find even length palindromes + string s(sz(t) * 2 + 1, '.'); + for (int i = 0; i < sz(t); i++) s[2 * i + 1] = t[i]; + + int mid = 0, r = 0, n = sz(s); + vector<int> pal(n); + for (int i = 1; i < n - 1; i++) { + if (r > i) pal[i] = min(r - i, pal[2 * mid - i]); + while (pal[i] < min(i, n - i - 1) && + s[i + pal[i] + 1] == s[i - pal[i] - 1]) { + pal[i]++; + } + if (i + pal[i] > r) mid = i, r = i + pal[i]; + } + + //convert lengths to constructed string s (optional) + //for (int i = 0; i < n; i++) pal[i] = 2 * pal[i] + 1; + return pal; +} diff --git a/content/string/rollingHash.cpp b/content/string/rollingHash.cpp new file mode 100644 index 0000000..6e914aa --- /dev/null +++ b/content/string/rollingHash.cpp @@ -0,0 +1,18 @@ +// M = 1.7e9 + 9, 1e18L + 9, 2.2e18L + 7 +struct Hash { + static constexpr ll M = 3e18L + 37; + static constexpr ll Q = 318LL << 53; // Random in [SIGMA+1, M) + vector<ll> pref = {0}, power = {1}; + + Hash(const string& s) { + for (auto c : s) { // c > 0 + pref.push_back((mul(pref.back(), Q) + c + M) % M); + power.push_back(mul(power.back(), Q)); + }} + + ll operator()(int l, int r) { + return (pref[r] - mul(power[r-l], pref[l]) + M) % M; + } + + static ll mul(__int128 a, ll b) {return a * b % M;} +}; diff --git a/content/string/rollingHashCf.cpp b/content/string/rollingHashCf.cpp new file mode 100644 index 0000000..84b2e4e --- /dev/null +++ b/content/string/rollingHashCf.cpp @@ -0,0 +1,17 @@ +// M = 1.7e9 + 9, 1e18L + 9, 2.2e18L + 7 +struct Hash { + static constexpr ll M = 3e18L + 37; + vector<ll> pref = {0}, power = {1}; + + Hash(const string& s, ll Q) { // Q Random in [SIGMA+1, M) + for (auto c : s) { // c > 0 + pref.push_back((mul(pref.back(), Q) + c + M) % M); + power.push_back(mul(power.back(), Q)); + }} + + ll operator()(int l, int r) { + return (pref[r] - mul(power[r-l], pref[l]) + M) % M; + } + + static ll mul(__int128 a, ll b) {return a * b % M;} +}; diff --git a/content/string/string.tex b/content/string/string.tex new file mode 100644 index 0000000..0e482bf --- /dev/null +++ b/content/string/string.tex @@ -0,0 +1,132 @@ +\section{Strings} + +\begin{algorithm}{\textsc{Knuth-Morris-Pratt}-Algorithmus} + \begin{methods} + \method{kmpSearch}{sucht \code{sub} in \code{s}}{\abs{s}+\abs{sub}} + \end{methods} + \sourcecode{string/kmp.cpp} +\end{algorithm} + +\begin{algorithm}{Z-Algorithmus} + \begin{methods}[ll] + $z_i\coloneqq$ Längstes gemeinsames Präfix von $s_0\cdots s_{n-1}$ und $s_i\cdots s_{n-1}$ & \runtime{n} + \end{methods} + Suchen: Z-Algorithmus auf \code{P\$S} ausführen, Positionen mit $z_i=\abs{P}$ zurückgeben + \sourcecode{string/z.cpp} +\end{algorithm} + +\begin{algorithm}{Rolling Hash} + \sourcecode{string/rollingHash.cpp} +\end{algorithm} + +\begin{algorithm}{Pattern Matching mit Wildcards} + Gegeben zwei strings $A$ und $B$,$B$ enthält $k$ \emph{wildcards} enthält. Sei: + \begin{align*} + a_i&=\cos(\alpha_i) + i\sin(\alpha_i) &\text{ mit } \alpha_i&=\frac{2\pi A[i]}{\Sigma}\\ + b_i&=\cos(\beta_i) + i\sin(\beta_i) &\text{ mit } \beta_i&=\begin{cases*} + \frac{2\pi B[\abs{B}-i-1]}{\Sigma} & falls $B[\abs{B}-i-1]\in\Sigma$ \\ + 0 & sonst + \end{cases*} + \end{align*} + $B$ matcht $A$ an stelle $i$ wenn $(b\cdot a)[|B|-1+i]=|B|-k$. + Benutze FFT um $(b\cdot a)$ zu berechnen. +\end{algorithm} + +\begin{algorithm}{\textsc{Manacher}'s Algorithm, Longest Palindrome} + \begin{methods} + \method{init}{transformiert \code{string a}}{n} + \method{manacher}{berechnet Längen der Palindrome in longest}{n} + \end{methods} + \sourcecode{string/manacher.cpp} +\end{algorithm} + +\begin{algorithm}{Longest Common Subsequence} + \begin{methods} + \method{lcss}{findet längste gemeinsame Sequenz}{\abs{a}\*\abs{b}} + \end{methods} + \sourcecode{string/longestCommonSubsequence.cpp} +\end{algorithm} + +\columnbreak +\begin{algorithm}{\textsc{Aho-Corasick}-Automat} + \begin{methods}[ll] + sucht patterns im Text & \runtime{\abs{Text}+\sum\abs{pattern}} + \end{methods} + \begin{enumerate} + \item mit \code{addString(pattern, idx)} Patterns hinzufügen. + \item rufe \code{buildGraph()} auf + \item mit \code{state = go(state, idx)} in nächsten Zustand wechseln. + \item erhöhe dabei \code{dp[state]++} + \item rufe \code{dfs()} auf. In dp[pattern state] stehen die Anzahl der Matches + \end{enumerate} + \sourcecode{string/ahoCorasick.cpp} +\end{algorithm} +\clearpage + +\begin{algorithm}{\textsc{Lyndon} und \textsc{De-Bruijn}} + \begin{itemize} + \item \textbf{\textsc{Lyndon}-Wort:} Ein Wort das lexikographisch kleiner ist als jede seiner Rotationen. + \item Jedes Wort kann \emph{eindeutig} in eine nicht ansteigende Folge von \textsc{Lyndon}-Worten zerlegt werden. + \item Für \textsc{Lyndon}-Worte $u, v$ mit $u<v$ gilt, dass $uv$ auch ein \textsc{Lyndon}-Wort ist. + \end{itemize} + \begin{methods} + \method[, Durchschnitt $\Theta(1)$]{next}{lexikographisch nächstes \textsc{Lyndon}-Wort}{n} + \method{duval}{zerlegt $s$ in \textsc{Lyndon}-Worte}{n} + \method{minrotation}{berechnet kleinste Rotation von $s$}{n} + \end{methods} + \sourcecode{string/lyndon.cpp} + \sourcecode{string/duval.cpp} + \begin{itemize} + \item \textbf{\textsc{De-Bruijn}-Sequenz $\boldsymbol{B(\Sigma, n)}$:}~~~ein Wort das jedes Wort der Länge $n$ genau einmal als substring enthält (und minimal ist). Wobei $B(\Sigma, n)$ zyklisch betrachtet wird. + \item es gibt $\frac{(k!)^{k^{n-1}}}{k^{n}}$ verschiedene $B(\Sigma, n)$ + \item $B(\Sigma, n)$ hat Länge $\abs{\Sigma}^n$ + \end{itemize} + \begin{methods} + \method{deBruijn}{berechnet ein festes $B(\Sigma, n)$}{\abs{\Sigma}^n} + \end{methods} + \sourcecode{string/deBruijn.cpp} +\end{algorithm} + +\begin{algorithm}{Suffix-Array} +\begin{methods} + \method{SuffixArray}{berechnet ein Suffix Array}{\abs{s}\*\log^2(\abs{s})} + \method{lcp}{berechnet Länge des longest common prefix}{\log(\abs{s})} + \method{}{von \code{s[x]} und \code{s[y]}}{} +\end{methods} +\sourcecode{string/suffixArray.cpp} +\end{algorithm} + +\begin{algorithm}{Suffix-Baum} + \begin{methods} + \method{SuffixTree}{berechnet einen Suffixbaum}{\abs{s}} + \method{extend}{fügt den nächsten Buchstaben aus \code{s} ein}{1} + \end{methods} + \sourcecode{string/suffixTree.cpp} +\end{algorithm} + +\begin{algorithm}{Suffix-Automaton} + \begin{itemize} + \item \textbf{Ist \textit{w} Substring von \textit{s}?} + Baue Automaten für \textit{s} und wende ihn auf \textit{w} an. + Wenn alle Übergänge vorhanden sind, ist \textit{w} Substring von \textit{s}. + + \item \textbf{Ist \textit{w} Suffix von \textit{s}?} + Wie oben und prüfe, ob Endzustand ein Terminal ist. + + \item \textbf{Anzahl verschiedener Substrings.} + Jeder Pfad im Automaten entspricht einem Substring. + Für einen Knoten ist die Anzahl der ausgehenden Pfade gleich der Summe über die Anzahlen der Kindknoten plus 1. + Der letzte Summand ist der Pfad, der in diesem Knoten endet. + + \item \textbf{Wie oft taucht \textit{w} in \textit{s} auf?} + Sei \textit{p} der Zustand nach Abarbeitung von \textit{w}. + Lösung ist Anzahl der Pfade, die in \textit{p} starten und in einem Terminal enden. + Diese Zahl lässt sich wie oben rekursiv berechnen. + Bei jedem Knoten darf nur dann plus 1 gerechnet werden, wenn es ein Terminal ist. + \end{itemize} + \sourcecode{string/suffixAutomaton.cpp} +\end{algorithm} + +\begin{algorithm}{Trie} + \sourcecode{string/trie.cpp} +\end{algorithm} diff --git a/content/string/suffixArray.cpp b/content/string/suffixArray.cpp new file mode 100644 index 0000000..8b698d2 --- /dev/null +++ b/content/string/suffixArray.cpp @@ -0,0 +1,38 @@ +constexpr int MAX_CHAR = 256; +struct SuffixArray { + int n; + vector<int> SA, LCP; + vector<vector<int>> P; + + SuffixArray(const string& s) : n(sz(s)), SA(n), LCP(n), + P(__lg(2 * n - 1) + 1, vector<int>(n)) { + P[0].assign(all(s)); + iota(all(SA), 0); + sort(all(SA), [&](int a, int b) {return s[a] < s[b];}); + vector<int> x(n); + for (int k = 1, c = 1; c < n; k++, c *= 2) { + iota(all(x), n - c); + for (int ptr = c; int i : SA) if (i >= c) x[ptr++] = i - c; + + vector<int> cnt(k == 1 ? MAX_CHAR : n); + for (int i : P[k-1]) cnt[i]++; + partial_sum(all(cnt), begin(cnt)); + for (int i : x | views::reverse) SA[--cnt[P[k-1][i]]] = i; + + auto p = [&](int i) {return i < n ? P[k-1][i] : -1;}; + for (int i = 1; i < n; i++) { + int a = SA[i-1], b = SA[i]; + P[k][b] = P[k][a] + (p(a) != p(b) || p(a+c) != p(b+c)); + }} + for (int i = 1; i < n; i++) LCP[i] = lcp(SA[i-1], SA[i]); + } + + int lcp(int x, int y) {//x & y are text-indices, not SA-indices + if (x == y) return n - x; + int res = 0; + for (int i = sz(P) - 1; i >= 0 && max(x, y) + res < n; i--) { + if (P[i][x + res] == P[i][y + res]) res |= 1 << i; + } + return res; + } +}; diff --git a/content/string/suffixAutomaton.cpp b/content/string/suffixAutomaton.cpp new file mode 100644 index 0000000..9a68cb3 --- /dev/null +++ b/content/string/suffixAutomaton.cpp @@ -0,0 +1,63 @@ +constexpr int ALPHABET_SIZE = 26; +constexpr char OFFSET = 'a'; +struct SuffixAutomaton { + struct State { + int len, link = -1; + array<int, ALPHABET_SIZE> nxt; // map if large Alphabet + State(int l) : len(l) {fill(all(nxt), -1);} + }; + + vector<State> st = {State(0)}; + int cur = 0; + + SuffixAutomaton(const string& s) { + st.reserve(2 * sz(s)); + for (auto c : s) extend(c - OFFSET); + } + + void extend(int c) { + int p = cur; + cur = sz(st); + st.emplace_back(st[p].len + 1); + for (; p != -1 && st[p].nxt[c] < 0; p = st[p].link) { + st[p].nxt[c] = cur; + } + if (p == -1) { + st[cur].link = 0; + } else { + int q = st[p].nxt[c]; + if (st[p].len + 1 == st[q].len) { + st[cur].link = q; + } else { + st.emplace_back(st[p].len + 1); + st.back().link = st[q].link; + st.back().nxt = st[q].nxt; + for (; p != -1 && st[p].nxt[c] == q; p = st[p].link) { + st[p].nxt[c] = sz(st) - 1; + } + st[q].link = st[cur].link = sz(st) - 1; + }}} + + vector<int> calculateTerminals() { + vector<int> terminals; + for (int p = cur; p != -1; p = st[p].link) { + terminals.push_back(p); + } + return terminals; + } + + // Pair with start index (in t) and length of LCS. + pair<int, int> longestCommonSubstring(const string& t) { + int v = 0, l = 0, best = 0, bestp = -1; + for (int i = 0; i < sz(t); i++) { + int c = t[i] - OFFSET; + while (v > 0 && st[v].nxt[c] < 0) { + v = st[v].link; + l = st[v].len; + } + if (st[v].nxt[c] >= 0) v = st[v].nxt[c], l++; + if (l > best) best = l, bestp = i; + } + return {bestp - best + 1, best}; + } +}; diff --git a/content/string/suffixTree.cpp b/content/string/suffixTree.cpp new file mode 100644 index 0000000..7112f39 --- /dev/null +++ b/content/string/suffixTree.cpp @@ -0,0 +1,72 @@ +struct SuffixTree { + struct Vert { + int start, end, suf; //s[start...end) along parent edge + map<char, int> nxt; + }; + string s; + int needsSuffix, pos, remainder, curVert, curEdge, curLen; + // Each Vertex gives its children range as [start, end) + vector<Vert> tree = {Vert{-1, -1, 0, {}}}; + + SuffixTree(const string& s_) : s(s_) { + needsSuffix = remainder = curVert = curEdge = curLen = 0; + pos = -1; + for (int i = 0; i < sz(s); i++) extend(); + } + + int newVert(int start, int end) { + tree.push_back({start, end, 0, {}}); + return sz(tree) - 1; + } + + void addSuffixLink(int vert) { + if (needsSuffix) tree[needsSuffix].suf = vert; + needsSuffix = vert; + } + + bool fullImplicitEdge(int vert) { + int len = min(tree[vert].end, pos + 1) - tree[vert].start; + if (curLen >= len) { + curEdge += len; + curLen -= len; + curVert = vert; + return true; + } else { + return false; + }} + + void extend() { + pos++; + needsSuffix = 0; + remainder++; + while (remainder) { + if (curLen == 0) curEdge = pos; + if (!tree[curVert].nxt.count(s[curEdge])) { + int leaf = newVert(pos, sz(s)); + tree[curVert].nxt[s[curEdge]] = leaf; + addSuffixLink(curVert); + } else { + int nxt = tree[curVert].nxt[s[curEdge]]; + if (fullImplicitEdge(nxt)) continue; + if (s[tree[nxt].start + curLen] == s[pos]) { + curLen++; + addSuffixLink(curVert); + break; + } + int split = newVert(tree[nxt].start, + tree[nxt].start + curLen); + tree[curVert].nxt[s[curEdge]] = split; + int leaf = newVert(pos, sz(s)); + tree[split].nxt[s[pos]] = leaf; + tree[nxt].start += curLen; + tree[split].nxt[s[tree[nxt].start]] = nxt; + addSuffixLink(split); + } + remainder--; + if (curVert == 0 && curLen) { + curLen--; + curEdge = pos - remainder + 1; + } else { + curVert = tree[curVert].suf ? tree[curVert].suf : 0; + }}} +};
\ No newline at end of file diff --git a/content/string/trie.cpp b/content/string/trie.cpp new file mode 100644 index 0000000..03cf947 --- /dev/null +++ b/content/string/trie.cpp @@ -0,0 +1,35 @@ +// Zahlenwerte müssen bei 0 beginnen und zusammenhängend sein. +constexpr int ALPHABET_SIZE = 2; +struct node { + int words, ends; + array<int, ALPHABET_SIZE> nxt; + node() : words(0), ends(0) {fill(all(nxt), -1);} +}; +vector<node> trie = {node()}; + +int traverse(const vector<int>& word, int x) { + int id = 0; + for (int c : word) { + if (id < 0 || (trie[id].words == 0 && x <= 0)) return -1; + trie[id].words += x; + if (trie[id].nxt[c] < 0 && x > 0) { + trie[id].nxt[c] = sz(trie); + trie.emplace_back(); + } + id = trie[id].nxt[c]; + } + trie[id].words += x; + trie[id].ends += x; + return id; +} + +int insert(const vector<int>& word) { + return traverse(word, 1); +} + +bool erase(const vector<int>& word) { + int id = traverse(word, 0); + if (id < 0 || trie[id].ends <= 0) return false; + traverse(word, -1); + return true; +} diff --git a/content/string/z.cpp b/content/string/z.cpp new file mode 100644 index 0000000..069fa38 --- /dev/null +++ b/content/string/z.cpp @@ -0,0 +1,10 @@ +vector<int> Z(const string& s) { + int n = sz(s); + vector<int> z(n); + for (int i = 1, x = 0; i < n; i++) { + z[i] = max(0, min(z[i - x], x + z[x] - i)); + while (i + z[i] < n && s[z[i]] == s[i + z[i]]) { + x = i, z[i]++; + }} + return z; +} |
