diff options
Diffstat (limited to 'string')
| -rw-r--r-- | string/ahoCorasick.cpp | 103 | ||||
| -rw-r--r-- | string/deBruijn.cpp | 7 | ||||
| -rw-r--r-- | string/duval.cpp | 21 | ||||
| -rw-r--r-- | string/kmp.cpp | 44 | ||||
| -rw-r--r-- | string/longestCommonSubsequence.cpp | 14 | ||||
| -rw-r--r-- | string/lyndon.cpp | 11 | ||||
| -rw-r--r-- | string/manacher.cpp | 51 | ||||
| -rw-r--r-- | string/rollingHash.cpp | 32 | ||||
| -rw-r--r-- | string/string.tex | 142 | ||||
| -rw-r--r-- | string/suffixArray.cpp | 52 | ||||
| -rw-r--r-- | string/suffixAutomaton.cpp | 49 | ||||
| -rw-r--r-- | string/suffixTree.cpp | 151 | ||||
| -rw-r--r-- | string/trie.cpp | 44 | ||||
| -rw-r--r-- | string/z.cpp | 15 |
14 files changed, 434 insertions, 302 deletions
diff --git a/string/ahoCorasick.cpp b/string/ahoCorasick.cpp index d221fcb..530490e 100644 --- a/string/ahoCorasick.cpp +++ b/string/ahoCorasick.cpp @@ -1,57 +1,54 @@ -// Laufzeit: O(n + m + z), n = #Text, m = Summe #Pattern, z = #Matches -// Findet mehrere Patterns gleichzeitig in einem String. -// 1) Wurzel erstellen: aho.push_back(vertex()); -// 2) Mit addString(0, pattern, idx); Patterns hinzufügen. -// 3) finishAutomaton(0) aufrufen. -// 4) Mit state = go(state, c) in nächsten Zustand wechseln. -// DANACH: Wenn patterns-Vektor nicht leer ist: Hier enden alle -// enthaltenen Patterns. -// ACHTUNG: Die Zahlenwerte der auftretenden Buchstaben müssen -// zusammenhängend sein und bei 0 beginnen! -struct vertex { - int next[ALPHABET_SIZE], failure; - int character; - vector<int> patterns; // Indizes der Patterns, die hier enden. - vertex() { for (int i = 0; i < ALPHABET_SIZE; i++) next[i] = -1; } -}; -vector<vertex> aho; +constexpr ll ALPHABET_SIZE = 26; +constexpr char OFFSET = 26; +struct AhoCorasick { + struct vert { + int suffix, exit, character, parent; + vector<int> nxt, patterns; + vert(int c, int p) : suffix(-1), exit(-1), + character(c), nxt(ALPHABET_SIZE, -1), parent(p) {} + }; + vector<vert> aho; -void addString(int v, vector<int> &pattern, int patternIdx) { - for (int i = 0; i < (int)pattern.size(); i++) { - if (aho[v].next[pattern[i]] == -1) { - aho[v].next[pattern[i]] = aho.size(); - aho.push_back(vertex()); - aho.back().character = pattern[i]; - } - v = aho[v].next[pattern[i]]; - } - aho[v].patterns.push_back(patternIdx); -} + AhoCorasick() {aho.push_back(vert(-1, 0));} -void finishAutomaton(int v) { - for (int i = 0; i < ALPHABET_SIZE; i++) - if (aho[v].next[i] == -1) aho[v].next[i] = v; + // Call once for each pattern. + void addString(string &s, int patternIdx) { + int v = 0; + for (char c : s) { + int idx = c - OFFSET; + if (aho[v].nxt[idx] == -1) { + aho[v].nxt[idx] = sz(aho); + aho.emplace_back(idx, v); + } + v = aho[v].nxt[idx]; + } + aho[v].patterns.push_back(patternIdx); + } - queue<int> q; - for (int i = 0; i < ALPHABET_SIZE; i++) { - if (aho[v].next[i] != v) { - aho[aho[v].next[i]].failure = v; - q.push(aho[v].next[i]); - }} - while (!q.empty()) { - int r = q.front(); q.pop(); - for (int i = 0; i < ALPHABET_SIZE; i++) { - if (aho[r].next[i] != -1) { - q.push(aho[r].next[i]); - int f = aho[r].failure; - while (aho[f].next[i] == -1) f = aho[f].failure; - aho[aho[r].next[i]].failure = aho[f].next[i]; - for (int j = 0; j < (int)aho[aho[f].next[i]].patterns.size(); j++) { - aho[aho[r].next[i]].patterns.push_back( - aho[aho[f].next[i]].patterns[j]); -}}}}} + int getSuffix(int v) { + if (aho[v].suffix == -1) { + if (v == 0 || aho[v].parent == 0) aho[v].suffix = 0; + else aho[v].suffix = go(getSuffix(aho[v].parent), + aho[v].character); + } + return aho[v].suffix; + } -int go(int v, int c) { - if (aho[v].next[c] != -1) return aho[v].next[c]; - else return go(aho[v].failure, c); -} + int getExit(int v) { + if (aho[v].exit == -1) { + int suffix = getSuffix(v); + if (v == 0) aho[v].exit = 0; + else { + if (aho[suffix].patterns.empty()) { + aho[v].exit = getExit(suffix); + } else { + aho[v].exit = suffix; + }}} + return aho[v].exit; + } + + int go(int v, int idx) { // Root is v=0. + if (aho[v].nxt[idx] != -1) return aho[v].nxt[idx]; + else return v == 0 ? 0 : go(getSuffix(v), idx); + } +};
\ No newline at end of file diff --git a/string/deBruijn.cpp b/string/deBruijn.cpp new file mode 100644 index 0000000..e829137 --- /dev/null +++ b/string/deBruijn.cpp @@ -0,0 +1,7 @@ +string deBruijn(int n, char mi = '0', char ma = '1') { + string res, c(1, mi); + do { + if (n % sz(c) == 0) res += c; + } while(next(c, n, mi, ma)); + return res; +} diff --git a/string/duval.cpp b/string/duval.cpp new file mode 100644 index 0000000..6d80e95 --- /dev/null +++ b/string/duval.cpp @@ -0,0 +1,21 @@ +vector<pair<int, int>> duval(const string& s) { + vector<pair<int, int>> res; + for (int i = 0; i < sz(s);) { + int j = i + 1, k = i; + for (; j < sz(s) && s[k] <= s[j]; j++) { + if (s[k] < s[j]) k = i; + else k++; + } + while (i <= k) { + res.push_back({i, i + j - k}); + i += j - k; + }} + return res; +} + +int minrotation(const string& s) { + auto parts = duval(s+s); + for (auto e : parts) { + if (e.first < sz(s) && e.second >= sz(s)) { + return e.first; +}}} diff --git a/string/kmp.cpp b/string/kmp.cpp index 450b368..282019e 100644 --- a/string/kmp.cpp +++ b/string/kmp.cpp @@ -1,25 +1,23 @@ -// Laufzeit: O(n + m), n = #Text, m = #Pattern -vector<int> kmpPreprocessing(string &sub) { - vector<int> b(sub.length() + 1); - b[0] = -1; - int i = 0, j = -1; - while (i < (int)sub.length()) { - while (j >= 0 && sub[i] != sub[j]) j = b[j]; - i++; j++; - b[i] = j; - } - return b; +vector<int> kmpPreprocessing(const string& sub) { + vector<int> b(sub.size() + 1); + b[0] = -1; + int i = 0, j = -1; + while (i < (int)sub.size()) { + while (j >= 0 && sub[i] != sub[j]) j = b[j]; + i++; j++; + b[i] = j; + } + return b; } - -vector<int> kmpSearch(string &s, string &sub) { - vector<int> pre = kmpPreprocessing(sub), result; - int i = 0, j = 0; - while (i < (int)s.length()) { - while (j >= 0 && s[i] != sub[j]) j = pre[j]; - i++; j++; - if (j == (int)sub.length()) { - result.push_back(i - j); - j = pre[j]; - }} - return result; +vector<int> kmpSearch(const string& s, const string& sub) { + vector<int> pre = kmpPreprocessing(sub), result; + int i = 0, j = 0; + while (i < (int)s.size()) { + while (j >= 0 && s[i] != sub[j]) j = pre[j]; + i++; j++; + if (j == (int)sub.size()) { + result.push_back(i - j); + j = pre[j]; + }} + return result; } diff --git a/string/longestCommonSubsequence.cpp b/string/longestCommonSubsequence.cpp index 7bcf851..dd2368e 100644 --- a/string/longestCommonSubsequence.cpp +++ b/string/longestCommonSubsequence.cpp @@ -1,14 +1,12 @@ -// Laufzeit: O(|a|*|b|) -string lcss(string &a, string &b) { - int m[a.length() + 1][b.length() + 1], x=0, y=0; - memset(m, 0, sizeof(m)); - for(int y = a.length() - 1; y >= 0; y--) { - for(int x = b.length() - 1; x >= 0; x--) { +string lcss(string& a, string& b) { + vector<vector<int>> m(a.size() + 1, vector<int>(b.size() + 1)); + for(int y = a.size() - 1; y >= 0; y--) { + for(int x = b.size() - 1; x >= 0; x--) { if(a[y] == b[x]) m[y][x] = 1 + m[y+1][x+1]; else m[y][x] = max(m[y+1][x], m[y][x+1]); }} // Für die Länge: return m[0][0]; - string res; - while(x < b.length() && y < a.length()) { + string res; int x=0; int y=0; + while(x < b.size() && y < a.size()) { if(a[y] == b[x]) res += a[y++], x++; else if(m[y][x+1] > m[y+1][x+1]) x++; else y++; diff --git a/string/lyndon.cpp b/string/lyndon.cpp new file mode 100644 index 0000000..6a131a5 --- /dev/null +++ b/string/lyndon.cpp @@ -0,0 +1,11 @@ +bool next(string& s, int n, char mi = '0', char ma = '1') { + for (ll i = sz(s), j = sz(s); i < n; i++) + s.push_back(s[i % j]); + while(!s.empty() && s.back() == ma) s.pop_back(); + if (s.empty()) { + s = mi; + return false; + } else { + s.back()++; + return true; +}} diff --git a/string/manacher.cpp b/string/manacher.cpp index 8bd58fb..a1cf2da 100644 --- a/string/manacher.cpp +++ b/string/manacher.cpp @@ -1,30 +1,27 @@ -char input[MAX_N]; -char s[2 * MAX_N + 1]; -int longest[2 * MAX_N + 1]; +string a, b; //a needs to be set +vector<int> longest; -void setDots() { - s[0] = '.'; - int j = 1; - for (int i = 0; i < (int)strlen(input); i++) { - s[j++] = input[i]; - s[j++] = '.'; - } - s[j] = '\0'; -} +//transformes "aa" to ".a.a." to find even length palindromes +void init() { + b = string(sz(a) * 2 + 1, '.'); + longest.assign(sz(b), 0); + for (int i = 0; i < sz(a); i++) { + b[2 * i + 1] = a[i]; +}} void manacher() { - int center = 0, last = 0, n = strlen(s); - memset(longest, 0, sizeof(longest)); - - for (int i = 1; i < n - 1; i++) { - int i2 = 2 * center - i; - longest[i] = (last > i) ? min(last - i, longest[i2]) : 0; - while (i + longest[i] + 1 < n && i - longest[i] - 1 >= 0 && - s[i + longest[i] + 1] == s[i - longest[i] - 1]) longest[i]++; - if (i + longest[i] > last) { - center = i; - last = i + longest[i]; - } - } - for (int i = 0; i < n; i++) longest[i] = 2 * longest[i] + 1; -} + int center = 0, last = 0, n = sz(b); + for (int i = 1; i < n - 1; i++) { + int i2 = 2 * center - i; + longest[i] = (last > i) ? min(last - i, longest[i2]) : 0; + while (i + longest[i] + 1 < n && i - longest[i] - 1 >= 0 && + b[i + longest[i] + 1] == b[i - longest[i] - 1]) { + longest[i]++; + } + if (i + longest[i] > last) { + center = i; + last = i + longest[i]; + }} + //convert lengths to string b (optional) + for (int i = 0; i < n; i++) longest[i] = 2 * longest[i] + 1; +}
\ No newline at end of file diff --git a/string/rollingHash.cpp b/string/rollingHash.cpp index 0df4c87..2185207 100644 --- a/string/rollingHash.cpp +++ b/string/rollingHash.cpp @@ -1,19 +1,17 @@ -ll q = 31; // Größer als Alphabetgröße. q=31,53,311 +// q = 29, 53, 101, 257, 1009, 65537 +// or choose q random from [sigma, m) +// m = 1500000001, 1600000009, 1700000009 struct Hasher { - string s; - ll mod; - vector<ll> power, pref; - Hasher(const string& s, ll mod) : s(s), mod(mod) { - power.push_back(1); - for (int i = 1; i < (int)s.size(); i++) - power.push_back(power.back() * q % mod); - pref.push_back(0); - for (int i = 0; i < (int)s.size(); i++) - pref.push_back((pref.back() * q % mod + s[i]) % mod); - } + vector<ll> power = {1}, pref = {0}; + ll m, q; char c; + Hasher(const string& s, ll m, ll q, char c) : + m(m), q(q), c(c) { + for (char x : s) { + power.push_back(power.back() * q % m); + pref.push_back((pref.back() * q % m + (x - c)) % m); + }} - // Berechnet hash(s[l..r]). l,r inklusive. - ll hash(int l, int r) { - return (pref[r+1] - power[r-l+1] * pref[l] % mod + mod) % mod; - } -}; + ll hash(int l, int r) { // Berechnet hash(s[l..r)). + return (pref[r] - power[r-l] * pref[l] % m + m) % m; + } +};
\ No newline at end of file diff --git a/string/string.tex b/string/string.tex index e13b516..f426c61 100644 --- a/string/string.tex +++ b/string/string.tex @@ -1,48 +1,118 @@ \section{Strings} -\subsection{\textsc{Knuth-Morris-Pratt}-Algorithmus} -\lstinputlisting{string/kmp.cpp} +\begin{algorithm}{\textsc{Knuth-Morris-Pratt}-Algorithmus} + \begin{methods} + \method{kmpSearch}{sucht \code{sub} in \code{s}}{\abs{s}+\abs{sub}} + \end{methods} + \sourcecode{string/kmp.cpp} +\end{algorithm} -\subsection{\textsc{Aho-Corasick}-Automat} -\lstinputlisting{string/ahoCorasick.cpp} +\begin{algorithm}{Z-Algorithmus} + \begin{methods}[ll] + $z_i\coloneqq$ Längstes gemeinsames Präfix von $s_0\cdots s_{n-1}$ und $s_i\cdots s_{n-1}$ & \runtime{n} + \end{methods} + Suchen: Z-Algorithmus auf \code{P\$S} ausführen, Positionen mit $z_i=\abs{P}$ zurückgeben + \sourcecode{string/z.cpp} +\end{algorithm} -\subsection{Trie} -\lstinputlisting{string/trie.cpp} +\begin{algorithm}{Trie} + \sourcecode{string/trie.cpp} +\end{algorithm} -\subsection{Suffix-Baum} -\lstinputlisting{string/suffixTree.cpp} +\begin{algorithm}{Longest Common Subsequence} + \begin{methods} + \method{lcss}{findet längste gemeinsame Sequenz}{\abs{a}\*\abs{b}} + \end{methods} + \sourcecode{string/longestCommonSubsequence.cpp} +\end{algorithm} -\subsection{Suffix-Array} -\lstinputlisting{string/suffixArray.cpp} +\begin{algorithm}{\textsc{Manacher}'s Algorithm, Longest Palindrome} + \begin{methods} + \method{init}{transformiert \code{string a}}{n} + \method{manacher}{berechnet Länge der Palindrome}{n} + \end{methods} + \sourcecode{string/manacher.cpp} +\end{algorithm} -\subsection{Suffix-Automaton} -\lstinputlisting{string/suffixAutomaton.cpp} -\begin{itemize}[nosep] - \item \textbf{Ist \lstinline{w} Substring von \lstinline{s}?} - Baue Automaten für \lstinline{s} und wende ihn auf \lstinline{w} an. - Wenn alle Übergänge vorhanden sind, ist \lstinline{w} Substring von \lstinline{s}. +\begin{algorithm}{Rolling Hash} + \sourcecode{string/rollingHash.cpp} +\end{algorithm} - \item \textbf{Ist \lstinline{w} Suffix von \lstinline{s}?} - Wie oben. - Überprüfe am Ende, ob aktueller Zustand ein Terminal ist. +\begin{algorithm}{\textsc{Aho-Corasick}-Automat} + \begin{methods}[ll] + sucht patterns im Text & \runtime{\abs{Text}+\sum\abs{pattern}+\abs{matches}} + \end{methods} + \begin{enumerate} + \item mit \code{addString(pattern, idx);} Patterns hinzufügen. + \item mit \code{state = go(state, idx)} in nächsten Zustand wechseln. + \item mit \code{state = getExit(state)} den exit Kanten folgen bis \code{state == 0} + \item dabei mit \code{aho[state].patterns} die Matches zählen + \end{enumerate} + \sourcecode{string/ahoCorasick.cpp} +\end{algorithm} - \item \textbf{Anzahl verschiedener Substrings.} - Jeder Pfad im Automaten entspricht einem Substring. - Für einen Knoten ist die Anzahl der ausgehenden Pfade gleich der Summe über die Anzahlen der Kindknoten plus 1. - Der letzte Summand ist der Pfad, der in diesem Knoten endet. +\begin{algorithm}{Suffix-Baum} + \begin{methods} + \method{SuffixTree}{berechnet einen Suffixbaum}{\abs{s}} + \method{extend}{fügt den nächsten Buchstaben aus \code{s} ein}{1} + \end{methods} + \sourcecode{string/suffixTree.cpp} +\end{algorithm} - \item \textbf{Wie oft taucht \lstinline{w} in \lstinline{s} auf?} - Sei \lstinline{p} der Zustand nach Abarbeitung von \lstinline{w}. - Lösung ist Anzahl der Pfade, die in \lstinline{p} starten und in einem Terminal enden. - Diese Zahl lässt sich wie oben rekursiv berechnen. - Bei jedem Knoten darf nur dann plus 1 gerechnet werden, wenn es ein Terminal ist. -\end{itemize} +\begin{algorithm}{Suffix-Array} + \begin{methods} + \method{SuffixArray}{berechnet ein Suffix Array}{\abs{s}\*\log^2(\abs{s})} + \method{lcp}{berechnet den logest common prefix}{\log(\abs{s})} + \method{}{von \code{s[x]} und \code{s[y]}}{} + \end{methods} + \textbf{ACHTUNG:} \code{s} muss mit einem sentinel enden! \code{'\$'} oder \code{'#'} + \sourcecode{string/suffixArray.cpp} +\end{algorithm} -\subsection{Longest Common Subsequence} -\lstinputlisting{string/longestCommonSubsequence.cpp} +\begin{algorithm}{Suffix-Automaton} + \sourcecode{string/suffixAutomaton.cpp} + \begin{itemize} + \item \textbf{Ist \textit{w} Substring von \textit{s}?} + Baue Automaten für \textit{s} und wende ihn auf \textit{w} an. + Wenn alle Übergänge vorhanden sind, ist \textit{w} Substring von \textit{s}. + + \item \textbf{Ist \textit{w} Suffix von \textit{s}?} + Wie oben. + Überprüfe am Ende, ob aktueller Zustand ein Terminal ist. + + \item \textbf{Anzahl verschiedener Substrings.} + Jeder Pfad im Automaten entspricht einem Substring. + Für einen Knoten ist die Anzahl der ausgehenden Pfade gleich der Summe über die Anzahlen der Kindknoten plus 1. + Der letzte Summand ist der Pfad, der in diesem Knoten endet. + + \item \textbf{Wie oft taucht \textit{w} in \textit{s} auf?} + Sei \textit{p} der Zustand nach Abarbeitung von \textit{w}. + Lösung ist Anzahl der Pfade, die in \textit{p} starten und in einem Terminal enden. + Diese Zahl lässt sich wie oben rekursiv berechnen. + Bei jedem Knoten darf nur dann plus 1 gerechnet werden, wenn es ein Terminal ist. + \end{itemize} +\end{algorithm} -\subsection{Rolling Hash} -\lstinputlisting{string/rollingHash.cpp} - -\subsection{\textsc{Manacher}'s Algorithm, Longest Palindrome} -\lstinputlisting{string/manacher.cpp} +\begin{algorithm}{Lyndon und De-Bruijn} + \begin{itemize} + \item \textbf{Lyndon-Wort:} Ein Wort das lexikographisch kleiner ist als jede seiner Rotationen. + \item Jedes Wort kann \emph{eindeutig} in eine nicht ansteigende Folge von Lyndon-Worten zerlegt werden. + \item Für Lyndon-Worte $u, v$ mit $u<v$ gilt, dass $uv$ auch ein Lyndon-Wort ist. + \end{itemize} + \begin{methods} + \method[, Durchschnitt $\Theta(1)$]{next}{lexikographisch nächstes Lyndon-Wort}{n} + \method{duval}{zerlegt $s$ in Lyndon-Worte}{n} + \method{minrotation}{berechnet kleinste Rotation von $s$}{n} + \end{methods} + \sourcecode{string/lyndon.cpp} + \sourcecode{string/duval.cpp} + \begin{itemize} + \item \textbf{De-Bruijn-Sequenze $\boldsymbol{B(\Sigma, n)}$:}~ein Wort das jedes Wort der Länge $n$ genau einmal als substring enthält (und minimal ist). Wobei $B(\Sigma, n)$ zyklisch betrachtet wird. + \item es gibt $\frac{(k!)^{k^{n-1}}}{k^{n}}$ verschiedene $B(\Sigma, n)$ + \item $B(\Sigma, n)$ hat Länge $\abs{\Sigma}^n$ + \end{itemize} + \begin{methods} + \method{deBruijn}{berechnet ein festes $B(\Sigma, n)$}{\abs{\Sigma}^n} + \end{methods} + \sourcecode{string/deBruijn.cpp} +\end{algorithm} diff --git a/string/suffixArray.cpp b/string/suffixArray.cpp index 17a10cf..66b0ee9 100644 --- a/string/suffixArray.cpp +++ b/string/suffixArray.cpp @@ -1,31 +1,37 @@ -struct SuffixArray { // MAX_LG = ceil(log2(MAX_N)) - static const int MAX_N = 100010, MAX_LG = 17; - pair<pair<int, int>, int> L[MAX_N]; - int P[MAX_LG + 1][MAX_N], n, step, count; - int suffixArray[MAX_N], lcpArray[MAX_N]; +struct SuffixArray { + vector<pair<pair<int, int>, int>> L; + int n, step, count; + vector<vector<int>> P; + vector<int> SA, LCP; - SuffixArray(const string &s) : n(s.size()) { // Laufzeit: O(n*log^2(n)) + SuffixArray(const string& s) : n(sz(s)) { + SA.resize(n); LCP.resize(n); L.resize(n); + P.assign(ceil(log2(n)) + 1, vector<int>(n)); for (int i = 0; i < n; i++) P[0][i] = s[i]; - suffixArray[0] = 0; // Falls n == 1. - for (step = 1, count = 1; count < n; step++, count <<= 1) { - for (int i = 0; i < n; i++) L[i] = - {{P[step-1][i], i+count < n ? P[step-1][i+count] : -1}, i}; - sort(L, L + n); - for (int i = 0; i < n; i++) P[step][L[i].second] = i > 0 && - L[i].first == L[i-1].first ? P[step][L[i-1].second] : i; - } - for (int i = 0; i < n; i++) suffixArray[i] = L[i].second; - for (int i = 1; i < n; i++) - lcpArray[i] = lcp(suffixArray[i - 1], suffixArray[i]); + for (step = 1, count = 1; count < n; step++, count *= 2) { + for (int i = 0; i < n; i++) + L[i] = {{P[step-1][i], + i+count < n ? P[step-1][i+count] : -1}, i}; + sort(L.begin(), L.end()); + for (int i = 0; i < n; i++) { + P[step][L[i].second] = + i > 0 && L[i].first == L[i-1].first ? + P[step][L[i-1].second] : i; + }} + for (int i = 0; i < n; i++) SA[i] = L[i].second; + for (int i = 1; i < n; i++) LCP[i] = lcp(SA[i - 1], SA[i]); } - // x und y sind Indizes im String, nicht im Suffixarray. - int lcp(int x, int y) { // Laufzeit: O(log(n)) - int k, ret = 0; + // x and y are text-indices, not SA-indices. + int lcp(int x, int y) { + int ret = 0; if (x == y) return n - x; - for (k = step - 1; k >= 0 && x < n && y < n; k--) - if (P[k][x] == P[k][y]) - x += 1 << k, y += 1 << k, ret += 1 << k; + for (int k = step - 1; k >= 0 && x < n && y < n; k--) + if (P[k][x] == P[k][y]) { + x += 1 << k; + y += 1 << k; + ret += 1 << k; + } return ret; } }; diff --git a/string/suffixAutomaton.cpp b/string/suffixAutomaton.cpp index 7f4885c..15265c8 100644 --- a/string/suffixAutomaton.cpp +++ b/string/suffixAutomaton.cpp @@ -1,42 +1,42 @@ -#define ALPHABET_SIZE 26 +constexpr char MIN_CHAR = 'a'; +constexpr long long ALPHABET_SIZE = 26; struct SuffixAutomaton { struct State { - int length; int link; int next[ALPHABET_SIZE]; - State() { memset(next, 0, sizeof(next)); } + int length, link; + vector<int> next; + State() : next(ALPHABET_SIZE) {} }; - static const int MAX_N = 100000; // Maximale Länge des Strings. - State states[2 * MAX_N]; + vector<State> states; int size, last; - SuffixAutomaton(string &s) { // Laufzeit: O(|s|) + SuffixAutomaton(string &s) { + states.resize(2 * sz(s)); size = 1; last = 0; states[0].length = 0; states[0].link = -1; - for (auto c : s) extend(c); + for (auto c : s) extend(c - MIN_CHAR); } - void extend(char c) { - c -= 'a'; // Werte von c müssen bei 0 beginnen. + void extend(int c) { int current = size++; states[current].length = states[last].length + 1; int pos = last; - while (pos != -1 && !states[pos].next[(int)c]) { - states[pos].next[(int)c] = current; + while (pos != -1 && !states[pos].next[c]) { + states[pos].next[c] = current; pos = states[pos].link; } if (pos == -1) states[current].link = 0; else { - int q = states[pos].next[(int)c]; + int q = states[pos].next[c]; if (states[pos].length + 1 == states[q].length) { states[current].link = q; } else { int clone = size++; states[clone].length = states[pos].length + 1; states[clone].link = states[q].link; - memcpy(states[clone].next, states[q].next, - sizeof(states[q].next)); - while (pos != -1 && states[pos].next[(int)c] == q) { - states[pos].next[(int)c] = clone; + states[clone].next = states[q].next; + while (pos != -1 && states[pos].next[c] == q) { + states[pos].next[c] = clone; pos = states[pos].link; } states[q].link = states[current].link = clone; @@ -44,22 +44,23 @@ struct SuffixAutomaton { last = current; } - // Paar mit Startposition und Länge des LCS. Index in Parameter s. - ii longestCommonSubstring(string &s) { // Laufzeit: O(|s|) + // Pair with start index and length of LCS. + // Index in parameter t. + pair<int, int> longestCommonSubstring(string &t) { int v = 0, l = 0, best = 0, bestpos = 0; - for (int i = 0; i < (int)s.size(); i++) { - int c = s[i] - 'a'; + for (int i = 0; i < (int)t.size(); i++) { + int c = t[i] - MIN_CHAR; while (v && !states[v].next[c]) { v = states[v].link; l = states[v].length; } - if (states[v].next[c]) { v = states[v].next[c]; l++; } - if (l > best) { best = l; bestpos = i; } + if (states[v].next[c]) {v = states[v].next[c]; l++;} + if (l > best) {best = l; bestpos = i;} } - return ii(bestpos - best + 1, best); + return {bestpos - best + 1, best}; } - // Berechnet die Terminale des Automaten. + // Returns all terminals of the automaton. vector<int> calculateTerminals() { vector<int> terminals; int pos = last; diff --git a/string/suffixTree.cpp b/string/suffixTree.cpp index fe065b2..f96992e 100644 --- a/string/suffixTree.cpp +++ b/string/suffixTree.cpp @@ -1,78 +1,83 @@ -// Baut Suffixbaum online auf. Laufzeit: O(n) -// Einmal initSuffixTree() aufrufen und dann extend für jeden Buchstaben. -// '\0'-Zeichen (oder ähnliches) an den Text anhängen! -string s; -int root, lastIdx, needsSuffix, pos, remainder, curVert, curEdge, curLen; -struct Vert { - int start, end, suffix; // Kante [start,end) - map<char, int> next; - int len() { return min(end, pos + 1) - start; } -}; -vector<Vert> tree; +struct SuffixTree { + struct Vert { + int start, end, suffix; + map<char, int> next; + }; + string s; + int root, lastIdx, needsSuffix, pos, remainder; + int curVert, curEdge, curLen; + // Each Vertex gives its children range as [start, end) + vector<Vert> tree; -int newVert(int start, int end) { - Vert v; - v.start = start; - v.end = end; - v.suffix = 0; - tree.push_back(v); - return ++lastIdx; -} + SuffixTree(string& s) : s(s) { + needsSuffix = remainder = curEdge = curLen = 0; + lastIdx = pos = -1; + root = curVert = newVert(-1, -1); + for (int i = 0; i < sz(s); i++) extend(); + } -void addSuffixLink(int vert) { - if (needsSuffix) tree[needsSuffix].suffix = vert; - needsSuffix = vert; -} + int newVert(int start, int end) { + Vert v; + v.start = start; + v.end = end; + v.suffix = 0; + tree.push_back(v); + return ++lastIdx; + } -bool fullImplicitEdge(int vert) { - if (curLen >= tree[vert].len()) { - curEdge += tree[vert].len(); - curLen -= tree[vert].len(); - curVert = vert; - return true; - } - return false; -} + int len(Vert& v) { + return min(v.end, pos + 1) - v.start; + } -void initSuffixTree() { - needsSuffix = remainder = curEdge = curLen = 0; - lastIdx = pos = -1; - root = curVert = newVert(-1, -1); -} + void addSuffixLink(int vert) { + if (needsSuffix) tree[needsSuffix].suffix = vert; + needsSuffix = vert; + } -void extend() { - pos++; - needsSuffix = 0; - remainder++; - while (remainder) { - if (curLen == 0) curEdge = pos; - if (!tree[curVert].next.count(s[curEdge])) { - int leaf = newVert(pos, s.size()); - tree[curVert].next[s[curEdge]] = leaf; - tree[curVert].next[s[curEdge]] = leaf; - addSuffixLink(curVert); - } else { - int nxt = tree[curVert].next[s[curEdge]]; - if (fullImplicitEdge(nxt)) continue; - if (s[tree[nxt].start + curLen] == s[pos]) { - curLen++; - addSuffixLink(curVert); - break; - } - int split = newVert(tree[nxt].start, tree[nxt].start + curLen); - tree[curVert].next[s[curEdge]] = split; - int leaf = newVert(pos, s.size()); - tree[split].next[s[pos]] = leaf; - tree[nxt].start += curLen; - tree[split].next[s[tree[nxt].start]] = nxt; - addSuffixLink(split); - } - remainder--; - if (curVert == root && curLen) { - curLen--; - curEdge = pos - remainder + 1; - } else { - curVert = tree[curVert].suffix ? tree[curVert].suffix : root; - } - } -} + bool fullImplicitEdge(int vert) { + if (curLen >= len(tree[vert])) { + curEdge += len(tree[vert]); + curLen -= len(tree[vert]); + curVert = vert; + return true; + } + return false; + } + + void extend() { + pos++; + needsSuffix = 0; + remainder++; + while (remainder) { + if (curLen == 0) curEdge = pos; + if (!tree[curVert].next.count(s[curEdge])) { + int leaf = newVert(pos, sz(s)); + tree[curVert].next[s[curEdge]] = leaf; + tree[curVert].next[s[curEdge]] = leaf; + addSuffixLink(curVert); + } else { + int nxt = tree[curVert].next[s[curEdge]]; + if (fullImplicitEdge(nxt)) continue; + if (s[tree[nxt].start + curLen] == s[pos]) { + curLen++; + addSuffixLink(curVert); + break; + } + int split = newVert(tree[nxt].start, + tree[nxt].start + curLen); + tree[curVert].next[s[curEdge]] = split; + int leaf = newVert(pos, sz(s)); + tree[split].next[s[pos]] = leaf; + tree[nxt].start += curLen; + tree[split].next[s[tree[nxt].start]] = nxt; + addSuffixLink(split); + } + remainder--; + if (curVert == root && curLen) { + curLen--; + curEdge = pos - remainder + 1; + } else { + curVert = tree[curVert].suffix ? tree[curVert].suffix + : root; + }}} +};
\ No newline at end of file diff --git a/string/trie.cpp b/string/trie.cpp index fa9ec49..f112e1e 100644 --- a/string/trie.cpp +++ b/string/trie.cpp @@ -1,25 +1,33 @@ // Zahlenwerte müssen bei 0 beginnen und zusammenhängend sein. +constexpr int ALPHABET_SIZE = 2; struct node { - int children[ALPHABET_SIZE], c; // c = #Wörter, die hier enden. - node () { - idx = -1; - for (int i = 0; i < ALPHABET_SIZE; i++) children[i] = -1; - } + int words, wordEnds; vector<int> children; + node() : words(0), wordEnds(0), children(ALPHABET_SIZE, -1){} }; -vector<node> trie; // Anlegen mit trie.push_back(node()); +vector<node> trie = {node()}; -void insert(int vert, vector<int> &txt, int s) { // Laufzeit: O(|txt|) - if (s == (int)txt.size()) { trie[vert].c++; return; } - if (trie[vert].children[txt[s]] == -1) { - trie[vert].children[txt[s]] = trie.size(); - trie.push_back(node()); - } - insert(trie[vert].children[txt[s]], txt, s + 1); +int insert(vector<int>& word) { + int id = 0; + for (int c : word) { + trie[id].words++; + if (trie[id].children[c] < 0) { + trie[id].children[c] = trie.size(); + trie.emplace_back(); + } + id = trie[id].children[c]; + } + trie[id].words++; + trie[id].wordEnds++; + return id; } -int contains(int vert, vector<int> &txt, int s) { // Laufzeit: O(|txt|) - if (s == (int)txt.size()) return trie[vert].c; - if (trie[vert].children[txt[s]] != -1) { - return contains(trie[vert].children[txt[s]], txt, s + 1); - } else return 0; +void erase(vector<int>& word) { + int id = 0; + for (int c : word) { + trie[id].words--; + id = trie[id].children[c]; + if (id < 0) return; + } + trie[id].words--; + trie[id].wordEnds--; } diff --git a/string/z.cpp b/string/z.cpp new file mode 100644 index 0000000..dbd5ce5 --- /dev/null +++ b/string/z.cpp @@ -0,0 +1,15 @@ +vector<int> Z(const vector<int> &s) { + int n = sz(s); + vector<int> z(n, n); + int l = 0, r = 0, p, q; + for (int i = 1; i < n; ++i) { + if (i <= r && z[i - l] < r - i + 1) { + z[i] = z[i - l]; + } else { + if (i > r) p = 0, q = i; + else p = r - i + 1, q = r + 1; + while (q < n && s[p] == s[q]) ++p, ++q; + z[i] = q - i, l = i, r = q - 1; + }} + return z; +} |
