diff options
| author | Paul Jungeblut <paul.jungeblut@gmail.com> | 2016-10-15 00:58:10 +0200 |
|---|---|---|
| committer | Paul Jungeblut <paul.jungeblut@gmail.com> | 2016-10-15 00:58:10 +0200 |
| commit | 53d83644c3bf9c37152aadee500e5e9bdb0514e1 (patch) | |
| tree | 0d25f424aad134a2061f48c991845cec1801fabd /string | |
| parent | e6d20a2e8f4044b12fc4fc814c11d9a54522cf3c (diff) | |
Adding code for a suffix automaton doing longest common substring queries in linear time.
Diffstat (limited to 'string')
| -rw-r--r-- | string/LCSubstring.cpp | 26 | ||||
| -rw-r--r-- | string/string.tex | 4 | ||||
| -rw-r--r-- | string/suffixArray.cpp | 61 | ||||
| -rw-r--r-- | string/suffixAutomaton.cpp | 62 |
4 files changed, 92 insertions, 61 deletions
diff --git a/string/LCSubstring.cpp b/string/LCSubstring.cpp deleted file mode 100644 index 69d636f..0000000 --- a/string/LCSubstring.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//longest common substring. -struct lcse { - int i = 0, s = 0; -}; -string lcp(string s[2]) { - if(s[0].length() == 0 || s[1].length() == 0) return ""; - vector<lcse> a(s[0].length()+s[1].length()); - for(int k = 0; k < a.size(); k++) a[k].i=(k < s[0].length() ? k : k - s[0].length()), a[k].s = (k < s[0].length() ? 0 : 1); - sort(a.begin(), a.end(), [&] (const lcse &u, const lcse &l) { - int ui = u.i, li = l.i; - while(ui < s[u.s].length() && li < s[l.s].length()) { - if(s[u.s][ui] < s[l.s][li]) return true; - else if(s[u.s][ui] > s[l.s][li]) return false; - ui++; li++; - } - return !(ui < s[u.s].length()); - }); - int r = 0, m=0, c=0; - for(int i = 0; i < a.size() - 1; i++) { - if(a[i].s == a[i+1].s) continue; - c = 0; - while(a[i].i+c < s[a[i].s].length() && a[i+1].i+c < s[a[i+1].s].length() && s[a[i].s][a[i].i+c] == s[a[i+1].s][a[i+1].i+c]) c++; - if(c > m) r=i, m=c; - } - return m == 0 ? "" : s[a[r].s].substr(a[r].i, m); -} diff --git a/string/string.tex b/string/string.tex index 16ad30a..854b05a 100644 --- a/string/string.tex +++ b/string/string.tex @@ -12,8 +12,8 @@ \subsection{Suffix-Array} \lstinputlisting{string/suffixArray.cpp} -\subsection{Longest Common Substring} -\lstinputlisting{string/LCSubstring.cpp} +\subsection{Suffix-Automaton} +\lstinputlisting{string/suffixAutomaton.cpp} \subsection{Longest Common Subsequence} \lstinputlisting{string/longestCommonSubsequence.cpp} diff --git a/string/suffixArray.cpp b/string/suffixArray.cpp index 73c7aff..17a10cf 100644 --- a/string/suffixArray.cpp +++ b/string/suffixArray.cpp @@ -1,36 +1,31 @@ -//longest common substring in one string (overlapping not excluded) -//contains suffix array:-------------------------------------------------------------------- -int cmp(string &s,vector<vector<int>> &v, int i, int vi, int u, int l) { - int vi2 = (vi + 1) % 2, u2 = u + i / 2, l2 = l + i / 2; - if(i == 1) return s[u] - s[l]; - else if (v[vi2][u] != v[vi2][l]) return (v[vi2][u] - v[vi2][l]); - else { //beide groesser tifft nicht mehr ein, da ansonsten vorher schon unterschied in Laenge - if(u2 >= s.length()) return -1; - else if(l2 >= s.length()) return 1; - else return v[vi2][u2] - v[vi2][l2]; - } -} +struct SuffixArray { // MAX_LG = ceil(log2(MAX_N)) + static const int MAX_N = 100010, MAX_LG = 17; + pair<pair<int, int>, int> L[MAX_N]; + int P[MAX_LG + 1][MAX_N], n, step, count; + int suffixArray[MAX_N], lcpArray[MAX_N]; -string lcsub(string s) { - if(s.length() == 0) return ""; - vector<int> a(s.length()); - vector<vector<int>> v(2, vector<int>(s.length(), 0)); - int vi = 0; - for(int k = 0; k < a.size(); k++) a[k] = k; - for(int i = 1; i <= s.length(); i *= 2, vi = (vi + 1) % 2) { - sort(a.begin(), a.end(), [&] (const int &u, const int &l) { - return cmp(s, v, i, vi, u, l) < 0; - }); - v[vi][a[0]] = 0; - for(int z = 1; z < a.size(); z++) v[vi][a[z]] = v[vi][a[z-1]] + (cmp(s, v, i, vi, a[z], a[z-1]) == 0 ? 0 : 1); - } -//------------------------------------------------------------------------------------------- - int r = 0, m=0, c=0; - for(int i = 0; i < a.size() - 1; i++) { - c = 0; - while(a[i]+c < s.length() && a[i+1]+c < s.length() && s[a[i]+c] == s[a[i+1]+c]) c++; - if(c > m) r=i, m=c; + SuffixArray(const string &s) : n(s.size()) { // Laufzeit: O(n*log^2(n)) + for (int i = 0; i < n; i++) P[0][i] = s[i]; + suffixArray[0] = 0; // Falls n == 1. + for (step = 1, count = 1; count < n; step++, count <<= 1) { + for (int i = 0; i < n; i++) L[i] = + {{P[step-1][i], i+count < n ? P[step-1][i+count] : -1}, i}; + sort(L, L + n); + for (int i = 0; i < n; i++) P[step][L[i].second] = i > 0 && + L[i].first == L[i-1].first ? P[step][L[i-1].second] : i; + } + for (int i = 0; i < n; i++) suffixArray[i] = L[i].second; + for (int i = 1; i < n; i++) + lcpArray[i] = lcp(suffixArray[i - 1], suffixArray[i]); } - return m == 0 ? "" : s.substr(a[r], m); -} + // x und y sind Indizes im String, nicht im Suffixarray. + int lcp(int x, int y) { // Laufzeit: O(log(n)) + int k, ret = 0; + if (x == y) return n - x; + for (k = step - 1; k >= 0 && x < n && y < n; k--) + if (P[k][x] == P[k][y]) + x += 1 << k, y += 1 << k, ret += 1 << k; + return ret; + } +}; diff --git a/string/suffixAutomaton.cpp b/string/suffixAutomaton.cpp new file mode 100644 index 0000000..4e8abcf --- /dev/null +++ b/string/suffixAutomaton.cpp @@ -0,0 +1,62 @@ +#define ALPHABET_SIZE 26 +struct SuffixAutomaton { + struct State { + int length; int link; int next[ALPHABET_SIZE]; + State() { memset(next, 0, sizeof(next)); } + }; + static const int MAX_N = 100000; // Maximale Länge des Strings. + State states[2 * MAX_N]; + int size, last; + + SuffixAutomaton(string &s) { // Laufzeit: O(|s|) + size = 1; last = 0; + states[0].length = 0; + states[0].link = -1; + for (auto c : s) extend(c); + } + + void extend(char c) { // Werte von c müssen bei 0 beginnen. + c -= 'a'; + int current = size++; + states[current].length = states[last].length + 1; + int pos = last; + while (pos != -1 && !states[pos].next[(int)c]) { + states[pos].next[(int)c] = current; + pos = states[pos].link; + } + if (pos == -1) states[current].link = 0; + else { + int q = states[pos].next[(int)c]; + if (states[pos].length + 1 == states[q].length) { + states[current].link = q; + } else { + int clone = size++; + states[clone].length = states[pos].length + 1; + states[clone].link = states[q].link; + memcpy(states[clone].next, states[q].next, + sizeof(states[q].next)); + while (pos != -1 && states[pos].next[(int)c] == q) { + states[pos].next[(int)c] = clone; + pos = states[pos].link; + } + states[q].link = states[current].link = clone; + }} + last = current; + } + + // Paar mit Startposition und Länge des LCS. Index in Parameter s. + ii longestCommonSubstring(string &s) { // Laufzeit: O(|s|) + int v = 0, l = 0, best = 0, bestpos = 0; + for (int i = 0; i < (int)s.size(); i++) { + int c = s[i] - 'a'; + while (v && !states[v].next[c]) { + v = states[v].link; + l = states[v].length; + } + if (states[v].next[c]) { v = states[v].next[c]; l++; } + if (l > best) { best = l; bestpos = i; } + } + return ii(bestpos - best + 1, best); + } +}; + |
