From 284f0798319fc1d945394d2f802e798356d0141d Mon Sep 17 00:00:00 2001 From: kittobi1992 Date: Sat, 22 Nov 2014 11:47:09 +0100 Subject: Create kmp.cpp --- string/kmp.cpp | 1 + 1 file changed, 1 insertion(+) create mode 100644 string/kmp.cpp (limited to 'string') diff --git a/string/kmp.cpp b/string/kmp.cpp new file mode 100644 index 0000000..7898192 --- /dev/null +++ b/string/kmp.cpp @@ -0,0 +1 @@ +a -- cgit v1.2.3 From b2859449d6facd0c78f59fd2ce6fdf651b4c8970 Mon Sep 17 00:00:00 2001 From: kittobi1992 Date: Sat, 22 Nov 2014 11:48:12 +0100 Subject: Update kmp.cpp Adding KMP-Search (Search after a Substring in a String) --- string/kmp.cpp | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'string') diff --git a/string/kmp.cpp b/string/kmp.cpp index 7898192..f7c3630 100644 --- a/string/kmp.cpp +++ b/string/kmp.cpp @@ -1 +1,35 @@ -a +#include +#include + +using namespace std; + +//Preprocessing Substring sub for KMP-Search +vector kmp_preprocessing(string& sub) { + vector b(sub.size() + 1); + b[0] = -1; + int i = 0, j = -1; + while(i < sub.size()) { + while(j >= 0 && sub[i] != sub[j]) + j = b[j]; + i++; j++; + b[i] = j; + } + return b; +} + +//Searching after Substring sub in s +vector kmp_search(string& s, string& sub) { + vector pre = kmp_preprocessing(sub); + vector result; + int i = 0, j = -1; + while(i < s.size()) { + while(j >= 0 && s[i] != sub[j]) + j = pre[j]; + i++; j++; + if(j == sub.size()) { + result.push_back(i-j); + j = pre[j]; + } + } + return result; +} -- cgit v1.2.3 From f1b3e645381d9b8ea8197fb1473f115de2ee8f96 Mon Sep 17 00:00:00 2001 From: Paul Jungeblut Date: Sat, 22 Nov 2014 13:03:04 +0100 Subject: adding string chapter --- datastructures/trie.cpp | 25 ------------------------- sonstiges/LCSub.cpp | 26 -------------------------- sonstiges/SufixArray.cpp | 36 ------------------------------------ string/LCSubstring.cpp | 26 ++++++++++++++++++++++++++ string/string.tex | 13 +++++++++++++ string/suffixArray.cpp | 36 ++++++++++++++++++++++++++++++++++++ string/trie.cpp | 25 +++++++++++++++++++++++++ tcr.pdf | Bin 297317 -> 306669 bytes tcr.tex | 1 + 9 files changed, 101 insertions(+), 87 deletions(-) delete mode 100644 datastructures/trie.cpp delete mode 100644 sonstiges/LCSub.cpp delete mode 100644 sonstiges/SufixArray.cpp create mode 100644 string/LCSubstring.cpp create mode 100644 string/string.tex create mode 100644 string/suffixArray.cpp create mode 100644 string/trie.cpp (limited to 'string') diff --git a/datastructures/trie.cpp b/datastructures/trie.cpp deleted file mode 100644 index 9cfcda5..0000000 --- a/datastructures/trie.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//nur für kleinbuchstaben! -struct node { - node *(e)[26]; - int c = 0;//anzahl der wörter die an dem node enden. - node() { for(int i = 0; i < 26; i++) e[i] = NULL; } -}; - -void insert(node *root, string *txt, int s) { - if(s >= txt->length()) root->c++; - else { - int idx = (int)((*txt).at(s) - 'a'); - if(root->e[idx] == NULL) { - root->e[idx] = new node(); - } - insert(root->e[idx], txt, s+1); - } -} - -int contains(node *root, string *txt, int s) { - if(s >= txt->length()) return root->c; - int idx = (int)((*txt).at(s) - 'a'); - if(root->e[idx] != NULL) { - return contains(root->e[idx], txt, s+1); - } else return 0; -} diff --git a/sonstiges/LCSub.cpp b/sonstiges/LCSub.cpp deleted file mode 100644 index 69d636f..0000000 --- a/sonstiges/LCSub.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//longest common substring. -struct lcse { - int i = 0, s = 0; -}; -string lcp(string s[2]) { - if(s[0].length() == 0 || s[1].length() == 0) return ""; - vector a(s[0].length()+s[1].length()); - for(int k = 0; k < a.size(); k++) a[k].i=(k < s[0].length() ? k : k - s[0].length()), a[k].s = (k < s[0].length() ? 0 : 1); - sort(a.begin(), a.end(), [&] (const lcse &u, const lcse &l) { - int ui = u.i, li = l.i; - while(ui < s[u.s].length() && li < s[l.s].length()) { - if(s[u.s][ui] < s[l.s][li]) return true; - else if(s[u.s][ui] > s[l.s][li]) return false; - ui++; li++; - } - return !(ui < s[u.s].length()); - }); - int r = 0, m=0, c=0; - for(int i = 0; i < a.size() - 1; i++) { - if(a[i].s == a[i+1].s) continue; - c = 0; - while(a[i].i+c < s[a[i].s].length() && a[i+1].i+c < s[a[i+1].s].length() && s[a[i].s][a[i].i+c] == s[a[i+1].s][a[i+1].i+c]) c++; - if(c > m) r=i, m=c; - } - return m == 0 ? "" : s[a[r].s].substr(a[r].i, m); -} diff --git a/sonstiges/SufixArray.cpp b/sonstiges/SufixArray.cpp deleted file mode 100644 index bb742f6..0000000 --- a/sonstiges/SufixArray.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//longest common substring in one string (overlapping not excluded) -//contains suffix array:--------------------------------------------------------------------------------------------------------- -int cmp(string &s,vector> &v, vector &a, int i, int vi, int u, int l) { - int vi2 = (vi + 1) % 2, u2 = u + i / 2, l2 = l + i / 2; - if(i == 1) return s[u] - s[l]; - else if (v[vi2][u] != v[vi2][l]) return (v[vi2][u] - v[vi2][l]); - else { //beide größer tifft nicht mehr ein, da ansonsten vorher schon unterschied in länge - if(u2 >= s.length()) return -1; - else if(l2 >= s.length()) return 1; - else return v[vi2][u2] - v[vi2][l2]; - } -} - -string lcsub(string s) { - if(s.length() == 0) return ""; - vector a(s.length()); - vector> v(2, vector(s.length(), 0)); - int vi = 0; - for(int k = 0; k < a.size(); k++) a[k] = k; - for(int i = 1; i < s.length(); i *= 2, vi = (vi + 1) % 2) { - sort(a.begin(), a.end(), [&] (const int &u, const int &l) { - return cmp(s, v, a, i, vi, u, l) < 0; - }); - v[vi][a[0]] = 0; - for(int z = 1; z < a.size(); z++) v[vi][a[z]] = v[vi][a[z-1]] + (cmp(s, v, a, i, vi, a[z], a[z-1]) == 0 ? 0 : 1); - } -//------------------------------------------------------------------------------------------------------------------------------- - int r = 0, m=0, c=0; - for(int i = 0; i < a.size() - 1; i++) { - c = 0; - while(a[i]+c < s.length() && a[i+1]+c < s.length() && s[a[i]+c] == s[a[i+1]+c]) c++; - if(c > m) r=i, m=c; - } - return m == 0 ? "" : s.substr(a[r], m); -} - diff --git a/string/LCSubstring.cpp b/string/LCSubstring.cpp new file mode 100644 index 0000000..69d636f --- /dev/null +++ b/string/LCSubstring.cpp @@ -0,0 +1,26 @@ +//longest common substring. +struct lcse { + int i = 0, s = 0; +}; +string lcp(string s[2]) { + if(s[0].length() == 0 || s[1].length() == 0) return ""; + vector a(s[0].length()+s[1].length()); + for(int k = 0; k < a.size(); k++) a[k].i=(k < s[0].length() ? k : k - s[0].length()), a[k].s = (k < s[0].length() ? 0 : 1); + sort(a.begin(), a.end(), [&] (const lcse &u, const lcse &l) { + int ui = u.i, li = l.i; + while(ui < s[u.s].length() && li < s[l.s].length()) { + if(s[u.s][ui] < s[l.s][li]) return true; + else if(s[u.s][ui] > s[l.s][li]) return false; + ui++; li++; + } + return !(ui < s[u.s].length()); + }); + int r = 0, m=0, c=0; + for(int i = 0; i < a.size() - 1; i++) { + if(a[i].s == a[i+1].s) continue; + c = 0; + while(a[i].i+c < s[a[i].s].length() && a[i+1].i+c < s[a[i+1].s].length() && s[a[i].s][a[i].i+c] == s[a[i+1].s][a[i+1].i+c]) c++; + if(c > m) r=i, m=c; + } + return m == 0 ? "" : s[a[r].s].substr(a[r].i, m); +} diff --git a/string/string.tex b/string/string.tex new file mode 100644 index 0000000..1335c4a --- /dev/null +++ b/string/string.tex @@ -0,0 +1,13 @@ +\section{Strings} + +\subsection{\textsc{Knuth-Morris-Pratt}-Algorithmus} +\lstinputlisting{string/kmp.cpp} + +\subsection{Trie} +\lstinputlisting{string/trie.cpp} + +\subsection{Suffix-Array} +\lstinputlisting{string/suffixArray.cpp} + +\subsection{Longest Common Substring} +\lstinputlisting{string/LCSubstring.cpp} diff --git a/string/suffixArray.cpp b/string/suffixArray.cpp new file mode 100644 index 0000000..2b2a87b --- /dev/null +++ b/string/suffixArray.cpp @@ -0,0 +1,36 @@ +//longest common substring in one string (overlapping not excluded) +//contains suffix array:-------------------------------------------------------------------- +int cmp(string &s,vector> &v, vector &a, int i, int vi, int u, int l) { + int vi2 = (vi + 1) % 2, u2 = u + i / 2, l2 = l + i / 2; + if(i == 1) return s[u] - s[l]; + else if (v[vi2][u] != v[vi2][l]) return (v[vi2][u] - v[vi2][l]); + else { //beide groesser tifft nicht mehr ein, da ansonsten vorher schon unterschied in Laenge + if(u2 >= s.length()) return -1; + else if(l2 >= s.length()) return 1; + else return v[vi2][u2] - v[vi2][l2]; + } +} + +string lcsub(string s) { + if(s.length() == 0) return ""; + vector a(s.length()); + vector> v(2, vector(s.length(), 0)); + int vi = 0; + for(int k = 0; k < a.size(); k++) a[k] = k; + for(int i = 1; i < s.length(); i *= 2, vi = (vi + 1) % 2) { + sort(a.begin(), a.end(), [&] (const int &u, const int &l) { + return cmp(s, v, a, i, vi, u, l) < 0; + }); + v[vi][a[0]] = 0; + for(int z = 1; z < a.size(); z++) v[vi][a[z]] = v[vi][a[z-1]] + (cmp(s, v, a, i, vi, a[z], a[z-1]) == 0 ? 0 : 1); + } +//------------------------------------------------------------------------------------------- + int r = 0, m=0, c=0; + for(int i = 0; i < a.size() - 1; i++) { + c = 0; + while(a[i]+c < s.length() && a[i+1]+c < s.length() && s[a[i]+c] == s[a[i+1]+c]) c++; + if(c > m) r=i, m=c; + } + return m == 0 ? "" : s.substr(a[r], m); +} + diff --git a/string/trie.cpp b/string/trie.cpp new file mode 100644 index 0000000..f4b979c --- /dev/null +++ b/string/trie.cpp @@ -0,0 +1,25 @@ +//nur fuer kleinbuchstaben! +struct node { + node *(e)[26]; + int c = 0;//anzahl der woerter die an dem node enden. + node() { for(int i = 0; i < 26; i++) e[i] = NULL; } +}; + +void insert(node *root, string *txt, int s) { + if(s >= txt->length()) root->c++; + else { + int idx = (int)((*txt).at(s) - 'a'); + if(root->e[idx] == NULL) { + root->e[idx] = new node(); + } + insert(root->e[idx], txt, s+1); + } +} + +int contains(node *root, string *txt, int s) { + if(s >= txt->length()) return root->c; + int idx = (int)((*txt).at(s) - 'a'); + if(root->e[idx] != NULL) { + return contains(root->e[idx], txt, s+1); + } else return 0; +} diff --git a/tcr.pdf b/tcr.pdf index c4e7d62..345a1a7 100644 Binary files a/tcr.pdf and b/tcr.pdf differ diff --git a/tcr.tex b/tcr.tex index 4dce95e..4328b73 100644 --- a/tcr.tex +++ b/tcr.tex @@ -56,6 +56,7 @@ \input{graph/graph} \input{geometry/geometry} \input{math/math} +\input{string/string} \input{sonstiges/sonstiges} \end{document} \ No newline at end of file -- cgit v1.2.3 From 6faad4725bebb99535ebb1d479d35c39e45120e1 Mon Sep 17 00:00:00 2001 From: JBatzill Date: Sat, 22 Nov 2014 13:38:26 +0100 Subject: optimized sorting, new runtime: O(n*log²(n)) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- string/suffixArray.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'string') diff --git a/string/suffixArray.cpp b/string/suffixArray.cpp index 2b2a87b..7c03d70 100644 --- a/string/suffixArray.cpp +++ b/string/suffixArray.cpp @@ -1,6 +1,6 @@ //longest common substring in one string (overlapping not excluded) //contains suffix array:-------------------------------------------------------------------- -int cmp(string &s,vector> &v, vector &a, int i, int vi, int u, int l) { +int cmp(string &s,vector> &v, int i, int vi, int u, int l) { int vi2 = (vi + 1) % 2, u2 = u + i / 2, l2 = l + i / 2; if(i == 1) return s[u] - s[l]; else if (v[vi2][u] != v[vi2][l]) return (v[vi2][u] - v[vi2][l]); @@ -19,10 +19,10 @@ string lcsub(string s) { for(int k = 0; k < a.size(); k++) a[k] = k; for(int i = 1; i < s.length(); i *= 2, vi = (vi + 1) % 2) { sort(a.begin(), a.end(), [&] (const int &u, const int &l) { - return cmp(s, v, a, i, vi, u, l) < 0; + return cmp(s, v, i, vi, u, l) < 0; }); v[vi][a[0]] = 0; - for(int z = 1; z < a.size(); z++) v[vi][a[z]] = v[vi][a[z-1]] + (cmp(s, v, a, i, vi, a[z], a[z-1]) == 0 ? 0 : 1); + for(int z = 1; z < a.size(); z++) v[vi][a[z]] = v[vi][a[z-1]] + (cmp(s, v, i, vi, a[z], a[z-1]) == 0 ? 0 : 1); } //------------------------------------------------------------------------------------------- int r = 0, m=0, c=0; -- cgit v1.2.3 From 6671028d5a0421a95fe313a04cabd8123fe312b6 Mon Sep 17 00:00:00 2001 From: JBatzill Date: Sat, 22 Nov 2014 15:25:21 +0100 Subject: Update suffixArray.cpp --- string/suffixArray.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'string') diff --git a/string/suffixArray.cpp b/string/suffixArray.cpp index 7c03d70..73c7aff 100644 --- a/string/suffixArray.cpp +++ b/string/suffixArray.cpp @@ -17,7 +17,7 @@ string lcsub(string s) { vector> v(2, vector(s.length(), 0)); int vi = 0; for(int k = 0; k < a.size(); k++) a[k] = k; - for(int i = 1; i < s.length(); i *= 2, vi = (vi + 1) % 2) { + for(int i = 1; i <= s.length(); i *= 2, vi = (vi + 1) % 2) { sort(a.begin(), a.end(), [&] (const int &u, const int &l) { return cmp(s, v, i, vi, u, l) < 0; }); -- cgit v1.2.3 From fd6adf295b528e36c54cd193dfda83f7d3acd257 Mon Sep 17 00:00:00 2001 From: JBatzill Date: Sat, 22 Nov 2014 16:44:14 +0100 Subject: Create LCSubSequence.cpp --- string/LCSubSequence.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 string/LCSubSequence.cpp (limited to 'string') diff --git a/string/LCSubSequence.cpp b/string/LCSubSequence.cpp new file mode 100644 index 0000000..0ea2913 --- /dev/null +++ b/string/LCSubSequence.cpp @@ -0,0 +1,17 @@ +string lcss(string &a, string &b) { + int m[a.length() + 1][b.length() + 1], x=0, y=0; + memset(m, 0, sizeof(m)); + for(int y = a.length() - 1; y >= 0; y--) { + for(int x = b.length() - 1; x >= 0; x--) { + if(a[y] == b[x]) m[y][x] = 1 + m[y+1][x+1]; + else m[y][x] = max(m[y+1][x], m[y][x+1]); + } + } //for length only: return m[0][0]; + string res; + while(x < b.length() && y < a.length()) { + if(a[y] == b[x]) res += a[y++], x++; + else if(m[y][x+1] > m[y+1][x+1]) x++; + else y++; + } + return res; +} -- cgit v1.2.3 From 803282f7b1d28c4d3ee534e11edd42b91ec6b437 Mon Sep 17 00:00:00 2001 From: JBatzill Date: Sat, 22 Nov 2014 16:45:22 +0100 Subject: added longest common subsequence --- string/string.tex | 3 +++ 1 file changed, 3 insertions(+) (limited to 'string') diff --git a/string/string.tex b/string/string.tex index 1335c4a..61385fc 100644 --- a/string/string.tex +++ b/string/string.tex @@ -11,3 +11,6 @@ \subsection{Longest Common Substring} \lstinputlisting{string/LCSubstring.cpp} + +\subsection{Longest Common Subsequence} +\lstinputlisting{string/LCSubSequence.cpp} -- cgit v1.2.3