c++
#include <iostream>
#include <string>
using namespace std;
int getLongestCommonPartSize(string &pat, int len) {
// For each sub string, we have [0, end-1-i], [1+i, end]
for (int i = 0; i < len - 1; i++) {
//check if prefix[m]==suffix[m]
int m = 0;
int suffixLen = len - 1 - i;
for (; m < suffixLen; m++) {
if (pat[m] != pat[1 + i + m]) {
break;
}
}
if (m == suffixLen) {
// match
return m;
}
}
return 0;
}
void KMPSearch(string &pat, string &txt) {
//construct suffix table
//suffix table has the same size as pat
//ababa
//0 a
//0 ab
//1 aba
//2 abab
int suffix[pat.size()];
suffix[0] = -1;
// we totally have pat.size()-1 sub string
//i is the size of substring
for (int len = 1; len < pat.size(); len++) {
suffix[len] = getLongestCommonPartSize(pat, len);
}
//check if pat[j]==txt[i]
int j = 0, i = 0;
while (1) {
//break if one of the pointers overflow
//if match, then both pointer++
//else: check if can move(big's pointer+small's size==big's size)
// move pat to where fails
//set pat's p as suffix[p]
if (i >= txt.size()) {
break;
}
if (j >= pat.size()) {
cout << i - pat.size() << endl;
j = 0;
}
if (pat[j] == txt[i]) {
j++;
i++;
} else {
// check can move
if (i + pat.size() == txt.size()) {
break;
}
if (suffix[j] == -1) {
i++;
j = 0;
} else {
j = suffix[j];
}
}
}
if (j >= pat.size()) {
cout << i - pat.size() << endl;
}
}
int main() {
string txt = "ABABDABACDABABCABAB";
string pat = "ABABCABAB";
KMPSearch(pat, txt);
return 0;
}
The size of pat is M.
The process of build suffix table is O(M^2), we need to optimize it.
There are ways of implementing the loop.
One way is:
c++
//if matched then increase both pointers and set table
//else go back to find previous matched to start
for(int len=0, i=1;i<s.size();){
if (s[len] == s[i]) {
len++;
back[i] = len;
i++;
} else {
if (len==0) {
i++;
} else {
len = back[len - 1];
}
}
}
Another is:
c++
//loop util find a character matched
for(int len=0, i=1;i<s.size();){
while(len>=1&&s[i]!=s[len]){
len=back[len-1];
}
if(len<1&&s[i]!=s[len]) {
i++;
}else{
//matched
len++;
back[i]=len;
i++;
}
}
The prefix and suffix are like two eras before and after, all conditions are the same, but at a certain point in time, the latter era B is destroyed. Therefore, era A, at that point, needs to make a different move than B to possibly succeed.
Era A = [Common Part + Different Part] Era B = [Common Part]
Well, may be the DFA(deterministic finite automaton) can give me another angle to understand KMP.