I have an assigment where I get a set of string patterns T (max length 80) and a set of texts (max length 250) through the console. The goal is to use the Aho-Corasick algorithm to check which texts contain atleast one of the given patterns, then output all those that do in the order they were given. All strings used (patterns and texts) only contain printable ASCII characters, so the alphabet size is 95.
The user first enters an integer number n (max 1000) of patterns, then n patterns, and then an arbitrary number of text strings.
The course tutors gave us a trie implementation with some methods already created (such as a method that transitions between nodes in the Aho-Corasick DFA and one that computes the failure link for a node v). I have added a construct_slinks() method which goes through the trie after all the patterns are added to it and computes the failure links using their function, aswell as a search_string() function that walks along a text element (from S) using the aforementioned DFA transition method and should return true if a pattern is matched somewhere in the text.
So far, so good. All the test cases that I've checked do work using my code, but the automated checking system where I hand in the code still says "WRONG ANSWER", so I must be doing something wrong. The problem is that I'm not really familiar with Aho-Corasick yet and can't really see where I went wrong. I would be super glad if someone could look over my code or even find a test case that strictly doesnt return what it should. I have commented it as much as possible.
A pseudocode runthrough of the entire process would also help a lot!
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <cstring>
#include <queue>
using namespace std;
const int MAXN = 1000, NMAX = 80000, K = 95; //1000 patterns, 80 (pattern length)*1000 = 80000 maximal states, 95 printable chars in ASCII
int n; //Number of patterns
queue<int> vq;
struct vertex {
int next[K]; // child links
bool leaf; // marker that a word ends here
int p; // parent link
char pch; // next[pch] = current node
int link; // suffix link
int go[K]; // node to go to for given character in DFA
};
vertex t[NMAX+1]; // array nodes are stored in
int sz; // index of next free space for node
void init() { //initialize root node (0)
t[0].p = t[0].link = -1;
memset(t[0].next, 255, sizeof t[0].next);
memset(t[0].go, 255, sizeof t[0].go);
sz = 1;
}
int get_link(int v);
void add_string(const string &s) { //Add string to trie
int v = 0;
for (int i = 0; i < s.length(); i++) {
char c = s[i]-32; //-32 to skip first 31 unprintable ascii chars -> So space (ASCII 32) is 0 in array
if (t[v].next[c] == -1) {
memset(t[sz].next, 255, sizeof t[sz].next);
memset(t[sz].go, 255, sizeof t[sz].go);
t[sz].link = -1;
t[sz].p = v;
t[sz].pch = c;
vq.push(sz); //Add child to queue for suffix link construction
t[v].next[c] = sz++;
}
v = t[v].next[c]; //Pick newly created child as current node
}
t[v].leaf = true; //For the last node: Pattern ends here, so leaf = true.
}
int go(int v, char c);
int get_link(int v) { //Failure link
if (t[v].link == -1) //Link is not computed already
if (v == 0 || t[v].p == 0) //If root or parent is root -> Link = 0
t[v].link = 0;
else
t[v].link = go(get_link(t[v].p), t[v].pch+32); //Otherwise, follow failure link of the parent node using the char on the edge between. (+32 to balance out -32 in go method)
return t[v].link;
}
int go(int v, char c) { //Go to next node in DFA
c-=32; //-32 to skip the first 31 unprintable ASCII letters
if (t[v].go[c] == -1) { //Hasnt been computed yet - Compute it
if (t[v].next[c] != -1) { //Direct child with corresponding char -> Next node is the child node
t[v].go[c] = t[v].next[c];
}
else { //No direct child with corresponding char -> Follow failure link
t[v].go[c] = (v == 0) ? 0 : go(get_link(v), c+32); //c+=32 because we already subtract 32 from it at method start (balancing it out)
}
}
return t[v].go[c];
}
bool search_string(const string &s) { //Follow along the "go"-links
int v = 0;
for (char c : s) {
v = go(v,c);
if (t[v].leaf) { //Pattern ends at current node
return true;
}
}
return false;
}
void construct_slinks() { //Construct failure (suffix) links with a top-down approach using the queue we stacked
while (!vq.empty()) { //Run DFS on the trie, recursively calculate links
int curr = vq.front();
vq.pop();
int link = get_link(curr);
if (t[get_link(curr)].leaf)
t[curr].leaf = true; //If failure link of current node points to a leaf node (where a pattern ends), make current node a leaf too
}
}
int main() {
int n; cin >> n; cin.ignore(); //Read number of patterns
vector<string> trueones; //For output
init();
vq.push(0);
for (int i = 0; i < n; i++){
string s;
getline(cin, s);
add_string(s); //Build tree out of patterns
}
construct_slinks(); //Construct suffix links
string temp;
getline(cin, temp);
while ( temp != "") { //Read unknown number of strings and run search on them
if (search_string(temp))
trueones.push_back(temp); //if a pattern is in string "temp", push temp to trueones
getline(cin, temp);
}
for (auto x: trueones) { //Output all strings in trueones
cout << x << endl;
}
}
Aucun commentaire:
Enregistrer un commentaire