vendredi 12 juillet 2019

Checking for a set of texts S whether they contain atleast one element of pattern set T using the Aho-Corasick algorithm

I have an assigment where I get a set of string patterns T (max length 80) and a set of texts (max length 250) through the console. The goal is to use the Aho-Corasick algorithm to check which texts contain atleast one of the given patterns, then output all those that do in the order they were given. All strings used (patterns and texts) only contain printable ASCII characters, so the alphabet size is 95.

The user first enters an integer number n (max 1000) of patterns, then n patterns, and then an arbitrary number of text strings.

The course tutors gave us a trie implementation with some methods already created (such as a method that transitions between nodes in the Aho-Corasick DFA and one that computes the failure link for a node v). I have added a construct_slinks() method which goes through the trie after all the patterns are added to it and computes the failure links using their function, aswell as a search_string() function that walks along a text element (from S) using the aforementioned DFA transition method and should return true if a pattern is matched somewhere in the text.

So far, so good. All the test cases that I've checked do work using my code, but the automated checking system where I hand in the code still says "WRONG ANSWER", so I must be doing something wrong. The problem is that I'm not really familiar with Aho-Corasick yet and can't really see where I went wrong. I would be super glad if someone could look over my code or even find a test case that strictly doesnt return what it should. I have commented it as much as possible.

A pseudocode runthrough of the entire process would also help a lot!

#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <cstring>
#include <queue>
using namespace std;
const int MAXN = 1000, NMAX = 80000, K = 95; //1000 patterns, 80 (pattern length)*1000 = 80000 maximal states, 95 printable chars in ASCII
int n;  //Number of patterns
queue<int> vq;

struct vertex {
  int next[K]; // child links
  bool leaf;   // marker that a word ends here
  int p;       // parent link
  char pch;    // next[pch] = current node
  int link;    // suffix link
  int go[K];   // node to go to for given character in DFA
};

vertex t[NMAX+1]; // array nodes are stored in
int sz;           // index of next free space for node 

void init() {                               //initialize root node (0)
  t[0].p = t[0].link = -1;
  memset(t[0].next, 255, sizeof t[0].next);
  memset(t[0].go, 255, sizeof t[0].go);
  sz = 1;
}

int get_link(int v);

void add_string(const string &s) {          //Add string to trie
  int v = 0;
  for (int i = 0; i < s.length(); i++) {
    char c = s[i]-32;                               //-32 to skip first 31 unprintable ascii chars -> So space (ASCII 32) is 0 in array
    if (t[v].next[c] == -1) {
      memset(t[sz].next, 255, sizeof t[sz].next);
      memset(t[sz].go, 255, sizeof t[sz].go);
      t[sz].link = -1;
      t[sz].p = v;
      t[sz].pch = c;
      vq.push(sz);                                  //Add child to queue for suffix link construction
      t[v].next[c] = sz++;
    }
    v = t[v].next[c];                               //Pick newly created child as current node
  }
  t[v].leaf = true;                                 //For the last node: Pattern ends here, so leaf = true.
}

int go(int v, char c);

int get_link(int v) {                       //Failure link
  if (t[v].link == -1)                      //Link is not computed already
    if (v == 0 || t[v].p == 0)              //If root or parent is root -> Link = 0
      t[v].link = 0;
    else
      t[v].link = go(get_link(t[v].p), t[v].pch+32);    //Otherwise, follow failure link of the parent node using the char on the edge between. (+32 to balance out -32 in go method)
  return t[v].link;
}

int go(int v, char c) {                     //Go to next node in DFA
    c-=32;                                  //-32 to skip the first 31 unprintable ASCII letters
    if (t[v].go[c] == -1) {                 //Hasnt been computed yet - Compute it
            if (t[v].next[c] != -1) {       //Direct child with corresponding char -> Next node is the child node
                t[v].go[c] = t[v].next[c];
        }
            else {                          //No direct child with corresponding char -> Follow failure link
                t[v].go[c] = (v == 0) ? 0 : go(get_link(v), c+32);  //c+=32 because we already subtract 32 from it at method start (balancing it out)
        }
    }
    return t[v].go[c];
}

bool search_string(const string &s) {       //Follow along the "go"-links
    int v = 0;
    for (char c : s) {
        v = go(v,c);
        if (t[v].leaf) { //Pattern ends at current node
            return true;
        }
    }
    return false;
}

void construct_slinks() {   //Construct failure (suffix) links with a top-down approach using the queue we stacked
    while (!vq.empty()) {   //Run DFS on the trie, recursively calculate links
        int curr = vq.front();
        vq.pop();
        int link = get_link(curr);
        if (t[get_link(curr)].leaf)
            t[curr].leaf = true; //If failure link of current node points to a leaf node (where a pattern ends), make current node a leaf too
    }
}

int main() {
    int n; cin >> n; cin.ignore();  //Read number of patterns
    vector<string> trueones;        //For output
    init();
    vq.push(0);
    for (int i = 0; i < n; i++){
        string s;
        getline(cin, s);
        add_string(s);              //Build tree out of patterns
    }

    construct_slinks();             //Construct suffix links
    string temp;
    getline(cin, temp);
    while ( temp != "") {           //Read unknown number of strings and run search on them
        if (search_string(temp))
            trueones.push_back(temp);   //if a pattern is in string "temp", push temp to trueones
        getline(cin, temp);
    }

    for (auto x: trueones) {        //Output all strings in trueones
        cout << x << endl;
    }

}

Aucun commentaire:

Enregistrer un commentaire