Tuesday, October 6, 2020

Is this a good design pattern in JavaScript, using OOP, for web scraping?

const axios = require('axios');
const fs = require('fs');
const { JSDOM } = require('jsdom');
class Scrapper {
    // Top-level domains whose addresses are treated as suspicious
    static dangerDomains = ['loan', 'work', 'biz', 'racing', 'ooo', 'life', 'ltd', 'png'];
    // Pattern used to pull email addresses out of raw HTML
    static emailRegex = /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi;
    // Shared state: the most recently parsed DOM and the links collected from it
    static domObject = null;
    static internalLinks = new Set();
    static externalLinks = new Set();

    constructor(url) {
        this.url = url;
        this.emails = new Set();
        this.dangerEmails = new Set();
    }

Execute the scraper

    async executeScrapper() {
        // Scrape the start page, collect its internal links, then scrape those pages too
        const startPageEmails = await this.fetchUrlAndGetDom(this.url);
        this.addEmails(startPageEmails || []);
        this.getInternalLinks();
        const linksArray = Array.from(Scrapper.internalLinks);
        await this.fetchInternalLinks(linksArray);
    }

Fetch all the internal links to look for more email addresses on the internal pages

    async fetchInternalLinks(internalLinks) {
        const results = await Promise.all(internalLinks.map((link) => {
            return this.fetchUrlAndGetDom(`${this.url}${link}`);
        }));
        // Drop pages with no matches, then flatten the per-page arrays into one list
        const emails = results.filter(Boolean).flat();
        this.addEmails(emails);
    }
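
For reference, here is a small standalone sketch of that gather-and-flatten step on its own, using made-up page results instead of real network calls:

    // Hypothetical per-page results: each entry is the email array for one page, or null
    const pageResults = [['a@example.com'], null, ['b@example.com', 'a@example.com']];
    // Drop failed pages, then flatten into a single list of addresses
    const emails = pageResults.filter(Boolean).flat();
    console.log(emails); // ['a@example.com', 'b@example.com', 'a@example.com']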

Fetch the URL and create a new JSDOM object to extract the data

    async fetchUrlAndGetDom(url) {
        try {
            const response = await axios(url);
            if (response.status === 200) {
                Scrapper.domObject = new JSDOM(response.data);
                return this.searchForEmails();
            }
        } catch (error) {
            // axios rejects the promise for non-2xx statuses by default, so a 404 lands here
            if (error.response && error.response.status === 404) {
                console.log("this page doesn't exist");
            } else {
                console.log(error.message);
            }
            return process.exit(1);
        }
    }
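
As a point of comparison, a minimal self-contained fetch-and-parse sketch (the URL is only a placeholder); because axios rejects for non-2xx statuses by default, errors are handled in the catch block rather than with status checks:

    const axios = require('axios');
    const { JSDOM } = require('jsdom');

    async function fetchDom(url) {
        try {
            const response = await axios.get(url); // rejects on 4xx/5xx by default
            return new JSDOM(response.data);       // the JSDOM constructor is synchronous
        } catch (error) {
            console.error(`Failed to fetch ${url}:`, error.message);
            return null;
        }
    }

    fetchDom('https://example.com').then(dom => {
        if (dom) console.log(dom.window.document.title);
    });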

Extract all the internal links and external links of the current page

    getInternalLinks() {
        const anchors = Scrapper.domObject.window.document.body.querySelectorAll('a[href^="/"]');
        anchors.forEach(link => Scrapper.internalLinks.add(link.getAttribute('href')));
    }
    getExternalLinks() {
        const anchors = Scrapper.domObject.window.document.body.querySelectorAll('a[href^="http"]');
        anchors.forEach(link => Scrapper.externalLinks.add(link.getAttribute('href')));
    }
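
A small self-contained illustration of the same selectors, run against an inline HTML string instead of a fetched page:

    const { JSDOM } = require('jsdom');

    const dom = new JSDOM(`<body>
        <a href="/about">About</a>
        <a href="/contact">Contact</a>
        <a href="https://example.org">Partner</a>
    </body>`);

    const doc = dom.window.document;
    const internal = [...doc.querySelectorAll('a[href^="/"]')].map(a => a.getAttribute('href'));
    const external = [...doc.querySelectorAll('a[href^="http"]')].map(a => a.getAttribute('href'));
    console.log(internal); // ['/about', '/contact']
    console.log(external); // ['https://example.org']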

Use a regex to find all the emails

    searchForEmails() {
        const html = Scrapper.domObject.window.document.body.innerHTML;
        return html.match(Scrapper.emailRegex);
    }
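
For illustration, the same regex applied to an invented snippet of markup:

    const emailRegex = /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi;
    const html = '<p>Write to info@example.com or jobs@example.racing</p>';
    console.log(html.match(emailRegex)); // ['info@example.com', 'jobs@example.racing']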

Validate the domain of each email to see if it's in the danger list

    validateEmails() {
        Array.from(this.emails).forEach(email => {
            // The TLD is the last dot-separated segment of the part after '@'
            const tld = email.split('@')[1].split('.').pop();
            if (Scrapper.dangerDomains.includes(tld)) {
                this.dangerEmails.add(email);
                this.emails.delete(email);
            }
        });
    }
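
Here is the same domain check as a standalone sketch, on invented addresses:

    const dangerDomains = ['loan', 'work', 'biz', 'racing', 'ooo', 'life', 'ltd', 'png'];

    function isDangerous(email) {
        // Take the part after '@', then its last dot-separated segment (the TLD)
        const tld = email.split('@')[1].split('.').pop();
        return dangerDomains.includes(tld);
    }

    console.log(isDangerous('sales@shop.biz'));   // true
    console.log(isDangerous('info@example.com')); // false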

Write all the emails in a file

    async writeFile() {
        return new Promise((resolve, reject) => {
            // fs.writeFile needs a string, so join the Set into one address per line
            fs.writeFile('./emails.txt', Array.from(this.emails).join('\n'), err => {
                if (err) return reject('Could not write file');
                resolve('success');
            });
        });
    }
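
The same step could also use the built-in promise API, which avoids wrapping the callback by hand; a sketch writing to the same ./emails.txt path:

    const fs = require('fs');

    async function writeEmails(emails) {
        // fs.promises.writeFile expects a string or Buffer, so join the Set first
        await fs.promises.writeFile('./emails.txt', Array.from(emails).join('\n'));
        return 'success';
    }

    writeEmails(new Set(['info@example.com'])).then(console.log).catch(console.error);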

Add emails to the set

    addEmails(emailsInDom) {
        emailsInDom.forEach(email => this.emails.add(email));
    }
}
module.exports = Scrapper; 
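
For completeness, a rough sketch of how the exported class might be driven, assuming the code above is saved as scrapper.js and with a placeholder target URL:

    const Scrapper = require('./scrapper');

    (async () => {
        const scrapper = new Scrapper('https://example.com');
        await scrapper.executeScrapper(); // crawl the start page and its internal links
        scrapper.validateEmails();        // move suspicious domains into dangerEmails
        await scrapper.writeFile();       // persist the remaining addresses
        console.log(`Found ${scrapper.emails.size} emails, flagged ${scrapper.dangerEmails.size}`);
    })();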
