design-patterns: Filter HTML Code out of Website (cache it) and load it into WebView (RegEx)

dimanche 18 octobre 2015

Filter HTML Code out of Website (cache it) and load it into WebView (RegEx)

I am relatively new to programming in Java and Android, so I wanted to ask you guys for a simple and understandable way of filtering two tables and their h3 headings out of this website, possibly even caching the result, and loading it into a transparent WebView so it doesn't look like a website. I thought of using RegEx. I want to do this so the content stays up to date without my having to maintain it. By "simple and understandable" I mean code with comments that possibly also points out which names are just variable names, method names or other custom names, plus many explanations. Of course you can also just dump the code in there; that would also work, but I probably could not understand all of it. ;)

Here's some code I tried:

package com.mrousavy.gemeindemuckendorfwipfing;

import android.os.AsyncTask;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that download a web page and pull pieces out of its HTML
 * with regular expressions.
 *
 * All patterns are compiled once as constants; {@link Pattern} instances are
 * immutable, so they can be shared between calls.
 *
 * Created by Marc on 15.10.2015.
 */
public class Table {

    /** Prefix put in front of relative links found in the scraped HTML. */
    private static final String BASE_URL = "http://ift.tt/1OBVOll";

    /** Separator used by {@link #listToString(List)} and {@link #stringToList(String)}. */
    private static final String SEPARATOR = "§";

    /** Captures everything between an opening h3 tag and the next closing table tag. */
    private static final Pattern SECTION_PATTERN =
            Pattern.compile("<h3>(.*?)</table>", Pattern.DOTALL);

    /** Captures the path of a link that points below "./files/". */
    private static final Pattern FILE_LINK_PATTERN =
            Pattern.compile("<a href=\"\\./files/(.*?)\"");

    /** Captures the first news item: from the "Aktuelle Meldungen" heading to the first hr. */
    private static final Pattern FIRST_NEWS_PATTERN =
            Pattern.compile("<h3><strong>Aktuelle Meldungen</strong></h3>(.*?)<hr />");

    /**
     * Captures the text between two hr tags. The closing hr is only looked at
     * (lookahead), not consumed, so it can also open the next item; consuming
     * it would make the matcher skip every second item.
     */
    private static final Pattern NEWS_PATTERN =
            Pattern.compile("<hr />(.*?)(?=<hr />)");

    /** Captures the inner text of an h4 heading. */
    private static final Pattern NEWS_HEADER_PATTERN = Pattern.compile("<h4>(.*?)</h4>");

    /** Captures the value of an href attribute. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"(.*?)\"");

    /**
     * Captures the inside of the div with id "content".
     * NOTE(review): the nested quantifiers can backtrack heavily on large
     * pages; left unchanged because its exact matching behaviour is relied on.
     */
    private static final Pattern CONTENT_PATTERN =
            Pattern.compile("<div id=\"content\">((.*?(<div.*?<\\/div>)*.*?)*)<\\/div>");

    /**
     * Checks whether a HEAD request to the given address answers with HTTP 200.
     * (Originally found on Stack Overflow.)
     *
     * Performs a network request, so it blocks until the server answers.
     *
     * @param url address to probe
     * @return {@code true} if the response code is 200; {@code false} for any
     *         other code or if anything goes wrong (bad URL, no connection, ...)
     */
    public static boolean exists2(String url) {
        HttpURLConnection connection = null;
        try {
            URL u = new URL(url);
            connection = (HttpURLConnection) u.openConnection();
            connection.setRequestMethod("HEAD");
            connection.connect();
            return connection.getResponseCode() == HttpURLConnection.HTTP_OK;
        } catch (Exception ex) {
            return false;
        } finally {
            // Release the connection whether or not the request succeeded.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Downloads the page at the given address and returns it as one string.
     * Must NOT be called in the main thread!
     *
     * The page is read line by line and the lines are joined WITHOUT line
     * terminators, so the result is a single line of HTML. The other methods
     * of this class rely on that because most of their patterns do not match
     * across line breaks.
     *
     * NOTE(review): the bytes are decoded with the platform default charset.
     *
     * @param url address of the page to download
     * @return the HTML of the page, or {@code null} if the download failed
     *         (the stack trace is printed in that case)
     * @throws Exception declared for compatibility with existing callers;
     *         failures are caught and reported by returning {@code null}
     */
    public static String getHTML2(String url) throws Exception {
        BufferedReader in = null;
        try {
            URL u = new URL(url);
            in = new BufferedReader(new InputStreamReader(u.openStream()));
            // StringBuilder avoids copying the whole page again for every line.
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Collects the file links found in the "h3 ... table" sections of a page.
     *
     * For every section between an opening h3 tag and the following closing
     * table tag, the first link pointing below "./files/" is taken. Each path
     * is returned only once, in the order of first appearance.
     *
     * @param html page source, e.g. the result of {@link #getHTML2(String)};
     *             may be {@code null}
     * @return the link paths (the part after "./files/"); empty if
     *         {@code html} is {@code null} or nothing matches
     * @throws Exception declared for compatibility with existing callers
     */
    public static List<String> getUrlsFromHTML2(String html) throws Exception {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }

        // search for the sections
        Matcher sections = SECTION_PATTERN.matcher(html);
        while (sections.find()) {
            // search for the first file link inside the section
            String section = sections.group(1);
            Matcher link = FILE_LINK_PATTERN.matcher(section);
            if (link.find()) {
                String path = link.group(1);
                if (!urls.contains(path)) {
                    urls.add(path);
                }
            }
        }
        return urls;
    }

    /**
     * Extracts the news items from a page.
     *
     * The first item is the text between the "Aktuelle Meldungen" heading and
     * the first hr tag; every further item is the text between two hr tags.
     * A piece of text only becomes a {@code News} if it contains an h4
     * heading, which is used as its title.
     *
     * @param html page source; may be {@code null}
     * @return the news items in page order; empty if {@code html} is
     *         {@code null} or nothing matches
     */
    public static List<News> getNewsFromHTML(String html) {
        List<News> ret = new ArrayList<News>();
        if (html == null) {
            return ret;
        }

        Matcher firstNews = FIRST_NEWS_PATTERN.matcher(html);
        if (firstNews.find()) {
            addNews(ret, firstNews.group(1));
        }

        Matcher news = NEWS_PATTERN.matcher(html);
        while (news.find()) {
            addNews(ret, news.group(1));
        }

        return ret;
    }

    /**
     * Turns one raw HTML fragment into a {@code News} and appends it to the
     * list, provided the fragment contains an h4 heading.
     *
     * @param target     list to append to
     * @param rawContent HTML fragment of one news item
     */
    private static void addNews(List<News> target, String rawContent) {
        String content = absolutizeLinks(rawContent);
        Matcher header = NEWS_HEADER_PATTERN.matcher(content);
        if (header.find()) {
            // Title = heading text without nested tags and without
            // four-digit numeric character references such as "&#8211;".
            String title = header.group(1)
                    .replaceAll("<(.*?)>", "")
                    .replaceAll("&#\\d{4};", "");
            target.add(new News(title, content));
        }
    }

    /**
     * Rewrites relative links in an HTML fragment so they start with
     * {@link #BASE_URL}: every "./" is replaced by the base address, and
     * every href value that contains no "/" at all gets the base address
     * put in front of it.
     *
     * @param rawContent HTML fragment
     * @return the fragment with rewritten links
     */
    private static String absolutizeLinks(String rawContent) {
        String content = rawContent.replace("./", BASE_URL);
        // The matcher keeps scanning the string as it was at this point;
        // the replacements below create new strings and do not disturb it.
        Matcher href = HREF_PATTERN.matcher(content);
        while (href.find()) {
            String url = href.group(1);
            if (!url.contains("/")) {
                content = content.replace(
                        "href=\"" + url + "\"",
                        "href=\"" + BASE_URL + url + "\"");
            }
        }
        return content;
    }

    /**
     * Joins the entries of a list into one string, separated by "§".
     *
     * @param list entries to join
     * @return the joined string; the empty string for an empty list
     */
    public static String listToString(List<String> list) {
        StringBuilder ret = new StringBuilder();
        boolean first = true;
        for (String str : list) {
            if (!first) {
                ret.append(SEPARATOR);
            }
            ret.append(str);
            first = false;
        }
        return ret.toString();
    }

    /**
     * Splits a string at every "§" into a list. Parts that are empty or
     * consist only of whitespace are left out.
     *
     * @param str string to split; may be {@code null}
     * @return the non-blank parts in order; empty if {@code str} is
     *         {@code null} or has no non-blank part
     */
    public static List<String> stringToList(String str) {
        List<String> ret = new ArrayList<String>();
        if (str == null) {
            return ret;
        }
        for (String s : str.split(SEPARATOR)) {
            if (!s.trim().equals("")) {
                ret.add(s);
            }
        }
        return ret;
    }

    /**
     * Extracts the inside of the div with id "content" from a page and
     * rewrites its relative links so they start with the base address.
     *
     * @param html page source; may be {@code null}
     * @return the content HTML, or the empty string if {@code html} is
     *         {@code null} or the div is not found
     */
    public static String extractContentFromHTML(String html) {
        if (html == null) {
            return "";
        }
        Matcher match = CONTENT_PATTERN.matcher(html);
        if (match.find()) {
            return absolutizeLinks(match.group(1));
        }
        return "";
    }

}

I hope someone can help me out! :)

Thank you! ^^

design-patterns

dimanche 18 octobre 2015

Filter HTML Code out of Website (cache it) and load it into WebView (RegEx)

Aucun commentaire:

Enregistrer un commentaire