All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.washington.cs.knowitall.util.HtmlUtils Maven / Gradle / Ivy

There is a newer version: 1.4.3
Show newest version
package edu.washington.cs.knowitall.util;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;

public class HtmlUtils {

    private static HashSet removePatterns;
    private static HashSet breakPatterns;
    private static boolean initialized = false;

    private static final String[] breakTags = { "blockquote", "br", "center",
            "dd", "div", "dt", "fieldset", "h\\d", "hr", "img", "input",
            "isindex", "li", "noframes", "noscript", "p", "pre", "q", "table",
            "td", "textarea", "th", "xmp" };

    private static final String[] removeTags = { "applet", "form", "head",
            "iframe", "legend", "map", "object", "script", "select", "style",
            "title" };

    private static Pattern tag = Pattern.compile("<[^<]*?>");
    private static Pattern whiteSpace = Pattern.compile("\\s+");
    private static Pattern multiSpace = Pattern.compile("  +");
    private static Pattern multiBreaks = Pattern.compile("\n\n+");

    public static String removeHtml(String content) {

        if (!initialized)
            initPatterns();

        // Normalize whitespace
        content = whiteSpace.matcher(content).replaceAll(" ");

        // Remove and break text
        content = applyPatterns(removePatterns, content);
        content = applyPatterns(breakPatterns, content);

        // Remove tags
        content = tag.matcher(content).replaceAll("");

        // Escape HTML codes
        content = StringEscapeUtils.unescapeCsv(content);

        // Fix more whitespace
        content = multiSpace.matcher(content).replaceAll(" ");
        content = multiBreaks.matcher(content).replaceAll("\n");
        content = content.replace(';', '\n');

        return content;

    }

    public static void main(String[] args) throws Exception {
        BufferedReader in;
        if (args.length == 1) {
            in = new BufferedReader(new FileReader(args[0]));
        } else {
            in = new BufferedReader(new InputStreamReader(System.in));
        }
        StringBuffer sb = new StringBuffer();
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            sb.append(line);
        }
        System.out.println(removeHtml(sb.toString()));
    }

    private static String applyPatterns(HashSet patterns, String s) {
        for (Pattern pat : patterns) {
            s = pat.matcher(s).replaceAll("\n");
        }
        return s;
    }

    private static void initPatterns() {
        removePatterns = new HashSet();
        breakPatterns = new HashSet();
        for (int i = 0; i < removeTags.length; i++) {
            Pattern p = Pattern.compile("(?is)<" + removeTags[i]
                    + "[^<]*?>.*?");
            removePatterns.add(p);
            p = Pattern.compile("(?i)");
            breakPatterns.add(p);
        }
        for (int i = 0; i < breakTags.length; i++) {
            Pattern p = Pattern.compile("(?i)");
            breakPatterns.add(p);
        }
        initialized = true;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy