// net.oschina.htmlsucker.ContentExtractor (Maven / Gradle / Ivy)
//
// Go to download
package net.oschina.htmlsucker;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;

/**
 * Content-extraction algorithm.
 *
 * <p>Walks a parsed HTML tree, collects groups of sibling elements whose tag is one of
 * {@link #TEXT_TAGS}, and returns the HTML of the group with the longest combined text.
 */
public class ContentExtractor {

    /**
     * Tags that should be retained in the output. This list should be fairly minimal, and equivalent
     * to the list of tags that callers can be expected to be able to handle.
     *
     * <p>Stored as an unmodifiable hash set because it is only used for membership tests.
     */
    private static final Set<String> TEXT_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            "p", "b", "i", "u", "strong", "em", "span",
            "a", "pre", "code", "h1", "h2", "h3", "h4",
            "h5", "h6", "blockquote", "img", "hr", "br",
            "ul", "ol", "li", "embed", "table"/*,"section"*/
    )));

    /**
     * Extracts the main content of a document.
     *
     * <p>Note: this method mutates its argument — every {@code script} and {@code style}
     * element is removed from {@code body} before the search runs.
     *
     * @param body root element to search; must not be {@code null}
     * @return the outer HTML of the sibling group with the longest combined text, or an
     *         empty string if that group contains no elements
     */
    public static String dig(Element body) {
        // Remove nodes that carry no readable content.
        body.select("script").remove();
        body.select("style").remove();
        List<Elements> textNodes = findTextNode(body);
        // findTextNode always returns at least one group, so the Optional is never empty in
        // practice; orElse("") just avoids an unchecked get().
        return textNodes.stream()
                .max(Comparator.comparingInt((Elements group) -> group.text().length()))
                .map(Elements::outerHtml)
                .orElse("");
    }

    /**
     * Finds all content-node groups beneath an element.
     *
     * <p>For the given element, its direct children whose tag is in {@link #TEXT_TAGS} form one
     * group. Children with any other tag are searched recursively and contribute their own
     * groups. Children that are text tags are NOT descended into.
     *
     * @param element the element whose children are inspected
     * @return the groups found in the sub-tree, followed by this element's own (possibly empty)
     *         group as the last entry; never empty
     */
    private static List<Elements> findTextNode(Element element) {
        List<Elements> list = new ArrayList<>();
        // NOTE: an `element.isBlock()` guard around this body was commented out in the
        // original; every element is processed regardless of block/inline status.
        Elements elements = new Elements();
        for (Element child : element.children()) {
            // Locale.ROOT keeps the comparison independent of the JVM default locale
            // (e.g. Turkish, where "I" lower-cases to a dotless i).
            String nodeName = child.nodeName().toLowerCase(Locale.ROOT);
            if (TEXT_TAGS.contains(nodeName)) {
                elements.add(child);
            } else {
                list.addAll(findTextNode(child));
            }
        }
        list.add(elements);
        return list;
    }

}