All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.github.xbynet.crawler.parser.JsoupParser Maven / Gradle / Ivy

The newest version!
package com.github.xbynet.crawler.parser;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.xbynet.crawler.Const;

/**
 * {@link Parser} implementation that extracts values from an HTML string by
 * running jsoup CSS selectors against it.
 *
 * <p>The raw HTML is parsed once, in the constructor; every query method then
 * evaluates its selector against the document returned by {@link #getDoc()}.
 *
 * <p>Methods taking an {@code attrName} interpret it as follows (the three
 * keywords are compared case-insensitively):
 * <ul>
 * <li>{@code null} - the element's outer HTML</li>
 * <li>{@code "innerHtml"} - the element's inner HTML</li>
 * <li>{@code "text"} - only the text nodes that are direct children of the
 * element (see {@link #getText(Element)})</li>
 * <li>{@code "allText"} - the result of {@code Element.text()}</li>
 * <li>anything else - the result of {@code Element.attr(attrName)}</li>
 * </ul>
 */
public class JsoupParser implements Parser {
	private static final Logger log = LoggerFactory
			.getLogger(JsoupParser.class);

	/** Warning logged when a selector matches nothing; {} is the selector. */
	private static final String NO_MATCH_MSG = "所选元素不存在{}";

	private final Document doc;

	/**
	 * Parses the given HTML so it can be queried.
	 *
	 * @param raw the HTML text, handed to {@code Jsoup.parse(String)}
	 */
	public JsoupParser(String raw) {
		doc = Jsoup.parse(raw);
	}

	/**
	 * Returns the outer HTML of the first element matching the selector.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @return the outer HTML, or {@code null} (after logging a warning) when
	 *         nothing matches
	 */
	public String single(String cssSelector) {
		return firstValue(cssSelector, null);
	}

	/**
	 * Returns a value taken from the first element matching the selector.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @param attrName    what to extract; see the class description
	 * @return the extracted value, or {@code null} (after logging a warning)
	 *         when nothing matches
	 */
	public String single(String cssSelector, String attrName) {
		return firstValue(cssSelector, attrName);
	}

	/**
	 * Returns the outer HTML of every element matching the selector.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @return a new list of {@code String}s in match order; empty (after
	 *         logging a warning) when nothing matches
	 */
	public List list(String cssSelector) {
		return allValues(cssSelector, null);
	}

	/**
	 * Returns a value taken from every element matching the selector.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @param attrName    what to extract; see the class description
	 * @return a new list of {@code String}s in match order; empty (after
	 *         logging a warning) when nothing matches
	 */
	public List list(String cssSelector, String attrName) {
		return allValues(cssSelector, attrName);
	}

	/**
	 * Runs the selector once and returns the first match, logging a warning
	 * when there is none.
	 */
	private Element firstMatch(String cssSelector) {
		Elements matches = getDoc().select(cssSelector);
		if (matches == null || matches.isEmpty()) {
			log.warn(NO_MATCH_MSG, cssSelector);
			return null;
		}
		return matches.get(0);
	}

	/** Extracts {@code attrName} from the first match, or returns null. */
	private String firstValue(String cssSelector, String attrName) {
		Element first = firstMatch(cssSelector);
		return first == null ? null : getValue(first, attrName);
	}

	/** Extracts {@code attrName} from every match into a fresh list. */
	private List<String> allValues(String cssSelector, String attrName) {
		List<String> values = new ArrayList<String>();
		Elements matches = getDoc().select(cssSelector);
		if (matches == null || matches.isEmpty()) {
			log.warn(NO_MATCH_MSG, cssSelector);
			return values;
		}
		for (Element match : matches) {
			values.add(getValue(match, attrName));
		}
		return values;
	}

	/**
	 * Maps {@code attrName} to the matching extraction on {@code element};
	 * the keyword table is given in the class description.
	 */
	private String getValue(Element element, String attrName) {
		if (attrName == null) {
			return element.outerHtml();
		} else if ("innerHtml".equalsIgnoreCase(attrName)) {
			return element.html();
		} else if ("text".equalsIgnoreCase(attrName)) {
			return getText(element);
		} else if ("allText".equalsIgnoreCase(attrName)) {
			return element.text();
		} else {
			// Not a keyword: treat it as a plain attribute name.
			return element.attr(attrName);
		}
	}

	/**
	 * Concatenates the text of the element's direct child text nodes.
	 * Children that are not {@link TextNode}s (for example nested elements)
	 * are skipped, so their text is not included.
	 *
	 * @param element the element whose own text is wanted
	 * @return the concatenated text; empty when there are no child text nodes
	 */
	protected String getText(Element element) {
		StringBuilder accum = new StringBuilder();
		for (Node node : element.childNodes()) {
			if (node instanceof TextNode) {
				accum.append(((TextNode) node).text());
			}
		}
		return accum.toString();
	}

	/**
	 * Returns the first element matching the selector.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @return the first match, or {@code null} (after logging a warning) when
	 *         nothing matches
	 */
	public Element element(String cssSelector) {
		return firstMatch(cssSelector);
	}

	/**
	 * Returns every element matching the selector. Unlike the other query
	 * methods this one does not log when the result is empty.
	 *
	 * @param cssSelector jsoup CSS selector
	 * @return the result of {@code Document.select(cssSelector)}
	 */
	public Elements elements(String cssSelector) {
		return getDoc().select(cssSelector);
	}

	/**
	 * Shorthand for {@link #single(String, String)} with
	 * {@code Const.CssAttr.innerHtml.name()} as the attribute name
	 * (presumably the {@code "innerHtml"} keyword, i.e. the inner HTML of the
	 * first match - confirm against {@code Const.CssAttr}).
	 *
	 * @param cssSelector jsoup CSS selector, typically targeting a script tag
	 * @return the extracted value, or {@code null} when nothing matches
	 */
	public String script(String cssSelector) {
		return single(cssSelector, Const.CssAttr.innerHtml.name());
	}

	/**
	 * Shorthand for {@link #list(String, String)} with
	 * {@code Const.CssAttr.innerHtml.name()} as the attribute name.
	 *
	 * @param cssSelector jsoup CSS selector, typically targeting script tags
	 * @return a list of {@code String}s; empty when nothing matches
	 */
	public List scripts(String cssSelector) {
		return list(cssSelector, Const.CssAttr.innerHtml.name());
	}

	/**
	 * Returns the parsed document that all queries run against.
	 *
	 * @return the document built in the constructor
	 */
	public Document getDoc() {
		return doc;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy