com.xuxueli.crawler.util.JsoupUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of xxl-crawler Show documentation
Show all versions of xxl-crawler Show documentation
A distributed web crawler framework.
The newest version!
package com.xuxueli.crawler.util;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.model.PageRequest;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.Set;
/**
 * Jsoup helper: fetches pages described by a {@link PageRequest} and extracts
 * data (field values, links, image addresses) from parsed documents.
 *
 * @author xuxueli 2015-05-14 22:44:43
 */
public class JsoupUtil {
    private static final Logger logger = LoggerFactory.getLogger(JsoupUtil.class);

    /**
     * Loads a page and parses it into a {@link Document}.
     *
     * @param pageRequest request description (url, params, cookies, headers, proxy, ...)
     * @return the parsed document, or {@code null} when the request is {@code null},
     *         its url is rejected by {@code UrlUtil.isUrl}, or the request fails with
     *         an {@link IOException} (the exception is logged)
     */
    public static Document load(PageRequest pageRequest) {
        if (pageRequest == null || !UrlUtil.isUrl(pageRequest.getUrl())) {
            return null;
        }
        try {
            Connection conn = buildConnection(pageRequest);
            // send the request
            return pageRequest.isIfPost() ? conn.post() : conn.get();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Loads a page and returns the raw response body without parsing it.
     * The response content type is ignored, so non-HTML bodies are returned too.
     *
     * @param pageRequest request description (url, params, cookies, headers, proxy, ...)
     * @return the response body, or {@code null} when the request is {@code null},
     *         its url is rejected by {@code UrlUtil.isUrl}, or the request fails with
     *         an {@link IOException} (the exception is logged)
     */
    public static String loadPageSource(PageRequest pageRequest) {
        if (pageRequest == null || !UrlUtil.isUrl(pageRequest.getUrl())) {
            return null;
        }
        try {
            Connection conn = buildConnection(pageRequest);
            conn.ignoreContentType(true);
            conn.method(pageRequest.isIfPost() ? Connection.Method.POST : Connection.Method.GET);
            // send the request
            Connection.Response resp = conn.execute();
            return resp.body();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Builds a jsoup connection carrying every setting shared by {@link #load} and
     * {@link #loadPageSource}: params, cookies, headers, user agent, referrer,
     * timeout, TLS handling, body size limit and proxy.
     */
    private static Connection buildConnection(PageRequest pageRequest) {
        Connection conn = Jsoup.connect(pageRequest.getUrl());
        if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
            conn.data(pageRequest.getParamMap());
        }
        if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
            conn.cookies(pageRequest.getCookieMap());
        }
        if (pageRequest.getHeaderMap() != null && !pageRequest.getHeaderMap().isEmpty()) {
            conn.headers(pageRequest.getHeaderMap());
        }
        if (pageRequest.getUserAgent() != null) {
            conn.userAgent(pageRequest.getUserAgent());
        }
        if (pageRequest.getReferrer() != null) {
            conn.referrer(pageRequest.getReferrer());
        }
        conn.timeout(pageRequest.getTimeoutMillis());
        // Install the trust-all socket factory only when the caller explicitly turned
        // certificate validation OFF; otherwise keep the JVM's default validation.
        if (!pageRequest.isValidateTLSCertificates()) {
            conn.sslSocketFactory(generateSSLSocketFactory());
        }
        conn.maxBodySize(0); // lift the default 1M body size limit
        // proxy
        if (pageRequest.getProxy() != null) {
            conn.proxy(pageRequest.getProxy());
        }
        return conn;
    }

    /**
     * Creates a socket factory whose trust manager accepts every certificate chain.
     * This disables TLS certificate validation and must only be used when the caller
     * asked for validation to be skipped.
     *
     * @return a trust-all {@link SSLSocketFactory}
     * @throws RuntimeException if the SSL context cannot be created or initialised
     */
    private static SSLSocketFactory generateSSLSocketFactory() {
        TrustManager[] trustAllCerts = new TrustManager[]{new X509TrustManager() {
            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }

            @Override
            public void checkClientTrusted(X509Certificate[] certs, String authType) {
                // intentionally empty: every client chain is accepted
            }

            @Override
            public void checkServerTrusted(X509Certificate[] certs, String authType) {
                // intentionally empty: every server chain is accepted
            }
        }};
        try {
            // "TLS" is a protocol name every Java platform implementation must support.
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
            return sslContext.getSocketFactory();
        } catch (KeyManagementException | NoSuchAlgorithmException e) {
            throw new RuntimeException("Failed to create a SSL socket factory", e);
        }
    }

    /**
     * Extracts a value from an element according to the select type.
     *
     * @param fieldElement element to read from
     * @param selectType   what to extract: inner html, form value, text, an attribute,
     *                     or a has-class flag; any other value (including {@code null})
     *                     yields the element's {@code toString()}
     * @param selectVal    attribute name for {@code ATTR}, class name for {@code HAS_CLASS};
     *                     unused by the other types
     * @return the extracted value, or {@code null} when {@code fieldElement} is {@code null}
     */
    public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
        if (fieldElement == null) {
            return null;
        }
        if (XxlCrawlerConf.SelectType.HTML == selectType) {
            return fieldElement.html();
        }
        if (XxlCrawlerConf.SelectType.VAL == selectType) {
            return fieldElement.val();
        }
        if (XxlCrawlerConf.SelectType.TEXT == selectType) {
            return fieldElement.text();
        }
        if (XxlCrawlerConf.SelectType.ATTR == selectType) {
            return fieldElement.attr(selectVal);
        }
        if (XxlCrawlerConf.SelectType.HAS_CLASS == selectType) {
            return String.valueOf(fieldElement.hasClass(selectVal));
        }
        return fieldElement.toString();
    }

    /**
     * Collects the absolute addresses of all hyperlinks on a page
     * (the {@code href} of every {@code a[href]} element).
     * Only values accepted by {@code UrlUtil.isUrl} are kept.
     *
     * @param html page document
     * @return the set of link addresses, or {@code null} when {@code html} is {@code null}
     */
    public static Set<String> findLinks(Document html) {
        if (html == null) {
            return null;
        }
        // Other ways to locate elements, for reference:
        //   html.select(cssQuery)            - CSS selector
        //   html.getElementById(id)          - element id
        //   html.getElementsByClass(name)    - class name
        //   html.getElementsByTag(tagName)   - html tag, e.g. "body"
        Elements hrefElements = html.select("a[href]");
        // extract the data
        Set<String> links = new HashSet<String>();
        if (hrefElements != null) {
            for (Element item : hrefElements) {
                String href = item.attr("abs:href"); // "abs:" prefix resolves against the base URI
                if (UrlUtil.isUrl(href)) {
                    links.add(href);
                }
            }
        }
        return links;
    }

    /**
     * Collects the absolute addresses of all images on a page
     * (the {@code src} of every {@code img} element).
     * Images whose {@code src} resolves to an empty string are skipped.
     *
     * @param html page document
     * @return the set of image addresses, or {@code null} when {@code html} is
     *         {@code null} (same contract as {@link #findLinks})
     */
    public static Set<String> findImages(Document html) {
        if (html == null) {
            return null;
        }
        Elements imgs = html.getElementsByTag("img");
        Set<String> images = new HashSet<String>();
        if (imgs != null) {
            for (Element element : imgs) {
                String imgSrc = element.attr("abs:src");
                // attr() yields "" when src is absent or cannot be made absolute
                if (imgSrc != null && !imgSrc.isEmpty()) {
                    images.add(imgSrc);
                }
            }
        }
        return images;
    }
}