All downloads are free. Search and download functionality uses the official Maven repository.

cn.miw.spider.utils.Client Maven / Gradle / Ivy

There is a newer version: 0.0.2
Show newest version
package cn.miw.spider.utils;

import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.Cookie;

/**
 * Static helpers that download a web page and return it as a Jsoup {@link Document}.
 *
 * <p>Two strategies are offered: a plain Jsoup HTTP GET ({@link #JSoupGetDocument(String)})
 * and an HtmlUnit-backed fetch with JavaScript enabled ({@link #getDocument(String, String)}).
 *
 * <p>The HtmlUnit path keeps the cookies of the most recent fetch in a static field and
 * replays them on the next call. Access to that field is not synchronized.
 */
public class Client {

	/** Timeout passed both to the HtmlUnit connection options and to the background-JavaScript wait. */
	private static final int TIMEOUT = 1000 * 30;

	/** Cookies captured by the latest HtmlUnit fetch; {@code null} until the first fetch has run. */
	private static Set<Cookie> cookies;

	/**
	 * Fetches {@code url} through HtmlUnit, using the URL itself as the base URL / referer.
	 *
	 * @param url address of the page to fetch
	 * @return the fetched page parsed by Jsoup
	 * @throws IOException if the URL is malformed or the download fails
	 */
	public static Document getDocument(String url) throws IOException {
		return getDocument(url, url);
	}

	/**
	 * Fetches {@code url} with a plain Jsoup GET request, accepting any response content type.
	 *
	 * @param url address of the page to fetch
	 * @return the response parsed by Jsoup
	 * @throws IOException if the request fails
	 */
	public static Document JSoupGetDocument(String url) throws IOException {
		return Jsoup.connect(url).ignoreContentType(true).get();
	}

	/**
	 * Fetches {@code url} through an HtmlUnit {@link WebClient} with JavaScript enabled and
	 * returns the resulting page, serialized as XML and re-parsed by Jsoup.
	 *
	 * <p>On the very first call (no cookies stored yet) {@code baseUrl} is loaded first so
	 * that its cookies are available for the real request. On later calls the cookies stored
	 * by the previous call are replayed. After the request, the stored cookies are replaced
	 * by the client's current cookies when that set is non-empty.
	 *
	 * @param url     address of the page to fetch
	 * @param baseUrl value sent as the {@code Referer} header, loaded once to obtain initial
	 *                cookies, and used as the base URI of the returned document
	 * @return the fetched page parsed by Jsoup with {@code baseUrl} as base URI
	 * @throws IOException if {@code url} is malformed or a download fails
	 */
	public static Document getDocument(String url, String baseUrl) throws IOException {
		System.out.println("==========================================================================");
		System.out.println("抓取 链接:" + url);
		System.out.println("==========================================================================");

		WebRequest request = new WebRequest(new URL(url));
		request.setCharset(Charset.forName("UTF-8"));
		request.setAdditionalHeader("Referer", baseUrl);
		request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");

		String html;
		// try-with-resources guarantees the client is closed even when a fetch throws.
		try (WebClient webClient = new WebClient()) {
			// Configure the client; the request above identifies itself as Firefox.
			webClient.getOptions().setJavaScriptEnabled(true);
			webClient.getOptions().setRedirectEnabled(true);
			webClient.getOptions().setCssEnabled(false);
			webClient.getOptions().setActiveXNative(false);
			// Script errors and non-2xx statuses must not abort the crawl.
			webClient.getOptions().setThrowExceptionOnScriptError(false);
			webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
			webClient.getOptions().setTimeout(TIMEOUT);

			if (cookies == null) {
				// First call: visit the base URL so this client picks up its cookies.
				webClient.getPage(baseUrl);
				cookies = webClient.getCookieManager().getCookies();
			} else {
				// Later calls: replay the cookies remembered from the previous fetch.
				for (Cookie cookie : cookies) {
					webClient.getCookieManager().addCookie(cookie);
				}
			}

			HtmlPage rootPage = webClient.getPage(request);

			// Remember the newest cookies, but never overwrite the store with an empty set.
			Set<Cookie> latestCookies = webClient.getCookieManager().getCookies();
			if (latestCookies != null && !latestCookies.isEmpty()) {
				cookies = latestCookies;
			}

			// Let scripts started by the page finish (bounded by TIMEOUT) before serializing.
			webClient.waitForBackgroundJavaScript(TIMEOUT);
			html = rootPage.asXml();
		}
		return Jsoup.parse(html, baseUrl);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy