package cn.tyoui.httpclient;
import cn.tyoui.pojo.ProxyIP;
import cn.tyoui.pojo.StatusCode;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
* Web page crawler.
*
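* <p>A minimal usage sketch (illustrative only; the paths and URL below are
* placeholders, and the proxy file is expected to hold one {@code ip:port}
* entry per line):</p>
* <pre>{@code
* HttpCrawler crawler = new HttpCrawler();
* crawler.setDir("/tmp/pages");            // directory the pages are written to
* crawler.proxyInit("/tmp/proxies.txt");   // optional: load proxies, one ip:port per line
* crawler.startCrawler("http://example.com/article/42", 500, 2000); // sleep 500-2000 ms
* crawler.close();
* }</pre>
*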
* @author Tyoui
*/
public class HttpCrawler {
private CloseableHttpClient httpClient = HttpClients.createDefault();
private List<ProxyIP> list = null;
//directory in which crawled pages are saved
private String dir = null;
/**
* Initialize the proxy list from a text file.
*
* @param proxyText path of a text file holding one proxy per line in ip:port form
* @throws Exception if the proxy file cannot be read or parsed
*/
public void proxyInit(String proxyText) throws Exception {
list = new ArrayList<>();
List<String> listIP = FileUtils.readLines(new File(proxyText), "UTF-8");
for (String str : listIP) {
String ip = str.split(":")[0];
int port = Integer.parseInt(str.split(":")[1]);
ProxyIP proxyIp = new ProxyIP(ip, port);
list.add(proxyIp);
}
}
/**
* Start crawling a single URL and save the page under the output directory.
*
* @param webURL URL to crawl
* @param min    shortest delay between requests, in milliseconds
* @param max    longest delay between requests, in milliseconds
* @throws Exception if the crawl fails
*/
public void startCrawler(String webURL, int min, int max) throws Exception {
String path = dir + File.separator + webURL.substring(webURL.lastIndexOf("/") + 1) + ".html";
File file = new File(path);
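//skip pages already downloaded that look complete (heuristic: file larger than 20,000 bytes)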
if (file.exists() && file.length() > 20000)
return;
if (getList() == null || getList().isEmpty()) {
crawler(webURL, path, null, 0);
} else {
int index = new Random().nextInt(list.size());
crawler(webURL, path, list.get(index), index);
}
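//polite crawling: pause a random interval between min and max milliseconds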
Thread.sleep(new Random().nextInt(max - min + 1) + min);
}
/**
* Fetch a single URL and write the response body to disk.
*
* @param url   URL to fetch
* @param path  file path to save the page to
* @param proxy proxy to route the request through, or null for a direct connection
* @param index position of the proxy in the list (removed on failure)
* @throws IOException if closing the response fails
*/
private void crawler(String url, String path, ProxyIP proxy, int index) throws IOException {
CloseableHttpResponse response = null;
HttpGet httpGet = null;
try {
httpGet = new HttpGet(url);
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding", "gzip,deflate,sdch");
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
RequestConfig requestConfig;
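//use short connect/read timeouts; route through the selected proxy when one is provided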
if (proxy == null) {
requestConfig = RequestConfig.custom().setConnectTimeout(2000).setSocketTimeout(1000).build();
} else {
HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort());
requestConfig = RequestConfig.custom().setProxy(httpHost).setConnectTimeout(2000).setSocketTimeout(1000).build();
}
httpGet.setConfig(requestConfig);
response = httpClient.execute(httpGet);
int status = response.getStatusLine().getStatusCode();
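//2xx responses (and 404 pages) are written to disk; any other status drops the current proxy and is reported as an error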
if ((status >= 200 && status < 300) || status == 404) {
HttpEntity entity = response.getEntity();
//try-with-resources so the file stream is always closed
try (FileOutputStream out = new FileOutputStream(path)) {
entity.writeTo(out);
}
System.out.println("Download succeeded: " + url);
} else {
if (list != null)
list.remove(index);
throw new Exception(StatusCode.getStatus(status));
}
} catch (Exception e) {
System.err.println(e + "\t" + url);
} finally {
if (httpGet != null)
httpGet.releaseConnection();
if (response != null)
response.close();
}
}
/**
* Set the directory in which crawled pages are saved (created if it does not exist).
*
* @param dir output directory
*/
public void setDir(String dir) {
this.dir = dir;
File file = new File(dir);
if (!file.exists())
file.mkdirs();
}
/**
* Close the underlying HTTP client.
*/
public void close() {
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Get the proxy IP list.
*
* @return the proxy IP list
*/
public List<ProxyIP> getList() {
return list;
}
/**
* Set the proxy IP list.
*
* @param list proxy IP list
*/
public void setList(List<ProxyIP> list) {
this.list = list;
}
}