
package cn.tyoui.httpclient;

import cn.tyoui.pojo.ProxyIP;
import cn.tyoui.pojo.StatusCode;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Web page crawler.
 *
 * @author Tyoui
 */
public class HttpCrawler {

    private CloseableHttpClient httpClient = HttpClients.createDefault();

    private List<ProxyIP> list = null;

    // directory where crawled pages are saved
    private String dir = null;

    /**
     * Initialize the proxy list.
     *
     * @param proxyText path to a plain-text proxy list, one "ip:port" entry per line
     * @throws Exception if the proxy file cannot be read or parsed
     */
    public void proxyInit(String proxyText) throws Exception {
        list = new ArrayList<>();
        List<String> listIP = FileUtils.readLines(new File(proxyText), "UTF-8");
        for (String str : listIP) {
            String ip = str.split(":")[0];
            int port = Integer.parseInt(str.split(":")[1]);
            ProxyIP proxyIp = new ProxyIP(ip, port);
            list.add(proxyIp);
        }
    }

    /**
     * Start crawling a single page.
     *
     * @param webURL URL of the page to crawl
     * @param min    minimum delay after the request, in milliseconds
     * @param max    maximum delay after the request, in milliseconds
     * @throws Exception if the crawl fails
     */
    public void startCrawler(String webURL, int min, int max) throws Exception {
        String path = dir + File.separator + webURL.substring(webURL.lastIndexOf("/") + 1) + ".html";
        File file = new File(path);
        if (file.exists() && file.length() > 20000)
            return;
        if (getList() == null || getList().isEmpty()) {
            crawler(webURL, path, null, 0);
        } else {
            int index = new Random().nextInt(list.size());
            crawler(webURL, path, list.get(index), index);
        }
        // sleep for a random interval between min and max milliseconds
        Thread.sleep(new Random().nextInt(max - min + 1) + min);
    }

    /**
     * Fetch a single page and write it to disk.
     *
     * @param url   URL of the page to fetch
     * @param path  file path to save the response to
     * @param proxy proxy to use, or null for a direct connection
     * @param index index of the proxy in the proxy list
     * @throws IOException if closing the response fails
     */
    private void crawler(String url, String path, ProxyIP proxy, int index) throws IOException {
        CloseableHttpResponse response = null;
        HttpGet httpGet = null;
        try {
            httpGet = new HttpGet(url);
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpGet.setHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
            RequestConfig requestConfig;
            if (proxy == null) {
                requestConfig = RequestConfig.custom().setConnectTimeout(2000).setSocketTimeout(1000).build();
            } else {
                HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort());
                requestConfig = RequestConfig.custom().setProxy(httpHost).setConnectTimeout(2000).setSocketTimeout(1000).build();
            }
            httpGet.setConfig(requestConfig);
            response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            if ((status >= 200 && status < 300) || status == 404) {
                HttpEntity entity = response.getEntity();
                try (FileOutputStream out = new FileOutputStream(path)) {
                    entity.writeTo(out);
                }
                System.out.println("Download succeeded! " + url);
            } else {
                // drop the failing proxy so it is not picked again
                if (proxy != null && list != null)
                    list.remove(index);
                throw new Exception(StatusCode.getStatus(status));
            }
        } catch (Exception e) {
            System.err.println(e + "\t" + url);
        } finally {
            if (httpGet != null)
                httpGet.releaseConnection();
            if (response != null)
                response.close();
        }
    }

    /**
     * Set the directory where crawled pages are saved, creating it if it does not exist.
     *
     * @param dir target directory
     */
    public void setDir(String dir) {
        this.dir = dir;
        File file = new File(dir);
        if (!file.exists())
            file.mkdirs();
    }

    /**
     * Close the underlying HTTP client.
     */
    public void close() {
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    /**
     * Get the proxy IP list.
     *
     * @return proxy IP list
     */
    public List<ProxyIP> getList() {
        return list;
    }


    /**
     * Set the proxy IP list.
     *
     * @param list proxy IP list
     */
    public void setList(List<ProxyIP> list) {
        this.list = list;
    }
}
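
A minimal usage sketch follows. The demo class name, output directory, proxy file path, URL, and delay values are hypothetical placeholders, not part of the original source; the proxy file is assumed to hold one "ip:port" entry per line.

package cn.tyoui.httpclient;

/**
 * Hypothetical demo showing how HttpCrawler is typically wired together.
 */
public class HttpCrawlerDemo {

    public static void main(String[] args) throws Exception {
        HttpCrawler crawler = new HttpCrawler();
        crawler.setDir("/tmp/pages");            // hypothetical directory for the saved HTML files
        crawler.proxyInit("/tmp/proxies.txt");   // hypothetical proxy list, one "ip:port" per line (optional)
        crawler.startCrawler("http://example.com/page-1", 500, 2000); // sleep 500-2000 ms after the request
        crawler.close();
    }
}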




