All downloads are free. The search and download functionality uses the official Maven repository.

mtons.spider.Spider Maven / Gradle / Ivy

package mtons.spider;

import org.apache.http.HttpStatus;
import org.apache.http.util.Asserts;
import org.apache.log4j.Logger;
import mtons.spider.bucket.Bucket;
import mtons.spider.bucket.DefaultBucket;
import mtons.spider.config.Config;
import mtons.spider.fetcher.Fetcher;
import mtons.spider.http.HttpConnect;
import mtons.spider.http.Request;
import mtons.spider.http.Response;
import mtons.spider.http.Style;
import mtons.spider.http.supports.HttpClientConnect;
import mtons.spider.kit.Counter;
import mtons.spider.kit.Kit;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Main crawler program.
 *
 * <p>A {@code Spider} seeds its request queue ({@link Bucket}) from the
 * requests held by its {@link Config}, then repeatedly polls the queue and
 * hands each request to a {@link SpiderExecutor}. A worker sends the request
 * through the {@link HttpConnect}, passes the response to the {@link Fetcher}
 * registered under the request's stub, and queues any follow-up requests
 * carried by the response.
 *
 * <p>The crawl loop ends when the queue is empty and no worker is alive, when
 * the configured page limit is reached, or when the running thread is
 * interrupted.
 *
 * Created by langhsu on 2015/11/3.
 */
public class Spider implements Runnable {
    private final Logger logger = Logger.getLogger("spider.runnable");

    public static final int STATUS_INIT = 0;
    public static final int STATUS_RUNNING = 1;
    public static final int STATUS_STOPPED = 2;

    /** Upper bound on the retry counter used while waiting for workers in {@link #destroy()}. */
    private static final int MAX_DESTROY_WAITS = 99;

    /**
     * Registered fetchers, keyed by the value returned from {@code Fetcher.getStub()}.
     * The key is typed as {@code Object} because the stub type is not declared in this file.
     */
    private final Map<Object, Fetcher> fetcherMap = new HashMap<>();

    private final Bucket bucket; // request queue
    private HttpConnect httpConnect; // HTTP connection manager

    private Config config; // site configuration

    private SpiderExecutor executor; // task executor, created in init()

    private final Counter counter = new Counter();
    private final ReentrantLock awaitLock = new ReentrantLock();
    private final Condition awaitCondition = awaitLock.newCondition();
    private final int emptySleepTime = 5 * 1000; // max wait (ms) when the queue is empty
    private final AtomicInteger running = new AtomicInteger(STATUS_INIT);

    /**
     * Creates a spider with a fresh {@link Config} and no fetcher registered.
     */
    public Spider() {
        this.config = new Config();
        this.bucket = new DefaultBucket();
    }

    /**
     * Creates a spider with a fresh {@link Config} and one registered fetcher.
     *
     * @param spiderFetcher fetcher to register under its own stub
     */
    public Spider(Fetcher spiderFetcher) {
        this(new Config(), spiderFetcher);
    }

    /**
     * Creates a spider with the given configuration and one registered fetcher.
     *
     * @param config        site configuration
     * @param spiderFetcher fetcher to register under its own stub
     */
    public Spider(Config config, Fetcher spiderFetcher) {
        this.config = config;
        this.fetcherMap.put(spiderFetcher.getStub(), spiderFetcher);
        this.bucket = new DefaultBucket();
    }

    /**
     * Initializes the HTTP connection, the executor and the request queue.
     *
     * <p>Falls back to {@link HttpClientConnect} when no {@link HttpConnect}
     * was supplied, applies the timeout and gzip settings from the config,
     * and pushes every configured request into the queue.
     *
     * @throws IllegalStateException if the config holds no request
     *         (raised by {@code Asserts.check})
     */
    private void init() {
        if (httpConnect == null) {
            httpConnect = new HttpClientConnect();
        }

        httpConnect.setTimeout(config.getTimeout());
        httpConnect.setUseGzip(config.isUseGzip());

        executor = new SpiderExecutor(config.getThreads());

        Asserts.check(!config.getRequests().isEmpty(), "no request");

        config.getRequests().forEach(bucket::push);
    }

    /**
     * Adds a seed request to the configuration unless it is already present.
     * Seed requests are copied into the queue when the spider starts.
     *
     * @param request request to add
     * @return this spider, for chaining
     */
    public Spider addRequest(Request request) {
        if (!config.getRequests().contains(request)) {
            config.getRequests().add(request);
        }
        return this;
    }

    /**
     * Runs the crawl loop until the queue drains, the page limit is hit, or
     * the current thread is interrupted, then releases resources.
     *
     * @throws IllegalStateException if this spider is already running, or if
     *         no request has been configured
     */
    @Override
    public void run() {
        isRunning();

        try {
            init();

            logger.info("start spider .......");
            while (!Thread.currentThread().isInterrupted() && running.get() == STATUS_RUNNING) {
                final Request request = bucket.poll();

                if (request == null) {
                    logger.info("no request, threadAlive = " + executor.getThreadAlive());
                    // Exit once every submitted task has finished.
                    // NOTE(review): a worker could push a request and finish between the
                    // poll above and this check — confirm against SpiderExecutor/Bucket.
                    if (executor.getThreadAlive() == 0) {
                        break;
                    }
                    // Wait for a worker to signal, or for the timeout.
                    awaitNext();
                    continue;
                }

                executor.execute(() -> process(request));

                // Limit the maximum number of pagination pages crawled.
                if (config.getDegree() > 0 && counter.getPaginationCount() >= config.getDegree()) {
                    break;
                }
            }
        } finally {
            // Always leave the RUNNING state and release resources, even when
            // init() or the loop fails, so the spider is not stuck as "running".
            running.set(STATUS_STOPPED);

            destroy();

            logger.info("spider report, count >>> " + counter.stop());
        }
    }

    /**
     * Worker-side handling of a single request: performs it, updates the
     * counters, schedules a retry on failure, and finally wakes the crawl loop.
     *
     * @param request request to process
     */
    private void process(Request request) {
        try {
            forgeRequest(request);

            if (request.getStyle() == Style.DETAIL) {
                counter.plus();
            } else {
                counter.paginationPlus();
            }
        } catch (Exception e) {
            logger.error(request.getUrl() + " - " + e.getMessage(), e);
            isRetry(request);
        } finally {
            wake();
        }
    }

    /**
     * Registers a fetcher under its own stub, replacing any fetcher already
     * registered under the same stub.
     *
     * @param fetcher fetcher to register
     */
    public void addFetcher(Fetcher fetcher) {
        this.fetcherMap.put(fetcher.getStub(), fetcher);
    }

    /**
     * Sets the HTTP connection manager used to send requests.
     *
     * @param httpConnect connection manager
     */
    public void setHttpConnect(HttpConnect httpConnect) {
        this.httpConnect = httpConnect;
    }

    /**
     * Replaces the site configuration.
     *
     * @param config new configuration
     */
    public void setConfig(Config config) {
        this.config = config;
    }

    /**
     * Sends the request, dispatches the response to the matching fetcher,
     * sleeps for the configured interval, then queues follow-up requests.
     *
     * <p>A missing response or a status other than 200 is logged and the
     * request is dropped without being dispatched.
     *
     * @param request request to perform
     * @throws IOException if sending the request fails
     */
    private void forgeRequest(Request request) throws IOException {
        Response response = httpConnect.send(request);

        // Must be "||": with "&&" a null response was dereferenced and a
        // non-OK response was never rejected.
        if (response == null || response.getStatusCode() != HttpStatus.SC_OK) {
            logger.error("request failure > " + request.getUrl());
            return;
        }

        Fetcher fetcher = fetcherMap.get(request.getStub());
        if (fetcher != null) {
            fetcher.accept(response);
        } else {
            logger.error("fetch failure > did not find the fetcher[" + request.getStub() + "]");
        }

        try {
            Thread.sleep(config.getInterval());
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the executor instead of swallowing it.
            Thread.currentThread().interrupt();
        }

        mountMoreTask(response);
    }

    /**
     * Queues the follow-up requests carried by a response, skipping those
     * whose URL is considered empty by {@code Kit.isEmptyUrl}.
     *
     * @param response response whose next requests should be queued
     */
    private void mountMoreTask(Response response) {
        if (response.getNextRequests() == null) {
            return;
        }
        for (Request request : response.getNextRequests()) {
            if (Kit.isEmptyUrl(request.getUrl())) {
                continue;
            }
            mountRequest(request);
        }
    }

    /**
     * Pushes a request into the queue unless the queue already contains it.
     *
     * @param request request to queue
     */
    private void mountRequest(Request request) {
        if (!bucket.contains(request)) {
            bucket.push(request);
        }
    }

    /**
     * Re-queues a failed LIST request while its retry count is below the
     * configured maximum. Requests of any other style are not retried.
     *
     * @param request request that failed
     */
    private void isRetry(Request request) {
        if (request.getStyle() != Style.LIST) {
            return;
        }
        if (request.getRetry() < config.getMaxRetries()) {
            request.plusRetry();
            mountRequest(request);
            logger.info(request.getUrl() + " retry, times - " + request.getRetry());
        }
    }

    /**
     * Blocks the crawl loop until a worker signals or {@code emptySleepTime}
     * milliseconds pass. If interrupted, the interrupt flag is restored so the
     * loop condition in {@link #run()} can observe it.
     */
    private void awaitNext() {
        awaitLock.lock();
        try {
            awaitCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            // log4j 1.x has no "{}" placeholders; pass the throwable directly.
            logger.warn("await next - interrupted", e);
            Thread.currentThread().interrupt();
        } finally {
            awaitLock.unlock();
        }
    }

    /**
     * Wakes every thread waiting in {@link #awaitNext()}.
     */
    private void wake() {
        // Acquire outside the try so unlock() only runs when the lock is held.
        awaitLock.lock();
        try {
            awaitCondition.signalAll();
        } finally {
            awaitLock.unlock();
        }
    }

    /**
     * Atomically moves the spider into {@link #STATUS_RUNNING}.
     *
     * @throws IllegalStateException if the spider is already running
     */
    private void isRunning() {
        while (true) {
            int statNow = running.get();
            if (statNow == STATUS_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
            if (running.compareAndSet(statNow, STATUS_RUNNING)) {
                break;
            }
        }
    }

    /**
     * Shuts down the executor, waits for its tasks to finish (bounded by a
     * retry limit), then destroys the HTTP connection. Safe to call before the
     * spider has been started: parts that were never created are skipped.
     */
    public void destroy() {
        // Stop the task executor.
        if (executor != null) {
            executor.shutdown();
            awaitWorkers();
        }

        // Destroy the HTTP connection pool.
        if (httpConnect != null) {
            httpConnect.destroy();
        }

        logger.info("shutdown");
    }

    /**
     * Waits for the executor to terminate, giving up once the retry counter
     * exceeds {@link #MAX_DESTROY_WAITS} or the waiting thread is interrupted.
     */
    private void awaitWorkers() {
        int retry = 0;
        try {
            while (!executor.awaitTermination(5)) {
                logger.info("has threadAlive - " + executor.getThreadAlive() + ", do await - " + retry);

                if (executor.getThreadAlive() == 0) {
                    break;
                }
                if (retry > MAX_DESTROY_WAITS) {
                    logger.info("retry > " + MAX_DESTROY_WAITS + ", do forced destroy");
                    break;
                }
                retry++;
            }
        } catch (InterruptedException e) {
            logger.error("destroy error", e);
            Thread.currentThread().interrupt();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy