
// mtons.spider.Spider Maven / Gradle / Ivy
package mtons.spider;
import org.apache.http.HttpStatus;
import org.apache.http.util.Asserts;
import org.apache.log4j.Logger;
import mtons.spider.bucket.Bucket;
import mtons.spider.bucket.DefaultBucket;
import mtons.spider.config.Config;
import mtons.spider.fetcher.Fetcher;
import mtons.spider.http.HttpConnect;
import mtons.spider.http.Request;
import mtons.spider.http.Response;
import mtons.spider.http.Style;
import mtons.spider.http.supports.HttpClientConnect;
import mtons.spider.kit.Counter;
import mtons.spider.kit.Kit;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Main crawler program.
 *
 * <p>A {@code Spider} pulls {@link Request}s from a {@link Bucket}, hands each one to a
 * {@link SpiderExecutor} worker, sends it through the configured {@link HttpConnect} and passes
 * the {@link Response} to the {@link Fetcher} registered under the request's stub. Follow-up
 * requests carried by a response are pushed back into the bucket.
 *
 * <p>An instance runs at most once at a time: {@link #run()} throws
 * {@link IllegalStateException} if the spider is already running.
 *
 * Created by langhsu on 2015/11/3.
 */
public class Spider implements Runnable {
    private static final Logger logger = Logger.getLogger("spider.runnable");

    public static final int STATUS_INIT = 0;
    public static final int STATUS_RUNNING = 1;
    public static final int STATUS_STOPPED = 2;

    /** Upper bound on the number of 5-unit waits performed by {@link #destroy()}. */
    private static final int MAX_DESTROY_RETRIES = 99;

    // Fetchers keyed by their stub. The key is typed as Object because the declared type of
    // Fetcher#getStub() / Request#getStub() is not visible from this file.
    private final Map<Object, Fetcher> fetcherMap = new HashMap<>();
    private final Bucket bucket;       // request cache queue
    private HttpConnect httpConnect;   // HTTP connection management
    private Config config;             // site configuration
    private SpiderExecutor executor;   // spider task executor
    private final Counter counter = new Counter();
    private final ReentrantLock awaitLock = new ReentrantLock();
    private final Condition awaitCondition = awaitLock.newCondition();
    private final int emptySleepTime = 5 * 1000; // max wait (ms) when the bucket is empty
    private final AtomicInteger running = new AtomicInteger(STATUS_INIT);

    /** Creates a spider with a default {@link Config}, a {@link DefaultBucket} and no fetcher. */
    public Spider() {
        this.config = new Config();
        this.bucket = new DefaultBucket();
    }

    /**
     * Creates a spider with a default {@link Config} and one registered fetcher.
     *
     * @param spiderFetcher fetcher to register under its own stub
     */
    public Spider(Fetcher spiderFetcher) {
        this(new Config(), spiderFetcher);
    }

    /**
     * Creates a spider with the given configuration and one registered fetcher.
     *
     * @param config        site configuration
     * @param spiderFetcher fetcher to register under its own stub
     */
    public Spider(Config config, Fetcher spiderFetcher) {
        this.config = config;
        this.fetcherMap.put(spiderFetcher.getStub(), spiderFetcher);
        this.bucket = new DefaultBucket();
    }

    /**
     * Initializes the run: creates a default {@link HttpClientConnect} if no connector was
     * supplied, applies timeout/gzip settings from the config, creates the executor, and pushes
     * the configured seed requests into the bucket. The "no request" check fails when the
     * config holds no requests.
     */
    private void init() {
        if (httpConnect == null) {
            httpConnect = new HttpClientConnect();
        }
        httpConnect.setTimeout(config.getTimeout());
        httpConnect.setUseGzip(config.isUseGzip());
        executor = new SpiderExecutor(config.getThreads());
        Asserts.check(!config.getRequests().isEmpty(), "no request");
        config.getRequests().forEach(bucket::push);
    }

    /**
     * Adds a seed request to the configuration unless it is already present. The request is
     * pushed into the cache queue when the spider is initialized.
     *
     * @param request request to add
     * @return this spider, for chaining
     */
    public Spider addRequest(Request request) {
        if (!config.getRequests().contains(request)) {
            config.getRequests().add(request);
        }
        return this;
    }

    /**
     * Runs the crawl loop until the thread is interrupted, the bucket is empty with no live
     * worker, or the configured page limit ({@code config.getDegree()}) is reached. The status
     * is always set to {@link #STATUS_STOPPED} and resources are released on exit, including
     * when initialization or the loop fails.
     *
     * @throws IllegalStateException if the spider is already running
     */
    @Override
    public void run() {
        isRunning();
        try {
            init();
            logger.info("start spider .......");
            while (!Thread.currentThread().isInterrupted() && running.get() == STATUS_RUNNING) {
                final Request request = bucket.poll();
                if (request == null) {
                    logger.info("no request, threadAlive = " + executor.getThreadAlive());
                    // Exit once all tasks have finished.
                    if (executor.getThreadAlive() == 0) {
                        break;
                    }
                    // Wait for a worker to finish (it may have queued more requests).
                    awaitNext();
                    continue;
                }
                executor.execute(() -> process(request));
                // Limit the maximum number of crawled pages.
                if (config.getDegree() > 0 && counter.getPaginationCount() >= config.getDegree()) {
                    break;
                }
            }
        } finally {
            running.set(STATUS_STOPPED);
            // release some resources
            destroy();
            logger.info("spider report, count >>> " + counter.stop());
        }
    }

    /**
     * Worker-side handling of one request: performs it, updates the counters, schedules a retry
     * on failure, and always wakes the dispatching thread afterwards.
     *
     * @param todo request to process
     */
    private void process(Request todo) {
        try {
            forgeRequest(todo);
            if (todo.getStyle() == Style.DETAIL) {
                counter.plus();
            } else {
                counter.paginationPlus();
            }
        } catch (Exception e) {
            // The logger call already records the stack trace.
            logger.error(todo.getUrl() + " - " + e.getMessage(), e);
            isRetry(todo);
        } finally {
            wake();
        }
    }

    /**
     * Registers a fetcher under its own stub, replacing any fetcher already stored for it.
     *
     * @param fetcher fetcher to register
     */
    public void addFetcher(Fetcher fetcher) {
        this.fetcherMap.put(fetcher.getStub(), fetcher);
    }

    /**
     * Sets the HTTP connector; when none is set, {@link #run()} creates an
     * {@link HttpClientConnect}.
     *
     * @param httpConnect connector to use
     */
    public void setHttpConnect(HttpConnect httpConnect) {
        this.httpConnect = httpConnect;
    }

    /**
     * Replaces the site configuration.
     *
     * @param config configuration to use
     */
    public void setConfig(Config config) {
        this.config = config;
    }

    /**
     * Sends the request, dispatches the response to the fetcher registered for the request's
     * stub, sleeps for the configured interval, then queues the response's follow-up requests.
     * A missing response or a status other than 200 is logged and nothing further is done.
     *
     * @param request request to perform
     * @throws IOException if sending the request fails
     */
    private void forgeRequest(Request request) throws IOException {
        Response response = httpConnect.send(request);
        // Must be "||": with "&&" a null response was dereferenced and non-200 responses
        // were passed on to the fetcher.
        if (response == null || response.getStatusCode() != HttpStatus.SC_OK) {
            logger.error("request failure > " + request.getUrl());
            return;
        }
        Fetcher fetcher = fetcherMap.get(request.getStub());
        if (fetcher != null) {
            fetcher.accept(response);
        } else {
            logger.error("fetch failure > did not find the fetcher[" + request.getStub() + "]");
        }
        try {
            Thread.sleep(config.getInterval());
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the worker thread instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        mountMoreTask(response);
    }

    /**
     * Queues the follow-up requests carried by a response, skipping those with an empty URL.
     *
     * @param response response whose next requests are queued
     */
    private void mountMoreTask(Response response) {
        for (Request next : response.getNextRequests()) {
            if (Kit.isEmptyUrl(next.getUrl())) {
                continue;
            }
            mountRequest(next);
        }
    }

    /**
     * Pushes a request into the cache queue unless the queue already contains it.
     *
     * @param request request to queue
     */
    private void mountRequest(Request request) {
        if (!bucket.contains(request)) {
            bucket.push(request);
        }
    }

    /**
     * Re-queues a failed LIST-style request while its retry count is below
     * {@code config.getMaxRetries()}; other requests are not retried.
     *
     * @param request the failed request
     */
    private void isRetry(Request request) {
        if (request.getStyle() != Style.LIST) {
            return;
        }
        if (request.getRetry() < config.getMaxRetries()) {
            request.plusRetry();
            mountRequest(request);
            logger.info(request.getUrl() + " retry, times - " + request.getRetry());
        }
    }

    /**
     * Blocks the calling thread until {@link #wake()} signals or {@code emptySleepTime}
     * milliseconds elapse. An interrupt is logged and the interrupt flag is restored so the
     * loop in {@link #run()} can observe it.
     */
    private void awaitNext() {
        awaitLock.lock();
        try {
            awaitCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            logger.warn("await next - interrupted", e);
            Thread.currentThread().interrupt();
        } finally {
            awaitLock.unlock();
        }
    }

    /**
     * Signals every thread waiting in {@link #awaitNext()}.
     */
    private void wake() {
        // Acquire outside the try so unlock() only runs when the lock is actually held.
        awaitLock.lock();
        try {
            awaitCondition.signalAll();
        } finally {
            awaitLock.unlock();
        }
    }

    /**
     * Atomically switches the status to {@link #STATUS_RUNNING}.
     *
     * @throws IllegalStateException if the spider is already running
     */
    private void isRunning() {
        while (true) {
            int statNow = running.get();
            if (statNow == STATUS_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
            if (running.compareAndSet(statNow, STATUS_RUNNING)) {
                break;
            }
        }
    }

    /**
     * Shuts down the executor, waits for its tasks to end (giving up once no worker is alive
     * or after more than {@value #MAX_DESTROY_RETRIES} waits), then destroys the HTTP
     * connector. Components that were never created are skipped, so calling this before the
     * spider has run is harmless.
     */
    public void destroy() {
        // Stop the task executor.
        if (executor != null) {
            executor.shutdown();
            int retry = 0;
            try {
                // Wait for the worker threads to end.
                while (!executor.awaitTermination(5)) {
                    logger.info("has threadAlive - " + executor.getThreadAlive() + ", do await - " + retry);
                    if (executor.getThreadAlive() == 0) {
                        break;
                    }
                    if (retry > MAX_DESTROY_RETRIES) {
                        logger.info("retry > " + MAX_DESTROY_RETRIES + ", do forced destroy");
                        break;
                    }
                    retry++;
                }
            } catch (InterruptedException e) {
                logger.error("destroy error", e);
                Thread.currentThread().interrupt();
            }
        }
        // Destroy the HTTP connection pool.
        if (httpConnect != null) {
            httpConnect.destroy();
        }
        logger.info("shutdown");
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy