cn.wanghaomiao.seimi.core.SeimiProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SeimiCrawler Show documentation
Show all versions of SeimiCrawler Show documentation
一个支持分布式的可以高效开发且可以高效运行的爬虫框架。设计思想上融合了spring与scrapy的优点。
package cn.wanghaomiao.seimi.core;
import cn.wanghaomiao.seimi.annotation.Interceptor;
import cn.wanghaomiao.seimi.http.HttpClientFactory;
import cn.wanghaomiao.seimi.http.HttpMethod;
import cn.wanghaomiao.seimi.struct.BodyType;
import cn.wanghaomiao.seimi.struct.CrawlerModel;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.Method;
import java.util.List;
import java.util.Map;
/**
 * Worker loop for a single crawler: takes requests from the crawler's queue, executes them over
 * HTTP, wraps the result in a {@link Response} and hands it to the callback method named by the
 * request, surrounded by any matching {@link SeimiInterceptor}s.
 *
 * <p>The loop ends only when a request whose {@code isStop()} flag is set is taken from the queue.
 * Any exception raised while handling a request is logged and the loop continues with the next one.
 *
 * @author 汪浩淼 [[email protected]]
 * @since 2015/8/21.
 */
public class SeimiProcessor implements Runnable {
    private final SeimiQueue queue;
    private final List<SeimiInterceptor> interceptors;
    private final CrawlerModel crawlerModel;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Creates a processor bound to one crawler.
     *
     * @param interceptors interceptors to consider around every callback invocation
     * @param crawlerModel the crawler this processor serves; must not be {@code null} because its
     *                     queue is resolved immediately
     */
    public SeimiProcessor(List<SeimiInterceptor> interceptors, CrawlerModel crawlerModel) {
        this.queue = crawlerModel.getQueueInstance();
        this.interceptors = interceptors;
        this.crawlerModel = crawlerModel;
    }

    /**
     * Processes queued requests until a stop request is received.
     */
    @Override
    public void run() {
        while (true) {
            try {
                Request request = queue.bPop(crawlerModel.getCrawlerName());
                if (request == null) {
                    continue;
                }
                if (request.isStop()) {
                    logger.info("SeimiProcessor[{}] will stop!", Thread.currentThread().getName());
                    break;
                }
                HttpResponse httpResponse = fetch(request);
                Response seimiResponse = renderResponse(httpResponse, request);
                Method requestCallback = crawlerModel.getMemberMethods().get(request.getCallBack());
                if (requestCallback == null) {
                    // Previously the response was dropped without a trace; make the misconfiguration visible.
                    logger.warn("Crawler[{}] has no callback method named '{}' ,url={}",
                            crawlerModel.getCrawlerName(), request.getCallBack(), request.getUrl());
                    continue;
                }
                dispatch(requestCallback, seimiResponse);
                logger.debug("Crawler[{}] ,url={} ,responseStatus={}", crawlerModel.getCrawlerName(), request.getUrl(), httpResponse.getStatusLine().getStatusCode());
            } catch (Exception e) {
                // Keep the worker alive: one failing request must not terminate the loop.
                logger.error(e.getMessage(), e);
            }
        }
    }

    /**
     * Builds the HTTP request described by {@code request} and executes it.
     *
     * @param request the crawl request to execute
     * @return the raw HTTP response
     * @throws Exception if the HTTP client fails to execute the request
     */
    private HttpResponse fetch(Request request) throws Exception {
        HttpClient hc;
        if (crawlerModel.isUseCookie()) {
            hc = HttpClientFactory.getHttpClient(10000, crawlerModel.getInstance().getCookieStore());
        } else {
            hc = HttpClientFactory.getHttpClient();
        }
        RequestConfig config = RequestConfig.custom().setProxy(crawlerModel.getProxy()).build();
        RequestBuilder requestBuilder;
        if (HttpMethod.POST.equals(request.getHttpMethod())) {
            requestBuilder = RequestBuilder.post().setUri(request.getUrl());
        } else {
            // Every method other than POST is sent as GET.
            requestBuilder = RequestBuilder.get().setUri(request.getUrl());
        }
        if (request.getParams() != null) {
            for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
                requestBuilder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        requestBuilder.setConfig(config);
        return hc.execute(requestBuilder.build());
    }

    /**
     * Invokes the callback on the crawler instance, calling {@code before} on every matching
     * interceptor first and {@code after} on every matching interceptor afterwards.
     *
     * @param requestCallback the crawler method to invoke
     * @param seimiResponse   the response passed to the interceptors and the callback
     * @throws Exception if an interceptor or the reflective invocation fails
     */
    private void dispatch(Method requestCallback, Response seimiResponse) throws Exception {
        for (SeimiInterceptor interceptor : interceptors) {
            if (appliesTo(interceptor, requestCallback)) {
                interceptor.before(requestCallback, seimiResponse);
            }
        }
        requestCallback.invoke(crawlerModel.getInstance(), seimiResponse);
        for (SeimiInterceptor interceptor : interceptors) {
            if (appliesTo(interceptor, requestCallback)) {
                interceptor.after(requestCallback, seimiResponse);
            }
        }
    }

    /**
     * Decides whether an interceptor should wrap the given callback: either its
     * {@link Interceptor} annotation requests every method, or its target annotation is present
     * on the callback method or on the crawler class.
     *
     * @param interceptor     the candidate interceptor
     * @param requestCallback the callback about to be invoked
     * @return {@code true} if the interceptor applies
     */
    private boolean appliesTo(SeimiInterceptor interceptor, Method requestCallback) {
        Interceptor interAnno = interceptor.getClass().getAnnotation(Interceptor.class);
        // getAnnotation returns null when the class carries no @Interceptor; fall back to the target checks.
        if (interAnno != null && interAnno.everyMethod()) {
            return true;
        }
        return requestCallback.isAnnotationPresent(interceptor.getTargetAnnotationClass())
                || crawlerModel.getClazz().isAnnotationPresent(interceptor.getTargetAnnotationClass());
    }

    /**
     * Converts a raw HTTP response into a {@link Response}.
     *
     * <p>When the response has an entity, its body is read eagerly: a Content-Type containing
     * {@code "image"} is stored as binary data (with the last URL path segment as content); anything
     * else, including a missing Content-Type, is read as text. A failure while reading the body is
     * logged and leaves the corresponding fields unset.
     *
     * @param httpResponse the raw HTTP response
     * @param request      the request that produced it
     * @return the populated response; never {@code null}
     */
    private Response renderResponse(HttpResponse httpResponse, Request request) {
        Response seimiResponse = new Response();
        HttpEntity entity = httpResponse.getEntity();
        seimiResponse.setHttpResponse(httpResponse);
        seimiResponse.setReponseEntity(entity);
        seimiResponse.setUrl(request.getUrl());
        seimiResponse.setRequest(request);
        if (entity == null) {
            return seimiResponse;
        }
        // NOTE(review): Referer is normally a request header; confirm that a response is expected to carry it.
        Header referer = httpResponse.getFirstHeader("Referer");
        if (referer != null) {
            seimiResponse.setReferer(referer.getValue());
        }
        // The Content-Type header may be absent, in which case getContentType() returns null.
        Header contentType = entity.getContentType();
        String contentTypeValue = contentType == null ? null : contentType.getValue();
        boolean image = contentTypeValue != null && contentTypeValue.contains("image");
        if (!image) {
            seimiResponse.setBodyType(BodyType.TEXT);
            try {
                seimiResponse.setContent(EntityUtils.toString(entity));
                // NOTE(review): getBytes() uses the platform default charset, so these bytes may
                // differ from the bytes received on the wire.
                seimiResponse.setData(seimiResponse.getContent().getBytes());
            } catch (Exception e) {
                logger.error("no content data", e);
            }
        } else {
            seimiResponse.setBodyType(BodyType.BINARY);
            try {
                seimiResponse.setData(EntityUtils.toByteArray(entity));
                seimiResponse.setContent(StringUtils.substringAfterLast(request.getUrl(), "/"));
            } catch (Exception e) {
                logger.error("no data can be read from httpResponse", e);
            }
        }
        return seimiResponse;
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy