All downloads are free. The search and download features use the official Maven repository.

cn.wanghaomiao.seimi.core.SeimiProcessor Maven / Gradle / Ivy

Go to download

一个支持分布式的可以高效开发且可以高效运行的爬虫框架。设计思想上融合了spring与scrapy的优点。

There is a newer version: 2.1.4
Show newest version
package cn.wanghaomiao.seimi.core;

import cn.wanghaomiao.seimi.annotation.Interceptor;
import cn.wanghaomiao.seimi.http.HttpClientFactory;
import cn.wanghaomiao.seimi.http.HttpMethod;
import cn.wanghaomiao.seimi.struct.BodyType;
import cn.wanghaomiao.seimi.struct.CrawlerModel;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Method;
import java.util.List;
import java.util.Map;

/**
 * Worker that repeatedly takes {@link Request}s for one crawler off its queue, performs the
 * HTTP call, wraps the result in a {@link Response} and hands it to the crawler's callback
 * method, running the matching interceptors before and after the callback.
 *
 * <p>The loop ends only when a request flagged with {@code isStop()} is received; any
 * exception raised while handling a request is logged and the loop moves on to the next one.
 *
 * @author 汪浩淼 [[email protected]]
 * @since 2015/8/21.
 */
public class SeimiProcessor implements Runnable {
    private final SeimiQueue queue;
    private final List<SeimiInterceptor> interceptors;
    private final CrawlerModel crawlerModel;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * @param interceptors interceptors to consider around every callback invocation
     * @param crawlerModel description of the crawler this processor serves; must not be
     *                     {@code null} because its queue instance is read here
     */
    public SeimiProcessor(List<SeimiInterceptor> interceptors, CrawlerModel crawlerModel) {
        this.queue = crawlerModel.getQueueInstance();
        this.interceptors = interceptors;
        this.crawlerModel = crawlerModel;
    }

    /**
     * Pops requests from the queue until a stop request arrives. A {@code null} result from
     * the queue is ignored and the queue is polled again.
     */
    @Override
    public void run() {
        while (true) {
            try {
                Request request = queue.bPop(crawlerModel.getCrawlerName());
                if (request == null) {
                    continue;
                }
                if (request.isStop()) {
                    logger.info("SeimiProcessor[{}] will stop!", Thread.currentThread().getName());
                    break;
                }
                process(request);
            } catch (Exception e) {
                // Boundary of the worker loop: one failing request must not kill the thread.
                logger.error(e.getMessage(), e);
            }
        }
    }

    /**
     * Executes a single request and dispatches the response to its callback.
     *
     * @param request the request to execute
     * @throws Exception anything thrown by the HTTP client, an interceptor or the callback
     */
    private void process(Request request) throws Exception {
        HttpClient hc = selectHttpClient();
        RequestBuilder requestBuilder = buildRequest(request);
        HttpResponse httpResponse = hc.execute(requestBuilder.build());
        Response seimiResponse = renderResponse(httpResponse, request);

        Method requestCallback = crawlerModel.getMemberMethods().get(request.getCallBack());
        if (requestCallback == null) {
            // Previously this case was dropped without a trace, which made a misspelled
            // callback name very hard to diagnose.
            logger.warn("Crawler[{}] has no callback method named '{}', response of url={} is dropped",
                    crawlerModel.getCrawlerName(), request.getCallBack(), request.getUrl());
            return;
        }

        for (SeimiInterceptor interceptor : interceptors) {
            if (appliesTo(interceptor, requestCallback)) {
                interceptor.before(requestCallback, seimiResponse);
            }
        }
        requestCallback.invoke(crawlerModel.getInstance(), seimiResponse);
        for (SeimiInterceptor interceptor : interceptors) {
            if (appliesTo(interceptor, requestCallback)) {
                interceptor.after(requestCallback, seimiResponse);
            }
        }
        logger.debug("Crawler[{}] ,url={} ,responseStatus={}", crawlerModel.getCrawlerName(),
                request.getUrl(), httpResponse.getStatusLine().getStatusCode());
    }

    /**
     * Picks the HTTP client: one bound to the crawler instance's cookie store when the crawler
     * uses cookies, otherwise the factory's default client.
     */
    private HttpClient selectHttpClient() {
        if (crawlerModel.isUseCookie()) {
            return HttpClientFactory.getHttpClient(10000, crawlerModel.getInstance().getCookieStore());
        }
        return HttpClientFactory.getHttpClient();
    }

    /**
     * Builds the outgoing HTTP request: POST when the request says so, GET otherwise, with the
     * request parameters added and the crawler's proxy applied through the request config.
     */
    private RequestBuilder buildRequest(Request request) {
        RequestConfig config = RequestConfig.custom().setProxy(crawlerModel.getProxy()).build();
        RequestBuilder requestBuilder;
        if (HttpMethod.POST.equals(request.getHttpMethod())) {
            requestBuilder = RequestBuilder.post().setUri(request.getUrl());
        } else {
            requestBuilder = RequestBuilder.get().setUri(request.getUrl());
        }
        if (request.getParams() != null) {
            for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
                requestBuilder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        requestBuilder.setConfig(config);
        return requestBuilder;
    }

    /**
     * Decides whether an interceptor should run around the given callback: either its
     * {@link Interceptor} annotation asks for every method, or the interceptor's target
     * annotation is present on the callback method or on the crawler class.
     *
     * <p>An interceptor class without an {@link Interceptor} annotation is treated as
     * "not every method" instead of failing with a {@code NullPointerException}.
     */
    private boolean appliesTo(SeimiInterceptor interceptor, Method callback) {
        Interceptor interAnno = interceptor.getClass().getAnnotation(Interceptor.class);
        return (interAnno != null && interAnno.everyMethod())
                || callback.isAnnotationPresent(interceptor.getTargetAnnotationClass())
                || crawlerModel.getClazz().isAnnotationPresent(interceptor.getTargetAnnotationClass());
    }

    /**
     * Converts the raw HTTP response into a {@link Response}.
     *
     * <p>Bodies whose content type contains {@code "image"} are stored as binary data, with
     * the last path segment of the request URL as content; everything else (including a
     * response without a content type) is stored as text plus its bytes. A failure while
     * reading the body is logged and leaves content/data unset.
     *
     * @param httpResponse the response returned by the HTTP client
     * @param request      the request that produced the response
     * @return the populated response; never {@code null}
     */
    private Response renderResponse(HttpResponse httpResponse, Request request) {
        Response seimiResponse = new Response();
        HttpEntity entity = httpResponse.getEntity();
        seimiResponse.setHttpResponse(httpResponse);
        seimiResponse.setReponseEntity(entity);
        seimiResponse.setUrl(request.getUrl());
        seimiResponse.setRequest(request);
        if (entity == null) {
            return seimiResponse;
        }
        try {
            Header referer = httpResponse.getFirstHeader("Referer");
            if (referer != null) {
                seimiResponse.setReferer(referer.getValue());
            }
            if (isImage(entity)) {
                seimiResponse.setBodyType(BodyType.BINARY);
                try {
                    seimiResponse.setData(EntityUtils.toByteArray(entity));
                    seimiResponse.setContent(StringUtils.substringAfterLast(request.getUrl(), "/"));
                } catch (Exception e) {
                    logger.error("no data can be read from httpResponse", e);
                }
            } else {
                seimiResponse.setBodyType(BodyType.TEXT);
                try {
                    seimiResponse.setContent(EntityUtils.toString(entity));
                    // NOTE(review): getBytes() encodes with the platform default charset, which
                    // may differ from the charset the body was decoded with.
                    seimiResponse.setData(seimiResponse.getContent().getBytes());
                } catch (Exception e) {
                    logger.error("no content data", e);
                }
            }
        } finally {
            // Make sure the content stream is closed even when reading the body failed,
            // so the underlying connection is not left holding an open stream.
            EntityUtils.consumeQuietly(entity);
        }
        return seimiResponse;
    }

    /**
     * Tells whether the entity declares an image content type. A missing content-type header
     * counts as "not an image" rather than causing a {@code NullPointerException}.
     */
    private static boolean isImage(HttpEntity entity) {
        Header contentType = entity.getContentType();
        if (contentType == null || contentType.getValue() == null) {
            return false;
        }
        return contentType.getValue().contains("image");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy