All downloads are free. The search and download functionality uses the official Maven repository.

cn.wanghaomiao.seimi.struct.CrawlerModel Maven / Gradle / Ivy

Go to download

一个支持分布式的可以高效开发且可以高效运行的爬虫框架。设计思想上融合了spring与scrapy的优点。

There is a newer version: 2.1.4
Show newest version
package cn.wanghaomiao.seimi.struct;

import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.utils.StrFormatUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.util.Assert;
import org.springframework.util.ReflectionUtils;

import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Holds everything resolved for one crawler class at start-up: the crawler
 * bean, its queue bean, a name-to-{@link Method} lookup table and the settings
 * read from the crawler's {@link Crawler} annotation (name, proxy, cookie usage).
 *
 * <p>All resolution happens in the constructor; the setters only replace the
 * stored references and do not re-run the initialisation.
 *
 * @author 汪浩淼 [[email protected]]
 *         Date: 2015/7/17.
 */
public class CrawlerModel {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerModel.class);

    /**
     * Accepted proxy syntax: {@code scheme://host:port} where scheme is one of
     * http, https or socket. Group 1 = scheme, group 2 = host, group 3 = port.
     * The host is a dot-separated list of labels (letters, digits, hyphens) with
     * an optional trailing dot; the pattern has no nested quantifiers, so a
     * malformed value cannot trigger exponential backtracking.
     */
    private static final Pattern PROXY_PATTERN = Pattern.compile(
            "(http|https|socket)://([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*\\.?):(\\d+)");

    /** Largest valid TCP port number. */
    private static final int MAX_PORT = 65535;

    private ApplicationContext context;
    private BaseSeimiCrawler instance;
    private Class<? extends BaseSeimiCrawler> clazz;
    private SeimiQueue queueInstance;
    private Class<? extends SeimiQueue> queueClass;
    private Map<String, Method> memberMethods;
    private String crawlerName;
    private HttpHost proxy;
    private boolean useCookie = false;

    /**
     * Looks up the crawler bean for {@code cls} in the given context and resolves
     * its queue, methods and annotation settings.
     *
     * @param cls                crawler class; must carry the {@link Crawler} annotation
     * @param applicationContext context that provides the crawler and queue beans
     * @throws IllegalArgumentException if an argument is {@code null} or the
     *                                  class is not annotated with {@link Crawler}
     */
    public CrawlerModel(Class<? extends BaseSeimiCrawler> cls, ApplicationContext applicationContext) {
        Assert.notNull(cls, "crawler class must not be null");
        Assert.notNull(applicationContext, "applicationContext must not be null");
        this.context = applicationContext;
        this.clazz = cls;
        this.instance = context.getBean(cls);
        init();
    }

    /**
     * Reads the {@link Crawler} annotation and wires the crawler instance:
     * queue, method table, crawler name, proxy and cookie flag.
     */
    private void init() {
        Crawler c = clazz.getAnnotation(Crawler.class);
        Assert.notNull(c, StrFormatUtil.info("crawler {} lost annotation @cn.wanghaomiao.seimi.annotation.Crawler!", clazz.getName()));

        this.queueClass = c.queue();
        this.queueInstance = context.getBean(queueClass);
        // Defensive guard only: Spring's getBean normally throws instead of returning null.
        Assert.notNull(queueInstance, StrFormatUtil.info("can not get {} instance,please check scan path", queueClass));
        instance.setQueue(queueInstance);

        // Methods are keyed by simple name, so declarations that share a name
        // (overloads, or the same method declared at several levels of the
        // hierarchy) collapse into a single entry.
        final Map<String, Method> methods = new HashMap<>();
        ReflectionUtils.doWithMethods(clazz, new ReflectionUtils.MethodCallback() {
            @Override
            public void doWith(Method method) throws IllegalArgumentException, IllegalAccessException {
                methods.put(method.getName(), method);
            }
        });
        this.memberMethods = methods;

        // Fall back to the class's simple name when the annotation gives no name.
        this.crawlerName = StringUtils.isNotBlank(c.name()) ? c.name() : clazz.getSimpleName();
        instance.setCrawlerName(this.crawlerName);
        resolveProxy(c.proxy());
        this.useCookie = c.useCookie();
        logger.info("Crawler[{}] init complete.", crawlerName);
    }

    /**
     * Stores the proxy described by {@code proxyStr}. A blank value leaves the
     * proxy untouched (no proxy configured); a malformed value is logged and
     * results in no proxy rather than aborting crawler initialisation.
     *
     * @param proxyStr proxy in the form {@code http|https|socket://host:port}, may be blank
     */
    private void resolveProxy(String proxyStr) {
        if (StringUtils.isBlank(proxyStr)) {
            return;
        }
        proxy = parseProxy(proxyStr.trim());
    }

    /**
     * Parses a non-blank proxy string.
     *
     * @param proxyStr trimmed proxy string
     * @return the parsed host, or {@code null} if the string is malformed or the
     *         port is outside {@code 0..65535}
     */
    private static HttpHost parseProxy(String proxyStr) {
        Matcher m = PROXY_PATTERN.matcher(proxyStr);
        if (!m.matches()) {
            logger.error("proxy must like ‘http|https|socket://host:port’, but got '{}'", proxyStr);
            return null;
        }
        String scheme = m.group(1);
        String host = m.group(2);

        int port;
        try {
            port = Integer.parseInt(m.group(3));
        } catch (NumberFormatException e) {
            // Digit run too long for an int; treat like any other out-of-range port.
            port = -1;
        }
        if (port < 0 || port > MAX_PORT) {
            logger.error("proxy port must be in 0..{}, but got '{}'", MAX_PORT, proxyStr);
            return null;
        }

        // "socket" is not passed on as a scheme name; the two-argument
        // constructor is used so HttpHost applies its default scheme.
        if ("socket".equals(scheme)) {
            return new HttpHost(host, port);
        }
        return new HttpHost(host, port, scheme);
    }

    /** @return the application context the beans were resolved from */
    public ApplicationContext getContext() {
        return context;
    }

    /** @param context replacement application context (beans are not re-resolved) */
    public void setContext(ApplicationContext context) {
        this.context = context;
    }

    /** @return the crawler bean */
    public BaseSeimiCrawler getInstance() {
        return instance;
    }

    /** @param instance replacement crawler bean */
    public void setInstance(BaseSeimiCrawler instance) {
        this.instance = instance;
    }

    /** @return the crawler class this model was built for */
    public Class<? extends BaseSeimiCrawler> getClazz() {
        return clazz;
    }

    /** @param clazz replacement crawler class */
    public void setClazz(Class<? extends BaseSeimiCrawler> clazz) {
        this.clazz = clazz;
    }

    /** @return the queue bean handed to the crawler */
    public SeimiQueue getQueueInstance() {
        return queueInstance;
    }

    // NOTE: the non-standard casing of the next three method names is kept on
    // purpose so existing callers keep compiling.

    /** @param queueImpl replacement queue bean */
    public void setqueueImpl(SeimiQueue queueImpl) {
        this.queueInstance = queueImpl;
    }

    /** @return the queue class declared on the {@link Crawler} annotation */
    public Class<? extends SeimiQueue> getqueueClass() {
        return queueClass;
    }

    /** @param queueClass replacement queue class */
    public void setqueueClass(Class<? extends SeimiQueue> queueClass) {
        this.queueClass = queueClass;
    }

    /** @return the crawler's methods keyed by method name */
    public Map<String, Method> getMemberMethods() {
        return memberMethods;
    }

    /** @param memberMethods replacement name-to-method table */
    public void setMemberMethods(Map<String, Method> memberMethods) {
        this.memberMethods = memberMethods;
    }

    /** @return the annotation-supplied name, or the class's simple name if none was given */
    public String getCrawlerName() {
        return crawlerName;
    }

    /** @return the configured proxy, or {@code null} when none is configured or it was malformed */
    public HttpHost getProxy() {
        return proxy;
    }

    /** @return the {@code useCookie} value read from the {@link Crawler} annotation */
    public boolean isUseCookie() {
        return useCookie;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy