cn.wanghaomiao.seimi.struct.CrawlerModel Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SeimiCrawler Show documentation
Show all versions of SeimiCrawler Show documentation
一个支持分布式的可以高效开发且可以高效运行的爬虫框架。设计思想上融合了spring与scrapy的优点。
package cn.wanghaomiao.seimi.struct;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.utils.StrFormatUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.util.Assert;
import org.springframework.util.ReflectionUtils;
import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Map;
/**
* @author 汪浩淼 [[email protected]]
* Date: 2015/7/17.
*/
public class CrawlerModel {
private ApplicationContext context;
private BaseSeimiCrawler instance;
private Class extends BaseSeimiCrawler> clazz;
private SeimiQueue queueInstance;
private Class extends SeimiQueue> queueClass;
private Map memberMethods;
private String crawlerName;
private HttpHost proxy;
private boolean useCookie = false;
private Logger logger = LoggerFactory.getLogger(CrawlerModel.class);
public CrawlerModel(Class extends BaseSeimiCrawler> cls,ApplicationContext applicationContext){
super();
this.context = applicationContext;
this.clazz = cls;
this.instance = context.getBean(cls);
init();
}
private void init(){
Crawler c = clazz.getAnnotation(Crawler.class);
Assert.notNull(c, StrFormatUtil.info("crawler {} lost annotation @cn.wanghaomiao.seimi.annotation.Crawler!",clazz.getName()));
this.queueClass = c.queue();
this.queueInstance = context.getBean(queueClass);
Assert.notNull(queueInstance, StrFormatUtil.info("can not get {} instance,please check scan path", queueClass));
instance.setQueue(queueInstance);
memberMethods = new HashMap<>();
ReflectionUtils.doWithMethods(clazz, new ReflectionUtils.MethodCallback() {
@Override
public void doWith(Method method) throws IllegalArgumentException, IllegalAccessException {
memberMethods.put(method.getName(),method);
}
});
this.crawlerName = StringUtils.isNoneBlank(c.name())?c.name():clazz.getSimpleName();
instance.setCrawlerName(this.crawlerName);
resolveProxy(c.proxy());
this.useCookie = c.useCookie();
logger.info("Crawler[{}] init complete.", crawlerName);
}
private void resolveProxy(String proxyStr){
HttpHost r = null;
if (StringUtils.isBlank(proxyStr)){
return;
}
if (proxyStr.matches("(http|https|socket)://([0-9a-zA-Z]+\\.?)+:\\d+")){
String[] pies = proxyStr.split(":");
String scheme = pies[0];
int port = Integer.parseInt(pies[2]);
String host = pies[1].substring(2);
if (scheme.equals("socket")){
r = new HttpHost(host,port);
}else {
r = new HttpHost(host,port,scheme);
}
}else {
logger.error("proxy must like ‘http|https|socket://host:port’");
}
proxy = r;
}
public ApplicationContext getContext() {
return context;
}
public void setContext(ApplicationContext context) {
this.context = context;
}
public BaseSeimiCrawler getInstance() {
return instance;
}
public void setInstance(BaseSeimiCrawler instance) {
this.instance = instance;
}
public Class extends BaseSeimiCrawler> getClazz() {
return clazz;
}
public void setClazz(Class extends BaseSeimiCrawler> clazz) {
this.clazz = clazz;
}
public SeimiQueue getQueueInstance() {
return queueInstance;
}
public void setqueueImpl(SeimiQueue queueImpl) {
this.queueInstance = queueImpl;
}
public Class extends SeimiQueue> getqueueClass() {
return queueClass;
}
public void setqueueClass(Class extends SeimiQueue> queueClass) {
this.queueClass = queueClass;
}
public Map getMemberMethods() {
return memberMethods;
}
public void setMemberMethods(Map memberMethods) {
this.memberMethods = memberMethods;
}
public String getCrawlerName() {
return crawlerName;
}
public HttpHost getProxy() {
return proxy;
}
public boolean isUseCookie() {
return useCookie;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy