com.geccocrawler.gecco.spider.SpiderBeanFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gecco Show documentation
Show all versions of gecco Show documentation
Easy to use lightweight web crawler
package com.geccocrawler.gecco.spider;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.reflections.Reflections;
import org.reflections.scanners.TypeAnnotationsScanner;
import org.reflections.util.ConfigurationBuilder;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.downloader.DownloaderAOPFactory;
import com.geccocrawler.gecco.downloader.DownloaderFactory;
import com.geccocrawler.gecco.downloader.MonitorDownloaderFactory;
import com.geccocrawler.gecco.dynamic.GeccoClassLoader;
import com.geccocrawler.gecco.dynamic.GeccoJavaReflectionAdapter;
import com.geccocrawler.gecco.pipeline.DefaultPipelineFactory;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.pipeline.PipelineFactory;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.render.MonitorRenderFactory;
import com.geccocrawler.gecco.spider.render.RenderFactory;
import com.geccocrawler.gecco.spider.render.RenderType;
import com.geccocrawler.gecco.utils.ReflectUtils;
import com.geccocrawler.gecco.utils.UrlMatcher;
/**
* SpiderBean是爬虫渲染的JavaBean的统一接口类,所有Bean均继承该接口。SpiderBeanFactroy会根据请求的url地址,
* 匹配相应的SpiderBean,同时生成该SpiderBean的上下文SpiderBeanContext. SpiderBeanContext包括需要改SpiderBean的渲染类
* (目前支持HTML、JSON两种Bean的渲染方式)、下载前处理类、下载后处理类以及渲染完成后对SpiderBean的后续处理Pipeline。
*
* @author huchengyi
*
*/
public class SpiderBeanFactory {
private static final Log LOG = LogFactory.getLog(SpiderBeanFactory.class);
/**
* 匹配的SpriderBean matchUrl:SpiderBean
*/
private Map> spiderBeans;
/**
* 匹配的SpiderBean上下文 SpiderBeanClassName:SpiderBeanClass
*/
private Map spiderBeanContexts;
private DownloaderFactory downloaderFactory;
private DownloaderAOPFactory downloaderAOPFactory;
private RenderFactory renderFactory;
private PipelineFactory pipelineFactory;
protected Reflections reflections;
public SpiderBeanFactory(String classPath) {
this(classPath, null);
}
public SpiderBeanFactory(String classPath, PipelineFactory pipelineFactory) {
if (StringUtils.isNotEmpty(classPath)) {
reflections = new Reflections(
ConfigurationBuilder.build("com.geccocrawler.gecco", classPath, GeccoClassLoader.get())
.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
// reflections = new Reflections("com.geccocrawler.gecco", classPath);
} else {
reflections = new Reflections(ConfigurationBuilder.build("com.geccocrawler.gecco", GeccoClassLoader.get())
.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
// reflections = new Reflections("com.geccocrawler.gecco");
}
dynamic();
this.downloaderFactory = new MonitorDownloaderFactory(reflections);
this.downloaderAOPFactory = new DownloaderAOPFactory(reflections);
this.renderFactory = new MonitorRenderFactory(reflections);
if (pipelineFactory != null) {
this.pipelineFactory = pipelineFactory;
} else {
this.pipelineFactory = new DefaultPipelineFactory(reflections);
}
this.spiderBeans = new ConcurrentHashMap>();
this.spiderBeanContexts = new ConcurrentHashMap();
loadSpiderBean(reflections);
}
/**
* 动态增加的spiderBean
*/
private void dynamic() {
GeccoClassLoader gcl = GeccoClassLoader.get();
for (String className : gcl.getClasses().keySet()) {
reflections.getStore().get(TypeAnnotationsScanner.class.getSimpleName()).put(Gecco.class.getName(),
className);
}
}
private void loadSpiderBean(Reflections reflections) {
Set> spiderBeanClasses = reflections.getTypesAnnotatedWith(Gecco.class);
for (Class> spiderBeanClass : spiderBeanClasses) {
addSpiderBean(spiderBeanClass);
}
}
@SuppressWarnings({ "unchecked" })
public void addSpiderBean(Class> spiderBeanClass) {
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
for(String matchUrl : gecco.matchUrl()) {
//String matchUrl = gecco.matchUrl();
try {
// SpiderBean spider = (SpiderBean)spiderBeanClass.newInstance();
// 判断是不是SpiderBeanClass????
if (spiderBeans.containsKey(matchUrl)) {
LOG.warn("there are multil '" + matchUrl + "' ,first htmlBean will be Override。");
}
spiderBeans.put(matchUrl, (Class extends SpiderBean>) spiderBeanClass);
SpiderBeanContext context = initContext(spiderBeanClass);
spiderBeanContexts.put(spiderBeanClass.getName(), context);
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
public void removeSpiderBean(Class> spiderBeanClass) {
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
for(String matchUrl : gecco.matchUrl()) {
//String matchUrl = gecco.matchUrl();
try {
spiderBeans.remove(matchUrl);
spiderBeanContexts.remove(spiderBeanClass.getName());
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
public Class extends SpiderBean> matchSpider(HttpRequest request) {
String url = request.getUrl();
Class extends SpiderBean> commonSpider = null;// 通用爬虫
for (Map.Entry> entrys : spiderBeans.entrySet()) {
Class extends SpiderBean> spider = entrys.getValue();
String urlPattern = entrys.getKey();
Map params = UrlMatcher.match(url, urlPattern);
if (params != null) {
request.setParameters(params);
return spider;
} else {
if (urlPattern.equals("*")) {
commonSpider = spider;
}
}
}
if (commonSpider != null) {// 如果包含通用爬虫,返回通用爬虫
return commonSpider;
}
return null;
}
public SpiderBeanContext getContext(Class extends SpiderBean> spider) {
return spiderBeanContexts.get(spider.getName());
}
private SpiderBeanContext initContext(Class> spiderBeanClass) {
SpiderBeanContext context = new SpiderBeanContext();
// 关联的after、before、downloader
downloadContext(context, spiderBeanClass);
// 关联的render
renderContext(context, spiderBeanClass);
// 关联的pipelines
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
String[] pipelineNames = gecco.pipelines();
pipelineContext(context, pipelineNames);
return context;
}
private void downloadContext(SpiderBeanContext context, Class> spiderBeanClass) {
String geccoName = spiderBeanClass.getName();
context.setBeforeDownload(downloaderAOPFactory.getBefore(geccoName));
context.setAfterDownload(downloaderAOPFactory.getAfter(geccoName));
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
String downloader = gecco.downloader();
context.setDownloader(downloaderFactory.getDownloader(downloader));
context.setTimeout(gecco.timeout());
}
private void renderContext(SpiderBeanContext context, Class> spiderBeanClass) {
RenderType renderType = RenderType.HTML;
if (ReflectUtils.haveSuperType(spiderBeanClass, JsonBean.class)) {
renderType = RenderType.JSON;
}
context.setRender(renderFactory.getRender(renderType));
}
@SuppressWarnings({ "rawtypes" })
private void pipelineContext(SpiderBeanContext context, String[] pipelineNames) {
if (pipelineNames != null && pipelineNames.length > 0) {
List pipelines = new ArrayList();
for (String pipelineName : pipelineNames) {
if (StringUtils.isEmpty(pipelineName)) {
continue;
}
Pipeline pipeline = pipelineFactory.getPipeline(pipelineName);
if (pipeline != null) {
pipelines.add(pipeline);
}
}
context.setPipelines(pipelines);
}
}
public DownloaderAOPFactory getDownloaderAOPFactory() {
return downloaderAOPFactory;
}
public RenderFactory getRenderFactory() {
return renderFactory;
}
public PipelineFactory getPipelineFactory() {
return pipelineFactory;
}
public DownloaderFactory getDownloaderFactory() {
return downloaderFactory;
}
public Reflections getReflections() {
return reflections;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy