All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.geccocrawler.gecco.spider.SpiderBeanFactory Maven / Gradle / Ivy

There is a newer version: 1.3.21
Show newest version
package com.geccocrawler.gecco.spider;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.reflections.Reflections;
import org.reflections.scanners.TypeAnnotationsScanner;
import org.reflections.util.ConfigurationBuilder;

import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.downloader.DownloaderAOPFactory;
import com.geccocrawler.gecco.downloader.DownloaderFactory;
import com.geccocrawler.gecco.downloader.MonitorDownloaderFactory;
import com.geccocrawler.gecco.dynamic.GeccoClassLoader;
import com.geccocrawler.gecco.dynamic.GeccoJavaReflectionAdapter;
import com.geccocrawler.gecco.pipeline.DefaultPipelineFactory;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.pipeline.PipelineFactory;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.render.MonitorRenderFactory;
import com.geccocrawler.gecco.spider.render.RenderFactory;
import com.geccocrawler.gecco.spider.render.RenderType;
import com.geccocrawler.gecco.utils.ReflectUtils;
import com.geccocrawler.gecco.utils.UrlMatcher;

/**
 * SpiderBean是爬虫渲染的JavaBean的统一接口类,所有Bean均继承该接口。SpiderBeanFactroy会根据请求的url地址,
 * 匹配相应的SpiderBean,同时生成该SpiderBean的上下文SpiderBeanContext. SpiderBeanContext包括需要改SpiderBean的渲染类
 * (目前支持HTML、JSON两种Bean的渲染方式)、下载前处理类、下载后处理类以及渲染完成后对SpiderBean的后续处理Pipeline。
 * 
 * @author huchengyi
 *
 */
public class SpiderBeanFactory {

	private static final Log LOG = LogFactory.getLog(SpiderBeanFactory.class);

	/**
	 * 匹配的SpriderBean matchUrl:SpiderBean
	 */
	private Map> spiderBeans;

	/**
	 * 匹配的SpiderBean上下文 SpiderBeanClassName:SpiderBeanClass
	 */
	private Map spiderBeanContexts;

	private DownloaderFactory downloaderFactory;

	private DownloaderAOPFactory downloaderAOPFactory;

	private RenderFactory renderFactory;

	private PipelineFactory pipelineFactory;

	protected Reflections reflections;

	public SpiderBeanFactory(String classPath) {
		this(classPath, null);
	}

	public SpiderBeanFactory(String classPath, PipelineFactory pipelineFactory) {
		if (StringUtils.isNotEmpty(classPath)) {
			reflections = new Reflections(
					ConfigurationBuilder.build("com.geccocrawler.gecco", classPath, GeccoClassLoader.get())
							.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
			// reflections = new Reflections("com.geccocrawler.gecco", classPath);
		} else {
			reflections = new Reflections(ConfigurationBuilder.build("com.geccocrawler.gecco", GeccoClassLoader.get())
					.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
			// reflections = new Reflections("com.geccocrawler.gecco");
		}
		dynamic();

		this.downloaderFactory = new MonitorDownloaderFactory(reflections);
		this.downloaderAOPFactory = new DownloaderAOPFactory(reflections);
		this.renderFactory = new MonitorRenderFactory(reflections);
		if (pipelineFactory != null) {
			this.pipelineFactory = pipelineFactory;
		} else {
			this.pipelineFactory = new DefaultPipelineFactory(reflections);
		}
		this.spiderBeans = new ConcurrentHashMap>();
		this.spiderBeanContexts = new ConcurrentHashMap();
		loadSpiderBean(reflections);
	}

	/**
	 * 动态增加的spiderBean
	 */
	private void dynamic() {
		GeccoClassLoader gcl = GeccoClassLoader.get();
		for (String className : gcl.getClasses().keySet()) {
			reflections.getStore().get(TypeAnnotationsScanner.class.getSimpleName()).put(Gecco.class.getName(),
					className);
		}
	}

	private void loadSpiderBean(Reflections reflections) {
		Set> spiderBeanClasses = reflections.getTypesAnnotatedWith(Gecco.class);
		for (Class spiderBeanClass : spiderBeanClasses) {
			addSpiderBean(spiderBeanClass);
		}
	}

	@SuppressWarnings({ "unchecked" })
	public void addSpiderBean(Class spiderBeanClass) {
		Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
		for(String matchUrl : gecco.matchUrl()) {
		//String matchUrl = gecco.matchUrl();
			try {
				// SpiderBean spider = (SpiderBean)spiderBeanClass.newInstance();
				// 判断是不是SpiderBeanClass????
				if (spiderBeans.containsKey(matchUrl)) {
					LOG.warn("there are multil '" + matchUrl + "' ,first htmlBean will be Override。");
				}
				spiderBeans.put(matchUrl, (Class) spiderBeanClass);
				SpiderBeanContext context = initContext(spiderBeanClass);
				spiderBeanContexts.put(spiderBeanClass.getName(), context);
			} catch (Exception ex) {
				ex.printStackTrace();
			}
		}
	}

	public void removeSpiderBean(Class spiderBeanClass) {
		Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
		for(String matchUrl : gecco.matchUrl()) {
		//String matchUrl = gecco.matchUrl();
			try {
				spiderBeans.remove(matchUrl);
				spiderBeanContexts.remove(spiderBeanClass.getName());
			} catch (Exception ex) {
				ex.printStackTrace();
			}
		}
	}

	public Class matchSpider(HttpRequest request) {
		String url = request.getUrl();
		Class commonSpider = null;// 通用爬虫
		for (Map.Entry> entrys : spiderBeans.entrySet()) {
			Class spider = entrys.getValue();
			String urlPattern = entrys.getKey();
			Map params = UrlMatcher.match(url, urlPattern);
			if (params != null) {
				request.setParameters(params);
				return spider;
			} else {
				if (urlPattern.equals("*")) {
					commonSpider = spider;
				}
			}
		}
		if (commonSpider != null) {// 如果包含通用爬虫,返回通用爬虫
			return commonSpider;
		}
		return null;
	}

	public SpiderBeanContext getContext(Class spider) {
		return spiderBeanContexts.get(spider.getName());
	}

	private SpiderBeanContext initContext(Class spiderBeanClass) {
		SpiderBeanContext context = new SpiderBeanContext();
		// 关联的after、before、downloader
		downloadContext(context, spiderBeanClass);
		// 关联的render
		renderContext(context, spiderBeanClass);
		// 关联的pipelines
		Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
		String[] pipelineNames = gecco.pipelines();
		pipelineContext(context, pipelineNames);
		return context;
	}

	private void downloadContext(SpiderBeanContext context, Class spiderBeanClass) {
		String geccoName = spiderBeanClass.getName();
		context.setBeforeDownload(downloaderAOPFactory.getBefore(geccoName));
		context.setAfterDownload(downloaderAOPFactory.getAfter(geccoName));
		Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
		String downloader = gecco.downloader();
		context.setDownloader(downloaderFactory.getDownloader(downloader));
		context.setTimeout(gecco.timeout());
	}

	private void renderContext(SpiderBeanContext context, Class spiderBeanClass) {
		RenderType renderType = RenderType.HTML;
		if (ReflectUtils.haveSuperType(spiderBeanClass, JsonBean.class)) {
			renderType = RenderType.JSON;
		}
		context.setRender(renderFactory.getRender(renderType));
	}

	@SuppressWarnings({ "rawtypes" })
	private void pipelineContext(SpiderBeanContext context, String[] pipelineNames) {
		if (pipelineNames != null && pipelineNames.length > 0) {
			List pipelines = new ArrayList();
			for (String pipelineName : pipelineNames) {
				if (StringUtils.isEmpty(pipelineName)) {
					continue;
				}
				Pipeline pipeline = pipelineFactory.getPipeline(pipelineName);
				if (pipeline != null) {
					pipelines.add(pipeline);
				}
			}
			context.setPipelines(pipelines);
		}
	}

	public DownloaderAOPFactory getDownloaderAOPFactory() {
		return downloaderAOPFactory;
	}

	public RenderFactory getRenderFactory() {
		return renderFactory;
	}

	public PipelineFactory getPipelineFactory() {
		return pipelineFactory;
	}

	public DownloaderFactory getDownloaderFactory() {
		return downloaderFactory;
	}

	public Reflections getReflections() {
		return reflections;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy