All downloads are free. Search and download functionality uses the official Maven repository.

com.virjar.vscrawler.core.VSCrawlerBuilder Maven / Gradle / Ivy

There is a newer version: 0.3.1
Show newest version
package com.virjar.vscrawler.core;

import com.google.common.collect.Lists;
import com.virjar.vscrawler.core.event.systemevent.SeedEmptyEvent;
import com.virjar.vscrawler.core.event.systemevent.ShutDownChecker;
import com.virjar.vscrawler.core.net.CrawlerHttpClientGenerator;
import com.virjar.vscrawler.core.net.DefaultHttpClientGenerator;
import com.virjar.vscrawler.core.net.proxy.IPPool;
import com.virjar.vscrawler.core.net.proxy.strategy.ProxyPlanner;
import com.virjar.vscrawler.core.net.proxy.strategy.ProxyStrategy;
import com.virjar.vscrawler.core.net.session.CrawlerSessionPool;
import com.virjar.vscrawler.core.net.session.LoginHandler;
import com.virjar.vscrawler.core.net.user.*;
import com.virjar.vscrawler.core.processor.BindRouteProcessor;
import com.virjar.vscrawler.core.processor.PageDownLoadProcessor;
import com.virjar.vscrawler.core.processor.RouteProcessor;
import com.virjar.vscrawler.core.processor.SeedProcessor;
import com.virjar.vscrawler.core.resourcemanager.ResourceManager;
import com.virjar.vscrawler.core.resourcemanager.ResourceManagerFactory;
import com.virjar.vscrawler.core.resourcemanager.model.ResourceSetting;
import com.virjar.vscrawler.core.resourcemanager.service.QueueStore;
import com.virjar.vscrawler.core.resourcemanager.service.RamQueueStore;
import com.virjar.vscrawler.core.resourcemanager.service.ResourceQueue;
import com.virjar.vscrawler.core.seed.*;
import com.virjar.vscrawler.core.serialize.ConsolePipeline;
import com.virjar.vscrawler.core.serialize.Pipeline;
import com.virjar.vscrawler.core.util.VSCrawlerConstant;

import java.util.List;

/**
 * Created by virjar on 17/4/30.
* build a crawlerInstance * * @author virjar * @since 0.0.1 */ public class VSCrawlerBuilder { /** * httpclient构造器,可能需要定制自己的httpclient */ private CrawlerHttpClientGenerator crawlerHttpClientGenerator; /** * 登录处理器 */ private LoginHandler loginHandler; /** * 用户数据导入源 */ private UserResourceFacade userResourceFacade; /** * 代理切换策略 */ private ProxyStrategy proxyStrategy; /** * 代理池对象 */ private IPPool ipPool; /** * 自定义代理策略的时候,代理决策器 */ private ProxyPlanner proxyPlanner; /** * 种子处理器,负责解析页面逻辑 */ private SeedProcessor processor; /** * 基于路由的页面解析器 */ private List seedRouters = Lists.newLinkedList(); /** * 序列化层 */ private List pipelineList = Lists.newLinkedList(); /** * 初始化种子来源 */ private InitSeedSource initSeedSource; /** * 种子ID决策器,他的存在可以提供自定义消重功能 */ private SeedKeyResolver seedKeyResolver; /** * 段决策器,实现种子按时间分段,分段后段内种子消重,段间种子互不消重 */ private SegmentResolver segmentResolver; private boolean loginOnSessionCreate = false; /** * session池,初始化大小 */ private int sessionPoolInitialSize = 0; /** * session池,核心大小 */ private int sessionPoolCoreSize = 5; /** * session池,最大大小 */ private int sessionPoolMaxSize = 30; /** * session池,重用时间间隔,太短了可能被封 */ private long sessionPoolReuseDuration = 0L; /** * session池,seesion最大在线时长,太长了则可能封 */ private long sessionPoolMaxOnlineDuration = Long.MAX_VALUE; private int seedManagerCacheSize = 1024; /** * 爬虫工作线程数 */ private int workerThreadNumber = Runtime.getRuntime().availableProcessors() * 4; /** * 慢启动控制 */ private boolean slowStart = false; /** * 慢启动时长 */ private long slowStartDuration = 5 * 60 * 1000; /** * 一分钟内没有新任务,则关闭爬虫 */ private long stopWhileTaskEmptyDuration = 60 * 1000; private String crawlerName = VSCrawlerConstant.DEFAULT_CRAWLER_NAME; private ResourceManager resourceManager; private QueueStore defaultQueueStore; private ResourceSetting defaultResourceSetting; public static VSCrawlerBuilder create() { return new VSCrawlerBuilder(); } public VSCrawlerBuilder setStopWhileTaskEmptyDuration(long stopWhileTaskEmptyDuration) { 
this.stopWhileTaskEmptyDuration = stopWhileTaskEmptyDuration; return this; } public VSCrawlerBuilder setSlowStart(boolean slowStart) { this.slowStart = slowStart; return this; } public VSCrawlerBuilder setSlowStartDuration(long slowStartDuration) { this.slowStartDuration = slowStartDuration; return this; } public VSCrawlerBuilder setWorkerThreadNumber(int workerThreadNumber) { this.workerThreadNumber = workerThreadNumber; return this; } public VSCrawlerBuilder setLoginOnSessionCreate(boolean loginOnSessionCreate) { this.loginOnSessionCreate = loginOnSessionCreate; return this; } public VSCrawlerBuilder setCrawlerHttpClientGenerator(CrawlerHttpClientGenerator crawlerHttpClientGenerator) { this.crawlerHttpClientGenerator = crawlerHttpClientGenerator; return this; } public VSCrawlerBuilder setInitSeedSource(InitSeedSource initSeedSource) { this.initSeedSource = initSeedSource; return this; } public VSCrawlerBuilder setIpPool(IPPool ipPool) { this.ipPool = ipPool; return this; } public VSCrawlerBuilder setLoginHandler(LoginHandler loginHandler) { this.loginHandler = loginHandler; return this; } public VSCrawlerBuilder setPipelineList(List pipelineList) { this.pipelineList = pipelineList; return this; } public VSCrawlerBuilder addPipeline(Pipeline pipeline) { this.pipelineList.add(pipeline); return this; } public VSCrawlerBuilder setProcessor(SeedProcessor processor) { this.processor = processor; return this; } public VSCrawlerBuilder addRouteProcessor(BindRouteProcessor seedRouter) { this.seedRouters.add(seedRouter); return this; } public VSCrawlerBuilder setProxyPlanner(ProxyPlanner proxyPlanner) { this.proxyPlanner = proxyPlanner; return this; } public VSCrawlerBuilder setProxyStrategy(ProxyStrategy proxyStrategy) { this.proxyStrategy = proxyStrategy; return this; } public VSCrawlerBuilder setSeedKeyResolver(SeedKeyResolver seedKeyResolver) { this.seedKeyResolver = seedKeyResolver; return this; } public VSCrawlerBuilder setSegmentResolver(SegmentResolver 
segmentResolver) { this.segmentResolver = segmentResolver; return this; } public VSCrawlerBuilder setUserResourceFacade(UserResourceFacade userResourceFacade) { this.userResourceFacade = userResourceFacade; return this; } public VSCrawlerBuilder setCrawlerName(String crawlerName) { if (crawlerName == null || !crawlerName.matches("[0-9a-zA-Z_]+")) { throw new IllegalArgumentException("crawlerName must be identified,for :" + crawlerName); } this.crawlerName = crawlerName; return this; } public VSCrawlerBuilder setSessionPoolCoreSize(int sessionPoolCoreSize) { this.sessionPoolCoreSize = sessionPoolCoreSize; if (sessionPoolMaxSize < sessionPoolCoreSize) { sessionPoolMaxSize = sessionPoolCoreSize; } if (sessionPoolInitialSize > sessionPoolMaxSize) { sessionPoolInitialSize = sessionPoolMaxSize; } return this; } public VSCrawlerBuilder setSessionPoolInitialSize(int sessionPoolInitialSize) { this.sessionPoolInitialSize = sessionPoolInitialSize; if (sessionPoolInitialSize > sessionPoolMaxSize) { sessionPoolMaxSize = sessionPoolInitialSize; } if (sessionPoolCoreSize > sessionPoolMaxSize) { sessionPoolCoreSize = sessionPoolMaxSize; } return this; } public VSCrawlerBuilder setSessionPoolMaxOnlineDuration(long sessionPoolMaxOnlineDuration) { this.sessionPoolMaxOnlineDuration = sessionPoolMaxOnlineDuration; return this; } public VSCrawlerBuilder setSessionPoolMaxSize(int sessionPoolMaxSize) { this.sessionPoolMaxSize = sessionPoolMaxSize; if (sessionPoolCoreSize > sessionPoolMaxSize) { sessionPoolCoreSize = sessionPoolMaxSize; } if (sessionPoolInitialSize > sessionPoolMaxSize) { sessionPoolInitialSize = sessionPoolMaxSize; } return this; } public VSCrawlerBuilder setSessionPoolReuseDuration(long sessionPoolReuseDuration) { this.sessionPoolReuseDuration = sessionPoolReuseDuration; return this; } public VSCrawlerBuilder setSeedManagerCacheSize(int seedManagerCacheSize) { this.seedManagerCacheSize = seedManagerCacheSize; return this; } public VSCrawlerBuilder 
setResourceManager(ResourceManager resourceManager) { this.resourceManager = resourceManager; return this; } public VSCrawlerBuilder setDefaultQueueStore(QueueStore defaultQueueStore) { this.defaultQueueStore = defaultQueueStore; return this; } public VSCrawlerBuilder setDefaultResourceSetting(ResourceSetting defaultResourceSetting) { this.defaultResourceSetting = defaultResourceSetting; return this; } public VSCrawler build() { final VSCrawlerContext vsCrawlerContext = VSCrawlerContext.create(crawlerName); if (crawlerHttpClientGenerator == null) { crawlerHttpClientGenerator = new DefaultHttpClientGenerator(); } if (proxyStrategy == null) { proxyStrategy = ProxyStrategy.NONE; } if (proxyStrategy == ProxyStrategy.CUSTOM && proxyPlanner == null) { throw new IllegalStateException("proxyPlanner must exist if proxyStrategy is custom"); } CrawlerSessionPool crawlerSessionPool = new CrawlerSessionPool(vsCrawlerContext, crawlerHttpClientGenerator, proxyStrategy, ipPool, proxyPlanner, sessionPoolMaxSize, sessionPoolCoreSize, sessionPoolInitialSize, sessionPoolReuseDuration, sessionPoolMaxOnlineDuration); if (initSeedSource == null) { initSeedSource = new LocalFileSeedSource(); } if (seedKeyResolver == null) { seedKeyResolver = new DefaultSeedKeyResolver(); } if (segmentResolver == null) { segmentResolver = new DefaultSegmentResolver(); } BerkeleyDBSeedManager berkeleyDBSeedManager = new BerkeleyDBSeedManager(vsCrawlerContext, initSeedSource, seedKeyResolver, segmentResolver, seedManagerCacheSize); if (processor == null && seedRouters.isEmpty()) { processor = new PageDownLoadProcessor(); } if (processor != null && !seedRouters.isEmpty()) { throw new IllegalStateException(" seedProcessor and routeProcessor conflict"); } if (!seedRouters.isEmpty()) { RouteProcessor routeProcessor = new RouteProcessor(); routeProcessor.addRouters(seedRouters); processor = routeProcessor; } if (pipelineList.isEmpty()) { pipelineList.add(new ConsolePipeline()); } VSCrawler vsCrawler = new 
VSCrawler(vsCrawlerContext, crawlerSessionPool, berkeleyDBSeedManager, processor, pipelineList, workerThreadNumber, slowStart, slowStartDuration); if (loginOnSessionCreate) { if (userResourceFacade == null) { userResourceFacade = new DefaultUserResource(); } } if (resourceManager == null) { resourceManager = ResourceManagerFactory.create().build(); } vsCrawlerContext.setResourceManager(resourceManager); if (defaultQueueStore == null) { defaultQueueStore = new RamQueueStore(); } vsCrawlerContext.setQueueStore(defaultQueueStore); if (defaultResourceSetting == null) { defaultResourceSetting = ResourceSetting.create().setLock(true); } vsCrawlerContext.setResourceSetting(defaultResourceSetting); if (userResourceFacade != null) { if (loginHandler == null) { throw new IllegalStateException("login handler is null ,but open login switch"); } ResourceQueue resourceQueue = resourceManager.getResourceQueue(vsCrawlerContext.makeUserResourceTag()); if (resourceQueue != null) { resourceQueue.addResourceLoader(new UserManager2ResourceLoader(userResourceFacade)); } else { resourceManager.registry(new ResourceQueue(vsCrawlerContext.makeUserResourceTag(), defaultQueueStore, defaultResourceSetting, new UserManager2ResourceLoader(userResourceFacade))); } vsCrawler.addCrawlerStartCallBack(new AutoLoginPlugin(loginHandler, new UserManager2(resourceManager, vsCrawlerContext))); } if (stopWhileTaskEmptyDuration > 0) { vsCrawler.addCrawlerStartCallBack(new VSCrawler.CrawlerStartCallBack() { @Override public void onCrawlerStart(final VSCrawler vsCrawler) { vsCrawler.getVsCrawlerContext().getAutoEventRegistry().registerObserver(new ShutDownChecker() { @Override public void checkShutDown(VSCrawlerContext vsCrawlerContext1) { // 15s之后检查活跃线程数,发现为0,证明连续10s都没用任务执行了 if (vsCrawler.activeWorker() == 0 && (System.currentTimeMillis() - vsCrawler.getLastActiveTime()) > 10000) { System.out.println((stopWhileTaskEmptyDuration / 1000) + "秒没收到爬虫任务,自动爬虫关闭器,尝试停止爬虫"); vsCrawler.stopCrawler(); } } }); 
vsCrawler.getVsCrawlerContext().getAutoEventRegistry().registerObserver(new SeedEmptyEvent() { @Override public void onSeedEmpty(VSCrawlerContext vsCrawlerContext1) { vsCrawler.getVsCrawlerContext().getAutoEventRegistry().createDelayEventSender(ShutDownChecker.class, stopWhileTaskEmptyDuration).delegate() .checkShutDown(vsCrawlerContext); } }); } }); } return vsCrawler; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy