/*
 * Copyright (c) 2020 Renlm
 * MyCrawler is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 * 	http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
package cn.renlm.plugins;

import java.util.List;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.renlm.plugins.MyCrawler.MySite;
import cn.renlm.plugins.MyCrawler.MySpider;
import cn.renlm.plugins.MyCrawler.data.MyProcessPage;
import cn.renlm.plugins.MyCrawler.data.MyProcessPipe;
import cn.renlm.plugins.MyCrawler.pipeline.MyPipeline;
import cn.renlm.plugins.MyCrawler.processor.MyPageProcessor;
import cn.renlm.plugins.MyCrawler.scheduler.MyDuplicateVerify;
import cn.renlm.plugins.MyCrawler.scheduler.MyQueueScheduler;
import cn.renlm.plugins.MyCrawler.scheduler.MyRedisScheduler;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Crawler utilities.
 * 
 * @author RenLiMing(任黎明)
 *
 */
@Slf4j
@UtilityClass
public class MyCrawlerUtil {

	/** Request extra key carrying a request's crawl depth; seed requests without it default to depth 1. */
	public static final String depthExtraKey = "_MyCrawlerDepthExtra_";

	/** Request extra key holding a BASE64-encoded screenshot of the page, if one was captured. */
	public static final String screenshotBASE64ExtraKey = "_MyCrawlerScreenshotBASE64Extra_";

	/**
	 * Create a spider instance backed by an in-memory queue scheduler.
	 * 
	 * @param site          site configuration
	 * @param pageProcessor page-processing callback
	 * @param pipelines     result pipelines
	 * @return the configured spider
	 */
	public static final MySpider createSpider(MySite site, MyPageProcessor pageProcessor, MyPipeline... pipelines) {
		MyQueueScheduler scheduler = new MyQueueScheduler();
		PageProcessor processor = createPageProcessor(site, pageProcessor);
		MySpider mySpider = new MySpider(processor, site, scheduler);
		mySpider.setScheduler(scheduler);
		for (MyPipeline pipeline : pipelines) {
			mySpider.addPipeline(createPipeline(site, pipeline, scheduler));
		}
		return mySpider.onDownloaded(site, page -> {
			if (page.isDownloadSuccess()) {
				log.debug("{} download success.", page.getUrl());
			} else {
				log.error("{} download fail.", page.getUrl());
			}
		});
	}
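
	/*
	 * Usage sketch for the in-memory variant. This is a minimal example; it
	 * assumes MyPageProcessor and MyPipeline are single-method callbacks
	 * usable as lambdas, and that MySpider exposes WebMagic's usual
	 * Spider#run() entry point (neither is shown in this file):
	 *
	 *   MySpider spider = MyCrawlerUtil.createSpider(site,
	 *           myPage -> { ... extract fields, queue follow-up links ... },
	 *           myPipe -> { ... persist the collected result items ... });
	 *   spider.run();
	 */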

	/**
	 * Create a spider instance backed by a Redis scheduler, so that the URL
	 * queue and duplicate checks can be shared across processes.
	 * 
	 * @param pool          Jedis connection pool
	 * @param site          site configuration
	 * @param pageProcessor page-processing callback
	 * @param pipelines     result pipelines
	 * @return the configured spider
	 */
	public static final MySpider createSpider(JedisPool pool, MySite site, MyPageProcessor pageProcessor,
			MyPipeline... pipelines) {
		MyRedisScheduler scheduler = new MyRedisScheduler(pool);
		PageProcessor processor = createPageProcessor(site, pageProcessor);
		MySpider mySpider = new MySpider(processor, site, scheduler);
		mySpider.setScheduler(scheduler);
		for (MyPipeline pipeline : pipelines) {
			mySpider.addPipeline(createPipeline(site, pipeline, scheduler));
		}
		return mySpider.onDownloaded(site, page -> {
			if (page.isDownloadSuccess()) {
				log.debug("{} download success.", page.getUrl());
			} else {
				log.error("{} download fail.", page.getUrl());
			}
		});
	}
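
	/*
	 * Usage sketch for the Redis-backed variant. new JedisPool(host, port)
	 * is the standard Jedis pool constructor; a shared Redis frontier lets
	 * several crawler processes cooperate on one URL queue with shared
	 * de-duplication. The site/processor/pipeline setup mirrors the
	 * in-memory example above:
	 *
	 *   JedisPool pool = new JedisPool("localhost", 6379);
	 *   MySpider spider = MyCrawlerUtil.createSpider(pool, site, pageProcessor, pipelines);
	 *   spider.run();
	 */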

	/**
	 * Adapt a {@link MyPageProcessor} to WebMagic's {@link PageProcessor},
	 * propagating crawl depth through request extras.
	 * 
	 * @param site          site configuration
	 * @param pageProcessor page-processing callback
	 * @return the adapted page processor
	 */
	private static final PageProcessor createPageProcessor(final MySite site, final MyPageProcessor pageProcessor) {
		return new PageProcessor() {
			@Override
			public void process(Page page) {
				// Seed requests carry no depth extra, so they default to depth 1
				int depth = ObjectUtil.defaultIfNull(page.getRequest().getExtra(depthExtraKey), 1);
				String screenshotBASE64 = page.getRequest().getExtra(screenshotBASE64ExtraKey);
				MyProcessPage myPage = new MyProcessPage(depth, site, page, screenshotBASE64);
				pageProcessor.process(myPage);
				List<Request> targetRequests = page.getTargetRequests();
				// Crawl depth control: tag each discovered request with the next
				// depth, and drop them all once the maximum depth is reached
				if (CollUtil.isNotEmpty(targetRequests)) {
					targetRequests.forEach(it -> it.putExtra(depthExtraKey, depth + 1));
					if (site.getMaxDepth() > 0 && depth >= site.getMaxDepth()) {
						targetRequests.clear();
					}
				}
			}
			}

			@Override
			public Site getSite() {
				return site;
			}
		};
	}
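
	/*
	 * Depth bookkeeping, by example: a seed request carries no depth extra
	 * and is treated as depth 1; links found on it are tagged depth 2, their
	 * links depth 3, and so on. With site.getMaxDepth() == 2, a depth-2 page
	 * is still processed, but its outgoing links are cleared before they can
	 * be scheduled.
	 */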

	/**
	 * Adapt a {@link MyPipeline} to WebMagic's {@link Pipeline}.
	 * 
	 * @param site            site configuration (reserved; not used by the wrapper itself)
	 * @param pipeline        result-pipeline callback
	 * @param duplicateVerify duplicate-check hook shared with the scheduler
	 * @return the adapted pipeline
	 */
	private static final Pipeline createPipeline(final MySite site, final MyPipeline pipeline,
			final MyDuplicateVerify duplicateVerify) {
		return new Pipeline() {
			@Override
			public void process(ResultItems resultItems, Task task) {
				MyProcessPipe myData = new MyProcessPipe(task, resultItems, duplicateVerify);
				pipeline.process(myData);
			}
		};
	}
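
	/*
	 * Note that both schedulers used above (MyQueueScheduler and
	 * MyRedisScheduler) are handed to createPipeline as the MyDuplicateVerify,
	 * so pipelines can consult the same de-duplication state the scheduler
	 * itself uses.
	 */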
}