All downloads are free. Search and download functionality uses the official Maven repository.

cn.renlm.plugins.MyCrawler.MySpider Maven / Gradle / Ivy

There is a newer version: 2.8.8
Show newest version
/*
 * Copyright (c) 2020 Renlm
 * MyCrawler is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 * 	http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
package cn.renlm.plugins.MyCrawler;

import java.io.IOException;
import java.util.function.Consumer;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.BooleanUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.setting.Setting;
import cn.renlm.plugins.MyCrawler.scheduler.MyDuplicateVerify;
import cn.renlm.plugins.MyCrawler.selenium.ChromeDownloader;
import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.SpiderListener;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * 爬虫
 * 
 * @author RenLiMing(任黎明)
 *
 */
public class MySpider extends Spider {

	@Getter
	private final MySite mySite;

	@Getter
	private final MyDuplicateVerify myDuplicateVerify;

	/**
	 * 构造函数
	 * 
	 * @param pageProcessor
	 * @param mySite
	 * @param myDuplicateVerify
	 */
	public MySpider(PageProcessor pageProcessor, MySite mySite, MyDuplicateVerify myDuplicateVerify) {
		super(pageProcessor);
		this.mySite = mySite;
		this.myDuplicateVerify = myDuplicateVerify;
	}

	/**
	 * 处理缓存问题
	 */
	@Override
	public Spider addUrl(String... urls) {
		if (ObjectUtil.isNotEmpty(this.mySite) && ObjectUtil.isNotEmpty(this.myDuplicateVerify)
				&& BooleanUtil.isTrue(this.mySite.isForceUpdate())) {
			for (String url : urls) {
				this.myDuplicateVerify.cleanCache(new Request(url), this);
			}
		}
		super.addUrl(urls);
		return this;
	}

	/**
	 * 处理缓存问题
	 */
	@Override
	public MySpider addRequest(Request... requests) {
		if (ObjectUtil.isNotEmpty(this.mySite) && ObjectUtil.isNotEmpty(this.myDuplicateVerify)
				&& BooleanUtil.isTrue(this.mySite.isForceUpdate())) {
			for (Request request : requests) {
				this.myDuplicateVerify.cleanCache(request, this);
			}
		}
		super.addRequest(requests);
		return this;
	}

	/**
	 * 下载完成回调
	 * 
	 * @param site
	 * @param page
	 * @return
	 */
	public MySpider onDownloaded(MySite site, Consumer page) {
		if (ObjectUtil.isNotEmpty(site) && site.isEnableSelenuim()) {
			Setting chromeSetting = site.getChromeSetting();
			if (BooleanUtil.isTrue(site.getHeadless())) {
				chromeSetting.set("headless", "true");
			}
			if (StrUtil.isNotBlank(site.getUserAgent())) {
				chromeSetting.set("userAgent", site.getUserAgent());
			}
			if (BooleanUtil.isTrue(site.getScreenshot())) {
				chromeSetting.set("screenshot", "true");
			}
			if (site.getSleepTime() > 0) {
				chromeSetting.set("sleepTime", String.valueOf(site.getSleepTime()));
			}
			ChromeDownloader downloader = new ChromeDownloader(chromeSetting) {
				@Override
				public Page download(Request request, Task task) {
					Page pager = super.download(request, task);
					page.accept(pager);
					return pager;
				}

				@Override
				public void close() {
					try {
						super.close();
					} catch (IOException e) {
					}
				}
			};
			this.setDownloader(downloader);
		} else {
			HttpClientDownloader downloader = new HttpClientDownloader() {
				@Override
				public Page download(Request request, Task task) {
					Page pager = super.download(request, task);
					page.accept(pager);
					return pager;
				}
			};
			this.setDownloader(downloader);
		}
		return this;
	}

	/**
	 * 成功回调
	 * 
	 * @param request
	 * @return
	 */
	public MySpider onSuccess(Consumer request) {
		if (CollUtil.isEmpty(this.getSpiderListeners())) {
			this.setSpiderListeners(CollUtil.newArrayList());
		}
		this.getSpiderListeners().add(new SpiderListener() {
			@Override
			public void onSuccess(Request req) {
				request.accept(req);
			}

			@Override
			public void onError(Request req) {

			}
		});
		return this;
	}

	/**
	 * 失败回调
	 * 
	 * @param request
	 * @return
	 */
	public MySpider onError(Consumer request) {
		if (CollUtil.isEmpty(this.getSpiderListeners())) {
			this.setSpiderListeners(CollUtil.newArrayList());
		}
		this.getSpiderListeners().add(new SpiderListener() {
			@Override
			public void onSuccess(Request req) {

			}

			@Override
			public void onError(Request req) {
				request.accept(req);
			}
		});
		return this;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy