All downloads are free. Search and download functionality uses the official Maven repository.

cn.renlm.plugins.MyCrawler.scheduler.MyQueueScheduler Maven / Gradle / Ivy

There is a newer version: 2.8.8
Show newest version
/*
 * Copyright (c) 2020 Renlm
 * MyCrawler is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 * 	http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
package cn.renlm.plugins.MyCrawler.scheduler;

import java.util.Set;

import cn.hutool.core.util.BooleanUtil;
import cn.hutool.core.util.ReflectUtil;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;

/**
 * Default URL scheduler.
 * <p>
 * Extends {@link QueueScheduler} and implements {@link MyDuplicateVerify} on top
 * of a privately held {@link HashSetDuplicateRemover}, so callers can both ask
 * whether a URL was already seen and evict a single URL from that cache.
 * 
 * @author RenLiMing(任黎明)
 *
 */
public class MyQueueScheduler extends QueueScheduler implements MyDuplicateVerify {

	/**
	 * Name of the field read reflectively from the duplicate remover in order to
	 * evict one URL. NOTE(review): this is an implementation detail of
	 * HashSetDuplicateRemover - confirm it still exists when upgrading WebMagic.
	 */
	private static final String URL_CACHE_FIELD = "urls";

	/** Duplicate remover backing {@link #verifyDuplicate(Boolean, Request, Task)}. */
	private final DuplicateRemover verifyDuplicate = new HashSetDuplicateRemover();

	/**
	 * Evicts the URL of the given request from the duplicate cache by delegating
	 * to {@link #verifyDuplicate(Boolean, Request, Task)} with a forced update.
	 *
	 * @param request request whose URL should be forgotten
	 * @param task    task the request belongs to
	 */
	@Override
	public void cleanCache(Request request, Task task) {
		this.verifyDuplicate(true, request, task);
	}

	/**
	 * Checks whether the request is a duplicate, or evicts it when an update is
	 * forced.
	 *
	 * @param forceUpdate when {@code true}, the request's URL is removed from the
	 *                    cache and {@code false} is returned; {@code null} and
	 *                    {@code false} perform a normal duplicate check
	 * @param request     request to check or evict
	 * @param task        task the request belongs to
	 * @return {@code false} when the update is forced, otherwise the result of the
	 *         underlying {@link DuplicateRemover#isDuplicate(Request, Task)} call
	 * @throws IllegalStateException if an update is forced and the URL cache
	 *                               cannot be obtained from the duplicate remover
	 */
	@Override
	public boolean verifyDuplicate(Boolean forceUpdate, Request request, Task task) {
		if (BooleanUtil.isTrue(forceUpdate)) {
			// Evict directly without consulting isDuplicate first: the lookup result
			// is discarded on this path anyway. NOTE(review): isDuplicate is expected
			// to record the URL as a side effect, so calling it here would briefly
			// make the URL look "seen" to concurrent checks - confirm against the
			// WebMagic version in use.
			evict(request);
			return false;
		}
		return verifyDuplicate.isDuplicate(request, task);
	}

	/**
	 * Removes the request's URL from the duplicate remover's URL set, which is
	 * reached through reflection.
	 *
	 * @param request request whose URL is removed
	 * @throws IllegalStateException if the reflective lookup does not yield a
	 *                               {@link Set}
	 */
	private void evict(Request request) {
		Object cache = ReflectUtil.getFieldValue(verifyDuplicate, URL_CACHE_FIELD);
		if (!(cache instanceof Set)) {
			// Fail with an explanatory message instead of a bare NullPointerException
			// or ClassCastException when the library's internals have changed.
			throw new IllegalStateException("Field '" + URL_CACHE_FIELD + "' of "
					+ verifyDuplicate.getClass().getName() + " is not an accessible java.util.Set");
		}
		// Set<?> is sufficient because remove(Object) does not depend on the element type.
		((Set<?>) cache).remove(request.getUrl());
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy