// cn.renlm.plugins.MyCrawler.scheduler.MyQueueScheduler Maven / Gradle / Ivy
/*
* Copyright (c) 2020 Renlm
* MyCrawler is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
package cn.renlm.plugins.MyCrawler.scheduler;
import java.util.Set;
import cn.hutool.core.util.BooleanUtil;
import cn.hutool.core.util.ReflectUtil;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
/**
 * Default URL scheduler.
 * <p>
 * Extends {@link QueueScheduler} and additionally keeps its own
 * {@link DuplicateRemover} instance, which backs the
 * {@link #verifyDuplicate(Boolean, Request, Task)} and
 * {@link #cleanCache(Request, Task)} operations of this class.
 *
 * @author RenLiMing(任黎明)
 *
 */
public class MyQueueScheduler extends QueueScheduler implements MyDuplicateVerify {

    /** Duplicate tracker used only by the verification methods of this class. */
    private final DuplicateRemover verifyDuplicate = new HashSetDuplicateRemover();

    /**
     * Evicts the request's URL from the verification cache.
     * <p>
     * Equivalent to calling {@link #verifyDuplicate(Boolean, Request, Task)}
     * with {@code forceUpdate = true} and ignoring the result.
     *
     * @param request the request whose URL should be forgotten
     * @param task    the task the request belongs to
     */
    @Override
    public void cleanCache(Request request, Task task) {
        this.verifyDuplicate(true, request, task);
    }

    /**
     * Checks whether the request is a duplicate according to the verification
     * cache, or, when forced, evicts its URL from that cache.
     *
     * @param forceUpdate when {@code true}, the request's URL is removed from
     *                    the cache and {@code false} is returned; {@code null}
     *                    and {@code false} both select the normal check
     * @param request     the request to check
     * @param task        the task the request belongs to
     * @return {@code false} when {@code forceUpdate} is {@code true}; otherwise
     *         the result of {@link DuplicateRemover#isDuplicate(Request, Task)}
     */
    @Override
    public boolean verifyDuplicate(Boolean forceUpdate, Request request, Task task) {
        if (BooleanUtil.isTrue(forceUpdate)) {
            // Evict directly without consulting isDuplicate first: that call
            // would presumably register the URL only for it to be removed on
            // the next line, leaving a window in which a concurrent caller
            // could see a spurious duplicate.
            // NOTE(review): assumes HashSetDuplicateRemover keeps its seen URLs
            // in a Set-typed field named "urls" keyed by Request#getUrl() —
            // confirm against the WebMagic version in use.
            Set<?> urls = (Set<?>) ReflectUtil.getFieldValue(verifyDuplicate, "urls");
            urls.remove(request.getUrl());
            return false;
        }
        return verifyDuplicate.isDuplicate(request, task);
    }
}