All downloads are free. Search and download functionality uses the official Maven repository.

cn.renlm.plugins.MyCrawler.scheduler.MyRedisScheduler Maven / Gradle / Ivy

There is a newer version: 2.8.8
Show newest version
/*
 * Copyright (c) 2020 Renlm
 * MyCrawler is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 * 	http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
package cn.renlm.plugins.MyCrawler.scheduler;

import cn.hutool.core.codec.Base64;
import cn.hutool.core.util.BooleanUtil;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.renlm.plugins.MyCrawler.PageUrlType;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.RedisPriorityScheduler;

/**
 * Distributed URL scheduling backed by Redis.
 * <p>
 * Extends {@link RedisPriorityScheduler} with the duplicate-verification
 * contract of {@link MyDuplicateVerify}. Besides the keys managed by the
 * parent class, this class uses two kinds of Redis keys:
 * <ul>
 * <li>a per-task set named {@code "verify_" + task.getUUID()} holding URLs
 * that have already been verified;</li>
 * <li>a per-URL string key whose name is the Base64 encoding of the URL,
 * written with an expiry for seed pages.</li>
 * </ul>
 * NOTE(review): the per-URL key is not namespaced by task, so tasks sharing
 * one Redis database share these markers - confirm this is intended.
 *
 * @author RenLiMing(任黎明)
 *
 */
public class MyRedisScheduler extends RedisPriorityScheduler implements MyDuplicateVerify {

	private static final String VERIFY_PREFIX = "verify_";

	/**
	 * Expiry, in seconds, of the per-URL marker written for seed pages
	 * (60 * 60 * 21 seconds, i.e. 21 hours).
	 */
	private static final long SEED_CACHE_TTL_SECONDS = 60 * 60 * 21L;

	public MyRedisScheduler(JedisPool jedisPool) {
		super(jedisPool);
	}

	/**
	 * Builds the name of the per-task verification set.
	 *
	 * @param task the crawl task
	 * @return {@code "verify_"} followed by the task UUID
	 */
	private String getVerifyKey(Task task) {
		return VERIFY_PREFIX + task.getUUID();
	}

	/**
	 * Removes every trace of {@code url} using an already-borrowed connection:
	 * the entry in the set under {@code getSetKey(task)} (inherited; presumably
	 * the parent scheduler's de-duplication set - confirm), the entry in the
	 * verification set, and the per-URL marker key.
	 *
	 * @param jedis connection to run the commands on; not closed here
	 * @param url   the URL to forget
	 * @param task  the crawl task owning the sets
	 */
	private void removeUrlTraces(Jedis jedis, String url, Task task) {
		jedis.srem(getSetKey(task), url);
		jedis.srem(getVerifyKey(task), url);
		jedis.del(Base64.encode(url));
	}

	/**
	 * Forgets the request's URL: removes it from both per-task sets and deletes
	 * its per-URL marker key.
	 *
	 * @param request the request whose URL is cleaned
	 * @param task    the crawl task
	 */
	@Override
	public void cleanCache(Request request, Task task) {
		String url = request.getUrl();
		try (Jedis jedis = pool.getResource()) {
			removeUrlTraces(jedis, url, task);
		}
	}

	/**
	 * Decides whether the request's URL counts as a duplicate, updating the
	 * Redis bookkeeping as a side effect. The page type is read from the
	 * request extra {@code PageUrlType.extraKey}; a missing value is treated as
	 * {@code PageUrlType.unknown}.
	 * <ul>
	 * <li><b>enterurl</b>: all traces of the URL are removed and the result is
	 * always {@code false}.</li>
	 * <li><b>seed</b>: the URL is a duplicate when its per-URL marker exists.
	 * When forced, or when the marker is absent, the URL is removed from the
	 * {@code getSetKey(task)} set and the marker is (re)written with the seed
	 * expiry.</li>
	 * <li><b>any other type</b>: the URL is added to the verification set and
	 * is a duplicate when it was already a member. When forced, it is also
	 * removed from the {@code getSetKey(task)} set.</li>
	 * </ul>
	 *
	 * @param forceUpdate when {@code true}, the result is always {@code false};
	 *                    {@code null} is treated as {@code false}
	 * @param request     the request to check
	 * @param task        the crawl task
	 * @return {@code true} if the URL is considered a duplicate
	 */
	@Override
	public boolean verifyDuplicate(Boolean forceUpdate, Request request, Task task) {
		String url = request.getUrl();
		int pageUrlType = ObjectUtil.defaultIfNull(request.getExtra(PageUrlType.extraKey),
				PageUrlType.unknown.value());
		boolean force = BooleanUtil.isTrue(forceUpdate);
		try (Jedis jedis = pool.getResource()) {
			if (NumberUtil.equals(pageUrlType, PageUrlType.enterurl.value())) {
				// Reuse the connection already held: borrowing a second one from the
				// pool while holding this one can block or deadlock when the pool is
				// exhausted.
				removeUrlTraces(jedis, url, task);
				return false;
			}
			if (NumberUtil.equals(pageUrlType, PageUrlType.seed.value())) {
				String cacheKey = Base64.encode(url);
				boolean duplicate = jedis.exists(cacheKey);
				// NOTE(review): exists + setex is not atomic; two crawlers may both
				// see "not duplicate" for the same seed - confirm this is acceptable.
				if (force || !duplicate) {
					jedis.srem(getSetKey(task), url);
					jedis.setex(cacheKey, SEED_CACHE_TTL_SECONDS, url);
				}
				return !force && duplicate;
			}
			// SADD returns 0 when the member was already present.
			boolean duplicate = jedis.sadd(getVerifyKey(task), url) == 0;
			if (force) {
				jedis.srem(getSetKey(task), url);
				return false;
			}
			return duplicate;
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy