All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.scheduler.RedisScheduler Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.scheduler;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;

import com.alibaba.fastjson.JSON;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;

/**
 * Use Redis as url scheduler for distributed crawlers.
* * @author [email protected]
* @since 0.2.0 */ public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover { protected JedisPool pool; private static final String QUEUE_PREFIX = "queue_"; private static final String SET_PREFIX = "set_"; private static final String ITEM_PREFIX = "item_"; public RedisScheduler(String host) { this(new JedisPool(new JedisPoolConfig(), host)); } public RedisScheduler(JedisPool pool) { this.pool = pool; setDuplicateRemover(this); } @Override public void resetDuplicateCheck(Task task) { try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } } @Override public boolean isDuplicate(Request request, Task task) { try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; } } @Override protected void pushWhenNoDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { jedis.close(); } } private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { return true; } if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { return true; } if (request.isBinaryContent() || request.getRequestBody() != null) { return true; } if (!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { return true; } return false; } @Override public synchronized Request poll(Task task) { try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), 
field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); return o; } Request request = new Request(url); return request; } } protected String getSetKey(Task task) { return SET_PREFIX + task.getUUID(); } protected String getQueueKey(Task task) { return QUEUE_PREFIX + task.getUUID(); } protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } @Override public int getLeftRequestsCount(Task task) { try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); } } @Override public int getTotalRequestsCount(Task task) { try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy