package com.github.xbynet.crawler.scheduler;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

import com.github.xbynet.crawler.Const;
import com.github.xbynet.crawler.ISpider;
import com.github.xbynet.crawler.Request;
/**
 * Uses Redis as the url scheduler and duplicate remover for distributed
 * crawlers, so that multiple crawler instances can share one queue of
 * pending requests.
*
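 * <p>A minimal usage sketch (assumes a Redis server on localhost, an
 * {@code ISpider} implementation named {@code MySpider}, and a
 * {@code Request(String url)} constructor; both names are hypothetical
 * here, not confirmed by this class):
 * <pre>{@code
 * ISpider spider = new MySpider();
 * RedisScheduler scheduler = new RedisScheduler("localhost");
 * scheduler.push(new Request("http://example.com"), spider);
 * Request next = scheduler.poll(spider); // null when the queue is empty
 * }</pre>
 *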
* @author [email protected]
* @since 0.2.0
*/
public class RedisScheduler implements Scheduler, DuplicateRemover {
    private final Logger log = LoggerFactory.getLogger(RedisScheduler.class);

    protected JedisPool pool;
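
    // Per-spider key layout in Redis:
    //   queue_<name> : list used as the FIFO queue of pending urls
    //   set_<name>   : set of seen urls, used for duplicate checking
    //   item_<name>  : hash of md5(url) -> serialized Request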
    private static final String QUEUE_PREFIX = "queue_";
    private static final String SET_PREFIX = "set_";
    private static final String ITEM_PREFIX = "item_";

    public RedisScheduler(String host) {
        this(new JedisPool(new JedisPoolConfig(), host));
    }

    public RedisScheduler(JedisPool pool) {
        this.pool = pool;
    }
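
    /** Clears the duplicate-check set so that every url can be scheduled again. */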
    @Override
    public void resetDuplicateCheck(ISpider spider) {
        Jedis jedis = pool.getResource();
        try {
            jedis.del(getSetKey(spider));
        } finally {
            jedis.close();
        }
    }
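
    /**
     * Atomically marks the url as seen and reports whether it had been seen
     * before: SADD returns 1 when the url is newly added to the set and 0
     * when it was already present, i.e. the request is a duplicate.
     */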
    @Override
    public boolean isDuplicate(Request request, ISpider spider) {
        Jedis jedis = pool.getResource();
        try {
            return jedis.sadd(getSetKey(spider), request.getUrl()) == 0;
        } finally {
            jedis.close();
        }
    }
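
    /**
     * Queues a request. GET requests whose url has already been seen are
     * dropped; POST requests bypass the duplicate check, since the same url
     * may legitimately be posted to more than once.
     */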
    @Override
    public void push(Request request, ISpider spider) {
        if (Const.HttpMethod.POST != request.getMethod()
                && isDuplicate(request, spider)) {
            return;
        }
        log.debug("push to queue {}", request.getUrl());
        Jedis jedis = pool.getResource();
        try {
            jedis.rpush(getQueueKey(spider), request.getUrl());
            // Store the full request under md5(url) so poll() can restore it.
            String field = DigestUtils.md5Hex(request.getUrl());
            byte[] data = SerializationUtils.serialize(request);
            jedis.hset(getItemKey(spider).getBytes(), field.getBytes(), data);
        } finally {
            jedis.close();
        }
    }
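
    /**
     * Pops the next url from the queue and restores the serialized request
     * stored by push(). Returns null when the queue is empty.
     */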
    @Override
    public synchronized Request poll(ISpider spider) {
        Jedis jedis = pool.getResource();
        try {
            String url = jedis.lpop(getQueueKey(spider));
            if (url == null) {
                return null;
            }
            String field = DigestUtils.md5Hex(url);
            byte[] bytes = jedis.hget(getItemKey(spider).getBytes(), field.getBytes());
            if (bytes == null) {
                // Queue entry without a stored item; nothing to restore.
                return null;
            }
            return SerializationUtils.deserialize(bytes);
        } finally {
            jedis.close();
        }
    }

    protected String getSetKey(ISpider spider) {
        return SET_PREFIX + spider.getName();
    }

    protected String getQueueKey(ISpider spider) {
        return QUEUE_PREFIX + spider.getName();
    }

    protected String getItemKey(ISpider spider) {
        return ITEM_PREFIX + spider.getName();
    }
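
    /** Number of requests still waiting in the queue (LLEN of the queue list). */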
    @Override
    public int getLeftRequestsCount(ISpider spider) {
        Jedis jedis = pool.getResource();
        try {
            return jedis.llen(getQueueKey(spider)).intValue();
        } finally {
            jedis.close();
        }
    }
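
    /** Total number of distinct urls seen so far (SCARD of the duplicate-check set). */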
    @Override
    public int getTotalRequestsCount(ISpider spider) {
        Jedis jedis = pool.getResource();
        try {
            return jedis.scard(getSetKey(spider)).intValue();
        } finally {
            jedis.close();
        }
    }

    @Override
    public DuplicateRemover getDuplicateRemover() {
        return this;
    }
}