com.digitalpebble.stormcrawler.elasticsearch.persistence.AbstractSpout
Elasticsearch resources for StormCrawler
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.elasticsearch.persistence;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.concurrent.TimeUnit;
import org.apache.storm.metric.api.IMetric;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsGroup;
import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsRequest;
import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse;
import org.elasticsearch.client.Client;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.base.Optional;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
public abstract class AbstractSpout extends BaseRichSpout {
private static final Logger LOG = LoggerFactory
.getLogger(AbstractSpout.class);
protected static final String ESBoltType = "status";
protected static final String ESStatusIndexNameParamName = "es.status.index.name";
protected static final String ESStatusDocTypeParamName = "es.status.doc.type";
/**
* Time in seconds during which acked or failed URLs will not be considered
* for fetching again, default 30 secs.
**/
protected static final String ESStatusTTLPurgatory = "es.status.ttl.purgatory";
/**
* Min time to allow between 2 successive queries to ES. Value in msecs,
* default 2000.
**/
private static final String ESStatusMinDelayParamName = "es.status.min.delay.queries";
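// Illustrative only (not part of the original source): both settings can be
// overridden in the topology configuration before submission, e.g.
//   Config conf = new Config();
//   conf.put("es.status.ttl.purgatory", 60);       // seconds
//   conf.put("es.status.min.delay.queries", 5000); // milliseconds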
protected String indexName;
protected String docType;
protected boolean active = true;
protected SpoutOutputCollector _collector;
protected MultiCountMetric eventCounter;
protected static Client client;
/**
* When using multiple instances, each one is in charge of a specific shard;
* useful when sharding based on host or domain to guarantee a good mix of
* URLs.
*/
protected int shardID = -1;
/** Used to distinguish between instances in the logs **/
protected String logIdprefix = "";
protected Queue<Values> buffer = new LinkedList<>();
/**
* Map to keep in-process URLs, possibly with additional information per URL
* such as its politeness bucket (hostname / domain etc.). The entries are
* kept in a cache for a configurable amount of time to avoid fetching items
* a second time when new items are queried shortly after they have been
* acked.
*/
protected InProcessMap<String, String> beingProcessed;
protected long timeStartESQuery = 0;
private long minDelayBetweenQueries = 2000;
/** Map which holds elements for some additional time after their removal. */
public class InProcessMap<K, V> extends HashMap<K, V> {
private Cache<K, Optional<V>> deletionCache;
public InProcessMap(long maxDuration, TimeUnit timeUnit) {
deletionCache = CacheBuilder.newBuilder()
.expireAfterWrite(maxDuration, timeUnit).build();
}
@Override
public boolean containsKey(Object key) {
boolean incache = super.containsKey(key);
if (!incache) {
incache = (deletionCache.getIfPresent(key) != null);
}
return incache;
}
@Override
public V remove(Object key) {
deletionCache.put((K) key, Optional.absent());
return super.remove(key);
}
public long inCache() {
return deletionCache.size();
}
}
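// Illustrative behaviour of the purgatory (not part of the original source),
// assuming a TTL of 30 seconds:
//   beingProcessed.put(url, bucket);   // URL handed to the topology
//   beingProcessed.remove(url);        // on ack/fail: moved to the deletion cache
//   beingProcessed.containsKey(url);   // still true for up to 30 more seconds
//   beingProcessed.inCache();          // counts entries waiting to expire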
@Override
public void open(Map stormConf, TopologyContext context,
SpoutOutputCollector collector) {
indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName,
"status");
docType = ConfUtils.getString(stormConf, ESStatusDocTypeParamName,
"status");
// one ES client per JVM
synchronized (AbstractSpout.class) {
try {
if (client == null) {
client = ElasticSearchConnection.getClient(stormConf,
ESBoltType);
}
} catch (Exception e1) {
LOG.error("Can't connect to ElasticSearch", e1);
throw new RuntimeException(e1);
}
}
// if more than one instance is used we expect their number to be the
// same as the number of shards
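// (e.g. an index created with 10 shards requires a parallelism hint of 10
// for this spout; any mismatch triggers the RuntimeException below)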
int totalTasks = context
.getComponentTasks(context.getThisComponentId()).size();
if (totalTasks > 1) {
logIdprefix = "[" + context.getThisComponentId() + " #"
+ context.getThisTaskIndex() + "] ";
// determine the number of shards so that we can restrict the
// search
ClusterSearchShardsRequest request = new ClusterSearchShardsRequest(
indexName);
ClusterSearchShardsResponse shardresponse = client.admin()
.cluster().searchShards(request).actionGet();
ClusterSearchShardsGroup[] shardgroups = shardresponse.getGroups();
if (totalTasks != shardgroups.length) {
throw new RuntimeException(
"Number of ES spout instances should be the same as number of shards ("
+ shardgroups.length + ") but is " + totalTasks);
}
shardID = shardgroups[context.getThisTaskIndex()].getShardId();
LOG.info("{} assigned shard ID {}", logIdprefix, shardID);
}
_collector = collector;
int ttlPurgatory = ConfUtils
.getInt(stormConf, ESStatusTTLPurgatory, 30);
minDelayBetweenQueries = ConfUtils.getLong(stormConf,
ESStatusMinDelayParamName, 2000);
beingProcessed = new InProcessMap<>(ttlPurgatory, TimeUnit.SECONDS);
eventCounter = context.registerMetric("counters",
new MultiCountMetric(), 10);
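// The three anonymous IMetrics below act as gauges: every 10 seconds
// getValueAndReset() simply reports the current size of the corresponding
// structure, nothing is reset.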
context.registerMetric("buffer_size", new IMetric() {
@Override
public Object getValueAndReset() {
return buffer.size();
}
}, 10);
context.registerMetric("beingProcessed", new IMetric() {
@Override
public Object getValueAndReset() {
return beingProcessed.size();
}
}, 10);
context.registerMetric("inPurgatory", new IMetric() {
@Override
public Object getValueAndReset() {
return beingProcessed.inCache();
}
}, 10);
}
/** Returns true if ES was queried too recently and needs throttling **/
protected boolean throttleESQueries() {
Date now = new Date();
if (timeStartESQuery != 0) {
// check that we allowed some time between queries
long difference = now.getTime() - timeStartESQuery;
if (difference < minDelayBetweenQueries) {
long sleepTime = minDelayBetweenQueries - difference;
LOG.debug(
"{} Not enough time elapsed since {} - should try again in {}",
logIdprefix, timeStartESQuery, sleepTime);
return true;
}
}
return false;
}
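// Sketch of how a concrete subclass is expected to use the throttle in
// nextTuple() (illustrative, not part of this class):
//   public void nextTuple() {
//       if (!active || throttleESQueries()) return;
//       timeStartESQuery = System.currentTimeMillis();
//       // ... query ES for the assigned shard, fill 'buffer', then emit
//       // from 'buffer' and record the URLs in 'beingProcessed'
//   }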
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("url", "metadata"));
}
protected final Metadata fromKeyValues(Map<String, Object> keyValues) {
Map<String, List<String>> mdAsMap = (Map<String, List<String>>) keyValues
.get("metadata");
Metadata metadata = new Metadata();
if (mdAsMap != null) {
Iterator<Entry<String, List<String>>> mdIter = mdAsMap.entrySet()
.iterator();
while (mdIter.hasNext()) {
Entry<String, List<String>> mdEntry = mdIter.next();
String key = mdEntry.getKey();
// periods are not allowed in ES2 field names, so they were stored as
// %2E - convert them back
key = key.replaceAll("%2E", "\\.");
Object mdValObj = mdEntry.getValue();
// single value
if (mdValObj instanceof String) {
metadata.addValue(key, (String) mdValObj);
}
// multi valued
else {
metadata.addValues(key, (List<String>) mdValObj);
}
}
}
return metadata;
}
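// Example (illustrative): a hit whose source contains
//   "metadata": { "fetch%2EstatusCode": "200", "hostname": "example.com" }
// yields a Metadata object with the keys "fetch.statusCode" and "hostname".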
@Override
public void ack(Object msgId) {
LOG.debug("{} Ack for {}", logIdprefix, msgId);
beingProcessed.remove(msgId);
eventCounter.scope("acked").incrBy(1);
}
@Override
public void fail(Object msgId) {
LOG.info("{} Fail for {}", logIdprefix, msgId);
beingProcessed.remove(msgId);
eventCounter.scope("failed").incrBy(1);
}
@Override
public void activate() {
active = true;
}
@Override
public void deactivate() {
active = false;
}
@Override
public void close() {
if (client != null)
client.close();
}
}
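Usage note: the sketch below shows, under stated assumptions, how a concrete implementation of this spout could be wired into a topology. AggregationSpout is assumed to be one of the subclasses shipped in this package, the bolt layout is left out, and the shard count of 10 is only an example; the one hard requirement enforced in open() is that the spout parallelism matches the number of shards of the status index.

// Hypothetical wiring sketch - class and component names are assumptions,
// not taken from this file.
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;

import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout;

public class StatusTopologySketch {
    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // parallelism hint must equal the number of shards of the status index
        builder.setSpout("status_spout", new AggregationSpout(), 10);
        // ... bolts consuming the ("url", "metadata") stream declared above go here

        Config conf = new Config();
        conf.put("es.status.index.name", "status");
        conf.put("es.status.doc.type", "status");
        conf.put("es.status.ttl.purgatory", 30);       // seconds
        conf.put("es.status.min.delay.queries", 2000); // milliseconds

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("statusSketch", conf, builder.createTopology());
    }
}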