
/**
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.elasticsearch.persistence;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import com.digitalpebble.stormcrawler.persistence.EmptyQueueListener;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.time.Instant;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Uses the aggregation mechanism inherited from {@link AggregationSpout} to get an initial set of
* URLs and partition keys to query for, and gets emptyQueue notifications from the URLBuffer to
* query ES directly for more URLs for a specific key. The number of URLs requested per reload is
* controlled by the configuration key <i>es.status.max.urls.per.reload</i>.
*
* @since 1.15
*/
public class HybridSpout extends AggregationSpout implements EmptyQueueListener {
private static final Logger LOG = LoggerFactory.getLogger(HybridSpout.class);
protected static final String RELOADPARAMNAME = "es.status.max.urls.per.reload";
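// A minimal sketch of how the reload size could be tuned, assuming the topology
// configuration is built programmatically with org.apache.storm.Config (a Map);
// the key name comes from RELOADPARAMNAME above, the value 100 is purely illustrative:
//
// Config conf = new Config();
// conf.put("es.status.max.urls.per.reload", 100);
//
// If the key is not set, bufferReloadSize falls back to maxURLsPerBucket (see open()).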
private int bufferReloadSize = 10;
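// caches, per partition key, the sort values of the last hit returned for that
// key so that the next reload query can resume from there via search_after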
private Cache<String, Object[]> searchAfterCache;
private HostResultListener hrl;
@Override
public void open(
Map<String, Object> stormConf,
TopologyContext context,
SpoutOutputCollector collector) {
super.open(stormConf, context, collector);
bufferReloadSize = ConfUtils.getInt(stormConf, RELOADPARAMNAME, maxURLsPerBucket);
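// register this spout with the URLBuffer so that emptyQueue(queueName) is
// called back whenever the buffer has no more URLs for a given queue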
buffer.setEmptyQueueListener(this);
searchAfterCache = Caffeine.newBuilder().build();
hrl = new HostResultListener();
}
@Override
public void emptyQueue(String queueName) {
LOG.info("{} Emptied buffer queue for {}", logIdprefix, queueName);
if (!currentBuckets.contains(queueName)) {
// not interested in this one any more
return;
}
// the aggregations are currently being reloaded - searching now
// would just overload ES and yield mainly duplicates
if (isInQuery.get()) {
LOG.trace("{} isInQuery true - not querying for {}", logIdprefix, queueName);
return;
}
LOG.info("{} Querying for more docs for {}", logIdprefix, queueName);
if (queryDate == null) {
queryDate = new Date();
lastTimeResetToNOW = Instant.now();
}
String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime());
BoolQueryBuilder queryBuilder =
boolQuery()
.filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate));
queryBuilder.filter(QueryBuilders.termQuery(partitionField, queueName));
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(queryBuilder);
sourceBuilder.from(0);
sourceBuilder.size(bufferReloadSize);
sourceBuilder.explain(false);
sourceBuilder.trackTotalHits(false);
// sort within a bucket
for (String bsf : bucketSortField) {
FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC);
sourceBuilder.sort(sorter);
}
// do we have a search after for this one?
Object[] searchAfterValues = searchAfterCache.getIfPresent(queueName);
if (searchAfterValues != null) {
sourceBuilder.searchAfter(searchAfterValues);
}
SearchRequest request = new SearchRequest(indexName);
request.source(sourceBuilder);
// https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html
// _shards:2,3
// specific shard but ideally a local copy of it
if (shardID != -1) {
request.preference("_shards:" + shardID + "|_local");
}
// dump query to log
LOG.debug("{} ES query {} - {}", logIdprefix, queueName, request.toString());
client.searchAsync(request, RequestOptions.DEFAULT, hrl);
}
/** Overrides the handling of responses for aggregations. */
@Override
public void onResponse(SearchResponse response) {
// delete all entries from the searchAfterCache when
// we get the results from the aggregation spouts
searchAfterCache.invalidateAll();
super.onResponse(response);
}
/** The aggregation kindly told us where to start from. */
@Override
protected void sortValuesForKey(String key, Object[] sortValues) {
if (sortValues != null && sortValues.length > 0) this.searchAfterCache.put(key, sortValues);
}
/** Handling of results for a specific queue. */
class HostResultListener implements ActionListener<SearchResponse> {
@Override
public void onResponse(SearchResponse response) {
int alreadyprocessed = 0;
int numDocs = 0;
SearchHit[] hits = response.getHits().getHits();
Object[] sortValues = null;
// retrieve the key for these results
String key = null;
for (SearchHit hit : hits) {
numDocs++;
String pfield = partitionField;
Map<String, Object> sourceAsMap = hit.getSourceAsMap();
if (pfield.startsWith("metadata.")) {
sourceAsMap = (Map<String, Object>) sourceAsMap.get("metadata");
pfield = pfield.substring(9);
}
Object key_as_object = sourceAsMap.get(pfield);
if (key_as_object instanceof List) {
if (((List<?>) key_as_object).size() == 1)
key = (String) ((List<?>) key_as_object).get(0);
} else {
key = key_as_object.toString();
}
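// keep the sort values of the latest hit; they are cached further down as the
// search_after marker for the next reload of this queue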
sortValues = hit.getSortValues();
if (!addHitToBuffer(hit)) {
alreadyprocessed++;
}
}
// no key if no results have been found
if (key != null) {
searchAfterCache.put(key, sortValues);
}
eventCounter.scope("ES_queries_host").incrBy(1);
eventCounter.scope("ES_docs_host").incrBy(numDocs);
eventCounter.scope("already_being_processed_host").incrBy(alreadyprocessed);
LOG.info(
"{} ES term query returned {} hits in {} msec with {} already being processed for {}",
logIdprefix,
numDocs,
response.getTook().getMillis(),
alreadyprocessed,
key);
}
@Override
public void onFailure(Exception e) {
LOG.error("Exception with ES query", e);
}
}
}