/**
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
 * Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.elasticsearch.persistence;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.time.Instant;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation;
import org.elasticsearch.search.aggregations.bucket.sampler.DiversifiedAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.TopHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Spout which pulls URLs from an ES index. Use a single instance unless you use 'es.status.routing'
 * with the StatusUpdaterBolt, in which case you need exactly as many spout instances as there are
 * ES shards. Guarantees a good mix of URLs by aggregating them by an arbitrary field, e.g. key.
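 *
 * <p>An illustrative configuration for this spout (key names as read in {@code open()}; the
 * values below are examples, not the defaults):
 *
 * <pre>
 * es.status.sample: false
 * es.status.recentDate.increase: 5
 * es.status.recentDate.min.gap: 20
 * </pre>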
*/
public class AggregationSpout extends AbstractSpout implements ActionListener<SearchResponse> {
private static final Logger LOG = LoggerFactory.getLogger(AggregationSpout.class);
private static final String ESStatusSampleParamName = "es.status.sample";
private static final String ESMostRecentDateIncreaseParamName = "es.status.recentDate.increase";
private static final String ESMostRecentDateMinGapParamName = "es.status.recentDate.min.gap";
private boolean sample = false;
private int recentDateIncrease = -1;
private int recentDateMinGap = -1;
    protected Set<String> currentBuckets;
@Override
    public void open(
            Map<String, Object> stormConf,
            TopologyContext context,
            SpoutOutputCollector collector) {
sample = ConfUtils.getBoolean(stormConf, ESStatusSampleParamName, sample);
recentDateIncrease =
ConfUtils.getInt(stormConf, ESMostRecentDateIncreaseParamName, recentDateIncrease);
recentDateMinGap =
ConfUtils.getInt(stormConf, ESMostRecentDateMinGapParamName, recentDateMinGap);
super.open(stormConf, context, collector);
currentBuckets = new HashSet<>();
}
@Override
protected void populateBuffer() {
if (queryDate == null) {
queryDate = new Date();
lastTimeResetToNOW = Instant.now();
}
String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime());
LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedQueryDate);
BoolQueryBuilder queryBuilder =
boolQuery()
.filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate));
if (filterQueries != null) {
for (String filterQuery : filterQueries) {
queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery));
}
}
SearchRequest request = new SearchRequest(indexName);
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(queryBuilder);
sourceBuilder.from(0);
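        // no top-level hits are needed: the URLs are collected from the aggregation buckets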
sourceBuilder.size(0);
sourceBuilder.explain(false);
sourceBuilder.trackTotalHits(false);
if (queryTimeout != -1) {
sourceBuilder.timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS));
}
TermsAggregationBuilder aggregations =
AggregationBuilders.terms("partition").field(partitionField).size(maxBucketNum);
org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder tophits =
AggregationBuilders.topHits("docs").size(maxURLsPerBucket).explain(false);
// sort within a bucket
for (String bsf : bucketSortField) {
FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC);
tophits.sort(sorter);
}
aggregations.subAggregation(tophits);
// sort between buckets
if (StringUtils.isNotBlank(totalSortField)) {
org.elasticsearch.search.aggregations.metrics.MinAggregationBuilder minBuilder =
AggregationBuilders.min("top_hit").field(totalSortField);
aggregations.subAggregation(minBuilder);
aggregations.order(BucketOrder.aggregation("top_hit", true));
}
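        // with sampling enabled, wrap the terms aggregation in a diversified sampler so that
        // at most maxURLsPerBucket documents per partition value are considered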
if (sample) {
DiversifiedAggregationBuilder sab = new DiversifiedAggregationBuilder("sample");
sab.field(partitionField).maxDocsPerValue(maxURLsPerBucket);
sab.shardSize(maxURLsPerBucket * maxBucketNum);
sab.subAggregation(aggregations);
sourceBuilder.aggregation(sab);
} else {
sourceBuilder.aggregation(aggregations);
}
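        // For illustration only: without sampling, the request body built above resembles
        // the following JSON (field names and sizes depend on the configuration):
        //
        // {
        //   "size": 0,
        //   "query": {"bool": {"filter": [{"range": {"nextFetchDate": {"lte": "..."}}}]}},
        //   "aggs": {
        //     "partition": {
        //       "terms": {"field": "<partitionField>", "size": <maxBucketNum>},
        //       "aggs": {"docs": {"top_hits": {"size": <maxURLsPerBucket>}}}
        //     }
        //   }
        // }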
request.source(sourceBuilder);
        // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html
        // e.g. "_shards:2|_local" restricts the query to a specific shard,
        // ideally a local copy of it
if (shardID != -1) {
request.preference("_shards:" + shardID + "|_local");
}
// dump query to log
LOG.debug("{} ES query {}", logIdprefix, request);
        LOG.trace("{} isInQuery set to true", logIdprefix);
isInQuery.set(true);
client.searchAsync(request, RequestOptions.DEFAULT, this);
}
@Override
public void onFailure(Exception arg0) {
LOG.error("{} Exception with ES query", logIdprefix, arg0);
markQueryReceivedNow();
}
@Override
public void onResponse(SearchResponse response) {
long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent();
Aggregations aggregs = response.getAggregations();
if (aggregs == null) {
markQueryReceivedNow();
return;
}
        // if sampling was enabled, unwrap the diversified sampler aggregation first
        SingleBucketAggregation sampleAgg = aggregs.get("sample");
        if (sampleAgg != null) {
            aggregs = sampleAgg.getAggregations();
        }
Terms agg = aggregs.get("partition");
int numhits = 0;
int numBuckets = 0;
int alreadyprocessed = 0;
Instant mostRecentDateFound = null;
currentBuckets.clear();
// For each entry
        Iterator<Terms.Bucket> iterator = (Iterator<Terms.Bucket>) agg.getBuckets().iterator();
while (iterator.hasNext()) {
Terms.Bucket entry = iterator.next();
String key = (String) entry.getKey(); // bucket key
currentBuckets.add(key);
long docCount = entry.getDocCount(); // Doc count
int hitsForThisBucket = 0;
SearchHit lastHit = null;
            // filter results so that we don't include URLs which are already
            // being processed
TopHits topHits = entry.getAggregations().get("docs");
for (SearchHit hit : topHits.getHits().getHits()) {
LOG.debug(
"{} -> id [{}], _source [{}]",
logIdprefix,
hit.getId(),
hit.getSourceAsString());
hitsForThisBucket++;
lastHit = hit;
                Map<String, Object> keyValues = hit.getSourceAsMap();
String url = (String) keyValues.get("url");
// consider only the first document of the last bucket
// for optimising the nextFetchDate
if (hitsForThisBucket == 1 && !iterator.hasNext()) {
String strDate = (String) keyValues.get("nextFetchDate");
try {
mostRecentDateFound = Instant.parse(strDate);
} catch (Exception e) {
                        throw new RuntimeException("Cannot parse date: " + strDate, e);
}
}
// is already being processed or in buffer - skip it!
if (beingProcessed.containsKey(url)) {
LOG.debug("{} -> already processed: {}", logIdprefix, url);
alreadyprocessed++;
continue;
}
Metadata metadata = fromKeyValues(keyValues);
boolean added = buffer.add(url, metadata);
if (!added) {
LOG.debug("{} -> already in buffer: {}", logIdprefix, url);
alreadyprocessed++;
continue;
}
LOG.debug("{} -> added to buffer : {}", logIdprefix, url);
}
if (lastHit != null) {
sortValuesForKey(key, lastHit.getSortValues());
}
if (hitsForThisBucket > 0) numBuckets++;
numhits += hitsForThisBucket;
            LOG.debug(
                    "{} key [{}], hits [{}], doc_count [{}], already processed [{}]",
                    logIdprefix,
                    key,
                    hitsForThisBucket,
                    docCount,
                    alreadyprocessed);
}
        LOG.info(
                "{} ES query returned {} hits from {} buckets in {} msec with {} already being processed. Took {} msec per doc on average.",
                logIdprefix,
                numhits,
                numBuckets,
                timeTaken,
                alreadyprocessed,
                numhits == 0 ? 0 : ((float) timeTaken / numhits));
queryTimes.addMeasurement(timeTaken);
eventCounter.scope("already_being_processed").incrBy(alreadyprocessed);
eventCounter.scope("ES_queries").incrBy(1);
eventCounter.scope("ES_docs").incrBy(numhits);
        // optimise the nextFetchDate by taking the most recent value returned by the
        // query and adding 'recentDateIncrease' minutes to it, unless the resulting
        // date is within 'recentDateMinGap' minutes of the current queryDate, in
        // which case the current value is kept
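        // Worked example (illustrative values): with recentDateIncrease=5, a
        // mostRecentDateFound of 10:00 yields a candidate queryDate of 10:05; with
        // recentDateMinGap=20 the candidate is adopted only if it differs from the
        // current queryDate by more than 20 minutes, so a queryDate of 10:10 is
        // kept while one of 09:30 is moved to 10:05.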
if (mostRecentDateFound != null && recentDateIncrease >= 0) {
Calendar potentialNewDate = Calendar.getInstance();
            potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli());
potentialNewDate.add(Calendar.MINUTE, recentDateIncrease);
Date oldDate = null;
// check boundaries
if (this.recentDateMinGap > 0) {
Calendar low = Calendar.getInstance();
low.setTime(queryDate);
low.add(Calendar.MINUTE, -recentDateMinGap);
Calendar high = Calendar.getInstance();
high.setTime(queryDate);
high.add(Calendar.MINUTE, recentDateMinGap);
if (high.before(potentialNewDate) || low.after(potentialNewDate)) {
oldDate = queryDate;
}
} else {
oldDate = queryDate;
}
if (oldDate != null) {
queryDate = potentialNewDate.getTime();
LOG.info(
"{} queryDate changed from {} to {} based on mostRecentDateFound {}",
logIdprefix,
oldDate,
queryDate,
mostRecentDateFound);
} else {
LOG.info(
"{} queryDate kept at {} based on mostRecentDateFound {}",
logIdprefix,
queryDate,
mostRecentDateFound);
}
}
// reset the value for next fetch date if the previous one is too old
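        // e.g. with resetFetchDateAfterNSecs=120, a queryDate older than two minutes
        // is discarded and the next call to populateBuffer() starts again from NOW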
if (resetFetchDateAfterNSecs != -1) {
Instant changeNeededOn =
Instant.ofEpochMilli(
                            lastTimeResetToNOW.toEpochMilli() + (resetFetchDateAfterNSecs * 1000L));
if (Instant.now().isAfter(changeNeededOn)) {
LOG.info(
"{} queryDate set to null based on resetFetchDateAfterNSecs {}",
logIdprefix,
resetFetchDateAfterNSecs);
queryDate = null;
}
}
// change the date if we don't get any results at all
if (numBuckets == 0) {
queryDate = null;
}
// remove lock
markQueryReceivedNow();
}
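    /**
     * Hook invoked with the sort values of the last hit of each bucket; a no-op here, presumably
     * intended for subclasses which want to resume paging from those values.
     */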
protected void sortValuesForKey(String key, Object[] sortValues) {}
}