
com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout

/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.stormcrawler.elasticsearch.persistence;

import static org.elasticsearch.index.query.QueryBuilders.boolQuery;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.time.Instant;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation;
import org.elasticsearch.search.aggregations.bucket.sampler.DiversifiedAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.TopHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Spout which pulls URLs from an ES index. Use a single instance unless you use 'es.status.routing'
 * with the StatusUpdaterBolt, in which case you need to have exactly the same number of spout
 * instances as ES shards. Guarantees a good mix of URLs by aggregating them by an arbitrary field
 * e.g. key.
 */
public class AggregationSpout extends AbstractSpout implements ActionListener<SearchResponse> {

    private static final Logger LOG = LoggerFactory.getLogger(AggregationSpout.class);

    private static final String ESStatusSampleParamName = "es.status.sample";
    private static final String ESMostRecentDateIncreaseParamName =
            "es.status.recentDate.increase";
    private static final String ESMostRecentDateMinGapParamName = "es.status.recentDate.min.gap";

    private boolean sample = false;
    private int recentDateIncrease = -1;
    private int recentDateMinGap = -1;

    protected Set<String> currentBuckets;

    @Override
    public void open(
            Map<String, Object> stormConf,
            TopologyContext context,
            SpoutOutputCollector collector) {
        sample = ConfUtils.getBoolean(stormConf, ESStatusSampleParamName, sample);
        recentDateIncrease =
                ConfUtils.getInt(stormConf, ESMostRecentDateIncreaseParamName, recentDateIncrease);
        recentDateMinGap =
                ConfUtils.getInt(stormConf, ESMostRecentDateMinGapParamName, recentDateMinGap);
        super.open(stormConf, context, collector);
        currentBuckets = new HashSet<>();
    }

    @Override
    protected void populateBuffer() {

        if (queryDate == null) {
            queryDate = new Date();
            lastTimeResetToNOW = Instant.now();
        }

        String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime());

        LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedQueryDate);

        BoolQueryBuilder queryBuilder =
                boolQuery()
                        .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate));

        if (filterQueries != null) {
            for (String filterQuery : filterQueries) {
                queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery));
            }
        }

        SearchRequest request = new SearchRequest(indexName);

        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        sourceBuilder.query(queryBuilder);
        sourceBuilder.from(0);
        sourceBuilder.size(0);
        sourceBuilder.explain(false);
        sourceBuilder.trackTotalHits(false);

        if (queryTimeout != -1) {
            sourceBuilder.timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS));
        }

        TermsAggregationBuilder aggregations =
                AggregationBuilders.terms("partition").field(partitionField).size(maxBucketNum);

        org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder tophits =
                AggregationBuilders.topHits("docs").size(maxURLsPerBucket).explain(false);

        // sort within a bucket
        for (String bsf : bucketSortField) {
            FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC);
            tophits.sort(sorter);
        }

        aggregations.subAggregation(tophits);

        // sort between buckets
        if (StringUtils.isNotBlank(totalSortField)) {
            org.elasticsearch.search.aggregations.metrics.MinAggregationBuilder minBuilder =
                    AggregationBuilders.min("top_hit").field(totalSortField);
            aggregations.subAggregation(minBuilder);
            aggregations.order(BucketOrder.aggregation("top_hit", true));
        }

        if (sample) {
            DiversifiedAggregationBuilder sab = new DiversifiedAggregationBuilder("sample");
            sab.field(partitionField).maxDocsPerValue(maxURLsPerBucket);
            sab.shardSize(maxURLsPerBucket * maxBucketNum);
            sab.subAggregation(aggregations);
            sourceBuilder.aggregation(sab);
        } else {
            sourceBuilder.aggregation(aggregations);
        }

        request.source(sourceBuilder);

        // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html
        // _shards:2,3
        // specific shard but ideally a local copy of it
        if (shardID != -1) {
            request.preference("_shards:" + shardID + "|_local");
        }

        // dump query to log
        LOG.debug("{} ES query {}", logIdprefix, request);

        LOG.trace("{} isInQuery set to true", logIdprefix);
        isInQuery.set(true);

        client.searchAsync(request, RequestOptions.DEFAULT, this);
    }

    @Override
    public void onFailure(Exception arg0) {
        LOG.error("{} Exception with ES query", logIdprefix, arg0);
        markQueryReceivedNow();
    }

    @Override
    public void onResponse(SearchResponse response) {
        long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent();

        Aggregations aggregs = response.getAggregations();
        if (aggregs == null) {
            markQueryReceivedNow();
            return;
        }

        SingleBucketAggregation sample = aggregs.get("sample");
        if (sample != null) {
            aggregs = sample.getAggregations();
        }

        Terms agg = aggregs.get("partition");

        int numhits = 0;
        int numBuckets = 0;
        int alreadyprocessed = 0;

        Instant mostRecentDateFound = null;

        currentBuckets.clear();

        // For each entry
        Iterator<Terms.Bucket> iterator = (Iterator<Terms.Bucket>) agg.getBuckets().iterator();
        while (iterator.hasNext()) {
            Terms.Bucket entry = iterator.next();
            String key = (String) entry.getKey(); // bucket key
            currentBuckets.add(key);
            long docCount = entry.getDocCount(); // doc count

            int hitsForThisBucket = 0;

            SearchHit lastHit = null;

            // filter results so that we don't include URLs that are already
            // being processed
            TopHits topHits = entry.getAggregations().get("docs");
            for (SearchHit hit : topHits.getHits().getHits()) {
                LOG.debug(
                        "{} -> id [{}], _source [{}]",
                        logIdprefix,
                        hit.getId(),
                        hit.getSourceAsString());
                hitsForThisBucket++;
                lastHit = hit;

                Map<String, Object> keyValues = hit.getSourceAsMap();
                String url = (String) keyValues.get("url");

                // consider only the first document of the last bucket
                // for optimising the nextFetchDate
                if (hitsForThisBucket == 1 && !iterator.hasNext()) {
                    String strDate = (String) keyValues.get("nextFetchDate");
                    try {
                        mostRecentDateFound = Instant.parse(strDate);
                    } catch (Exception e) {
                        throw new RuntimeException("can't parse date :" + strDate);
                    }
                }

                // is already being processed or in buffer - skip it!
                if (beingProcessed.containsKey(url)) {
                    LOG.debug("{} -> already processed: {}", logIdprefix, url);
                    alreadyprocessed++;
                    continue;
                }

                Metadata metadata = fromKeyValues(keyValues);
                boolean added = buffer.add(url, metadata);
                if (!added) {
                    LOG.debug("{} -> already in buffer: {}", logIdprefix, url);
                    alreadyprocessed++;
                    continue;
                }
                LOG.debug("{} -> added to buffer : {}", logIdprefix, url);
            }

            if (lastHit != null) {
                sortValuesForKey(key, lastHit.getSortValues());
            }

            if (hitsForThisBucket > 0) numBuckets++;

            numhits += hitsForThisBucket;

            LOG.debug(
                    "{} key [{}], hits[{}], doc_count [{}]",
                    logIdprefix,
                    key,
                    hitsForThisBucket,
                    docCount);
        }

        LOG.info(
                "{} ES query returned {} hits from {} buckets in {} msec with {} already being processed. Took {} msec per doc on average.",
                logIdprefix,
                numhits,
                numBuckets,
                timeTaken,
                alreadyprocessed,
                ((float) timeTaken / numhits));

        queryTimes.addMeasurement(timeTaken);
        eventCounter.scope("already_being_processed").incrBy(alreadyprocessed);
        eventCounter.scope("ES_queries").incrBy(1);
        eventCounter.scope("ES_docs").incrBy(numhits);

        // optimise the nextFetchDate by getting the most recent value
        // returned in the query and add to it, unless the previous value is
        // within n mins in which case we'll keep it
        if (mostRecentDateFound != null && recentDateIncrease >= 0) {
            Calendar potentialNewDate = Calendar.getInstance();
            potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli());
            potentialNewDate.add(Calendar.MINUTE, recentDateIncrease);
            Date oldDate = null;
            // check boundaries
            if (this.recentDateMinGap > 0) {
                Calendar low = Calendar.getInstance();
                low.setTime(queryDate);
                low.add(Calendar.MINUTE, -recentDateMinGap);
                Calendar high = Calendar.getInstance();
                high.setTime(queryDate);
                high.add(Calendar.MINUTE, recentDateMinGap);
                if (high.before(potentialNewDate) || low.after(potentialNewDate)) {
                    oldDate = queryDate;
                }
            } else {
                oldDate = queryDate;
            }
            if (oldDate != null) {
                queryDate = potentialNewDate.getTime();
                LOG.info(
                        "{} queryDate changed from {} to {} based on mostRecentDateFound {}",
                        logIdprefix,
                        oldDate,
                        queryDate,
                        mostRecentDateFound);
            } else {
                LOG.info(
                        "{} queryDate kept at {} based on mostRecentDateFound {}",
                        logIdprefix,
                        queryDate,
                        mostRecentDateFound);
            }
        }

        // reset the value for next fetch date if the previous one is too old
        if (resetFetchDateAfterNSecs != -1) {
            Instant changeNeededOn =
                    Instant.ofEpochMilli(
                            lastTimeResetToNOW.toEpochMilli() + (resetFetchDateAfterNSecs * 1000));
            if (Instant.now().isAfter(changeNeededOn)) {
                LOG.info(
                        "{} queryDate set to null based on resetFetchDateAfterNSecs {}",
                        logIdprefix,
                        resetFetchDateAfterNSecs);
                queryDate = null;
            }
        }

        // change the date if we don't get any results at all
        if (numBuckets == 0) {
            queryDate = null;
        }

        // remove lock
        markQueryReceivedNow();
    }

    protected void sortValuesForKey(String key, Object[] sortValues) {}
}
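For orientation, the request assembled in populateBuffer() corresponds roughly to the following Elasticsearch query body. This is a sketch with illustrative values: "key" stands in for partitionField, the sizes for maxBucketNum and maxURLsPerBucket, and the sort field for bucketSortField; the optional diversified sampler and the "top_hit" min sub-aggregation are omitted.

{
  "from": 0,
  "size": 0,
  "explain": false,
  "track_total_hits": false,
  "query": {
    "bool": {
      "filter": [
        { "range": { "nextFetchDate": { "lte": "2025-01-01T00:00:00Z" } } }
      ]
    }
  },
  "aggs": {
    "partition": {
      "terms": { "field": "key", "size": 10 },
      "aggs": {
        "docs": {
          "top_hits": {
            "size": 2,
            "sort": [ { "nextFetchDate": { "order": "asc" } } ]
          }
        }
      }
    }
  }
}

When shardID is set, the request additionally carries the preference string "_shards:<id>|_local" so that each spout instance queries its own shard, ideally a local copy of it.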
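The three configuration keys read in open() can be set in the crawler configuration. A minimal sketch with the code's own defaults; the comments paraphrase how the values are used in onResponse():

# crawler-conf.yaml (sketch; values are the field initialisers' defaults)
# wrap the terms aggregation in a diversified sampler
es.status.sample: false
# minutes added to the most recent nextFetchDate found, used to move
# queryDate forward (-1 disables the optimisation)
es.status.recentDate.increase: -1
# minimum gap in minutes between queryDate and the candidate new date
# before queryDate is actually changed (<= 0 disables the boundary check)
es.status.recentDate.min.gap: -1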
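As the class Javadoc notes, when 'es.status.routing' is used with the StatusUpdaterBolt the spout parallelism must equal the number of shards of the status index. A minimal wiring sketch using Storm's TopologyBuilder; the class name, component id, and shard count below are illustrative, not part of StormCrawler:

import org.apache.storm.topology.TopologyBuilder;

public class CrawlTopologySketch {
    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        // one AggregationSpout instance per shard of the status index
        // (10 is illustrative; match your index's shard count when
        // es.status.routing is enabled)
        builder.setSpout("spout", new AggregationSpout(), 10);
        // ... declare fetch/parse/status bolts and submit the topology
    }
}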




