
org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregator

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.search.aggregations.bucket.histogram;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.common.Rounding;
import org.elasticsearch.common.util.ByteArray;
import org.elasticsearch.common.util.IntArray;
import org.elasticsearch.common.util.LongArray;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.LeafBucketCollector;
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
import org.elasticsearch.search.aggregations.bucket.BestBucketsDeferringCollector;
import org.elasticsearch.search.aggregations.bucket.DeferableBucketAggregator;
import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector;
import org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregationBuilder.RoundingInfo;
import org.elasticsearch.search.aggregations.bucket.terms.LongKeyedBucketOrds;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.LongToIntFunction;
import java.util.function.LongUnaryOperator;

/**
 * An aggregator for date values that attempts to return a specific number of
 * buckets, reconfiguring how it rounds dates to buckets on the fly as new
 * data arrives.
 * <p>
 * This class is abstract because there is a simple implementation for when the
 * aggregator only collects from a single bucket and a more complex
 * implementation when it doesn't. This ain't great from a test coverage
 * standpoint but the simpler implementation is between 7% and 15% faster
 * when you can use it. This is an important aggregation and we need that
 * performance.
 */
abstract class AutoDateHistogramAggregator extends DeferableBucketAggregator {

    static AutoDateHistogramAggregator build(
        String name,
        AggregatorFactories factories,
        int targetBuckets,
        RoundingInfo[] roundingInfos,
        ValuesSourceConfig valuesSourceConfig,
        AggregationContext context,
        Aggregator parent,
        CardinalityUpperBound cardinality,
        Map<String, Object> metadata
    ) throws IOException {
        return cardinality == CardinalityUpperBound.ONE
            ? new FromSingle(name, factories, targetBuckets, roundingInfos, valuesSourceConfig, context, parent, metadata)
            : new FromMany(name, factories, targetBuckets, roundingInfos, valuesSourceConfig, context, parent, metadata);
    }

    private final ValuesSource.Numeric valuesSource;
    private final DocValueFormat formatter;
    private final Function<Rounding, Rounding.Prepared> roundingPreparer;
    /**
     * A reference to the collector so we can
     * {@link BestBucketsDeferringCollector#rewriteBuckets}.
     */
    private BestBucketsDeferringCollector deferringCollector;

    protected final RoundingInfo[] roundingInfos;
    protected final int targetBuckets;

    private AutoDateHistogramAggregator(
        String name,
        AggregatorFactories factories,
        int targetBuckets,
        RoundingInfo[] roundingInfos,
        ValuesSourceConfig valuesSourceConfig,
        AggregationContext context,
        Aggregator parent,
        Map<String, Object> metadata
    ) throws IOException {
        super(name, factories, context, parent, metadata);
        this.targetBuckets = targetBuckets;
        // TODO: Remove null usage here, by using a different aggregator for create
        this.valuesSource = valuesSourceConfig.hasValues() ? (ValuesSource.Numeric) valuesSourceConfig.getValuesSource() : null;
        this.formatter = valuesSourceConfig.format();
        this.roundingInfos = roundingInfos;
        this.roundingPreparer = valuesSourceConfig.roundingPreparer();
    }

    @Override
    public final ScoreMode scoreMode() {
        if (valuesSource != null && valuesSource.needsScores()) {
            return ScoreMode.COMPLETE;
        }
        return super.scoreMode();
    }

    @Override
    protected final boolean shouldDefer(Aggregator aggregator) {
        return true;
    }

    @Override
    public final DeferringBucketCollector buildDeferringCollector() {
        deferringCollector = new BestBucketsDeferringCollector(topLevelQuery(), searcher(), descendsFromGlobalAggregator(parent()));
        return deferringCollector;
    }

    protected abstract LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException;

    @Override
    public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
        if (valuesSource == null) {
            return LeafBucketCollector.NO_OP_COLLECTOR;
        }
        return getLeafCollector(valuesSource.longValues(ctx), sub);
    }

    protected final InternalAggregation[] buildAggregations(
        LongKeyedBucketOrds bucketOrds,
        LongToIntFunction roundingIndexFor,
        long[] owningBucketOrds
    ) throws IOException {
        return buildAggregationsForVariableBuckets(
            owningBucketOrds,
            bucketOrds,
            (bucketValue, docCount, subAggregationResults) -> new InternalAutoDateHistogram.Bucket(
                bucketValue,
                docCount,
                formatter,
                subAggregationResults
            ),
            (owningBucketOrd, buckets) -> {
                // the contract of the histogram aggregation is that shards must return
                // buckets ordered by key in ascending order
                CollectionUtil.introSort(buckets, BucketOrder.key(true).comparator());

                // value source will be null for unmapped fields
                InternalAutoDateHistogram.BucketInfo emptyBucketInfo = new InternalAutoDateHistogram.BucketInfo(
                    roundingInfos,
                    roundingIndexFor.applyAsInt(owningBucketOrd),
                    buildEmptySubAggregations()
                );

                return new InternalAutoDateHistogram(name, buckets, targetBuckets, emptyBucketInfo, formatter, metadata(), 1);
            }
        );
    }

    @Override
    public final InternalAggregation buildEmptyAggregation() {
        InternalAutoDateHistogram.BucketInfo emptyBucketInfo = new InternalAutoDateHistogram.BucketInfo(
            roundingInfos,
            0,
            buildEmptySubAggregations()
        );
        return new InternalAutoDateHistogram(name, Collections.emptyList(), targetBuckets, emptyBucketInfo, formatter, metadata(), 1);
    }

    protected final Rounding.Prepared prepareRounding(int index) {
        return roundingPreparer.apply(roundingInfos[index].rounding);
    }

    protected final void merge(long[] mergeMap, long newNumBuckets) {
        LongUnaryOperator howToRewrite = b -> mergeMap[(int) b];
        rewriteBuckets(newNumBuckets, howToRewrite);
        if (deferringCollector != null) {
            deferringCollector.rewriteBuckets(howToRewrite);
        }
    }

    /**
     * Initially it uses the most fine grained rounding configuration possible
     * but as more data arrives it rebuckets the data until it "fits" in the
     * aggregation rounding. Similar to {@link FromMany} this checks both the
     * bucket count and range of the aggregation, but unlike
     * {@linkplain FromMany} it keeps an accurate count of the buckets and it
     * doesn't delay rebucketing.
     * <p>
     * Rebucketing is roughly {@code O(number_of_hits_collected_so_far)} but we
     * rebucket roughly {@code O(log number_of_hits_collected_so_far)} times
     * because the "shape" of the roundings is roughly logarithmically increasing.
     */
    private static class FromSingle extends AutoDateHistogramAggregator {
        private int roundingIdx;
        private Rounding.Prepared preparedRounding;
        /**
         * Map from value to bucket ordinals.
         * <p>
         * It is important that this is the exact subtype of
         * {@link LongKeyedBucketOrds} so that the JVM can make a monomorphic
         * call to {@link LongKeyedBucketOrds#add(long, long)} in the tight
         * inner loop of {@link LeafBucketCollector#collect(int, long)}. You'd
         * think that it wouldn't matter, but it's seriously a 7%-15% performance
         * difference for the aggregation. Yikes.
         */
        private LongKeyedBucketOrds.FromSingle bucketOrds;
        private long min = Long.MAX_VALUE;
        private long max = Long.MIN_VALUE;

        FromSingle(
            String name,
            AggregatorFactories factories,
            int targetBuckets,
            RoundingInfo[] roundingInfos,
            ValuesSourceConfig valuesSourceConfig,
            AggregationContext context,
            Aggregator parent,
            Map<String, Object> metadata
        ) throws IOException {
            super(name, factories, targetBuckets, roundingInfos, valuesSourceConfig, context, parent, metadata);

            preparedRounding = prepareRounding(0);
            bucketOrds = new LongKeyedBucketOrds.FromSingle(bigArrays());
        }

        @Override
        protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException {
            return new LeafBucketCollectorBase(sub, values) {
                @Override
                public void collect(int doc, long owningBucketOrd) throws IOException {
                    assert owningBucketOrd == 0;
                    if (false == values.advanceExact(doc)) {
                        return;
                    }
                    int valuesCount = values.docValueCount();

                    long previousRounded = Long.MIN_VALUE;
                    for (int i = 0; i < valuesCount; ++i) {
                        long value = values.nextValue();
                        long rounded = preparedRounding.round(value);
                        assert rounded >= previousRounded;
                        if (rounded == previousRounded) {
                            continue;
                        }
                        collectValue(doc, rounded);
                        previousRounded = rounded;
                    }
                }

                private void collectValue(int doc, long rounded) throws IOException {
                    long bucketOrd = bucketOrds.add(0, rounded);
                    if (bucketOrd < 0) { // already seen
                        bucketOrd = -1 - bucketOrd;
                        collectExistingBucket(sub, doc, bucketOrd);
                        return;
                    }
                    collectBucket(sub, doc, bucketOrd);
                    increaseRoundingIfNeeded(rounded);
                }

                private void increaseRoundingIfNeeded(long rounded) {
                    if (roundingIdx >= roundingInfos.length - 1) {
                        return;
                    }
                    min = Math.min(min, rounded);
                    max = Math.max(max, rounded);
                    if (bucketOrds.size() <= targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval()
                        && max - min <= targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()) {
                        return;
                    }
                    do {
                        LongKeyedBucketOrds oldOrds = bucketOrds;
                        boolean success = false;
                        try {
                            preparedRounding = prepareRounding(++roundingIdx);
                            long[] mergeMap = new long[Math.toIntExact(oldOrds.size())];
                            bucketOrds = new LongKeyedBucketOrds.FromSingle(bigArrays());
                            success = true; // now it is safe to close oldOrds after we finish
                            LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = oldOrds.ordsEnum(0);
                            while (ordsEnum.next()) {
                                long oldKey = ordsEnum.value();
                                long newKey = preparedRounding.round(oldKey);
                                long newBucketOrd = bucketOrds.add(0, newKey);
                                mergeMap[(int) ordsEnum.ord()] = newBucketOrd >= 0 ? newBucketOrd : -1 - newBucketOrd;
                            }
                            merge(mergeMap, bucketOrds.size());
                        } finally {
                            if (success) {
                                oldOrds.close();
                            }
                        }
                    } while (roundingIdx < roundingInfos.length - 1
                        && (bucketOrds.size() > targetBuckets * roundingInfos[roundingIdx].getMaximumInnerInterval()
                            || max - min > targetBuckets * roundingInfos[roundingIdx].getMaximumRoughEstimateDurationMillis()));
                }
            };
        }

        @Override
        public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
            return buildAggregations(bucketOrds, l -> roundingIdx, owningBucketOrds);
        }

        @Override
        public void collectDebugInfo(BiConsumer<String, Object> add) {
            super.collectDebugInfo(add);
            add.accept("surviving_buckets", bucketOrds.size());
        }

        @Override
        protected void doClose() {
            Releasables.close(bucketOrds);
        }
    }

    /**
     * Initially it uses the most fine grained rounding configuration possible but
     * as more data arrives it uses two heuristics to shift to coarser and coarser
     * rounding. The first heuristic is the number of buckets, specifically,
     * when there are more buckets than can "fit" in the current rounding it shifts
     * to the next rounding. Instead of redoing the rounding, it estimates the
     * number of buckets that will "survive" at the new rounding and uses
     * that as the initial value for the bucket count that it
     * increments in order to trigger another promotion to another coarser
     * rounding. This works fairly well at containing the number of buckets, but
     * the estimate of the number of buckets will be wrong if the buckets are
     * quite spread out compared to the rounding.
     * <p>
     * The second heuristic it uses to trigger promotion to a coarser rounding is
     * the distance between the min and max bucket. When that distance is greater
     * than what the current rounding supports it promotes. This heuristic
     * isn't good at limiting the number of buckets but is great when the buckets
     * are spread out compared to the rounding. So it should complement the first
     * heuristic.
     * <p>
     * When promoting a rounding we keep the old buckets around because it is
     * expensive to call {@link BestBucketsDeferringCollector#rewriteBuckets}.
     * In particular it is {@code O(number_of_hits_collected_so_far)}. So if we
     * called it frequently we'd end up in {@code O(n^2)} territory. Bad news for
     * aggregations! Instead, we keep a "budget" of buckets that we're ok
     * "wasting". When we promote the rounding and our estimate of the number of
     * "dead" buckets that have data but have yet to be merged into the buckets
     * that are valid for the current rounding exceeds the budget then we rebucket
     * the entire aggregation and double the budget.
     * <p>
     * Once we're done collecting and we know exactly which buckets we'll be
     * returning we finally perform a "real", "perfect bucketing" pass,
     * rounding all of the keys for {@code owningBucketOrd} that we're going to
     * collect and picking the rounding based on a real, accurate count and the
     * min and max.
     */
    private static class FromMany extends AutoDateHistogramAggregator {
        /**
         * An array of prepared roundings in the same order as
         * {@link #roundingInfos}. The 0th entry is prepared initially,
         * and other entries are null until first needed.
         */
        private final Rounding.Prepared[] preparedRoundings;
        /**
         * Map from value to bucket ordinals.
         * <p>
         * It is important that this is the exact subtype of
         * {@link LongKeyedBucketOrds} so that the JVM can make a monomorphic
         * call to {@link LongKeyedBucketOrds#add(long, long)} in the tight
         * inner loop of {@link LeafBucketCollector#collect(int, long)}.
         */
        private LongKeyedBucketOrds.FromMany bucketOrds;
        /**
         * The index of the rounding that each {@code owningBucketOrd} is
         * currently using.
         * <p>
         * During collection we use overestimates for how many buckets are saved
         * by bumping to the next rounding index. So we end up bumping less
         * aggressively than a "perfect" algorithm. That is fine because we
         * correct the error when we merge the buckets together all the way
         * up in {@link InternalAutoDateHistogram#reduceBucket}. In particular,
         * on final reduce we bump the rounding until we appropriately
         * cover the date range across all of the results returned by all of
         * the {@link AutoDateHistogramAggregator}s.
         */
        private ByteArray roundingIndices;
        /**
         * The minimum key per {@code owningBucketOrd}.
         */
        private LongArray mins;
        /**
         * The max key per {@code owningBucketOrd}.
         */
        private LongArray maxes;
        /**
         * An underestimate of the number of buckets that are "live" in the
         * current rounding for each {@code owningBucketOrdinal}.
         */
        private IntArray liveBucketCountUnderestimate;
        /**
         * An overestimate of the number of wasted buckets. When this gets
         * too high we {@link #rebucket} which sets it to 0.
         */
        private long wastedBucketsOverestimate = 0;
        /**
         * The next {@link #wastedBucketsOverestimate} that will trigger a
         * {@link #rebucket() rebucketing}.
         */
        private long nextRebucketAt = 1000; // TODO this could almost certainly start higher when asMultiBucketAggregator is gone
        /**
         * The number of times the aggregator had to {@link #rebucket()} the
         * results. We keep this just to report to the profiler.
         */
        private int rebucketCount = 0;

        FromMany(
            String name,
            AggregatorFactories factories,
            int targetBuckets,
            RoundingInfo[] roundingInfos,
            ValuesSourceConfig valuesSourceConfig,
            AggregationContext context,
            Aggregator parent,
            Map<String, Object> metadata
        ) throws IOException {
            super(name, factories, targetBuckets, roundingInfos, valuesSourceConfig, context, parent, metadata);
            assert roundingInfos.length < 127 : "Rounding must fit in a signed byte";
            roundingIndices = bigArrays().newByteArray(1, true);
            mins = bigArrays().newLongArray(1, false);
            mins.set(0, Long.MAX_VALUE);
            maxes = bigArrays().newLongArray(1, false);
            maxes.set(0, Long.MIN_VALUE);
            preparedRoundings = new Rounding.Prepared[roundingInfos.length];
            // Prepare the first rounding because we know we'll need it.
            preparedRoundings[0] = prepareRounding(0);
            bucketOrds = new LongKeyedBucketOrds.FromMany(bigArrays());
            liveBucketCountUnderestimate = bigArrays().newIntArray(1, true);
        }

        @Override
        protected LeafBucketCollector getLeafCollector(SortedNumericDocValues values, LeafBucketCollector sub) throws IOException {
            return new LeafBucketCollectorBase(sub, values) {
                @Override
                public void collect(int doc, long owningBucketOrd) throws IOException {
                    if (false == values.advanceExact(doc)) {
                        return;
                    }
                    int valuesCount = values.docValueCount();

                    long previousRounded = Long.MIN_VALUE;
                    int roundingIdx = roundingIndexFor(owningBucketOrd);
                    for (int i = 0; i < valuesCount; ++i) {
                        long value = values.nextValue();
                        long rounded = preparedRoundings[roundingIdx].round(value);
                        assert rounded >= previousRounded;
                        if (rounded == previousRounded) {
                            continue;
                        }
                        roundingIdx = collectValue(owningBucketOrd, roundingIdx, doc, rounded);
                        previousRounded = rounded;
                    }
                }

                private int collectValue(long owningBucketOrd, int roundingIdx, int doc, long rounded) throws IOException {
                    long bucketOrd = bucketOrds.add(owningBucketOrd, rounded);
                    if (bucketOrd < 0) { // already seen
                        bucketOrd = -1 - bucketOrd;
                        collectExistingBucket(sub, doc, bucketOrd);
                        return roundingIdx;
                    }
                    collectBucket(sub, doc, bucketOrd);
                    liveBucketCountUnderestimate = bigArrays().grow(liveBucketCountUnderestimate, owningBucketOrd + 1);
                    int estimatedBucketCount = liveBucketCountUnderestimate.increment(owningBucketOrd, 1);
                    return increaseRoundingIfNeeded(owningBucketOrd, estimatedBucketCount, rounded, roundingIdx);
                }

                /**
                 * Increase the rounding of {@code owningBucketOrd} using
                 * estimated bucket counts, {@link #rebucket() rebucketing} all the
                 * buckets if the estimated number of wasted buckets is too high.
                 */
                private int increaseRoundingIfNeeded(long owningBucketOrd, int oldEstimatedBucketCount, long newKey, int oldRounding) {
                    if (oldRounding >= roundingInfos.length - 1) {
                        return oldRounding;
                    }
                    if (mins.size() < owningBucketOrd + 1) {
                        long oldSize = mins.size();
                        mins = bigArrays().grow(mins, owningBucketOrd + 1);
                        mins.fill(oldSize, mins.size(), Long.MAX_VALUE);
                    }
                    if (maxes.size() < owningBucketOrd + 1) {
                        long oldSize = maxes.size();
                        maxes = bigArrays().grow(maxes, owningBucketOrd + 1);
                        maxes.fill(oldSize, maxes.size(), Long.MIN_VALUE);
                    }

                    long min = Math.min(mins.get(owningBucketOrd), newKey);
                    mins.set(owningBucketOrd, min);
                    long max = Math.max(maxes.get(owningBucketOrd), newKey);
                    maxes.set(owningBucketOrd, max);
                    if (oldEstimatedBucketCount <= targetBuckets * roundingInfos[oldRounding].getMaximumInnerInterval()
                        && max - min <= targetBuckets * roundingInfos[oldRounding].getMaximumRoughEstimateDurationMillis()) {
                        return oldRounding;
                    }
                    long oldRoughDuration = roundingInfos[oldRounding].roughEstimateDurationMillis;
                    int newRounding = oldRounding;
                    int newEstimatedBucketCount;
                    do {
                        newRounding++;
                        double ratio = (double) oldRoughDuration / (double) roundingInfos[newRounding].getRoughEstimateDurationMillis();
                        newEstimatedBucketCount = (int) Math.ceil(oldEstimatedBucketCount * ratio);
                    } while (newRounding < roundingInfos.length - 1
                        && (newEstimatedBucketCount > targetBuckets * roundingInfos[newRounding].getMaximumInnerInterval()
                            || max - min > targetBuckets * roundingInfos[newRounding].getMaximumRoughEstimateDurationMillis()));
                    setRounding(owningBucketOrd, newRounding);
                    mins.set(owningBucketOrd, preparedRoundings[newRounding].round(mins.get(owningBucketOrd)));
                    maxes.set(owningBucketOrd, preparedRoundings[newRounding].round(maxes.get(owningBucketOrd)));

                    wastedBucketsOverestimate += oldEstimatedBucketCount - newEstimatedBucketCount;
                    if (wastedBucketsOverestimate > nextRebucketAt) {
                        rebucket();
                        // Bump the threshold for the next rebucketing
                        wastedBucketsOverestimate = 0;
                        nextRebucketAt *= 2;
                    } else {
                        liveBucketCountUnderestimate.set(owningBucketOrd, newEstimatedBucketCount);
                    }
                    return newRounding;
                }
            };
        }

        private void rebucket() {
            rebucketCount++;
            LongKeyedBucketOrds oldOrds = bucketOrds;
            boolean success = false;
            try {
                long[] mergeMap = new long[Math.toIntExact(oldOrds.size())];
                bucketOrds = new LongKeyedBucketOrds.FromMany(bigArrays());
                success = true;
                for (long owningBucketOrd = 0; owningBucketOrd <= oldOrds.maxOwningBucketOrd(); owningBucketOrd++) {
                    LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = oldOrds.ordsEnum(owningBucketOrd);
                    Rounding.Prepared preparedRounding = preparedRoundings[roundingIndexFor(owningBucketOrd)];
                    while (ordsEnum.next()) {
                        long oldKey = ordsEnum.value();
                        long newKey = preparedRounding.round(oldKey);
                        long newBucketOrd = bucketOrds.add(owningBucketOrd, newKey);
                        mergeMap[(int) ordsEnum.ord()] = newBucketOrd >= 0 ? newBucketOrd : -1 - newBucketOrd;
                    }
                    liveBucketCountUnderestimate = bigArrays().grow(liveBucketCountUnderestimate, owningBucketOrd + 1);
                    liveBucketCountUnderestimate.set(owningBucketOrd, Math.toIntExact(bucketOrds.bucketsInOrd(owningBucketOrd)));
                }
                merge(mergeMap, bucketOrds.size());
            } finally {
                if (success) {
                    oldOrds.close();
                }
            }
        }

        @Override
        public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
            /*
             * Rebucket before building the aggregation to build as small a result
             * as possible.
             *
             * TODO it'd be faster if we could apply the merging on the fly as we
             * replay the hits and build the buckets. How much faster is not clear,
             * but it does have the advantage of only touching the buckets that we
             * want to collect.
             */
            rebucket();
            return buildAggregations(bucketOrds, this::roundingIndexFor, owningBucketOrds);
        }

        @Override
        public void collectDebugInfo(BiConsumer<String, Object> add) {
            super.collectDebugInfo(add);
            add.accept("surviving_buckets", bucketOrds.size());
            add.accept("wasted_buckets_overestimate", wastedBucketsOverestimate);
            add.accept("next_rebucket_at", nextRebucketAt);
            add.accept("rebucket_count", rebucketCount);
        }

        private void setRounding(long owningBucketOrd, int newRounding) {
            roundingIndices = bigArrays().grow(roundingIndices, owningBucketOrd + 1);
            roundingIndices.set(owningBucketOrd, (byte) newRounding);
            if (preparedRoundings[newRounding] == null) {
                preparedRoundings[newRounding] = prepareRounding(newRounding);
            }
        }

        private int roundingIndexFor(long owningBucketOrd) {
            return owningBucketOrd < roundingIndices.size() ? roundingIndices.get(owningBucketOrd) : 0;
        }

        @Override
        public void doClose() {
            Releasables.close(bucketOrds, roundingIndices, mins, maxes, liveBucketCountUnderestimate);
        }
    }
}
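
Illustrative note (not part of the Elasticsearch source above): the FromSingle and FromMany javadoc describes promoting to a coarser rounding whenever either the bucket count or the min-max key range outgrows what the target bucket count allows for the current rounding. The standalone sketch below mirrors the shape of that check with a simplified version of FromMany's count scaling; the class name, the promoteIfNeeded helper, and the per-rounding interval constants are all hypothetical stand-ins for RoundingInfo's real configuration, chosen only to make the two heuristics visible.

// Sketch only -- mirrors the shape of increaseRoundingIfNeeded(), not the real implementation.
public class RoundingPromotionSketch {
    // Hypothetical rough bucket widths per rounding, in millis (second, minute, hour, day).
    private static final long[] ROUGH_DURATION_MILLIS = { 1_000L, 60_000L, 3_600_000L, 86_400_000L };
    // Hypothetical maximum inner interval per rounding (stand-in for RoundingInfo#getMaximumInnerInterval).
    private static final int[] MAX_INNER_INTERVAL = { 30, 30, 12, 7 };

    /**
     * Promote {@code roundingIdx} until both heuristics fit: the (scaled) bucket count
     * stays under targetBuckets * maxInnerInterval and the min-max range stays under
     * targetBuckets * roughDuration.
     */
    static int promoteIfNeeded(int roundingIdx, int targetBuckets, long estimatedBucketCount, long min, long max) {
        while (roundingIdx < ROUGH_DURATION_MILLIS.length - 1
            && (estimatedBucketCount > (long) targetBuckets * MAX_INNER_INTERVAL[roundingIdx]
                || max - min > targetBuckets * ROUGH_DURATION_MILLIS[roundingIdx])) {
            // Like FromMany, estimate how many buckets survive the coarser rounding
            // instead of actually rebucketing right away.
            double ratio = (double) ROUGH_DURATION_MILLIS[roundingIdx] / ROUGH_DURATION_MILLIS[roundingIdx + 1];
            estimatedBucketCount = (long) Math.ceil(estimatedBucketCount * ratio);
            roundingIdx++;
        }
        return roundingIdx;
    }

    public static void main(String[] args) {
        // 500 one-second buckets against a target of 10: the count heuristic
        // (500 > 10 * 30) promotes to minutes, where ~9 estimated buckets fit.
        System.out.println(promoteIfNeeded(0, 10, 500, 0L, 480_000L)); // 1 (minutes)

        // Only 20 buckets, but spread over 5 days: the range heuristic keeps
        // promoting until the day rounding covers the span.
        System.out.println(promoteIfNeeded(0, 10, 20, 0L, 432_000_000L)); // 3 (days)
    }
}

In the real aggregator the promotion is followed by the merge-map rewrite shown in merge() and, in FromMany, by the deferred "perfect bucketing" rebucket() pass before results are built.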