// org.elasticsearch.search.aggregations.bucket.terms.TermsAggregatorFactory (Maven / Gradle / Ivy)
// Show all versions of elasticsearch. Show documentation.
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.search.aggregations.bucket.terms;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.IndexSearcher;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.InternalOrder;
import org.elasticsearch.search.aggregations.InternalOrder.CompoundOrder;
import org.elasticsearch.search.aggregations.NonCollectingAggregator;
import org.elasticsearch.search.aggregations.bucket.BucketUtils;
import org.elasticsearch.search.aggregations.bucket.terms.NumericTermsAggregator.ResultStrategy;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.aggregations.support.SamplingContext;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
import org.elasticsearch.search.aggregations.support.ValuesSourceRegistry;
import org.elasticsearch.xcontent.ParseField;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.function.LongPredicate;
public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
// Hooks that let tests force specific optimizations in the global ordinals execution mode.
// When null (the normal case) the heuristics in ExecutionMode.GLOBAL_ORDINALS decide.
static Boolean REMAP_GLOBAL_ORDS, COLLECT_SEGMENT_ORDS;
// Used for the debug messages that report which terms aggregator implementation was chosen.
private static final Logger logger = LogManager.getLogger(TermsAggregatorFactory.class);
/**
 * Registers the terms aggregator suppliers under {@link TermsAggregationBuilder#REGISTRY_KEY}:
 * the bytes supplier for keyword and IP values sources, the numeric supplier for
 * date, boolean and numeric values sources.
 */
static void registerAggregators(ValuesSourceRegistry.Builder builder) {
    final TermsAggregatorSupplier forBytes = bytesSupplier();
    final TermsAggregatorSupplier forNumerics = numericSupplier();

    // Types that are aggregated as bytes/strings.
    builder.register(
        TermsAggregationBuilder.REGISTRY_KEY,
        List.of(CoreValuesSourceType.KEYWORD, CoreValuesSourceType.IP),
        forBytes,
        true
    );

    // Types that are aggregated through their numeric representation.
    builder.register(
        TermsAggregationBuilder.REGISTRY_KEY,
        List.of(CoreValuesSourceType.DATE, CoreValuesSourceType.BOOLEAN, CoreValuesSourceType.NUMERIC),
        forNumerics,
        true
    );
}
/**
 * The maximum number of global ordinals a field can have for us to try
 * aggregating it "filter by filter". "Filter by filter" aggregation is
 * generally faster when possible but takes more memory because we have
 * to build the filters.
 * <p>
 * The value that we have here is a fairly wild guess. At the time of
 * writing we figure each filter clause takes a couple of kb worth of
 * accounting so this is in the neighborhood of one megabyte of memory.
 * <p>
 * Secondly, this is a fairly crude heuristic. If the terms agg was going
 * to take up nearly as much memory anyway it might be worth it to use
 * filters. More experiment is required.
 * <p>
 * The global ordinals execution mode compares the field's global ordinal
 * value count against this limit before it attempts the filter based
 * implementation.
 */
static final long MAX_ORDS_TO_TRY_FILTERS = 1000;
/**
 * This supplier is used for all the field types that should be aggregated as bytes/strings,
 * including those that need global ordinals.
 * <p>
 * The supplier it returns:
 * <ul>
 * <li>parses {@code executionHint}, when present, into an {@link ExecutionMode};</li>
 * <li>forces {@link ExecutionMode#MAP} when the values source has no ordinals and otherwise
 * falls back to {@link ExecutionMode#GLOBAL_ORDINALS} when no hint was given;</li>
 * <li>picks a {@link SubAggCollectionMode} heuristically when none was requested;</li>
 * <li>rejects regex based include/exclude combined with a format other than {@link DocValueFormat#RAW};</li>
 * <li>delegates the actual construction to the selected {@link ExecutionMode}.</li>
 * </ul>
 */
private static TermsAggregatorSupplier bytesSupplier() {
    return new TermsAggregatorSupplier() {
        @Override
        public Aggregator build(
            String name,
            AggregatorFactories factories,
            ValuesSourceConfig valuesSourceConfig,
            BucketOrder order,
            TermsAggregator.BucketCountThresholds bucketCountThresholds,
            IncludeExclude includeExclude,
            String executionHint,
            AggregationContext context,
            Aggregator parent,
            SubAggCollectionMode subAggCollectMode,
            boolean showTermDocCountError,
            CardinalityUpperBound cardinality,
            Map<String, Object> metadata
        ) throws IOException {
            final ValuesSource valuesSource = valuesSourceConfig.getValuesSource();

            // Parse the hint up front so an unknown hint is rejected even when it is overridden below.
            ExecutionMode execution = executionHint == null ? null : ExecutionMode.fromString(executionHint);
            if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals == false) {
                // In some cases, using ordinals is just not supported: override it
                execution = ExecutionMode.MAP;
            } else if (execution == null) {
                execution = ExecutionMode.GLOBAL_ORDINALS;
            }

            // The ordinal count is only looked up for the mode that uses global ordinals; -1 means "unknown".
            final long maxOrd = execution == ExecutionMode.GLOBAL_ORDINALS ? getMaxOrd(valuesSource, context.searcher()) : -1;
            if (subAggCollectMode == null) {
                subAggCollectMode = pickSubAggColectMode(factories, bucketCountThresholds.getShardSize(), maxOrd);
            }

            if ((includeExclude != null) && (includeExclude.isRegexBased()) && valuesSourceConfig.format() != DocValueFormat.RAW) {
                // TODO this exception message is not really accurate for the string case. It's really disallowing regex + formatter
                throw new AggregationExecutionException(
                    "Aggregation ["
                        + name
                        + "] cannot support regular expression style "
                        + "include/exclude settings as they can only be applied to string fields. Use an array of values for "
                        + "include/exclude clauses"
                );
            }

            // TODO: [Zach] we might want refactor and remove ExecutionMode#create(), moving that logic outside the enum
            logger.debug("Creating bytes terms aggregator with execution mode [{}]", execution);
            return execution.create(
                name,
                factories,
                valuesSourceConfig,
                order,
                bucketCountThresholds,
                includeExclude,
                context,
                parent,
                subAggCollectMode,
                showTermDocCountError,
                cardinality,
                metadata
            );
        }
    };
}
/**
* This supplier is used for all fields that expect to be aggregated as a numeric value.
* This includes floating points, and formatted types that use numerics internally for storage (date, boolean, etc)
*/
private static TermsAggregatorSupplier numericSupplier() {
return new TermsAggregatorSupplier() {
@Override
public Aggregator build(
String name,
AggregatorFactories factories,
ValuesSourceConfig valuesSourceConfig,
BucketOrder order,
TermsAggregator.BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude,
String executionHint,
AggregationContext context,
Aggregator parent,
SubAggCollectionMode subAggCollectMode,
boolean showTermDocCountError,
CardinalityUpperBound cardinality,
Map metadata
) throws IOException {
if ((includeExclude != null) && (includeExclude.isRegexBased())) {
throw new AggregationExecutionException(
"Aggregation ["
+ name
+ "] cannot support regular expression style "
+ "include/exclude settings as they can only be applied to string fields. Use an array of numeric values for "
+ "include/exclude clauses used to filter numeric fields"
);
}
if (subAggCollectMode == null) {
subAggCollectMode = pickSubAggColectMode(factories, bucketCountThresholds.getShardSize(), -1);
}
ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSourceConfig.getValuesSource();
IncludeExclude.LongFilter longFilter = null;
Function> resultStrategy;
if (numericValuesSource.isFloatingPoint()) {
if (includeExclude != null) {
longFilter = includeExclude.convertToDoubleFilter();
}
resultStrategy = agg -> agg.new DoubleTermsResults(showTermDocCountError);
} else {
if (includeExclude != null) {
longFilter = includeExclude.convertToLongFilter(valuesSourceConfig.format());
}
resultStrategy = agg -> agg.new LongTermsResults(showTermDocCountError);
}
return new NumericTermsAggregator(
name,
factories,
resultStrategy,
numericValuesSource,
valuesSourceConfig.format(),
order,
bucketCountThresholds,
context,
parent,
subAggCollectMode,
longFilter,
cardinality,
metadata
);
}
};
}
// Builds the concrete aggregator; invoked from doCreateInternal.
private final TermsAggregatorSupplier aggregatorSupplier;
// Bucket ordering; also validated against the aggregator in createUnmapped.
private final BucketOrder order;
// Optional include/exclude rules; null when none were supplied.
private final IncludeExclude includeExclude;
// Optional execution hint string; the bytes supplier parses it via ExecutionMode.fromString when non-null.
private final String executionHint;
// Sub-aggregation collection mode; when null the suppliers pick one with pickSubAggColectMode.
private final SubAggCollectionMode collectMode;
// Thresholds as configured on the factory; doCreateInternal works on a copy of these.
private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
// Passed through to the aggregator implementations and their result strategies.
private final boolean showTermDocCountError;
/**
 * Creates the factory. {@code name}, {@code config}, {@code context}, {@code parent},
 * {@code subFactoriesBuilder} and {@code metadata} are handed to the superclass; the
 * remaining arguments are stored unchanged and forwarded to {@code aggregatorSupplier}
 * when an aggregator is created.
 *
 * @param executionHint may be {@code null}, in which case the supplier chooses the execution mode
 * @param collectMode may be {@code null}, in which case the supplier chooses the collection mode
 * @throws IOException declared for the superclass constructor call
 */
TermsAggregatorFactory(
    String name,
    ValuesSourceConfig config,
    BucketOrder order,
    IncludeExclude includeExclude,
    String executionHint,
    SubAggCollectionMode collectMode,
    TermsAggregator.BucketCountThresholds bucketCountThresholds,
    boolean showTermDocCountError,
    AggregationContext context,
    AggregatorFactory parent,
    AggregatorFactories.Builder subFactoriesBuilder,
    Map<String, Object> metadata,
    TermsAggregatorSupplier aggregatorSupplier
) throws IOException {
    super(name, config, context, parent, subFactoriesBuilder, metadata);
    this.aggregatorSupplier = aggregatorSupplier;
    this.order = order;
    this.includeExclude = includeExclude;
    this.executionHint = executionHint;
    this.collectMode = collectMode;
    this.bucketCountThresholds = bucketCountThresholds;
    this.showTermDocCountError = showTermDocCountError;
}
/**
 * Builds the aggregator used when the field is unmapped: it collects nothing and always
 * answers with an {@link UnmappedTerms} result carrying the configured order, required
 * size and min doc count.
 */
@Override
protected Aggregator createUnmapped(Aggregator parent, Map<String, Object> metadata) throws IOException {
    final InternalAggregation emptyResult = new UnmappedTerms(
        name,
        order,
        bucketCountThresholds.getRequiredSize(),
        bucketCountThresholds.getMinDocCount(),
        metadata
    );
    final Aggregator unmapped = new NonCollectingAggregator(name, context, parent, factories, metadata) {
        @Override
        public InternalAggregation buildEmptyAggregation() {
            return emptyResult;
        }
    };
    // even in the case of an unmapped aggregator, validate the order
    order.validate(unmapped);
    return unmapped;
}
/**
 * Tells whether {@code order} sorts by a sub-aggregation, either directly or through
 * any element of a compound order.
 */
private static boolean isAggregationSort(BucketOrder order) {
    if (order instanceof InternalOrder.Aggregation) {
        return true;
    }
    if (order instanceof CompoundOrder compound) {
        // Recurse into each element; one aggregation based element is enough.
        for (BucketOrder element : compound.orderElements()) {
            if (isAggregationSort(element)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Prepares the bucket count thresholds for this request and asks the registered supplier
 * to build the aggregator.
 *
 * @throws ElasticsearchStatusException with {@link RestStatus#BAD_REQUEST} when the aggregation
 *         runs inside a sampling context and {@code min_doc_count} or
 *         {@code shard_min_doc_count} is greater than 1
 */
@Override
protected Aggregator doCreateInternal(Aggregator parent, CardinalityUpperBound cardinality, Map<String, Object> metadata)
    throws IOException {
    // Work on a copy so the shard size defaulting below does not touch the factory's own thresholds.
    final BucketCountThresholds thresholds = new BucketCountThresholds(this.bucketCountThresholds);
    if (InternalOrder.isKeyOrder(order) == false
        && thresholds.getShardSize() == TermsAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) {
        // The user has not made a shardSize selection. Use default
        // heuristic to avoid any wrong-ranking caused by distributed
        // counting
        thresholds.setShardSize(BucketUtils.suggestShardSideQueueSize(thresholds.getRequiredSize()));
    }
    // If min_doc_count and shard_min_doc_count is provided, we do not support them being larger than 1
    // This is because we cannot be sure about their relative scale when sampled
    if (getSamplingContext().map(SamplingContext::isSampled).orElse(false)) {
        if (thresholds.getMinDocCount() > 1 || thresholds.getShardMinDocCount() > 1) {
            throw new ElasticsearchStatusException(
                "aggregation [{}] is within a sampling context; "
                    + "min_doc_count, provided [{}], and shard_min_doc_count, provided [{}], cannot be greater than 1",
                RestStatus.BAD_REQUEST,
                name(),
                thresholds.getMinDocCount(),
                thresholds.getShardMinDocCount()
            );
        }
    }
    thresholds.ensureValidity();

    return aggregatorSupplier.build(
        name,
        factories,
        config,
        order,
        thresholds,
        includeExclude,
        executionHint,
        context,
        parent,
        collectMode,
        showTermDocCountError,
        cardinality,
        metadata
    );
}
/**
 * Chooses the {@link SubAggCollectionMode} to use when the request did not specify one.
 * <p>
 * {@link SubAggCollectionMode#BREADTH_FIRST} is returned only when there are
 * sub-aggregations, we are not expected to return every bucket, and the number of
 * candidate buckets is either unknown ({@code maxOrd == -1}) or larger than
 * {@code expectedSize}. Every other case yields {@link SubAggCollectionMode#DEPTH_FIRST}.
 */
static SubAggCollectionMode pickSubAggColectMode(AggregatorFactories factories, int expectedSize, long maxOrd) {
    // Without sub-aggregations the mode is pretty much ignored, so anything will do.
    final boolean hasSubAggregations = factories.countAggregators() != 0;
    // When all buckets are returned, delaying sub-aggregation collection saves no time.
    final boolean returnsAllBuckets = expectedSize == Integer.MAX_VALUE;
    // Delaying *should* pay off when more buckets may exist than this shard will collect,
    // or when we simply do not know how many there are.
    final boolean mayDiscardBuckets = maxOrd == -1 || maxOrd > expectedSize;

    final boolean deferSubAggregations = hasSubAggregations && returnsAllBuckets == false && mayDiscardBuckets;
    return deferSubAggregations ? SubAggCollectionMode.BREADTH_FIRST : SubAggCollectionMode.DEPTH_FIRST;
}
/**
 * Looks up the maximum global ordinal of {@code source} for the given searcher.
 *
 * @return the value reported by the values source, or -1 when {@code source} is not a
 *         {@link ValuesSource.Bytes.WithOrdinals}
 */
private static long getMaxOrd(ValuesSource source, IndexSearcher searcher) throws IOException {
    return source instanceof ValuesSource.Bytes.WithOrdinals withOrdinals ? withOrdinals.globalMaxOrd(searcher) : -1;
}
/**
 * The strategies available for running a terms aggregation over a bytes values source.
 * Each constant knows how to build the matching {@link Aggregator}.
 */
public enum ExecutionMode {

    /** Collects terms through {@link MapStringTermsAggregator}; does not need ordinals. */
    MAP(new ParseField("map")) {

        @Override
        Aggregator create(
            String name,
            AggregatorFactories factories,
            ValuesSourceConfig valuesSourceConfig,
            BucketOrder order,
            TermsAggregator.BucketCountThresholds bucketCountThresholds,
            IncludeExclude includeExclude,
            AggregationContext context,
            Aggregator parent,
            SubAggCollectionMode subAggCollectMode,
            boolean showTermDocCountError,
            CardinalityUpperBound cardinality,
            Map<String, Object> metadata
        ) throws IOException {
            IncludeExclude.StringFilter filter = includeExclude == null
                ? null
                : includeExclude.convertToStringFilter(valuesSourceConfig.format());
            return new MapStringTermsAggregator(
                name,
                factories,
                new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig),
                a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource()),
                order,
                valuesSourceConfig.format(),
                bucketCountThresholds,
                filter,
                context,
                parent,
                subAggCollectMode,
                showTermDocCountError,
                cardinality,
                metadata
            );
        }
    },

    /**
     * Collects terms by global ordinal. Depending on the field and the request this builds,
     * in order of preference, a filter-by-filter adaptation, the low cardinality
     * implementation, or the standard global ordinals implementation.
     */
    GLOBAL_ORDINALS(new ParseField("global_ordinals")) {

        @Override
        Aggregator create(
            String name,
            AggregatorFactories factories,
            ValuesSourceConfig valuesSourceConfig,
            BucketOrder order,
            TermsAggregator.BucketCountThresholds bucketCountThresholds,
            IncludeExclude includeExclude,
            AggregationContext context,
            Aggregator parent,
            SubAggCollectionMode subAggCollectMode,
            boolean showTermDocCountError,
            CardinalityUpperBound cardinality,
            Map<String, Object> metadata
        ) throws IOException {

            assert valuesSourceConfig.getValuesSource() instanceof ValuesSource.Bytes.WithOrdinals;
            ValuesSource.Bytes.WithOrdinals ordinalsValuesSource = (ValuesSource.Bytes.WithOrdinals) valuesSourceConfig
                .getValuesSource();
            SortedSetDocValues values = globalOrdsValues(context, ordinalsValuesSource);
            long maxOrd = values.getValueCount();
            // First choice: rewrite into filters, but only for a small, non-empty set of ordinals.
            if (maxOrd > 0
                && maxOrd <= MAX_ORDS_TO_TRY_FILTERS
                && context.enableRewriteToFilterByFilter()
                && false == context.isInSortOrderExecutionRequired()) {
                StringTermsAggregatorFromFilters adapted = StringTermsAggregatorFromFilters.adaptIntoFiltersOrNull(
                    name,
                    factories,
                    context,
                    parent,
                    showTermDocCountError,
                    cardinality,
                    metadata,
                    valuesSourceConfig,
                    order,
                    bucketCountThresholds,
                    gloabalOrdsFilter(includeExclude, valuesSourceConfig.format(), values),
                    values
                );
                if (adapted != null) {
                    /*
                     * We don't check the estimated cost here because we're
                     * fairly sure that any field that has global ordinals
                     * is going to be able to query fairly quickly. Mostly
                     * checking the cost is a defense against runtime fields
                     * which *have* queries but they are slow and have high
                     * cost. But runtime fields don't have global ords
                     * so we won't have got here anyway.
                     *
                     * It's totally possible that there might be a top level
                     * query that was generated by a runtime field. That
                     * query might indeed be slow, but we won't execute it
                     * any more times doing filter-by-filter then we would
                     * doing regular collection.
                     */
                    logger.debug("Using adapted fiter-by-filter implementation");
                    return adapted;
                }
            }

            // Ratio of distinct global ordinals to documents visible to the searcher.
            final double ratio = maxOrd / ((double) context.searcher().getIndexReader().numDocs());
            if (factories == AggregatorFactories.EMPTY
                && includeExclude == null
                && cardinality == CardinalityUpperBound.ONE
                && ordinalsValuesSource.supportsGlobalOrdinalsMapping()
                &&
                // we use the static COLLECT_SEGMENT_ORDS to allow tests to force specific optimizations
                (COLLECT_SEGMENT_ORDS != null ? COLLECT_SEGMENT_ORDS.booleanValue() : ratio <= 0.5 && maxOrd <= 2048)) {
                /*
                 * We can use the low cardinality execution mode iff this aggregator:
                 *  - has no sub-aggregator AND
                 *  - collects from a single bucket AND
                 *  - has a values source that can map from segment to global ordinals
                 *  - At least we reduce the number of global ordinals look-ups by half (ratio <= 0.5) AND
                 *  - the maximum global ordinal is less than 2048 (LOW_CARDINALITY has additional memory usage,
                 *  which directly linked to maxOrd, so we need to limit).
                 */
                logger.debug("Using low cardinality global ordinals implementation");
                return new GlobalOrdinalsStringTermsAggregator.LowCardinality(
                    name,
                    factories,
                    a -> a.new StandardTermsResults(),
                    ordinalsValuesSource,
                    values,
                    order,
                    valuesSourceConfig.format(),
                    bucketCountThresholds,
                    context,
                    parent,
                    false,
                    subAggCollectMode,
                    showTermDocCountError,
                    metadata
                );
            }

            boolean remapGlobalOrds;
            if (cardinality == CardinalityUpperBound.ONE && REMAP_GLOBAL_ORDS != null) {
                /*
                 * We use REMAP_GLOBAL_ORDS to allow tests to force
                 * specific optimizations but this particular one
                 * is only possible if we're collecting from a single
                 * bucket.
                 */
                remapGlobalOrds = REMAP_GLOBAL_ORDS.booleanValue();
            } else {
                remapGlobalOrds = true;
                if (includeExclude == null
                    && cardinality == CardinalityUpperBound.ONE
                    && (factories == AggregatorFactories.EMPTY
                        || (isAggregationSort(order) == false && subAggCollectMode == SubAggCollectionMode.BREADTH_FIRST))) {
                    /*
                     * We don't need to remap global ords iff this aggregator:
                     *    - has no include/exclude rules AND
                     *    - only collects from a single bucket AND
                     *    - has no sub-aggregator or only sub-aggregator that can be deferred
                     *      ({@link SubAggCollectionMode#BREADTH_FIRST}).
                     */
                    remapGlobalOrds = false;
                }
            }
            logger.debug("Using standard global ordinals implementation. remap is [{}]", remapGlobalOrds);
            return new GlobalOrdinalsStringTermsAggregator(
                name,
                factories,
                a -> a.new StandardTermsResults(),
                ordinalsValuesSource,
                values,
                order,
                valuesSourceConfig.format(),
                bucketCountThresholds,
                gloabalOrdsFilter(includeExclude, valuesSourceConfig.format(), values),
                context,
                parent,
                remapGlobalOrds,
                subAggCollectMode,
                showTermDocCountError,
                cardinality,
                metadata
            );
        }
    };

    /**
     * Resolves an {@code execution_hint} value to its execution mode.
     *
     * @param value the hint, expected to be non-null (callers in this file check for null first)
     * @throws IllegalArgumentException if {@code value} is neither {@code map} nor {@code global_ordinals}
     */
    public static ExecutionMode fromString(String value) {
        return switch (value) {
            case "global_ordinals" -> GLOBAL_ORDINALS;
            case "map" -> MAP;
            default -> throw new IllegalArgumentException(
                "Unknown `execution_hint`: [" + value + "], expected any of [map, global_ordinals]"
            );
        };
    }

    // Holds the name this mode is printed as by toString().
    private final ParseField parseField;

    ExecutionMode(ParseField parseField) {
        this.parseField = parseField;
    }

    /**
     * Builds the terms aggregator for this execution mode from the already resolved
     * request settings.
     */
    abstract Aggregator create(
        String name,
        AggregatorFactories factories,
        ValuesSourceConfig valuesSourceConfig,
        BucketOrder order,
        TermsAggregator.BucketCountThresholds bucketCountThresholds,
        IncludeExclude includeExclude,
        AggregationContext context,
        Aggregator parent,
        SubAggCollectionMode subAggCollectMode,
        boolean showTermDocCountError,
        CardinalityUpperBound cardinality,
        Map<String, Object> metadata
    ) throws IOException;

    /** Returns the preferred name of this mode's parse field. */
    @Override
    public String toString() {
        return parseField.getPreferredName();
    }
}
/**
 * Returns the global ordinals doc values of {@code valuesSource}, resolved against the
 * first leaf of the searcher's index reader, or an empty sorted set when the reader
 * has no leaves.
 */
public static SortedSetDocValues globalOrdsValues(AggregationContext context, ValuesSource.Bytes.WithOrdinals valuesSource)
    throws IOException {
    final IndexReader reader = context.searcher().getIndexReader();
    final boolean hasNoLeaves = reader.leaves().isEmpty();
    return hasNoLeaves ? DocValues.emptySortedSet() : valuesSource.globalOrdinalsValues(reader.leaves().get(0));
}
/**
 * Builds the predicate deciding which global ordinals are accepted. Without
 * include/exclude rules this is {@link GlobalOrdinalsStringTermsAggregator#ALWAYS_TRUE};
 * otherwise the accepted ordinals are computed once from {@code values} and the
 * predicate looks each ordinal up in that result.
 */
public static LongPredicate gloabalOrdsFilter(IncludeExclude includeExclude, DocValueFormat format, SortedSetDocValues values)
    throws IOException {
    return includeExclude == null
        ? GlobalOrdinalsStringTermsAggregator.ALWAYS_TRUE
        : includeExclude.convertToOrdinalsFilter(format).acceptedGlobalOrdinals(values)::get;
}
}