
org.elasticsearch.search.aggregations.bucket.terms.SignificantTextAggregatorFactory Maven / Gradle / Ivy

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.search.aggregations.bucket.terms;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.DeDuplicatingTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DuplicateByteSequenceSpotter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BytesRefHash;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.LeafBucketCollector;
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
import org.elasticsearch.search.aggregations.NonCollectingAggregator;
import org.elasticsearch.search.aggregations.bucket.BucketUtils;
import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.StringFilter;
import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectConsumer;
import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectorSource;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.lookup.SourceLookup;
import org.elasticsearch.search.profile.Timer;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.LongConsumer;

public class SignificantTextAggregatorFactory extends AggregatorFactory {
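    // The duplicate-text spotter builds an in-memory trie while scanning tokens; its growth
    // is reported to the request circuit breaker in increments of at least this many bytes
    // (see processTokenStream below) rather than on every token.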
    private static final int MEMORY_GROWTH_REPORTING_INTERVAL_BYTES = 5000;

    private final IncludeExclude includeExclude;
    private final MappedFieldType fieldType;
    private final String[] sourceFieldNames;
    private final QueryBuilder backgroundFilter;
    private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
    private final SignificanceHeuristic significanceHeuristic;
    private final boolean filterDuplicateText;

    public SignificantTextAggregatorFactory(
        String name,
        IncludeExclude includeExclude,
        QueryBuilder backgroundFilter,
        TermsAggregator.BucketCountThresholds bucketCountThresholds,
        SignificanceHeuristic significanceHeuristic,
        AggregationContext context,
        AggregatorFactory parent,
        AggregatorFactories.Builder subFactoriesBuilder,
        String fieldName,
        String[] sourceFieldNames,
        boolean filterDuplicateText,
        Map<String, Object> metadata
    ) throws IOException {
        super(name, context, parent, subFactoriesBuilder, metadata);

        this.fieldType = context.getFieldType(fieldName);
        if (fieldType != null) {
            if (supportsAgg(fieldType) == false) {
                throw new IllegalArgumentException(
                    "Field [" + fieldType.name() + "] has no analyzer, but SignificantText " + "requires an analyzed field"
                );
            }
            String indexedFieldName = fieldType.name();
            this.sourceFieldNames = sourceFieldNames == null ? new String[] { indexedFieldName } : sourceFieldNames;
        } else {
            this.sourceFieldNames = new String[0];
        }

        this.includeExclude = includeExclude;
        this.backgroundFilter = backgroundFilter;
        this.filterDuplicateText = filterDuplicateText;
        this.bucketCountThresholds = bucketCountThresholds;
        this.significanceHeuristic = significanceHeuristic;
    }

    protected Aggregator createUnmapped(Aggregator parent, Map<String, Object> metadata) throws IOException {
        final InternalAggregation aggregation = new UnmappedSignificantTerms(
            name,
            bucketCountThresholds.getRequiredSize(),
            bucketCountThresholds.getMinDocCount(),
            metadata
        );
        return new NonCollectingAggregator(name, context, parent, factories, metadata) {
            @Override
            public InternalAggregation buildEmptyAggregation() {
                return aggregation;
            }
        };
    }

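    /**
     * The aggregation can only run over fields whose values are analyzed into indexed terms;
     * fields with no text search support, or match-only fields without indexed terms, are
     * rejected.
     */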
    private static boolean supportsAgg(MappedFieldType ft) {
        return ft.getTextSearchInfo() != TextSearchInfo.NONE && ft.getTextSearchInfo() != TextSearchInfo.SIMPLE_MATCH_WITHOUT_TERMS;
    }

    @Override
    protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map<String, Object> metadata)
        throws IOException {

        if (fieldType == null) {
            return createUnmapped(parent, metadata);
        }

        BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(this.bucketCountThresholds);
        if (bucketCountThresholds.getShardSize() == SignificantTextAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) {
            // The user has not made a shardSize selection.
            // Use default heuristic to avoid any wrong-ranking caused by
            // distributed counting but request double the usual amount.
            // We typically need more than the number of "top" terms requested
            // by other aggregations as the significance algorithm is in less
            // of a position to down-select at shard-level - some of the things
            // we want to find have only one occurrence on each shard and as
            // such are impossible to differentiate from non-significant terms
            // at that early stage.
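            // For example, with the default required size of 10 this would request
            // 2 * suggestShardSideQueueSize(10) = 2 * 25 = 50 candidate terms from each
            // shard (assuming the usual size * 1.5 + 10 heuristic in BucketUtils).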
            bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize()));
        }

        // TODO - need to check with mapping that this is indeed a text field....

        final IncludeExclude.StringFilter incExcFilter = includeExclude == null
            ? null
            : includeExclude.convertToStringFilter(DocValueFormat.RAW);

        final SignificanceLookup lookup = new SignificanceLookup(context, fieldType, DocValueFormat.RAW, backgroundFilter);
        final CollectorSource collectorSource = createCollectorSource();
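        // If building the aggregator below throws, we must release the collector source
        // ourselves; once construction succeeds the aggregator owns it and will close it.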
        boolean success = false;
        try {
            final MapStringTermsAggregator mapStringTermsAggregator = new MapStringTermsAggregator(
                name,
                factories,
                collectorSource,
                a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality),
                null,
                DocValueFormat.RAW,
                bucketCountThresholds,
                incExcFilter,
                context,
                parent,
                SubAggCollectionMode.BREADTH_FIRST,
                false,
                cardinality,
                metadata
            );
            success = true;
            return mapStringTermsAggregator;
        } finally {
            if (success == false) {
                Releasables.close(collectorSource);
            }
        }
    }

    /**
     * Create the {@link CollectorSource}, gathering some timing information
     * if we're profiling.
     * <p>
     * When profiling aggregations we time the {@link LeafBucketCollector#collect(int, long)}
     * method out of the box, but our implementation of that method does three things that
     * are useful to get timing for:
     * <ul>
     * <li>Fetch field values from {@code _source}</li>
     * <li>Analyze the field</li>
     * <li>Do all the normal {@code terms} agg stuff with its terms</li>
     * </ul>
     * <p>
     * The most convenient way to measure all of these is to time the fetch and all
     * the normal {@code terms} agg stuff. You can then subtract those timings from
     * the overall collect time to get the analyze time. You can also get the total
     * number of terms that we analyzed by looking at the invocation count on the
     * {@code terms} agg stuff.
     * <p>
     * While we're at it we count the number of values we fetch from source.
     */
    private CollectorSource createCollectorSource() {
        Analyzer analyzer = context.getIndexAnalyzer(f -> {
            throw new IllegalArgumentException("No analyzer configured for field " + f);
        });
        if (context.profiling()) {
            return new ProfilingSignificantTextCollectorSource(
                context.lookup().source(),
                context.bigArrays(),
                fieldType,
                analyzer,
                sourceFieldNames,
                filterDuplicateText
            );
        }
        return new SignificantTextCollectorSource(
            context.lookup().source(),
            context.bigArrays(),
            fieldType,
            analyzer,
            sourceFieldNames,
            filterDuplicateText
        );
    }

    private static class SignificantTextCollectorSource implements MapStringTermsAggregator.CollectorSource {
        private final SourceLookup sourceLookup;
        private final BigArrays bigArrays;
        private final MappedFieldType fieldType;
        private final Analyzer analyzer;
        private final String[] sourceFieldNames;
        private final BytesRefBuilder scratch = new BytesRefBuilder();
        // One duplicate-sequence spotter per owning bucket ordinal, or null when
        // duplicate-text filtering is disabled.
        private ObjectArray<DuplicateByteSequenceSpotter> dupSequenceSpotters;

        SignificantTextCollectorSource(
            SourceLookup sourceLookup,
            BigArrays bigArrays,
            MappedFieldType fieldType,
            Analyzer analyzer,
            String[] sourceFieldNames,
            boolean filterDuplicateText
        ) {
            this.sourceLookup = sourceLookup;
            this.bigArrays = bigArrays;
            this.fieldType = fieldType;
            this.analyzer = analyzer;
            this.sourceFieldNames = sourceFieldNames;
            dupSequenceSpotters = filterDuplicateText ? bigArrays.newObjectArray(1) : null;
        }

        @Override
        public String describe() {
            return "analyze " + fieldType.name() + " from _source";
        }

        @Override
        public void collectDebugInfo(BiConsumer<String, Object> add) {}

        @Override
        public boolean needsScores() {
            return false;
        }

        @Override
        public LeafBucketCollector getLeafCollector(
            StringFilter includeExclude,
            LeafReaderContext ctx,
            LeafBucketCollector sub,
            LongConsumer addRequestCircuitBreakerBytes,
            CollectConsumer consumer
        ) throws IOException {
            return new LeafBucketCollectorBase(sub, null) {
                @Override
                public void collect(int doc, long owningBucketOrd) throws IOException {
                    if (dupSequenceSpotters == null) {
                        collectFromSource(doc, owningBucketOrd, null);
                        return;
                    }
                    // Lazily create one spotter per owning bucket ordinal.
                    dupSequenceSpotters = bigArrays.grow(dupSequenceSpotters, owningBucketOrd + 1);
                    DuplicateByteSequenceSpotter spotter = dupSequenceSpotters.get(owningBucketOrd);
                    if (spotter == null) {
                        spotter = new DuplicateByteSequenceSpotter();
                        dupSequenceSpotters.set(owningBucketOrd, spotter);
                    }
                    collectFromSource(doc, owningBucketOrd, spotter);
                    spotter.startNewSequence();
                }

                private void collectFromSource(int doc, long owningBucketOrd, DuplicateByteSequenceSpotter spotter) throws IOException {
                    sourceLookup.setSegmentAndDocument(ctx, doc);
                    BytesRefHash inDocTerms = new BytesRefHash(256, bigArrays);
                    try {
                        for (String sourceField : sourceFieldNames) {
                            Iterator<String> itr = extractRawValues(sourceField).stream().map(obj -> {
                                if (obj == null) {
                                    return null;
                                }
                                if (obj instanceof BytesRef) {
                                    return fieldType.valueForDisplay(obj).toString();
                                }
                                return obj.toString();
                            }).iterator();
                            while (itr.hasNext()) {
                                String text = itr.next();
                                TokenStream ts = analyzer.tokenStream(fieldType.name(), text);
                                processTokenStream(
                                    includeExclude,
                                    doc,
                                    owningBucketOrd,
                                    text,
                                    ts,
                                    inDocTerms,
                                    spotter,
                                    addRequestCircuitBreakerBytes,
                                    sub,
                                    consumer
                                );
                            }
                        }
                    } finally {
                        Releasables.close(inDocTerms);
                    }
                }
            };
        }

        protected void processTokenStream(
            StringFilter includeExclude,
            int doc,
            long owningBucketOrd,
            String text,
            TokenStream ts,
            BytesRefHash inDocTerms,
            DuplicateByteSequenceSpotter spotter,
            LongConsumer addRequestCircuitBreakerBytes,
            LeafBucketCollector sub,
            CollectConsumer consumer
        ) throws IOException {
            long lastTrieSize = 0;
            if (spotter != null) {
                lastTrieSize = spotter.getEstimatedSizeInBytes();
                ts = new DeDuplicatingTokenFilter(ts, spotter);
            }
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            try {
                while (ts.incrementToken()) {
                    if (spotter != null) {
                        long newTrieSize = spotter.getEstimatedSizeInBytes();
                        long growth = newTrieSize - lastTrieSize;
                        // Only update the circuit breaker once the trie has grown past the
                        // reporting interval, so we don't pay the accounting cost per token.
                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
                            addRequestCircuitBreakerBytes.accept(growth);
                            lastTrieSize = newTrieSize;
                        }
                    }
                    scratch.clear();
                    scratch.copyChars(termAtt);
                    BytesRef bytes = scratch.get();
                    if (includeExclude != null && false == includeExclude.accept(bytes)) {
                        continue;
                    }
                    // Count each term at most once per document.
                    if (inDocTerms.add(bytes) < 0) {
                        continue;
                    }
                    consumer.accept(sub, doc, owningBucketOrd, bytes);
                }
            } finally {
                ts.close();
            }
            if (spotter != null) {
                // Report any remaining unreported trie growth.
                long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize;
                if (growth > 0) {
                    addRequestCircuitBreakerBytes.accept(growth);
                }
            }
        }

        /**
         * Extract values from {@code _source}.
         */
        protected List<Object> extractRawValues(String field) {
            return sourceLookup.extractRawValues(field);
        }

        @Override
        public void close() {
            Releasables.close(dupSequenceSpotters);
        }
    }

    private static class ProfilingSignificantTextCollectorSource extends SignificantTextCollectorSource {
        private final Timer extract = new Timer();
        private final Timer collectAnalyzed = new Timer();
        private long valuesFetched;
        private long charsFetched;

        private ProfilingSignificantTextCollectorSource(
            SourceLookup sourceLookup,
            BigArrays bigArrays,
            MappedFieldType fieldType,
            Analyzer analyzer,
            String[] sourceFieldNames,
            boolean filterDuplicateText
        ) {
            super(sourceLookup, bigArrays, fieldType, analyzer, sourceFieldNames, filterDuplicateText);
        }

        @Override
        protected void processTokenStream(
            StringFilter includeExclude,
            int doc,
            long owningBucketOrd,
            String text,
            TokenStream ts,
            BytesRefHash inDocTerms,
            DuplicateByteSequenceSpotter spotter,
            LongConsumer addRequestCircuitBreakerBytes,
            LeafBucketCollector sub,
            CollectConsumer consumer
        ) throws IOException {
            valuesFetched++;
            charsFetched += text.length();
            super.processTokenStream(
                includeExclude,
                doc,
                owningBucketOrd,
                text,
                ts,
                inDocTerms,
                spotter,
                addRequestCircuitBreakerBytes,
                sub,
                // Time just the terms-agg collection so analysis time can be derived by
                // subtracting this and the extract time from the overall collect time.
                (subCollector, d, o, bytes) -> {
                    collectAnalyzed.start();
                    try {
                        consumer.accept(subCollector, d, o, bytes);
                    } finally {
                        collectAnalyzed.stop();
                    }
                }
            );
        }

        @Override
        protected List<Object> extractRawValues(String field) {
            extract.start();
            try {
                return super.extractRawValues(field);
            } finally {
                extract.stop();
            }
        }

        @Override
        public void collectDebugInfo(BiConsumer<String, Object> add) {
            super.collectDebugInfo(add);
            add.accept("extract_ns", extract.getApproximateTiming());
            add.accept("extract_count", extract.getCount());
            add.accept("collect_analyzed_ns", collectAnalyzed.getApproximateTiming());
            add.accept("collect_analyzed_count", collectAnalyzed.getCount());
            add.accept("values_fetched", valuesFetched);
            add.accept("chars_fetched", charsFetched);
        }
    }
}
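
For context, here is a minimal sketch of how this factory is typically reached from the client side, assuming the standard AggregationBuilders entry point and a hypothetical index with an analyzed text field named "content"; the server-side search module parses the resulting request into a SignificantTextAggregatorFactory:

import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.SignificantTextAggregationBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class SignificantTextExample {
    public static void main(String[] args) {
        // "content" is a hypothetical analyzed text field; filterDuplicateText(true)
        // enables the DuplicateByteSequenceSpotter path shown above.
        SignificantTextAggregationBuilder agg = AggregationBuilders.significantText("keywords", "content")
            .filterDuplicateText(true)
            .size(10);

        // Attach the aggregation to a search body; printing it shows the JSON that a
        // significant_text request sends over the wire.
        SearchSourceBuilder source = new SearchSourceBuilder().size(0).aggregation(agg);
        System.out.println(source);
    }
}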