/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.search.aggregations.bucket.filter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.CheckedSupplier;
import org.elasticsearch.core.CheckedFunction;
import org.elasticsearch.search.aggregations.AdaptingAggregator;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
import org.elasticsearch.search.aggregations.LeafBucketCollector;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.runtime.AbstractScriptFieldQuery;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
/**
 * Collects results by running each filter against the searcher rather than
 * building any {@link LeafBucketCollector}s, which is generally faster than
 * {@link Compatible} but can't be used when there is a parent aggregator
 * or any child aggregators.
*/
public class FilterByFilterAggregator extends FiltersAggregator {
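    /*
     * For orientation, this is the fast path behind a top-level "filters"
     * aggregation with no parent and no sub-aggregations (and what "range",
     * "terms", and "date_histogram" can rewrite themselves into). A sketch
     * of a request that can take it; the index and field names are made up:
     *
     *   POST /logs/_search
     *   {
     *     "size": 0,
     *     "aggs": {
     *       "status": {
     *         "filters": {
     *           "filters": {
     *             "ok":  { "term":  { "code": 200 } },
     *             "err": { "range": { "code": { "gte": 500 } } }
     *           }
     *         }
     *       }
     *     }
     *   }
     *
     * Each named filter is merged with the top-level query and run directly
     * against each segment instead of collecting documents one by one.
     */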
/**
* Builds {@link FilterByFilterAggregator} when the filters are valid and
* it would be faster than a "native" aggregation implementation. The
* interface is designed to allow easy construction of
* {@link AdaptingAggregator}.
*/
    public abstract static class AdapterBuilder<T> {
private final String name;
        private final List<QueryToFilterAdapter<?>> filters = new ArrayList<>();
private final boolean keyed;
private final AggregationContext context;
private final Aggregator parent;
private final CardinalityUpperBound cardinality;
        private final Map<String, Object> metadata;
private final Query rewrittenTopLevelQuery;
private boolean valid = true;
public AdapterBuilder(
String name,
boolean keyed,
String otherBucketKey,
AggregationContext context,
Aggregator parent,
CardinalityUpperBound cardinality,
            Map<String, Object> metadata
) throws IOException {
this.name = name;
this.keyed = keyed;
this.context = context;
this.parent = parent;
this.cardinality = cardinality;
this.metadata = metadata;
this.rewrittenTopLevelQuery = context.searcher().rewrite(context.query());
this.valid = parent == null && otherBucketKey == null;
}
/**
* Subclasses should override this to adapt the
* {@link FilterByFilterAggregator} into another sort of aggregator
* if required.
*/
        protected abstract T adapt(CheckedFunction<AggregatorFactories, FilterByFilterAggregator, IOException> delegate)
            throws IOException;
public final void add(String key, Query query) throws IOException {
if (valid == false) {
return;
}
if (query instanceof AbstractScriptFieldQuery) {
/*
* We know that runtime fields aren't fast to query at all
* but we expect all other sorts of queries are at least as
* fast as the native aggregator.
*/
valid = false;
return;
}
add(QueryToFilterAdapter.build(context.searcher(), key, query));
}
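        /*
         * For example (a sketch; the key, field, and variable names are
         * made up): a term query on an indexed field keeps this rewrite
         * alive, while the same logical query against a runtime field
         * arrives here as a subclass of AbstractScriptFieldQuery, such as
         * StringScriptFieldTermQuery, and permanently flips valid to false:
         *
         *   builder.add("errors", new TermQuery(new Term("status", "500"))); // still valid
         *   builder.add("errors", someRuntimeFieldQuery);                    // valid = false
         */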
        final void add(QueryToFilterAdapter<?> filter) throws IOException {
if (valid == false) {
return;
}
            QueryToFilterAdapter<?> mergedFilter = filter.union(rewrittenTopLevelQuery);
if (mergedFilter.isInefficientUnion()) {
/*
* For now any complex union kicks us out of filter by filter
                 * mode. It's possible that this de-optimizes many "filters"
                 * aggregations but is likely correct when "range", "date_histogram",
                 * or "terms" are converted to this agg. We investigated a sort
                 * of "combined" iteration mechanism and it's complex *and* slower
* than the native implementations of the aggs above.
*/
valid = false;
return;
}
if (filters.size() == 1) {
/*
* When we add the second filter we check if there are any _doc_count
                 * fields and bail out of filter-by-filter mode if there are. _doc_count
* fields are expensive to decode and the overhead of iterating per
* filter causes us to decode doc counts over and over again.
*/
if (context.hasDocCountField()) {
valid = false;
return;
}
}
filters.add(mergedFilter);
}
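        /*
         * Conceptually union(...) folds the rewritten top-level query into
         * the filter so a single Lucene query answers "matches this filter
         * AND matches the search". A rough sketch of the idea (the real
         * rewrite lives in QueryToFilterAdapter and can often do better,
         * e.g. when the top-level query matches all documents):
         *
         *   Query merged = new BooleanQuery.Builder()
         *       .add(filterQuery, BooleanClause.Occur.MUST)
         *       .add(topLevelQuery, BooleanClause.Occur.MUST)
         *       .build();
         *
         * When the merged query could only run as a general purpose
         * conjunction (isInefficientUnion) this mode loses to the native
         * aggregator, so we bail out instead of de-optimizing.
         */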
/**
         * Build the adapter or {@code null} if this isn't a valid rewrite.
*/
public final T build() throws IOException {
if (false == valid) {
return null;
}
            class AdapterBuild implements CheckedFunction<AggregatorFactories, FilterByFilterAggregator, IOException> {
private FilterByFilterAggregator agg;
@Override
public FilterByFilterAggregator apply(AggregatorFactories subAggregators) throws IOException {
agg = new FilterByFilterAggregator(name, subAggregators, filters, keyed, context, parent, cardinality, metadata);
return agg;
}
}
AdapterBuild adapterBuild = new AdapterBuild();
T result = adapt(adapterBuild);
if (adapterBuild.agg.scoreMode().needsScores()) {
/*
* Filter by filter won't produce the correct results if the
* sub-aggregators need scores because we're not careful with how
* we merge filters. Right now we have to build the whole
* aggregation in order to know if it'll need scores or not.
* This means we'll build the *sub-aggs* too. Oh well.
*/
return null;
}
return result;
}
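        /*
         * A sketch of how a caller is expected to drive this builder; the
         * subclass and helper names below are hypothetical, not part of
         * this file. Add one filter per bucket, then treat null from
         * build() as "fall back to the native implementation":
         *
         *   class RangeAdapterBuilder extends AdapterBuilder<Aggregator> {
         *       RangeAdapterBuilder(String name, ...) throws IOException {
         *           super(name, false, null, context, parent, cardinality, metadata);
         *       }
         *
         *       @Override
         *       protected Aggregator adapt(
         *           CheckedFunction<AggregatorFactories, FilterByFilterAggregator, IOException> delegate
         *       ) throws IOException {
         *           // Wrap the delegate so buckets come back out in range order.
         *           return new RangeAdaptingAggregator(delegate);
         *       }
         *   }
         *
         *   AdapterBuilder<Aggregator> builder = new RangeAdapterBuilder(...);
         *   for (Range r : ranges) {
         *       builder.add(r.key(), r.toQuery());
         *   }
         *   Aggregator agg = builder.build();
         *   if (agg == null) {
         *       agg = buildNativeRangeAggregator(); // hypothetical fallback
         *   }
         */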
}
/**
     * Count of segments with "live" docs. These show up both for deleted
     * docs and for docs hidden by document level security.
*/
private int segmentsWithDeletedDocs;
/**
     * Count of segments whose documents had to consult the {@code doc_count}
* field.
*/
private int segmentsWithDocCountField;
/**
* Count of segments this aggregator performed a document by document
* collection for. We have to collect when there are sub-aggregations
* and it disables some optimizations we can make while just counting.
*/
private int segmentsCollected;
/**
* Count of segments this aggregator counted. We can count when there
* aren't any sub-aggregators and we have some counting optimizations
* that don't apply to document by document collections.
*
* But the "fallback" for counting when we don't have a fancy optimization
* is to perform document by document collection and increment a counter
* on each document. This fallback does not increment the
* {@link #segmentsCollected} counter and does increment
* the {@link #segmentsCounted} counter because those counters are to
* signal which operation we were allowed to perform. The filters
* themselves will have debugging counters measuring if they could
* perform the count from metadata or had to fall back.
*/
private int segmentsCounted;
/**
* Build the aggregation. Private to force callers to go through the
* {@link AdapterBuilder} which centralizes the logic to decide if this
* aggregator would be faster than the native implementation.
*/
private FilterByFilterAggregator(
String name,
AggregatorFactories factories,
        List<QueryToFilterAdapter<?>> filters,
boolean keyed,
AggregationContext context,
Aggregator parent,
CardinalityUpperBound cardinality,
        Map<String, Object> metadata
) throws IOException {
super(name, factories, filters, keyed, null, context, parent, cardinality, metadata);
}
/**
* Instead of returning a {@link LeafBucketCollector} we do the
* collection ourselves by running the filters directly. This is safe
* because we only use this aggregator if there isn't a {@code parent}
* which would change how we collect buckets and because we take the
* top level query into account when building the filters.
*/
@Override
protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
assert scoreMode().needsScores() == false;
if (filters().size() == 0) {
return LeafBucketCollector.NO_OP_COLLECTOR;
}
        Bits live = ctx.reader().getLiveDocs();
        if (live != null) {
            segmentsWithDeletedDocs++;
        }
        if (false == docCountProvider.alwaysOne()) {
            segmentsWithDocCountField++;
        }
if (subAggregators.length == 0) {
            // TODO we'd be better off if we could do sub.isNoop() or something.
/*
* Without sub.isNoop we always end up in the `collectXXX` modes even if
* the sub-aggregators opt out of traditional collection.
*/
segmentsCounted++;
collectCount(ctx, live);
} else {
segmentsCollected++;
collectSubs(ctx, live, sub);
}
return LeafBucketCollector.NO_OP_COLLECTOR;
}
/**
* Gather a count of the number of documents that match each filter
* without sending any documents to a sub-aggregator. This yields
* the correct response when there aren't any sub-aggregators or they
* all opt out of needing any sort of collection.
*/
private void collectCount(LeafReaderContext ctx, Bits live) throws IOException {
Counter counter = new Counter(docCountProvider);
for (int filterOrd = 0; filterOrd < filters().size(); filterOrd++) {
incrementBucketDocCount(filterOrd, filters().get(filterOrd).count(ctx, counter, live));
}
}
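    /*
     * A sketch of the shortcut counting enables (hypothetical; the real
     * logic lives in QueryToFilterAdapter): with no sub-aggs, no deleted
     * docs, and no _doc_count field, a filter can often take its total
     * straight from Lucene without visiting a single document, falling
     * back to iteration only when Lucene can't precompute the answer:
     *
     *   int count = weight.count(ctx);      // Lucene's Weight#count, -1 means "unknown"
     *   if (count == -1) {
     *       count = iterateAndCount(ctx, counter, live); // hypothetical slow path
     *   }
     */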
/**
* Collect all documents that match all filters and send them to
* the sub-aggregators. This method is only required when there are
* sub-aggregators that haven't opted out of being collected.
*
* This collects each filter one at a time, resetting the
* sub-aggregators between each filter as though they were hitting
* a fresh segment.
*
     * It's very tempting to try to collect the
     * filters into blocks of matches and then replay the whole block
     * in ascending order without the resetting. That'd probably
     * work better if the disk were very, very slow and we didn't have
     * any kind of disk caching. But with disk caching it's about twice
     * as fast to collect each filter one by one like this. And it uses
     * less memory because there isn't a need to buffer a block of matches.
     * And it's a hell of a lot less code.
*/
private void collectSubs(LeafReaderContext ctx, Bits live, LeafBucketCollector sub) throws IOException {
class MatchCollector implements LeafCollector {
LeafBucketCollector subCollector = sub;
int filterOrd;
@Override
public void collect(int docId) throws IOException {
collectBucket(subCollector, docId, filterOrd);
}
@Override
public void setScorer(Scorable scorer) throws IOException {}
}
MatchCollector collector = new MatchCollector();
filters().get(0).collect(ctx, collector, live);
for (int filterOrd = 1; filterOrd < filters().size(); filterOrd++) {
collector.subCollector = collectableSubAggregators.getLeafCollector(ctx);
collector.filterOrd = filterOrd;
filters().get(filterOrd).collect(ctx, collector, live);
}
}
@Override
    public void collectDebugInfo(BiConsumer<String, Object> add) {
super.collectDebugInfo(add);
add.accept("segments_counted", segmentsCounted);
add.accept("segments_collected", segmentsCollected);
add.accept("segments_with_deleted_docs", segmentsWithDeletedDocs);
add.accept("segments_with_doc_count_field", segmentsWithDocCountField);
}
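    /*
     * With "profile": true these counters surface in the aggregation's
     * "debug" section of the response, roughly like this (the values are
     * illustrative):
     *
     *   "debug": {
     *     "segments_counted": 7,
     *     "segments_collected": 0,
     *     "segments_with_deleted_docs": 1,
     *     "segments_with_doc_count_field": 0
     *   }
     */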
    /**
     * Returns a lazily-memoized check of whether this segment's counts can
     * be taken from index metadata. That's only safe when the segment has
     * no deleted or filtered docs and no {@code _doc_count} field.
     */
    CheckedSupplier<Boolean, IOException> canUseMetadata(LeafReaderContext ctx) {
        return new CheckedSupplier<Boolean, IOException>() {
Boolean canUse;
@Override
public Boolean get() throws IOException {
if (canUse == null) {
canUse = canUse();
}
return canUse;
}
private boolean canUse() throws IOException {
if (ctx.reader().getLiveDocs() != null) {
return false;
}
docCountProvider.setLeafReaderContext(ctx);
return docCountProvider.alwaysOne();
}
};
}
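    /*
     * A minimal usage sketch (the calling code here is hypothetical): the
     * supplier memoizes its answer, so a filter can probe it repeatedly
     * within a segment and pay for the live-docs and doc-count checks at
     * most once.
     *
     *   CheckedSupplier<Boolean, IOException> canUse = canUseMetadata(ctx);
     *   if (canUse.get()) {
     *       // safe to count from index metadata without visiting docs
     *   } else {
     *       // must iterate the matching documents
     *   }
     */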
}