All downloads are free. Search and download functionality uses the official Maven repository.

org.elasticsearch.index.query.IntervalBuilder Maven / Gradle / Ivy

There is a newer version: 8.14.0
Show newest version
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.index.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.IntervalMatchesIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

/**
 * Constructs an IntervalsSource based on analyzed text
 */
public abstract class IntervalBuilder {

    private final String field;
    private final Analyzer analyzer;

    /**
     * @param field    the field that intervals are built against
     * @param analyzer the analyzer used to tokenize query text
     */
    public IntervalBuilder(String field, Analyzer analyzer) {
        this.field = field;
        this.analyzer = analyzer;
    }

    /** Create term intervals for the provided term. */
    protected abstract IntervalsSource termIntervals(BytesRef term);

    /**
     * Analyzes the given query text with this builder's analyzer and returns a
     * matching {@link IntervalsSource}.
     *
     * @param query   the text to analyze
     * @param maxGaps the maximum number of gaps allowed between sub-sources, or {@code -1} for no limit
     * @param ordered whether sub-sources must match in order
     * @throws IOException if the analyzer fails
     */
    public IntervalsSource analyzeText(String query, int maxGaps, boolean ordered) throws IOException {
        // CachingTokenFilter lets analyzeText(CachingTokenFilter, ...) make two passes over the stream
        try (TokenStream ts = analyzer.tokenStream(field, query); CachingTokenFilter stream = new CachingTokenFilter(ts)) {
            return analyzeText(stream, maxGaps, ordered);
        }
    }

    /**
     * Builds an {@link IntervalsSource} from an already-wrapped token stream.
     * Makes one pass to classify the stream (token count, stacked synonyms,
     * graph tokens) and a second pass to build the appropriate source.
     */
    protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered) throws IOException {

        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);

        // a stream with no term attribute can never produce terms
        if (termAtt == null) {
            return NO_INTERVALS;
        }

        // phase 1: read through the stream and assess the situation:
        // counting the number of tokens/positions and marking if we have any synonyms.

        int numTokens = 0;
        boolean hasSynonyms = false;
        boolean isGraph = false;

        stream.reset();
        while (stream.incrementToken()) {
            numTokens++;
            int positionIncrement = posIncAtt.getPositionIncrement();
            if (positionIncrement == 0) {
                // token stacked on the previous position => single-term synonym
                hasSynonyms = true;
            }
            int positionLength = posLenAtt.getPositionLength();
            if (positionLength > 1) {
                // token spans multiple positions => token graph (e.g. multi-word synonym)
                isGraph = true;
            }
        }

        // phase 2: based on token count, presence of synonyms, and options
        // formulate a single term, boolean, or phrase.

        if (numTokens == 0) {
            return NO_INTERVALS;
        } else if (numTokens == 1) {
            // single term
            return analyzeTerm(stream);
        } else if (isGraph) {
            // graph
            return combineSources(analyzeGraph(stream), maxGaps, ordered);
        } else {
            // phrase
            if (hasSynonyms) {
                // phrase with single-term synonyms
                return analyzeSynonyms(stream, maxGaps, ordered);
            } else {
                // simple phrase
                return combineSources(analyzeTerms(stream), maxGaps, ordered);
            }
        }

    }

    /** Builds a source for the single term in the stream. */
    protected IntervalsSource analyzeTerm(TokenStream ts) throws IOException {
        TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        ts.reset();
        ts.incrementToken();
        // deep-copy: the attribute's BytesRef is reused by the stream
        return termIntervals(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
    }

    /**
     * Combines a list of sources into a single phrase/ordered/unordered source,
     * honouring the {@code maxGaps} and {@code ordered} options.
     */
    protected static IntervalsSource combineSources(List<IntervalsSource> sources, int maxGaps, boolean ordered) {
        if (sources.isEmpty()) {
            return NO_INTERVALS;
        }
        if (sources.size() == 1) {
            return sources.get(0);
        }
        IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]);
        if (maxGaps == 0 && ordered) {
            // an exact ordered match with no gaps is a phrase
            return Intervals.phrase(sourcesArray);
        }
        IntervalsSource inner = ordered ? Intervals.ordered(sourcesArray) : Intervals.unordered(sourcesArray);
        if (maxGaps == -1) {
            // no gap restriction requested
            return inner;
        }
        return Intervals.maxgaps(maxGaps, inner);
    }

    /** Builds one source per token, preserving position gaps via {@link #extend}. */
    protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
        List<IntervalsSource> terms = new ArrayList<>();
        TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            BytesRef term = bytesAtt.getBytesRef();
            // increments > 1 indicate holes (e.g. removed stopwords) before this token
            int precedingSpaces = posAtt.getPositionIncrement() - 1;
            terms.add(extend(termIntervals(BytesRef.deepCopyOf(term)), precedingSpaces));
        }
        ts.end();
        return terms;
    }

    /** Pads a source on the left to account for position holes before it. */
    public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
        if (precedingSpaces == 0) {
            return source;
        }
        return Intervals.extend(source, precedingSpaces, 0);
    }

    /**
     * Builds a source for a phrase containing single-term synonyms: tokens at
     * the same position are OR'ed together, then positions are combined.
     */
    protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
        List<IntervalsSource> terms = new ArrayList<>();
        List<IntervalsSource> synonyms = new ArrayList<>();
        TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        int spaces = 0;
        while (ts.incrementToken()) {
            int posInc = posAtt.getPositionIncrement();
            if (posInc > 0) {
                // position advanced: flush the synonyms collected at the previous position
                if (synonyms.size() == 1) {
                    terms.add(extend(synonyms.get(0), spaces));
                } else if (synonyms.size() > 1) {
                    terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
                }
                synonyms.clear();
                spaces = posInc - 1;
            }
            synonyms.add(termIntervals(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
        }
        // flush the final position
        if (synonyms.size() == 1) {
            terms.add(extend(synonyms.get(0), spaces));
        } else {
            terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
        }
        return combineSources(terms, maxGaps, ordered);
    }

    /**
     * Builds sources for a token graph: the graph is split at its articulation
     * points; side-path segments become disjunctions of their finite strings,
     * linear segments are analyzed as plain term lists.
     */
    protected List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException {
        source.reset();
        GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);

        List<IntervalsSource> clauses = new ArrayList<>();
        int[] articulationPoints = graph.articulationPoints();
        int lastState = 0;
        int maxClauseCount = BooleanQuery.getMaxClauseCount();
        for (int i = 0; i <= articulationPoints.length; i++) {
            int start = lastState;
            int end = -1;   // -1 => run to the end of the graph
            if (i < articulationPoints.length) {
                end = articulationPoints[i];
            }
            lastState = end;
            if (graph.hasSidePath(start)) {
                // multiple paths between articulation points: enumerate and OR the phrases
                List<IntervalsSource> paths = new ArrayList<>();
                Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
                while (it.hasNext()) {
                    TokenStream ts = it.next();
                    IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true);
                    if (paths.size() >= maxClauseCount) {
                        // guard against combinatorial explosion of graph paths
                        throw new BooleanQuery.TooManyClauses();
                    }
                    paths.add(phrase);
                }
                if (paths.size() > 0) {
                    clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0])));
                }
            } else {
                // linear segment: exactly one finite string expected
                Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
                TokenStream ts = it.next();
                clauses.addAll(analyzeTerms(ts));
                assert it.hasNext() == false;
            }
        }
        return clauses;
    }

    /** A source that matches nothing; used when analysis yields no tokens. */
    static final IntervalsSource NO_INTERVALS = new IntervalsSource() {

        @Override
        public IntervalIterator intervals(String field, LeafReaderContext ctx) {
            return new IntervalIterator() {
                @Override
                public int start() {
                    return NO_MORE_INTERVALS;
                }

                @Override
                public int end() {
                    return NO_MORE_INTERVALS;
                }

                @Override
                public int gaps() {
                    throw new UnsupportedOperationException();
                }

                @Override
                public int nextInterval() {
                    return NO_MORE_INTERVALS;
                }

                @Override
                public float matchCost() {
                    return 0;
                }

                @Override
                public int docID() {
                    return NO_MORE_DOCS;
                }

                @Override
                public int nextDoc() {
                    return NO_MORE_DOCS;
                }

                @Override
                public int advance(int target) {
                    return NO_MORE_DOCS;
                }

                @Override
                public long cost() {
                    return 0;
                }
            };
        }

        @Override
        public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) {
            return null;    // never matches
        }

        @Override
        public void visit(String field, QueryVisitor visitor) {}

        @Override
        public int minExtent() {
            return 0;
        }

        @Override
        public Collection<IntervalsSource> pullUpDisjunctions() {
            return Collections.emptyList();
        }

        @Override
        public int hashCode() {
            return 0;
        }

        @Override
        public boolean equals(Object other) {
            return other == this;
        }

        @Override
        public String toString() {
            return "no_match";
        }
    };

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy