
org.elasticsearch.percolator.QueryAnalyzer


The percolator module adds the capability to index queries and to query those stored queries by specifying documents.

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.percolator;

import org.apache.lucene.document.BinaryRange;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.BlendedTermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.elasticsearch.Version;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
import org.elasticsearch.index.query.DateRangeIncludingNowQuery;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;

final class QueryAnalyzer {

    private QueryAnalyzer() {}

    /**
     * Extracts terms and ranges from the provided query. These terms and ranges are stored with the percolator query and
     * used by the percolate query's candidate query as fields to query by. The candidate query
     * holds the terms from the document to be percolated and allows the percolate query to ignore
     * percolator queries that we know would otherwise never match.
     * <p>
     * When extracting the terms for the specified query, we can also determine if the percolator query is
     * always going to match. For example, if a percolator query just contains a term query or a disjunction
     * query, then when the candidate query matches with that, we know the entire percolator query always
     * matches. This allows the percolate query to skip the expensive memory index verification step that
     * it would otherwise have to execute (for example when a percolator query contains a phrase query or a
     * conjunction query).
     * <p>
     * The query analyzer doesn't always extract all terms from the specified query. For example, from a
     * boolean query with no should clauses or from phrase queries, only the longest terms are selected,
     * since those terms are likely to be the rarest. A boolean query's must_not clauses are always ignored.
     * <p>
     * Sometimes the query analyzer can't extract terms or ranges from a sub query; if that happens, query
     * analysis is stopped and an UnsupportedQueryException is thrown, so that the caller can mark this query
     * in such a way that the PercolatorQuery always verifies this query with the MemoryIndex.
     *
     * @param query        The query to analyze.
     * @param indexVersion The creation version of the index containing the percolator queries.
     */
    static Result analyze(Query query, Version indexVersion) {
        ResultBuilder builder = new ResultBuilder(indexVersion, false);
        query.visit(builder);
        return builder.getResult();
    }

    private static final Set<Class<?>> verifiedQueries = new HashSet<>(
        Arrays.asList(
            TermQuery.class,
            TermInSetQuery.class,
            SynonymQuery.class,
            SpanTermQuery.class,
            SpanOrQuery.class,
            BooleanQuery.class,
            DisjunctionMaxQuery.class,
            ConstantScoreQuery.class,
            BoostQuery.class,
            BlendedTermQuery.class
        )
    );

    private static boolean isVerified(Query query) {
        if (query instanceof FunctionScoreQuery) {
            return ((FunctionScoreQuery) query).getMinScore() == null;
        }
        for (Class<?> cls : verifiedQueries) {
            if (cls.isAssignableFrom(query.getClass())) {
                return true;
            }
        }
        return false;
    }

    private static class ResultBuilder extends QueryVisitor {

        final boolean conjunction;
        final Version version;
        List<ResultBuilder> children = new ArrayList<>();
        boolean verified = true;
        int minimumShouldMatch = 0;
        List<Result> terms = new ArrayList<>();

        private ResultBuilder(Version version, boolean conjunction) {
            this.conjunction = conjunction;
            this.version = version;
        }

        @Override
        public String toString() {
            return (conjunction ? "CONJ" : "DISJ") + children + terms + "~" + minimumShouldMatch;
        }

        Result getResult() {
            List<Result> partialResults = new ArrayList<>();
            if (terms.size() > 0) {
                partialResults.addAll(terms);
            }
            if (children.isEmpty() == false) {
                List<Result> childResults = children.stream().map(ResultBuilder::getResult).collect(Collectors.toList());
                partialResults.addAll(childResults);
            }
            if (partialResults.isEmpty()) {
                return verified ? Result.MATCH_NONE : Result.UNKNOWN;
            }
            Result result;
            if (partialResults.size() == 1) {
                result = partialResults.get(0);
            } else {
                result = conjunction
                    ? handleConjunction(partialResults, version)
                    : handleDisjunction(partialResults, minimumShouldMatch, version);
            }
            if (verified == false) {
                result = result.unverify();
            }
            return result;
        }

        @Override
        public QueryVisitor getSubVisitor(Occur occur, Query parent) {
            if (parent instanceof DateRangeIncludingNowQuery) {
                terms.add(Result.UNKNOWN);
                return QueryVisitor.EMPTY_VISITOR;
            }
            this.verified = isVerified(parent);
            if (occur == Occur.MUST || occur == Occur.FILTER) {
                ResultBuilder builder = new ResultBuilder(version, true);
                children.add(builder);
                return builder;
            }
            if (occur == Occur.MUST_NOT) {
                this.verified = false;
                return QueryVisitor.EMPTY_VISITOR;
            }
            int minimumShouldMatchValue = 0;
            if (parent instanceof BooleanQuery) {
                BooleanQuery bq = (BooleanQuery) parent;
                if (bq.getMinimumNumberShouldMatch() == 0
                    && bq.clauses().stream().anyMatch(c -> c.getOccur() == Occur.MUST || c.getOccur() == Occur.FILTER)) {
                    return QueryVisitor.EMPTY_VISITOR;
                }
                minimumShouldMatchValue = bq.getMinimumNumberShouldMatch();
            }
            ResultBuilder child = new ResultBuilder(version, false);
            child.minimumShouldMatch = minimumShouldMatchValue;
            children.add(child);
            return child;
        }

        @Override
        public void visitLeaf(Query query) {
            if (query instanceof MatchAllDocsQuery) {
                terms.add(new Result(true, true));
            } else if (query instanceof MatchNoDocsQuery) {
                terms.add(Result.MATCH_NONE);
            } else if (query instanceof PointRangeQuery) {
                terms.add(pointRangeQuery((PointRangeQuery) query));
            } else {
                terms.add(Result.UNKNOWN);
            }
        }

        @Override
        public void consumeTerms(Query query, Term... termsToConsume) {
            boolean isVerified = isVerified(query);
            Set<QueryExtraction> qe = Arrays.stream(termsToConsume).map(QueryExtraction::new).collect(Collectors.toSet());
            if (qe.size() > 0) {
                if (version.before(Version.V_6_1_0) && conjunction) {
                    Optional<QueryExtraction> longest = qe.stream()
                        .filter(q -> q.term != null)
                        .max(Comparator.comparingInt(q -> q.term.bytes().length));
                    if (longest.isPresent()) {
                        qe = Collections.singleton(longest.get());
                    }
                }
                this.terms.add(new Result(isVerified, qe, conjunction ? qe.size() : 1));
            }
        }

        @Override
        public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
            if (query instanceof TermInSetQuery) {
                TermInSetQuery q = (TermInSetQuery) query;
                PrefixCodedTerms.TermIterator ti = q.getTermData().iterator();
                BytesRef term;
                Set<QueryExtraction> qe = new HashSet<>();
                while ((term = ti.next()) != null) {
                    qe.add(new QueryExtraction(new Term(field, term)));
                }
                this.terms.add(new Result(true, qe, 1));
            } else {
                super.consumeTermsMatching(query, field, automaton);
            }
        }
    }

    private static Result pointRangeQuery(PointRangeQuery query) {
        if (query.getNumDims() != 1) {
            return Result.UNKNOWN;
        }

        byte[] lowerPoint = query.getLowerPoint();
        byte[] upperPoint = query.getUpperPoint();

        // Need to check whether upper is not smaller than lower, otherwise NumericUtils.subtract(...) fails with an IAE.
        // If upper is really smaller than lower then we deal with it like a MatchNoDocsQuery
        // (verified and no extractions).
        if (new BytesRef(lowerPoint).compareTo(new BytesRef(upperPoint)) > 0) {
            return new Result(true, Collections.emptySet(), 0);
        }

        byte[] interval = new byte[16];
        NumericUtils.subtract(16, 0, prepad(upperPoint), prepad(lowerPoint), interval);
        return new Result(
            false,
            Collections.singleton(new QueryExtraction(new Range(query.getField(), lowerPoint, upperPoint, interval))),
            1
        );
    }

    private static byte[] prepad(byte[] original) {
        int offset = BinaryRange.BYTES - original.length;
        byte[] result = new byte[BinaryRange.BYTES];
        System.arraycopy(original, 0, result, offset, original.length);
        return result;
    }

    private static Result handleConjunction(List<Result> conjunctionsWithUnknowns, Version version) {
        List<Result> conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList());
        if (conjunctions.isEmpty()) {
            if (conjunctionsWithUnknowns.isEmpty()) {
                throw new IllegalArgumentException("Must have at least one conjunction sub result");
            }
            return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one
        }
        if (conjunctionsWithUnknowns.size() == 1) {
            return conjunctionsWithUnknowns.get(0);
        }
        if (version.onOrAfter(Version.V_6_1_0)) {
            for (Result subResult : conjunctions) {
                if (subResult.isMatchNoDocs()) {
                    return subResult;
                }
            }
            int msm = 0;
            boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size();
            boolean matchAllDocs = true;
            Set<QueryExtraction> extractions = new HashSet<>();
            Set<String> seenRangeFields = new HashSet<>();
            for (Result result : conjunctions) {
                int resultMsm = result.minimumShouldMatch;
                for (QueryExtraction queryExtraction : result.extractions) {
                    if (queryExtraction.range != null) {
                        // In case of range queries, each extraction does not simply increment the
                        // minimum_should_match for that percolator query like a term based extraction does,
                        // which can lead to more false positives for percolator queries with range queries
                        // than for term based queries.
                        // This is because of the way number fields are extracted from the document to be
                        // percolated. Per field a single range is extracted, and if a percolator query has two or
                        // more range queries on the same field, then the minimum should match can be higher than the
                        // clauses in the CoveringQuery. Therefore, right now, the minimum should match is only
                        // incremented once per number field when processing the percolator query at index time.
                        // For multiple ranges within a single extraction (i.e. from an existing conjunction or disjunction)
                        // this will already have been taken care of, so we only check against field names from
                        // previously processed extractions, and don't add to the seenRangeFields set until all
                        // extractions from this result are processed.
                        if (seenRangeFields.contains(queryExtraction.range.fieldName)) {
                            resultMsm = Math.max(0, resultMsm - 1);
                            verified = false;
                        }
                    } else {
                        // In case there are duplicate term query extractions, we need to be careful with
                        // incrementing msm, because that could lead to valid matches not becoming candidate matches:
                        // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
                        // doc: field: val1 val2 val3
                        // So let's be protective and decrease the msm:
                        if (extractions.contains(queryExtraction)) {
                            resultMsm = Math.max(0, resultMsm - 1);
                            verified = false;
                        }
                    }
                }
                msm += resultMsm;

                // add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly
                // calculated for subsequent Results
                result.extractions.stream().map(e -> e.range).filter(Objects::nonNull).map(e -> e.fieldName).forEach(seenRangeFields::add);

                if (result.verified == false
                    // If some inner extractions are optional, the result can't be verified
                    || result.minimumShouldMatch < result.extractions.size()) {
                    verified = false;
                }
                matchAllDocs &= result.matchAllDocs;
                extractions.addAll(result.extractions);
            }
            if (matchAllDocs) {
                return new Result(matchAllDocs, verified);
            } else {
                return new Result(verified, extractions, msm);
            }
        } else {
            Result bestClause = null;
            for (Result result : conjunctions) {
                bestClause = selectBestResult(result, bestClause);
            }
            return bestClause;
        }
    }

    private static Result handleDisjunction(List<Result> disjunctions, int requiredShouldClauses, Version version) {
        if (disjunctions.stream().anyMatch(Result::isUnknown)) {
            return Result.UNKNOWN;
        }
        if (disjunctions.size() == 1) {
            return disjunctions.get(0);
        }
        // Keep track of the msm for each clause:
        List<Integer> clauses = new ArrayList<>(disjunctions.size());
        boolean verified;
        if (version.before(Version.V_6_1_0)) {
            verified = requiredShouldClauses <= 1;
        } else {
            verified = true;
        }

        int numMatchAllClauses = 0;
        boolean hasRangeExtractions = false;

        // In case there are duplicate extracted terms / ranges, the msm should always be equal to the clause
        // with the lowest msm, because at percolate time there is no way to know the number of repetitions per
        // extracted term, and a field value from a percolator document may have more 'weight' than others.
        // Example percolator query: value1 OR value2 OR value2 OR value3 OR value3 OR value3 OR value4 OR value5 (msm set to 3)
        // In the above example query the extracted msm would be 3
        // Example document1: value1 value2 value3
        // With the msm and extracted terms this would match, and that is the expected behaviour
        // Example document2: value3
        // This document should match too (value3 appears in 3 clauses), but with msm set to 3 and the fact
        // that only distinct values are indexed in the extracted terms field, this document would
        // never match.
        boolean hasDuplicateTerms = false;

        Set<QueryExtraction> terms = new HashSet<>();
        for (int i = 0; i < disjunctions.size(); i++) {
            Result subResult = disjunctions.get(i);
            if (subResult.verified == false
                // one of the sub queries requires more than one term to match, we can't
                // verify it with a single top-level min_should_match
                || subResult.minimumShouldMatch > 1
                // One of the inner clauses has multiple extractions, we won't be able to
                // verify it with a single top-level min_should_match
                || (subResult.extractions.size() > 1 && requiredShouldClauses > 1)) {
                verified = false;
            }
            if (subResult.matchAllDocs) {
                numMatchAllClauses++;
            }
            int resultMsm = subResult.minimumShouldMatch;
            for (QueryExtraction extraction : subResult.extractions) {
                if (terms.add(extraction) == false) {
                    verified = false;
                    hasDuplicateTerms = true;
                }
            }
            if (hasRangeExtractions == false) {
                hasRangeExtractions = subResult.extractions.stream().anyMatch(qe -> qe.range != null);
            }
            clauses.add(resultMsm);
        }
        boolean matchAllDocs = numMatchAllClauses > 0 && numMatchAllClauses >= requiredShouldClauses;

        int msm = 0;
        if (version.onOrAfter(Version.V_6_1_0)
            // Having ranges would mean we need to juggle with the msm and that complicates this logic a lot,
            // so for now let's not do it.
            && hasRangeExtractions == false) {
            // Figure out what the combined msm is for this disjunction:
            // (sum the lowest required clauses, otherwise we're too strict and queries may not match)
            clauses = clauses.stream().filter(val -> val > 0).sorted().collect(Collectors.toList());

            // When there are duplicated query extractions, the percolator can no longer reliably determine msm across this disjunction
            if (hasDuplicateTerms) {
                // pick the lowest msm:
                msm = clauses.get(0);
            } else {
                int limit = Math.min(clauses.size(), Math.max(1, requiredShouldClauses));
                for (int i = 0; i < limit; i++) {
                    msm += clauses.get(i);
                }
            }
        } else {
            msm = 1;
        }
        if (matchAllDocs) {
            return new Result(matchAllDocs, verified);
        } else {
            return new Result(verified, terms, msm);
        }
    }

    /**
     * Return an extraction for the conjunction of {@code result1} and {@code result2}
     * by picking up clauses that look most restrictive and making it unverified if
     * the other clause is not null and doesn't match all documents. This is used by
     * 6.0.0 indices which didn't use the terms_set query.
     */
    static Result selectBestResult(Result result1, Result result2) {
        assert result1 != null || result2 != null;
        if (result1 == null) {
            return result2;
        } else if (result2 == null) {
            return result1;
        } else if (result1.matchAllDocs) { // conjunction with match_all
            Result result = result2;
            if (result1.verified == false) {
                result = result.unverify();
            }
            return result;
        } else if (result2.matchAllDocs) { // conjunction with match_all
            Result result = result1;
            if (result2.verified == false) {
                result = result.unverify();
            }
            return result;
        } else {
            // Prefer term based extractions over range based extractions:
            boolean onlyRangeBasedExtractions = true;
            for (QueryExtraction clause : result1.extractions) {
                if (clause.term != null) {
                    onlyRangeBasedExtractions = false;
                    break;
                }
            }
            for (QueryExtraction clause : result2.extractions) {
                if (clause.term != null) {
                    onlyRangeBasedExtractions = false;
                    break;
                }
            }

            if (onlyRangeBasedExtractions) {
                BytesRef extraction1SmallestRange = smallestRange(result1.extractions);
                BytesRef extraction2SmallestRange = smallestRange(result2.extractions);
                if (extraction1SmallestRange == null) {
                    return result2.unverify();
                } else if (extraction2SmallestRange == null) {
                    return result1.unverify();
                }

                // Keep the clause with the smallest range, as it is likely to be the rarest.
                if (extraction1SmallestRange.compareTo(extraction2SmallestRange) <= 0) {
                    return result1.unverify();
                } else {
                    return result2.unverify();
                }
            } else {
                int extraction1ShortestTerm = minTermLength(result1.extractions);
                int extraction2ShortestTerm = minTermLength(result2.extractions);
                // Keep the clause with the longest terms, as it is likely to be the rarest.
                if (extraction1ShortestTerm >= extraction2ShortestTerm) {
                    return result1.unverify();
                } else {
                    return result2.unverify();
                }
            }
        }
    }

    private static int minTermLength(Set<QueryExtraction> extractions) {
        // In case there are only range extractions, we return Integer.MIN_VALUE,
        // so that in selectBestResult(...) we are likely to prefer the side that contains at least one term extraction.
        if (extractions.stream().filter(queryExtraction -> queryExtraction.term != null).count() == 0
            && extractions.stream().filter(queryExtraction -> queryExtraction.range != null).count() > 0) {
            return Integer.MIN_VALUE;
        }

        int min = Integer.MAX_VALUE;
        for (QueryExtraction qt : extractions) {
            if (qt.term != null) {
                min = Math.min(min, qt.bytes().length);
            }
        }
        return min;
    }

    private static BytesRef smallestRange(Set<QueryExtraction> terms) {
        BytesRef min = null;
        for (QueryExtraction qt : terms) {
            if (qt.range != null) {
                if (min == null || qt.range.interval.compareTo(min) < 0) {
                    min = qt.range.interval;
                }
            }
        }
        return min;
    }

    /**
     * Query extraction result. A result is a candidate for a given document either if:
     *  - `matchAllDocs` is true
     *  - `extractions` and the document have `minimumShouldMatch` terms in common
     * Furthermore, the match doesn't need to be verified if `verified` is true: checking
     * `matchAllDocs` and `extractions` is enough.
     */
    static class Result {

        final Set<QueryExtraction> extractions;
        final boolean verified;
        final int minimumShouldMatch;
        final boolean matchAllDocs;

        Result(boolean matchAllDocs, boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
            if (minimumShouldMatch > extractions.size()) {
                throw new IllegalArgumentException(
                    "minimumShouldMatch can't be greater than the number of extractions: "
                        + minimumShouldMatch + " > " + extractions.size()
                );
            }
            this.matchAllDocs = matchAllDocs;
            this.extractions = extractions;
            this.verified = verified;
            this.minimumShouldMatch = minimumShouldMatch;
        }

        Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
            this(false, verified, extractions, minimumShouldMatch);
        }

        Result(boolean matchAllDocs, boolean verified) {
            this(matchAllDocs, verified, Collections.emptySet(), 0);
        }

        @Override
        public String toString() {
            return extractions.toString();
        }

        Result unverify() {
            if (verified) {
                return new Result(matchAllDocs, false, extractions, minimumShouldMatch);
            } else {
                return this;
            }
        }

        boolean isUnknown() {
            return false;
        }

        boolean isMatchNoDocs() {
            return matchAllDocs == false && extractions.isEmpty();
        }

        static final Result UNKNOWN = new Result(false, false, Collections.emptySet(), 0) {
            @Override
            boolean isUnknown() {
                return true;
            }

            @Override
            boolean isMatchNoDocs() {
                return false;
            }

            @Override
            public String toString() {
                return "UNKNOWN";
            }
        };

        static final Result MATCH_NONE = new Result(false, true, Collections.emptySet(), 0) {
            @Override
            boolean isMatchNoDocs() {
                return true;
            }
        };
    }

    static class QueryExtraction {

        final Term term;
        final Range range;

        QueryExtraction(Term term) {
            this.term = term;
            this.range = null;
        }

        QueryExtraction(Range range) {
            this.term = null;
            this.range = range;
        }

        String field() {
            return term != null ? term.field() : null;
        }

        BytesRef bytes() {
            return term != null ? term.bytes() : null;
        }

        String text() {
            return term != null ? term.text() : null;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            QueryExtraction queryExtraction = (QueryExtraction) o;
            return Objects.equals(term, queryExtraction.term) && Objects.equals(range, queryExtraction.range);
        }

        @Override
        public int hashCode() {
            return Objects.hash(term, range);
        }

        @Override
        public String toString() {
            return "QueryExtraction{" + "term=" + term + ",range=" + range + '}';
        }
    }

    static class Range {

        final String fieldName;
        final byte[] lowerPoint;
        final byte[] upperPoint;
        final BytesRef interval;

        Range(String fieldName, byte[] lowerPoint, byte[] upperPoint, byte[] interval) {
            this.fieldName = fieldName;
            this.lowerPoint = lowerPoint;
            this.upperPoint = upperPoint;
            // using BytesRef here just to make use of its compareTo method.
            this.interval = new BytesRef(interval);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            Range range = (Range) o;
            return Objects.equals(fieldName, range.fieldName)
                && Arrays.equals(lowerPoint, range.lowerPoint)
                && Arrays.equals(upperPoint, range.upperPoint);
        }

        @Override
        public int hashCode() {
            int result = 1;
            result += 31 * fieldName.hashCode();
            result += Arrays.hashCode(lowerPoint);
            result += Arrays.hashCode(upperPoint);
            return result;
        }

        @Override
        public String toString() {
            return "Range{" + "fieldName='" + fieldName + '\'' + ", interval=" + interval + '}';
        }
    }
}
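
A minimal usage sketch of the analyzer above, assuming Lucene and the percolator module are on the classpath. QueryAnalyzerDemo is a hypothetical class name introduced here for illustration; it has to live in the org.elasticsearch.percolator package because QueryAnalyzer and its Result class are package-private, and the values noted in the comments assume an index created on or after version 6.1.0 (as with Version.CURRENT).

package org.elasticsearch.percolator;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.elasticsearch.Version;

public final class QueryAnalyzerDemo {

    public static void main(String[] args) {
        // Conjunction: both terms are required, so both are extracted and the
        // result's minimumShouldMatch becomes 2. A pure term/boolean query is
        // also "verified": a candidate match needs no MemoryIndex re-check.
        BooleanQuery conjunction = new BooleanQuery.Builder()
            .add(new TermQuery(new Term("field", "quick")), Occur.MUST)
            .add(new TermQuery(new Term("field", "fox")), Occur.MUST)
            .build();
        QueryAnalyzer.Result conj = QueryAnalyzer.analyze(conjunction, Version.CURRENT);
        // expected: 2 extractions, msm=2, verified=true
        System.out.println(conj.extractions.size() + " extractions, msm=" + conj.minimumShouldMatch + ", verified=" + conj.verified);

        // Disjunction: either term suffices, so minimumShouldMatch stays 1.
        BooleanQuery disjunction = new BooleanQuery.Builder()
            .add(new TermQuery(new Term("field", "quick")), Occur.SHOULD)
            .add(new TermQuery(new Term("field", "fox")), Occur.SHOULD)
            .build();
        QueryAnalyzer.Result disj = QueryAnalyzer.analyze(disjunction, Version.CURRENT);
        // expected: 2 extractions, msm=1, verified=true
        System.out.println(disj.extractions.size() + " extractions, msm=" + disj.minimumShouldMatch + ", verified=" + disj.verified);
    }
}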




