
// Source listing: org.apache.lucene.search.MultiTermQuery, as packaged in the
// org.apache.servicemix.bundles.lucene artifact (Maven / Gradle / Ivy).
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.util.AttributeSource;
/**
* An abstract {@link Query} that matches documents containing a subset of terms provided by a
* {@link FilteredTermsEnum} enumeration.
*
* <p>This query cannot be used directly; you must subclass it and define {@link
* #getTermsEnum(Terms,AttributeSource)} to provide a {@link FilteredTermsEnum} that iterates
* through the terms to be matched.
*
* <p><b>NOTE</b>: if {@link RewriteMethod} is either {@link #CONSTANT_SCORE_BOOLEAN_REWRITE} or
* {@link #SCORING_BOOLEAN_REWRITE}, you may encounter a {@link IndexSearcher.TooManyClauses}
* exception during searching, which happens when the number of terms to be searched exceeds {@link
* IndexSearcher#getMaxClauseCount()}. Setting {@link RewriteMethod} to {@link
* #CONSTANT_SCORE_BLENDED_REWRITE} or {@link #CONSTANT_SCORE_REWRITE} prevents this.
*
* <p>The recommended rewrite method is {@link #CONSTANT_SCORE_BLENDED_REWRITE}: it doesn't spend
* CPU computing unhelpful scores, and is the most performant rewrite method given the query. If you
* need scoring (like {@link FuzzyQuery}), use {@link TopTermsScoringBooleanQueryRewrite} which uses
* a priority queue to only collect competitive terms and not hit this limitation.
*
* <p>Note that org.apache.lucene.queryparser.classic.QueryParser produces MultiTermQueries using
* {@link #CONSTANT_SCORE_REWRITE} by default.
*/
public abstract class MultiTermQuery extends Query {
/** Field name for this query; never null (the constructor rejects a null field). */
protected final String field;
/**
* Strategy that {@link #rewrite(IndexSearcher)} delegates to; never null (the constructor rejects
* a null rewrite method).
*/
protected final RewriteMethod rewriteMethod;
/** Base class for the strategies that decide how a {@link MultiTermQuery} is rewritten. */
public abstract static class RewriteMethod {

  /**
   * Rewrites the given multi-term query.
   *
   * @param indexSearcher the searcher the rewrite is performed for
   * @param query the multi-term query to rewrite
   * @return the rewritten query
   * @throws IOException if the rewrite fails with an I/O error
   */
  public abstract Query rewrite(IndexSearcher indexSearcher, MultiTermQuery query)
      throws IOException;

  /**
   * Returns the {@link TermsEnum} of the given {@link MultiTermQuery}.
   *
   * <p>This hook exists so that RewriteMethod subclasses can pull a TermsEnum from the
   * MultiTermQuery.
   *
   * @see MultiTermQuery#getTermsEnum(Terms, AttributeSource)
   */
  protected TermsEnum getTermsEnum(MultiTermQuery query, Terms terms, AttributeSource atts)
      throws IOException {
    final TermsEnum matchingTerms = query.getTermsEnum(terms, atts);
    return matchingTerms;
  }
}
/**
 * Rewrite method that gives every matching document a constant score equal to the query's boost.
 * The most costly terms are handled by a boolean query-like implementation, while the less costly
 * terms are pre-processed into a filter bitset. The number of terms allowed in the boolean
 * query-like implementation is bounded by an upper limit.
 *
 * <p>The aim is to balance the benefits of {@link #CONSTANT_SCORE_BOOLEAN_REWRITE} and {@link
 * #CONSTANT_SCORE_REWRITE}: skipping and early termination stay available over costly terms,
 * while the overhead of a BooleanQuery with many terms is limited. It also ensures you cannot hit
 * {@link org.apache.lucene.search.IndexSearcher.TooManyClauses}. For some use-cases with all low
 * cost terms, {@link #CONSTANT_SCORE_REWRITE} may be more performant, while for some use-cases
 * with all high cost terms, {@link #CONSTANT_SCORE_BOOLEAN_REWRITE} may be better.
 */
public static final RewriteMethod CONSTANT_SCORE_BLENDED_REWRITE =
    new RewriteMethod() {
      @Override
      public Query rewrite(IndexSearcher indexSearcher, MultiTermQuery query) {
        final Query blendedWrapper = new MultiTermQueryConstantScoreBlendedWrapper<>(query);
        return blendedWrapper;
      }
    };
/**
 * Rewrite method that first builds a private Filter by visiting each term in sequence and marking
 * all docs for that term. Every matching document receives a constant score equal to the query's
 * boost.
 *
 * <p>This method is faster than the BooleanQuery rewrite methods when the number of matched terms
 * or matched documents is non-trivial. It also never hits an errant {@link
 * IndexSearcher.TooManyClauses} exception.
 */
public static final RewriteMethod CONSTANT_SCORE_REWRITE =
    new RewriteMethod() {
      @Override
      public Query rewrite(IndexSearcher indexSearcher, MultiTermQuery query) {
        final Query constantScoreWrapper = new MultiTermQueryConstantScoreWrapper<>(query);
        return constantScoreWrapper;
      }
    };
/**
* A rewrite method that uses {@link org.apache.lucene.index.DocValuesType#SORTED} / {@link
* org.apache.lucene.index.DocValuesType#SORTED_SET} doc values to find matching docs through a
* post-filtering type approach. This will be very slow if used in isolation, but will likely be
* the most performant option when combined with a sparse query clause. All matching docs are
* assigned a constant score equal to the query's boost.
*
* <p>If you don't have doc values indexed, see the other rewrite methods that rely on postings
* alone (e.g., {@link #CONSTANT_SCORE_BLENDED_REWRITE}, {@link #SCORING_BOOLEAN_REWRITE}, etc.
* depending on scoring needs).
*/
public static final RewriteMethod DOC_VALUES_REWRITE = new DocValuesRewriteMethod();
/**
* A rewrite method that first translates each term into {@link BooleanClause.Occur#SHOULD} clause
* in a BooleanQuery, and keeps the scores as computed by the query. Note that typically such
* scores are meaningless to the user, and require non-trivial CPU to compute, so it's almost
* always better to use {@link #CONSTANT_SCORE_REWRITE} instead.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link IndexSearcher.TooManyClauses} if the number
* of terms exceeds {@link IndexSearcher#getMaxClauseCount}.
*/
public static final RewriteMethod SCORING_BOOLEAN_REWRITE =
ScoringRewrite.SCORING_BOOLEAN_REWRITE;
/**
* Like {@link #SCORING_BOOLEAN_REWRITE} except scores are not computed. Instead, each matching
* document receives a constant score equal to the query's boost.
*
* <p><b>NOTE</b>: This rewrite method will hit {@link IndexSearcher.TooManyClauses} if the number
* of terms exceeds {@link IndexSearcher#getMaxClauseCount}.
*/
public static final RewriteMethod CONSTANT_SCORE_BOOLEAN_REWRITE =
ScoringRewrite.CONSTANT_SCORE_BOOLEAN_REWRITE;
/**
 * A rewrite method that first translates each term into {@link BooleanClause.Occur#SHOULD} clause
 * in a BooleanQuery, and keeps the scores as computed by the query.
 *
 * <p>This rewrite method only uses the top scoring terms so it will not overflow the boolean max
 * clause count.
 */
public static final class TopTermsScoringBooleanQueryRewrite
    extends TopTermsRewrite<BooleanQuery.Builder> {

  /**
   * Create a TopTermsScoringBooleanQueryRewrite for at most {@code size} terms.
   *
   * <p>NOTE: if {@link IndexSearcher#getMaxClauseCount} is smaller than {@code size}, then it
   * will be used instead.
   *
   * @param size the requested maximum number of terms
   */
  public TopTermsScoringBooleanQueryRewrite(int size) {
    super(size);
  }

  /** Returns {@link IndexSearcher#getMaxClauseCount()} as the maximum size. */
  @Override
  protected int getMaxSize() {
    return IndexSearcher.getMaxClauseCount();
  }

  /** Returns a fresh, empty {@link BooleanQuery.Builder} to collect the clauses in. */
  @Override
  protected BooleanQuery.Builder getTopLevelBuilder() {
    return new BooleanQuery.Builder();
  }

  /** Builds the final query from the collected clauses. */
  @Override
  protected Query build(BooleanQuery.Builder builder) {
    return builder.build();
  }

  /**
   * Adds the term as a {@link BooleanClause.Occur#SHOULD} clause: a {@link TermQuery} created
   * from the term and its {@link TermStates}, wrapped in a {@link BoostQuery} carrying {@code
   * boost}. The {@code docCount} argument is not used by this implementation.
   */
  @Override
  protected void addClause(
      BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermStates states) {
    final TermQuery tq = new TermQuery(term, states);
    topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD);
  }
}
/**
 * A rewrite method that first translates each term into {@link BooleanClause.Occur#SHOULD} clause
 * in a BooleanQuery, but adjusts the frequencies used for scoring to be blended across the terms,
 * otherwise the rarest term typically ranks highest (often not useful eg in the set of expanded
 * terms in a FuzzyQuery).
 *
 * <p>This rewrite method only uses the top scoring terms so it will not overflow the boolean max
 * clause count.
 */
public static final class TopTermsBlendedFreqScoringRewrite
    extends TopTermsRewrite<BlendedTermQuery.Builder> {

  /**
   * Create a TopTermsBlendedFreqScoringRewrite for at most {@code size} terms.
   *
   * <p>NOTE: if {@link IndexSearcher#getMaxClauseCount} is smaller than {@code size}, then it
   * will be used instead.
   *
   * @param size the requested maximum number of terms
   */
  public TopTermsBlendedFreqScoringRewrite(int size) {
    super(size);
  }

  /** Returns {@link IndexSearcher#getMaxClauseCount()} as the maximum size. */
  @Override
  protected int getMaxSize() {
    return IndexSearcher.getMaxClauseCount();
  }

  /**
   * Returns a fresh {@link BlendedTermQuery.Builder} whose rewrite method is set to {@link
   * BlendedTermQuery#BOOLEAN_REWRITE}.
   */
  @Override
  protected BlendedTermQuery.Builder getTopLevelBuilder() {
    BlendedTermQuery.Builder builder = new BlendedTermQuery.Builder();
    builder.setRewriteMethod(BlendedTermQuery.BOOLEAN_REWRITE);
    return builder;
  }

  /** Builds the final query from the collected terms. */
  @Override
  protected Query build(BlendedTermQuery.Builder builder) {
    return builder.build();
  }

  /**
   * Adds the term to the blended query together with its boost and {@link TermStates}. The {@code
   * docCount} argument is not used by this implementation.
   */
  @Override
  protected void addClause(
      BlendedTermQuery.Builder topLevel,
      Term term,
      int docCount,
      float boost,
      TermStates states) {
    topLevel.add(term, boost, states);
  }
}
/**
 * A rewrite method that first translates each term into {@link BooleanClause.Occur#SHOULD} clause
 * in a BooleanQuery, but the scores are only computed as the boost.
 *
 * <p>This rewrite method only uses the top scoring terms so it will not overflow the boolean max
 * clause count.
 */
public static final class TopTermsBoostOnlyBooleanQueryRewrite
    extends TopTermsRewrite<BooleanQuery.Builder> {

  /**
   * Create a TopTermsBoostOnlyBooleanQueryRewrite for at most {@code size} terms.
   *
   * <p>NOTE: if {@link IndexSearcher#getMaxClauseCount} is smaller than {@code size}, then it
   * will be used instead.
   *
   * @param size the requested maximum number of terms
   */
  public TopTermsBoostOnlyBooleanQueryRewrite(int size) {
    super(size);
  }

  /** Returns {@link IndexSearcher#getMaxClauseCount()} as the maximum size. */
  @Override
  protected int getMaxSize() {
    return IndexSearcher.getMaxClauseCount();
  }

  /** Returns a fresh, empty {@link BooleanQuery.Builder} to collect the clauses in. */
  @Override
  protected BooleanQuery.Builder getTopLevelBuilder() {
    return new BooleanQuery.Builder();
  }

  /** Builds the final query from the collected clauses. */
  @Override
  protected Query build(BooleanQuery.Builder builder) {
    return builder.build();
  }

  /**
   * Adds the term as a {@link BooleanClause.Occur#SHOULD} clause: a {@link TermQuery} wrapped in
   * a {@link ConstantScoreQuery}, which is in turn wrapped in a {@link BoostQuery} carrying
   * {@code boost}. The {@code docFreq} argument is not used by this implementation.
   */
  @Override
  protected void addClause(
      BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermStates states) {
    final Query q = new ConstantScoreQuery(new TermQuery(term, states));
    topLevel.add(new BoostQuery(q, boost), BooleanClause.Occur.SHOULD);
  }
}
/**
 * Constructs a query matching terms that cannot be represented with a single Term.
 *
 * @param field the field name for this query; must not be null
 * @param rewriteMethod the strategy used to rewrite this query; must not be null
 * @throws NullPointerException if {@code field} or {@code rewriteMethod} is null
 */
public MultiTermQuery(final String field, RewriteMethod rewriteMethod) {
  // Validate in declaration order so a null field is reported before a null rewrite method.
  Objects.requireNonNull(field, "field must not be null");
  Objects.requireNonNull(rewriteMethod, "rewriteMethod must not be null");
  this.field = field;
  this.rewriteMethod = rewriteMethod;
}
/**
 * Returns the field name for this query.
 *
 * @return the field name given at construction time
 */
public final String getField() {
  return this.field;
}
/**
* Construct the enumeration to be used, expanding the pattern term. This method should only be
* called if the field exists (ie, implementations can assume the field does exist). This method
* should not return null (should instead return {@link TermsEnum#EMPTY} if no terms match). The
* TermsEnum must already be positioned to the first matching term. The given {@link
* AttributeSource} is passed by the {@link RewriteMethod} to share information between segments,
* for example {@link TopTermsRewrite} uses it to share maximum competitive boosts.
*
* @param terms the terms to enumerate
* @param atts attribute source supplied by the {@link RewriteMethod}
* @return the enumeration of matching terms; should not be null
* @throws IOException if the enumeration cannot be created
*/
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
/**
 * Builds the enumeration that expands the pattern term, using a freshly created {@link
 * AttributeSource}. Only call this when the field exists (ie, implementations can assume the
 * field does exist). The result is never null and is positioned to the first matching term.
 *
 * @param terms the terms to enumerate
 * @return the enumeration of matching terms
 * @throws IOException if the enumeration cannot be created
 */
public final TermsEnum getTermsEnum(Terms terms) throws IOException {
  final AttributeSource freshAttributes = new AttributeSource();
  return getTermsEnum(terms, freshAttributes);
}
/**
 * Reports how many unique terms this query contains when that number is known up-front.
 *
 * @return the number of unique terms, or -1 if it is not known; this implementation always
 *     returns -1
 */
public long getTermsCount() {
  final long unknownCount = -1L;
  return unknownCount;
}
/**
 * Rewrites this query by delegating to the configured {@link RewriteMethod}.
 *
 * <p>This method is final. To rewrite to a simpler form, instead return a simpler enum from
 * {@link #getTermsEnum(Terms, AttributeSource)}. For example, to rewrite to a single term, return
 * a {@link SingleTermsEnum}.
 */
@Override
public final Query rewrite(IndexSearcher indexSearcher) throws IOException {
  final Query rewritten = rewriteMethod.rewrite(indexSearcher, this);
  return rewritten;
}
/**
 * Returns the rewrite method this query was constructed with.
 *
 * @return the rewrite method used to build the final query
 */
public RewriteMethod getRewriteMethod() {
  return this.rewriteMethod;
}
/**
 * Combines the class hash, the rewrite method's hash and the field's hash, in that order, using
 * the multiplier 31.
 */
@Override
public int hashCode() {
  int hash = classHash();
  hash = 31 * hash + rewriteMethod.hashCode();
  return 31 * hash + field.hashCode();
}
/**
 * Two queries are equal when {@code sameClassAs} accepts the other object and both the rewrite
 * method and the field are equal.
 */
@Override
public boolean equals(Object other) {
  if (!sameClassAs(other)) {
    return false;
  }
  return equalsTo(getClass().cast(other));
}
/** Compares the rewrite method first and then the field of the two queries. */
private boolean equalsTo(MultiTermQuery other) {
  if (!rewriteMethod.equals(other.rewriteMethod)) {
    return false;
  }
  return field.equals(other.field);
}
}