
// org.apache.lucene.util.QueryBuilder Maven / Gradle / Ivy
// Show all versions of org.apache.servicemix.bundles.lucene
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
/**
 * Creates queries from the {@link Analyzer} chain.
 *
 * <p>Example usage:
 *
 * <pre class="prettyprint">
 *   QueryBuilder builder = new QueryBuilder(analyzer);
 *   Query a = builder.createBooleanQuery("body", "just a test");
 *   Query b = builder.createPhraseQuery("body", "another test");
 *   Query c = builder.createMinShouldMatchQuery("body", "another test", 0.5f);
 * </pre>
 *
 * <p>This can also be used as a subclass for query parsers to make it easier to interact with the
 * analysis chain. Factory methods such as {@code newTermQuery} are provided so that the generated
 * queries can be customized.
 */
public class QueryBuilder {
/** Analyzer whose token stream is turned into queries; replaceable via {@code setAnalyzer}. */
protected Analyzer analyzer;
/** Whether phrase and multi-phrase positions follow the token position increments (default true). */
protected boolean enablePositionIncrements = true;
/** Whether streams containing tokens with a position length above 1 are handled as graphs (default true). */
protected boolean enableGraphQueries = true;
/** Whether graph side paths are analyzed as quoted (phrase) text when building boolean queries (default false). */
protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false;
/**
 * Pairs a term with the boost to apply to it.
 *
 * @param term the term
 * @param boost the boost
 */
public record TermAndBoost(BytesRef term, float boost) {
  /**
   * Creates a new TermAndBoost. The stored term is the result of {@code BytesRef.deepCopyOf(term)},
   * not the reference that was passed in.
   */
  public TermAndBoost(BytesRef term, float boost) {
    this.term = BytesRef.deepCopyOf(term);
    this.boost = boost;
  }
}
/**
 * Creates a new QueryBuilder using the given analyzer.
 *
 * @param analyzer analyzer stored in this builder and used to tokenize query text
 */
public QueryBuilder(Analyzer analyzer) {
this.analyzer = analyzer;
}
/**
 * Creates a boolean query from the query text, using {@code SHOULD} between analyzer tokens.
 *
 * <p>This is equivalent to {@code createBooleanQuery(field, queryText, Occur.SHOULD)}
 *
 * @param field field name
 * @param queryText text to be passed to the analyzer
 * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis of {@code queryText}
 */
public Query createBooleanQuery(String field, String queryText) {
  final BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
  return createBooleanQuery(field, queryText, defaultOperator);
}
/**
 * Creates a boolean query from the query text with an explicit operator.
 *
 * @param field field name
 * @param queryText text to be passed to the analyzer
 * @param operator operator used for clauses between analyzer tokens; must be {@code SHOULD} or
 *     {@code MUST}
 * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis of {@code queryText}
 * @throws IllegalArgumentException if {@code operator} is neither {@code SHOULD} nor {@code MUST}
 */
public Query createBooleanQuery(String field, String queryText, BooleanClause.Occur operator) {
  final boolean supported =
      operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
  if (!supported) {
    throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
  }
  // Boolean queries are built from unquoted text, so no phrase handling and no slop.
  final boolean quoted = false;
  final int phraseSlop = 0;
  return createFieldQuery(analyzer, operator, field, queryText, quoted, phraseSlop);
}
/**
 * Creates a phrase query from the query text with a slop of zero.
 *
 * <p>This is equivalent to {@code createPhraseQuery(field, queryText, 0)}
 *
 * @param field field name
 * @param queryText text to be passed to the analyzer
 * @return {@code TermQuery}, {@code BooleanQuery}, {@code PhraseQuery}, or {@code
 *     MultiPhraseQuery}, based on the analysis of {@code queryText}
 */
public Query createPhraseQuery(String field, String queryText) {
  final int noSlop = 0;
  return createPhraseQuery(field, queryText, noSlop);
}
/**
 * Creates a phrase query from the query text with the given slop.
 *
 * @param field field name
 * @param queryText text to be passed to the analyzer
 * @param phraseSlop number of other words permitted between words in query phrase
 * @return {@code TermQuery}, {@code BooleanQuery}, {@code PhraseQuery}, or {@code
 *     MultiPhraseQuery}, based on the analysis of {@code queryText}
 */
public Query createPhraseQuery(String field, String queryText, int phraseSlop) {
  // Phrase text is analyzed as quoted, with MUST as the operator.
  final BooleanClause.Occur operator = BooleanClause.Occur.MUST;
  final boolean quoted = true;
  return createFieldQuery(analyzer, operator, field, queryText, quoted, phraseSlop);
}
/**
 * Creates a minimum-should-match query from the query text.
 *
 * <p>A fraction of exactly 1 produces the same query as {@code createBooleanQuery} with {@code
 * MUST}. Otherwise the text is analyzed with {@code SHOULD}; when the result is a {@code
 * BooleanQuery} it is rebuilt with a minimum-should-match of {@code (int) (fraction * clauses)}.
 *
 * @param field field name
 * @param queryText text to be passed to the analyzer
 * @param fraction of query terms {@code [0..1]} that should match
 * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis of {@code queryText}
 * @throws IllegalArgumentException if {@code fraction} is NaN or outside {@code [0..1]}
 */
public Query createMinShouldMatchQuery(String field, String queryText, float fraction) {
  // NaN fails both comparisons, so it is rejected together with out-of-range values.
  final boolean inRange = fraction >= 0 && fraction <= 1;
  if (!inRange) {
    throw new IllegalArgumentException("fraction should be >= 0 and <= 1");
  }

  // TODO: weird that BQ equals/rewrite/scorer doesn't handle this?
  if (fraction == 1) {
    return createBooleanQuery(field, queryText, BooleanClause.Occur.MUST);
  }

  final Query analyzed =
      createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, 0);
  if (analyzed instanceof BooleanQuery booleanQuery) {
    return addMinShouldMatchToBoolean(booleanQuery, fraction);
  }
  // Non-boolean results (including null) are returned unchanged.
  return analyzed;
}
/**
 * Copies the clauses of {@code query} into a new boolean query whose minimum-should-match is
 * {@code fraction} times the clause count, truncated to an int.
 */
private BooleanQuery addMinShouldMatchToBoolean(BooleanQuery query, float fraction) {
  final List<BooleanClause> clauses = query.clauses();
  final int minShouldMatch = (int) (fraction * clauses.size());

  final BooleanQuery.Builder rebuilt = new BooleanQuery.Builder();
  rebuilt.setMinimumNumberShouldMatch(minShouldMatch);
  for (BooleanClause clause : clauses) {
    rebuilt.add(clause);
  }
  return rebuilt.build();
}
/**
 * Returns the analyzer.
 *
 * @return the analyzer currently stored in this builder
 * @see #setAnalyzer(Analyzer)
 */
public Analyzer getAnalyzer() {
return analyzer;
}
/**
 * Sets the analyzer used to tokenize text.
 *
 * @param analyzer the analyzer to use from now on
 */
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
/**
 * Returns true if position increments are enabled.
 *
 * @return the current value of the position-increments flag
 * @see #setEnablePositionIncrements(boolean)
 */
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
 * Set to {@code true} to enable position increments in result query.
 *
 * <p>When set, result phrase and multi-phrase queries will be aware of position increments.
 * Useful when e.g. a StopFilter increases the position increment of the token that follows an
 * omitted token.
 *
 * <p>Default: true.
 *
 * @param enable whether position increments should be honored
 */
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
/**
 * Returns true if phrase query should be automatically generated for multi terms synonyms.
 *
 * @return the current value of the auto-generate flag
 * @see #setAutoGenerateMultiTermSynonymsPhraseQuery(boolean)
 */
public boolean getAutoGenerateMultiTermSynonymsPhraseQuery() {
return autoGenerateMultiTermSynonymsPhraseQuery;
}
/**
 * Set to {@code true} if phrase queries should be automatically generated for multi terms
 * synonyms. Default: false.
 *
 * @param enable whether phrase queries should be generated for multi terms synonyms
 */
public void setAutoGenerateMultiTermSynonymsPhraseQuery(boolean enable) {
this.autoGenerateMultiTermSynonymsPhraseQuery = enable;
}
/**
 * Creates a query from the analysis chain.
 *
 * <p>Expert: this is more useful for subclasses such as queryparsers. If using this class
 * directly, just use {@link #createBooleanQuery(String, String)} and {@link
 * #createPhraseQuery(String, String)}. This is a complex method and it is usually not necessary
 * to override it in a subclass; instead, override methods like {@link #newBooleanQuery}, etc., if
 * possible.
 *
 * @param analyzer analyzer used for this query
 * @param operator default boolean operator used for this query
 * @param field field to create queries against
 * @param queryText text to be passed to the analysis chain
 * @param quoted true if phrases should be generated when terms occur at more than one position
 * @param phraseSlop slop factor for phrase/multiphrase queries
 * @return the query built from the analyzed text by the token-stream overload of this method
 * @throws RuntimeException wrapping any {@link IOException} raised while analyzing or closing
 */
protected Query createFieldQuery(
    Analyzer analyzer,
    BooleanClause.Occur operator,
    String field,
    String queryText,
    boolean quoted,
    int phraseSlop) {
  assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;

  // Tokenize the text with the analyzer, then let the token-stream overload decide which kind
  // of query the tokens call for. The stream is closed before the result is returned.
  final Query fieldQuery;
  try (TokenStream tokens = analyzer.tokenStream(field, queryText)) {
    fieldQuery = createFieldQuery(tokens, operator, field, quoted, phraseSlop);
  } catch (IOException ioe) {
    throw new RuntimeException("Error analyzing query text", ioe);
  }
  return fieldQuery;
}
/**
 * Enable or disable graph TokenStream processing (enabled by default).
 *
 * @param v true to enable graph processing, false to disable it
 * @lucene.experimental
 */
public void setEnableGraphQueries(boolean v) {
enableGraphQueries = v;
}
/**
 * Returns true if graph TokenStream processing is enabled (default).
 *
 * @return the current value of the graph-queries flag
 * @lucene.experimental
 */
public boolean getEnableGraphQueries() {
return enableGraphQueries;
}
/**
 * Creates a query from a token stream.
 *
 * <p>The stream is wrapped in a {@link CachingTokenFilter} and read once to count tokens and
 * positions and to detect stacked tokens (position increment 0) and graph tokens (position
 * length above 1, only when graph queries are enabled). The matching {@code analyze*} method
 * then reads the cached tokens again to build the query.
 *
 * @param source the token stream to create the query from
 * @param operator default boolean operator used for this query
 * @param field field to create queries against
 * @param quoted true if phrases should be generated when terms occur at more than one position
 * @param phraseSlop slop factor for phrase/multiphrase queries
 * @return the generated query, or {@code null} if the stream has no {@link
 *     TermToBytesRefAttribute} or produces no tokens
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the stream
 */
protected Query createFieldQuery(
    TokenStream source,
    BooleanClause.Occur operator,
    String field,
    boolean quoted,
    int phraseSlop) {
  assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;

  try (CachingTokenFilter cached = new CachingTokenFilter(source)) {
    final TermToBytesRefAttribute termAtt = cached.getAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute incAtt =
        cached.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute lenAtt = cached.addAttribute(PositionLengthAttribute.class);
    if (termAtt == null) {
      return null;
    }

    // Pass 1: walk the stream once, gathering the facts that drive the choice of query type.
    int tokenCount = 0;
    int positions = 0;
    boolean sawStackedToken = false;
    boolean sawGraphToken = false;
    cached.reset();
    while (cached.incrementToken()) {
      tokenCount++;
      final int increment = incAtt.getPositionIncrement();
      if (increment == 0) {
        // A token with no increment shares its position with the previous token.
        sawStackedToken = true;
      } else {
        positions += increment;
      }
      final int length = lenAtt.getPositionLength();
      if (enableGraphQueries && length > 1) {
        sawGraphToken = true;
      }
    }

    // Pass 2: pick the query shape; each analyze* method resets and re-reads the cached tokens.
    if (tokenCount == 0) {
      return null;
    }
    if (tokenCount == 1) {
      return analyzeTerm(field, cached);
    }
    if (sawGraphToken) {
      return quoted
          ? analyzeGraphPhrase(cached, field, phraseSlop)
          : analyzeGraphBoolean(field, cached, operator);
    }
    if (quoted && positions > 1) {
      // Phrase: multi-phrase when positions hold stacked tokens, plain phrase otherwise.
      return sawStackedToken
          ? analyzeMultiPhrase(field, cached, phraseSlop)
          : analyzePhrase(field, cached, phraseSlop);
    }
    // Boolean: a single position holding several tokens, or several positions.
    return positions == 1
        ? analyzeBoolean(field, cached)
        : analyzeMultiBoolean(field, cached, operator);
  } catch (IOException ioe) {
    throw new RuntimeException("Error analyzing query text", ioe);
  }
}
/**
 * Creates a simple term query from the first token of the cached tokenstream.
 *
 * @param field field the term belongs to
 * @param stream token stream that is reset and then read
 * @return the query produced by {@code newTermQuery} for the token and its boost
 * @throws IOException if the stream fails while being reset or read
 * @throws AssertionError if the stream yields no token
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  final BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  final boolean hasToken = stream.incrementToken();
  if (hasToken == false) {
    throw new AssertionError();
  }

  final Term term = new Term(field, bytesAtt.getBytesRef());
  final float boost = boostAtt.getBoost();
  return newTermQuery(term, boost);
}
/**
 * Creates simple boolean query from the cached tokenstream contents.
 *
 * <p>Every token of the stream is collected, with its boost, and the whole set is handed to
 * {@code newSynonymQuery}.
 *
 * @param field field the terms belong to
 * @param stream token stream that is reset and then read to the end
 * @return the query produced by {@code newSynonymQuery}
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  // The element type must be declared: with a raw List, toArray(...) yields Object[] and the
  // call to newSynonymQuery does not compile.
  List<TermAndBoost> terms = new ArrayList<>();
  while (stream.incrementToken()) {
    terms.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
  }

  return newSynonymQuery(field, terms.toArray(TermAndBoost[]::new));
}
/**
 * Adds the terms collected for one position to the boolean query being built.
 *
 * <p>Nothing is added for an empty list. A single term becomes a term query (via {@code
 * newTermQuery}); several terms become one synonym query (via {@code newSynonymQuery}).
 *
 * @param field field the terms belong to
 * @param q builder receiving the new clause
 * @param current terms, with their boosts, gathered for the current position
 * @param operator occur value used for the added clause
 */
protected void add(
    String field,
    BooleanQuery.Builder q,
    List<TermAndBoost> current,
    BooleanClause.Occur operator) {
  if (current.isEmpty()) {
    return;
  }
  if (current.size() == 1) {
    TermAndBoost only = current.get(0);
    q.add(newTermQuery(new Term(field, only.term), only.boost), operator);
  } else {
    q.add(newSynonymQuery(field, current.toArray(TermAndBoost[]::new)), operator);
  }
}
/**
 * Creates complex boolean query from the cached tokenstream contents.
 *
 * <p>Tokens are grouped by position: a token with a non-zero position increment starts a new
 * group, and each finished group is added to the boolean query through {@code add}.
 *
 * @param field field the terms belong to
 * @param stream token stream that is reset and then read to the end
 * @param operator occur value used for every added clause
 * @return the built boolean query
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzeMultiBoolean(
    String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      // A new position begins: flush the terms gathered for the previous one.
      add(field, q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
  }
  // Flush the terms of the last position.
  add(field, q, currentQuery, operator);

  return q.build();
}
/**
 * Creates simple phrase query from the cached tokenstream contents.
 *
 * <p>Each token is added at a position that advances by its position increment, or by exactly
 * one when position increments are disabled. The token boosts are multiplied together; the
 * phrase is wrapped in a {@link BoostQuery} only when that product differs from the default
 * boost.
 *
 * @param field field the terms belong to
 * @param stream token stream that is reset and then read to the end
 * @param slop slop set on the phrase query
 * @return the phrase query, possibly wrapped in a {@link BoostQuery}
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
  final PhraseQuery.Builder phrase = new PhraseQuery.Builder();
  phrase.setSlop(slop);

  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  final BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  final PositionIncrementAttribute incAtt =
      stream.getAttribute(PositionIncrementAttribute.class);

  int position = -1;
  float combinedBoost = DEFAULT_BOOST;

  stream.reset();
  while (stream.incrementToken()) {
    position += enablePositionIncrements ? incAtt.getPositionIncrement() : 1;
    phrase.add(new Term(field, bytesAtt.getBytesRef()), position);
    combinedBoost *= boostAtt.getBoost();
  }

  final PhraseQuery built = phrase.build();
  return combinedBoost == DEFAULT_BOOST ? built : new BoostQuery(built, combinedBoost);
}
/**
 * Creates complex phrase query from the cached tokenstream contents.
 *
 * <p>Tokens that share a position (position increment 0) are gathered and added to the
 * multi-phrase query as one group. Groups are added at their explicit position when position
 * increments are enabled, and without a position otherwise.
 *
 * @param field field the terms belong to
 * @param stream token stream that is reset and then read to the end
 * @param slop slop set on the multi-phrase query
 * @return the built multi-phrase query
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop)
    throws IOException {
  MultiPhraseQuery.Builder mpqb = newMultiPhraseQueryBuilder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);

  int position = -1;
  // The element type must be declared: with a raw List, toArray(new Term[0]) is typed Object[]
  // and the mpqb.add(...) calls do not compile.
  List<Term> multiTerms = new ArrayList<>();

  stream.reset();
  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();

    if (positionIncrement > 0 && !multiTerms.isEmpty()) {
      // A new position begins: flush the previous group. `position` still holds the previous
      // group's position here because it is advanced only after the flush.
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  // Flush the group of the last position.
  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
/**
 * Creates a boolean query from a graph token stream. The articulation points of the graph are
 * visited in order and the queries created at each point are merged in the returned boolean
 * query.
 *
 * <p>For a segment that starts with a side path, every finite string of the segment is analyzed
 * with {@code MUST} (quoted when {@link #getAutoGenerateMultiTermSynonymsPhraseQuery()} is true)
 * and the results are combined by {@code newGraphSynonymQuery}. Otherwise the terms at the
 * segment start become a term query (one term) or a synonym query (several terms).
 *
 * @param field field to create queries against
 * @param source graph token stream; it is reset before being read
 * @param operator occur value used for each per-segment clause
 * @return the built boolean query
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzeGraphBoolean(
    String field, TokenStream source, BooleanClause.Occur operator) throws IOException {
  source.reset();
  GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
  BooleanQuery.Builder builder = new BooleanQuery.Builder();
  int[] articulationPoints = graph.articulationPoints();
  int lastState = 0;
  // One iteration more than there are articulation points, so the segment after the last point
  // is visited as well.
  for (int i = 0; i <= articulationPoints.length; i++) {
    int start = lastState;
    // -1 is used as the end once no articulation point remains.
    // NOTE(review): presumably this means "to the end of the graph"; confirm against
    // GraphTokenStreamFiniteStrings.
    int end = -1;
    if (i < articulationPoints.length) {
      end = articulationPoints[i];
    }
    lastState = end;
    final Query positionalQuery;
    if (graph.hasSidePath(start)) {
      final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
      // Adapter that turns each side-path token stream into a query as it is requested.
      Iterator<Query> queries =
          new Iterator<>() {
            @Override
            public boolean hasNext() {
              return sidePathsIterator.hasNext();
            }

            @Override
            public Query next() {
              TokenStream sidePath = sidePathsIterator.next();
              return createFieldQuery(
                  sidePath,
                  BooleanClause.Occur.MUST,
                  field,
                  getAutoGenerateMultiTermSynonymsPhraseQuery(),
                  0);
            }
          };
      positionalQuery = newGraphSynonymQuery(queries);
    } else {
      List<AttributeSource> attributes = graph.getTerms(start);
      TermAndBoost[] terms =
          attributes.stream()
              .map(
                  s -> {
                    TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
                    BoostAttribute b = s.addAttribute(BoostAttribute.class);
                    return new TermAndBoost(t.getBytesRef(), b.getBoost());
                  })
              .toArray(TermAndBoost[]::new);
      assert terms.length > 0;
      if (terms.length == 1) {
        positionalQuery = newTermQuery(new Term(field, terms[0].term), terms[0].boost);
      } else {
        positionalQuery = newSynonymQuery(field, terms);
      }
    }
    // The factory methods may be overridden and return null; such segments are skipped.
    if (positionalQuery != null) {
      builder.add(positionalQuery, operator);
    }
  }
  return builder.build();
}
/**
 * Creates graph phrase query from the tokenstream contents.
 *
 * <p>Every finite string of the graph is analyzed as a quoted field query with the given slop,
 * and the non-null results are combined as {@code SHOULD} clauses of one boolean query.
 *
 * @param source graph token stream; it is reset before being read
 * @param field field to create queries against
 * @param phraseSlop slop factor passed on for each path
 * @return the built boolean query
 * @throws IOException if the stream fails while being reset or read
 */
protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseSlop)
    throws IOException {
  source.reset();
  GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);

  // Creates a boolean query from the graph token stream by extracting all the
  // finite strings from the graph and using them to create phrase queries with
  // the appropriate slop.
  BooleanQuery.Builder builder = new BooleanQuery.Builder();
  // The element type must be declared: with a raw Iterator, it.next() is an Object and no
  // createFieldQuery overload accepts it.
  Iterator<TokenStream> it = graph.getFiniteStrings();
  while (it.hasNext()) {
    Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, phraseSlop);
    if (query != null) {
      builder.add(query, BooleanClause.Occur.SHOULD);
    }
  }
  return builder.build();
}
/**
 * Builds a new BooleanQuery builder instance.
 *
 * <p>This is intended for subclasses that wish to customize the generated queries.
 *
 * @return new {@code BooleanQuery.Builder} instance
 */
protected BooleanQuery.Builder newBooleanQuery() {
return new BooleanQuery.Builder();
}
/**
 * Builds a new SynonymQuery instance from the given terms and boosts.
 *
 * <p>This is intended for subclasses that wish to customize the generated queries.
 *
 * @param field field the synonym query is built for
 * @param terms terms, each with its boost, added to the query in array order
 * @return new Query instance
 */
protected Query newSynonymQuery(String field, TermAndBoost[] terms) {
  final SynonymQuery.Builder synonyms = new SynonymQuery.Builder(field);
  for (int i = 0; i < terms.length; i++) {
    final TermAndBoost entry = terms[i];
    synonyms.addTerm(entry.term(), entry.boost());
  }
  return synonyms.build();
}
/**
 * Builds a new GraphQuery for multi-terms synonyms.
 *
 * <p>All queries from the iterator are added as {@code SHOULD} clauses of a boolean query. When
 * that leaves exactly one clause, the clause's query is returned directly instead of the
 * wrapper.
 *
 * <p>This is intended for subclasses that wish to customize the generated queries.
 *
 * @param queries the alternative queries to combine; the iterator is consumed fully
 * @return new Query instance
 */
protected Query newGraphSynonymQuery(Iterator<Query> queries) {
  BooleanQuery.Builder builder = new BooleanQuery.Builder();
  while (queries.hasNext()) {
    builder.add(queries.next(), BooleanClause.Occur.SHOULD);
  }
  BooleanQuery bq = builder.build();
  if (bq.clauses().size() == 1) {
    // A single alternative needs no boolean wrapper.
    return bq.clauses().get(0).query();
  }
  return bq;
}
/**
 * Builds a new TermQuery instance, wrapped in a {@link BoostQuery} when the boost is not the
 * default boost.
 *
 * <p>This is intended for subclasses that wish to customize the generated queries.
 *
 * @param term term
 * @param boost boost to apply to the term query
 * @return the term query itself for the default boost, otherwise a boosted wrapper around it
 */
protected Query newTermQuery(Term term, float boost) {
  final TermQuery termQuery = new TermQuery(term);
  return boost == DEFAULT_BOOST ? termQuery : new BoostQuery(termQuery, boost);
}
/**
 * Builds a new MultiPhraseQuery builder instance.
 *
 * <p>This is intended for subclasses that wish to customize the generated queries.
 *
 * @return new {@code MultiPhraseQuery.Builder} instance
 */
protected MultiPhraseQuery.Builder newMultiPhraseQueryBuilder() {
return new MultiPhraseQuery.Builder();
}
}