// org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queryparser.complexPhrase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanNotQuery;
import org.apache.lucene.queries.spans.SpanOrQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
/**
 * QueryParser which permits complex phrase query syntax eg "(john jon jonathan~) peters*".
 *
 * <p>Performs potentially multiple passes over Query text to parse any nested logic in
 * PhraseQueries.
 *
 * <ul>
 *   <li>First pass takes any PhraseQuery content between quotes and stores it for a subsequent
 *       pass. All other query content is parsed as normal.
 *   <li>Second pass parses any stored PhraseQuery content, checking all embedded clauses are
 *       referring to the same field and therefore can be rewritten as Span queries.
 * </ul>
 *
 * <p>All PhraseQuery clauses are expressed as ComplexPhraseQuery objects.
 *
 * <p>This could arguably be done in one pass using a new QueryParser but here I am working within
 * the constraints of the existing parser as a base class. This currently simply feeds all phrase
 * content through an analyzer to select phrase terms - any "special" syntax such as * ~ * etc are
 * not given special status
 */
public class ComplexPhraseQueryParser extends QueryParser {

  /** Phrases recorded by {@link #getFieldQuery} during the first pass of {@link #parse}. */
  private List<ComplexPhraseQuery> complexPhrases = null;

  /** True while {@link #parse} is resolving the contents of the recorded phrases. */
  private boolean isPass2ResolvingPhrases;

  /** Passed to every {@link ComplexPhraseQuery} created by this parser. */
  private boolean inOrder = true;

  /** The phrase whose contents are currently being parsed in the second pass. */
  private ComplexPhraseQuery currentPhraseQuery = null;

  /**
   * Creates a parser by delegating to the {@link QueryParser} constructor.
   *
   * @param f field name handed to the base parser
   * @param a analyzer handed to the base parser
   */
  public ComplexPhraseQueryParser(String f, Analyzer a) {
    super(f, a);
  }

  /**
   * When {@code inOrder} is true, the search terms must exist in the documents in the same order
   * as in the query.
   *
   * @param inOrder parameter to choose between ordered or un-ordered proximity search
   */
  public void setInOrder(final boolean inOrder) {
    this.inOrder = inOrder;
  }

  /**
   * Wraps the quoted phrase text in a {@link ComplexPhraseQuery} and records it so that its
   * contents can be parsed once the current pass is finished.
   */
  @Override
  protected Query getFieldQuery(String field, String queryText, int slop) {
    ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop, inOrder);
    complexPhrases.add(cpq);
    return cpq;
  }

  /**
   * Parses the query text. The top-level call runs the first pass and then asks every recorded
   * phrase to parse its own contents; those nested calls re-enter this method with {@code
   * isPass2ResolvingPhrases} set.
   *
   * @param query the query text to parse
   * @return the parsed query
   * @throws ParseException if the base parser or a same-field check rejects the text
   */
  @Override
  public Query parse(String query) throws ParseException {
    if (isPass2ResolvingPhrases) {
      MultiTermQuery.RewriteMethod oldMethod = getMultiTermRewriteMethod();
      try {
        // Temporarily force BooleanQuery rewrite so that Parser will generate a visible
        // collection of terms which we can convert into SpanQueries. ConstantScoreRewrite mode
        // produces an opaque ConstantScoreQuery object which cannot be interrogated for terms in
        // the same way a BooleanQuery can. QueryParser is not guaranteed threadsafe anyway so
        // this temporary state change should not present an issue.
        setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
        return super.parse(query);
      } finally {
        setMultiTermRewriteMethod(oldMethod);
      }
    }

    // First pass - parse the top-level query recording any PhraseQuerys which will need to be
    // resolved.
    complexPhrases = new ArrayList<>();
    Query q = super.parse(query);

    // Second pass - use this QueryParser to parse any nested PhraseQueries with a different set
    // of syntax restrictions (i.e. all fields must be the same).
    isPass2ResolvingPhrases = true;
    try {
      for (ComplexPhraseQuery phrase : complexPhrases) {
        currentPhraseQuery = phrase;
        // In each phrase, parse the contents between quotes as a separate parse operation.
        phrase.parsePhraseElements(this);
      }
    } finally {
      isPass2ResolvingPhrases = false;
    }
    return q;
  }

  /**
   * Creates a term query, first checking during the second pass that the term belongs to the
   * field of the phrase being resolved.
   *
   * <p>There is no "getTermQuery throws ParseException" method to override, so a term for another
   * field embedded in a phrase is reported by wrapping the {@link ParseException} in a {@link
   * RuntimeException}.
   */
  @Override
  protected Query newTermQuery(Term term, float boost) {
    if (isPass2ResolvingPhrases) {
      try {
        checkPhraseClauseIsForSameField(term.field());
      } catch (ParseException pe) {
        throw new RuntimeException("Error parsing complex phrase", pe);
      }
    }
    return super.newTermQuery(term, boost);
  }

  /**
   * Rejects a clause whose field differs from the field of the phrase currently being resolved.
   *
   * @param field field of the clause found inside the phrase
   * @throws ParseException if {@code field} is not the current phrase's field
   */
  private void checkPhraseClauseIsForSameField(String field) throws ParseException {
    if (!field.equals(currentPhraseQuery.field)) {
      throw new ParseException(
          "Cannot have clause for field \""
              + field
              + "\" nested in phrase "
              + " for field \""
              + currentPhraseQuery.field
              + "\"");
    }
  }

  /** Delegates to the base parser after the same-field check applied during the second pass. */
  @Override
  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
    if (isPass2ResolvingPhrases) {
      checkPhraseClauseIsForSameField(field);
    }
    return super.getWildcardQuery(field, termStr);
  }

  /** Delegates to the base parser after the same-field check applied during the second pass. */
  @Override
  protected Query getRangeQuery(
      String field, String part1, String part2, boolean startInclusive, boolean endInclusive)
      throws ParseException {
    if (isPass2ResolvingPhrases) {
      checkPhraseClauseIsForSameField(field);
    }
    return super.getRangeQuery(field, part1, part2, startInclusive, endInclusive);
  }

  /**
   * Builds the range query via the base parser; during the second pass the multi-term rewrite
   * method is switched to {@code SCORING_BOOLEAN_REWRITE} for the duration of the call and
   * restored afterwards.
   */
  @Override
  protected Query newRangeQuery(
      String field, String part1, String part2, boolean startInclusive, boolean endInclusive) {
    RewriteMethod originalRewriteMethod = getMultiTermRewriteMethod();
    try {
      if (isPass2ResolvingPhrases) {
        setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
      }
      return super.newRangeQuery(field, part1, part2, startInclusive, endInclusive);
    } finally {
      setMultiTermRewriteMethod(originalRewriteMethod);
    }
  }

  /** Delegates to the base parser after the same-field check applied during the second pass. */
  @Override
  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
    if (isPass2ResolvingPhrases) {
      checkPhraseClauseIsForSameField(field);
    }
    return super.getFuzzyQuery(field, termStr, minSimilarity);
  }

  /*
   * Used to handle the query content in between quotes and produced Span-based
   * interpretations of the clauses.
   */
  static class ComplexPhraseQuery extends Query {

    /** Text of the placeholder term used for a clause that produced no terms. */
    private static final String NO_MATCH_TERM_TEXT =
        "Dummy clause because no terms found - must match nothing";

    final String field;
    final String phrasedQueryStringContents;
    final int slopFactor;
    private final boolean inOrder;

    /** Single-slot holder for the parsed phrase contents, filled by parsePhraseElements. */
    private final Query[] contents = new Query[1];

    /**
     * @param field field the phrase was written against; must not be null
     * @param phrasedQueryStringContents raw text found between the quotes; must not be null
     * @param slopFactor slop used for the span-near queries built in {@link #rewrite}
     * @param inOrder whether the span-near queries built in {@link #rewrite} are ordered
     */
    public ComplexPhraseQuery(
        String field, String phrasedQueryStringContents, int slopFactor, boolean inOrder) {
      this.field = Objects.requireNonNull(field);
      this.phrasedQueryStringContents = Objects.requireNonNull(phrasedQueryStringContents);
      this.slopFactor = slopFactor;
      this.inOrder = inOrder;
    }

    /**
     * Called by ComplexPhraseQueryParser for each phrase after the main parse is through. Parses
     * the phrase text with {@code qp} and stores the result for {@link #rewrite}.
     *
     * @param qp the parser to re-enter; its {@code field} is swapped for the call and restored
     * @throws ParseException if the phrase text cannot be parsed
     */
    protected void parsePhraseElements(ComplexPhraseQueryParser qp) throws ParseException {
      // TODO ensure that field-sensitivity is preserved ie the query string below is parsed as
      // field+":("+phrasedQueryStringContents+")"
      // but this will need code in rewrite to unwrap the first layer of boolean query
      String oldDefaultParserField = qp.field;
      try {
        // Temporarily set the QueryParser to be parsing the default field for this phrase e.g
        // author:"fred* smith"
        qp.field = this.field;
        contents[0] = qp.parse(phrasedQueryStringContents);
      } finally {
        qp.field = oldDefaultParserField;
      }
    }

    @Override
    public void visit(QueryVisitor visitor) {
      visitor.visitLeaf(this);
    }

    /**
     * Converts the parsed phrase contents into span queries.
     *
     * <p>Term, multi-term and synonym contents are returned unchanged. Boolean contents become a
     * {@link SpanNearQuery} over one span clause per boolean clause, or a {@link SpanNotQuery}
     * when at least one clause is {@code MUST_NOT}.
     *
     * @throws IllegalArgumentException if the contents, or a rewritten clause, is of a query type
     *     this method does not handle
     */
    @Override
    public Query rewrite(IndexSearcher indexSearcher) throws IOException {
      final Query contents = this.contents[0];
      if (contents instanceof TermQuery
          || contents instanceof MultiTermQuery
          || contents instanceof SynonymQuery) {
        return contents;
      }

      if (!(contents instanceof BooleanQuery)) {
        throw new IllegalArgumentException(
            "Unknown query type \""
                + contents.getClass().getName()
                + "\" found in phrase query string \""
                + phrasedQueryStringContents
                + "\"");
      }

      // Build a sequence of Span clauses arranged in a SpanNear - child clauses can be complex
      // Booleans e.g. nots and ors etc
      BooleanQuery bq = (BooleanQuery) contents;
      SpanQuery[] allSpanClauses = new SpanQuery[bq.clauses().size()];
      int numNegatives = 0;
      int i = 0;
      // For all clauses e.g. one* two~
      for (BooleanClause clause : bq) {
        // Rewrite this clause e.g one* becomes (one OR onerous)
        Query qc = indexSearcher.rewrite(clause.getQuery());
        if (clause.getOccur() == Occur.MUST_NOT) {
          numNegatives++;
        }
        allSpanClauses[i] = toSpanClause(unwrapBoost(qc));
        i++;
      }

      if (numNegatives == 0) {
        // The simple case - no negative elements in phrase
        return new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
      }

      // Complex case - we have mixed positives and negatives in the sequence, so a SpanNotQuery
      // is needed. Collect the span clauses whose boolean clause is not MUST_NOT.
      List<SpanQuery> positiveClauses = new ArrayList<>();
      i = 0;
      for (BooleanClause clause : bq) {
        if (clause.getOccur() != Occur.MUST_NOT) {
          positiveClauses.add(allSpanClauses[i]);
        }
        i++;
      }
      SpanQuery[] includeClauses = positiveClauses.toArray(new SpanQuery[0]);

      SpanQuery include;
      if (includeClauses.length == 1) {
        include = includeClauses[0]; // only one positive clause
      } else {
        // need to increase slop factor based on gaps introduced by negatives
        include = new SpanNearQuery(includeClauses, slopFactor + numNegatives, inOrder);
      }

      // Use sequence of positive and negative values as the exclude.
      SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
      return new SpanNotQuery(include, exclude);
    }

    /** Strips any number of enclosing {@link BoostQuery} wrappers from {@code query}. */
    private static Query unwrapBoost(Query query) {
      Query unwrapped = query;
      while (unwrapped instanceof BoostQuery) {
        unwrapped = ((BoostQuery) unwrapped).getQuery();
      }
      return unwrapped;
    }

    /**
     * Builds the placeholder span clause inserted when a clause yields no terms, e.g. the phrase
     * query was for "Fred Smithe*" and there were no "Smithe*" terms - need to prevent a match on
     * just "Fred".
     */
    private SpanQuery matchNothingClause() {
      return new SpanTermQuery(new Term(field, NO_MATCH_TERM_TEXT));
    }

    /**
     * Converts one rewritten, boost-free top-level clause of the phrase into a span query.
     *
     * @throws IllegalArgumentException if {@code qc} is not a boolean, synonym, match-no-docs or
     *     term query
     */
    private SpanQuery toSpanClause(Query qc) {
      if (qc instanceof BooleanQuery || qc instanceof SynonymQuery) {
        List<SpanQuery> sc = new ArrayList<>();
        BooleanQuery booleanClause =
            qc instanceof BooleanQuery ? (BooleanQuery) qc : convert((SynonymQuery) qc);
        addComplexPhraseClause(sc, booleanClause);
        return sc.isEmpty() ? matchNothingClause() : sc.get(0);
      }
      if (qc instanceof MatchNoDocsQuery) {
        return matchNothingClause();
      }
      if (qc instanceof TermQuery) {
        return new SpanTermQuery(((TermQuery) qc).getTerm());
      }
      throw new IllegalArgumentException(
          "Unknown query type \""
              + qc.getClass().getName()
              + "\" found in phrase query string \""
              + phrasedQueryStringContents
              + "\"");
    }

    /** Expresses the terms of a synonym query as a boolean query of SHOULD term clauses. */
    private BooleanQuery convert(SynonymQuery qc) {
      BooleanQuery.Builder bqb = new BooleanQuery.Builder();
      for (Term t : qc.getTerms()) {
        bqb.add(new BooleanClause(new TermQuery(t), Occur.SHOULD));
      }
      return bqb.build();
    }

    /**
     * Translates a boolean query into a single span query and appends it to {@code spanClauses}.
     * Non-negated children are OR-ed together; if any child is {@code MUST_NOT} the result is a
     * {@link SpanNotQuery} excluding the OR of the negated children. Nothing is appended when
     * there are no non-negated children.
     *
     * @param spanClauses list that receives the resulting span query
     * @param qc boolean query to translate; nested boolean queries are handled recursively
     * @throws IllegalArgumentException if a child is not a term, boolean or match-no-docs query
     */
    private void addComplexPhraseClause(List<SpanQuery> spanClauses, BooleanQuery qc) {
      List<SpanQuery> ors = new ArrayList<>();
      List<SpanQuery> nots = new ArrayList<>();

      // For all clauses e.g. one* two~
      for (BooleanClause clause : qc) {
        Query childQuery = unwrapBoost(clause.getQuery());

        // select the list to which we will add these options
        List<SpanQuery> chosenList = clause.getOccur() == Occur.MUST_NOT ? nots : ors;

        if (childQuery instanceof TermQuery) {
          chosenList.add(new SpanTermQuery(((TermQuery) childQuery).getTerm()));
        } else if (childQuery instanceof BooleanQuery) {
          addComplexPhraseClause(chosenList, (BooleanQuery) childQuery);
        } else if (childQuery instanceof MatchNoDocsQuery) {
          chosenList.add(matchNothingClause());
        } else {
          // TODO alternatively could call extract terms here?
          throw new IllegalArgumentException(
              "Unknown query type:" + childQuery.getClass().getName());
        }
      }

      if (ors.isEmpty()) {
        return;
      }
      SpanOrQuery soq = new SpanOrQuery(ors.toArray(new SpanQuery[0]));
      if (nots.isEmpty()) {
        spanClauses.add(soq);
      } else {
        SpanOrQuery snqs = new SpanOrQuery(nots.toArray(new SpanQuery[0]));
        spanClauses.add(new SpanNotQuery(soq, snqs));
      }
    }

    /**
     * Renders the phrase as quoted text, prefixed with the field when it differs from {@code
     * field} and suffixed with {@code ~slop} when the slop is non-zero.
     */
    @Override
    public String toString(String field) {
      StringBuilder sb = new StringBuilder();
      if (!this.field.equals(field)) {
        sb.append(this.field).append(":");
      }
      sb.append("\"").append(phrasedQueryStringContents).append("\"");
      if (slopFactor != 0) {
        sb.append("~").append(slopFactor);
      }
      return sb.toString();
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = classHash();
      result = prime * result + field.hashCode();
      result = prime * result + phrasedQueryStringContents.hashCode();
      result = prime * result + slopFactor;
      result = prime * result + (inOrder ? 1 : 0);
      return result;
    }

    @Override
    public boolean equals(Object other) {
      return sameClassAs(other) && equalsTo(getClass().cast(other));
    }

    /** Compares the field, phrase text, slop and ordering flag; parsed contents are ignored. */
    private boolean equalsTo(ComplexPhraseQuery other) {
      return field.equals(other.field)
          && phrasedQueryStringContents.equals(other.phrasedQueryStringContents)
          && slopFactor == other.slopFactor
          && inOrder == other.inOrder;
    }
  }
}