All Downloads are FREE. Search and download functionalities are using the official Maven repository.

querqy.lucene.rewrite.LuceneQueryBuilder Maven / Gradle / Ivy

The newest version!
/**
 * 
 */
package querqy.lucene.rewrite;

import java.io.IOException;
import java.util.LinkedList;
import java.util.function.Function;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;

import querqy.CompoundCharSequence;
import querqy.lucene.rewrite.BooleanQueryFactory.Clause;
import querqy.lucene.rewrite.cache.TermQueryCache;
import querqy.model.AbstractNodeVisitor;
import querqy.model.BooleanQuery;
import querqy.model.BoostedTerm;
import querqy.model.DisjunctionMaxQuery;
import querqy.model.MatchAllQuery;
import querqy.model.QuerqyQuery;
import querqy.model.RawQuery;
import querqy.model.Term;

/**
 * @author René Kriegler, @renekrie
 *
 */
public class LuceneQueryBuilder extends AbstractNodeVisitor> {

   enum ParentType {
      BQ, DMQ
   }

   final boolean normalizeBooleanQueryBoost;
   final boolean mustAddMultiMatchDmq;
   final float dmqTieBreakerMultiplier;
   final float multiMatchTieBreakerMultiplier;
   final TermQueryBuilder termQueryBuilder;
   final SearchFieldsAndBoosting searchFieldsAndBoosting;
   final TermSubQueryBuilder termSubQueryBuilder;

   final Function rawQueryParser;

   LinkedList clauseStack = new LinkedList<>();
   LinkedList dmqStack = new LinkedList<>();
   boolean useBooleanQueryForDMQ = false;

   private ParentType parentType = ParentType.BQ;



   public LuceneQueryBuilder(final TermQueryBuilder termQueryBuilder, final Analyzer analyzer,
                             final SearchFieldsAndBoosting searchFieldsAndBoosting,
                             final float dmqTieBreakerMultiplier, final float multiMatchTieBreakerMultiplier,
                             final TermQueryCache termQueryCache, final Function rawQueryParser) {
      this(   
              termQueryBuilder,
              analyzer,
              searchFieldsAndBoosting,
              dmqTieBreakerMultiplier,
              multiMatchTieBreakerMultiplier,
              true, 
              termQueryCache,
              rawQueryParser);
   }

   /**
    * 

Field names and boost factors are applied like this:

*

If a term doesn't already have a field name, generate all term queries for all fields and boost factors * from generatedQueryFieldsAndBoostings - if the term was generated by some rewriter - or from queryFieldsAndBoostings * otherwise.

*

If a term already has a field name use the boost factor for this field from generatedQueryFieldsAndBoostings if * the term was generated, and from queryFieldsAndBoostings otherwise. If the respective map doesn't contain the field, * use the defaultGeneratedFieldBoostFactor for generated terms. If the term is not generated, treat the field name as * part of the term text (= "fieldname:value").

* * @param termQueryBuilder The TermQueryBuilder * @param analyzer The query Analyzer * @param searchFieldsAndBoosting The search fields and their boost factors * @param dmqTieBreakerMultiplier The tie breaker for dismax queries * @param multiMatchTieBreakerMultiplier The multi-match (synonym) tie breaker for dismax queries * @param normalizeBooleanQueryBoost Iff true and if the analyzer turns a single token into multiple tokens, divide their aggregate score by their count * @param termQueryCache The term query cache or null */ public LuceneQueryBuilder(final TermQueryBuilder termQueryBuilder, final Analyzer analyzer, final SearchFieldsAndBoosting searchFieldsAndBoosting, final float dmqTieBreakerMultiplier, final float multiMatchTieBreakerMultiplier, final boolean normalizeBooleanQueryBoost, final TermQueryCache termQueryCache, final Function rawQueryParser) { if (termQueryBuilder == null) { throw new IllegalArgumentException("TermQueryBuilder must not be null"); } switch (searchFieldsAndBoosting.fieldBoostModel) { case NONE: mustAddMultiMatchDmq = false; break; case FIXED: mustAddMultiMatchDmq = multiMatchTieBreakerMultiplier < 1f; break; default: if (multiMatchTieBreakerMultiplier < 1f) { throw new IllegalArgumentException("MultiMatch DMQ cannot be added for field boost model " + searchFieldsAndBoosting.fieldBoostModel); } mustAddMultiMatchDmq = false; } this.searchFieldsAndBoosting = searchFieldsAndBoosting; this.dmqTieBreakerMultiplier = dmqTieBreakerMultiplier; this.normalizeBooleanQueryBoost = normalizeBooleanQueryBoost; this.multiMatchTieBreakerMultiplier = multiMatchTieBreakerMultiplier; this.termQueryBuilder = termQueryBuilder; termSubQueryBuilder = new TermSubQueryBuilder(analyzer, termQueryCache); this.rawQueryParser = rawQueryParser; } public void reset() { clauseStack.clear(); dmqStack.clear(); useBooleanQueryForDMQ = false; parentType = ParentType.BQ; } public Query createQuery(final querqy.model.Query query, final boolean useBooleanQueryForDMQ) { boolean tmp = this.useBooleanQueryForDMQ; try { this.useBooleanQueryForDMQ = useBooleanQueryForDMQ; return createQuery(query); } finally { this.useBooleanQueryForDMQ = tmp; } } public Query createQuery(final QuerqyQuery query) { if (query instanceof querqy.model.BooleanQuery) { parentType = ParentType.BQ; final LuceneQueryFactory origFactory = query.accept(this); final LuceneQueryFactory factory = mustAddMultiMatchDmq ? new MultiMatchDismaxQueryStructurePostProcessor(dmqTieBreakerMultiplier, multiMatchTieBreakerMultiplier).process(origFactory) : origFactory; termQueryBuilder.getDocumentFrequencyCorrection() .ifPresent(dfc -> factory.prepareDocumentFrequencyCorrection(dfc, false)); return factory.createQuery(null, termQueryBuilder); } else if (query instanceof MatchAllQuery) { return new MatchAllDocsQuery(); } else if (query instanceof RawQuery) { return rawQueryParser.apply((RawQuery) query); } else { throw new IllegalArgumentException("Cannot handle query of type " + query.getClass().getName()); } } @Override public LuceneQueryFactory visit(final querqy.model.Query query) { return visit((BooleanQuery) query); } @Override public LuceneQueryFactory visit(final BooleanQuery booleanQuery) { BooleanQueryFactory bq = new BooleanQueryFactory(normalizeBooleanQueryBoost && parentType == ParentType.DMQ); ParentType myParentType = parentType; parentType = ParentType.BQ; clauseStack.add(bq); super.visit(booleanQuery); clauseStack.removeLast(); parentType = myParentType; final Clause result; switch (bq.getNumberOfClauses()) { case 0: // no sub-query - this can happen if analysis filters out all tokens (stopwords) return new NeverMatchQueryFactory(); case 1: final Clause firstClause = bq.getFirstClause(); if (firstClause.occur == Occur.SHOULD) { // optimise and propagate the single clause up one level, but only // if occur equals neither MUST nor MUST_NOT, which would be lost on the // top level query result = bq.getFirstClause(); } else { result = new Clause(bq, occur(booleanQuery.occur)); } break; default: result = new Clause(bq, occur(booleanQuery.occur)); } switch (parentType) { case BQ: if (!clauseStack.isEmpty()) { clauseStack.getLast().add(result); return bq; } else {// else we are the top BQ return result.queryFactory; } case DMQ: if (result.occur != Occur.SHOULD) { // create a wrapper query final BooleanQueryFactory wrapper = new BooleanQueryFactory(false); wrapper.add(result); bq = wrapper; } dmqStack.getLast().add(bq); return bq; default: throw new RuntimeException("Unknown parentType " + parentType); } } protected Occur occur(final querqy.model.SubQuery.Occur occur) { switch (occur) { case MUST: return Occur.MUST; case MUST_NOT: return Occur.MUST_NOT; case SHOULD: return Occur.SHOULD; } throw new IllegalArgumentException("Cannot handle occur value: " + occur.name()); } @Override public LuceneQueryFactory visit(final DisjunctionMaxQuery disjunctionMaxQuery) { final ParentType myParentType = parentType; parentType = ParentType.DMQ; final DisjunctionMaxQueryFactory dmq = new DisjunctionMaxQueryFactory(dmqTieBreakerMultiplier); dmqStack.add(dmq); super.visit(disjunctionMaxQuery); dmqStack.removeLast(); parentType = myParentType; switch (dmq.getNumberOfDisjuncts()) { case 0: // no sub-query - this can happen if analysis filters out all tokens (stopwords) return new NeverMatchQueryFactory(); case 1: final LuceneQueryFactory firstDisjunct = dmq.getFirstDisjunct(); clauseStack.getLast().add(firstDisjunct, occur(disjunctionMaxQuery.occur)); return firstDisjunct; default: // FIXME: we can decide this earlier --> avoid creating DMQ in case of // MUST_NOT final boolean useBQ = this.useBooleanQueryForDMQ || (disjunctionMaxQuery.occur == querqy.model.SubQuery.Occur.MUST_NOT); if (useBQ) { // FIXME: correct to normalize boost? final BooleanQueryFactory bq = new BooleanQueryFactory(false); for (final LuceneQueryFactory queryFactory : dmq.disjuncts) { bq.add(queryFactory, Occur.SHOULD); } clauseStack.getLast().add(bq, occur(disjunctionMaxQuery.occur)); return bq; } clauseStack.getLast().add(dmq, occur(disjunctionMaxQuery.occur)); return dmq; } } @Override public LuceneQueryFactory visit(final Term term) { final DisjunctionMaxQueryFactory siblings = dmqStack.getLast(); final String fieldname = term.getField(); Term termToUse = null; try { FieldBoost fieldBoost = searchFieldsAndBoosting.getFieldBoost(term); if (fieldBoost == null) { if (fieldname != null && !term.isGenerated() && !searchFieldsAndBoosting.hasSearchField(fieldname, term)) { // someone searches in a field that is not set as a search field or didn't intend to search in a field at all // --> set value to fieldname + ":" + value in search in all fields final Term termWithFieldInValue = new Term(null, new CompoundCharSequence(":", fieldname, term.getValue())); fieldBoost = searchFieldsAndBoosting.getFieldBoost(termWithFieldInValue); if (fieldBoost != null) { termToUse = termWithFieldInValue; } } } else { termToUse = term; } if (fieldBoost == null) { // TODO: move to else clause of inner if above throw new RuntimeException("Could not get FieldBoost for term: " + term); } // check for field boost override in BoostedTerm if (termToUse instanceof BoostedTerm) { fieldBoost = new BoostedDelegatingFieldBoost(fieldBoost, ((BoostedTerm) termToUse).getBoost()); } for (final String searchField: searchFieldsAndBoosting.getSearchFields(termToUse)) { addTerm(searchField, fieldBoost, siblings, termToUse); } } catch (final IOException e) { // REVISIT: throw more specific exception? // - or save exception in Builder and then throw IOException from // build() throw new RuntimeException(e); } return null; } /** * *

* Applies analysis to a term and adds the result to the Lucene query factory * tree. *

* *

* The analysis might emit multiple tokens for the input term. If these * tokens constitute a sequence (according to the position attribute), a * BooleanQuery will be created and each position in the sequence constitutes * a MUST clause of this BooleanQuery. If multiple tokens occur at the same * position, a DismaxQuery will be created in this position and the tokens * constitute its disjuncts. The tiebreak factor will be set to the * dmqTieBreakerMultiplier property of this LuceneQueryBuilder. *

* * * @param fieldname * @param boost * @param target * @param sourceTerm * @throws IOException */ void addTerm(final String fieldname, final FieldBoost boost, final DisjunctionMaxQueryFactory target, final Term sourceTerm) throws IOException { final TermSubQueryFactory queryFactory = termSubQueryBuilder.termToFactory(fieldname, sourceTerm, boost); if (queryFactory != null) { target.add(queryFactory); boost.registerTermSubQuery(queryFactory); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy