All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.queries.BlendedTermQuery Maven / Gradle / Ivy

There is a newer version: 8.15.1
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.queries;

import com.google.common.primitives.Ints;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.ArrayUtil;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Set;

/**
 * BlendedTermQuery can be used to unify term statistics across
 * one or more fields in the index. A common problem with structured
 * documents is that a term that is significant in one field might not be
 * significant in other fields, like in a scenario where documents represent
 * users with a "first_name" and a "second_name". When someone searches
 * for "simon" it will very likely get "paul simon" first since "simon" is
 * an uncommon last name, i.e. has a low document frequency. This query
 * tries to "lie" about the global statistics like document frequency as well
 * as total term frequency to rank based on the estimated statistics.
 * 

 * <p>
 * While aggregating the total term frequency is trivial since it
 * can be summed up, not every {@link org.apache.lucene.search.similarities.Similarity}
 * makes use of this statistic. The document frequency, which is used in the
 * {@link org.apache.lucene.search.similarities.DefaultSimilarity},
 * can only be estimated as a lower bound since it is a document-based statistic. For
 * the document frequency the maximum frequency across all fields per term is used,
 * which is the minimum number of documents the term occurs in.
 * </p>

*/ // TODO maybe contribute to Lucene public abstract class BlendedTermQuery extends Query { private final Term[] terms; public BlendedTermQuery(Term[] terms) { if (terms == null || terms.length == 0) { throw new IllegalArgumentException("terms must not be null or empty"); } this.terms = terms; } @Override public Query rewrite(IndexReader reader) throws IOException { IndexReaderContext context = reader.getContext(); TermContext[] ctx = new TermContext[terms.length]; int[] docFreqs = new int[ctx.length]; for (int i = 0; i < terms.length; i++) { ctx[i] = TermContext.build(context, terms[i]); docFreqs[i] = ctx[i].docFreq(); } final int maxDoc = reader.maxDoc(); blend(ctx, maxDoc, reader); Query query = topLevelQuery(terms, ctx, docFreqs, maxDoc); query.setBoost(getBoost()); return query; } protected abstract Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc); protected void blend(TermContext[] contexts, int maxDoc, IndexReader reader) throws IOException { if (contexts.length <= 1) { return; } int max = 0; long minSumTTF = Long.MAX_VALUE; for (int i = 0; i < contexts.length; i++) { TermContext ctx = contexts[i]; int df = ctx.docFreq(); // we use the max here since it's the only "true" estimation we can make here // at least max(df) documents have that term. Sum or Averages don't seem // to have a significant meaning here. // TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have: // df = df1 + df2 - (df1 * df2 / maxDoc)? max = Math.max(df, max); if (minSumTTF != -1 && ctx.totalTermFreq() != -1) { // we need to find out the minimum sumTTF to adjust the statistics // otherwise the statistics don't match minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field())); } else { minSumTTF = -1; } } if (minSumTTF != -1 && maxDoc > minSumTTF) { maxDoc = (int)minSumTTF; } if (max == 0) { return; // we are done that term doesn't exist at all } long sumTTF = minSumTTF == -1 ? 
-1 : 0; final TermContext[] tieBreak = new TermContext[contexts.length]; System.arraycopy(contexts, 0, tieBreak, 0, contexts.length); ArrayUtil.timSort(tieBreak, new Comparator() { @Override public int compare(TermContext o1, TermContext o2) { return Ints.compare(o2.docFreq(), o1.docFreq()); } }); int prev = tieBreak[0].docFreq(); int actualDf = Math.min(maxDoc, max); assert actualDf >=0 : "DF must be >= 0"; // here we try to add a little bias towards // the more popular (more frequent) fields // that acts as a tie breaker for (TermContext ctx : tieBreak) { if (ctx.docFreq() == 0) { break; } final int current = ctx.docFreq(); if (prev > current) { actualDf++; } ctx.setDocFreq(Math.min(maxDoc, actualDf)); prev = current; if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) { sumTTF += ctx.totalTermFreq(); } else { sumTTF = -1; // omit once TF is omitted anywhere! } } sumTTF = Math.min(sumTTF, minSumTTF); for (int i = 0; i < contexts.length; i++) { int df = contexts[i].docFreq(); if (df == 0) { continue; } // the blended sumTTF can't be greater than the sumTTTF on the field final long fixedTTF = sumTTF == -1 ? 
-1 : sumTTF; contexts[i] = adjustTTF(contexts[i], fixedTTF); } } private TermContext adjustTTF(TermContext termContext, long sumTTF) { if (sumTTF == -1 && termContext.totalTermFreq() == -1) { return termContext; } TermContext newTermContext = new TermContext(termContext.topReaderContext); List leaves = termContext.topReaderContext.leaves(); final int len; if (leaves == null) { len = 1; } else { len = leaves.size(); } int df = termContext.docFreq(); long ttf = sumTTF; for (int i = 0; i < len; i++) { TermState termState = termContext.get(i); if (termState == null) { continue; } newTermContext.register(termState, i, df, ttf); df = 0; ttf = 0; } return newTermContext; } @Override public String toString(String field) { return "blended(terms: " + Arrays.toString(terms) + ")"; } @Override public void extractTerms(Set terms) { for (Term term : this.terms) { terms.add(term); } } private volatile Term[] equalTerms = null; private Term[] equalsTerms() { if (terms.length == 1) { return terms; } if (equalTerms == null) { // sort the terms to make sure equals and hashCode are consistent // this should be a very small cost and equivalent to a HashSet but less object creation final Term[] t = new Term[terms.length]; System.arraycopy(terms, 0, t, 0, terms.length); ArrayUtil.timSort(t); equalTerms = t; } return equalTerms; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; BlendedTermQuery that = (BlendedTermQuery) o; if (!Arrays.equals(equalsTerms(), that.equalsTerms())) return false; return true; } @Override public int hashCode() { int result = super.hashCode(); result = 31 * result + Arrays.hashCode(equalsTerms()); return result; } public static BlendedTermQuery booleanBlendedQuery(Term[] terms, final boolean disableCoord) { return booleanBlendedQuery(terms, null, disableCoord); } public static BlendedTermQuery booleanBlendedQuery(Term[] terms, final float[] 
boosts, final boolean disableCoord) { return new BlendedTermQuery(terms) { protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { BooleanQuery query = new BooleanQuery(disableCoord); for (int i = 0; i < terms.length; i++) { TermQuery termQuery = new TermQuery(terms[i], ctx[i]); if (boosts != null) { termQuery.setBoost(boosts[i]); } query.add(termQuery, BooleanClause.Occur.SHOULD); } return query; } }; } public static BlendedTermQuery commonTermsBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord, final float maxTermFrequency) { return new BlendedTermQuery(terms) { protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { BooleanQuery query = new BooleanQuery(true); BooleanQuery high = new BooleanQuery(disableCoord); BooleanQuery low = new BooleanQuery(disableCoord); for (int i = 0; i < terms.length; i++) { TermQuery termQuery = new TermQuery(terms[i], ctx[i]); if (boosts != null) { termQuery.setBoost(boosts[i]); } if ((maxTermFrequency >= 1f && docFreqs[i] > maxTermFrequency) || (docFreqs[i] > (int) Math.ceil(maxTermFrequency * (float) maxDoc))) { high.add(termQuery, BooleanClause.Occur.SHOULD); } else { low.add(termQuery, BooleanClause.Occur.SHOULD); } } if (low.clauses().isEmpty()) { for (BooleanClause booleanClause : high) { booleanClause.setOccur(BooleanClause.Occur.MUST); } return high; } else if (high.clauses().isEmpty()) { return low; } else { query.add(high, BooleanClause.Occur.SHOULD); query.add(low, BooleanClause.Occur.MUST); return query; } } }; } public static BlendedTermQuery dismaxBlendedQuery(Term[] terms, final float tieBreakerMultiplier) { return dismaxBlendedQuery(terms, null, tieBreakerMultiplier); } public static BlendedTermQuery dismaxBlendedQuery(Term[] terms, final float[] boosts, final float tieBreakerMultiplier) { return new BlendedTermQuery(terms) { protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) { 
DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier); for (int i = 0; i < terms.length; i++) { TermQuery termQuery = new TermQuery(terms[i], ctx[i]); if (boosts != null) { termQuery.setBoost(boosts[i]); } query.add(termQuery); } return query; } }; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy