/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.ExactPhraseMatcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseMatcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PhraseWeight;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SloppyPhraseMatcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.mutable.MutableValueBool;
/**
 * A generalized version of {@link PhraseQuery}, built with one or more {@link MultiTermQuery} that
 * provides term expansions for multi-terms (one of the expanded terms must match).
 *
 * <p>Its main advantage is to control the total number of expansions across all {@link
 * MultiTermQuery} and across all segments.
 *
 * <p>Use the {@link Builder} to build a {@link PhraseWildcardQuery}.
 *
 * <p>This query is similar to {@link MultiPhraseQuery}, but it handles, controls and optimizes the
 * multi-term expansions.
 *
 * <p>This query is equivalent to building an ordered {@link
 * org.apache.lucene.queries.spans.SpanNearQuery} with a list of {@link
 * org.apache.lucene.queries.spans.SpanTermQuery} and {@link
 * org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper}, but it optimizes the multi-term
 * expansions and the segment accesses. It first resolves the single terms, stopping early if any
 * of them does not match. Then it expands each multi-term sequentially, stopping immediately if
 * one does not match. It detects the segments that do not match and skips them for the next
 * expansions; this often avoids expanding the other multi-terms on some or even all segments.
 * Finally, it controls the total number of expansions.
 *
 * <p>Immutable.
 *
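 * <p>A minimal usage sketch (illustrative only; the field name, expansion limit, and wildcard
 * pattern below are assumptions, not part of this class):
 *
 * <pre class="prettyprint">
 * PhraseWildcardQuery query =
 *     new PhraseWildcardQuery.Builder("body", 128)
 *         .addTerm(new BytesRef("hello"))
 *         .addMultiTerm(new WildcardQuery(new Term("body", "wor*")))
 *         .build();
 * </pre>
 *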
* @lucene.experimental
*/
public class PhraseWildcardQuery extends Query {
protected static final Query NO_MATCH_QUERY =
new MatchNoDocsQuery("Empty " + PhraseWildcardQuery.class.getSimpleName());
protected final String field;
  protected final List<PhraseTerm> phraseTerms;
protected final int slop;
protected final int maxMultiTermExpansions;
protected final boolean segmentOptimizationEnabled;
protected PhraseWildcardQuery(
String field,
      List<PhraseTerm> phraseTerms,
int slop,
int maxMultiTermExpansions,
boolean segmentOptimizationEnabled) {
this.field = field;
this.phraseTerms = phraseTerms;
this.slop = slop;
this.maxMultiTermExpansions = maxMultiTermExpansions;
this.segmentOptimizationEnabled = segmentOptimizationEnabled;
}
public String getField() {
return field;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
if (phraseTerms.isEmpty()) {
return NO_MATCH_QUERY;
}
if (phraseTerms.size() == 1) {
return phraseTerms.get(0).getQuery();
}
return super.rewrite(reader);
}
@Override
public void visit(QueryVisitor visitor) {
if (!visitor.acceptField(field)) {
return;
}
QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
for (PhraseTerm phraseTerm : phraseTerms) {
phraseTerm.getQuery().visit(v);
}
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
IndexReader reader = searcher.getIndexReader();
    // Build a list of segments ordered by terms size (number of terms).
    // The first segments to be searched are the smaller ones, which by design
    // tend to contain the most recent documents. Any segment in this list may
    // also be removed by the PhraseTerm.collectTermData() calls below if one
    // of the phrase terms does not match in the segment. This allows multi-term
    // expansion to stop early on removed segments.
    // Additionally, there is a global multi-term expansion limit across all
    // multi-terms and all segments, so it is important to start with the
    // smallest segments to give unused expansion credits back to the next
    // multi-terms; this is more likely with small segments.
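    // Illustrative arithmetic (not in the original comments): with
    // maxMultiTermExpansions = 100 and two multi-terms, the first multi-term may
    // expand up to 100 / 2 = 50 terms; if it actually expands only 10, the second
    // may then expand up to (100 - 10) / 1 = 90 terms.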
    List<LeafReaderContext> sizeSortedSegments =
new SegmentTermsSizeComparator().createTermsSizeSortedCopyOf(reader.leaves());
// TermsData will contain the collected TermState and TermStatistics for all the terms
// of the phrase. It is filled during PhraseTerm.collectTermData() calls below.
TermsData termsData = createTermsData(sizeSortedSegments.size());
// Iterate the phrase terms, and collect the TermState for single-terms.
// - Early stop if a single term does not match.
int numMultiTerms = 0;
for (PhraseTerm phraseTerm : phraseTerms) {
if (phraseTerm.hasExpansions()) {
numMultiTerms++;
} else {
assert TestCounters.get().incSingleTermAnalysisCount();
int numMatches = phraseTerm.collectTermData(this, searcher, sizeSortedSegments, termsData);
if (numMatches == 0) {
// Early stop here because the single term does not match in any segment.
// So the whole phrase query cannot match.
return earlyStopWeight();
}
}
}
// Iterate the phrase terms and collect the TermState for multi-terms.
// - Early stop if a multi-term does not match.
// - Expand the multi-terms only when required.
int remainingExpansions = maxMultiTermExpansions;
int remainingMultiTerms = numMultiTerms;
for (PhraseTerm phraseTerm : phraseTerms) {
if (phraseTerm.hasExpansions()) {
assert TestCounters.get().incMultiTermAnalysisCount();
assert remainingExpansions >= 0 && remainingExpansions <= maxMultiTermExpansions;
assert remainingMultiTerms > 0;
// Consider the remaining expansions allowed for all remaining multi-terms.
// Divide it evenly to get the expansion limit for the current multi-term.
int maxExpansionsForTerm = remainingExpansions / remainingMultiTerms;
int numExpansions =
phraseTerm.collectTermData(
this,
searcher,
sizeSortedSegments,
remainingMultiTerms,
maxExpansionsForTerm,
termsData);
assert numExpansions >= 0 && numExpansions <= maxExpansionsForTerm;
if (numExpansions == 0) {
// Early stop here because the multi-term does not match in any segment.
// So the whole phrase query cannot match.
return earlyStopWeight();
}
// Deduct the effectively used expansions. This may give more expansion
// credits to the next multi-terms.
remainingExpansions -= numExpansions;
remainingMultiTerms--;
}
}
assert remainingMultiTerms == 0;
assert remainingExpansions >= 0;
// TestCounters.get().printTestCounters(termsData);
return termsData.areAllTermsMatching()
? createPhraseWeight(searcher, scoreMode, boost, termsData)
: noMatchWeight();
}
  /** Creates a new {@link TermsData}. */
protected TermsData createTermsData(int numSegments) {
return new TermsData(phraseTerms.size(), numSegments);
}
protected Weight earlyStopWeight() {
assert TestCounters.get().incQueryEarlyStopCount();
return noMatchWeight();
}
protected Weight noMatchWeight() {
return new ConstantScoreWeight(this, 0) {
@Override
public Scorer scorer(LeafReaderContext leafReaderContext) {
return null;
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
};
}
PhraseWeight createPhraseWeight(
IndexSearcher searcher, ScoreMode scoreMode, float boost, TermsData termsData)
throws IOException {
return new PhraseWeight(this, field, searcher, scoreMode) {
@Override
protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
if (termsData.termStatsList.isEmpty()) {
return null;
}
return searcher
.getSimilarity()
.scorer(
boost,
searcher.collectionStatistics(field),
termsData.termStatsList.toArray(new TermStatistics[0]));
}
@Override
protected PhraseMatcher getPhraseMatcher(
LeafReaderContext leafReaderContext, Similarity.SimScorer scorer, boolean exposeOffsets)
throws IOException {
Terms fieldTerms = leafReaderContext.reader().terms(field);
if (fieldTerms == null) {
return null;
}
TermsEnum termsEnum = fieldTerms.iterator();
float totalMatchCost = 0;
PhraseQuery.PostingsAndFreq[] postingsFreqs =
new PhraseQuery.PostingsAndFreq[phraseTerms.size()];
for (int termPosition = 0; termPosition < postingsFreqs.length; termPosition++) {
TermData termData = termsData.getTermData(termPosition);
assert termData != null;
          List<TermBytesTermState> termStates = termData.getTermStatesForSegment(leafReaderContext);
          if (termStates == null) {
            // The current phrase term does not match in this segment, so the whole phrase
            // cannot match in it either. Stop early by returning a null matcher.
            return null;
}
assert !termStates.isEmpty();
          List<PostingsEnum> postingsEnums = new ArrayList<>(termStates.size());
for (TermBytesTermState termBytesTermState : termStates) {
termsEnum.seekExact(termBytesTermState.termBytes, termBytesTermState.termState);
postingsEnums.add(
termsEnum.postings(
null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
}
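          // The expanded terms at this phrase position are OR-ed together: a single union
          // postings enum iterates the disjunction of their postings, as MultiPhraseQuery does.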
PostingsEnum unionPostingsEnum;
if (postingsEnums.size() == 1) {
unionPostingsEnum = postingsEnums.get(0);
} else {
unionPostingsEnum =
exposeOffsets
? new MultiPhraseQuery.UnionFullPostingsEnum(postingsEnums)
: new MultiPhraseQuery.UnionPostingsEnum(postingsEnums);
}
postingsFreqs[termPosition] =
new PhraseQuery.PostingsAndFreq(
unionPostingsEnum,
new SlowImpactsEnum(unionPostingsEnum),
termPosition,
termData.terms);
}
if (slop == 0) {
// Sort by increasing docFreq order.
ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
} else {
return new SloppyPhraseMatcher(
postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
}
}
};
}
@Override
public boolean equals(Object o) {
if (!(o instanceof PhraseWildcardQuery)) {
return false;
}
PhraseWildcardQuery pwq = (PhraseWildcardQuery) o;
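    // The field does not need to be compared explicitly: each PhraseTerm compares its
    // Term or MultiTermQuery, both of which include the field.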
return slop == pwq.slop && phraseTerms.equals(pwq.phraseTerms);
}
@Override
public int hashCode() {
return classHash() ^ slop ^ phraseTerms.hashCode();
}
@Override
public final String toString(String omittedField) {
StringBuilder builder = new StringBuilder();
builder.append("phraseWildcard(");
if (field == null || !field.equals(omittedField)) {
builder.append(field).append(':');
}
builder.append('\"');
for (int i = 0; i < phraseTerms.size(); i++) {
if (i != 0) {
builder.append(' ');
}
phraseTerms.get(i).toString(builder);
}
builder.append('\"');
if (slop != 0) {
builder.append('~');
builder.append(slop);
}
builder.append(")");
return builder.toString();
}
/**
* Collects the {@link TermState} and {@link TermStatistics} for a single-term without expansion.
*
* @param termsData receives the collected data.
*/
protected int collectSingleTermData(
SingleTerm singleTerm,
IndexSearcher searcher,
      List<LeafReaderContext> segments,
TermsData termsData)
throws IOException {
TermData termData = termsData.getOrCreateTermData(singleTerm.termPosition);
Term term = singleTerm.term;
termData.terms.add(term);
TermStates termStates = TermStates.build(searcher.getIndexReader().getContext(), term, true);
// Collect TermState per segment.
int numMatches = 0;
    Iterator<LeafReaderContext> segmentIterator = segments.iterator();
while (segmentIterator.hasNext()) {
LeafReaderContext leafReaderContext = segmentIterator.next();
assert TestCounters.get().incSegmentUseCount();
boolean termMatchesInSegment = false;
Terms terms = leafReaderContext.reader().terms(term.field());
if (terms != null) {
checkTermsHavePositions(terms);
TermState termState = termStates.get(leafReaderContext);
if (termState != null) {
termMatchesInSegment = true;
numMatches++;
termData.setTermStatesForSegment(
leafReaderContext,
Collections.singletonList(new TermBytesTermState(term.bytes(), termState)));
}
}
if (!termMatchesInSegment && shouldOptimizeSegments()) {
// Remove this segment from the list because the phrase cannot match on it.
segmentIterator.remove();
assert TestCounters.get().incSegmentSkipCount();
}
}
// Collect the term stats across all segments.
if (termStates.docFreq() > 0) {
termsData.termStatsList.add(
searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq()));
}
return numMatches;
}
/**
* Collects the {@link TermState} and {@link TermStatistics} for a multi-term with expansion.
*
* @param remainingMultiTerms the number of remaining multi-terms to process, including the
* current one, excluding the multi-terms already processed.
* @param termsData receives the collected data.
*/
protected int collectMultiTermData(
MultiTerm multiTerm,
IndexSearcher searcher,
      List<LeafReaderContext> segments,
int remainingMultiTerms, // Unused here but leveraged by extending classes.
int maxExpansionsForTerm,
TermsData termsData)
throws IOException {
TermData termData = termsData.getOrCreateTermData(multiTerm.termPosition);
    Map<BytesRef, TermStats> termStatsMap = createTermStatsMap(multiTerm);
int numExpansions = 0;
    Iterator<LeafReaderContext> segmentIterator = segments.iterator();
MutableValueBool shouldStopSegmentIteration = new MutableValueBool();
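    // shouldStopSegmentIteration is set to true by collectMultiTermDataForSegment() when
    // this multi-term's expansion budget is exhausted, which stops iterating segments.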
while (segmentIterator.hasNext() && !shouldStopSegmentIteration.value) {
LeafReaderContext leafReaderContext = segmentIterator.next();
int remainingExpansions = maxExpansionsForTerm - numExpansions;
assert remainingExpansions >= 0;
      List<TermBytesTermState> termStates =
collectMultiTermDataForSegment(
multiTerm,
leafReaderContext,
remainingExpansions,
shouldStopSegmentIteration,
termStatsMap);
if (!termStates.isEmpty()) {
assert termStates.size() <= remainingExpansions;
numExpansions += termStates.size();
assert numExpansions <= maxExpansionsForTerm;
termData.setTermStatesForSegment(leafReaderContext, termStates);
} else if (shouldOptimizeSegments()) {
// Remove this segment from the list because the phrase cannot match on it.
segmentIterator.remove();
assert TestCounters.get().incSegmentSkipCount();
}
}
// Collect the term stats across all segments.
collectMultiTermStats(searcher, termStatsMap, termsData, termData);
return numExpansions;
}
protected boolean shouldOptimizeSegments() {
return segmentOptimizationEnabled;
}
/** Creates a {@link TermStats} map for a {@link MultiTerm}. */
  protected Map<BytesRef, TermStats> createTermStatsMap(
MultiTerm multiTerm) { // multiTerm param can be used by sub-classes.
return new HashMap<>();
}
/**
* Collects the {@link TermState} list and {@link TermStatistics} for a multi-term on a specific
* index segment.
*
* @param remainingExpansions the number of remaining expansions allowed for the segment.
   * @param shouldStopSegmentIteration set to true to stop the segment iteration that calls this
   *     method repeatedly.
* @param termStatsMap receives the collected {@link TermStats} across all segments.
*/
  protected List<TermBytesTermState> collectMultiTermDataForSegment(
MultiTerm multiTerm,
LeafReaderContext leafReaderContext,
int remainingExpansions,
MutableValueBool shouldStopSegmentIteration,
      Map<BytesRef, TermStats> termStatsMap)
throws IOException {
TermsEnum termsEnum = createTermsEnum(multiTerm, leafReaderContext);
if (termsEnum == null) {
return Collections.emptyList();
}
assert TestCounters.get().incSegmentUseCount();
    List<TermBytesTermState> termStates = new ArrayList<>();
while (termsEnum.next() != null && remainingExpansions > 0) {
// Collect term stats for the segment.
TermStats termStats = termStatsMap.get(termsEnum.term());
if (termStats == null) {
BytesRef termBytes = BytesRef.deepCopyOf(termsEnum.term());
termStats = new TermStats(termBytes);
termStatsMap.put(termBytes, termStats);
}
// Accumulate stats the same way TermStates.accumulateStatistics() does.
// Sum the stats per term for all segments the same way TermStates.build() does.
termStats.addStats(termsEnum.docFreq(), termsEnum.totalTermFreq());
// Collect TermState per segment.
termStates.add(new TermBytesTermState(termStats.termBytes, termsEnum.termState()));
remainingExpansions--;
assert TestCounters.get().incExpansionCount();
}
assert remainingExpansions >= 0;
shouldStopSegmentIteration.value = remainingExpansions == 0;
return termStates;
}
/**
* Creates the {@link TermsEnum} for the given {@link MultiTerm} and segment.
*
* @return null if there is no term for this query field in the segment.
*/
protected TermsEnum createTermsEnum(MultiTerm multiTerm, LeafReaderContext leafReaderContext)
throws IOException {
Terms terms = leafReaderContext.reader().terms(field);
if (terms == null) {
return null;
}
checkTermsHavePositions(terms);
TermsEnum termsEnum = multiTerm.query.getTermsEnum(terms);
assert termsEnum != null;
return termsEnum;
}
/**
* Collect the term stats across all segments.
*
* @param termStatsMap input map of already collected {@link TermStats}.
* @param termsData receives the {@link TermStatistics} computed for all {@link TermStats}.
* @param termData receives all the collected {@link Term}.
*/
protected void collectMultiTermStats(
IndexSearcher searcher,
      Map<BytesRef, TermStats> termStatsMap,
TermsData termsData,
TermData termData)
throws IOException {
// Collect term stats across all segments.
// Collect stats the same way MultiPhraseQuery.MultiPhraseWeight constructor does, for all terms
// and all segments.
    for (Map.Entry<BytesRef, TermStats> termStatsEntry : termStatsMap.entrySet()) {
Term term = new Term(field, termStatsEntry.getKey());
termData.terms.add(term);
TermStats termStats = termStatsEntry.getValue();
if (termStats.docFreq > 0) {
termsData.termStatsList.add(
searcher.termStatistics(term, termStats.docFreq, termStats.totalTermFreq));
}
}
}
protected void checkTermsHavePositions(Terms terms) {
if (!terms.hasPositions()) {
throw new IllegalStateException(
"field \""
+ field
+ "\" was indexed without position data;"
+ " cannot run "
+ PhraseWildcardQuery.class.getSimpleName());
}
}
/** Builds a {@link PhraseWildcardQuery}. */
public static class Builder {
protected final String field;
    protected final List<PhraseTerm> phraseTerms;
protected int slop;
protected final int maxMultiTermExpansions;
protected final boolean segmentOptimizationEnabled;
/**
* @param field The query field.
     * @param maxMultiTermExpansions The maximum number of expansions across all multi-terms and
     *     across all segments. Expansions are counted for each segment individually, which allows
     *     per-segment optimizations; unused expansions are credited to the next segments. This is
     *     different from {@link MultiPhraseQuery} and {@link
     *     org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper}, which have an expansion
     *     limit per multi-term.
*/
public Builder(String field, int maxMultiTermExpansions) {
this(field, maxMultiTermExpansions, true);
}
/**
* @param field The query field.
     * @param maxMultiTermExpansions The maximum number of expansions across all multi-terms and
     *     across all segments. Expansions are counted for each segment individually, which allows
     *     per-segment optimizations; unused expansions are credited to the next segments. This is
     *     different from {@link MultiPhraseQuery} and {@link
     *     org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper}, which have an expansion
     *     limit per multi-term.
     * @param segmentOptimizationEnabled Whether to enable the segment optimization, which ignores
     *     a segment for further analysis as soon as a phrase term does not match in it. This
     *     improves query performance but changes the scoring; the result ranking is preserved.
*/
public Builder(String field, int maxMultiTermExpansions, boolean segmentOptimizationEnabled) {
this.field = field;
this.maxMultiTermExpansions = maxMultiTermExpansions;
this.segmentOptimizationEnabled = segmentOptimizationEnabled;
phraseTerms = new ArrayList<>();
}
/** Adds a single term at the next position in the phrase. */
public Builder addTerm(BytesRef termBytes) {
return addTerm(new Term(field, termBytes));
}
/** Adds a single term at the next position in the phrase. */
public Builder addTerm(Term term) {
if (!term.field().equals(field)) {
throw new IllegalArgumentException(
term.getClass().getSimpleName()
+ " field \""
+ term.field()
+ "\" cannot be different from the "
+ PhraseWildcardQuery.class.getSimpleName()
+ " field \""
+ field
+ "\"");
}
phraseTerms.add(new SingleTerm(term, phraseTerms.size()));
return this;
}
/**
* Adds a multi-term at the next position in the phrase. Any of the terms returned by the
* provided {@link MultiTermQuery} enumeration may match (expansion as a disjunction).
*/
public Builder addMultiTerm(MultiTermQuery multiTermQuery) {
if (!multiTermQuery.getField().equals(field)) {
throw new IllegalArgumentException(
multiTermQuery.getClass().getSimpleName()
+ " field \""
+ multiTermQuery.getField()
+ "\" cannot be different from the "
+ PhraseWildcardQuery.class.getSimpleName()
+ " field \""
+ field
+ "\"");
}
phraseTerms.add(new MultiTerm(multiTermQuery, phraseTerms.size()));
return this;
}
/** Sets the phrase slop. */
public Builder setSlop(int slop) {
if (slop < 0) {
throw new IllegalArgumentException("slop value cannot be negative");
}
this.slop = slop;
return this;
}
/** Builds a {@link PhraseWildcardQuery}. */
public PhraseWildcardQuery build() {
return new PhraseWildcardQuery(
field, phraseTerms, slop, maxMultiTermExpansions, segmentOptimizationEnabled);
}
}
/**
   * All {@link PhraseTerm} are lightweight and immutable. They do not hold query processing data
   * such as {@link TermsData}. That way, the {@link PhraseWildcardQuery} is itself lightweight and
   * immutable, and can safely be used as a key in the query cache.
*/
protected abstract static class PhraseTerm {
protected final int termPosition;
protected PhraseTerm(int termPosition) {
this.termPosition = termPosition;
}
protected abstract boolean hasExpansions();
protected abstract Query getQuery();
/**
* Collects {@link TermState} and {@link TermStatistics} for the term without expansion. It must
     * be called only if {@link #hasExpansions()} returns false. Simplified version of {@link
     * #collectTermData(PhraseWildcardQuery, IndexSearcher, List, int, int, TermsData)} with fewer
     * arguments. This method throws {@link UnsupportedOperationException} if not overridden.
*/
protected int collectTermData(
PhraseWildcardQuery query,
IndexSearcher searcher,
        List<LeafReaderContext> segments,
TermsData termsData)
throws IOException {
throw new UnsupportedOperationException();
}
/**
* Collects {@link TermState} and {@link TermStatistics} for the term (potentially expanded).
*
* @param termsData {@link TermsData} to update with the collected terms and stats.
* @return The number of expansions or matches in all segments; or 0 if this term does not match
* in any segment, in this case the phrase query can immediately stop.
*/
protected abstract int collectTermData(
PhraseWildcardQuery query,
IndexSearcher searcher,
        List<LeafReaderContext> segments,
int remainingMultiTerms,
int maxExpansionsForTerm,
TermsData termsData)
throws IOException;
protected abstract void toString(StringBuilder builder);
@Override
public abstract boolean equals(Object o);
@Override
public abstract int hashCode();
}
/** Phrase term with no expansion. */
protected static class SingleTerm extends PhraseTerm {
protected final Term term;
protected SingleTerm(Term term, int termPosition) {
super(termPosition);
this.term = term;
}
@Override
protected boolean hasExpansions() {
return false;
}
@Override
protected Query getQuery() {
return new TermQuery(term);
}
@Override
protected int collectTermData(
PhraseWildcardQuery query,
IndexSearcher searcher,
        List<LeafReaderContext> segments,
TermsData termsData)
throws IOException {
return collectTermData(query, searcher, segments, 0, 0, termsData);
}
@Override
protected int collectTermData(
PhraseWildcardQuery query,
IndexSearcher searcher,
        List<LeafReaderContext> segments,
int remainingMultiTerms,
int maxExpansionsForTerm,
TermsData termsData)
throws IOException {
return query.collectSingleTermData(this, searcher, segments, termsData);
}
@Override
protected void toString(StringBuilder builder) {
builder.append(term.text());
}
@Override
public boolean equals(Object o) {
if (!(o instanceof SingleTerm)) {
return false;
}
SingleTerm singleTerm = (SingleTerm) o;
return term.equals(singleTerm.term);
}
@Override
public int hashCode() {
return term.hashCode();
}
}
/** Phrase term with expansions. */
protected static class MultiTerm extends PhraseTerm {
protected final MultiTermQuery query;
protected MultiTerm(MultiTermQuery query, int termPosition) {
super(termPosition);
this.query = query;
}
@Override
protected boolean hasExpansions() {
return true;
}
@Override
protected Query getQuery() {
return query;
}
@Override
protected int collectTermData(
PhraseWildcardQuery query,
IndexSearcher searcher,
        List<LeafReaderContext> segments,
int remainingMultiTerms,
int maxExpansionsForTerm,
TermsData termsData)
throws IOException {
return query.collectMultiTermData(
this, searcher, segments, remainingMultiTerms, maxExpansionsForTerm, termsData);
}
@Override
protected void toString(StringBuilder builder) {
builder.append(query.toString(query.getField()));
}
@Override
public boolean equals(Object o) {
if (!(o instanceof MultiTerm)) {
return false;
}
MultiTerm multiTerm = (MultiTerm) o;
return query.equals(multiTerm.query);
}
@Override
public int hashCode() {
return query.hashCode();
}
}
/**
* Holds the {@link TermState} and {@link TermStatistics} for all the matched and collected {@link
* Term}, for all phrase terms, for all segments.
*/
protected static class TermsData {
protected final int numTerms;
protected final int numSegments;
    protected final List<TermStatistics> termStatsList;
protected final TermData[] termDataPerPosition;
protected int numTermsMatching;
protected TermsData(int numTerms, int numSegments) {
this.numTerms = numTerms;
this.numSegments = numSegments;
termStatsList = new ArrayList<>();
termDataPerPosition = new TermData[numTerms];
}
protected TermData getOrCreateTermData(int termPosition) {
TermData termData = termDataPerPosition[termPosition];
if (termData == null) {
termData = new TermData(numSegments, this);
termDataPerPosition[termPosition] = termData;
}
return termData;
}
protected TermData getTermData(int termPosition) {
return termDataPerPosition[termPosition];
}
protected boolean areAllTermsMatching() {
assert numTermsMatching <= numTerms;
return numTermsMatching == numTerms;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("TermsData(");
builder.append("numSegments=").append(numSegments);
builder.append(", termDataPerPosition=").append(Arrays.asList(termDataPerPosition));
      builder.append(", termStatsList=[");
for (TermStatistics termStatistics : termStatsList) {
builder
.append("{")
.append(termStatistics.term().utf8ToString())
.append(", ")
.append(termStatistics.docFreq())
.append(", ")
.append(termStatistics.totalTermFreq())
.append("}");
}
builder.append("]");
builder.append(")");
return builder.toString();
}
}
/**
* Holds the {@link TermState} for all the collected {@link Term}, for a specific phrase term, for
* all segments.
*/
protected static class TermData {
protected final int numSegments;
protected final TermsData termsData;
    protected List<TermBytesTermState>[] termStatesPerSegment;
    protected final List<Term> terms;
protected TermData(int numSegments, TermsData termsData) {
this.numSegments = numSegments;
this.termsData = termsData;
terms = new ArrayList<>();
}
/** Sets the collected list of {@link TermBytesTermState} for the given segment. */
@SuppressWarnings("unchecked")
protected void setTermStatesForSegment(
        LeafReaderContext leafReaderContext, List<TermBytesTermState> termStates) {
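      // Lazy init on the first matching segment. This also counts the phrase term as
      // matching in at least one segment (see TermsData#areAllTermsMatching()).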
if (termStatesPerSegment == null) {
        termStatesPerSegment = (List<TermBytesTermState>[]) new List<?>[numSegments];
termsData.numTermsMatching++;
}
termStatesPerSegment[leafReaderContext.ord] = termStates;
}
/**
* @return The collected list of {@link TermBytesTermState} for the given segment; or null if
* this phrase term does not match in the given segment.
*/
    protected List<TermBytesTermState> getTermStatesForSegment(
LeafReaderContext leafReaderContext) {
assert termStatesPerSegment != null
: "No TermState for any segment; the query should have been stopped before";
return termStatesPerSegment[leafReaderContext.ord];
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("TermData(");
builder.append("termStates=");
if (termStatesPerSegment == null) {
builder.append("null");
} else {
builder.append(Arrays.asList(termStatesPerSegment));
}
builder.append(", terms=").append(terms);
builder.append(")");
return builder.toString();
}
}
/** Holds a pair of term bytes - term state. */
public static class TermBytesTermState {
protected final BytesRef termBytes;
protected final TermState termState;
public TermBytesTermState(BytesRef termBytes, TermState termState) {
this.termBytes = termBytes;
this.termState = termState;
}
@Override
public String toString() {
return "\"" + termBytes.utf8ToString() + "\"->" + termState;
}
}
/** Accumulates the doc freq and total term freq. */
public static class TermStats {
protected final BytesRef termBytes;
protected int docFreq;
protected long totalTermFreq;
protected TermStats(BytesRef termBytes) {
this.termBytes = termBytes;
}
public BytesRef getTermBytes() {
return termBytes;
}
protected void addStats(int docFreq, long totalTermFreq) {
this.docFreq += docFreq;
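      // A negative totalTermFreq is treated as "unavailable"; once either the accumulated
      // value or an increment is negative, the sum stays flagged as -1.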
if (this.totalTermFreq >= 0 && totalTermFreq >= 0) {
this.totalTermFreq += totalTermFreq;
} else {
this.totalTermFreq = -1;
}
}
}
/**
   * Compares segments based on the number of terms they contain.
   *
   * <p>This is used to sort segments by increasing number of terms. This way the first segment to
   * search is the smallest, so a term has the lowest probability of matching in this segment; and
   * if the term does not match, the unused expansions are credited to the next segments searched.
*/
  protected class SegmentTermsSizeComparator implements Comparator<LeafReaderContext> {
private static final String COMPARISON_ERROR_MESSAGE = "Segment comparison error";
@Override
public int compare(LeafReaderContext leafReaderContext1, LeafReaderContext leafReaderContext2) {
try {
return Long.compare(getTermsSize(leafReaderContext1), getTermsSize(leafReaderContext2));
} catch (IOException e) {
throw new RuntimeException(COMPARISON_ERROR_MESSAGE, e);
}
}
    protected List<LeafReaderContext> createTermsSizeSortedCopyOf(List<LeafReaderContext> segments)
throws IOException {
      List<LeafReaderContext> copy = new ArrayList<>(segments);
try {
copy.sort(this);
} catch (RuntimeException e) {
if (COMPARISON_ERROR_MESSAGE.equals(e.getMessage())) {
throw (IOException) e.getCause();
}
throw e;
}
return copy;
}
private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException {
Terms terms = Terms.getTerms(leafReaderContext.reader(), field);
return terms.size();
}
}
/** Test counters incremented when assertions are enabled. Used only when testing. */
protected static class TestCounters {
private static final TestCounters SINGLETON = new TestCounters();
protected long singleTermAnalysisCount;
protected long multiTermAnalysisCount;
protected long expansionCount;
protected long segmentUseCount;
protected long segmentSkipCount;
protected long queryEarlyStopCount;
protected static TestCounters get() {
return SINGLETON;
}
protected boolean incSingleTermAnalysisCount() {
singleTermAnalysisCount++;
return true;
}
protected boolean incMultiTermAnalysisCount() {
multiTermAnalysisCount++;
return true;
}
protected boolean incExpansionCount() {
expansionCount++;
return true;
}
protected boolean incSegmentUseCount() {
segmentUseCount++;
return true;
}
protected boolean incSegmentSkipCount() {
segmentSkipCount++;
return true;
}
protected boolean incQueryEarlyStopCount() {
queryEarlyStopCount++;
return true;
}
protected void clear() {
singleTermAnalysisCount = 0;
multiTermAnalysisCount = 0;
expansionCount = 0;
segmentUseCount = 0;
segmentSkipCount = 0;
queryEarlyStopCount = 0;
}
// protected void printTestCounters(TermsData termsData) {
// System.out.println("singleTermAnalysisCount=" + singleTermAnalysisCount);
// System.out.println("multiTermAnalysisCount=" + multiTermAnalysisCount);
// System.out.println("expansionCount=" + expansionCount);
// System.out.println("segmentUseCount=" + segmentUseCount);
// System.out.println("segmentSkipCount=" + segmentSkipCount);
// System.out.println("queryEarlyStopCount=" + queryEarlyStopCount);
// System.out.println(termsData);
// }
}
}