/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.BaseCompositeReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
/**
* A Highlighter that can get offsets from either postings ({@link
* IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}), term vectors ({@link
* FieldType#setStoreTermVectorOffsets(boolean)}), or via re-analyzing text.
*
 * <p>This highlighter treats the single original document as the whole corpus, and then scores
* individual passages as if they were documents in this corpus. It uses a {@link BreakIterator} to
* find passages in the text; by default it breaks using {@link
* BreakIterator#getSentenceInstance(Locale) getSentenceInstance(Locale.ROOT)}. It then iterates in
* parallel (merge sorting by offset) through the positions of all terms from the query, coalescing
* those hits that occur in a single passage into a {@link Passage}, and then scores each Passage
* using a separate {@link PassageScorer}. Passages are finally formatted into highlighted snippets
* with a {@link PassageFormatter}.
 *
 * <p>You can customize the behavior by calling some of the setters, or by subclassing and
 * overriding some methods. Some important hooks:
 *
 * <ul>
 *   <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
 *   <li>{@link #getScorer(String)}: Customize how passages are ranked.
 *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
 * </ul>
 *
 * <p>This is thread-safe, notwithstanding the setters.
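 *
 * <p>A minimal usage sketch (variable names are illustrative):
 *
 * <pre>{@code
 * UnifiedHighlighter highlighter =
 *     UnifiedHighlighter.builder(indexSearcher, indexAnalyzer).build();
 * String[] snippets = highlighter.highlight("body", query, topDocs);
 * }</pre>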
*
* @lucene.experimental
*/
public class UnifiedHighlighter {
protected static final char MULTIVAL_SEP_CHAR = (char) 0;
public static final int DEFAULT_MAX_LENGTH = 10000;
public static final int DEFAULT_CACHE_CHARS_THRESHOLD = 524288; // ~ 1 MB (2 byte chars)
protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY =
new LabelledCharArrayMatcher[0];
// All the private defaults will be removed once non-builder based UH is removed.
private static final boolean DEFAULT_ENABLE_MULTI_TERM_QUERY = true;
private static final boolean DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY = true;
private static final boolean DEFAULT_ENABLE_WEIGHT_MATCHES = true;
private static final boolean DEFAULT_ENABLE_RELEVANCY_OVER_SPEED = true;
  private static final Supplier<BreakIterator> DEFAULT_BREAK_ITERATOR =
() -> BreakIterator.getSentenceInstance(Locale.ROOT);
private static final PassageScorer DEFAULT_PASSAGE_SCORER = new PassageScorer();
private static final PassageFormatter DEFAULT_PASSAGE_FORMATTER = new DefaultPassageFormatter();
private static final int DEFAULT_MAX_HIGHLIGHT_PASSAGES = -1;
protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
protected final Analyzer indexAnalyzer;
// lazy initialized with double-check locking; protected so subclass can init
protected volatile FieldInfos fieldInfos;
  private Predicate<String> fieldMatcher;
  private Set<HighlightFlag> flags;
// e.g. wildcards
private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
// AKA "accuracy" or "query debugging"
private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
private boolean weightMatches = DEFAULT_ENABLE_WEIGHT_MATCHES;
// For analysis, prefer MemoryIndexOffsetStrategy
private boolean passageRelevancyOverSpeed = DEFAULT_ENABLE_RELEVANCY_OVER_SPEED;
private int maxLength = DEFAULT_MAX_LENGTH;
// BreakIterator is stateful so we use a Supplier factory method
  private Supplier<BreakIterator> breakIterator = DEFAULT_BREAK_ITERATOR;
private PassageScorer scorer = DEFAULT_PASSAGE_SCORER;
private PassageFormatter formatter = DEFAULT_PASSAGE_FORMATTER;
private int maxNoHighlightPassages = DEFAULT_MAX_HIGHLIGHT_PASSAGES;
private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
/**
* Constructs the highlighter with the given index searcher and analyzer.
*
* @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query,
* String, int)} is used, in which case this needs to be null.
* @param indexAnalyzer Required, even if in some circumstances it isn't used.
*/
@Deprecated
public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
this.searcher = indexSearcher; // TODO: make non nullable
this.indexAnalyzer =
Objects.requireNonNull(
indexAnalyzer,
"indexAnalyzer is required" + " (even if in some circumstances it isn't used)");
}
@Deprecated
public void setHandleMultiTermQuery(boolean handleMtq) {
this.handleMultiTermQuery = handleMtq;
}
@Deprecated
public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
this.highlightPhrasesStrictly = highlightPhrasesStrictly;
}
@Deprecated
public void setPassageRelevancyOverSpeed(boolean passageRelevancyOverSpeed) {
this.passageRelevancyOverSpeed = passageRelevancyOverSpeed;
}
@Deprecated
public void setMaxLength(int maxLength) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
}
this.maxLength = maxLength;
}
@Deprecated
  public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
this.breakIterator = breakIterator;
}
@Deprecated
public void setScorer(PassageScorer scorer) {
this.scorer = scorer;
}
@Deprecated
public void setFormatter(PassageFormatter formatter) {
this.formatter = formatter;
}
@Deprecated
public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
this.maxNoHighlightPassages = defaultMaxNoHighlightPassages;
}
@Deprecated
public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
}
@Deprecated
  public void setFieldMatcher(Predicate<String> predicate) {
this.fieldMatcher = predicate;
}
@Deprecated
public void setWeightMatches(boolean weightMatches) {
this.weightMatches = weightMatches;
}
/**
* Returns whether {@link org.apache.lucene.search.MultiTermQuery} derivatives will be
* highlighted. By default it's enabled. MTQ highlighting can be expensive, particularly when
* using offsets in postings.
*/
@Deprecated
protected boolean shouldHandleMultiTermQuery(String field) {
return handleMultiTermQuery;
}
/**
* Returns whether position sensitive queries (e.g. phrases and {@link SpanQuery}ies) should be
* highlighted strictly based on query matches (slower) versus any/all occurrences of the
* underlying terms. By default it's enabled, but there's no overhead if such queries aren't used.
*/
@Deprecated
protected boolean shouldHighlightPhrasesStrictly(String field) {
return highlightPhrasesStrictly;
}
@Deprecated
protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
return passageRelevancyOverSpeed;
}
/** Builder for UnifiedHighlighter. */
public static class Builder {
/** If null, can only use highlightWithoutSearcher. */
private final IndexSearcher searcher;
private final Analyzer indexAnalyzer;
    private Predicate<String> fieldMatcher;
    private Set<HighlightFlag> flags;
private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
private boolean passageRelevancyOverSpeed = DEFAULT_ENABLE_RELEVANCY_OVER_SPEED;
private boolean weightMatches = DEFAULT_ENABLE_WEIGHT_MATCHES;
private int maxLength = DEFAULT_MAX_LENGTH;
/** BreakIterator is stateful so we use a Supplier factory method. */
    private Supplier<BreakIterator> breakIterator = DEFAULT_BREAK_ITERATOR;
private PassageScorer scorer = DEFAULT_PASSAGE_SCORER;
private PassageFormatter formatter = DEFAULT_PASSAGE_FORMATTER;
private int maxNoHighlightPassages = DEFAULT_MAX_HIGHLIGHT_PASSAGES;
private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
/**
     * Constructor for the UH builder, which accepts {@link IndexSearcher} and {@link Analyzer}
     * objects. The {@link IndexSearcher} may only be null when {@link
     * #highlightWithoutSearcher(String, Query, String, int)} is used.
*
* @param searcher - {@link IndexSearcher}
* @param indexAnalyzer - {@link Analyzer}
*/
public Builder(IndexSearcher searcher, Analyzer indexAnalyzer) {
this.searcher = searcher;
this.indexAnalyzer = indexAnalyzer;
}
/**
* User-defined set of {@link HighlightFlag} values which will override the flags set by {@link
* #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link
* #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}.
*
     * <p>Here the user can either specify the set of {@link HighlightFlag}s to be applied or use
     * the boolean flags to populate the final list of {@link HighlightFlag}s.
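     *
     * <p>For example (an illustrative combination):
     *
     * <pre>{@code
     * builder.withFlags(EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES));
     * }</pre>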
*
* @param values - set of {@link HighlightFlag} values.
*/
    public Builder withFlags(Set<HighlightFlag> values) {
this.flags = values;
return this;
}
/**
     * If true, position-sensitive queries (e.g. phrases and {@link SpanQuery}ies) are highlighted
     * strictly based on query matches (slower) versus any/all occurrences of the underlying terms.
     * By default it's enabled, but there's no overhead if such queries aren't used.
*/
public Builder withHighlightPhrasesStrictly(boolean value) {
this.highlightPhrasesStrictly = value;
return this;
}
/**
     * If true, {@link org.apache.lucene.search.MultiTermQuery} derivatives will be highlighted. By
     * default it's enabled. MTQ highlighting can be expensive, particularly when using offsets in
     * postings.
*/
public Builder withHandleMultiTermQuery(boolean value) {
this.handleMultiTermQuery = value;
return this;
}
/** Passage relevancy is more important than speed. True by default. */
public Builder withPassageRelevancyOverSpeed(boolean value) {
this.passageRelevancyOverSpeed = value;
return this;
}
/**
* Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's
* more accurate to the query, and the snippets can be a little different for phrases because
* the whole phrase is marked up instead of each word. The passage relevancy calculation can be
* different (maybe worse?) and it's slower when highlighting many fields. Use of this flag
* requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link HighlightFlag#PHRASES} and {@link
* HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true
* by default.
*/
public Builder withWeightMatches(boolean value) {
this.weightMatches = value;
return this;
}
    /** The text to be highlighted is effectively truncated to this length. */
public Builder withMaxLength(int value) {
if (value < 0 || value == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
}
this.maxLength = value;
return this;
}
    public Builder withBreakIterator(Supplier<BreakIterator> value) {
this.breakIterator = value;
return this;
}
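    /**
     * Sets the predicate that decides which query fields count as a match for the field being
     * highlighted (see {@link UnifiedHighlighter#getFieldMatcher(String)}). A hedged sketch:
     * {@code withFieldMatcher(name -> true)} would match query terms from any field, disabling
     * the default requireFieldMatch behavior.
     */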
    public Builder withFieldMatcher(Predicate<String> value) {
this.fieldMatcher = value;
return this;
}
public Builder withScorer(PassageScorer value) {
this.scorer = value;
return this;
}
public Builder withFormatter(PassageFormatter value) {
this.formatter = value;
return this;
}
public Builder withMaxNoHighlightPassages(int value) {
this.maxNoHighlightPassages = value;
return this;
}
public Builder withCacheFieldValCharsThreshold(int value) {
this.cacheFieldValCharsThreshold = value;
return this;
}
public UnifiedHighlighter build() {
return new UnifiedHighlighter(this);
}
/** ... as passed in from the Builder constructor. */
public IndexSearcher getIndexSearcher() {
return searcher;
}
/** ... as passed in from the Builder constructor. */
public Analyzer getIndexAnalyzer() {
return indexAnalyzer;
}
    public Set<HighlightFlag> getFlags() {
return flags;
}
}
/**
* Creates a {@link Builder} object where {@link IndexSearcher} and {@link Analyzer} are not null.
*
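   * <p>A hedged configuration sketch (values shown are illustrative):
   *
   * <pre>{@code
   * UnifiedHighlighter highlighter =
   *     UnifiedHighlighter.builder(searcher, analyzer)
   *         .withMaxLength(50_000)
   *         .withFormatter(new DefaultPassageFormatter())
   *         .build();
   * }</pre>
   *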
   * @param searcher - an {@link IndexSearcher} object.
   * @param indexAnalyzer - an {@link Analyzer} object.
* @return a {@link Builder} object
*/
public static Builder builder(IndexSearcher searcher, Analyzer indexAnalyzer) {
return new Builder(searcher, indexAnalyzer);
}
/**
* Creates a {@link Builder} object in which you can only use {@link
* UnifiedHighlighter#highlightWithoutSearcher(String, Query, String, int)} for highlighting.
*
   * @param indexAnalyzer - an {@link Analyzer} object.
* @return a {@link Builder} object
*/
public static Builder builderWithoutSearcher(Analyzer indexAnalyzer) {
return new Builder(null, indexAnalyzer);
}
/**
* Constructs the highlighter with the given {@link Builder}.
*
* @param builder - a {@link Builder} object.
*/
public UnifiedHighlighter(Builder builder) {
this.searcher = builder.searcher;
this.indexAnalyzer =
Objects.requireNonNull(
builder.indexAnalyzer,
"indexAnalyzer is required (even if in some circumstances it isn't used)");
this.flags = evaluateFlags(builder);
this.maxLength = builder.maxLength;
this.breakIterator = builder.breakIterator;
this.fieldMatcher = builder.fieldMatcher;
this.scorer = builder.scorer;
this.formatter = builder.formatter;
this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
this.cacheFieldValCharsThreshold = builder.cacheFieldValCharsThreshold;
}
/** Extracts matching terms */
  protected static Set<Term> extractTerms(Query query) {
    Set<Term> queryTerms = new HashSet<>();
query.visit(QueryVisitor.termCollector(queryTerms));
return queryTerms;
}
/**
   * This method returns the set of {@link HighlightFlag}s, which will be applied to the UH
* object. The output depends on the values provided to {@link
* Builder#withHandleMultiTermQuery(boolean)}, {@link
* Builder#withHighlightPhrasesStrictly(boolean)}, {@link
* Builder#withPassageRelevancyOverSpeed(boolean)} and {@link Builder#withWeightMatches(boolean)}
* OR {@link #setHandleMultiTermQuery(boolean)}, {@link #setHighlightPhrasesStrictly(boolean)},
* {@link #setPassageRelevancyOverSpeed(boolean)} and {@link #setWeightMatches(boolean)}
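   *
   * <p>For example, with all four booleans true (the defaults), the full set {@code
   * {MULTI_TERM_QUERY, PHRASES, PASSAGE_RELEVANCY_OVER_SPEED, WEIGHT_MATCHES}} is returned.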
*
* @param shouldHandleMultiTermQuery - flag for adding Multi-term query
* @param shouldHighlightPhrasesStrictly - flag for adding phrase highlighting
* @param shouldPassageRelevancyOverSpeed - flag for adding passage relevancy
* @param shouldEnableWeightMatches - flag for enabling weight matches
* @return a set of {@link HighlightFlag}s.
*/
  protected Set<HighlightFlag> evaluateFlags(
final boolean shouldHandleMultiTermQuery,
final boolean shouldHighlightPhrasesStrictly,
final boolean shouldPassageRelevancyOverSpeed,
final boolean shouldEnableWeightMatches) {
    Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery) {
highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (shouldHighlightPhrasesStrictly) {
highlightFlags.add(HighlightFlag.PHRASES);
}
if (shouldPassageRelevancyOverSpeed) {
highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
}
// Evaluate if WEIGHT_MATCHES can be added as a flag.
final boolean applyWeightMatches =
highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
&& highlightFlags.contains(HighlightFlag.PHRASES)
&& highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)
// User can also opt-out of WEIGHT_MATCHES.
&& shouldEnableWeightMatches;
if (applyWeightMatches) {
highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);
}
return highlightFlags;
}
/**
* Evaluate the highlight flags and set the {@link #flags} variable. This is called only once when
* the Builder object is used to create a UH object.
*
* @param uhBuilder - {@link Builder} object.
* @return {@link HighlightFlag}s.
*/
  protected Set<HighlightFlag> evaluateFlags(Builder uhBuilder) {
if (flags != null) {
return flags;
}
return flags =
evaluateFlags(
uhBuilder.handleMultiTermQuery,
uhBuilder.highlightPhrasesStrictly,
uhBuilder.passageRelevancyOverSpeed,
uhBuilder.weightMatches);
}
/**
   * Evaluate the highlight flags and set the {@link #flags} variable. This is called every time
   * the {@link #getFlags(String)} method is called. It has been marked deprecated since it exists
   * only for the mutable (non-builder) initialization of a UH object.
*
* @param uh - {@link UnifiedHighlighter} object.
* @return {@link HighlightFlag}s.
*/
@Deprecated
  protected Set<HighlightFlag> evaluateFlags(UnifiedHighlighter uh) {
return evaluateFlags(
uh.handleMultiTermQuery,
uh.highlightPhrasesStrictly,
uh.passageRelevancyOverSpeed,
uh.weightMatches);
}
/**
* Returns the predicate to use for extracting the query part that must be highlighted. By default
* only queries that target the current field are kept. (AKA requireFieldMatch)
*/
  protected Predicate<String> getFieldMatcher(String field) {
if (fieldMatcher != null) {
return fieldMatcher;
} else {
// requireFieldMatch = true
return (qf) -> field.equals(qf);
}
}
/** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
  protected Set<HighlightFlag> getFlags(String field) {
// If a builder is used for initializing a UH object, then flags will never be null.
// Once the setters are removed, this method can just return the flags.
if (flags != null) {
return flags;
}
// When not using builder, you have to reevaluate the flags.
return evaluateFlags(this);
}
/**
* The maximum content size to process. Content will be truncated to this size before
* highlighting. Typically snippets closer to the beginning of the document better summarize its
* content.
*/
public int getMaxLength() {
return maxLength;
}
/**
* Returns the {@link BreakIterator} to use for dividing text into passages. This returns {@link
* BreakIterator#getSentenceInstance(Locale)} by default; subclasses can override to customize.
*
   * <p>Note: this highlighter will call {@link BreakIterator#preceding(int)} and {@link
* BreakIterator#next()} many times on it. The default generic JDK implementation of {@code
* preceding} performs poorly.
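   *
   * <p>For example, a different locale could be supplied at build time via {@code
   * withBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH))} (illustrative;
   * any supplier of fresh BreakIterator instances works).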
*/
protected BreakIterator getBreakIterator(String field) {
return breakIterator.get();
}
/** Returns the {@link PassageScorer} to use for ranking passages. */
protected PassageScorer getScorer(String field) {
return scorer;
}
/**
* Returns the {@link PassageFormatter} to use for formatting passages into highlighted snippets.
*/
protected PassageFormatter getFormatter(String field) {
return formatter;
}
/**
* Returns the number of leading passages (as delineated by the {@link BreakIterator}) when no
* highlights could be found. If it's less than 0 (the default) then this defaults to the {@code
* maxPassages} parameter given for each request. If this is 0 then the resulting highlight is
* null (not formatted).
*/
protected int getMaxNoHighlightPassages(String field) {
return maxNoHighlightPassages;
}
/**
* Limits the amount of field value pre-fetching until this threshold is passed. The highlighter
* internally highlights in batches of documents sized on the sum field value length (in chars) of
* the fields to be highlighted (bounded by {@link #getMaxLength()} for each field). By setting
* this to 0, you can force documents to be fetched and highlighted one at a time, which you
* usually shouldn't do. The default is 524288 chars which translates to about a megabyte.
* However, note that the highlighter sometimes ignores this and highlights one document at a time
* (without caching a bunch of documents in advance) when it can detect there's no point in it --
* such as when all fields will be highlighted via re-analysis as one example.
*/
public int getCacheFieldValCharsThreshold() { // question: should we size by bytes instead?
return cacheFieldValCharsThreshold;
}
/** ... as passed in from constructor. */
public IndexSearcher getIndexSearcher() {
return searcher;
}
/** ... as passed in from constructor. */
public Analyzer getIndexAnalyzer() {
return indexAnalyzer;
}
/** Source of term offsets; essential for highlighting. */
public enum OffsetSource {
POSTINGS,
TERM_VECTORS,
ANALYSIS,
POSTINGS_WITH_TERM_VECTORS,
NONE_NEEDED
}
/**
* Determine the offset source for the specified field. The default algorithm is as follows:
   *
   * <ul>
   *   <li>This calls {@link #getFieldInfo(String)}. Note this returns null if there is no
   *       searcher or if the field isn't found there.
   *   <li>If there's a field info and it has {@link
   *       IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} then {@link OffsetSource#POSTINGS}
   *       is returned.
   *   <li>If there's a field info and {@link FieldInfo#hasVectors()} then {@link
   *       OffsetSource#TERM_VECTORS} is returned (note we can't check here if the TV has offsets;
   *       if there isn't then an exception will get thrown down the line).
   *   <li>Fall-back: {@link OffsetSource#ANALYSIS} is returned.
   * </ul>
   *
   * <p>Note that the highlighter sometimes switches to something else based on the query, such as
   * if you have {@link OffsetSource#POSTINGS_WITH_TERM_VECTORS} but in fact don't need term
   * vectors.
*/
protected OffsetSource getOffsetSource(String field) {
FieldInfo fieldInfo = getFieldInfo(field);
if (fieldInfo != null) {
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
return fieldInfo.hasVectors()
? OffsetSource.POSTINGS_WITH_TERM_VECTORS
: OffsetSource.POSTINGS;
}
if (fieldInfo.hasVectors()) { // unfortunately we can't also check if the TV has offsets
return OffsetSource.TERM_VECTORS;
}
}
return OffsetSource.ANALYSIS;
}
/**
* Called by the default implementation of {@link #getOffsetSource(String)}. If there is no
* searcher then we simply always return null.
*/
protected FieldInfo getFieldInfo(String field) {
if (searcher == null) {
return null;
}
    // Need thread-safety for lazy-init but let's avoid 'synchronized' by using the double-check
    // locking idiom
FieldInfos fieldInfos = this.fieldInfos; // note: it's volatile; read once
if (fieldInfos == null) {
synchronized (this) {
fieldInfos = this.fieldInfos;
if (fieldInfos == null) {
fieldInfos = FieldInfos.getMergedFieldInfos(searcher.getIndexReader());
this.fieldInfos = fieldInfos;
}
}
}
return fieldInfos.fieldInfo(field);
}
/**
* Highlights the top passages from a single field.
*
* @param field field name to highlight. Must have a stored string value and also be indexed with
* offsets.
* @param query query to highlight.
* @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Array of formatted snippets corresponding to the documents in {@code topDocs}. If no
   *     highlights were found for a document, the first sentence for the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
return highlight(field, query, topDocs, 1);
}
/**
* Highlights the top-N passages from a single field.
*
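   * <p>For example (names are illustrative):
   *
   * <pre>{@code
   * TopDocs topDocs = searcher.search(query, 10);
   * String[] snippets = highlighter.highlight("body", query, topDocs, 2);
   * }</pre>
   *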
* @param field field name to highlight. Must have a stored string value.
* @param query query to highlight.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
* snippets.
   * @return Array of formatted snippets corresponding to the documents in {@code topDocs}. If no
   *     highlights were found for a document, the first {@code maxPassages} sentences from the
   *     field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages)
throws IOException {
    Map<String, String[]> res =
highlightFields(new String[] {field}, query, topDocs, new int[] {maxPassages});
return res.get(field);
}
/**
* Highlights the top passages from multiple fields.
*
   * <p>Conceptually, this behaves as a more efficient form of:
   *
   * <pre class="prettyprint">
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, topDocs));
   * }
   * return m;
   * </pre>
*
* @param fields field names to highlight. Must have a stored string value.
* @param query query to highlight.
* @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in {@code topDocs}. If no highlights were found for a document, the first
   *     sentence from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
  public Map<String, String[]> highlightFields(String[] fields, Query query, TopDocs topDocs)
throws IOException {
int[] maxPassages = new int[fields.length];
Arrays.fill(maxPassages, 1);
return highlightFields(fields, query, topDocs, maxPassages);
}
/**
* Highlights the top-N passages from multiple fields.
*
   * <p>Conceptually, this behaves as a more efficient form of:
   *
   * <pre class="prettyprint">
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, topDocs, maxPassages));
   * }
   * return m;
   * </pre>
*
* @param fields field names to highlight. Must have a stored string value.
* @param query query to highlight.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @param maxPassages The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in {@code topDocs}. If no highlights were found for a document, the first
   *     {@code maxPassages} sentences from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
  public Map<String, String[]> highlightFields(
String[] fields, Query query, TopDocs topDocs, int[] maxPassages) throws IOException {
final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
int[] docids = new int[scoreDocs.length];
for (int i = 0; i < docids.length; i++) {
docids[i] = scoreDocs[i].doc;
}
return highlightFields(fields, query, docids, maxPassages);
}
/**
* Highlights the top-N passages from multiple fields, for the provided int[] docids.
*
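   * <p>For example (illustrative values):
   *
   * <pre>{@code
   * Map<String, String[]> snippets =
   *     highlighter.highlightFields(
   *         new String[] {"title", "body"}, query, new int[] {12, 34}, new int[] {1, 2});
   * }</pre>
   *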
* @param fieldsIn field names to highlight. Must have a stored string value.
* @param query query to highlight.
* @param docidsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in {@code docidsIn}. If no highlights were found for a document, the first
   *     {@code maxPassages} sentences from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
  public Map<String, String[]> highlightFields(
      String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn) throws IOException {
    Map<String, String[]> snippets = new HashMap<>();
    for (Map.Entry<String, Object[]> ent :
        highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn).entrySet()) {
Object[] snippetObjects = ent.getValue();
String[] snippetStrings = new String[snippetObjects.length];
snippets.put(ent.getKey(), snippetStrings);
for (int i = 0; i < snippetObjects.length; i++) {
Object snippet = snippetObjects[i];
if (snippet != null) {
snippetStrings[i] = snippet.toString();
}
}
}
return snippets;
}
/**
* Expert: highlights the top-N passages from multiple fields, for the provided int[] docids, to
* custom Object as returned by the {@link PassageFormatter}. Use this API to render to something
* other than String.
*
* @param fieldsIn field names to highlight. Must have a stored string value.
* @param query query to highlight.
* @param docIdsIn containing the document IDs to highlight.
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
* highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in {@code docIdsIn}. If no highlights were found for a document, the first
   *     {@code maxPassages} sentences from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if {@code field} was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
  protected Map<String, Object[]> highlightFieldsAsObjects(
String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
if (fieldsIn.length < 1) {
throw new IllegalArgumentException("fieldsIn must not be empty");
}
if (fieldsIn.length != maxPassagesIn.length) {
throw new IllegalArgumentException("invalid number of maxPassagesIn");
}
if (searcher == null) {
throw new IllegalStateException(
"This method requires that an indexSearcher was passed in the "
+ "constructor. Perhaps you mean to call highlightWithoutSearcher?");
}
// Sort docs & fields for sequential i/o
// Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
int[] docIds = new int[docIdsIn.length];
int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes); // latter 2 are "out" params
// Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
final String[] fields = new String[fieldsIn.length];
final int[] maxPassages = new int[maxPassagesIn.length];
copyAndSortFieldsWithMaxPassages(
fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params
// Init field highlighters (where most of the highlight logic lives, and on a per field basis)
    Set<Term> queryTerms = extractTerms(query);
FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
int numTermVectors = 0;
int numPostings = 0;
for (int f = 0; f < fields.length; f++) {
FieldHighlighter fieldHighlighter =
getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
fieldHighlighters[f] = fieldHighlighter;
switch (fieldHighlighter.getOffsetSource()) {
case TERM_VECTORS:
numTermVectors++;
break;
case POSTINGS:
numPostings++;
break;
case POSTINGS_WITH_TERM_VECTORS:
numTermVectors++;
numPostings++;
break;
case ANALYSIS:
case NONE_NEEDED:
default:
// do nothing
break;
}
}
int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);
IndexReader indexReaderWithTermVecCache =
(numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
// [fieldIdx][docIdInIndex] of highlightDoc result
Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
// Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
// Load the field values of the first batch of document(s) (note: commonly all docs are in
// this batch)
      List<CharSequence[]> fieldValsByDoc =
          loadFieldValues(fields, docIdIter, cacheCharsThreshold);
// the size of the above list is the size of the batch (num of docs in the batch)
// Highlight in per-field order first, then by doc (better I/O pattern)
for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
Object[] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
int docId = docIds[docIdx]; // sorted order
CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
if (content == null) {
continue;
}
IndexReader indexReader =
(fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
&& indexReaderWithTermVecCache != null)
? indexReaderWithTermVecCache
: searcher.getIndexReader();
final LeafReader leafReader;
if (indexReader instanceof LeafReader) {
leafReader = (LeafReader) indexReader;
} else {
          List<LeafReaderContext> leaves = indexReader.leaves();
LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
leafReader = leafReaderContext.reader();
docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
}
int docInIndex = docInIndexes[docIdx]; // original input order
assert resultByDocIn[docInIndex] == null;
resultByDocIn[docInIndex] =
fieldHighlighter.highlightFieldForDoc(leafReader, docId, content.toString());
}
}
batchDocIdx += fieldValsByDoc.size();
}
IOUtils.close(indexReaderWithTermVecCache); // FYI won't close underlying reader
assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
|| docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
    // TODO reconsider the return type; since this is an "advanced" method, let's not return a
    //  Map? Notice the only caller simply iterates it to build another structure.
    // field -> object highlights parallel to docIdsIn
    Map<String, Object[]> resultMap = CollectionUtil.newHashMap(fields.length);
for (int f = 0; f < fields.length; f++) {
resultMap.put(fields[f], highlightDocsInByField[f]);
}
return resultMap;
}
/**
   * When cacheCharsThreshold is 0, loadFieldValues() only fetches one document at a time. We
   * override it to be 0 in the two circumstances explained in the body below.
*/
private int calculateOptimalCacheCharsThreshold(int numTermVectors, int numPostings) {
if (numPostings == 0 && numTermVectors == 0) {
// (1) When all fields are ANALYSIS there's no point in caching a batch of documents
// because no other info on disk is needed to highlight it.
return 0;
} else if (numTermVectors >= 2) {
      // (2) When two or more fields have term vectors, given the field-then-doc algorithm, the
      // underlying term vectors will be fetched in a terrible access pattern unless we highlight
      // a doc at a time and use a special current-doc TV cache. So we do that. Hopefully one day
      // TVs will be improved to make this pointless.
return 0;
} else {
return getCacheFieldValCharsThreshold();
}
}
private void copyAndSortFieldsWithMaxPassages(
String[] fieldsIn, int[] maxPassagesIn, final String[] fields, final int[] maxPassages) {
System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
String tmp = fields[i];
fields[i] = fields[j];
fields[j] = tmp;
int tmp2 = maxPassages[i];
maxPassages[i] = maxPassages[j];
maxPassages[j] = tmp2;
}
@Override
protected int compare(int i, int j) {
return fields[i].compareTo(fields[j]);
}
}.sort(0, fields.length);
}
private void copyAndSortDocIdsWithIndex(
int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
for (int i = 0; i < docInIndexes.length; i++) {
docInIndexes[i] = i;
}
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int tmp = docIds[i];
docIds[i] = docIds[j];
docIds[j] = tmp;
tmp = docInIndexes[i];
docInIndexes[i] = docInIndexes[j];
docInIndexes[j] = tmp;
}
@Override
protected int compare(int i, int j) {
return Integer.compare(docIds[i], docIds[j]);
}
}.sort(0, docIds.length);
}
/**
   * Highlights text passed as a parameter. This requires that the {@link IndexSearcher} provided
   * to this highlighter be null. This use-case is rarer. Naturally, the mode of operation will be
* {@link OffsetSource#ANALYSIS}. The result of this method is whatever the {@link
* PassageFormatter} returns. For the {@link DefaultPassageFormatter} and assuming {@code content}
* has non-zero length, the result will be a non-null string -- so it's safe to call {@link
* Object#toString()} on it in that case.
*
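   * <p>A hedged sketch (query and analyzer construction omitted):
   *
   * <pre>{@code
   * UnifiedHighlighter highlighter = UnifiedHighlighter.builderWithoutSearcher(analyzer).build();
   * Object snippet = highlighter.highlightWithoutSearcher("body", query, "text to highlight", 1);
   * }</pre>
   *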
* @param field field name to highlight (as found in the query).
* @param query query to highlight.
* @param content text to highlight.
* @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
* snippets.
* @return result of the {@link PassageFormatter} -- probably a String. Might be null.
* @throws IOException if an I/O error occurred during processing
*/
// TODO make content a List? and return a List? and ensure getEmptyHighlight is never invoked
// multiple times?
public Object highlightWithoutSearcher(String field, Query query, String content, int maxPassages)
throws IOException {
if (this.searcher != null) {
throw new IllegalStateException(
"highlightWithoutSearcher should only be called on a "
+ getClass().getSimpleName()
+ " without an IndexSearcher.");
}
Objects.requireNonNull(content, "content is required");
Set queryTerms = extractTerms(query);
return getFieldHighlighter(field, query, queryTerms, maxPassages)
.highlightFieldForDoc(null, -1, content);
}
protected FieldHighlighter getFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
UHComponents components = getHighlightComponents(field, query, allTerms);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
return newFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
getMaxNoHighlightPassages(field),
getFormatter(field));
}
protected FieldHighlighter newFieldHighlighter(
String field,
FieldOffsetStrategy fieldOffsetStrategy,
BreakIterator breakIterator,
PassageScorer passageScorer,
int maxPassages,
int maxNoHighlightPassages,
PassageFormatter passageFormatter) {
return new FieldHighlighter(
field,
fieldOffsetStrategy,
breakIterator,
passageScorer,
maxPassages,
maxNoHighlightPassages,
passageFormatter);
}
  protected UHComponents getHighlightComponents(String field, Query query, Set<Term> allTerms) {
    Predicate<String> fieldMatcher = getFieldMatcher(field);
    Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
BytesRef[] terms = null;
LabelledCharArrayMatcher[] automata = null;
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
terms = filterExtractedTerms(fieldMatcher, allTerms);
automata = getAutomata(field, query, highlightFlags);
} // otherwise don't need to extract
return new UHComponents(
field,
fieldMatcher,
query,
terms,
phraseHelper,
automata,
queryHasUnrecognizedPart,
highlightFlags);
}
  protected boolean hasUnrecognizedQuery(Predicate<String> fieldMatcher, Query query) {
boolean[] hasUnknownLeaf = new boolean[1];
query.visit(
new QueryVisitor() {
@Override
public boolean acceptField(String field) {
// checking hasUnknownLeaf is a trick to exit early
return hasUnknownLeaf[0] == false && fieldMatcher.test(field);
}
@Override
public void visitLeaf(Query query) {
if (MultiTermHighlighting.canExtractAutomataFromLeafQuery(query) == false) {
if (!(query instanceof MatchAllDocsQuery || query instanceof MatchNoDocsQuery)) {
hasUnknownLeaf[0] = true;
}
}
}
});
return hasUnknownLeaf[0];
}
protected static BytesRef[] filterExtractedTerms(
      Predicate<String> fieldMatcher, Set<Term> queryTerms) {
// Strip off the redundant field and sort the remaining terms
    SortedSet<BytesRef> filteredTerms = new TreeSet<>();
for (Term term : queryTerms) {
if (fieldMatcher.test(term.field())) {
filteredTerms.add(term.bytes());
}
}
return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
}
protected PhraseHelper getPhraseHelper(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
boolean useWeightMatchesIter = highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES);
if (useWeightMatchesIter) {
return PhraseHelper.NONE; // will be handled by Weight.matches which always considers phrases
}
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly
? new PhraseHelper(
query,
field,
getFieldMatcher(field),
this::requiresRewrite,
this::preSpanQueryRewrite,
!handleMultiTermQuery)
: PhraseHelper.NONE;
}
protected LabelledCharArrayMatcher[] getAutomata(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper
// handle those?
// if don't highlight phrases strictly,
final boolean lookInSpan =
!highlightFlags.contains(HighlightFlag.PHRASES) // no PhraseHelper
|| highlightFlags.contains(
HighlightFlag.WEIGHT_MATCHES); // Weight.Matches will find all
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), lookInSpan)
: ZERO_LEN_AUTOMATA_ARRAY;
}
protected OffsetSource getOptimizedOffsetSource(UHComponents components) {
OffsetSource offsetSource = getOffsetSource(components.getField());
// null automata means unknown, so assume a possibility
boolean mtqOrRewrite =
components.getAutomata() == null
|| components.getAutomata().length > 0
|| components.getPhraseHelper().willRewrite()
|| components.hasUnrecognizedQueryPart();
// null terms means unknown, so assume something to highlight
if (mtqOrRewrite == false
&& components.getTerms() != null
&& components.getTerms().length == 0) {
return OffsetSource.NONE_NEEDED; // nothing to highlight
}
switch (offsetSource) {
case POSTINGS:
        if (mtqOrRewrite) { // may need to scan through all terms for the highlighted doc efficiently
return OffsetSource.ANALYSIS;
}
break;
case POSTINGS_WITH_TERM_VECTORS:
if (mtqOrRewrite == false) {
return OffsetSource.POSTINGS; // We don't need term vectors
}
break;
case ANALYSIS:
case TERM_VECTORS:
case NONE_NEEDED:
default:
// stick with the original offset source
break;
}
return offsetSource;
}
protected FieldOffsetStrategy getOffsetStrategy(
OffsetSource offsetSource, UHComponents components) {
switch (offsetSource) {
case ANALYSIS:
if (!components.getPhraseHelper().hasPositionSensitivity()
&& !components.getHighlightFlags().contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)
&& !components.getHighlightFlags().contains(HighlightFlag.WEIGHT_MATCHES)) {
// skip using a memory index since it's pure term filtering
return new TokenStreamOffsetStrategy(components, getIndexAnalyzer());
} else {
return new MemoryIndexOffsetStrategy(components, getIndexAnalyzer());
}
case NONE_NEEDED:
return NoOpOffsetStrategy.INSTANCE;
case TERM_VECTORS:
return new TermVectorOffsetStrategy(components);
case POSTINGS:
return new PostingsOffsetStrategy(components);
case POSTINGS_WITH_TERM_VECTORS:
return new PostingsWithTermVectorsOffsetStrategy(components);
default:
throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
}
}
/**
   * When highlighting phrases accurately, we need to know which {@link SpanQuery}s need to have
* {@link Query#rewrite(IndexSearcher)} called on them. It helps performance to avoid it if it's
* not needed. This method will be invoked on all SpanQuery instances recursively. If you have
* custom SpanQuery queries then override this to check instanceof and provide a definitive
* answer. If the query isn't your custom one, simply return null to have the default rules apply,
* which govern the ones included in Lucene.
*/
protected Boolean requiresRewrite(SpanQuery spanQuery) {
return null;
}
/**
* When highlighting phrases accurately, we may need to handle custom queries that aren't
* supported in the {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as called
* by the {@code PhraseHelper}. Should custom query types be needed, this method should be
   * overridden to return a collection of queries if appropriate, or null if nothing to do. If the
* query is not custom, simply returning null will allow the default rules to apply.
*
* @param query Query to be highlighted
* @return A Collection of Query object(s) if needs to be rewritten, otherwise null.
*/
  protected Collection<Query> preSpanQueryRewrite(Query query) {
return null;
}
private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
return new DocIdSetIterator() {
int idx = -1;
@Override
public int docID() {
if (idx < 0 || idx >= sortedDocIds.length) {
return NO_MORE_DOCS;
}
return sortedDocIds[idx];
}
@Override
public int nextDoc() throws IOException {
idx++;
return docID();
}
@Override
public int advance(int target) throws IOException {
return super.slowAdvance(target); // won't be called, so whatever
}
@Override
public long cost() {
return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
}
};
}
/**
* Loads the String values for each docId by field to be highlighted. By default this loads from
* stored fields by the same name as given, but a subclass can change the source. The returned
* Strings must be identical to what was indexed (at least for postings or term-vectors offset
* sources). This method must load fields for at least one document from the given {@link
* DocIdSetIterator} but need not return all of them; by default the character lengths are summed
* and this method will return early when {@code cacheCharsThreshold} is exceeded. Specifically if
* that number is 0, then only one document is fetched no matter what. Values in the array of
* {@link CharSequence} will be null if no value was found.
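   *
   * <p>A hedged sketch of a subclass override that serves values from somewhere other than stored
   * fields ({@code fetchExternally} is a hypothetical helper):
   *
   * <pre>{@code
   * protected List<CharSequence[]> loadFieldValues(
   *     String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
   *   List<CharSequence[]> result = new ArrayList<>();
   *   int docId = docIter.nextDoc(); // must consume at least one doc
   *   if (docId != DocIdSetIterator.NO_MORE_DOCS) {
   *     result.add(fetchExternally(docId, fields)); // hypothetical
   *   }
   *   return result;
   * }
   * }</pre>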
*/
  protected List<CharSequence[]> loadFieldValues(
      String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
    List<CharSequence[]> docListOfFields =
new ArrayList<>(cacheCharsThreshold == 0 ? 1 : (int) Math.min(64, docIter.cost()));
LimitedStoredFieldVisitor visitor = newLimitedStoredFieldsVisitor(fields);
StoredFields storedFields = searcher.storedFields();
int sumChars = 0;
do {
int docId = docIter.nextDoc();
if (docId == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
visitor.init();
storedFields.document(docId, visitor);
CharSequence[] valuesByField = visitor.getValuesByField();
docListOfFields.add(valuesByField);
for (CharSequence val : valuesByField) {
sumChars += (val == null ? 0 : val.length());
}
} while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);
return docListOfFields;
}
/**
* @lucene.internal
*/
protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
return new LimitedStoredFieldVisitor(fields, MULTIVAL_SEP_CHAR, getMaxLength());
}
/**
* Fetches stored fields for highlighting. Uses a multi-val separator char and honors a max length
* to retrieve.
*
* @lucene.internal
*/
protected static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
protected final String[] fields;
protected final char valueSeparator;
protected final int maxLength;
protected CharSequence[] values; // starts off as String; may become StringBuilder.
protected int currentField;
public LimitedStoredFieldVisitor(String[] fields, char valueSeparator, int maxLength) {
this.fields = fields;
this.valueSeparator = valueSeparator;
this.maxLength = maxLength;
}
void init() {
values = new CharSequence[fields.length];
currentField = -1;
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
assert currentField >= 0;
Objects.requireNonNull(value, "String value should not be null");
CharSequence curValue = values[currentField];
if (curValue == null) {
        // question: if we truncate due to maxLength, should we try and avoid keeping the other
        // chars in-memory on the backing char[]?
values[currentField] =
value.substring(0, Math.min(maxLength, value.length())); // note: may return 'this'
return;
}
final int lengthBudget = maxLength - curValue.length();
if (lengthBudget <= 0) {
return;
}
StringBuilder curValueBuilder;
if (curValue instanceof StringBuilder) {
curValueBuilder = (StringBuilder) curValue;
} else {
// upgrade String to StringBuilder. Choose a good initial size.
curValueBuilder =
new StringBuilder(curValue.length() + Math.min(lengthBudget, value.length() + 256));
curValueBuilder.append(curValue);
}
curValueBuilder.append(valueSeparator);
curValueBuilder.append(value.substring(0, Math.min(lengthBudget - 1, value.length())));
values[currentField] = curValueBuilder;
}
@Override
public Status needsField(FieldInfo fieldInfo) throws IOException {
currentField = Arrays.binarySearch(fields, fieldInfo.name);
if (currentField < 0) {
return Status.NO;
}
CharSequence curVal = values[currentField];
if (curVal != null && curVal.length() >= maxLength) {
return fields.length == 1 ? Status.STOP : Status.NO;
}
return Status.YES;
}
CharSequence[] getValuesByField() {
return this.values;
}
}
/**
* Wraps an IndexReader that remembers/caches the last call to {@link TermVectors#get(int)} so
* that if the next call has the same ID, then it is reused. If TV's were column-stride (like
* doc-values), there would be no need for this.
*/
private static class TermVectorReusingLeafReader extends FilterLeafReader {
static IndexReader wrap(IndexReader reader) throws IOException {
LeafReader[] leafReaders =
reader.leaves().stream()
.map(LeafReaderContext::reader)
.map(TermVectorReusingLeafReader::new)
.toArray(LeafReader[]::new);
    return new BaseCompositeReader<LeafReader>(leafReaders, null) {
@Override
protected void doClose() { // don't close the underlying reader
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
};
}
private int lastDocId = -1;
private Fields tvFields;
TermVectorReusingLeafReader(LeafReader in) {
super(in);
}
@Override
public Fields getTermVectors(int docID) throws IOException {
if (docID != lastDocId) {
lastDocId = docID;
tvFields = in.getTermVectors(docID);
}
return tvFields;
}
@Override
public TermVectors termVectors() throws IOException {
TermVectors orig = in.termVectors();
return new TermVectors() {
@Override
public Fields get(int docID) throws IOException {
if (docID != lastDocId) {
lastDocId = docID;
tvFields = orig.get(docID);
}
return tvFields;
}
};
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
/** Flags for controlling highlighting behavior. */
public enum HighlightFlag {
/**
* @see Builder#withHighlightPhrasesStrictly(boolean)
*/
PHRASES,
/**
* @see Builder#withHandleMultiTermQuery(boolean)
*/
MULTI_TERM_QUERY,
/**
* @see Builder#withPassageRelevancyOverSpeed(boolean)
*/
PASSAGE_RELEVANCY_OVER_SPEED,
/**
* @see Builder#withWeightMatches(boolean)
*/
WEIGHT_MATCHES
// TODO: useQueryBoosts
}
}