/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.BaseCompositeReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;

/**
 * A Highlighter that can get offsets from either postings ({@link
 * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}), term vectors ({@link
 * FieldType#setStoreTermVectorOffsets(boolean)}), or via re-analyzing text.
 *
 * <p>This highlighter treats the single original document as the whole corpus, and then scores
 * individual passages as if they were documents in this corpus. It uses a {@link BreakIterator} to
 * find passages in the text; by default it breaks using {@link
 * BreakIterator#getSentenceInstance(Locale) getSentenceInstance(Locale.ROOT)}. It then iterates in
 * parallel (merge sorting by offset) through the positions of all terms from the query, coalescing
 * those hits that occur in a single passage into a {@link Passage}, and then scores each Passage
 * using a separate {@link PassageScorer}. Passages are finally formatted into highlighted snippets
 * with a {@link PassageFormatter}.
 *
 * <p>You can customize the behavior by calling some of the setters, or by subclassing and
 * overriding some methods. Some important hooks:
 *
 * <ul>
 *   <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
 *   <li>{@link #getScorer(String)}: Customize how passages are ranked.
 *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
 *   <li>{@link #getPassageSortComparator(String)}: Customize how passages are sorted.
 * </ul>
 *
 * <p>This is thread-safe, notwithstanding the setters.
 *
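 * <p>Example usage (a minimal sketch; it assumes an existing {@code IndexSearcher searcher}, the
 * {@code Analyzer analyzer} used at index time, a {@code Query query}, and a hypothetical {@code
 * "body"} field):
 *
 * <pre>{@code
 * UnifiedHighlighter highlighter = UnifiedHighlighter.builder(searcher, analyzer).build();
 * TopDocs topDocs = searcher.search(query, 10);
 * String[] snippets = highlighter.highlight("body", query, topDocs);
 * }</pre>
 *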
 * @lucene.experimental
 */
public class UnifiedHighlighter {

  protected static final char MULTIVAL_SEP_CHAR = (char) 0;

  public static final int DEFAULT_MAX_LENGTH = 10000;

  public static final int DEFAULT_CACHE_CHARS_THRESHOLD = 524288; // ~ 1 MB (2 byte chars)

  protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY =
      new LabelledCharArrayMatcher[0];

  // All the private defaults will be removed once the non-builder based UH is removed.
  private static final boolean DEFAULT_ENABLE_MULTI_TERM_QUERY = true;
  private static final boolean DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY = true;
  private static final boolean DEFAULT_ENABLE_WEIGHT_MATCHES = true;
  private static final boolean DEFAULT_ENABLE_RELEVANCY_OVER_SPEED = true;
  private static final Supplier<BreakIterator> DEFAULT_BREAK_ITERATOR =
      () -> BreakIterator.getSentenceInstance(Locale.ROOT);
  private static final PassageScorer DEFAULT_PASSAGE_SCORER = new PassageScorer();
  private static final PassageFormatter DEFAULT_PASSAGE_FORMATTER = new DefaultPassageFormatter();
  private static final int DEFAULT_MAX_HIGHLIGHT_PASSAGES = -1;
  private static final Comparator<Passage> DEFAULT_PASSAGE_SORT_COMPARATOR =
      Comparator.comparingInt(Passage::getStartOffset);

  protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher

  protected final Analyzer indexAnalyzer;

  // lazy initialized with double-check locking; protected so subclass can init
  protected volatile FieldInfos fieldInfos;

  private Predicate<String> fieldMatcher;

  private final Function<String, Set<String>> maskedFieldsFunc;

  private Set<HighlightFlag> flags;

  // e.g. wildcards
  private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;

  // AKA "accuracy" or "query debugging"
  private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;

  private boolean weightMatches = DEFAULT_ENABLE_WEIGHT_MATCHES;

  // For analysis, prefer MemoryIndexOffsetStrategy
  private boolean passageRelevancyOverSpeed = DEFAULT_ENABLE_RELEVANCY_OVER_SPEED;

  private int maxLength = DEFAULT_MAX_LENGTH;

  // BreakIterator is stateful so we use a Supplier factory method
  private Supplier<BreakIterator> breakIterator = DEFAULT_BREAK_ITERATOR;

  private PassageScorer scorer = DEFAULT_PASSAGE_SCORER;

  private PassageFormatter formatter = DEFAULT_PASSAGE_FORMATTER;

  private int maxNoHighlightPassages = DEFAULT_MAX_HIGHLIGHT_PASSAGES;

  private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;

  private Comparator<Passage> passageSortComparator = DEFAULT_PASSAGE_SORT_COMPARATOR;

  /**
   * Constructs the highlighter with the given index searcher and analyzer.
   *
   * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query,
   *     String, int)} is used, in which case this needs to be null.
   * @param indexAnalyzer Required, even if in some circumstances it isn't used.
   */
  @Deprecated
  public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
    this.searcher = indexSearcher; // TODO: make non nullable
    this.indexAnalyzer =
        Objects.requireNonNull(
            indexAnalyzer,
            "indexAnalyzer is required (even if in some circumstances it isn't used)");
    this.maskedFieldsFunc = null;
  }

  @Deprecated
  public void setHandleMultiTermQuery(boolean handleMtq) {
    this.handleMultiTermQuery = handleMtq;
  }

  @Deprecated
  public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
    this.highlightPhrasesStrictly = highlightPhrasesStrictly;
  }

  @Deprecated
  public void setPassageRelevancyOverSpeed(boolean passageRelevancyOverSpeed) {
    this.passageRelevancyOverSpeed = passageRelevancyOverSpeed;
  }

  @Deprecated
  public void setMaxLength(int maxLength) {
    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
      // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
      // our sentinel in the offsets queue uses this value to terminate.
      throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
    }
    this.maxLength = maxLength;
  }

  @Deprecated
  public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
    this.breakIterator = breakIterator;
  }

  @Deprecated
  public void setScorer(PassageScorer scorer) {
    this.scorer = scorer;
  }

  @Deprecated
  public void setFormatter(PassageFormatter formatter) {
    this.formatter = formatter;
  }

  @Deprecated
  public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
    this.maxNoHighlightPassages = defaultMaxNoHighlightPassages;
  }

  @Deprecated
  public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
    this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
  }

  @Deprecated
  public void setFieldMatcher(Predicate<String> predicate) {
    this.fieldMatcher = predicate;
  }

  @Deprecated
  public void setWeightMatches(boolean weightMatches) {
    this.weightMatches = weightMatches;
  }

  /**
   * Returns whether {@link org.apache.lucene.search.MultiTermQuery} derivatives will be
   * highlighted. By default it's enabled. MTQ highlighting can be expensive, particularly when
   * using offsets in postings.
   */
  @Deprecated
  protected boolean shouldHandleMultiTermQuery(String field) {
    return handleMultiTermQuery;
  }

  /**
   * Returns whether position sensitive queries (e.g. phrases and {@link SpanQuery}ies) should be
   * highlighted strictly based on query matches (slower) versus any/all occurrences of the
   * underlying terms. By default it's enabled, but there's no overhead if such queries aren't
   * used.
   */
  @Deprecated
  protected boolean shouldHighlightPhrasesStrictly(String field) {
    return highlightPhrasesStrictly;
  }

  @Deprecated
  protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
    return passageRelevancyOverSpeed;
  }

  /**
   * Builder for UnifiedHighlighter.
   *
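   * <p>For example (a sketch; the chosen options are illustrative, not recommendations):
   *
   * <pre>{@code
   * UnifiedHighlighter highlighter =
   *     UnifiedHighlighter.builder(searcher, analyzer)
   *         .withMaxLength(10000)
   *         .withHandleMultiTermQuery(true)
   *         .withFormatter(new DefaultPassageFormatter())
   *         .build();
   * }</pre>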
   */
  public static class Builder {
    /** If null, can only use highlightWithoutSearcher. */
    private final IndexSearcher searcher;

    private final Analyzer indexAnalyzer;
    private Predicate<String> fieldMatcher;
    private Function<String, Set<String>> maskedFieldsFunc;
    private Set<HighlightFlag> flags;
    private boolean handleMultiTermQuery = DEFAULT_ENABLE_MULTI_TERM_QUERY;
    private boolean highlightPhrasesStrictly = DEFAULT_ENABLE_HIGHLIGHT_PHRASES_STRICTLY;
    private boolean passageRelevancyOverSpeed = DEFAULT_ENABLE_RELEVANCY_OVER_SPEED;
    private boolean weightMatches = DEFAULT_ENABLE_WEIGHT_MATCHES;
    private int maxLength = DEFAULT_MAX_LENGTH;

    /** BreakIterator is stateful so we use a Supplier factory method. */
    private Supplier<BreakIterator> breakIterator = DEFAULT_BREAK_ITERATOR;

    private PassageScorer scorer = DEFAULT_PASSAGE_SCORER;
    private PassageFormatter formatter = DEFAULT_PASSAGE_FORMATTER;
    private int maxNoHighlightPassages = DEFAULT_MAX_HIGHLIGHT_PASSAGES;
    private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
    private Comparator<Passage> passageSortComparator = DEFAULT_PASSAGE_SORT_COMPARATOR;

    /**
     * Constructor for UH builder which accepts {@link IndexSearcher} and {@link Analyzer} objects.
     * The {@link IndexSearcher} object can only be null when {@link
     * #highlightWithoutSearcher(String, Query, String, int)} is used.
     *
     * @param searcher - {@link IndexSearcher}
     * @param indexAnalyzer - {@link Analyzer}
     */
    public Builder(IndexSearcher searcher, Analyzer indexAnalyzer) {
      this.searcher = searcher;
      this.indexAnalyzer = indexAnalyzer;
    }

    /**
     * User-defined set of {@link HighlightFlag} values which will override the flags set by {@link
     * #withHandleMultiTermQuery(boolean)}, {@link #withHighlightPhrasesStrictly(boolean)}, {@link
     * #withPassageRelevancyOverSpeed(boolean)} and {@link #withWeightMatches(boolean)}.
     *
     * <p>Here the user can either specify the set of {@link HighlightFlag}s to be applied or use
     * the boolean flags to populate the final list of {@link HighlightFlag}s.
     *
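     * <p>For example, to enable everything except {@link HighlightFlag#WEIGHT_MATCHES} (a
     * sketch):
     *
     * <pre>{@code
     * builder.withFlags(EnumSet.of(
     *     HighlightFlag.MULTI_TERM_QUERY,
     *     HighlightFlag.PHRASES,
     *     HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED));
     * }</pre>
     *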
     * @param values - set of {@link HighlightFlag} values.
     */
    public Builder withFlags(Set<HighlightFlag> values) {
      this.flags = values;
      return this;
    }

    /**
     * Here position sensitive queries (e.g. phrases and {@link SpanQuery}ies) are highlighted
     * strictly based on query matches (slower) versus any/all occurrences of the underlying terms.
     * By default it's enabled, but there's no overhead if such queries aren't used.
     */
    public Builder withHighlightPhrasesStrictly(boolean value) {
      this.highlightPhrasesStrictly = value;
      return this;
    }

    /**
     * Here {@link org.apache.lucene.search.MultiTermQuery} derivatives will be highlighted. By
     * default it's enabled. MTQ highlighting can be expensive, particularly when using offsets in
     * postings.
     */
    public Builder withHandleMultiTermQuery(boolean value) {
      this.handleMultiTermQuery = value;
      return this;
    }

    /** Passage relevancy is more important than speed. True by default. */
    public Builder withPassageRelevancyOverSpeed(boolean value) {
      this.passageRelevancyOverSpeed = value;
      return this;
    }

    /**
     * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's
     * more accurate to the query, and the snippets can be a little different for phrases because
     * the whole phrase is marked up instead of each word. The passage relevancy calculation can be
     * different (maybe worse?) and it's slower when highlighting many fields. Use of this flag
     * requires {@link HighlightFlag#MULTI_TERM_QUERY} and {@link HighlightFlag#PHRASES} and {@link
     * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true
     * by default.
     */
    public Builder withWeightMatches(boolean value) {
      this.weightMatches = value;
      return this;
    }

    /** The text to be highlighted is effectively truncated to this length. */
    public Builder withMaxLength(int value) {
      if (value < 0 || value == Integer.MAX_VALUE) {
        // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
        // our sentinel in the offsets queue uses this value to terminate.
        throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
      }
      this.maxLength = value;
      return this;
    }

    public Builder withBreakIterator(Supplier<BreakIterator> value) {
      this.breakIterator = value;
      return this;
    }

    public Builder withFieldMatcher(Predicate<String> value) {
      this.fieldMatcher = value;
      return this;
    }

    /**
     * Set up a function that given a field returns a set of masked fields whose matches are
     * combined to highlight the given field. Masked fields should not include the original field.
     * This is useful when you want to highlight a field based on matches from several fields.
     *
     * <p>Note: All masked fields must share the same source as the field being highlighted,
     * otherwise their offsets will not correspond to the highlighted field.
     *
     * <p>Note: Only the field being highlighted must provide an original source value (e.g.
     * through a stored field); other masked fields don't need it.
     *
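     * <p>For example, to highlight a hypothetical {@code "body"} field using matches from a
     * {@code "body.exact"} variant indexed from the same source text (a sketch):
     *
     * <pre>{@code
     * builder.withMaskedFieldsFunc(
     *     field -> field.equals("body") ? Set.of("body.exact") : Set.of());
     * }</pre>
     *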
     */
    public Builder withMaskedFieldsFunc(Function<String, Set<String>> maskedFieldsFunc) {
      this.maskedFieldsFunc = maskedFieldsFunc;
      return this;
    }

    public Builder withScorer(PassageScorer value) {
      this.scorer = value;
      return this;
    }

    public Builder withFormatter(PassageFormatter value) {
      this.formatter = value;
      return this;
    }

    public Builder withMaxNoHighlightPassages(int value) {
      this.maxNoHighlightPassages = value;
      return this;
    }

    public Builder withCacheFieldValCharsThreshold(int value) {
      this.cacheFieldValCharsThreshold = value;
      return this;
    }

    public Builder withPassageSortComparator(Comparator<Passage> value) {
      this.passageSortComparator = value;
      return this;
    }

    public UnifiedHighlighter build() {
      return new UnifiedHighlighter(this);
    }

    /** ... as passed in from the Builder constructor. */
    public IndexSearcher getIndexSearcher() {
      return searcher;
    }

    /** ... as passed in from the Builder constructor. */
    public Analyzer getIndexAnalyzer() {
      return indexAnalyzer;
    }

    public Set<HighlightFlag> getFlags() {
      return flags;
    }
  }

  /**
   * Creates a {@link Builder} object where {@link IndexSearcher} and {@link Analyzer} are not
   * null.
   *
   * @param searcher - an {@link IndexSearcher} object.
   * @param indexAnalyzer - an {@link Analyzer} object.
   * @return a {@link Builder} object
   */
  public static Builder builder(IndexSearcher searcher, Analyzer indexAnalyzer) {
    return new Builder(searcher, indexAnalyzer);
  }

  /**
   * Creates a {@link Builder} object in which you can only use {@link
   * UnifiedHighlighter#highlightWithoutSearcher(String, Query, String, int)} for highlighting.
   *
   * @param indexAnalyzer - an {@link Analyzer} object.
   * @return a {@link Builder} object
   */
  public static Builder builderWithoutSearcher(Analyzer indexAnalyzer) {
    return new Builder(null, indexAnalyzer);
  }

  /**
   * Constructs the highlighter with the given {@link Builder}.
   *
   * @param builder - a {@link Builder} object.
   */
  public UnifiedHighlighter(Builder builder) {
    this.searcher = builder.searcher;
    this.indexAnalyzer =
        Objects.requireNonNull(
            builder.indexAnalyzer,
            "indexAnalyzer is required (even if in some circumstances it isn't used)");
    this.flags = evaluateFlags(builder);
    this.maxLength = builder.maxLength;
    this.breakIterator = builder.breakIterator;
    this.fieldMatcher = builder.fieldMatcher;
    this.maskedFieldsFunc = builder.maskedFieldsFunc;
    this.scorer = builder.scorer;
    this.formatter = builder.formatter;
    this.maxNoHighlightPassages = builder.maxNoHighlightPassages;
    this.cacheFieldValCharsThreshold = builder.cacheFieldValCharsThreshold;
    this.passageSortComparator = builder.passageSortComparator;
  }

  /** Extracts matching terms. */
  protected static Set<Term> extractTerms(Query query) {
    Set<Term> queryTerms = new HashSet<>();
    query.visit(QueryVisitor.termCollector(queryTerms));
    return queryTerms;
  }

  /**
   * This method returns the set of {@link HighlightFlag}s, which will be applied to the UH object.
   * The output depends on the values provided to {@link
   * Builder#withHandleMultiTermQuery(boolean)}, {@link
   * Builder#withHighlightPhrasesStrictly(boolean)}, {@link
   * Builder#withPassageRelevancyOverSpeed(boolean)} and {@link Builder#withWeightMatches(boolean)}
   * OR {@link #setHandleMultiTermQuery(boolean)}, {@link #setHighlightPhrasesStrictly(boolean)},
   * {@link #setPassageRelevancyOverSpeed(boolean)} and {@link #setWeightMatches(boolean)}.
   *
   * @param shouldHandleMultiTermQuery - flag for adding Multi-term query
   * @param shouldHighlightPhrasesStrictly - flag for adding phrase highlighting
   * @param shouldPassageRelevancyOverSpeed - flag for adding passage relevancy
   * @param shouldEnableWeightMatches - flag for enabling weight matches
   * @return a set of {@link HighlightFlag}s.
   */
  protected Set<HighlightFlag> evaluateFlags(
      final boolean shouldHandleMultiTermQuery,
      final boolean shouldHighlightPhrasesStrictly,
      final boolean shouldPassageRelevancyOverSpeed,
      final boolean shouldEnableWeightMatches) {
    Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
    if (shouldHandleMultiTermQuery) {
      highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
    }
    if (shouldHighlightPhrasesStrictly) {
      highlightFlags.add(HighlightFlag.PHRASES);
    }
    if (shouldPassageRelevancyOverSpeed) {
      highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
    }

    // Evaluate if WEIGHT_MATCHES can be added as a flag.
    final boolean applyWeightMatches =
        highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
            && highlightFlags.contains(HighlightFlag.PHRASES)
            && highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)
            // User can also opt-out of WEIGHT_MATCHES.
            && shouldEnableWeightMatches;

    if (applyWeightMatches) {
      highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);
    }
    return highlightFlags;
  }

  /**
   * Evaluate the highlight flags and set the {@link #flags} variable. This is called only once
   * when the Builder object is used to create a UH object.
   *
   * @param uhBuilder - {@link Builder} object.
   * @return {@link HighlightFlag}s.
   */
  protected Set<HighlightFlag> evaluateFlags(Builder uhBuilder) {
    if (flags != null) {
      return flags;
    }
    return flags =
        evaluateFlags(
            uhBuilder.handleMultiTermQuery,
            uhBuilder.highlightPhrasesStrictly,
            uhBuilder.passageRelevancyOverSpeed,
            uhBuilder.weightMatches);
  }

  /**
   * Evaluate the highlight flags and set the {@link #flags} variable. This is called every time
   * the {@link #getFlags(String)} method is called. This is used in the builder and has been
   * marked deprecated since it is used only for the mutable initialization of a UH object.
   *
   * @param uh - {@link UnifiedHighlighter} object.
   * @return {@link HighlightFlag}s.
   */
  @Deprecated
  protected Set<HighlightFlag> evaluateFlags(UnifiedHighlighter uh) {
    return evaluateFlags(
        uh.handleMultiTermQuery,
        uh.highlightPhrasesStrictly,
        uh.passageRelevancyOverSpeed,
        uh.weightMatches);
  }

  /**
   * Returns the predicate to use for extracting the query part that must be highlighted. By
   * default only queries that target the current field are kept. (AKA requireFieldMatch)
   */
  protected Predicate<String> getFieldMatcher(String field) {
    if (fieldMatcher != null) {
      return fieldMatcher;
    } else {
      // requireFieldMatch = true
      return (qf) -> field.equals(qf);
    }
  }

  protected Set<String> getMaskedFields(String field) {
    return maskedFieldsFunc == null ? null : maskedFieldsFunc.apply(field);
  }

  /** Returns the {@link HighlightFlag}s applicable for the current UH instance. */
  protected Set<HighlightFlag> getFlags(String field) {
    // If a builder is used for initializing a UH object, then flags will never be null.
    // Once the setters are removed, this method can just return the flags.
    if (flags != null) {
      return flags;
    }
    // When not using the builder, you have to re-evaluate the flags.
    return evaluateFlags(this);
  }

  /**
   * The maximum content size to process. Content will be truncated to this size before
   * highlighting. Typically snippets closer to the beginning of the document better summarize its
   * content.
   */
  public int getMaxLength() {
    return maxLength;
  }

  /**
   * Returns the {@link BreakIterator} to use for dividing text into passages. This returns {@link
   * BreakIterator#getSentenceInstance(Locale)} by default; subclasses can override to customize.
   *
   * <p>Note: this highlighter will call {@link BreakIterator#preceding(int)} and {@link
   * BreakIterator#next()} many times on it. The default generic JDK implementation of {@code
   * preceding} performs poorly.
   *
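   * <p>For example, a subclass might return a {@link WholeBreakIterator} for short fields (a
   * sketch; the field name is hypothetical):
   *
   * <pre>{@code
   * @Override
   * protected BreakIterator getBreakIterator(String field) {
   *   return field.equals("title") ? new WholeBreakIterator() : super.getBreakIterator(field);
   * }
   * }</pre>
   *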
   */
  protected BreakIterator getBreakIterator(String field) {
    return breakIterator.get();
  }

  /** Returns the {@link PassageScorer} to use for ranking passages. */
  protected PassageScorer getScorer(String field) {
    return scorer;
  }

  /**
   * Returns the {@link PassageFormatter} to use for formatting passages into highlighted snippets.
   */
  protected PassageFormatter getFormatter(String field) {
    return formatter;
  }

  /** Returns the {@link Comparator} to use for finally sorting passages. */
  protected Comparator<Passage> getPassageSortComparator(String field) {
    return passageSortComparator;
  }

  /**
   * Returns the number of leading passages (as delineated by the {@link BreakIterator}) when no
   * highlights could be found. If it's less than 0 (the default) then this defaults to the {@code
   * maxPassages} parameter given for each request. If this is 0 then the resulting highlight is
   * null (not formatted).
   */
  protected int getMaxNoHighlightPassages(String field) {
    return maxNoHighlightPassages;
  }

  /**
   * Limits the amount of field value pre-fetching until this threshold is passed. The highlighter
   * internally highlights in batches of documents sized on the sum field value length (in chars)
   * of the fields to be highlighted (bounded by {@link #getMaxLength()} for each field). By
   * setting this to 0, you can force documents to be fetched and highlighted one at a time, which
   * you usually shouldn't do. The default is 524288 chars which translates to about a megabyte.
   * However, note that the highlighter sometimes ignores this and highlights one document at a
   * time (without caching a bunch of documents in advance) when it can detect there's no point in
   * it -- such as when all fields will be highlighted via re-analysis as one example.
   */
  public int getCacheFieldValCharsThreshold() { // question: should we size by bytes instead?
    return cacheFieldValCharsThreshold;
  }

  /** ... as passed in from the constructor. */
  public IndexSearcher getIndexSearcher() {
    return searcher;
  }

  /** ... as passed in from the constructor. */
  public Analyzer getIndexAnalyzer() {
    return indexAnalyzer;
  }

  /** Source of term offsets; essential for highlighting. */
  public enum OffsetSource {
    POSTINGS,
    TERM_VECTORS,
    ANALYSIS,
    POSTINGS_WITH_TERM_VECTORS,
    NONE_NEEDED
  }

  /**
   * Determine the offset source for the specified field. The default algorithm is as follows:
   *
   * <ol>
   *   <li>This calls {@link #getFieldInfo(String)}. Note this returns null if there is no searcher
   *       or if the field isn't found there.
   *   <li>If there's a field info and it has {@link
   *       IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} then {@link
   *       OffsetSource#POSTINGS} is returned.
   *   <li>If there's a field info and {@link FieldInfo#hasVectors()} then {@link
   *       OffsetSource#TERM_VECTORS} is returned (note we can't check here if the TV has offsets;
   *       if it doesn't then an exception will get thrown down the line).
   *   <li>Fall-back: {@link OffsetSource#ANALYSIS} is returned.
   * </ol>
   *
   * <p>Note that the highlighter sometimes switches to something else based on the query, such as
   * if you have {@link OffsetSource#POSTINGS_WITH_TERM_VECTORS} but in fact don't need term
   * vectors.
   *
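   * <p>For example, a field could be indexed with offsets in postings like this (a sketch; the
   * field name and text are hypothetical):
   *
   * <pre>{@code
   * FieldType fieldType = new FieldType(TextField.TYPE_STORED);
   * fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
   * fieldType.freeze();
   * doc.add(new Field("body", "some text to highlight", fieldType));
   * }</pre>
   *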
   */
  protected OffsetSource getOffsetSource(String field) {
    FieldInfo fieldInfo = getFieldInfo(field);
    if (fieldInfo != null) {
      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
        return fieldInfo.hasVectors()
            ? OffsetSource.POSTINGS_WITH_TERM_VECTORS
            : OffsetSource.POSTINGS;
      }
      if (fieldInfo.hasVectors()) { // unfortunately we can't also check if the TV has offsets
        return OffsetSource.TERM_VECTORS;
      }
    }
    return OffsetSource.ANALYSIS;
  }

  /**
   * Called by the default implementation of {@link #getOffsetSource(String)}. If there is no
   * searcher then we simply always return null.
   */
  protected FieldInfo getFieldInfo(String field) {
    if (searcher == null) {
      return null;
    }
    // Need thread-safety for lazy-init but let's avoid 'synchronized' by using the double-check
    // locking idiom
    FieldInfos fieldInfos = this.fieldInfos; // note: it's volatile; read once
    if (fieldInfos == null) {
      synchronized (this) {
        fieldInfos = this.fieldInfos;
        if (fieldInfos == null) {
          fieldInfos = FieldInfos.getMergedFieldInfos(searcher.getIndexReader());
          this.fieldInfos = fieldInfos;
        }
      }
    }
    return fieldInfos.fieldInfo(field);
  }

  /**
   * Highlights the top passages from a single field.
   *
   * @param field field name to highlight. Must have a stored string value and also be indexed with
   *     offsets.
   * @param query query to highlight.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
   *     If no highlights were found for a document, the first sentence for the field will be
   *     returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
    return highlight(field, query, topDocs, 1);
  }

  /**
   * Highlights the top-N passages from a single field.
   *
   * @param field field name to highlight. Must have a stored string value.
   * @param query query to highlight.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
   *     snippets.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
   *     If no highlights were found for a document, the first {@code maxPassages} sentences from
   *     the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages)
      throws IOException {
    Map<String, String[]> res =
        highlightFields(new String[] {field}, query, topDocs, new int[] {maxPassages});
    return res.get(field);
  }

  /**
   * Highlights the top passages from multiple fields.
   *
   * <p>Conceptually, this behaves as a more efficient form of:
   *
   * <pre>
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, topDocs));
   * }
   * return m;
   * </pre>
   *
   * @param fields field names to highlight. Must have a stored string value.
   * @param query query to highlight.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in <code>topDocs</code>. If no highlights were found for a document, the
   *     first sentence from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String, String[]> highlightFields(String[] fields, Query query, TopDocs topDocs)
      throws IOException {
    int[] maxPassages = new int[fields.length];
    Arrays.fill(maxPassages, 1);
    return highlightFields(fields, query, topDocs, maxPassages);
  }

  /**
   * Highlights the top-N passages from multiple fields.
   *
   * <p>Conceptually, this behaves as a more efficient form of:
   *
   * <pre>
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, topDocs, maxPassages));
   * }
   * return m;
   * </pre>
   *
   * @param fields field names to highlight. Must have a stored string value.
   * @param query query to highlight.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @param maxPassages The maximum number of top-N ranked passages per-field used to form the
   *     highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in <code>topDocs</code>. If no highlights were found for a document, the
   *     first {@code maxPassages} sentences from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String, String[]> highlightFields(
      String[] fields, Query query, TopDocs topDocs, int[] maxPassages) throws IOException {
    final ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    int[] docids = new int[scoreDocs.length];
    for (int i = 0; i < docids.length; i++) {
      docids[i] = scoreDocs[i].doc;
    }
    return highlightFields(fields, query, docids, maxPassages);
  }

  /**
   * Highlights the top-N passages from multiple fields, for the provided int[] docids.
   *
   * @param fieldsIn field names to highlight. Must have a stored string value.
   * @param query query to highlight.
   * @param docidsIn containing the document IDs to highlight.
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
   *     highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in <code>docidsIn</code>. If no highlights were found for a document, the
   *     first {@code maxPassages} from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String, String[]> highlightFields(
      String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn) throws IOException {
    Map<String, String[]> snippets = new HashMap<>();
    for (Map.Entry<String, Object[]> ent :
        highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn).entrySet()) {
      Object[] snippetObjects = ent.getValue();
      String[] snippetStrings = new String[snippetObjects.length];
      snippets.put(ent.getKey(), snippetStrings);
      for (int i = 0; i < snippetObjects.length; i++) {
        Object snippet = snippetObjects[i];
        if (snippet != null) {
          snippetStrings[i] = snippet.toString();
        }
      }
    }
    return snippets;
  }

  /**
   * Expert: highlights the top-N passages from multiple fields, for the provided int[] docids, to
   * custom Object as returned by the {@link PassageFormatter}. Use this API to render to something
   * other than String.
   *
   * @param fieldsIn field names to highlight. Must have a stored string value.
   * @param query query to highlight.
   * @param docIdsIn containing the document IDs to highlight.
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to form the
   *     highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets corresponding to
   *     the documents in <code>docIdsIn</code>. If no highlights were found for a document, the
   *     first {@code maxPassages} from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if field was indexed without {@link
   *     IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  protected Map<String, Object[]> highlightFieldsAsObjects(
      String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
    if (fieldsIn.length < 1) {
      throw new IllegalArgumentException("fieldsIn must not be empty");
    }
    if (fieldsIn.length != maxPassagesIn.length) {
      throw new IllegalArgumentException("invalid number of maxPassagesIn");
    }
    if (searcher == null) {
      throw new IllegalStateException(
          "This method requires that an indexSearcher was passed in the constructor. "
              + "Perhaps you mean to call highlightWithoutSearcher?");
    }

    // Sort docs & fields for sequential i/o

    // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
    int[] docIds = new int[docIdsIn.length];
    int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
    copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes); // latter 2 are "out" params

    // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
    final String[] fields = new String[fieldsIn.length];
    final int[] maxPassages = new int[maxPassagesIn.length];
    copyAndSortFieldsWithMaxPassages(
        fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params

    // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
    Set<Term> queryTerms = extractTerms(query);
    FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
    int numTermVectors = 0;
    int numPostings = 0;
    for (int f = 0; f < fields.length; f++) {
      FieldHighlighter fieldHighlighter =
          getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
      fieldHighlighters[f] = fieldHighlighter;

      switch (fieldHighlighter.getOffsetSource()) {
        case TERM_VECTORS:
          numTermVectors++;
          break;
        case POSTINGS:
          numPostings++;
          break;
        case POSTINGS_WITH_TERM_VECTORS:
          numTermVectors++;
          numPostings++;
          break;
        case ANALYSIS:
        case NONE_NEEDED:
        default:
          // do nothing
          break;
      }
    }

    int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);

    IndexReader indexReaderWithTermVecCache =
        (numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;

    // [fieldIdx][docIdInIndex] of highlightDoc result
    Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
    // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
    DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
    for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
      // Load the field values of the first batch of document(s) (note: commonly all docs are in
      // this batch)
      List<CharSequence[]> fieldValsByDoc = loadFieldValues(fields, docIdIter, cacheCharsThreshold);
      // the size of the above list is the size of the batch (num of docs in the batch)

      // Highlight in per-field order first, then by doc (better I/O pattern)
      for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
        Object[] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
        FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
        for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
          int docId = docIds[docIdx]; // sorted order
          CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
          if (content == null) {
            continue;
          }
          IndexReader indexReader =
              (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
                      && indexReaderWithTermVecCache != null)
                  ? indexReaderWithTermVecCache
                  : searcher.getIndexReader();
          final LeafReader leafReader;
          if (indexReader instanceof LeafReader) {
            leafReader = (LeafReader) indexReader;
          } else {
            List<LeafReaderContext> leaves = indexReader.leaves();
            LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
            leafReader = leafReaderContext.reader();
            docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
          }
          int docInIndex = docInIndexes[docIdx]; // original input order
          assert resultByDocIn[docInIndex] == null;
          resultByDocIn[docInIndex] =
              fieldHighlighter.highlightFieldForDoc(leafReader, docId, content.toString());
        }
      }

      batchDocIdx += fieldValsByDoc.size();
    }
    IOUtils.close(indexReaderWithTermVecCache); // FYI won't close underlying reader
    assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
        || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;

    // TODO reconsider the return type; since this is an "advanced" method, lets not return a Map?
    // Notice the only caller simply iterates it to build another structure.

    // field -> object highlights parallel to docIdsIn
    Map<String, Object[]> resultMap = CollectionUtil.newHashMap(fields.length);
    for (int f = 0; f < fields.length; f++) {
      resultMap.put(fields[f], highlightDocsInByField[f]);
    }
    return resultMap;
  }

  /**
   * When cacheCharsThreshold is 0, loadFieldValues() only fetches one document at a time. We
   * override it to be 0 in two circumstances:
   */
  private int calculateOptimalCacheCharsThreshold(int numTermVectors, int numPostings) {
    if (numPostings == 0 && numTermVectors == 0) {
      // (1) When all fields are ANALYSIS there's no point in caching a batch of documents
      // because no other info on disk is needed to highlight it.
      return 0;
    } else if (numTermVectors >= 2) {
      // (2) When two or more fields have term vectors, given the field-then-doc algorithm, the
      // underlying term vectors will be fetched in a terrible access pattern unless we highlight
      // a doc at a time and use a special current-doc TV cache. So we do that. Hopefully one day
      // TVs will be improved to make this pointless.
      return 0;
    } else {
      return getCacheFieldValCharsThreshold();
    }
  }

  private void copyAndSortFieldsWithMaxPassages(
      String[] fieldsIn, int[] maxPassagesIn, final String[] fields, final int[] maxPassages) {
    System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
    System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
    new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        String tmp = fields[i];
        fields[i] = fields[j];
        fields[j] = tmp;
        int tmp2 = maxPassages[i];
        maxPassages[i] = maxPassages[j];
        maxPassages[j] = tmp2;
      }

      @Override
      protected int compare(int i, int j) {
        return fields[i].compareTo(fields[j]);
      }
    }.sort(0, fields.length);
  }

  private void copyAndSortDocIdsWithIndex(
      int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
    System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
    for (int i = 0; i < docInIndexes.length; i++) {
      docInIndexes[i] = i;
    }
    new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        int tmp = docIds[i];
        docIds[i] = docIds[j];
        docIds[j] = tmp;
        tmp = docInIndexes[i];
        docInIndexes[i] = docInIndexes[j];
        docInIndexes[j] = tmp;
      }

      @Override
      protected int compare(int i, int j) {
        return Integer.compare(docIds[i], docIds[j]);
      }
    }.sort(0, docIds.length);
  }

  /**
   * Highlights text passed as a parameter. This requires the {@link IndexSearcher} provided to
   * this highlighter to be null. This use-case is more rare. Naturally, the mode of operation will
   * be {@link OffsetSource#ANALYSIS}. The result of this method is whatever the {@link
   * PassageFormatter} returns. For the {@link DefaultPassageFormatter} and assuming {@code
   * content} has non-zero length, the result will be a non-null string -- so it's safe to call
   * {@link Object#toString()} on it in that case.
   *
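   * <p>For example (a minimal sketch; the {@code analyzer}, {@code query}, and text are assumed
   * to exist):
   *
   * <pre>{@code
   * UnifiedHighlighter highlighter = UnifiedHighlighter.builderWithoutSearcher(analyzer).build();
   * Object snippet = highlighter.highlightWithoutSearcher("body", query, "text to mark up", 1);
   * }</pre>
   *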
   * @param field field name to highlight (as found in the query).
   * @param query query to highlight.
   * @param content text to highlight.
   * @param maxPassages The maximum number of top-N ranked passages used to form the highlighted
   *     snippets.
   * @return result of the {@link PassageFormatter} -- probably a String. Might be null.
   * @throws IOException if an I/O error occurred during processing
   */
  // TODO make content a List? and return a List? and ensure getEmptyHighlight is never invoked
  // multiple times?
  public Object highlightWithoutSearcher(
      String field, Query query, String content, int maxPassages) throws IOException {
    if (this.searcher != null) {
      throw new IllegalStateException(
          "highlightWithoutSearcher should only be called on a "
              + getClass().getSimpleName()
              + " without an IndexSearcher.");
    }
    Objects.requireNonNull(content, "content is required");
    Set<Term> queryTerms = extractTerms(query);
    return getFieldHighlighter(field, query, queryTerms, maxPassages)
        .highlightFieldForDoc(null, -1, content);
  }

  protected FieldHighlighter getFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    Set<String> maskedFields = getMaskedFields(field);
    FieldOffsetStrategy fieldOffsetStrategy;
    if (maskedFields == null || maskedFields.isEmpty()) {
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldOffsetStrategy = getOffsetStrategy(offsetSource, components);
    } else {
      List<FieldOffsetStrategy> fieldsOffsetStrategies = new ArrayList<>(maskedFields.size() + 1);
      for (String maskedField : maskedFields) {
        UHComponents components = getHighlightComponents(maskedField, query, allTerms);
        OffsetSource offsetSource = getOptimizedOffsetSource(components);
        fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
      }
      // adding the original field as well
      UHComponents components = getHighlightComponents(field, query, allTerms);
      OffsetSource offsetSource = getOptimizedOffsetSource(components);
      fieldsOffsetStrategies.add(getOffsetStrategy(offsetSource, components));
      fieldOffsetStrategy = new MultiFieldsOffsetStrategy(fieldsOffsetStrategies);
    }
    return newFieldHighlighter(
        field,
        fieldOffsetStrategy,
        new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
        getScorer(field),
        maxPassages,
        getMaxNoHighlightPassages(field),
        getFormatter(field),
        getPassageSortComparator(field));
  }

  protected FieldHighlighter newFieldHighlighter(
      String field,
      FieldOffsetStrategy fieldOffsetStrategy,
      BreakIterator breakIterator,
      PassageScorer passageScorer,
      int maxPassages,
      int maxNoHighlightPassages,
      PassageFormatter passageFormatter,
      Comparator<Passage> passageSortComparator) {
    return new FieldHighlighter(
        field,
        fieldOffsetStrategy,
        breakIterator,
        passageScorer,
        maxPassages,
        maxNoHighlightPassages,
        passageFormatter,
        passageSortComparator);
  }

  protected UHComponents getHighlightComponents(String field, Query query, Set<Term> allTerms) {
    Predicate<String> fieldMatcher = getFieldMatcher(field);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
    BytesRef[] terms = null;
    LabelledCharArrayMatcher[] automata = null;
    if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
      terms = filterExtractedTerms(fieldMatcher, allTerms);
      automata = getAutomata(field, query, highlightFlags);
    } // otherwise don't need to extract
    return new UHComponents(
        field,
        fieldMatcher,
        query,
        terms,
        phraseHelper,
        automata,
        queryHasUnrecognizedPart,
        highlightFlags);
  }

  protected boolean hasUnrecognizedQuery(Predicate<String> fieldMatcher, Query query) {
    boolean[] hasUnknownLeaf = new boolean[1];
    query.visit(
        new QueryVisitor() {
          @Override
          public boolean acceptField(String field) {
            // checking hasUnknownLeaf is a trick to exit early
            return hasUnknownLeaf[0] == false && fieldMatcher.test(field);
          }

          @Override
          public void visitLeaf(Query query) {
            if (MultiTermHighlighting.canExtractAutomataFromLeafQuery(query) == false) {
              if (!(query instanceof MatchAllDocsQuery || query instanceof MatchNoDocsQuery)) {
                hasUnknownLeaf[0] = true;
              }
            }
          }
        });
    return hasUnknownLeaf[0];
  }

  protected static BytesRef[] filterExtractedTerms(
      Predicate<String> fieldMatcher, Set<Term> queryTerms) {
    // Strip off the redundant field and sort the remaining terms
    SortedSet<BytesRef> filteredTerms = new TreeSet<>();
    for (Term term : queryTerms) {
      if (fieldMatcher.test(term.field())) {
        filteredTerms.add(term.bytes());
      }
    }
    return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
  }

  protected PhraseHelper getPhraseHelper(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
    boolean useWeightMatchesIter = highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES);
    if (useWeightMatchesIter) {
      return PhraseHelper.NONE; // will be handled by Weight.matches which always considers phrases
    }
    boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
    boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
    return highlightPhrasesStrictly
        ? new PhraseHelper(
            query,
            field,
            getFieldMatcher(field),
            this::requiresRewrite,
            this::preSpanQueryRewrite,
            !handleMultiTermQuery)
        : PhraseHelper.NONE;
  }

  protected LabelledCharArrayMatcher[] getAutomata(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
    // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper
    // handle those? -- if we don't highlight phrases strictly,
    final boolean lookInSpan =
        !highlightFlags.contains(HighlightFlag.PHRASES) // no PhraseHelper
            || highlightFlags.contains(
                HighlightFlag.WEIGHT_MATCHES); // Weight.Matches will find all

    return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
        ? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), lookInSpan)
        : ZERO_LEN_AUTOMATA_ARRAY;
  }

  protected OffsetSource getOptimizedOffsetSource(UHComponents components) {
    OffsetSource offsetSource = getOffsetSource(components.getField());

    // null automata means unknown, so assume a possibility
    boolean mtqOrRewrite =
        components.getAutomata() == null
            || components.getAutomata().length > 0
            || components.getPhraseHelper().willRewrite()
            || components.hasUnrecognizedQueryPart();

    // null terms means unknown, so assume something to highlight
    if (mtqOrRewrite == false
        && components.getTerms() != null
        && components.getTerms().length == 0) {
      return OffsetSource.NONE_NEEDED; // nothing to highlight
    }

    switch (offsetSource) {
      case POSTINGS:
        if (mtqOrRewrite) {
          // may need to scan through all terms for the highlighted document efficiently
          return OffsetSource.ANALYSIS;
        }
        break;
      case POSTINGS_WITH_TERM_VECTORS:
        if (mtqOrRewrite == false) {
          return OffsetSource.POSTINGS; // We don't need term vectors
        }
        break;
      case ANALYSIS:
      case TERM_VECTORS:
      case NONE_NEEDED:
      default:
        // stick with the original offset source
        break;
    }

    return offsetSource;
  }

  protected FieldOffsetStrategy getOffsetStrategy(
      OffsetSource offsetSource, UHComponents components) {
    switch (offsetSource) {
      case ANALYSIS:
        if (!components.getPhraseHelper().hasPositionSensitivity()
            && !components.getHighlightFlags().contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)
            && !components.getHighlightFlags().contains(HighlightFlag.WEIGHT_MATCHES)) {
          // skip using a memory index since it's pure term filtering
          return new TokenStreamOffsetStrategy(components, getIndexAnalyzer());
        } else {
          return new MemoryIndexOffsetStrategy(components, getIndexAnalyzer());
        }
      case NONE_NEEDED:
        return NoOpOffsetStrategy.INSTANCE;
      case TERM_VECTORS:
        return new TermVectorOffsetStrategy(components);
      case POSTINGS:
        return new PostingsOffsetStrategy(components);
      case POSTINGS_WITH_TERM_VECTORS:
        return new PostingsWithTermVectorsOffsetStrategy(components);
      default:
        throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
    }
  }
  /**
   * When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
   * {@link Query#rewrite(IndexSearcher)} called on them. It helps performance to avoid it if it's
   * not needed. This method will be invoked on all SpanQuery instances recursively. If you have
   * custom SpanQuery queries then override this to check instanceof and provide a definitive
   * answer. If the query isn't your custom one, simply return null to have the default rules
   * apply, which govern the ones included in Lucene.
   */
  protected Boolean requiresRewrite(SpanQuery spanQuery) {
    return null;
  }

  /**
   * When highlighting phrases accurately, we may need to handle custom queries that aren't
   * supported in the {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as
   * called by the {@code PhraseHelper}. Should custom query types be needed, this method should be
   * overridden to return a collection of queries if appropriate, or null if nothing to do. If the
   * query is not custom, simply returning null will allow the default rules to apply.
   *
   * @param query Query to be highlighted
   * @return A Collection of Query object(s) if it needs to be rewritten, otherwise null.
   */
  protected Collection<Query> preSpanQueryRewrite(Query query) {
    return null;
  }

  private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
    return new DocIdSetIterator() {
      int idx = -1;

      @Override
      public int docID() {
        if (idx < 0 || idx >= sortedDocIds.length) {
          return NO_MORE_DOCS;
        }
        return sortedDocIds[idx];
      }

      @Override
      public int nextDoc() throws IOException {
        idx++;
        return docID();
      }

      @Override
      public int advance(int target) throws IOException {
        return super.slowAdvance(target); // won't be called, so whatever
      }

      @Override
      public long cost() {
        return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
      }
    };
  }

  /**
   * Loads the String values for each docId by field to be highlighted. By default this loads from
   * stored fields by the same name as given, but a subclass can change the source. The returned
   * Strings must be identical to what was indexed (at least for postings or term-vectors offset
   * sources). This method must load fields for at least one document from the given {@link
   * DocIdSetIterator} but need not return all of them; by default the character lengths are summed
   * and this method will return early when {@code cacheCharsThreshold} is exceeded. Specifically
   * if that number is 0, then only one document is fetched no matter what. Values in the array of
   * {@link CharSequence} will be null if no value was found.
   *
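   * <p>A subclass might, for example, source values from outside the index (a sketch; {@code
   * externalStore} and its {@code get} method are hypothetical):
   *
   * <pre>{@code
   * @Override
   * protected List<CharSequence[]> loadFieldValues(
   *     String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
   *   List<CharSequence[]> result = new ArrayList<>();
   *   int docId = docIter.nextDoc(); // the contract requires loading at least one document
   *   CharSequence[] valuesByField = new CharSequence[fields.length];
   *   for (int i = 0; i < fields.length; i++) {
   *     valuesByField[i] = externalStore.get(docId, fields[i]); // hypothetical external lookup
   *   }
   *   result.add(valuesByField);
   *   return result;
   * }
   * }</pre>
   *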
   */
  protected List<CharSequence[]> loadFieldValues(
      String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
    List<CharSequence[]> docListOfFields =
        new ArrayList<>(cacheCharsThreshold == 0 ? 1 : (int) Math.min(64, docIter.cost()));
    LimitedStoredFieldVisitor visitor = newLimitedStoredFieldsVisitor(fields);
    StoredFields storedFields = searcher.storedFields();
    int sumChars = 0;
    do {
      int docId = docIter.nextDoc();
      if (docId == DocIdSetIterator.NO_MORE_DOCS) {
        break;
      }
      visitor.init();
      storedFields.document(docId, visitor);
      CharSequence[] valuesByField = visitor.getValuesByField();
      docListOfFields.add(valuesByField);
      for (CharSequence val : valuesByField) {
        sumChars += (val == null ? 0 : val.length());
      }
    } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);
    return docListOfFields;
  }

  /**
   * @lucene.internal
   */
  protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
    return new LimitedStoredFieldVisitor(fields, MULTIVAL_SEP_CHAR, getMaxLength());
  }

  /**
   * Fetches stored fields for highlighting. Uses a multi-val separator char and honors a max
   * length to retrieve.
   *
   * @lucene.internal
   */
  protected static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
    protected final String[] fields;
    protected final char valueSeparator;
    protected final int maxLength;
    protected CharSequence[] values; // starts off as String; may become StringBuilder.
    protected int currentField;

    public LimitedStoredFieldVisitor(String[] fields, char valueSeparator, int maxLength) {
      this.fields = fields;
      this.valueSeparator = valueSeparator;
      this.maxLength = maxLength;
    }

    void init() {
      values = new CharSequence[fields.length];
      currentField = -1;
    }

    @Override
    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
      assert currentField >= 0;
      Objects.requireNonNull(value, "String value should not be null");
      CharSequence curValue = values[currentField];
      if (curValue == null) {
        // question: if we truncate due to maxLength, should we try and avoid keeping the other
        // chars in-memory on the backing char[]?
        values[currentField] =
            value.substring(0, Math.min(maxLength, value.length())); // note: may return 'this'
        return;
      }
      final int lengthBudget = maxLength - curValue.length();
      if (lengthBudget <= 0) {
        return;
      }
      StringBuilder curValueBuilder;
      if (curValue instanceof StringBuilder) {
        curValueBuilder = (StringBuilder) curValue;
      } else {
        // upgrade String to StringBuilder. Choose a good initial size.
        curValueBuilder =
            new StringBuilder(curValue.length() + Math.min(lengthBudget, value.length() + 256));
        curValueBuilder.append(curValue);
      }
      curValueBuilder.append(valueSeparator);
      curValueBuilder.append(value, 0, Math.min(lengthBudget - 1, value.length()));
      values[currentField] = curValueBuilder;
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      currentField = Arrays.binarySearch(fields, fieldInfo.name);
      if (currentField < 0) {
        return Status.NO;
      }
      CharSequence curVal = values[currentField];
      if (curVal != null && curVal.length() >= maxLength) {
        return fields.length == 1 ? Status.STOP : Status.NO;
      }
      return Status.YES;
    }

    CharSequence[] getValuesByField() {
      return this.values;
    }
  }

  /**
   * Wraps an IndexReader that remembers/caches the last call to {@link TermVectors#get(int)} so
   * that if the next call has the same ID, then it is reused. If TV's were column-stride (like
   * doc-values), there would be no need for this.
   */
  private static class TermVectorReusingLeafReader extends FilterLeafReader {

    static IndexReader wrap(IndexReader reader) throws IOException {
      LeafReader[] leafReaders =
          reader.leaves().stream()
              .map(LeafReaderContext::reader)
              .map(TermVectorReusingLeafReader::new)
              .toArray(LeafReader[]::new);
      return new BaseCompositeReader<IndexReader>(leafReaders, null) {
        @Override
        protected void doClose() {
          // don't close the underlying reader
        }

        @Override
        public CacheHelper getReaderCacheHelper() {
          return null;
        }
      };
    }

    private int lastDocId = -1;
    private Fields tvFields;

    TermVectorReusingLeafReader(LeafReader in) {
      super(in);
    }

    @Override
    public Fields getTermVectors(int docID) throws IOException {
      if (docID != lastDocId) {
        lastDocId = docID;
        tvFields = in.getTermVectors(docID);
      }
      return tvFields;
    }

    @Override
    public TermVectors termVectors() throws IOException {
      TermVectors orig = in.termVectors();
      return new TermVectors() {
        @Override
        public Fields get(int docID) throws IOException {
          if (docID != lastDocId) {
            lastDocId = docID;
            tvFields = orig.get(docID);
          }
          return tvFields;
        }
      };
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return null;
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      return null;
    }
  }

  /** Flags for controlling highlighting behavior. */
  public enum HighlightFlag {
    /**
     * @see Builder#withHighlightPhrasesStrictly(boolean)
     */
    PHRASES,

    /**
     * @see Builder#withHandleMultiTermQuery(boolean)
     */
    MULTI_TERM_QUERY,

    /**
     * @see Builder#withPassageRelevancyOverSpeed(boolean)
     */
    PASSAGE_RELEVANCY_OVER_SPEED,

    /**
     * @see Builder#withWeightMatches(boolean)
     */
    WEIGHT_MATCHES

    // TODO: useQueryBoosts
  }
}