/*
 * Contains verbatim code and custom code based on code from the Lucene
 * project, licensed under the following terms. All parts where this is
 * the case are clearly marked as such in a source code comment referring
 * to this header.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE.upstream file distributed
 * with this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For all parts where this is not the case, refer to the LICENSE file in the
 * repository root.
 */
package de.digitalcollections.solrocr.lucene;

import com.google.common.collect.ImmutableSet;
import de.digitalcollections.solrocr.formats.alto.AltoFormat;
import de.digitalcollections.solrocr.formats.hocr.HocrFormat;
import de.digitalcollections.solrocr.formats.miniocr.MiniOcrFormat;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.ContextBreakLocator;
import de.digitalcollections.solrocr.iter.ExitingIterCharSeq;
import de.digitalcollections.solrocr.iter.FileBytesCharIterator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.MultiFileBytesCharIterator;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrHighlightResult;
import de.digitalcollections.solrocr.model.OcrSnippet;
import de.digitalcollections.solrocr.model.SourcePointer;
import de.digitalcollections.solrocr.reader.LegacyBaseCompositeReader;
import de.digitalcollections.solrocr.solr.OcrHighlightParams;
import de.digitalcollections.solrocr.util.HighlightTimeout;
import de.digitalcollections.solrocr.util.PageCacheWarmer;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.BaseCompositeReader;
import org.apache.lucene.index.ExitableDirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.PhraseHelper;
import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.search.SolrQueryTimeoutImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link UnifiedHighlighter} variant to support generating snippets with text coordinates from
 * OCR data and lazy-loading field values from external storage.
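 *
 * <p>A minimal usage sketch (the field name and the parameter values here are hypothetical and
 * purely illustrative):
 *
 * <pre>{@code
 * OcrHighlighter highlighter = new OcrHighlighter(searcher, indexAnalyzer, solrParams);
 * OcrHighlightResult[] results =
 *     highlighter.highlightOcrFields(
 *         new String[] {"ocr_text"}, query, new int[] {docId}, new int[] {2}, respHeader);
 * }</pre>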
 */
public class OcrHighlighter extends UnifiedHighlighter {

  private static final Logger log = LoggerFactory.getLogger(OcrHighlighter.class);

  private static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY_LEGACY =
      new CharacterRunAutomaton[0];
  private static final IndexSearcher EMPTY_INDEXSEARCHER;
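
  /** The OCR formats that can be auto-detected by sampling the beginning of a field value. */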
  private static final Set<OcrFormat> FORMATS =
      ImmutableSet.of(new HocrFormat(), new AltoFormat(), new MiniOcrFormat());
  private static final int DEFAULT_SNIPPET_LIMIT = 100;
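
  /** Key set in the response header to signal that highlighting timed out and results are partial. */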
  public static final String PARTIAL_OCR_HIGHLIGHTS = "partialOcrHighlights";

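  // The UnifiedHighlighter APIs this class builds on changed in Lucene 8.1, 8.2, 8.4 and 8.9, so
  // we detect the runtime version once and pick the matching code path (see the reflection
  // lookups in the static initializer below).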
  private static final boolean VERSION_IS_PRE81 =
      Version.LATEST.major < 8 || (Version.LATEST.major == 8 && Version.LATEST.minor < 1);
  private static final boolean VERSION_IS_PRE82 =
      Version.LATEST.major < 8 || (Version.LATEST.major == 8 && Version.LATEST.minor < 2);
  private static final boolean VERSION_IS_PRE84 =
      VERSION_IS_PRE82 || (Version.LATEST.major == 8 && Version.LATEST.minor < 4);
  private static final boolean VERSION_IS_PRE89 =
      VERSION_IS_PRE82 || (Version.LATEST.major == 8 && Version.LATEST.minor < 9);
  private static final Constructor<UHComponents> hlComponentsConstructorLegacy;
  private static final Method offsetSourceGetterLegacy;
  private static final Method extractAutomataLegacyMethod;

  static {
    /*
     * Copied from the upstream {@link UnifiedHighlighter} code. Please refer to the file
     * header for licensing information.
     */
    try {
      IndexReader emptyReader = new MultiReader();
      EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
      EMPTY_INDEXSEARCHER.setQueryCache(null);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }

    // For compatibility with older versions, we grab references to deprecated APIs
    // via reflection and store them as static variables.
    try {
      if (VERSION_IS_PRE81) {
        @SuppressWarnings("rawtypes")
        Class multiTermHl =
            Class.forName("org.apache.lucene.search.uhighlight.MultiTermHighlighting");
        extractAutomataLegacyMethod =
            multiTermHl.getDeclaredMethod(
                "extractAutomata", Query.class, Predicate.class, boolean.class, Function.class);
        extractAutomataLegacyMethod.setAccessible(true);
      } else if (VERSION_IS_PRE84) {
        @SuppressWarnings("rawtypes")
        Class multiTermHl =
            Class.forName("org.apache.lucene.search.uhighlight.MultiTermHighlighting");
        extractAutomataLegacyMethod =
            multiTermHl.getDeclaredMethod(
                "extractAutomata", Query.class, Predicate.class, boolean.class);
        extractAutomataLegacyMethod.setAccessible(true);
      } else {
        extractAutomataLegacyMethod = null;
      }
      if (VERSION_IS_PRE82) {
        //noinspection JavaReflectionMemberAccess
        hlComponentsConstructorLegacy =
            UHComponents.class.getDeclaredConstructor(
                String.class,
                Predicate.class,
                Query.class,
                BytesRef[].class,
                PhraseHelper.class,
                CharacterRunAutomaton[].class,
                Set.class);
        offsetSourceGetterLegacy =
            UnifiedHighlighter.class.getDeclaredMethod(
                "getOptimizedOffsetSource",
                String.class,
                BytesRef[].class,
                PhraseHelper.class,
                CharacterRunAutomaton[].class);
      } else if (VERSION_IS_PRE84) {
        //noinspection JavaReflectionMemberAccess
        hlComponentsConstructorLegacy =
            UHComponents.class.getDeclaredConstructor(
                String.class,
                Predicate.class,
                Query.class,
                BytesRef[].class,
                PhraseHelper.class,
                CharacterRunAutomaton[].class,
                boolean.class,
                Set.class);
        offsetSourceGetterLegacy = null;
      } else {
        hlComponentsConstructorLegacy = null;
        offsetSourceGetterLegacy = null;
      }
    } catch (NoSuchMethodException | ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  private final SolrParams params;

  public OcrHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer, SolrParams params) {
    super(indexSearcher, indexAnalyzer);
    this.params = params;
  }

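  /**
   * Builds the passage scorer from the request parameters, using the same BM25-style k1/b/pivot
   * parameters as the stock {@link UnifiedHighlighter} plus an OCR-specific option to boost
   * passages that occur early in the document.
   */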
  @Override
  protected PassageScorer getScorer(String fieldName) {
    float k1 = params.getFieldFloat(fieldName, HighlightParams.SCORE_K1, 1.2f);
    float b = params.getFieldFloat(fieldName, HighlightParams.SCORE_B, 0.75f);
    float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
    boolean boostEarly =
        params.getFieldBool(fieldName, OcrHighlightParams.SCORE_BOOST_EARLY, false);
    return new OcrPassageScorer(k1, b, pivot, boostEarly);
  }

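  /**
   * Determines the highlighting flags for a field from the request parameters; {@link
   * HighlightFlag#PASSAGE_RELEVANCY_OVER_SPEED} is always enabled.
   */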
  @Override
  public Set<HighlightFlag> getFlags(String field) {
    Set<HighlightFlag> flags = EnumSet.noneOf(HighlightFlag.class);
    if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, true)) {
      flags.add(HighlightFlag.MULTI_TERM_QUERY);
    }
    if (params.getFieldBool(field, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
      flags.add(HighlightFlag.PHRASES);
    }
    flags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);

    if (params.getFieldBool(field, HighlightParams.WEIGHT_MATCHES, false) // true in 8.0
        && flags.contains(HighlightFlag.PHRASES)
        && flags.contains(HighlightFlag.MULTI_TERM_QUERY)) {
      flags.add(HighlightFlag.WEIGHT_MATCHES);
    }
    return flags;
  }

  /**
   * Highlight passages from OCR fields in multiple documents.
   *
   * <p>Heavily based on {@link UnifiedHighlighter#highlightFieldsAsObjects(String[], Query, int[],
   * int[])} with modifications to add support for OCR-specific functionality and timeouts. Please
   * refer to the file header for licensing information on the original code.
   */
  public OcrHighlightResult[] highlightOcrFields(
      String[] ocrFieldNames,
      Query query,
      int[] docIDs,
      int[] maxPassagesOcr,
      Map<String, Object> respHeader)
      throws IOException {
    if (ocrFieldNames.length < 1) {
      throw new IllegalArgumentException("ocrFieldNames must not be empty");
    }
    if (ocrFieldNames.length != maxPassagesOcr.length) {
      throw new IllegalArgumentException("invalid number of maxPassagesOcr");
    }
    if (searcher == null) {
      throw new IllegalStateException(
          "This method requires that an indexSearcher was passed in the "
              + "constructor. Perhaps you mean to call highlightWithoutSearcher?");
    }

    Long timeAllowed = params.getLong(OcrHighlightParams.TIME_ALLOWED);
    if (timeAllowed != null) {
      HighlightTimeout.set(timeAllowed);
      SolrQueryTimeoutImpl.set(timeAllowed);
    }

    // Sort docs & fields for sequential i/o
    // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
    int[] docIds = new int[docIDs.length];
    int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
    copyAndSortDocIdsWithIndex(docIDs, docIds, docInIndexes); // latter 2 are "out" params

    // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
    final String[] fields = new String[ocrFieldNames.length];
    final int[] maxPassages = new int[maxPassagesOcr.length];
    copyAndSortFieldsWithMaxPassages(
        ocrFieldNames, maxPassagesOcr, fields, maxPassages); // latter 2 are "out" params

    // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
    Set<Term> queryTerms = extractTerms(query);
    OcrFieldHighlighter[] fieldHighlighters = new OcrFieldHighlighter[fields.length];
    int numTermVectors = 0;
    int numPostings = 0;
    for (int f = 0; f < fields.length; f++) {
      OcrFieldHighlighter fieldHighlighter =
          getOcrFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
      fieldHighlighters[f] = fieldHighlighter;

      switch (fieldHighlighter.getOffsetSource()) {
        case TERM_VECTORS:
          numTermVectors++;
          break;
        case POSTINGS:
          numPostings++;
          break;
        case POSTINGS_WITH_TERM_VECTORS:
          numTermVectors++;
          numPostings++;
          break;
        case ANALYSIS:
        case NONE_NEEDED:
        default:
          // do nothing
          // FIXME: This will raise a RuntimeException down the road, catch early?
          break;
      }
    }

    IndexReader indexReaderWithTermVecCache =
        (numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
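
    // From here on, two orderings are in play: documents and fields are processed in sorted
    // order for sequential I/O, while results are stored at each document's original input
    // position via docInIndexes.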
    // [fieldIdx][docIdInIndex] of highlightDoc result
    OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][docIds.length][];
    int[][] snippetCountsByField = new int[fields.length][docIds.length];
    // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
    DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);

    docLoop:
    for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
      List<IterableCharSequence[]> fieldValsByDoc = loadOcrFieldValues(fields, docIdIter);

      // Highlight in per-field order first, then by doc (better I/O pattern)
      for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
        OcrSnippet[][] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
        OcrFieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
        for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
          int docId = docIds[docIdx]; // sorted order
          IterableCharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
          if (content == null) {
            continue;
          }
          if (timeAllowed != null) {
            content = new ExitingIterCharSeq(content, HighlightTimeout.getInstance());
          }
          IndexReader indexReader =
              (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
                      && indexReaderWithTermVecCache != null)
                  ? indexReaderWithTermVecCache
                  : searcher.getIndexReader();
          final LeafReader leafReader;
          if (indexReader instanceof LeafReader) {
            leafReader = (LeafReader) indexReader;
          } else {
            List<LeafReaderContext> leaves = indexReader.leaves();
            LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
            leafReader = leafReaderContext.reader();
            docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
          }
          int docInIndex = docInIndexes[docIdx]; // original input order
          assert resultByDocIn[docInIndex] == null;

          OcrFormat ocrFormat = getFormat(content);
          String limitBlockParam = params.get(OcrHighlightParams.LIMIT_BLOCK, "block");
          OcrBlock[] limitBlocks = null;
          if (!limitBlockParam.equalsIgnoreCase("NONE")) {
            limitBlocks =
                OcrBlock.getHierarchyFrom(OcrBlock.valueOf(limitBlockParam.toUpperCase()))
                    .toArray(new OcrBlock[0]);
          }
          OcrBlock contextBlock =
              OcrBlock.valueOf(params.get(OcrHighlightParams.CONTEXT_BLOCK, "line").toUpperCase());
          BreakLocator contextLocator = ocrFormat.getBreakLocator(content, contextBlock);
          BreakLocator limitLocator =
              limitBlocks == null ? null : ocrFormat.getBreakLocator(content, limitBlocks);
          BreakLocator breakLocator =
              new ContextBreakLocator(
                  contextLocator, limitLocator, params.getInt(OcrHighlightParams.CONTEXT_SIZE, 2));
          OcrPassageFormatter formatter =
              ocrFormat.getPassageFormatter(
                  params.get(HighlightParams.TAG_PRE, "<em>"),
                  params.get(HighlightParams.TAG_POST, "</em>"),
                  params.getBool(OcrHighlightParams.ABSOLUTE_HIGHLIGHTS, false),
                  params.getBool(OcrHighlightParams.ALIGN_SPANS, false),
                  params.getBool(OcrHighlightParams.TRACK_PAGES, true));
          int snippetLimit =
              Math.max(
                  maxPassages[fieldIdx],
                  params.getInt(OcrHighlightParams.MAX_OCR_PASSAGES, DEFAULT_SNIPPET_LIMIT));
          boolean scorePassages = params.getBool(OcrHighlightParams.SCORE_PASSAGES, true);
          try {
            resultByDocIn[docInIndex] =
                fieldHighlighter.highlightFieldForDoc(
                    leafReader,
                    docId,
                    breakLocator,
                    formatter,
                    content,
                    params.get(OcrHighlightParams.PAGE_ID),
                    snippetLimit,
                    scorePassages);
          } catch (ExitingIterCharSeq.ExitingIterCharSeqException
              | ExitableDirectoryReader.ExitingReaderException e) {
            log.warn("OCR Highlighting timed out while handling " + content.getPointer(), e);
            respHeader.put(PARTIAL_OCR_HIGHLIGHTS, Boolean.TRUE);
            resultByDocIn[docInIndex] = null;
            // Stop highlighting
            break docLoop;
          } catch (RuntimeException e) {
            // This catch-all prevents OCR highlighting from failing the complete query; instead,
            // users get an error message in their Solr log.
            log.error("Could not highlight OCR content for document", e);
          } finally {
            if (content instanceof AutoCloseable) {
              try {
                ((AutoCloseable) content).close();
              } catch (Exception e) {
                log.warn(
                    "Encountered error while closing content iterator for {}: {}",
                    content.getPointer(),
                    e.getMessage());
              }
            }
          }
          snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId);
        }
      }
      batchDocIdx += fieldValsByDoc.size();
    }
    assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
        || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;

    HighlightTimeout.reset();
    SolrQueryTimeoutImpl.reset();

    OcrHighlightResult[] out = new OcrHighlightResult[docIds.length];
    for (int d = 0; d < docIds.length; d++) {
      OcrHighlightResult hl = new OcrHighlightResult();
      for (int f = 0; f < fields.length; f++) {
        if (snippetCountsByField[f][d] <= 0) {
          continue;
        }
        hl.addSnippetsForField(fields[f], highlightDocsInByField[f][d]);
        hl.addSnippetCountForField(fields[f], snippetCountsByField[f][d]);
      }
      if (Arrays.stream(fields).allMatch(f -> hl.getFieldSnippets(f) == null)) {
        continue;
      }
      out[d] = hl;
    }
    return out;
  }

  @Override
  protected List<CharSequence[]> loadFieldValues(
      String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
    return loadOcrFieldValues(fields, docIter).stream()
        .map(
            seqs ->
                Arrays.stream(seqs)
                    .map(IterableCharSequence::toString)
                    .toArray(CharSequence[]::new))
        .collect(Collectors.toList());
  }
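
  /**
   * Loads the values of the given OCR fields for the next batch of documents, resolving any
   * {@link SourcePointer} values to lazily-read character iterators over the referenced files.
   */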
  protected List<IterableCharSequence[]> loadOcrFieldValues(
      String[] fields, DocIdSetIterator docIter) throws IOException {
    List<IterableCharSequence[]> fieldValues = new ArrayList<>((int) docIter.cost());
    int docId;
    while ((docId = docIter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      DocumentStoredFieldVisitor docIdVisitor = new DocumentStoredFieldVisitor(fields);
      IterableCharSequence[] ocrVals = new IterableCharSequence[fields.length];
      searcher.doc(docId, docIdVisitor);
      for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
        String fieldName = fields[fieldIdx];
        String fieldValue = docIdVisitor.getDocument().get(fieldName);
        if (fieldValue == null) {
          // No OCR content at all
          ocrVals[fieldIdx] = null;
          continue;
        }
        if (!SourcePointer.isPointer(fieldValue)) {
          // OCR content as stored text
          ocrVals[fieldIdx] = IterableCharSequence.fromString(fieldValue);
          continue;
        }
        SourcePointer sourcePointer = SourcePointer.parse(fieldValue);
        if (sourcePointer == null) {
          // None of the files in the pointer exist or were readable, the log should have warnings
          ocrVals[fieldIdx] = null;
          continue;
        }
        // If preloading is enabled, start warming the cache for the pointer
        PageCacheWarmer.getInstance().ifPresent(w -> w.preload(sourcePointer));
        if (sourcePointer.sources.size() == 1) {
          ocrVals[fieldIdx] =
              new FileBytesCharIterator(
                  sourcePointer.sources.get(0).path, StandardCharsets.UTF_8, sourcePointer);
        } else {
          ocrVals[fieldIdx] =
              new MultiFileBytesCharIterator(
                  sourcePointer.sources.stream().map(s -> s.path).collect(Collectors.toList()),
                  StandardCharsets.UTF_8,
                  sourcePointer);
        }
      }
      fieldValues.add(ocrVals);
    }
    return fieldValues;
  }

  private OcrFormat getFormat(IterableCharSequence content) {
    // Sample the first 4k characters to determine the format
    String sampleChunk = content.subSequence(0, Math.min(4096, content.length())).toString();
    return FORMATS.stream()
        .filter(fmt -> fmt.hasFormat(sampleChunk))
        .findFirst()
        .orElseThrow(
            () ->
                new RuntimeException(
                    "Could not determine OCR format for sample '" + sampleChunk + "'"));
  }

  /**
   * Configure the field highlighter.
   *
   * <p>Heavily based on {@link UnifiedHighlighter#getFieldHighlighter(String, Query, Set, int)} and
   * {@link UnifiedHighlighter#getHighlightComponents(String, Query, Set)}, modified to integrate it
   * into our custom OCR highlighting setup. Please refer to the file header for licensing
   * information on the original code.
   */
  private OcrFieldHighlighter getOcrFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    // This method and some associated types changed in v8.2 and v8.4, so we have to delegate to an
    // adapter method for those versions
    if (VERSION_IS_PRE84) {
      return getOcrFieldHighlighterLegacy(field, query, allTerms, maxPassages);
    }

    Predicate<String> fieldMatcher = getFieldMatcher(field);
    BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
    UHComponents components =
        new UHComponents(
            field,
            fieldMatcher,
            query,
            terms,
            phraseHelper,
            automata,
            hasUnrecognizedQuery(fieldMatcher, query),
            highlightFlags);
    OffsetSource offsetSource = getOptimizedOffsetSource(components);
    return new OcrFieldHighlighter(
        field,
        getOffsetStrategy(offsetSource, components),
        getScorer(field),
        maxPassages,
        getMaxNoHighlightPassages(field));
  }

  private OcrFieldHighlighter getOcrFieldHighlighterLegacy(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    Predicate<String> fieldMatcher = getFieldMatcher(field);
    BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
    CharacterRunAutomaton[] automata = getAutomataLegacy(field, query, highlightFlags);

    // Obtaining these two values has changed with Solr 8.2, so we need to do some reflection for
    // older versions
    OffsetSource offsetSource;
    UHComponents components;
    if (VERSION_IS_PRE82) {
      offsetSource = this.getOffsetSourcePre82(field, terms, phraseHelper, automata);
      components =
          this.getUHComponentsPre82(
              field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
    } else {
      components =
          this.getUHComponentsPre84(
              field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
      offsetSource = this.getOptimizedOffsetSource(components);
    }
    return new OcrFieldHighlighter(
        field,
        getOffsetStrategy(offsetSource, components),
        getScorer(field),
        maxPassages,
        getMaxNoHighlightPassages(field));
  }

  private CharacterRunAutomaton[] getAutomataLegacy(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
    // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper
    // handle those? (if we don't highlight phrases strictly, there is no PhraseHelper to do it)
    final boolean lookInSpan =
        !highlightFlags.contains(HighlightFlag.PHRASES) // no PhraseHelper
            || highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES); // Weight.Matches will find all
    return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
        ? extractAutomataLegacy(query, getFieldMatcher(field), lookInSpan)
        : ZERO_LEN_AUTOMATA_ARRAY_LEGACY;
  }
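
  /**
   * Calls the version-specific {@code MultiTermHighlighting#extractAutomata} method that was
   * looked up via reflection in the static initializer.
   */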
  private CharacterRunAutomaton[] extractAutomataLegacy(
      Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
    Function<Query, Collection<Query>> nopWriteFn = q -> null;
    try {
      if (VERSION_IS_PRE81) {
        return (CharacterRunAutomaton[])
            extractAutomataLegacyMethod.invoke(null, query, fieldMatcher, lookInSpan, nopWriteFn);
      } else {
        return (CharacterRunAutomaton[])
            extractAutomataLegacyMethod.invoke(null, query, fieldMatcher, lookInSpan);
      }
    } catch (IllegalAccessException | InvocationTargetException e) {
      throw new RuntimeException(e);
    }
  }

  private OffsetSource getOffsetSourcePre82(
      String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
    try {
      return (OffsetSource)
          offsetSourceGetterLegacy.invoke(this, field, terms, phraseHelper, automata);
    } catch (IllegalAccessException | InvocationTargetException e) {
      throw new RuntimeException(e);
    }
  }

  private UHComponents getUHComponentsPre82(
      String field,
      Predicate<String> fieldMatcher,
      Query query,
      BytesRef[] terms,
      PhraseHelper phraseHelper,
      CharacterRunAutomaton[] automata,
      Set<HighlightFlag> highlightFlags) {
    try {
      return hlComponentsConstructorLegacy.newInstance(
          field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
    } catch (IllegalAccessException | InvocationTargetException | InstantiationException e) {
      throw new RuntimeException(e);
    }
  }

  private UHComponents getUHComponentsPre84(
      String field,
      Predicate<String> fieldMatcher,
      Query query,
      BytesRef[] terms,
      PhraseHelper phraseHelper,
      CharacterRunAutomaton[] automata,
      Set<HighlightFlag> highlightFlags) {
    try {
      return hlComponentsConstructorLegacy.newInstance(
          field,
          fieldMatcher,
          query,
          terms,
          phraseHelper,
          automata,
          hasUnrecognizedQuery(fieldMatcher, query),
          highlightFlags);
    } catch (ReflectiveOperationException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * This is copied straight from {@link
   * UnifiedHighlighter#copyAndSortFieldsWithMaxPassages(String[], int[], String[], int[])} because
   * it has private access there. Please refer to the file header for licensing information on the
   * original code.
   */
  private void copyAndSortFieldsWithMaxPassages(
      String[] fieldsIn, int[] maxPassagesIn, final String[] fields, final int[] maxPassages) {
    System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
    System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
    new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        String tmp = fields[i];
        fields[i] = fields[j];
        fields[j] = tmp;
        int tmp2 = maxPassages[i];
        maxPassages[i] = maxPassages[j];
        maxPassages[j] = tmp2;
      }

      @Override
      protected int compare(int i, int j) {
        return fields[i].compareTo(fields[j]);
      }
    }.sort(0, fields.length);
  }

  /**
   * This is copied straight from {@link UnifiedHighlighter#copyAndSortDocIdsWithIndex(int[],
   * int[], int[])} because it has private access there. Please refer to the file header for
   * licensing information on the original code.
   */
  private void copyAndSortDocIdsWithIndex(
      int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
    System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
    for (int i = 0; i < docInIndexes.length; i++) {
      docInIndexes[i] = i;
    }
    new InPlaceMergeSorter() {
      @Override
      protected void swap(int i, int j) {
        int tmp = docIds[i];
        docIds[i] = docIds[j];
        docIds[j] = tmp;
        tmp = docInIndexes[i];
        docInIndexes[i] = docInIndexes[j];
        docInIndexes[j] = tmp;
      }

      @Override
      protected int compare(int i, int j) {
        return Integer.compare(docIds[i], docIds[j]);
      }
    }.sort(0, docIds.length);
  }

  /**
   * This is copied straight from {@link UnifiedHighlighter#asDocIdSetIterator(int[])} because it
   * has private access there. Please refer to the file header for licensing information on the
   * original code.
   */
  private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
    return new DocIdSetIterator() {
      int idx = -1;

      @Override
      public int docID() {
        if (idx < 0 || idx >= sortedDocIds.length) {
          return NO_MORE_DOCS;
        }
        return sortedDocIds[idx];
      }

      @Override
      public int nextDoc() {
        idx++;
        return docID();
      }

      @Override
      public int advance(int target) throws IOException {
        return super.slowAdvance(target); // won't be called, so whatever
      }

      @Override
      public long cost() {
        return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
      }
    };
  }

  /**
   * Wraps an IndexReader that remembers/caches the last call to {@link
   * LeafReader#getTermVectors(int)} so that if the next call has the same ID, then it is reused. If
   * TV's were column-stride (like doc-values), there would be no need for this.
   *
   * <p>This is copied straight from the inner class of the same name in {@link UnifiedHighlighter}
   * because it has private access there. Please refer to the file header for licensing information
   * on the original code.
   */
  private static class TermVectorReusingLeafReader extends FilterLeafReader {

    static IndexReader wrap(IndexReader reader) throws IOException {
      LeafReader[] leafReaders =
          reader.leaves().stream()
              .map(LeafReaderContext::reader)
              .map(TermVectorReusingLeafReader::new)
              .toArray(LeafReader[]::new);
      if (VERSION_IS_PRE89) {
        return new LegacyBaseCompositeReader(leafReaders) {
          @Override
          protected void doClose() throws IOException {
            reader.close();
          }

          @Override
          public CacheHelper getReaderCacheHelper() {
            return null;
          }
        };
      } else {
        return new BaseCompositeReader<LeafReader>(leafReaders, null) {
          @Override
          protected void doClose() throws IOException {
            reader.close();
          }

          @Override
          public CacheHelper getReaderCacheHelper() {
            return null;
          }
        };
      }
    }

    private int lastDocId = -1;
    private Fields tvFields;

    TermVectorReusingLeafReader(LeafReader in) {
      super(in);
    }

    @Override
    public Fields getTermVectors(int docID) throws IOException {
      if (docID != lastDocId) {
        lastDocId = docID;
        tvFields = in.getTermVectors(docID);
      }
      return tvFields;
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return null;
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      return null;
    }
  }
}
