/*
* Contains verbatim code and custom code based on code from the Lucene
* project, licensed under the following terms. All parts where this is
* the case are clearly marked as such in a source code comment referring
* to this header.
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE.upstream file distributed
* with this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For all parts where this is not the case, refer to the LICENSE file in the
* repository root.
*/
package de.digitalcollections.solrocr.lucene;
import com.google.common.collect.ImmutableSet;
import de.digitalcollections.solrocr.formats.alto.AltoFormat;
import de.digitalcollections.solrocr.formats.hocr.HocrFormat;
import de.digitalcollections.solrocr.formats.miniocr.MiniOcrFormat;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.ContextBreakLocator;
import de.digitalcollections.solrocr.iter.ExitingIterCharSeq;
import de.digitalcollections.solrocr.iter.FileBytesCharIterator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.MultiFileBytesCharIterator;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrHighlightResult;
import de.digitalcollections.solrocr.model.OcrSnippet;
import de.digitalcollections.solrocr.model.SourcePointer;
import de.digitalcollections.solrocr.reader.LegacyBaseCompositeReader;
import de.digitalcollections.solrocr.solr.OcrHighlightParams;
import de.digitalcollections.solrocr.util.HighlightTimeout;
import de.digitalcollections.solrocr.util.PageCacheWarmer;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.BaseCompositeReader;
import org.apache.lucene.index.ExitableDirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.PhraseHelper;
import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.search.SolrQueryTimeoutImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A {@link UnifiedHighlighter} variant to support generating snippets with text coordinates from
* OCR data and lazy-loading field values from external storage.
*/
public class OcrHighlighter extends UnifiedHighlighter {
private static final Logger log = LoggerFactory.getLogger(OcrHighlighter.class);
private static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY_LEGACY =
new CharacterRunAutomaton[0];
private static final IndexSearcher EMPTY_INDEXSEARCHER;
  private static final Set<OcrFormat> FORMATS =
      ImmutableSet.of(new HocrFormat(), new AltoFormat(), new MiniOcrFormat());
private static final int DEFAULT_SNIPPET_LIMIT = 100;
public static final String PARTIAL_OCR_HIGHLIGHTS = "partialOcrHighlights";
  private static final boolean VERSION_IS_PRE81 =
      Version.LATEST.major < 8 || (Version.LATEST.major == 8 && Version.LATEST.minor < 1);
  private static final boolean VERSION_IS_PRE82 =
      Version.LATEST.major < 8 || (Version.LATEST.major == 8 && Version.LATEST.minor < 2);
  private static final boolean VERSION_IS_PRE84 =
      VERSION_IS_PRE82 || (Version.LATEST.major == 8 && Version.LATEST.minor < 4);
  private static final boolean VERSION_IS_PRE89 =
      VERSION_IS_PRE82 || (Version.LATEST.major == 8 && Version.LATEST.minor < 9);
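  // Illustrative: on Lucene 8.3 these flags resolve to PRE81=false, PRE82=false, PRE84=true,
  // PRE89=true; on Lucene 8.9 and later, all four are false.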
  private static final Constructor<UHComponents> hlComponentsConstructorLegacy;
private static final Method offsetSourceGetterLegacy;
private static final Method extractAutomataLegacyMethod;
  static {
    /*
     * Copied from the upstream {@link UnifiedHighlighter} code. Please refer to the file header
     * for licensing information.
     */
try {
IndexReader emptyReader = new MultiReader();
EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
EMPTY_INDEXSEARCHER.setQueryCache(null);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
// For compatibility with older versions, we grab references to deprecated APIs
// via reflection and store them as static variables.
try {
if (VERSION_IS_PRE81) {
@SuppressWarnings("rawtypes")
Class multiTermHl =
Class.forName("org.apache.lucene.search.uhighlight.MultiTermHighlighting");
extractAutomataLegacyMethod =
multiTermHl.getDeclaredMethod(
"extractAutomata", Query.class, Predicate.class, boolean.class, Function.class);
extractAutomataLegacyMethod.setAccessible(true);
} else if (VERSION_IS_PRE84) {
@SuppressWarnings("rawtypes")
Class multiTermHl =
Class.forName("org.apache.lucene.search.uhighlight.MultiTermHighlighting");
extractAutomataLegacyMethod =
multiTermHl.getDeclaredMethod(
"extractAutomata", Query.class, Predicate.class, boolean.class);
extractAutomataLegacyMethod.setAccessible(true);
} else {
extractAutomataLegacyMethod = null;
}
if (VERSION_IS_PRE82) {
//noinspection JavaReflectionMemberAccess
hlComponentsConstructorLegacy =
UHComponents.class.getDeclaredConstructor(
String.class,
Predicate.class,
Query.class,
BytesRef[].class,
PhraseHelper.class,
CharacterRunAutomaton[].class,
Set.class);
offsetSourceGetterLegacy =
UnifiedHighlighter.class.getDeclaredMethod(
"getOptimizedOffsetSource",
String.class,
BytesRef[].class,
PhraseHelper.class,
CharacterRunAutomaton[].class);
} else if (VERSION_IS_PRE84) {
//noinspection JavaReflectionMemberAccess
hlComponentsConstructorLegacy =
UHComponents.class.getDeclaredConstructor(
String.class,
Predicate.class,
Query.class,
BytesRef[].class,
PhraseHelper.class,
CharacterRunAutomaton[].class,
boolean.class,
Set.class);
offsetSourceGetterLegacy = null;
} else {
hlComponentsConstructorLegacy = null;
offsetSourceGetterLegacy = null;
}
} catch (NoSuchMethodException | ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
private final SolrParams params;
public OcrHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer, SolrParams params) {
super(indexSearcher, indexAnalyzer);
this.params = params;
}
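  // Illustrative usage sketch (names and wiring are hypothetical; in the plugin this class is
  // driven by the Solr-side highlighting component):
  //
  //   OcrHighlighter hl =
  //       new OcrHighlighter(searcher, schema.getIndexAnalyzer(), req.getParams());
  //   OcrHighlightResult[] res =
  //       hl.highlightOcrFields(new String[] {"ocr_text"}, query, docIds, new int[] {2}, header);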
@Override
protected PassageScorer getScorer(String fieldName) {
float k1 = params.getFieldFloat(fieldName, HighlightParams.SCORE_K1, 1.2f);
float b = params.getFieldFloat(fieldName, HighlightParams.SCORE_B, 0.75f);
float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
boolean boostEarly =
params.getFieldBool(fieldName, OcrHighlightParams.SCORE_BOOST_EARLY, false);
return new OcrPassageScorer(k1, b, pivot, boostEarly);
}
@Override
  public Set<HighlightFlag> getFlags(String field) {
    Set<HighlightFlag> flags = EnumSet.noneOf(HighlightFlag.class);
if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, true)) {
flags.add(HighlightFlag.MULTI_TERM_QUERY);
}
if (params.getFieldBool(field, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
flags.add(HighlightFlag.PHRASES);
}
flags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
if (params.getFieldBool(field, HighlightParams.WEIGHT_MATCHES, false) // true in 8.0
&& flags.contains(HighlightFlag.PHRASES)
&& flags.contains(HighlightFlag.MULTI_TERM_QUERY)) {
flags.add(HighlightFlag.WEIGHT_MATCHES);
}
return flags;
}
  /**
   * Highlight passages from OCR fields in multiple documents.
   *
   * <p>Heavily based on {@link UnifiedHighlighter#highlightFieldsAsObjects(String[], Query, int[],
   * int[])} with modifications to add support for OCR-specific functionality and timeouts. Please
   * refer to the file header for licensing information on the original code.
   */
  public OcrHighlightResult[] highlightOcrFields(
      String[] ocrFieldNames,
      Query query,
      int[] docIDs,
      int[] maxPassagesOcr,
      Map<String, Object> respHeader)
throws IOException {
if (ocrFieldNames.length < 1) {
throw new IllegalArgumentException("ocrFieldNames must not be empty");
}
if (ocrFieldNames.length != maxPassagesOcr.length) {
throw new IllegalArgumentException("invalid number of maxPassagesOcr");
}
if (searcher == null) {
throw new IllegalStateException(
"This method requires that an indexSearcher was passed in the "
+ "constructor. Perhaps you mean to call highlightWithoutSearcher?");
}
Long timeAllowed = params.getLong(OcrHighlightParams.TIME_ALLOWED);
if (timeAllowed != null) {
HighlightTimeout.set(timeAllowed);
SolrQueryTimeoutImpl.set(timeAllowed);
}
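    // Illustrative: a request can cap highlighting time via the hl.ocr.timeAllowed parameter
    // (assumed name for OcrHighlightParams.TIME_ALLOWED, in milliseconds); once the budget is
    // exhausted, partial results are returned and flagged via PARTIAL_OCR_HIGHLIGHTS below.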
// Sort docs & fields for sequential i/o
// Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
int[] docIds = new int[docIDs.length];
int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
copyAndSortDocIdsWithIndex(docIDs, docIds, docInIndexes); // latter 2 are "out" params
// Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
final String[] fields = new String[ocrFieldNames.length];
final int[] maxPassages = new int[maxPassagesOcr.length];
copyAndSortFieldsWithMaxPassages(
ocrFieldNames, maxPassagesOcr, fields, maxPassages); // latter 2 are "out" params
// Init field highlighters (where most of the highlight logic lives, and on a per field basis)
    Set<Term> queryTerms = extractTerms(query);
OcrFieldHighlighter[] fieldHighlighters = new OcrFieldHighlighter[fields.length];
int numTermVectors = 0;
int numPostings = 0;
for (int f = 0; f < fields.length; f++) {
OcrFieldHighlighter fieldHighlighter =
getOcrFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
fieldHighlighters[f] = fieldHighlighter;
switch (fieldHighlighter.getOffsetSource()) {
case TERM_VECTORS:
numTermVectors++;
break;
case POSTINGS:
numPostings++;
break;
case POSTINGS_WITH_TERM_VECTORS:
numTermVectors++;
numPostings++;
break;
case ANALYSIS:
case NONE_NEEDED:
default:
// do nothing
// FIXME: This will raise a RuntimeException down the road, catch early?
break;
}
}
IndexReader indexReaderWithTermVecCache =
(numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
// [fieldIdx][docIdInIndex] of highlightDoc result
OcrSnippet[][][] highlightDocsInByField = new OcrSnippet[fields.length][docIds.length][];
int[][] snippetCountsByField = new int[fields.length][docIds.length];
// Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
docLoop:
for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
      List<IterableCharSequence[]> fieldValsByDoc = loadOcrFieldValues(fields, docIdIter);
// Highlight in per-field order first, then by doc (better I/O pattern)
for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
OcrSnippet[][] resultByDocIn = highlightDocsInByField[fieldIdx]; // parallel to docIdsIn
OcrFieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
int docId = docIds[docIdx]; // sorted order
IterableCharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
if (content == null) {
continue;
}
if (timeAllowed != null) {
content = new ExitingIterCharSeq(content, HighlightTimeout.getInstance());
}
IndexReader indexReader =
(fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
&& indexReaderWithTermVecCache != null)
? indexReaderWithTermVecCache
: searcher.getIndexReader();
final LeafReader leafReader;
if (indexReader instanceof LeafReader) {
leafReader = (LeafReader) indexReader;
} else {
            List<LeafReaderContext> leaves = indexReader.leaves();
LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves));
leafReader = leafReaderContext.reader();
docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader
}
int docInIndex = docInIndexes[docIdx]; // original input order
assert resultByDocIn[docInIndex] == null;
OcrFormat ocrFormat = getFormat(content);
String limitBlockParam = params.get(OcrHighlightParams.LIMIT_BLOCK, "block");
OcrBlock[] limitBlocks = null;
if (!limitBlockParam.equalsIgnoreCase("NONE")) {
limitBlocks =
OcrBlock.getHierarchyFrom(OcrBlock.valueOf(limitBlockParam.toUpperCase()))
.toArray(new OcrBlock[0]);
}
OcrBlock contextBlock =
OcrBlock.valueOf(params.get(OcrHighlightParams.CONTEXT_BLOCK, "line").toUpperCase());
BreakLocator contextLocator = ocrFormat.getBreakLocator(content, contextBlock);
BreakLocator limitLocator =
limitBlocks == null ? null : ocrFormat.getBreakLocator(content, limitBlocks);
BreakLocator breakLocator =
new ContextBreakLocator(
contextLocator, limitLocator, params.getInt(OcrHighlightParams.CONTEXT_SIZE, 2));
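          // Example (assumed parameter names): with hl.ocr.contextBlock=line and
          // hl.ocr.contextSize=2, each passage is expanded by two line blocks before and after
          // the match, without crossing the boundary of the configured limit block.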
OcrPassageFormatter formatter =
ocrFormat.getPassageFormatter(
params.get(HighlightParams.TAG_PRE, ""),
params.get(HighlightParams.TAG_POST, ""),
params.getBool(OcrHighlightParams.ABSOLUTE_HIGHLIGHTS, false),
params.getBool(OcrHighlightParams.ALIGN_SPANS, false),
params.getBool(OcrHighlightParams.TRACK_PAGES, true));
int snippetLimit =
Math.max(
maxPassages[fieldIdx],
params.getInt(OcrHighlightParams.MAX_OCR_PASSAGES, DEFAULT_SNIPPET_LIMIT));
boolean scorePassages = params.getBool(OcrHighlightParams.SCORE_PASSAGES, true);
try {
resultByDocIn[docInIndex] =
fieldHighlighter.highlightFieldForDoc(
leafReader,
docId,
breakLocator,
formatter,
content,
params.get(OcrHighlightParams.PAGE_ID),
snippetLimit,
scorePassages);
} catch (ExitingIterCharSeq.ExitingIterCharSeqException
| ExitableDirectoryReader.ExitingReaderException e) {
log.warn("OCR Highlighting timed out while handling " + content.getPointer(), e);
respHeader.put(PARTIAL_OCR_HIGHLIGHTS, Boolean.TRUE);
resultByDocIn[docInIndex] = null;
// Stop highlighting
break docLoop;
          } catch (RuntimeException e) {
            // This catch-all prevents a failure in OCR highlighting from failing the complete
            // query; instead, users get an error message in their Solr log.
            log.error("Could not highlight OCR content for document", e);
} finally {
if (content instanceof AutoCloseable) {
try {
((AutoCloseable) content).close();
} catch (Exception e) {
log.warn(
"Encountered error while closing content iterator for {}: {}",
content.getPointer(),
e.getMessage());
}
}
}
snippetCountsByField[fieldIdx][docInIndex] = fieldHighlighter.getNumMatches(docId);
}
}
batchDocIdx += fieldValsByDoc.size();
}
assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
|| docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
HighlightTimeout.reset();
SolrQueryTimeoutImpl.reset();
OcrHighlightResult[] out = new OcrHighlightResult[docIds.length];
for (int d = 0; d < docIds.length; d++) {
OcrHighlightResult hl = new OcrHighlightResult();
for (int f = 0; f < fields.length; f++) {
if (snippetCountsByField[f][d] <= 0) {
continue;
}
hl.addSnippetsForField(fields[f], highlightDocsInByField[f][d]);
hl.addSnippetCountForField(fields[f], snippetCountsByField[f][d]);
}
if (Arrays.stream(fields).allMatch(f -> hl.getFieldSnippets(f) == null)) {
continue;
}
out[d] = hl;
}
return out;
}
  @Override
  protected List<CharSequence[]> loadFieldValues(
      String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
return loadOcrFieldValues(fields, docIter).stream()
.map(
seqs ->
Arrays.stream(seqs)
.map(IterableCharSequence::toString)
.toArray(CharSequence[]::new))
.collect(Collectors.toList());
}
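  /**
   * Load the OCR field values for the given documents, either from stored field text in the index
   * or, for source pointers, lazily as file-backed character iterators over external storage.
   */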
  protected List<IterableCharSequence[]> loadOcrFieldValues(
      String[] fields, DocIdSetIterator docIter) throws IOException {
    List<IterableCharSequence[]> fieldValues = new ArrayList<>((int) docIter.cost());
int docId;
while ((docId = docIter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
DocumentStoredFieldVisitor docIdVisitor = new DocumentStoredFieldVisitor(fields);
IterableCharSequence[] ocrVals = new IterableCharSequence[fields.length];
searcher.doc(docId, docIdVisitor);
for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
String fieldName = fields[fieldIdx];
String fieldValue = docIdVisitor.getDocument().get(fieldName);
if (fieldValue == null) {
// No OCR content at all
ocrVals[fieldIdx] = null;
continue;
}
if (!SourcePointer.isPointer(fieldValue)) {
// OCR content as stored text
ocrVals[fieldIdx] = IterableCharSequence.fromString(fieldValue);
continue;
}
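        // Source pointers reference OCR markup on external storage instead of stored text.
        // Illustrative examples (hypothetical paths):
        //   /data/ocr/doc1.hocr                  (single file)
        //   /data/ocr/doc1.xml[2048:8192]        (byte region within a file)
        //   /data/ocr/p1.xml+/data/ocr/p2.xml    (multiple files, concatenated)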
SourcePointer sourcePointer = SourcePointer.parse(fieldValue);
if (sourcePointer == null) {
// None of the files in the pointer exist or were readable, log should have warnings
ocrVals[fieldIdx] = null;
continue;
}
// If preloading is enabled, start warming the cache for the pointer
PageCacheWarmer.getInstance().ifPresent(w -> w.preload(sourcePointer));
if (sourcePointer.sources.size() == 1) {
ocrVals[fieldIdx] =
new FileBytesCharIterator(
sourcePointer.sources.get(0).path, StandardCharsets.UTF_8, sourcePointer);
} else {
ocrVals[fieldIdx] =
new MultiFileBytesCharIterator(
sourcePointer.sources.stream().map(s -> s.path).collect(Collectors.toList()),
StandardCharsets.UTF_8,
sourcePointer);
}
}
fieldValues.add(ocrVals);
}
return fieldValues;
}
private OcrFormat getFormat(IterableCharSequence content) {
// Sample the first 4k characters to determine the format
String sampleChunk = content.subSequence(0, Math.min(4096, content.length())).toString();
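    // Each OcrFormat decides via hasFormat(), e.g. (illustrative) an "ocr_page" class attribute
    // for hOCR, an <alto> root element for ALTO, or an <ocr> root element for MiniOCR.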
return FORMATS.stream()
.filter(fmt -> fmt.hasFormat(sampleChunk))
.findFirst()
.orElseThrow(
() ->
new RuntimeException(
"Could not determine OCR format for sample '" + sampleChunk + "'"));
}
  /**
   * Configure the field highlighter.
   *
   * <p>Heavily based on {@link UnifiedHighlighter#getFieldHighlighter(String, Query, Set, int)} and
   * {@link UnifiedHighlighter#getHighlightComponents(String, Query, Set)}, modified to integrate it
   * into our custom OCR highlighting setup. Please refer to the file header for licensing
   * information on the original code.
   */
  private OcrFieldHighlighter getOcrFieldHighlighter(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
// This method and some associated types changed in v8.2 and v8.4, so we have to delegate to an
// adapter method for these versions
if (VERSION_IS_PRE84) {
return getOcrFieldHighlighterLegacy(field, query, allTerms, maxPassages);
}
    Predicate<String> fieldMatcher = getFieldMatcher(field);
    BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
UHComponents components =
new UHComponents(
field,
fieldMatcher,
query,
terms,
phraseHelper,
automata,
hasUnrecognizedQuery(fieldMatcher, query),
highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(components);
return new OcrFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
getScorer(field),
maxPassages,
getMaxNoHighlightPassages(field));
}
  private OcrFieldHighlighter getOcrFieldHighlighterLegacy(
      String field, Query query, Set<Term> allTerms, int maxPassages) {
    Predicate<String> fieldMatcher = getFieldMatcher(field);
    BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
    Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomataLegacy(field, query, highlightFlags);
// Obtaining these two values has changed with Solr 8.2, so we need to do some reflection for
// older versions
OffsetSource offsetSource;
UHComponents components;
if (VERSION_IS_PRE82) {
offsetSource = this.getOffsetSourcePre82(field, terms, phraseHelper, automata);
components =
this.getUHComponentsPre82(
field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
} else {
components =
this.getUHComponentsPre84(
field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
offsetSource = this.getOptimizedOffsetSource(components);
}
return new OcrFieldHighlighter(
field,
getOffsetStrategy(offsetSource, components),
getScorer(field),
maxPassages,
getMaxNoHighlightPassages(field));
}
  private CharacterRunAutomaton[] getAutomataLegacy(
      String field, Query query, Set<HighlightFlag> highlightFlags) {
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper
// handle those?
// if don't highlight phrases strictly,
final boolean lookInSpan =
!highlightFlags.contains(HighlightFlag.PHRASES) // no PhraseHelper
|| highlightFlags.contains(
HighlightFlag.WEIGHT_MATCHES); // Weight.Matches will find all
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? extractAutomataLegacy(query, getFieldMatcher(field), lookInSpan)
: ZERO_LEN_AUTOMATA_ARRAY_LEGACY;
}
  private CharacterRunAutomaton[] extractAutomataLegacy(
      Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
    Function<Query, Collection<Query>> nopWriteFn = q -> null;
try {
if (VERSION_IS_PRE81) {
return (CharacterRunAutomaton[])
extractAutomataLegacyMethod.invoke(null, query, fieldMatcher, lookInSpan, nopWriteFn);
} else {
return (CharacterRunAutomaton[])
extractAutomataLegacyMethod.invoke(null, query, fieldMatcher, lookInSpan);
}
} catch (IllegalAccessException | InvocationTargetException e) {
throw new RuntimeException(e);
}
}
private OffsetSource getOffsetSourcePre82(
String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
try {
return (OffsetSource)
offsetSourceGetterLegacy.invoke(this, field, terms, phraseHelper, automata);
} catch (IllegalAccessException | InvocationTargetException e) {
throw new RuntimeException(e);
}
}
  private UHComponents getUHComponentsPre82(
      String field,
      Predicate<String> fieldMatcher,
      Query query,
      BytesRef[] terms,
      PhraseHelper phraseHelper,
      CharacterRunAutomaton[] automata,
      Set<HighlightFlag> highlightFlags) {
try {
return hlComponentsConstructorLegacy.newInstance(
field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
} catch (IllegalAccessException | InvocationTargetException | InstantiationException e) {
throw new RuntimeException(e);
}
}
  private UHComponents getUHComponentsPre84(
      String field,
      Predicate<String> fieldMatcher,
      Query query,
      BytesRef[] terms,
      PhraseHelper phraseHelper,
      CharacterRunAutomaton[] automata,
      Set<HighlightFlag> highlightFlags) {
try {
return hlComponentsConstructorLegacy.newInstance(
field,
fieldMatcher,
query,
terms,
phraseHelper,
automata,
hasUnrecognizedQuery(fieldMatcher, query),
highlightFlags);
} catch (ReflectiveOperationException e) {
throw new RuntimeException(e);
}
}
/**
* This is copied straight from {@link
* UnifiedHighlighter#copyAndSortFieldsWithMaxPassages(String[], int[], String[], int[])} because
* it has private access there. Please refer to the file header for licensing information
* on the original code.
*/
private void copyAndSortFieldsWithMaxPassages(
String[] fieldsIn, int[] maxPassagesIn, final String[] fields, final int[] maxPassages) {
System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
String tmp = fields[i];
fields[i] = fields[j];
fields[j] = tmp;
int tmp2 = maxPassages[i];
maxPassages[i] = maxPassages[j];
maxPassages[j] = tmp2;
}
@Override
protected int compare(int i, int j) {
return fields[i].compareTo(fields[j]);
}
}.sort(0, fields.length);
}
  /**
   * This is copied straight from {@link UnifiedHighlighter#copyAndSortDocIdsWithIndex(int[], int[],
   * int[])} because it has private access there. Please refer to the file header for licensing
   * information on the original code.
   */
private void copyAndSortDocIdsWithIndex(
int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
for (int i = 0; i < docInIndexes.length; i++) {
docInIndexes[i] = i;
}
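    // Worked example: docIdsIn = [42, 7, 19] sorts to docIds = [7, 19, 42] with
    // docInIndexes = [1, 2, 0], i.e. the i-th smallest docId originated at input
    // position docInIndexes[i].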
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int tmp = docIds[i];
docIds[i] = docIds[j];
docIds[j] = tmp;
tmp = docInIndexes[i];
docInIndexes[i] = docInIndexes[j];
docInIndexes[j] = tmp;
}
@Override
protected int compare(int i, int j) {
return Integer.compare(docIds[i], docIds[j]);
}
}.sort(0, docIds.length);
}
  /**
   * This is copied straight from {@link UnifiedHighlighter#asDocIdSetIterator(int[])} because it
   * has private access there. Please refer to the file header for licensing information on the
   * original code.
   */
private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
return new DocIdSetIterator() {
int idx = -1;
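      // idx == -1 means "not positioned yet": docID() reports NO_MORE_DOCS until the first
      // nextDoc() call advances onto sortedDocIds[0].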
@Override
public int docID() {
if (idx < 0 || idx >= sortedDocIds.length) {
return NO_MORE_DOCS;
}
return sortedDocIds[idx];
}
@Override
public int nextDoc() {
idx++;
return docID();
}
@Override
public int advance(int target) throws IOException {
return super.slowAdvance(target); // won't be called, so whatever
}
@Override
public long cost() {
return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
}
};
}
  /**
   * Wraps an IndexReader that remembers/caches the last call to {@link
   * LeafReader#getTermVectors(int)} so that if the next call has the same ID, then it is reused. If
   * TV's were column-stride (like doc-values), there would be no need for this.
   *
   * <p>This is copied straight from the {@link UnifiedHighlighter} source, where it is a private
   * inner class. Please refer to the file header for licensing information on the original code.
   */
private static class TermVectorReusingLeafReader extends FilterLeafReader {
static IndexReader wrap(IndexReader reader) throws IOException {
LeafReader[] leafReaders =
reader.leaves().stream()
.map(LeafReaderContext::reader)
.map(TermVectorReusingLeafReader::new)
.toArray(LeafReader[]::new);
if (VERSION_IS_PRE89) {
        return new LegacyBaseCompositeReader<LeafReader>(leafReaders) {
@Override
protected void doClose() throws IOException {
reader.close();
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
};
} else {
        return new BaseCompositeReader<LeafReader>(leafReaders, null) {
@Override
protected void doClose() throws IOException {
reader.close();
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
};
}
}
private int lastDocId = -1;
private Fields tvFields;
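    // Single-entry cache: only the term vectors of the most recently requested docID are kept,
    // so repeated getTermVectors() calls for the same document (e.g. multiple term-vector
    // fields highlighted for one doc) avoid re-reading them from the index.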
TermVectorReusingLeafReader(LeafReader in) {
super(in);
}
@Override
public Fields getTermVectors(int docID) throws IOException {
if (docID != lastDocId) {
lastDocId = docID;
tvFields = in.getTermVectors(docID);
}
return tvFields;
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
}