// de.digitalcollections.solrocr.lucene.OcrFieldHighlighter (Maven / Gradle / Ivy)
// Show all versions of solr-ocrhighlighting; show documentation.
/*
 * Contains verbatim code and custom code based on code from the Lucene
 * project, licensed under the following terms. All parts where this is
 * the case are clearly marked as such in a source code comment referring
 * to this header.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE.upstream file distributed
 * with this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For all parts where this is not the case, refer to the LICENSE file in the
 * repository root.
 */
package de.digitalcollections.solrocr.lucene;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.model.OcrSnippet;
import de.digitalcollections.solrocr.util.PageCacheWarmer;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.util.BytesRef;
/** A customization of {@link FieldHighlighter} to support OCR fields */
public class OcrFieldHighlighter extends FieldHighlighter {
private final Map numMatches;
public OcrFieldHighlighter(
String field,
FieldOffsetStrategy fieldOffsetStrategy,
PassageScorer passageScorer,
int maxPassages,
int maxNoHighlightPassages) {
field, fieldOffsetStrategy, null, passageScorer, maxPassages, maxNoHighlightPassages, null);
this.numMatches = new HashMap<>();
* The primary method -- highlight this doc, assuming a specific field and given this content.
* Largely copied from {@link FieldHighlighter#highlightFieldForDoc(LeafReader, int, String)},
* modified to support an {@link IterableCharSequence} as content and dynamically setting the
* break iterator and the formatter. Please refer to the file header for licensing
* information on the original code.
public OcrSnippet[] highlightFieldForDoc(
LeafReader reader,
int docId,
BreakLocator breakLocator,
OcrPassageFormatter formatter,
IterableCharSequence content,
String pageId,
int snippetLimit,
boolean scorePassages)
throws IOException {
// note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl
// for it.
// If page cache pre-warming is enabled, cancel it, since we're doing the I/O ourselves now
PageCacheWarmer.getInstance().ifPresent(w -> w.cancelPreload(content.getPointer()));
if (content.length() == 0) {
return null; // nothing to do
Passage[] passages;
try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, docId, null)) {
passages =
offsetsEnums, docId, breakLocator, formatter, pageId, snippetLimit, scorePassages);
// Format the resulting Passages.
if (passages.length == 0 && pageId == null) {
// no passages were returned, so ask for a default summary
passages =
maxNoHighlightPassages == -1 ? maxPassages : maxNoHighlightPassages);
if (passages.length > 0) {
OcrSnippet[] snippets = formatter.format(passages, content);
Arrays.sort(snippets, Collections.reverseOrder());
return snippets;
} else {
return null;
protected Passage[] highlightOffsetsEnums(OffsetsEnum off) {
throw new UnsupportedOperationException();
* Score snippets as mini-documents, either based on TF-IDF/BM25 or simply their position.
Largely based on {@link FieldHighlighter#highlightOffsetsEnums(OffsetsEnum)} with
* modifications to add support for the {@link BreakLocator} interrface, the option to disable
* scoring, the option to limit the number of snippets to consider for scoring as well as
* restricting the returned snippets to those from OCR pages with a given identifier.
* Please refer to the file header for licensing information on the original code.
protected Passage[] highlightOffsetsEnums(
OffsetsEnum off,
int docId,
BreakLocator breakLocator,
OcrPassageFormatter formatter,
String pageId,
int snippetLimit,
boolean scorePassages)
throws IOException {
final int contentLength = breakLocator.getText().getEndIndex();
if (!off.nextPosition()) {
return new Passage[0];
// If we're filtering by a page identifier, we want *all* hits on that page
int queueSize = pageId != null ? 4096 : maxPassages;
if (queueSize <= 0) {
queueSize = 512;
Comparator cmp;
if (scorePassages) {
cmp =
(left, right) -> {
if (left.getScore() < right.getScore()) {
return -1;
} else if (left.getScore() > right.getScore()) {
return 1;
} else {
return left.getStartOffset() - right.getStartOffset();
} else {
cmp = Comparator.comparingInt(Passage::getStartOffset);
PriorityQueue passageQueue = new PriorityQueue<>(queueSize, cmp);
Passage passage =
new Passage(); // the current passage in-progress. Will either get reset or added to queue.
// If we've reached the limit, no longer calculate passages, only count matches as passages
boolean limitReached = false;
int numTotal = 0;
do {
int start = off.startOffset();
if (start == -1) {
throw new IllegalArgumentException(
"field '" + field + "' was indexed without offsets, cannot highlight");
if (pageId != null) {
String passagePageId = formatter.determineStartPage(start, breakLocator.getText()).id;
if (!passagePageId.equals(pageId)) {
int end = off.endOffset();
if (start < contentLength && end > contentLength) {
// Since building passages is expensive when using external files, we forego it past a certain
// limit (which can be set by the user) and just update the total count, counting each matc
// as a single passage.
if (limitReached || numTotal > snippetLimit) {
limitReached = true;
// advance breakIterator
int passageStart = Math.max(breakLocator.preceding(start + 1), 0);
int passageEnd = Math.min(breakLocator.following(end), contentLength);
// See if this term should be part of a new passage.
if (passageStart >= passage.getEndOffset()) {
if (passage.getStartOffset() >= 0) {
passage =
maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages);
// if we exceed limit, we are done
if (start >= contentLength) {
// Add this term to the passage.
BytesRef term = off.getTerm(); // a reference; safe to refer to
assert term != null;
passage.addMatch(start, end, term, off.freq());
} while (off.nextPosition());
if (passage.getStartOffset() >= 0) {
maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages);
this.numMatches.put(docId, numTotal);
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
// sort in ascending order
Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
return passages;
* Largely identical to {@link FieldHighlighter#maybeAddPassage(PriorityQueue, PassageScorer,
* Passage, int)}.
* This was copied due to private access in the upstream code and to add support for disabling
* scoring. Please refer to the file header for licensing information on the original
* code.
private Passage maybeAddPassage(
PriorityQueue passageQueue,
PassageScorer scorer,
Passage passage,
int contentLength,
boolean score) {
if (passage.getStartOffset() == -1) {
// empty passage, we can ignore it
return passage;
if (score) {
passage.setScore(scorer.score(passage, contentLength));
// new sentence: first add 'passage' to queue
if (score
&& passageQueue.size() == maxPassages
&& passage.getScore() < passageQueue.peek().getScore()) {
passage.reset(); // can't compete, just reset it
} else {
if (passageQueue.size() > maxPassages) {
passage = passageQueue.poll();
} else {
passage = new Passage();
return passage;
/** We don't provide summaries if there is no highlighting, i.e. no matches in the OCR text */
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
return new Passage[] {};
public int getNumMatches(int docId) {
return numMatches.getOrDefault(docId, -1);