
de.digitalcollections.solrocr.lucene.OcrFieldHighlighter


Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
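Before the source, a minimal SolrJ sketch of how highlighting with this plugin is typically requested: regular highlighting is switched on with hl=true, and the hl.ocr.fl parameter (the name used in the plugin's documentation) selects the OCR fields, with snippets delivered in a separate "ocrHighlighting" response section. The core name ("ocr") and field name ("ocr_text") are made up for the example; adjust them to your schema.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class OcrHighlightQueryExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical core "ocr" with an OCR field "ocr_text"; adjust to your setup.
    try (HttpSolrClient client =
        new HttpSolrClient.Builder("http://localhost:8983/solr/ocr").build()) {
      SolrQuery query = new SolrQuery("ocr_text:cathedral");
      query.set("hl", "true");
      // Plugin-specific parameter selecting the OCR field(s) to highlight
      query.set("hl.ocr.fl", "ocr_text");
      QueryResponse resp = client.query(query);
      // OCR snippets (with page ids and region coordinates) arrive in their own section
      System.out.println(resp.getResponse().get("ocrHighlighting"));
    }
  }
}

The full source of the class follows.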

/*
 * Contains verbatim code and custom code based on code from the Lucene
 * project, licensed under the following terms. All parts where this is
 * the case are clearly marked as such in a source code comment referring
 * to this header.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE.upstream file distributed
 * with this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For all parts where this is not the case, refer to the LICENSE file in the
 * repository root.
 */
package de.digitalcollections.solrocr.lucene;

import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.model.OcrSnippet;
import de.digitalcollections.solrocr.util.PageCacheWarmer;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.util.BytesRef;

/** A customization of {@link FieldHighlighter} to support OCR fields */
public class OcrFieldHighlighter extends FieldHighlighter {
  /** Total number of matches per docId, filled during highlighting, queried via getNumMatches(). */
  private final Map<Integer, Integer> numMatches;

  public OcrFieldHighlighter(
      String field,
      FieldOffsetStrategy fieldOffsetStrategy,
      PassageScorer passageScorer,
      int maxPassages,
      int maxNoHighlightPassages) {
    super(
        field, fieldOffsetStrategy, null, passageScorer, maxPassages, maxNoHighlightPassages, null);
    this.numMatches = new HashMap<>();
  }

  /**
   * The primary method -- highlight this doc, assuming a specific field and given this content.
   *
   * <p>Largely copied from {@link FieldHighlighter#highlightFieldForDoc(LeafReader, int, String)},
   * modified to support an {@link IterableCharSequence} as content and dynamically setting the
   * break iterator and the formatter. Please refer to the file header for licensing information
   * on the original code.
   */
  public OcrSnippet[] highlightFieldForDoc(
      LeafReader reader,
      int docId,
      BreakLocator breakLocator,
      OcrPassageFormatter formatter,
      IterableCharSequence content,
      String pageId,
      int snippetLimit,
      boolean scorePassages)
      throws IOException {
    // note: it'd be nice to accept a CharSequence for content, but we need a
    // CharacterIterator impl for it.

    // If page cache pre-warming is enabled, cancel it, since we're doing the I/O ourselves now
    PageCacheWarmer.getInstance().ifPresent(w -> w.cancelPreload(content.getPointer()));
    if (content.length() == 0) {
      return null; // nothing to do
    }

    Passage[] passages;
    try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, docId, null)) {
      passages =
          highlightOffsetsEnums(
              offsetsEnums, docId, breakLocator, formatter, pageId, snippetLimit, scorePassages);
    }

    // Format the resulting Passages.
    if (passages.length == 0 && pageId == null) {
      // no passages were returned, so ask for a default summary
      passages =
          getSummaryPassagesNoHighlight(
              maxNoHighlightPassages == -1 ? maxPassages : maxNoHighlightPassages);
    }

    if (passages.length > 0) {
      OcrSnippet[] snippets = formatter.format(passages, content);
      Arrays.sort(snippets, Collections.reverseOrder());
      return snippets;
    } else {
      return null;
    }
  }

  /** Unsupported; OCR highlighting uses the extended overload below instead. */
  @Override
  protected Passage[] highlightOffsetsEnums(OffsetsEnum off) {
    throw new UnsupportedOperationException();
  }

  /**
   * Score snippets as mini-documents, either based on TF-IDF/BM25 or simply their position.
   *

   * <p>Largely based on {@link FieldHighlighter#highlightOffsetsEnums(OffsetsEnum)} with
   * modifications to add support for the {@link BreakLocator} interface, the option to disable
   * scoring, the option to limit the number of snippets to consider for scoring, as well as
   * restricting the returned snippets to those from OCR pages with a given identifier. Please
   * refer to the file header for licensing information on the original code.
   */
  protected Passage[] highlightOffsetsEnums(
      OffsetsEnum off,
      int docId,
      BreakLocator breakLocator,
      OcrPassageFormatter formatter,
      String pageId,
      int snippetLimit,
      boolean scorePassages)
      throws IOException {
    final int contentLength = breakLocator.getText().getEndIndex();

    if (!off.nextPosition()) {
      return new Passage[0];
    }

    // If we're filtering by a page identifier, we want *all* hits on that page
    int queueSize = pageId != null ? 4096 : maxPassages;
    if (queueSize <= 0) {
      queueSize = 512;
    }

    Comparator<Passage> cmp;
    if (scorePassages) {
      cmp =
          (left, right) -> {
            if (left.getScore() < right.getScore()) {
              return -1;
            } else if (left.getScore() > right.getScore()) {
              return 1;
            } else {
              return left.getStartOffset() - right.getStartOffset();
            }
          };
    } else {
      cmp = Comparator.comparingInt(Passage::getStartOffset);
    }
    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(queueSize, cmp);

    // The current passage in-progress. Will either get reset or added to the queue.
    Passage passage = new Passage();

    // If we've reached the limit, no longer calculate passages, only count matches as passages
    boolean limitReached = false;
    int numTotal = 0;
    do {
      int start = off.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      if (pageId != null) {
        String passagePageId = formatter.determineStartPage(start, breakLocator.getText()).id;
        if (!passagePageId.equals(pageId)) {
          continue;
        }
      }
      int end = off.endOffset();
      if (start < contentLength && end > contentLength) {
        continue;
      }
      // Since building passages is expensive when using external files, we forego it past a
      // certain limit (which can be set by the user) and just update the total count, counting
      // each match as a single passage.
      if (limitReached || numTotal > snippetLimit) {
        numTotal++;
        limitReached = true;
        continue;
      }
      // advance breakIterator
      int passageStart = Math.max(breakLocator.preceding(start + 1), 0);
      int passageEnd = Math.min(breakLocator.following(end), contentLength);

      // See if this term should be part of a new passage.
      if (passageStart >= passage.getEndOffset()) {
        if (passage.getStartOffset() >= 0) {
          numTotal++;
        }
        passage =
            maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages);
        // if we exceed limit, we are done
        if (start >= contentLength) {
          break;
        }
        passage.setStartOffset(passageStart);
      }
      passage.setEndOffset(passageEnd);
      // Add this term to the passage.
      BytesRef term = off.getTerm(); // a reference; safe to refer to
      assert term != null;
      passage.addMatch(start, end, term, off.freq());
    } while (off.nextPosition());
    if (passage.getStartOffset() >= 0) {
      numTotal++;
    }
    maybeAddPassage(passageQueue, passageScorer, passage, contentLength, scorePassages);
    this.numMatches.put(docId, numTotal);

    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    // sort in ascending order
    Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
    return passages;
  }

  /**
   * Largely identical to {@link FieldHighlighter#maybeAddPassage(PriorityQueue, PassageScorer,
   * Passage, int)}.
   *

   * <p>This was copied due to private access in the upstream code and to add support for
   * disabling scoring. Please refer to the file header for licensing information on the
   * original code.
   */
  private Passage maybeAddPassage(
      PriorityQueue<Passage> passageQueue,
      PassageScorer scorer,
      Passage passage,
      int contentLength,
      boolean score) {
    if (passage.getStartOffset() == -1) {
      // empty passage, we can ignore it
      return passage;
    }
    if (score) {
      passage.setScore(scorer.score(passage, contentLength));
    }
    // new sentence: first add 'passage' to queue
    if (score
        && passageQueue.size() == maxPassages
        && passage.getScore() < passageQueue.peek().getScore()) {
      passage.reset(); // can't compete, just reset it
    } else {
      passageQueue.offer(passage);
      if (passageQueue.size() > maxPassages) {
        passage = passageQueue.poll();
        passage.reset();
      } else {
        passage = new Passage();
      }
    }
    return passage;
  }

  /** We don't provide summaries if there is no highlighting, i.e. no matches in the OCR text */
  @Override
  protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
    return new Passage[] {};
  }

  /** Returns the number of matches in the given document, or -1 if it was not highlighted. */
  public int getNumMatches(int docId) {
    return numMatches.getOrDefault(docId, -1);
  }
}
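To make the passage selection in highlightOffsetsEnums()/maybeAddPassage() easier to follow in isolation: when scoring is enabled, the code keeps at most maxPassages passages in a min-heap keyed by score, discards candidates that cannot beat the weakest queued passage, and finally re-sorts the survivors by start offset. Below is a self-contained sketch of that pattern; the Scored record and selectTopN are hypothetical names for illustration, not part of the plugin.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class TopPassagesSketch {
  /** Stand-in for a scored Passage: a start offset and a relevance score. */
  record Scored(int startOffset, double score) {}

  static List<Scored> selectTopN(List<Scored> candidates, int maxPassages) {
    // Min-heap by score: the weakest of the currently kept passages sits at the head
    PriorityQueue<Scored> queue = new PriorityQueue<>(Comparator.comparingDouble(Scored::score));
    for (Scored candidate : candidates) {
      if (queue.size() == maxPassages && candidate.score() < queue.peek().score()) {
        continue; // can't compete with the current top N (cf. passage.reset() above)
      }
      queue.offer(candidate);
      if (queue.size() > maxPassages) {
        queue.poll(); // evict the weakest passage to stay within the limit
      }
    }
    // Like highlightOffsetsEnums(), return the survivors in document order, not score order
    List<Scored> result = new ArrayList<>(queue);
    result.sort(Comparator.comparingInt(Scored::startOffset));
    return result;
  }

  public static void main(String[] args) {
    List<Scored> candidates =
        List.of(
            new Scored(10, 0.4), new Scored(80, 1.2), new Scored(300, 0.9), new Scored(500, 0.1));
    // Keeps the two best-scoring passages (offsets 80 and 300), sorted by position
    System.out.println(selectTopN(candidates, 2));
  }
}

The real implementation additionally supports disabling scoring entirely, in which case passages are compared by start offset instead of score.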




