de.digitalcollections.solrocr.lucene.OcrFieldHighlighter

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

package de.digitalcollections.solrocr.lucene;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.formats.OcrSnippet;
import de.digitalcollections.solrocr.lucene.byteoffset.ByteOffsetsEnum;
import de.digitalcollections.solrocr.lucene.byteoffset.FieldByteOffsetStrategy;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.util.BytesRef;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.IterableCharSequence.OffsetType;
import de.digitalcollections.solrocr.util.Utf8;

/**
 * A customization of {@link FieldHighlighter} to support lazy-loaded field values and byte offsets from payloads.
 */
public class OcrFieldHighlighter extends FieldHighlighter {
  protected FieldByteOffsetStrategy fieldByteOffsetStrategy;
  private final Map<Integer, Integer> numMatches;

  public OcrFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
                             FieldByteOffsetStrategy fieldByteOffsetStrategy, PassageScorer passageScorer,
                             BreakIterator breakIter, OcrPassageFormatter formatter, int maxPassages,
                             int maxNoHighlightPassages) {
    super(field, fieldOffsetStrategy, breakIter, passageScorer, maxPassages, maxNoHighlightPassages, formatter);
    this.fieldByteOffsetStrategy = fieldByteOffsetStrategy;
    this.numMatches = new HashMap<>();
  }

  /**
   * The primary method -- highlight this doc, assuming a specific field and given this content.
   *
   * Largely copied from {@link FieldHighlighter#highlightFieldForDoc(LeafReader, int, String)}, modified to support
   * an {@link IterableCharSequence} as content.
   */
  public OcrSnippet[] highlightFieldForDoc(LeafReader reader, int docId, IterableCharSequence content, String pageId)
      throws IOException {
    // note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl for it.
    if (content.length() == 0) {
      return null; // nothing to do
    }

    breakIterator.setText(content);

    Passage[] passages;
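    // UTF-8 content that is addressed by byte offsets is highlighted via the byte offsets stored
    // in the term payloads; all other content goes through the regular character-offset strategy.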
    if (content.getOffsetType() == OffsetType.BYTES && content.getCharset() == StandardCharsets.UTF_8) {
      try (ByteOffsetsEnum byteOffsetsEnums = fieldByteOffsetStrategy.getByteOffsetsEnum(reader, docId)) {
        passages = highlightByteOffsetsEnums(byteOffsetsEnums, docId, pageId);
      }
    } else {
      try (OffsetsEnum offsetsEnums = fieldOffsetStrategy.getOffsetsEnum(reader, docId, null)) {
        passages = highlightOffsetsEnums(offsetsEnums, docId, pageId); // and breakIterator & scorer
      }
    }

    // Format the resulting Passages.
    if (passages.length == 0 && pageId == null) {
      // no passages were returned, so ask for a default summary
      passages = getSummaryPassagesNoHighlight(maxNoHighlightPassages == -1 ? maxPassages : maxNoHighlightPassages);
    }

    if (passages.length > 0) {
      return ((OcrPassageFormatter) passageFormatter).format(passages, content);
    } else {
      return null;
    }
  }

  @Override
  protected Passage[] highlightOffsetsEnums(OffsetsEnum off) throws IOException {
    return this.highlightOffsetsEnums(off, -1, null);
  }

  protected Passage[] highlightOffsetsEnums(OffsetsEnum off, int docId, String pageId) throws IOException {
    final int contentLength = this.breakIterator.getText().getEndIndex();
    if (!off.nextPosition()) {
      return new Passage[0];
    }
    // If we're filtering by a page identifier, we want *all* hits on that page
    int queueSize = pageId != null ? 4096 : maxPassages;
    if (queueSize <= 0) {
      queueSize = 512;
    }

    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(queueSize, (left, right) -> {
      if (left.getScore() < right.getScore()) {
        return -1;
      } else if (left.getScore() > right.getScore()) {
        return 1;
      } else {
        return left.getStartOffset() - right.getStartOffset();
      }
    });
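    // The queue keeps the lowest-scoring passage at its head so it can be evicted cheaply once
    // maxPassages is exceeded; ties are broken by start offset.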
    Passage passage = new Passage(); // the current passage in-progress.  Will either get reset or added to queue.

    int numTotal = 0;
    do {
      int start = off.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
      }
      if (pageId != null) {
        String passagePageId = ((OcrPassageFormatter) passageFormatter).determineStartPage(
            null, start, (IterableCharSequence) breakIterator.getText());
        if (!passagePageId.equals(pageId)) {
          continue;
        }
      }
      int end = off.endOffset();
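      // A match that straddles the end of the highlightable content can't be rendered, skip it.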
      if (start < contentLength && end > contentLength) {
        continue;
      }
      // See if this term should be part of a new passage.
      if (start >= passage.getEndOffset()) {
        if (passage.getStartOffset() >= 0) {
          numTotal++;
        }
        passage = maybeAddPassage(passageQueue, passageScorer, passage, contentLength);
        // if we exceed limit, we are done
        if (start >= contentLength) {
          break;
        }
        // advance breakIterator
        passage.setStartOffset(Math.max(this.breakIterator.preceding(start + 1), 0));
        passage.setEndOffset(Math.min(this.breakIterator.following(end), contentLength));
      } else {
        passage.setEndOffset(Math.min(this.breakIterator.following(end), contentLength));
      }
      // Add this term to the passage.
      BytesRef term = off.getTerm(); // a reference; safe to refer to
      assert term != null;
      passage.addMatch(start, end, term, off.freq());
    } while (off.nextPosition());
    if (passage.getStartOffset() >= 0) {
      numTotal++;
    }
    maybeAddPassage(passageQueue, passageScorer, passage, contentLength);

    this.numMatches.put(docId, numTotal);
    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    // sort in ascending order
    Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
    return passages;
  }

  /**
   * Highlight passages from the document using the byte offsets in the payloads of the matching terms.
   *
   * Largely copied from {@link FieldHighlighter#highlightOffsetsEnums(OffsetsEnum)}, modified to load the byte offsets
   * from the term payloads.
   */
  protected Passage[] highlightByteOffsetsEnums(ByteOffsetsEnum off, int docId, String pageId) throws IOException {
    final int contentLength = this.breakIterator.getText().getEndIndex();
    if (!off.nextPosition()) {
      return new Passage[0];
    }
    // If we're filtering by a page identifier, we want *all* hits on that page
    int queueSize = pageId != null ? 4096 : maxPassages;
    if (queueSize <= 0) {
      queueSize = 512;
    }

    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(queueSize, (left, right) -> {
      if (left.getScore() < right.getScore()) {
        return -1;
      } else if (left.getScore() > right.getScore()) {
        return 1;
      } else {
        return left.getStartOffset() - right.getStartOffset();
      }
    });
    Passage passage = new Passage(); // the current passage in-progress.  Will either get reset or added to queue.
    int numTotal = 0;
    do {
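      // The term payload only stores the byte offset at which the match starts; determine its end
      // by scanning forward, adding each character's UTF-8 encoded length (e.g. 1 byte for 'a',
      // 2 bytes for 'ä'), until a non-letter character terminates the term.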
      int offset = off.byteOffset();
      this.breakIterator.getText().setIndex(offset);
      int end = offset;
      while (true) {
        char c = this.breakIterator.getText().next();
        end += Utf8.encodedLength(Character.toString(c));
        if (!Character.isLetter(c)) {
          break;
        }
      }
      if (offset < contentLength && end > contentLength) {
        continue;
      }
      if (pageId != null) {
        String passagePageId = ((OcrPassageFormatter) passageFormatter).determineStartPage(
            null, offset, (IterableCharSequence) breakIterator.getText());
        if (!passagePageId.equals(pageId)) {
          continue;
        }
      }
      // See if this term should be part of a new passage.
      if (offset >= passage.getEndOffset()) {
        if (passage.getStartOffset() >= 0) {
          numTotal++;
        }
        passage = maybeAddPassage(passageQueue, passageScorer, passage, contentLength);
        // if we exceed limit, we are done
        if (offset >= contentLength) {
          break;
        }
        // advance breakIterator
        passage.setStartOffset(Math.max(this.breakIterator.preceding(offset + 1), 0));
        passage.setEndOffset(Math.min(this.breakIterator.following(offset), contentLength));
      }
      // Add this term to the passage.
      BytesRef term = off.getTerm(); // a reference; safe to refer to
      assert term != null;
      passage.addMatch(offset, end, term, off.freq());
    } while (off.nextPosition());
    if (passage.getStartOffset() >= 0) {
      numTotal++;
    }
    maybeAddPassage(passageQueue, passageScorer, passage, contentLength);
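    // Remember how many matching passages were found in total, so getNumMatches can report the
    // full count even if only the top-scoring passages are returned.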
    this.numMatches.put(docId, numTotal);

    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    // sort in ascending order
    Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
    return passages;
  }

  /** Completely copied from {@link FieldHighlighter} due to private access there. */
  private Passage maybeAddPassage(PriorityQueue<Passage> passageQueue, PassageScorer scorer, Passage passage,
                                  int contentLength) {
    if (passage.getStartOffset() == -1) {
      // empty passage, we can ignore it
      return passage;
    }
    passage.setScore(scorer.score(passage, contentLength));
    // new sentence: first add 'passage' to queue
    if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
      passage.reset(); // can't compete, just reset it
    } else {
      passageQueue.offer(passage);
      if (passageQueue.size() > maxPassages) {
        passage = passageQueue.poll();
        passage.reset();
      } else {
        passage = new Passage();
      }
    }
    return passage;
  }

  /** We don't provide summaries if there is no highlighting, i.e. no matches in the OCR text */
  @Override
  protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
    return new Passage[]{};
  }

  public int getNumMatches(int docId) {
    return numMatches.getOrDefault(docId, -1);
  }
}
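
Usage sketch (not part of the original source): the plugin's highlighter normally constructs and drives this class internally, but wiring it up by hand might look roughly like this. The strategy, formatter, reader and content variables are placeholders, not APIs guaranteed by the plugin.

  // Hypothetical wiring; every lower-case variable below is an assumption.
  OcrFieldHighlighter highlighter = new OcrFieldHighlighter(
      "ocr_text",                                      // field to highlight (assumed name)
      fieldOffsetStrategy,                             // character-offset strategy
      fieldByteOffsetStrategy,                         // payload-based byte-offset strategy
      new PassageScorer(),                             // Lucene's default passage scorer
      java.text.BreakIterator.getSentenceInstance(java.util.Locale.ROOT),
      ocrPassageFormatter,                             // renders passages as OcrSnippets
      10,                                              // maxPassages
      -1);                                             // fall back to maxPassages for no-match docs
  OcrSnippet[] snippets = highlighter.highlightFieldForDoc(leafReader, docId, ocrContent, null);
  int totalHits = highlighter.getNumMatches(docId);    // total passages found, not just returned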



