de.digitalcollections.solrocr.model.OcrSnippet Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!

package de.digitalcollections.solrocr.model;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/** A structured representation of a highlighted OCR snippet. */
public class OcrSnippet implements Comparable {
  /** Orders snippets by {@link #getScore()} using the natural (ascending) ordering of the score. */
  private static final Comparator COMPARATOR =
      Comparator.comparing(OcrSnippet::getScore);
  /** Plaintext of the snippet, as handed to the constructor. */
  private final String text;
  /** Pages handed to the constructor; stored by reference, not copied. */
  private final List pages;
  /** Snippet regions handed to the constructor; stored by reference, not copied. */
  private final List snippetRegions;
  /**
   * Highlighted spans of the snippet. Starts out empty; every call to
   * {@link #addHighlightSpan(List)} appends one {@code OcrBox[]} array.
   */
  private final List highlightSpans;
  /**
   * Score of the snippet, changed through {@link #setScore(float)}. The constructor visible in
   * this file does not assign it, so it keeps the default {@code 0.0f} until the setter is called.
   */
  private float score;

  /**
   * Builds a snippet from its highlighted plaintext, the pages it appears on and the regions it
   * is located in.
   *
   * <p>The passed lists are stored as-is (no copy is made). The list of highlight spans starts
   * out empty and is filled through {@link #addHighlightSpan(List)}.
   *
   * @param text plaintext version of the highlighted page text with highlighting tags
   * @param pages pages this snippet appears on
   * @param snippetRegions regions the snippet is located in
   */
  public OcrSnippet(String text, List pages, List snippetRegions) {
    // No highlight spans are known at construction time.
    this.highlightSpans = new ArrayList<>();
    this.snippetRegions = snippetRegions;
    this.pages = pages;
    this.text = text;
  }

  /**
   * Register one more highlighted span for this snippet.
   *
   * <p>The boxes of the span are expected to be relative to the snippet region. They are copied
   * into a fresh {@code OcrBox[]} array, which is then appended to the list of highlight spans.
   *
   * @param span locations of the highlighted span relative to the snippet region
   */
  public void addHighlightSpan(List span) {
    // Zero-length template: toArray allocates a correctly sized OcrBox[] whenever span is non-empty.
    final OcrBox[] arrayTemplate = new OcrBox[0];
    this.highlightSpans.add(span.toArray(arrayTemplate));
  }

  /**
   * Get the plaintext version of the highlighted page text.
   *
   * @return the text passed to the constructor, including its highlighting tags
   */
  public String getText() {
    return this.text;
  }

  /**
   * Get the regions of the page that the snippet is located in.
   *
   * @return the region list passed to the constructor (the same instance, not a copy)
   */
  public List getSnippetRegions() {
    return this.snippetRegions;
  }

  /**
   * Get the highlighted regions of the snippet region.
   *
   * <p>The highlighted regions are relative to the snippet region, not to the page.
   *
   * @return the internal list of spans (not a copy); one entry per
   *     {@link #addHighlightSpan(List)} call
   */
  public List getHighlightSpans() {
    return this.highlightSpans;
  }

  /**
   * Get the score of the passage, compared to all other passages in the document.
   *
   * @return the value last stored through {@link #setScore(float)}
   */
  public float getScore() {
    return this.score;
  }

  /**
   * Set the score of the passage, compared to all other passages in the document.
   *
   * @param score new score; replaces any previously stored value
   */
  public void setScore(float score) {
    this.score = score;
  }

  /** Convert the snippet to a {@link NamedList} that is used by Solr to populate the response. */
  @SuppressWarnings("rawtypes")
  public NamedList