All downloads are free. The search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.formats.OcrSnippet Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import de.digitalcollections.solrocr.util.OcrBox;

/**
 * A structured representation of a highlighted OCR snippet.
 *
 * <p>A snippet consists of its plaintext (including highlighting tags), the regions it is located
 * in, the highlighted regions inside of it and a score. Instances are mutable: highlighted regions
 * can be added and the score can be changed after construction.
 */
public class OcrSnippet {
  private final String text;
  private final List<OcrBox> snippetRegions;
  /** Every entry holds the boxes that together make up a single highlighted region. */
  private final List<OcrBox[]> highlightRegions;
  /** Stays at {@code 0.0f} until {@link #setScore(float)} is called. */
  private float score;

  /**
   * Create a new snippet on the given regions on the page along with its plaintext.
   *
   * <p>The passed list is stored as-is, i.e. it is not copied.
   *
   * @param text plaintext version of the highlighted page text with highlighting tags
   * @param snippetRegions regions the snippet is located in
   */
  public OcrSnippet(String text, List<OcrBox> snippetRegions) {
    this.text = text;
    this.snippetRegions = snippetRegions;
    this.highlightRegions = new ArrayList<>();
  }

  /**
   * Add a new highlighted region in the snippet.
   *
   * <p>Note that the region should be relative to the snippet region!
   *
   * <p>The boxes are copied into a new array, so later changes to the passed list do not affect
   * the stored region.
   *
   * @param region boxes of the highlighted region, located relative to the snippet region
   */
  public void addHighlightRegion(List<OcrBox> region) {
    this.highlightRegions.add(region.toArray(new OcrBox[0]));
  }

  /**
   * Get the plaintext version of the highlighted page text with highlighting tags.
   *
   * @return the text passed to the constructor
   */
  public String getText() {
    return text;
  }

  /**
   * Get the regions of the page that the snippet is located in.
   *
   * @return the list passed to the constructor (not a copy)
   */
  public List<OcrBox> getSnippetRegions() {
    return snippetRegions;
  }

  /**
   * Get the highlighted regions of the snippet region.
   *
   * <p>The highlighted regions are relative to the snippet region, not to the page.
   *
   * @return the internal list of highlighted regions (not a copy), one array of boxes per region
   */
  public List<OcrBox[]> getHighlightRegions() {
    return highlightRegions;
  }

  /**
   * Get the score of the passage, compared to all other passages in the document.
   *
   * @return the score last set via {@link #setScore(float)}, or {@code 0.0f} if none was set
   */
  public float getScore() {
    return score;
  }

  /**
   * Set the score of the passage, compared to all other passages in the document.
   *
   * @param score the new score of the passage
   */
  public void setScore(float score) {
    this.score = score;
  }

  /**
   * Convert the snippet to a {@link NamedList} that is used by Solr to populate the response.
   *
   * <p>The result contains, in this order:
   * <ul>
   *   <li>{@code text}: the value of {@link #getText()}</li>
   *   <li>{@code score}: the value of {@link #getScore()}</li>
   *   <li>{@code regions}: an array with one {@link NamedList} per snippet region</li>
   *   <li>{@code highlights}: a list with one array per highlighted region, each holding one
   *       {@link NamedList} per box; left out if {@link #getHighlightRegions()} returns
   *       {@code null}</li>
   * </ul>
   *
   * @return the snippet as an ordered map
   */
  // The raw NamedList type is kept on purpose: it is the established return type of this method
  // and generic array creation is not possible, so the arrays have to be raw as well.
  @SuppressWarnings("rawtypes")
  public NamedList toNamedList() {
    SimpleOrderedMap<Object> m = new SimpleOrderedMap<>();
    m.add("text", this.getText());
    m.add("score", this.getScore());
    NamedList[] snips = this.snippetRegions.stream()
        .map(OcrBox::toNamedList)
        .toArray(NamedList[]::new);
    m.add("regions", snips);
    // Go through the getter so that overriding subclasses are honored; it may then yield null.
    List<OcrBox[]> regions = this.getHighlightRegions();
    if (regions != null) {
      List<NamedList[]> highlights = new ArrayList<>(regions.size());
      for (OcrBox[] region : regions) {
        NamedList[] regionBoxes = Arrays.stream(region)
            .map(OcrBox::toNamedList)
            .toArray(NamedList[]::new);
        highlights.add(regionBoxes);
      }
      m.add("highlights", highlights);
    }
    return m;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy