de.digitalcollections.solrocr.formats.OcrSnippet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR)
without having to store the OCR documents in the index.
package de.digitalcollections.solrocr.formats;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import de.digitalcollections.solrocr.util.OcrBox;
/**
 * A structured representation of a highlighted OCR snippet.
 *
 * <p>A snippet consists of its plaintext (including highlighting tags), the regions it is
 * located in, any number of highlighted regions inside of it, and a score. The score is
 * {@code 0.0f} until {@link #setScore(float)} is called.
 */
public class OcrSnippet {
  private final String text;
  private final List<OcrBox> snippetRegions;
  // Each entry holds the boxes of one highlighted region, as passed to addHighlightRegion.
  private final List<OcrBox[]> highlightRegions;
  private float score;

  /**
   * Create a new snippet on the given region on the page along with its plaintext.
   *
   * <p>The passed list is stored as-is (no copy is made), and the snippet starts out without
   * any highlighted regions.
   *
   * @param text plaintext version of the highlighted page text with highlighting tags
   * @param snippetRegions regions the snippet is located in
   */
  public OcrSnippet(String text, List<OcrBox> snippetRegions) {
    this.text = text;
    this.snippetRegions = snippetRegions;
    this.highlightRegions = new ArrayList<>();
  }

  /**
   * Add a new highlighted region in the snippet.
   *
   * <p>Note that the region should be relative to the snippet region!
   *
   * <p>The boxes are copied into a new array, so later changes to the passed list do not
   * affect the snippet.
   *
   * @param region Location of the highlighted region relative to the snippet region.
   */
  public void addHighlightRegion(List<OcrBox> region) {
    this.highlightRegions.add(region.toArray(new OcrBox[0]));
  }

  /** Get the plaintext version of the highlighted page text with highlighting tags. */
  public String getText() {
    return text;
  }

  /** Get the regions of the page that the snippet is located in. */
  public List<OcrBox> getSnippetRegions() {
    return snippetRegions;
  }

  /**
   * Get the highlighted regions of the snippet region.
   *
   * <p>The highlighted regions are relative to the snippet region, not to the page. Every
   * element of the returned list is the array of boxes that make up one highlighted region.
   */
  public List<OcrBox[]> getHighlightRegions() {
    return highlightRegions;
  }

  /** Get the score of the passage, compared to all other passages in the document. */
  public float getScore() {
    return score;
  }

  /** Set the score of the passage, compared to all other passages in the document. */
  public void setScore(float score) {
    this.score = score;
  }

  /**
   * Convert the snippet to a {@link NamedList} that is used by Solr to populate the response.
   *
   * <p>The result contains, in this order, the entries {@code text}, {@code score},
   * {@code regions} (one converted entry per snippet region) and, if the highlight regions
   * are not {@code null}, {@code highlights} (a list with one array of converted boxes per
   * highlighted region).
   *
   * @return an ordered map with the snippet's data
   */
  @SuppressWarnings("rawtypes") // the raw NamedList return type is kept for existing callers
  public NamedList toNamedList() {
    SimpleOrderedMap<Object> m = new SimpleOrderedMap<>();
    m.add("text", this.getText());
    m.add("score", this.getScore());
    NamedList[] snips = this.snippetRegions.stream()
        .map(OcrBox::toNamedList)
        .toArray(NamedList[]::new);
    m.add("regions", snips);
    List<OcrBox[]> regions = this.getHighlightRegions();
    // The field itself is never null; the check guards against overriding getters.
    if (regions != null) {
      List<NamedList[]> highlights = new ArrayList<>();
      for (OcrBox[] region : regions) {
        NamedList[] regionBoxes = Arrays.stream(region)
            .map(OcrBox::toNamedList)
            .toArray(NamedList[]::new);
        highlights.add(regionBoxes);
      }
      m.add("highlights", highlights);
    }
    return m;
  }
}