All downloads are free. Search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.formats.mini.MiniOcrPassageFormatter Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats.mini;

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.lucene.search.uhighlight.Passage;
import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.formats.OcrSnippet;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.OcrBox;
import de.digitalcollections.solrocr.util.TagBreakIterator;

public class MiniOcrPassageFormatter extends OcrPassageFormatter {
  private final static Pattern wordPat = Pattern.compile(
      "1?\\.?\\d+?) (?1?\\.?\\d+?) (?1?\\.?\\d+?) (?1?\\.?\\d+?)\">(?.+?)");
  private final static Pattern pagePat = Pattern.compile("

.+?)\">"); private final TagBreakIterator pageIter = new TagBreakIterator("p"); public MiniOcrPassageFormatter(String startHlTag, String endHlTag, boolean absoluteHighlights) { super(startHlTag, endHlTag, absoluteHighlights); } @Override public String determineStartPage(String xmlFragment, int startOffset, IterableCharSequence content) { pageIter.setText(content); int pageOffset = pageIter.preceding(startOffset); String pageFragment = content.subSequence( pageOffset, Math.min(pageOffset + 128, content.length())).toString(); Matcher m = pagePat.matcher(pageFragment); if (m.find()) { return m.group("pageId"); } return null; } private TreeMap determinePageBreaks(String ocrFragment) { TreeMap map = new TreeMap<>(); Matcher m = pagePat.matcher(ocrFragment); while (m.find()) { map.put(m.start(), m.group("pageId")); } return map; } @Override protected void addHighlightsToSnippet(List> hlBoxes, OcrSnippet snippet) { if (this.absoluteHighlights) { super.addHighlightsToSnippet(hlBoxes, snippet); return; } // Handle relative coordinates OcrBox snip = snippet.getSnippetRegions().get(0); float xOffset = snip.getUlx(); float yOffset = snip.getUly(); float snipWidth = snip.getLrx() - xOffset; float snipHeight = snip.getLry() - yOffset; hlBoxes.stream() .map(cs -> cs.stream().map( b -> new OcrBox( b.getText(), b.getPageId(), truncateFloat((b.getUlx() - xOffset) / snipWidth), truncateFloat((b.getUly() - yOffset) / snipHeight), truncateFloat((b.getLrx() - xOffset) / snipWidth), truncateFloat((b.getLry() - yOffset) / snipHeight), b.isHighlight())) .collect(Collectors.toList())) .map(this::mergeBoxes) .forEach(snippet::addHighlightRegion); } @Override protected List parseWords(String ocrFragment, String startPage) { List wordBoxes = new ArrayList<>(); boolean inHighlight = false; TreeMap pageBreaks = determinePageBreaks(ocrFragment); Matcher m = wordPat.matcher(ocrFragment); while (m.find()) { String pageId = startPage; if (pageBreaks.floorKey(m.start()) != null) { pageId = 
pageBreaks.floorEntry(m.start()).getValue(); } float x = Float.valueOf(m.group("x")); float y = Float.valueOf(m.group("y")); float width = Float.valueOf(m.group("w")); float height = Float.valueOf(m.group("h")); String text = m.group("text"); if (text.contains(startHlTag)) { inHighlight = true; } wordBoxes.add(new OcrBox(text.replace(startHlTag, "").replace(endHlTag, ""), pageId, x, y, x + width, y + height, inHighlight)); boolean endOfHl = ( text.contains(endHlTag) || ocrFragment.substring(m.end(), Math.min(m.end() + endHlTag.length(), ocrFragment.length())) .equals(endHlTag)); if (endOfHl) { inHighlight = false; } } return wordBoxes; } private float truncateFloat(float num) { return (float) Math.floor(num * 10000) / 10000; } @Override public Object format(Passage[] passages, String content) { throw new UnsupportedOperationException(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy