All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.digitalcollections.solrocr.formats.hocr.HocrPassageFormatter Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats.hocr;

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.OcrBox;

public class HocrPassageFormatter extends OcrPassageFormatter {
  private final static Pattern wordPat = Pattern.compile(
      "\\d+) (?\\d+) (?\\d+) (?\\d+);?.*?>(?.+?)");
  private final static Pattern pagePat = Pattern.compile(
      ".+?)['\"]");

  private final HocrClassBreakIterator pageIter;
  private final String startHlTag;
  private final String endHlTag;

  public HocrPassageFormatter(String startHlTag, String endHlTag, boolean absoluteHighlights) {
    super(startHlTag, endHlTag, absoluteHighlights);
    this.pageIter = new HocrClassBreakIterator("ocr_page");
    this.startHlTag = startHlTag;
    this.endHlTag = endHlTag;
  }

  @Override
  public String determineStartPage(String ocrFragment, int startOffset, IterableCharSequence content) {
    pageIter.setText(content);
    int pageOffset = pageIter.preceding(startOffset);
    String pageFragment = content.subSequence(
        pageOffset, Math.min(pageOffset + 256, content.length())).toString();
    Matcher m = pagePat.matcher(pageFragment);
    if (m.find()) {
      return m.group("pageId");
    }
    return null;
  }

  private TreeMap determinePageBreaks(String ocrFragment) {
    TreeMap map = new TreeMap<>();
    Matcher m = pagePat.matcher(ocrFragment);
    while (m.find()) {
      map.put(m.start(), m.group("pageId"));
    }
    return map;
  }

  @Override
  protected List parseWords(String ocrFragment, String startPage) {
    List wordBoxes = new ArrayList<>();
    TreeMap pageBreaks = determinePageBreaks(ocrFragment);
    Matcher m = wordPat.matcher(ocrFragment);
    boolean inHighlight = false;
    while (m.find()) {
      String pageId = startPage;
      if (pageBreaks.floorKey(m.start()) != null) {
        pageId = pageBreaks.floorEntry(m.start()).getValue();
      }
      int x0 = Integer.parseInt(m.group("ulx"));
      int y0 = Integer.parseInt(m.group("uly"));
      int x1 = Integer.parseInt(m.group("lrx"));
      int y1 = Integer.parseInt(m.group("lry"));
      String text = m.group("text");
      if (text.contains(startHlTag)) {
        inHighlight = true;
      }
      wordBoxes.add(new OcrBox(text.replace(startHlTag, "").replace(endHlTag, ""),
                               pageId, x0, y0, x1, y1, inHighlight));
      boolean endOfHl = (
          text.contains(endHlTag)
          || ocrFragment.substring(m.end(), Math.min(m.end() + endHlTag.length(), ocrFragment.length()))
              .equals(endHlTag));
      if (endOfHl) {
        inHighlight = false;
      }
    }
    return wordBoxes;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy