All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.digitalcollections.solrocr.formats.mini.MiniOcrFormat Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats.mini;

import com.google.common.collect.ImmutableMap;
import java.text.BreakIterator;
import java.util.Map;
import de.digitalcollections.solrocr.formats.OcrBlock;
import de.digitalcollections.solrocr.formats.OcrFormat;
import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.util.ContextBreakIterator;
import de.digitalcollections.solrocr.util.TagBreakIterator;

public class MiniOcrFormat implements OcrFormat {
  private static final Map blockTagMapping = ImmutableMap.of(
      OcrBlock.PAGE, "p",
      OcrBlock.SECTION, "s",
      OcrBlock.BLOCK, "b",
      OcrBlock.LINE, "l",
      OcrBlock.WORD, "w");

  @Override
  public BreakIterator getBreakIterator(OcrBlock breakBlock, OcrBlock limitBlock, int contextSize) {
    String breakTag = blockTagMapping.get(breakBlock);
    String limitTag = limitBlock == null ? null : blockTagMapping.get(limitBlock);
    return new ContextBreakIterator(
        new TagBreakIterator(breakTag),
        limitTag != null ? new TagBreakIterator(limitTag) : null,
        contextSize);
  }

  @Override
  public OcrPassageFormatter getPassageFormatter(String prehHighlightTag, String postHighlightTag,
                                                 boolean absoluteHighlights) {
    return new MiniOcrPassageFormatter(prehHighlightTag, postHighlightTag, absoluteHighlights);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy