
de.digitalcollections.solrocr.formats.hocr.HocrByteOffsetsParser


Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
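As an illustration of what the parser listed below produces: it scans the raw hOCR bytes for word-level spans (class "ocrx_word") and writes each word's text followed by a flag character and the byte offset of that text. The hOCR snippet and the offset value here are made up for the example:

hOCR input:  <span class='ocrx_word' id='word_1_1' title='bbox 61 84 183 124'>Example</span>
Output:      Example⚑1542

where 1542 stands for the byte position at which the word text starts in the source document.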

package de.digitalcollections.solrocr.formats.hocr;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import net.byteseek.compiler.CompileException;
import net.byteseek.compiler.matcher.SequenceMatcherCompiler;
import net.byteseek.matcher.sequence.ByteSequenceMatcher;
import net.byteseek.matcher.sequence.SequenceMatcher;
import net.byteseek.searcher.ForwardSearchIterator;
import net.byteseek.searcher.SearchResult;
import net.byteseek.searcher.Searcher;
import net.byteseek.searcher.sequence.SequenceMatcherSearcher;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import de.digitalcollections.solrocr.util.Streams;

public class HocrByteOffsetsParser {

  private static final Searcher<SequenceMatcher> BEGIN_SPAN_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher("<span"));
  private static final Searcher<SequenceMatcher> END_SPAN_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher("</span>"));

  private static int getPageOffset(byte[] ocrBytes, String pageId) {
    final Searcher<SequenceMatcher> pageSearcher;
    try {
      pageSearcher = new SequenceMatcherSearcher(SequenceMatcherCompiler.compileFrom(
          "'
it = new ForwardSearchIterator<>(pageSearcher, ocrBytes); if (!it.hasNext()) { throw new IllegalArgumentException("Could not find page with id '" + pageId + "'"); } return (int) it.next().get(0).getMatchPosition(); } public static void parse(byte[] ocrBytes, OutputStream os) throws IOException { parse(ocrBytes, os, null, null); } /** Convert the hOCR document, starting from startPage and ending at, not including endPage. */ public static void parse(byte[] ocrBytes, OutputStream os, String startPage, String endPage) throws IOException { int startOffset = 0; if (startPage != null) { startOffset = getPageOffset(ocrBytes, startPage); } int endOffset = ocrBytes.length - 1; if (endPage != null) { endOffset = getPageOffset(ocrBytes, endPage); } Searcher termSearcher; try { termSearcher = new SequenceMatcherSearcher(SequenceMatcherCompiler.compileFrom("'>' ^'<'")); } catch (CompileException e) { throw new RuntimeException(); } ForwardSearchIterator beginIt = new ForwardSearchIterator<>( BEGIN_SPAN_SEARCHER, startOffset, endOffset, ocrBytes); ForwardSearchIterator endIt = new ForwardSearchIterator<>( END_SPAN_SEARCHER,startOffset, endOffset, ocrBytes); List> wordOffsets = Streams.zip(Streams.stream(beginIt).flatMap(Collection::stream).map(SearchResult::getMatchPosition), Streams.stream(endIt).flatMap(Collection::stream).map(SearchResult::getMatchPosition), ImmutablePair::new) .filter(p -> { String hocrClass = new String(ocrBytes, p.left.intValue() + 13, 9, StandardCharsets.UTF_8); return hocrClass.equals("ocrx_word"); }).collect(Collectors.toList()); for (ImmutablePair p : wordOffsets) { int start = Math.toIntExact(p.left); int end = Math.toIntExact(p.right); ForwardSearchIterator termIt = new ForwardSearchIterator<>(termSearcher, start, end, ocrBytes); if (!termIt.hasNext()) { continue; } int startTerm = (int) (termIt.next().get(0).getMatchPosition()) + 1; int endTerm = ArrayUtils.indexOf(ocrBytes, (byte) '<', startTerm); int width = endTerm - startTerm; os.write(ocrBytes, startTerm, width); os.write(String.format("⚑%d ", startTerm).getBytes(StandardCharsets.UTF_8)); } } public static void main(String[] args) throws IOException { ByteArrayOutputStream bos = new ByteArrayOutputStream(); long start = System.nanoTime(); parse(Files.readAllBytes(Paths.get("src/test/resources/data/hocr_test.html")), bos, "page_118", "page_120"); System.out.println(String.format("Parsing took %.2fms", (System.nanoTime() - start) / 1e6)); String text = bos.toString(StandardCharsets.UTF_8.toString()); System.out.println(text); } }
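The main method above exercises the parser against the project's test data. For an external caller, a minimal usage sketch might look like the following; the file path, demo class name, and the token-splitting step are hypothetical and only assume the public parse methods and the "word⚑offset " output format shown above:

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import de.digitalcollections.solrocr.formats.hocr.HocrByteOffsetsParser;

public class HocrOffsetsDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical input path; any hOCR document should work.
    byte[] ocrBytes = Files.readAllBytes(Paths.get("./ocr/example.html"));
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // Parse the whole document; pass page ids to the four-argument overload to restrict the range.
    HocrByteOffsetsParser.parse(ocrBytes, bos);

    // Each token has the form <word>⚑<byteOffset>, separated by spaces (assumes word texts
    // themselves contain no spaces); split the pieces apart again.
    for (String token : bos.toString(StandardCharsets.UTF_8.toString()).trim().split(" ")) {
      int flag = token.lastIndexOf('⚑');
      if (flag < 0) {
        continue;
      }
      String word = token.substring(0, flag);
      int offset = Integer.parseInt(token.substring(flag + 1));
      System.out.printf("%s @ byte %d%n", word, offset);
    }
  }
}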



