All downloads are free. Search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.formats.alto.AltoByteOffsetsParser Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats.alto;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import net.byteseek.matcher.sequence.ByteSequenceMatcher;
import net.byteseek.matcher.sequence.SequenceMatcher;
import net.byteseek.searcher.ForwardSearchIterator;
import net.byteseek.searcher.SearchResult;
import net.byteseek.searcher.Searcher;
import net.byteseek.searcher.sequence.SequenceMatcherSearcher;

/**
 * Scans raw ALTO bytes for {@code CONTENT="..."} attributes and emits each non-empty attribute
 * value together with the byte offset at which that value starts.
 *
 * <p>The input is searched as a plain byte sequence; no XML parsing is performed, so attribute
 * values are copied verbatim (no entity decoding, no charset conversion).
 */
public class AltoByteOffsetsParser {
  /**
   * Opening of a {@code CONTENT} attribute, including the leading space and the opening quote.
   * It is ASCII-only, so its character count equals the number of bytes to skip after a match.
   */
  private static final String CONTENT_ATTR_PREFIX = " CONTENT=\"";

  private static final Searcher<SequenceMatcher> CONTENT_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher(CONTENT_ATTR_PREFIX));
  private static final Searcher<SequenceMatcher> QUOTE_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher("\""));

  /**
   * Writes every non-empty {@code CONTENT} attribute value found in {@code altoBytes} to
   * {@code os}, each followed by the UTF-8 encoding of {@code ⚑<offset>} and a single space,
   * where {@code <offset>} is the index in {@code altoBytes} of the first byte of the value.
   *
   * @param altoBytes the raw bytes of the ALTO document to scan
   * @param os the stream that receives the extracted values and offset markers
   * @throws IOException if writing to {@code os} fails, or if a {@code CONTENT} attribute is
   *     opened but no closing quote follows it
   */
  public static void parse(byte[] altoBytes, OutputStream os) throws IOException {
    ForwardSearchIterator<SequenceMatcher> contentMatches =
        new ForwardSearchIterator<>(CONTENT_SEARCHER, altoBytes);

    while (contentMatches.hasNext()) {
      for (SearchResult<SequenceMatcher> match : contentMatches.next()) {
        // The match position points at the leading space; skip the prefix to reach the value.
        int start = (int) match.getMatchPosition() + CONTENT_ATTR_PREFIX.length();

        ForwardSearchIterator<SequenceMatcher> closingQuote =
            new ForwardSearchIterator<>(QUOTE_SEARCHER, altoBytes, start);
        if (!closingQuote.hasNext()) {
          // Fail with a descriptive error instead of an opaque exception from the iterator.
          throw new IOException(
              "Unterminated CONTENT attribute value starting at byte offset " + start);
        }
        int end = (int) closingQuote.next().get(0).getMatchPosition();
        if (end == start) {
          // Empty attribute value: nothing to emit.
          continue;
        }

        os.write(altoBytes, start, end - start);
        // Plain concatenation keeps the digits ASCII; String.format("%d") would localize them
        // according to the default locale.
        os.write(("⚑" + start + " ").getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  /**
   * Runs {@link #parse(byte[], OutputStream)} on the bundled ALTO test resource and prints the
   * result to standard output.
   *
   * @param args ignored
   * @throws IOException if the resource cannot be read or parsing fails
   */
  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    parse(Files.readAllBytes(Paths.get("src/test/resources/data/alto.xml")), bos);
    String text = bos.toString(StandardCharsets.UTF_8.name());
    System.out.println(text);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy