de.digitalcollections.solrocr.formats.alto.AltoByteOffsetsParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR)
without having to store the OCR documents in the index.
package de.digitalcollections.solrocr.formats.alto;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import net.byteseek.matcher.sequence.ByteSequenceMatcher;
import net.byteseek.matcher.sequence.SequenceMatcher;
import net.byteseek.searcher.ForwardSearchIterator;
import net.byteseek.searcher.SearchResult;
import net.byteseek.searcher.Searcher;
import net.byteseek.searcher.sequence.SequenceMatcherSearcher;
/**
 * Extracts the values of {@code CONTENT="..."} attributes from the raw bytes of an ALTO document
 * and annotates each value with the byte offset at which it starts.
 *
 * <p>The document is never parsed as XML; it is scanned as a plain byte sequence. Attribute values
 * are therefore copied verbatim (any XML entities they contain are not decoded), and only
 * attributes written exactly as {@code  CONTENT="} (leading space, upper case, double quote) are
 * recognized.
 */
public class AltoByteOffsetsParser {
  /** Byte pattern that introduces a CONTENT attribute value, including the opening quote. */
  private static final String CONTENT_PREFIX = " CONTENT=\"";

  /** Input used by {@link #main(String[])} when no path is passed on the command line. */
  private static final String DEFAULT_INPUT = "src/test/resources/data/alto.xml";

  private static final Searcher<SequenceMatcher> CONTENT_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher(CONTENT_PREFIX));
  private static final Searcher<SequenceMatcher> QUOTE_SEARCHER =
      new SequenceMatcherSearcher(new ByteSequenceMatcher("\""));

  /**
   * Writes every non-empty CONTENT attribute value found in {@code altoBytes} to {@code os}, each
   * one immediately followed by {@code ⚑<offset>} and a single space, where {@code <offset>} is
   * the decimal byte offset of the first byte of the value within {@code altoBytes}.
   *
   * @param altoBytes raw bytes of the ALTO document to scan
   * @param os stream receiving the extracted values and offset markers (the markers are UTF-8)
   * @throws IOException if writing to {@code os} fails, or if a CONTENT attribute value has no
   *     closing double quote before the end of the input
   */
  public static void parse(byte[] altoBytes, OutputStream os) throws IOException {
    ForwardSearchIterator<SequenceMatcher> contentMatches =
        new ForwardSearchIterator<>(CONTENT_SEARCHER, altoBytes);
    while (contentMatches.hasNext()) {
      for (SearchResult<SequenceMatcher> match : contentMatches.next()) {
        // The match position points at the leading space; the value starts after the prefix.
        int start = (int) match.getMatchPosition() + CONTENT_PREFIX.length();

        ForwardSearchIterator<SequenceMatcher> closingQuote =
            new ForwardSearchIterator<>(QUOTE_SEARCHER, altoBytes, start);
        if (!closingQuote.hasNext()) {
          // Truncated or malformed input: report where it happened instead of letting the
          // iterator fail with a context-free NoSuchElementException.
          throw new IOException(
              "Unterminated CONTENT attribute value starting at byte offset " + start);
        }
        int end = (int) closingQuote.next().get(0).getMatchPosition();
        if (end == start) {
          // Empty attribute value: nothing to emit.
          continue;
        }

        os.write(altoBytes, start, end - start);
        // Plain concatenation keeps the offset in ASCII digits; String.format("%d") would
        // localize the digits according to the JVM's default locale.
        os.write(("⚑" + start + " ").getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  /**
   * Runs {@link #parse(byte[], OutputStream)} on a file and prints the result to standard output.
   *
   * @param args optional; {@code args[0]} is the path of the ALTO file to read. When absent, the
   *     bundled test resource {@code src/test/resources/data/alto.xml} is used.
   * @throws IOException if the file cannot be read or parsing fails
   */
  public static void main(String[] args) throws IOException {
    String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    parse(Files.readAllBytes(Paths.get(input)), bos);
    String text = bos.toString(StandardCharsets.UTF_8.name());
    System.out.println(text);
  }
}