// de.digitalcollections.solrocr.formats.alto.AltoParser (Maven / Gradle / Ivy)
// Show all versions of solr-ocrhighlighting / Show documentation
package de.digitalcollections.solrocr.formats.alto;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import de.digitalcollections.solrocr.util.CharBufUtils;
import java.awt.Dimension;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.Set;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLStreamReader2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class AltoParser extends OcrParser {
/** Logger bound to this class (resolved through the method-handle lookup class). */
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/** Set by seekToNextWord: true when the stream ended without another String element. */
private boolean noMoreWords;
/** Most recent Page element seen by seekToNextWord while page tracking was enabled. */
private OcrPage currentPage;
/**
 * Whether the document encodes spaces explicitly with SP elements. Stays null until readNext
 * determines it; seekToNextWord also sets it to true as soon as it encounters an SP element.
 */
private Boolean hasExplicitSpaces = null;
/** Box that readNext read ahead while resolving a hyphen start; null when none is pending. */
private OcrBox hyphenEnd = null;
/**
 * True while readNext is reading ahead for the second part of a hyphenated word. Prevents
 * another read-ahead when two hyphen starts follow each other.
 */
private boolean inHyphenation = false;
/**
 * Creates a parser by handing the reader and the requested features to the OcrParser
 * constructor.
 *
 * @param reader source of the ALTO markup
 * @param features parsing features to enable
 * @throws XMLStreamException declared by the superclass constructor
 */
public AltoParser(Reader reader, ParsingFeature... features) throws XMLStreamException {
super(reader, features);
}
/**
 * Produces the next OCR box from the ALTO stream.
 *
 * The visible part of this method sets the trailing whitespace of the box, joins the two halves
 * of a hyphenated word (carrying over highlighting markers into the dehyphenated form) and
 * filters out boxes that lack requested text or coordinates.
 *
 * NOTE(review): part of this method body is missing from this copy of the file, see the marker
 * further down.
 *
 * @param xmlReader reader the boxes are read from
 * @param features enabled parsing features; NOTE(review): the generic type argument looks to have
 *     been lost in this copy, confirm against the superclass signature
 * @return the next box, or null when the box lacks requested text or coordinates
 * @throws XMLStreamException declared for failures while reading the XML stream
 */
@Override
protected OcrBox readNext(XMLStreamReader2 xmlReader, Set features)
throws XMLStreamException {
if (this.hasExplicitSpaces == null) {
// ALTO can optionally encode explicit spaces with the SP element (<SP/>).
// NOTE(review): the next line is truncated in this copy of the file. The string literal is
// never closed and everything up to the trailing-whitespace check is gone, including the
// declaration of the box variable used below. Restore it from the upstream source.
this.hasExplicitSpaces = this.input.peekBeginning().contains(" 0)) {
box.setTrailingChars(" ");
}
// Hyphenation handling
if (box.isHyphenStart()) {
if (this.inHyphenation) {
// Two subsequent hyphen starts, broken/invalid data, but we try to deal with it anyway
// by not looking for a hyphen end for every subsequent hyphen start
return box;
}
// Read ahead one box (recursively) to obtain the candidate for the hyphen end. The flag
// makes the nested call take the early return above if it hits another hyphen start.
this.inHyphenation = true;
this.hyphenEnd = this.readNext(xmlReader, features);
if (this.hyphenEnd != null
&& this.hyphenEnd.isHyphenated()
&& !this.hyphenEnd.isHyphenStart()) {
// Insert highlighting markers at correct positions in the dehyphenated content
// This assumes that the text of both the hyphen start and the hyphen end is fully part of
// the dehyphenated form.
boolean modified = false;
StringBuilder dehyphenated = new StringBuilder(hyphenEnd.getDehyphenatedForm());
// Markers of the hyphen start are inserted at the same index they have in its own text.
if (box.getText().contains(START_HL)) {
dehyphenated.insert(box.getText().indexOf(START_HL), START_HL);
modified = true;
}
if (box.getText().contains(END_HL)) {
dehyphenated.insert(box.getText().indexOf(END_HL), END_HL);
modified = true;
}
// Position of the marker-free text of the hyphen end inside the dehyphenated form; its
// markers are inserted relative to this position and skipped if the text is not found.
int endIdx =
dehyphenated.indexOf(hyphenEnd.getText().replace(END_HL, "").replace(START_HL, ""));
if (hyphenEnd.getText().contains(START_HL) && endIdx >= 0) {
dehyphenated.insert(endIdx + hyphenEnd.getText().indexOf(START_HL), START_HL);
modified = true;
}
if (hyphenEnd.getText().contains(END_HL) && endIdx >= 0) {
dehyphenated.insert(endIdx + hyphenEnd.getText().indexOf(END_HL), END_HL);
modified = true;
}
// Only overwrite the hyphenation info when at least one marker was inserted.
if (modified) {
box.setHyphenInfo(true, dehyphenated.toString());
hyphenEnd.setHyphenInfo(false, dehyphenated.toString());
}
// Full hyphenation, no whitespace between start and end
box.setTrailingChars("");
} else {
// No usable hyphen end followed, so the hyphenation info of this box is cleared.
box.setHyphenInfo(null, null);
box.setDehyphenatedOffset(null);
}
this.inHyphenation = false;
}
// Boxes without text or coordinates (if either is requested with a feature flag) are ignored
// since they break things downstream. A box only counts as lacking coordinates when all four
// of its coordinates are negative.
boolean ignoreBox =
(features.contains(ParsingFeature.TEXT)
&& (box.getText() == null || box.getText().isEmpty()))
|| (features.contains(ParsingFeature.COORDINATES)
&& (box.getLrx() < 0 && box.getLry() < 0 && box.getUlx() < 0 && box.getUly() < 0));
if (ignoreBox) {
return null;
}
return box;
}
/**
 * Moves the reader forward until it is positioned on the start tag of the next {@code String}
 * element, or until the stream is exhausted.
 *
 * <p>Every {@code SP} and {@code TextLine} start tag passed on the way is counted; an {@code SP}
 * tag additionally marks the document as using explicit spaces. When {@code trackPages} is set,
 * each {@code Page} start tag replaces {@link #currentPage} with a page built from its {@code ID}
 * and, if both parse as numbers, its {@code WIDTH} and {@code HEIGHT} attributes. On return,
 * {@link #noMoreWords} tells whether the stream ended before another word was found.
 *
 * @param xmlReader reader to advance
 * @param trackPages whether {@code Page} elements should be recorded
 * @return number of {@code SP} and {@code TextLine} start tags encountered before stopping
 * @throws XMLStreamException if the underlying reader fails
 */
private int seekToNextWord(XMLStreamReader2 xmlReader, boolean trackPages)
    throws XMLStreamException {
  int spaceCount = 0;
  boolean wordReached = false;
  while (!wordReached && xmlReader.hasNext()) {
    if (xmlReader.next() != XMLStreamConstants.START_ELEMENT) {
      // Only start tags are of interest here
      continue;
    }
    final String elementName = xmlReader.getLocalName();
    if ("String".equals(elementName)) {
      wordReached = true;
      continue;
    }
    final boolean explicitSpace = "SP".equals(elementName);
    if (explicitSpace || "TextLine".equals(elementName)) {
      if (explicitSpace) {
        this.hasExplicitSpaces = true;
      }
      spaceCount++;
    } else if (trackPages && "Page".equals(elementName)) {
      final String rawWidth = xmlReader.getAttributeValue("", "WIDTH");
      final String rawHeight = xmlReader.getAttributeValue("", "HEIGHT");
      Dimension pageSize = null;
      if (rawWidth != null && rawHeight != null) {
        try {
          final int pageWidth = (int) Double.parseDouble(rawWidth);
          final int pageHeight = (int) Double.parseDouble(rawHeight);
          pageSize = new Dimension(pageWidth, pageHeight);
        } catch (NumberFormatException ignored) {
          // Deliberately tolerated: the page is then recorded without dimensions
        }
      }
      this.currentPage = new OcrPage(xmlReader.getAttributeValue("", "ID"), pageSize);
    }
  }
  this.noMoreWords = !wordReached;
  return spaceCount;
}
/**
 * Determines the character offset at which the value of an attribute of the current element
 * starts.
 *
 * <p>The attribute is located by scanning the back-context buffer of the input for the
 * attribute name preceded by a single space and followed by {@code =}, beginning at the start of
 * the current element.
 *
 * @param targetAttrib name of the attribute whose value offset is wanted
 * @param xmlReader reader that must currently be on a START_ELEMENT event
 * @return offset of the first character of the attribute value, or -1 if the attribute was not
 *     found in the back context
 * @throws IllegalStateException if the reader is not on a START_ELEMENT event
 */
private long getAttributeValueOffset(String targetAttrib, XMLStreamReader2 xmlReader) {
  if (xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) {
    throw new IllegalStateException("XMLStreamReader must be on a START_ELEMENT event.");
  }
  final char[] haystack = input.peekBackContextBuffer();
  final int haystackLen = input.getBackContextSize();

  // Translate the absolute start offset of the element into an index into the back context
  final long elementStart = xmlReader.getLocationInfo().getStartingCharOffset();
  final int searchFrom = Math.toIntExact(elementStart - input.getBackContextStartOffset());

  // The search runs directly on the character buffers: this is far cheaper than building a
  // String from the buffer and using `.indexOf`, to the point where it no longer shows up in
  // the sampling profiler.
  final char[] pattern = (" " + targetAttrib + "=").toCharArray();
  final int hit = CharBufUtils.indexOf(haystack, searchFrom, haystackLen, pattern);
  if (hit < 0) {
    return -1;
  }
  // The extra 1 skips the single- or double-quote that follows the `=`
  return input.getBackContextStartOffset() + hit + pattern.length + 1;
}
}