de.digitalcollections.solrocr.formats.alto.AltoFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR)
without having to store the OCR documents in the index.
The newest version!
package de.digitalcollections.solrocr.formats.alto;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Range;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.TagBreakLocator;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.io.Reader;
import java.util.Map;
import java.util.stream.IntStream;
import javax.xml.stream.XMLStreamException;
public class AltoFormat implements OcrFormat {
private static final Map blockTagMapping =
ImmutableMap.of(
OcrBlock.PAGE, "Page",
// OcrBlock.SECTION, "",
OcrBlock.BLOCK, "TextBlock",
OcrBlock.LINE, "TextLine",
OcrBlock.WORD, "String");
@Override
public BreakLocator getBreakLocator(IterableCharSequence text, OcrBlock... blockTypes) {
// NOTE: The ALTO hierarchy we support is pretty rigid, i.e. Page > TextBlock > TextLine >
// String is a given, hence we only grab the lowest-hierarchy block and call it a day
String breakTag = blockTagMapping.get(blockTypes[0]);
return new TagBreakLocator(text, breakTag);
}
@Override
public OcrParser getParser(Reader input, OcrParser.ParsingFeature... features) {
try {
return new AltoParser(input, features);
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
}
@Override
public OcrPage parsePageFragment(String pageFragment) {
// Poor/lean man's XML parsing
pageFragment = pageFragment.substring(0, pageFragment.indexOf(">"));
String[] elemParts = pageFragment.split(" ");
String width = null;
String height = null;
String id = null;
for (String elemPart : elemParts) {
String[] parts = elemPart.split("=");
switch (parts[0]) {
case "WIDTH":
width = parts[1].substring(1, parts[1].length() - 1);
break;
case "HEIGHT":
height = parts[1].substring(1, parts[1].length() - 1);
break;
case "ID":
id = parts[1].substring(1, parts[1].length() - 1);
break;
}
if (id != null && width != null && height != null) {
break;
}
}
Dimension dims = null;
if (width != null && height != null) {
try {
dims = new Dimension((int) Double.parseDouble(width), (int) Double.parseDouble(height));
} catch (NumberFormatException e) {
// NOP, we're only interested in integer dimensions
}
}
return new OcrPage(id, dims);
}
@Override
public boolean hasFormat(String ocrChunk) {
// Check if the chunk contains any ALTO tags
return ocrChunk.contains(" ocrChunk.contains("<" + t));
}
@Override
public int getLastContentStartIdx(String content) {
int contentIdx = content.lastIndexOf("CONTENT=");
if (contentIdx >= 0) {
contentIdx += 9;
}
return contentIdx;
}
@Override
public int getFirstContentEndIdx(String content) {
int singleQuoteIdx = content.indexOf("'");
int doubleQuoteIdx = content.indexOf("\"");
if (singleQuoteIdx < 0) {
return doubleQuoteIdx;
} else if (doubleQuoteIdx < 0) {
return singleQuoteIdx;
}
return Math.min(singleQuoteIdx, doubleQuoteIdx);
}
@Override
public Range getContainingWordLimits(String fragment, int position) {
int doubleStartIdx = fragment.lastIndexOf("CONTENT=\"", position) + 9;
int singleStartIdx = fragment.lastIndexOf("CONTENT='", position) + 9;
int altStartIdx = fragment.lastIndexOf("", position) + 13;
char attribChar;
int startIdx = IntStream.of(doubleStartIdx, singleStartIdx, altStartIdx).max().getAsInt();
if (startIdx == doubleStartIdx) {
attribChar = '"';
} else if (startIdx == singleStartIdx) {
attribChar = '\'';
} else {
attribChar = '<';
}
return Range.closedOpen(startIdx, fragment.indexOf(attribChar, position));
}
}