// Source listing of de.digitalcollections.solrocr.lucene.OcrPassageFormatter
// (part of the solr-ocrhighlighting artifact).
package de.digitalcollections.solrocr.lucene;
import static de.digitalcollections.solrocr.formats.OcrParser.END_HL;
import static de.digitalcollections.solrocr.formats.OcrParser.START_HL;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.lucene.filters.SanitizingXmlFilter;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrPage;
import de.digitalcollections.solrocr.model.OcrSnippet;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** Takes care of formatting fragments of the OCR format into {@link OcrSnippet} instances. */
public class OcrPassageFormatter extends PassageFormatter {
/**
 * Matches a content character that is directly followed by the start of a closing tag ({@code
 * </}). The index right after the matched character is therefore the position of the closing
 * tag, which is what the highlighting code needs in order to place the end marker in front of the
 * last inner closing tag of a match.
 */
protected static final Pattern LAST_INNER_TAG_PAT = Pattern.compile("[a-zA-Z0-9]</");

private static final Logger logger = LoggerFactory.getLogger(OcrPassageFormatter.class);

/** OCR format used to obtain parsers, break locators and content/word boundaries. */
private final OcrFormat format;

/** Replacement for the internal start-of-highlight marker in the snippet text. */
protected final String startHlTag;

/** Replacement for the internal end-of-highlight marker in the snippet text. */
protected final String endHlTag;

/** If {@code false}, highlight coordinates are made relative to their snippet region. */
protected final boolean absoluteHighlights;

/** If {@code true}, highlight markers are aligned to the content boundaries of the format. */
protected final boolean alignSpans;

/** If {@code true}, the page a snippet starts on is determined and pages are parsed. */
protected final boolean trackPages;
/**
 * Create a new formatter.
 *
 * @param startHlTag text that replaces the internal start-of-highlight marker in snippet text
 * @param endHlTag text that replaces the internal end-of-highlight marker in snippet text
 * @param absoluteHighlights whether highlight coordinates stay absolute instead of being made
 *     relative to their snippet region
 * @param alignSpans whether highlight markers are aligned to the format's content boundaries
 * @param trackPages whether page information is determined for snippets
 * @param format the OCR format of the content that is going to be formatted
 */
public OcrPassageFormatter(
    String startHlTag,
    String endHlTag,
    boolean absoluteHighlights,
    boolean alignSpans,
    boolean trackPages,
    OcrFormat format) {
  // Format first, then the behaviour switches, then the output tags
  this.format = format;
  this.trackPages = trackPages;
  this.alignSpans = alignSpans;
  this.absoluteHighlights = absoluteHighlights;
  this.endHlTag = endHlTag;
  this.startHlTag = startHlTag;
}
/**
 * Merge overlapping matches into a list of non-overlapping matches.
 *
 * <p>The matches are processed in the order in which they are given. Each match is only compared
 * to the most recently emitted match, so the input is expected to be ordered by start offset; no
 * sorting is performed here.
 *
 * @param numMatches number of valid entries in {@code matchStarts} and {@code matchEnds}
 * @param matchStarts start offsets of the matches
 * @param matchEnds end offsets of the matches
 * @return the merged matches, empty if {@code numMatches} is not positive
 */
protected List<PassageMatch> mergeMatches(int numMatches, int[] matchStarts, int[] matchEnds) {
  Deque<PassageMatch> mergedMatches = new ArrayDeque<>();
  for (int idx = 0; idx < numMatches; idx++) {
    PassageMatch candidate = new PassageMatch(matchStarts[idx], matchEnds[idx]);
    // peekLast() returns null on an empty deque, i.e. for the very first match
    PassageMatch last = mergedMatches.peekLast();
    if (last != null && last.overlaps(candidate)) {
      last.merge(candidate);
    } else {
      mergedMatches.add(candidate);
    }
  }
  return new ArrayList<>(mergedMatches);
}
/**
 * Convert passages, each pointing to a subsequence of the document text, into {@link OcrSnippet}
 * instances.
 *
 * <p>If a passage triggers an {@link IndexOutOfBoundsException}, the error is logged and the
 * corresponding entry of the result stays {@code null}.
 *
 * @param passages in the document text that contain highlighted text
 * @param content of the OCR field, implemented as an {@link IterableCharSequence}
 * @return the parsed snippet representation of the passages, one entry per passage
 */
public OcrSnippet[] format(Passage[] passages, IterableCharSequence content) {
  OcrSnippet[] result = new OcrSnippet[passages.length];
  int slot = 0;
  for (Passage current : passages) {
    try {
      result[slot] = format(current, content);
    } catch (IndexOutOfBoundsException outOfBounds) {
      logger.error(
          String.format(
              "Could not create snippet (start=%d, end=%d) from content at '%s' due to an out-of-bounds error.\n"
                  + "\nDoes the file on disk correspond to the document that was used during indexing?",
              current.getStartOffset(), current.getEndOffset(), content.getIdentifier()),
          outOfBounds);
    }
    slot++;
  }
  return result;
}
/**
 * Extract the text of a passage from the OCR content and surround every (merged) match in it with
 * the internal {@code START_HL} / {@code END_HL} markers.
 *
 * @param passage passage whose offsets and matches are used
 * @param content OCR content the passage offsets refer to
 * @return the passage text with the highlighting markers inserted
 */
protected String getHighlightedFragment(Passage passage, IterableCharSequence content) {
  StringBuilder sb =
      new StringBuilder(content.subSequence(passage.getStartOffset(), passage.getEndOffset()));
  // Number of marker characters inserted so far, shifts every later position in the builder
  int extraChars = 0;
  if (passage.getNumMatches() > 0) {
    List<PassageMatch> matches =
        mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
    for (PassageMatch match : matches) {
      // Can't just do match.start - passage.getStartOffset(), since both offsets are relative to
      // **UTF-8 bytes**, but we need **UTF-16 char** offsets in the code.
      String preMatchContent =
          content.subSequence(passage.getStartOffset(), match.start).toString();
      int matchStart = preMatchContent.length();
      if (alignSpans) {
        matchStart = format.getLastContentStartIdx(preMatchContent);
      }
      sb.insert(
          this.adjustPositionToCharacterEntities(sb.toString(), extraChars + matchStart),
          START_HL);
      extraChars += START_HL.length();
      // Again, can't just do match.end - passage.getStartOffset(), since we need char offsets
      // (see above).
      int matchEnd = content.subSequence(passage.getStartOffset(), match.end).toString().length();
      String matchText = sb.substring(extraChars + matchStart, extraChars + matchEnd);
      if (matchText.trim().endsWith(">")) {
        // Set the end of the match to the position before the last inner closing tag inside of
        // the match. This is only relevant for hOCR at the moment
        Matcher m = LAST_INNER_TAG_PAT.matcher(matchText);
        int idx = -1;
        // Iterate through all hits, only the last one is of interest
        while (m.find()) {
          idx = m.start() + 1;
        }
        if (idx > -1) {
          matchEnd -= (matchText.length() - idx);
        }
      }
      // Translate to a position in the builder and never run past its end
      matchEnd = Math.min(matchEnd + extraChars, sb.length());
      if (alignSpans && matchEnd != sb.length()) {
        String postMatchContent = sb.substring(matchEnd, sb.length());
        matchEnd += format.getFirstContentEndIdx(postMatchContent);
      }
      sb.insert(this.adjustPositionToCharacterEntities(sb.toString(), matchEnd), END_HL);
      extraChars += END_HL.length();
    }
  }
  return sb.toString();
}
/**
 * Adjust the given position within the OCR fragment to account for XML character entities in the
 * OCR word, assumes that the position is within an OCR word.
 *
 * <p>This is necessary since doing this at indexing time would be extremely costly, given that it
 * would need to be run for every single word. At highlighting time it only needs to be run for
 * words that have a highlighting marker inside, since the difference is otherwise not
 * problematic.
 *
 * @param fragment OCR fragment the position refers to
 * @param position position inside of an OCR word in the fragment
 * @return the position, moved forward for every entity that starts before it in the word
 */
private int adjustPositionToCharacterEntities(String fragment, int position) {
  Range<Integer> wordRange = this.format.getContainingWordLimits(fragment, position);
  int idx = wordRange.lowerEndpoint();
  while (idx >= wordRange.lowerEndpoint() && idx < position) {
    int entStart = fragment.indexOf('&', idx);
    if (entStart < 0 || entStart >= position || entStart > wordRange.upperEndpoint()) {
      // No entities opened before position in the word, start doesn't need to be adjusted
      break;
    }
    int entEnd = fragment.indexOf(';', entStart);
    if (entEnd < 0) {
      // Unterminated '&': not an entity. Without this check the negative length below would move
      // the position backwards.
      break;
    }
    int entLength = entEnd - entStart;
    // This assumes that the entity decodes to a codepoint that is only one character wide in
    // UTF16, which should be the case for >99.9% of terms people search for...
    position += entLength;
    idx = entEnd + 1;
  }
  return position;
}
/**
 * Build the snippet for a single passage: insert the highlighting markers, optionally look up
 * the page the passage starts on, parse the fragment and carry over the passage score.
 *
 * @return the snippet, or {@code null} if the fragment could not be parsed into a snippet
 */
private OcrSnippet format(Passage passage, IterableCharSequence content) {
  String highlightedFragment = getHighlightedFragment(passage, content);
  OcrPage startPage =
      trackPages ? determineStartPage(passage.getStartOffset(), content) : null;
  OcrSnippet snippet = parseFragment(highlightedFragment, startPage);
  if (snippet == null) {
    return null;
  }
  snippet.setScore(passage.getScore());
  return snippet;
}
/**
 * Determine the page an OCR fragment resides on.
 *
 * @param startOffset offset of the fragment in the content
 * @param content OCR content to search for the preceding page break
 * @return the page parsed from the text at the preceding page break, or {@code null} if there is
 *     no page break before the offset
 */
OcrPage determineStartPage(int startOffset, IterableCharSequence content) {
  int pageStart = this.format.getBreakLocator(content, OcrBlock.PAGE).preceding(startOffset);
  if (pageStart == BreakLocator.DONE) {
    // No preceding page break: the page is, if present, part of the passage and gets picked up
    // during parsing anyway
    return null;
  }
  // Only a limited window after the break is handed to the page parser
  int pageEnd = Math.min(pageStart + 512, content.length());
  return this.format.parsePageFragment(content.subSequence(pageStart, pageEnd).toString());
}
/**
 * Parse an {@link OcrSnippet} from an OCR fragment.
 *
 * <p>The words of the fragment are grouped into columns (each of which becomes a snippet region)
 * and into highlighted spans, which are then added to the snippet.
 *
 * @param ocrFragment OCR markup with highlighting markers
 * @param page page the fragment starts on, may be {@code null}
 * @return the snippet, or {@code null} if the fragment does not contain any words
 */
protected OcrSnippet parseFragment(String ocrFragment, OcrPage page) {
  List<OcrBox> allBoxes = this.parseWords(ocrFragment, page);
  if (allBoxes.isEmpty()) {
    return null;
  }

  // Grouped by columns
  List<List<OcrBox>> byColumns = new ArrayList<>();
  List<OcrBox> currentCol = new ArrayList<>();
  OcrBox prevBox = null;
  String pageId = null;
  for (OcrBox box : allBoxes) {
    // Stupid, haphazard heuristic for column detection: If the next box is at least the height of
    // the current box times five higher on the page, we're on a new column. Or if the page
    // changes.
    // FIXME: This clearly needs some more thought put into it
    boolean newColumn =
        prevBox != null && (box.getUly() + prevBox.getHeight() * 5) < prevBox.getUly();
    String boxPageId = box.getPage() == null ? null : box.getPage().id;
    boolean newPage = pageId != null && !pageId.equals(boxPageId);
    if (newColumn || newPage) {
      byColumns.add(currentCol);
      currentCol = new ArrayList<>();
    }
    currentCol.add(box);
    // Skip very low-height boxes since they throw off the heuristic, we still track page changes,
    // though!
    if (box.getHeight() > 5) {
      prevBox = box;
    }
    pageId = boxPageId;
  }
  byColumns.add(currentCol);

  // Get highlighted spans
  Set<OcrPage> pages = new LinkedHashSet<>();
  List<List<OcrBox>> hlSpans = new ArrayList<>();
  List<OcrBox> currentSpan = null;
  for (OcrBox wordBox : allBoxes) {
    if (wordBox.getPage() != null) {
      pages.add(wordBox.getPage());
    }
    if (wordBox.isInHighlight()) {
      boolean isInNewSpan =
          (currentSpan == null
              || currentSpan.isEmpty()
              || !wordBox.getHighlightSpan().equals(currentSpan.get(0).getHighlightSpan()));
      if (isInNewSpan) {
        if (currentSpan != null && !currentSpan.isEmpty()) {
          hlSpans.add(currentSpan);
        }
        currentSpan = new ArrayList<>();
      }
      currentSpan.add(wordBox);
    } else if (currentSpan != null && !currentSpan.isEmpty()) {
      hlSpans.add(currentSpan);
      currentSpan = null;
    }
  }
  // Flush a span that runs until the end of the fragment
  if (currentSpan != null && !currentSpan.isEmpty()) {
    hlSpans.add(currentSpan);
  }

  String highlightedText =
      OcrParser.boxesToString(allBoxes).replace(START_HL, startHlTag).replace(END_HL, endHlTag);

  // One region per column, regions without any non-whitespace text are dropped
  List<OcrBox> snippetRegions =
      byColumns.stream()
          .map(this::determineSnippetRegion)
          .filter(r -> !r.getText().trim().isEmpty())
          .collect(Collectors.toList());
  Set<String> snippetPageIds =
      snippetRegions.stream()
          .filter(b -> b.getPage() != null)
          .map(b -> b.getPage().id)
          .collect(Collectors.toSet());

  // Only keep the pages that actually have a region on them, start page first
  List<OcrPage> allPages = new ArrayList<>();
  if (page != null) {
    allPages.add(page);
  }
  allPages.addAll(pages);
  List<OcrPage> snippetPages =
      allPages.stream()
          .filter(p -> snippetPageIds.contains(p.id))
          .distinct()
          .collect(Collectors.toList());

  OcrSnippet snip = new OcrSnippet(highlightedText, snippetPages, snippetRegions);
  this.addHighlightsToSnippet(hlSpans, snip);
  return snip;
}
/**
 * Build the region box that encloses all of the given word boxes.
 *
 * <p>The region text is the text of the words with the internal highlighting markers replaced by
 * the configured tags. If the region starts or ends inside of a highlight whose marker is not
 * part of the first or last word, the missing marker is added at the respective end.
 *
 * @param wordBoxes words of the region, must not be empty
 * @return a box with the bounding coordinates of the words and the page of the first word
 */
private OcrBox determineSnippetRegion(List<OcrBox> wordBoxes) {
  float snipUlx = wordBoxes.stream().map(OcrBox::getUlx).min(Float::compareTo).get();
  float snipUly = wordBoxes.stream().map(OcrBox::getUly).min(Float::compareTo).get();
  float snipLrx = wordBoxes.stream().map(OcrBox::getLrx).max(Float::compareTo).get();
  float snipLry = wordBoxes.stream().map(OcrBox::getLry).max(Float::compareTo).get();

  OcrBox firstBox = wordBoxes.get(0);
  OcrBox lastBox = wordBoxes.get(wordBoxes.size() - 1);
  OcrPage page = firstBox.getPage();

  String regionText = OcrParser.boxesToString(wordBoxes);
  if (firstBox.isInHighlight() && !firstBox.getText().contains(START_HL)) {
    // Highlight was opened before this region
    regionText = START_HL + regionText;
  }
  if (lastBox.isInHighlight() && !lastBox.getText().contains(END_HL)) {
    // Highlight is closed after this region
    regionText = regionText + END_HL;
  }
  regionText = regionText.replace(START_HL, startHlTag).replace(END_HL, endHlTag);
  return new OcrBox(regionText, page, snipUlx, snipUly, snipLrx, snipLry, null);
}
/**
 * Parse word boxes from an OCR fragment.
 *
 * <p>Words that are encountered before the parser reports a page of its own are assigned to
 * {@code startPage}.
 *
 * @param ocrFragment OCR markup to parse, it is passed through a {@link SanitizingXmlFilter}
 * @param startPage page the fragment starts on, may be {@code null}
 * @return the word boxes in the order in which the parser returned them
 */
protected List<OcrBox> parseWords(String ocrFragment, OcrPage startPage) {
  List<OcrBox> words = new ArrayList<>();
  List<OcrParser.ParsingFeature> parsingFeatures =
      Lists.newArrayList(
          OcrParser.ParsingFeature.TEXT,
          OcrParser.ParsingFeature.COORDINATES,
          OcrParser.ParsingFeature.ALTERNATIVES,
          OcrParser.ParsingFeature.HIGHLIGHTS);
  if (trackPages) {
    parsingFeatures.add(OcrParser.ParsingFeature.PAGES);
  }
  OcrParser parser =
      format.getParser(
          new SanitizingXmlFilter(new StringReader(ocrFragment), true),
          parsingFeatures.toArray(new OcrParser.ParsingFeature[0]));
  boolean onStartPage = true;
  for (OcrBox box : parser) {
    if (onStartPage && box.getPage() == null) {
      box.setPage(startPage);
    } else if (box.getPage() != null) {
      // The parser found a page itself, from here on its page information is used
      onStartPage = false;
    }
    words.add(box);
  }
  return words;
}
protected void addHighlightsToSnippet(List> hlSpans, OcrSnippet snippet) {
hlSpans.stream()
.flatMap(Collection::stream)
.forEach(
box -> {
Optional region =
snippet.getSnippetRegions().stream().filter(r -> r.contains(box)).findFirst();
if (!region.isPresent()) {
return;
}
if (!this.absoluteHighlights) {
float xOffset = region.get().getUlx();
float yOffset = region.get().getUly();
if ((box.getUlx() > 0 && box.getUlx() < 1)
|| (box.getUly() > 0 && box.getUly() < 1)) {
// Relative coordinates, need to do some more calculations
float snipWidth = region.get().getLrx() - xOffset;
float snipHeight = region.get().getLry() - yOffset;
box.setUlx(truncateFloat((box.getUlx() - xOffset) / snipWidth));
box.setLrx(truncateFloat((box.getLrx() - xOffset) / snipWidth));
box.setUly(truncateFloat((box.getUly() - yOffset) / snipHeight));
box.setLry(truncateFloat((box.getLry() - yOffset) / snipHeight));
} else {
box.setUlx(box.getUlx() - xOffset);
box.setLrx(box.getLrx() - xOffset);
box.setUly(box.getUly() - yOffset);
box.setLry(box.getLry() - yOffset);
}
}
box.setParentRegionIdx(snippet.getSnippetRegions().indexOf(region.get()));
// Remove the highlighting tags from the text
box.setText(box.getText().replace(START_HL, "").replace(END_HL, ""));
});
hlSpans.forEach(span -> snippet.addHighlightSpan(this.mergeBoxes(span)));
}
/**
 * Merge adjacent OCR boxes into a single one, taking line breaks into account.
 *
 * <p>Boxes are combined until a line break or a page change is hit. The first box of every line
 * is modified in place: it receives the combined text and is grown to the right, the top and the
 * bottom. The page of all returned boxes is set to {@code null}.
 *
 * <p>Lists with fewer than two boxes are returned as they are, without any modification.
 *
 * @param boxes word boxes of a highlighted span
 * @return one box per line of the span
 */
protected List<OcrBox> mergeBoxes(List<OcrBox> boxes) {
  if (boxes.size() < 2) {
    return boxes;
  }
  List<OcrBox> out = new ArrayList<>();
  Iterator<OcrBox> it = boxes.iterator();
  OcrBox curBox = it.next();
  StringBuilder curText = new StringBuilder(curBox.getText());
  // Combine word boxes into a single new OCR box until we hit a linebreak
  while (it.hasNext()) {
    OcrBox nextBox = it.next();
    // We consider a box on a new line if its vertical distance from the current box is close to
    // the line height
    float lineHeight = curBox.getLry() - curBox.getUly();
    float yDiff = Math.abs(nextBox.getUly() - curBox.getUly());
    boolean newLine = yDiff > (0.75 * lineHeight);
    boolean newPage = !Objects.equals(nextBox.getPage(), curBox.getPage());
    if (newLine || newPage) {
      // Finish the current line and start a new one with the next box
      curBox.setText(curText.toString());
      out.add(curBox);
      curBox = nextBox;
      curText = new StringBuilder(curBox.getText());
      continue;
    }
    curText.append(" ");
    curText.append(nextBox.getText());
    if (nextBox.getLrx() > curBox.getLrx()) {
      curBox.setLrx(nextBox.getLrx());
    }
    if (nextBox.getLry() > curBox.getLry()) {
      curBox.setLry(nextBox.getLry());
    }
    if (nextBox.getUly() < curBox.getUly()) {
      curBox.setUly(nextBox.getUly());
    }
  }
  curBox.setText(curText.toString());
  out.add(curBox);
  out.forEach(b -> b.setPage(null));
  return out;
}
/**
 * Convenience implementation to format document text that is available as a {@link String}.
 *
 * <p>Wraps the {@link String} in a {@link IterableCharSequence} implementation and calls {@link
 * #format(Passage[], IterableCharSequence)}
 *
 * @param passages in the document text that contain highlighted text
 * @param content of the OCR field as a plain {@link String}
 * @return a {@code String[]} with the text of every snippet, one entry per passage; an entry is
 *     {@code null} if no snippet could be created for the passage
 */
@Override
public Object format(Passage[] passages, String content) {
  OcrSnippet[] snips = this.format(passages, IterableCharSequence.fromString(content));
  // Snippets can be null (logged out-of-bounds error or fragment without words), keep the slot
  // so that the result stays aligned with the passages
  return Arrays.stream(snips)
      .map(snip -> snip == null ? null : snip.getText())
      .toArray(String[]::new);
}
/**
 * Truncate float to a precision of four digits after the decimal point, rounding towards
 * negative infinity.
 *
 * <p>Intended to keep the plugin response small and tidy.
 */
private static float truncateFloat(float num) {
  float scaled = num * 10000;
  float floored = (float) Math.floor(scaled);
  return floored / 10000;
}
/** A match inside of a passage, given by its start and end offset. */
protected static class PassageMatch {
  public int start;
  public int end;

  public PassageMatch(int start, int end) {
    this.start = start;
    this.end = end;
  }

  /**
   * Check if this match and the other match overlap, matches that merely touch (the end of one
   * is the start of the other) count as overlapping.
   */
  public boolean overlaps(PassageMatch other) {
    int s1 = this.start;
    int e1 = this.end;
    int s2 = other.start;
    int e2 = other.end;
    // The other match starts inside of this match
    boolean otherStartsInside = s1 <= s2 && s2 <= e1;
    // The other match ends inside of this match
    boolean otherEndsInside = s1 <= e2 && e2 <= e1;
    // This match lies completely inside of the other match
    boolean containedInOther = s2 <= s1 && s1 <= e2 && s2 <= e1 && e1 <= e2;
    return otherStartsInside || otherEndsInside || containedInOther;
  }

  /**
   * Grow this match so that it covers the other match as well.
   *
   * <p>Both ends are adjusted independently, so a match that extends past this one on both
   * sides widens the start as well as the end.
   */
  public void merge(PassageMatch other) {
    this.start = Math.min(this.start, other.start);
    this.end = Math.max(this.end, other.end);
  }

  @Override
  public String toString() {
    return String.format("PassageMatch{start=%d, end=%d}", start, end);
  }
}
}