All downloads are free. The search and download features use the official Maven repository.

de.digitalcollections.solrocr.lucene.OcrPassageFormatter Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!
package de.digitalcollections.solrocr.lucene;

import static de.digitalcollections.solrocr.formats.OcrParser.END_HL;
import static de.digitalcollections.solrocr.formats.OcrParser.START_HL;

import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.lucene.filters.SanitizingXmlFilter;
import de.digitalcollections.solrocr.model.OcrBlock;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrFormat;
import de.digitalcollections.solrocr.model.OcrPage;
import de.digitalcollections.solrocr.model.OcrSnippet;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Takes care of formatting fragments of the OCR format into {@link OcrSnippet} instances. */
public class OcrPassageFormatter extends PassageFormatter {
  protected static final Pattern LAST_INNER_TAG_PAT = Pattern.compile("[a-zA-Z0-9] mergeMatches(int numMatches, int[] matchStarts, int[] matchEnds) {
    Deque sortedMatches =
        IntStream.range(0, numMatches)
            .mapToObj(idx -> new PassageMatch(matchStarts[idx], matchEnds[idx]))
            .collect(Collectors.toCollection(ArrayDeque::new));
    Deque mergedMatches = new ArrayDeque<>();
    mergedMatches.add(sortedMatches.removeFirst());
    while (!sortedMatches.isEmpty()) {
      PassageMatch candidate = sortedMatches.removeFirst();
      if (!mergedMatches.isEmpty() && mergedMatches.peekLast().overlaps(candidate)) {
        // Cannot be null due to isEmpty check, and no concurrent accesses that could
        // remove it
        mergedMatches.peekLast().merge(candidate);
      } else {
        mergedMatches.add(candidate);
      }
    }
    return new ArrayList<>(mergedMatches);
  }

  /**
   * Turn each passage of the document text into an {@link OcrSnippet}.
   *
   * <p>If formatting a passage fails with an {@link IndexOutOfBoundsException}, the error is logged
   * and the corresponding slot of the result is left unset ({@code null}); the remaining passages
   * are still processed.
   *
   * @param passages passages in the document text that contain highlighted text
   * @param content content of the OCR field, implemented as an {@link IterableCharSequence}
   * @return one snippet slot per passage, in the order of {@code passages}; slots may be
   *     {@code null}
   */
  public OcrSnippet[] format(Passage[] passages, IterableCharSequence content) {
    final int count = passages.length;
    final OcrSnippet[] result = new OcrSnippet[count];
    int slot = 0;
    while (slot < count) {
      final Passage current = passages[slot];
      try {
        result[slot] = format(current, content);
      } catch (IndexOutOfBoundsException outOfBounds) {
        // Best effort: report the broken passage and carry on with the next one.
        logger.error(
            String.format(
                "Could not create snippet (start=%d, end=%d) from content at '%s' due to an out-of-bounds error.\n"
                    + "\nDoes the file on disk correspond to the document that was used during indexing?",
                current.getStartOffset(), current.getEndOffset(), content.getIdentifier()),
            outOfBounds);
      }
      slot++;
    }
    return result;
  }

  /**
   * Cut the passage out of the OCR content and surround every (merged) match inside of it with the
   * {@code START_HL} / {@code END_HL} markers.
   *
   * <p>When {@code alignSpans} is set, the marker positions are additionally moved using {@code
   * format.getLastContentStartIdx} (start marker) and {@code format.getFirstContentEndIdx} (end
   * marker).
   *
   * @param passage passage whose offsets and matches refer to {@code content}
   * @param content content of the OCR field
   * @return the passage's part of the content with highlighting markers inserted
   */
  protected String getHighlightedFragment(Passage passage, IterableCharSequence content) {
    int passageStart = passage.getStartOffset();
    StringBuilder sb = new StringBuilder(content.subSequence(passageStart, passage.getEndOffset()));
    // Number of marker characters inserted so far; every later position has to be shifted by it
    int extraChars = 0;
    if (passage.getNumMatches() > 0) {
      List<PassageMatch> matches =
          mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
      for (PassageMatch match : matches) {
        // Can't just do match.start - passage.getStartOffset(), since both offsets are relative to
        // **UTF-8 bytes**, but we need **UTF-16 code unit** (i.e. Java `char`) offsets in the code.
        String preMatchContent = content.subSequence(passageStart, match.start).toString();
        int matchStart = preMatchContent.length();
        if (alignSpans) {
          matchStart = format.getLastContentStartIdx(preMatchContent);
        }
        sb.insert(
            this.adjustPositionToCharacterEntities(sb.toString(), extraChars + matchStart),
            START_HL);
        extraChars += START_HL.length();
        // Again, can't just do match.end - passage.getStartOffset(), since we need char offsets
        // (see above).
        int matchEnd = content.subSequence(passageStart, match.end).toString().length();
        String matchText = sb.substring(extraChars + matchStart, extraChars + matchEnd);
        if (matchText.trim().endsWith(">")) {
          // Set the end of the match to the position before the last inner closing tag inside of
          // the match. This is only relevant for hOCR at the moment
          Matcher m = LAST_INNER_TAG_PAT.matcher(matchText);
          int idx = -1;
          while (m.find()) {
            // Only the last hit survives the loop
            idx = m.start() + 1;
          }
          if (idx > -1) {
            matchEnd -= (matchText.length() - idx);
          }
        }
        // Never place the end marker beyond the end of the fragment
        matchEnd = Math.min(matchEnd + extraChars, sb.length());
        if (alignSpans && matchEnd != sb.length()) {
          String postMatchContent = sb.substring(matchEnd, sb.length());
          matchEnd += format.getFirstContentEndIdx(postMatchContent);
        }
        sb.insert(this.adjustPositionToCharacterEntities(sb.toString(), matchEnd), END_HL);
        extraChars += END_HL.length();
      }
    }
    return sb.toString();
  }

  /**
   * Adjust the given position within the OCR fragment to account for XML character entities in the
   * OCR word, assumes that the position is within an OCR word.
   *
   * 

This is necessary since doing this at indexing time would be extremely costly, given that it * would need to be run for every single word. At highlighting time it only needs to be run for * words that have a highlighting marker inside, since the difference is otherwise not * problematic. */ private int adjustPositionToCharacterEntities(String fragment, int position) { Range wordRange = this.format.getContainingWordLimits(fragment, position); int idx = wordRange.lowerEndpoint(); while (idx >= wordRange.lowerEndpoint() && idx < position) { int entStart = fragment.indexOf('&', idx); if (entStart < 0 || entStart >= position || entStart > wordRange.upperEndpoint()) { // No entities opened before position in the word, start doesn't need to be adjusted break; } int entEnd = fragment.indexOf(';', entStart); int entLength = entEnd - entStart; // This assumes that the entity decodes to a codepoint that is only one character wide in // UTF16, which should be the case for >99.9% of terms people search for... position += entLength; idx = entEnd + 1; } return position; } private OcrSnippet format(Passage passage, IterableCharSequence content) { String xmlFragment = getHighlightedFragment(passage, content); OcrPage initialPage = null; if (trackPages) { initialPage = determineStartPage(passage.getStartOffset(), content); } OcrSnippet snip = parseFragment(xmlFragment, initialPage); if (snip != null) { snip.setScore(passage.getScore()); } return snip; } /** Determine the page an OCR fragment resides on. 
*/ OcrPage determineStartPage(int startOffset, IterableCharSequence content) { BreakLocator pageBreakLocator = this.format.getBreakLocator(content, OcrBlock.PAGE); int pageOffset = pageBreakLocator.preceding(startOffset); if (pageOffset == BreakLocator.DONE) { // This means the page is, if present, part of the passage, and will be determined during // parsing anyway return null; } String pageFragment = content.subSequence(pageOffset, Math.min(pageOffset + 512, content.length())).toString(); return this.format.parsePageFragment(pageFragment); } /** Parse an {@link OcrSnippet} from an OCR fragment. */ protected OcrSnippet parseFragment(String ocrFragment, OcrPage page) { List allBoxes = this.parseWords(ocrFragment, page); if (allBoxes.isEmpty()) { return null; } // Grouped by columns List> byColumns = new ArrayList<>(); List currentCol = new ArrayList<>(); OcrBox prevBox = null; String pageId = null; for (OcrBox box : allBoxes) { // Stupid, haphazard heuristic for column detection: If the next box is at least the height of // the current box times five higher on the page, we're on a new column. Or if the page // changes. // FIXME: This clearly needs some more thought put into it boolean newColumn = prevBox != null && (box.getUly() + prevBox.getHeight() * 5) < prevBox.getUly(); String boxPageId = box.getPage() == null ? null : box.getPage().id; boolean newPage = pageId != null && !pageId.equals(boxPageId); if (newColumn || newPage) { byColumns.add(currentCol); currentCol = new ArrayList<>(); } currentCol.add(box); // Skip very low-height boxes since they throw off the heuristic, we still track page changes, // though! 
if (box.getHeight() > 5) { prevBox = box; } pageId = boxPageId; } byColumns.add(currentCol); // Get highlighted spans Set pages = new LinkedHashSet<>(); List> hlSpans = new ArrayList<>(); List currentSpan = null; for (OcrBox wordBox : allBoxes) { if (wordBox.getPage() != null) { pages.add(wordBox.getPage()); } if (wordBox.isInHighlight()) { boolean isInNewSpan = (currentSpan == null || currentSpan.isEmpty() || !wordBox.getHighlightSpan().equals(currentSpan.get(0).getHighlightSpan())); if (isInNewSpan) { if (currentSpan != null && !currentSpan.isEmpty()) { hlSpans.add(currentSpan); } currentSpan = new ArrayList<>(); } currentSpan.add(wordBox); } else if (currentSpan != null && !currentSpan.isEmpty()) { hlSpans.add(currentSpan); currentSpan = null; } } if (currentSpan != null && !currentSpan.isEmpty()) { hlSpans.add(currentSpan); } String highlightedText = OcrParser.boxesToString(allBoxes) .replace(START_HL, startHlTag) .replace(OcrParser.END_HL, endHlTag); List snippetRegions = byColumns.stream() .map(this::determineSnippetRegion) .filter(r -> !r.getText().isEmpty() && !r.getText().trim().isEmpty()) .collect(Collectors.toList()); Set snippetPageIds = snippetRegions.stream() .filter(b -> b.getPage() != null) .map(b -> b.getPage().id) .collect(Collectors.toSet()); List allPages = new ArrayList<>(); if (page != null) { allPages.add(page); } allPages.addAll(pages); List snippetPages = allPages.stream() .filter(p -> snippetPageIds.contains(p.id)) .distinct() .collect(Collectors.toList()); OcrSnippet snip = new OcrSnippet(highlightedText, snippetPages, snippetRegions); this.addHighlightsToSnippet(hlSpans, snip); return snip; } private OcrBox determineSnippetRegion(List wordBoxes) { float snipUlx = wordBoxes.stream().map(OcrBox::getUlx).min(Float::compareTo).get(); float snipUly = wordBoxes.stream().map(OcrBox::getUly).min(Float::compareTo).get(); float snipLrx = wordBoxes.stream().map(OcrBox::getLrx).max(Float::compareTo).get(); float snipLry = 
wordBoxes.stream().map(OcrBox::getLry).max(Float::compareTo).get(); OcrPage page = wordBoxes.get(0).getPage(); String regionText = OcrParser.boxesToString(wordBoxes); OcrBox firstBox = wordBoxes.get(0); OcrBox lastBox = wordBoxes.get(wordBoxes.size() - 1); if (firstBox.isInHighlight() && !firstBox.getText().contains(START_HL)) { regionText = START_HL + regionText; } if (lastBox.isInHighlight() && !lastBox.getText().contains(END_HL)) { regionText = regionText + END_HL; } regionText = regionText.replace(START_HL, startHlTag).replace(END_HL, endHlTag); return new OcrBox(regionText, page, snipUlx, snipUly, snipLrx, snipLry, null); } /** Parse word boxes from an OCR fragment. */ protected List parseWords(String ocrFragment, OcrPage startPage) { List words = new ArrayList<>(); List parsingFeatures = Lists.newArrayList( OcrParser.ParsingFeature.TEXT, OcrParser.ParsingFeature.COORDINATES, OcrParser.ParsingFeature.ALTERNATIVES, OcrParser.ParsingFeature.HIGHLIGHTS); if (trackPages) { parsingFeatures.add(OcrParser.ParsingFeature.PAGES); } OcrParser parser = format.getParser( new SanitizingXmlFilter(new StringReader(ocrFragment), true), parsingFeatures.toArray(new OcrParser.ParsingFeature[0])); boolean onStartPage = true; for (OcrBox box : parser) { if (onStartPage && box.getPage() == null) { box.setPage(startPage); } else if (box.getPage() != null) { onStartPage = false; } words.add(box); } return words; } protected void addHighlightsToSnippet(List> hlSpans, OcrSnippet snippet) { hlSpans.stream() .flatMap(Collection::stream) .forEach( box -> { Optional region = snippet.getSnippetRegions().stream().filter(r -> r.contains(box)).findFirst(); if (!region.isPresent()) { return; } if (!this.absoluteHighlights) { float xOffset = region.get().getUlx(); float yOffset = region.get().getUly(); if ((box.getUlx() > 0 && box.getUlx() < 1) || (box.getUly() > 0 && box.getUly() < 1)) { // Relative coordinates, need to do some more calculations float snipWidth = region.get().getLrx() - 
xOffset; float snipHeight = region.get().getLry() - yOffset; box.setUlx(truncateFloat((box.getUlx() - xOffset) / snipWidth)); box.setLrx(truncateFloat((box.getLrx() - xOffset) / snipWidth)); box.setUly(truncateFloat((box.getUly() - yOffset) / snipHeight)); box.setLry(truncateFloat((box.getLry() - yOffset) / snipHeight)); } else { box.setUlx(box.getUlx() - xOffset); box.setLrx(box.getLrx() - xOffset); box.setUly(box.getUly() - yOffset); box.setLry(box.getLry() - yOffset); } } box.setParentRegionIdx(snippet.getSnippetRegions().indexOf(region.get())); // Remove the highlighting tags from the text box.setText(box.getText().replace(START_HL, "").replace(END_HL, "")); }); hlSpans.forEach(span -> snippet.addHighlightSpan(this.mergeBoxes(span))); } /** Merge adjacent OCR boxes into a single one, taking line breaks into account * */ protected List mergeBoxes(List boxes) { if (boxes.size() < 2) { return boxes; } List out = new ArrayList<>(); Iterator it = boxes.iterator(); OcrBox curBox = it.next(); StringBuilder curText = new StringBuilder(curBox.getText()); // Combine word boxes into a single new OCR box until we hit a linebreak while (it.hasNext()) { OcrBox nextBox = it.next(); // We consider a box on a new line if its vertical distance from the current box is close to // the line height float lineHeight = curBox.getLry() - curBox.getUly(); float yDiff = Math.abs(nextBox.getUly() - curBox.getUly()); boolean newLine = yDiff > (0.75 * lineHeight); boolean newPage = !Objects.equals(nextBox.getPage(), curBox.getPage()); if (newLine || newPage) { curBox.setText(curText.toString()); out.add(curBox); curBox = nextBox; curText = new StringBuilder(curBox.getText()); continue; } curText.append(" "); curText.append(nextBox.getText()); if (nextBox.getLrx() > curBox.getLrx()) { curBox.setLrx(nextBox.getLrx()); } if (nextBox.getLry() > curBox.getLry()) { curBox.setLry(nextBox.getLry()); } if (nextBox.getUly() < curBox.getUly()) { curBox.setUly(nextBox.getUly()); } } 
curBox.setText(curText.toString()); out.add(curBox); out.forEach(b -> b.setPage(null)); return out; } /** * Convenience implementation to format document text that is available as a {@link String}. * *

Wraps the {@link String} in a {@link IterableCharSequence} implementation and calls {@link * #format(Passage[], IterableCharSequence)} * * @param passages in the the document text that contain highlighted text * @param content of the OCR field, implemented as an {@link IterableCharSequence} * @return the parsed snippet representation of the passages */ @Override public Object format(Passage[] passages, String content) { OcrSnippet[] snips = this.format(passages, IterableCharSequence.fromString(content)); return Arrays.stream(snips).map(OcrSnippet::getText).toArray(String[]::new); } /** * Truncate float to a precision of two digits after the decimal point. * *

Intended to keep the plugin response small and tidy. */ private static float truncateFloat(float num) { return (float) Math.floor(num * 10000) / 10000; } protected static class PassageMatch { public int start; public int end; public PassageMatch(int start, int end) { this.start = start; this.end = end; } public boolean overlaps(PassageMatch other) { int s1 = this.start; int e1 = this.end; int s2 = other.start; int e2 = other.end; return (s1 <= s2 && s2 <= e1) || // -------- // ----- (s1 <= e2 && e2 <= e1) || // -------- // ----- (s2 <= s1 && s1 <= e2 && // -------- s2 <= e1 && e1 <= e2); // --- } public void merge(PassageMatch other) { if (this.end < other.end) { this.end = other.end; } else if (this.start > other.start) { this.start = other.start; } } @Override public String toString() { return String.format("PassageMatch{start=%d, end=%d}", start, end); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy