de.digitalcollections.solrocr.formats.OcrPassageFormatter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.formats;

import com.google.common.collect.ImmutableSet;
import com.google.common.io.CharStreams;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import de.digitalcollections.solrocr.lucene.fieldloader.PathFieldLoader;
import de.digitalcollections.solrocr.util.IterableCharSequence;
import de.digitalcollections.solrocr.util.IterableCharSequence.OffsetType;
import de.digitalcollections.solrocr.util.OcrBox;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Takes care of formatting fragments of the OCR format into {@link OcrSnippet} instances.
 */
public abstract class OcrPassageFormatter extends PassageFormatter {
  private static final Pattern LAST_INNER_TAG_PAT = Pattern.compile("[a-zA-Z0-9] mergeMatches(int numMatches, int[] matchStarts, int[] matchEnds) {
    // NOTE(review): the line above looks damaged by text extraction -- the regex literal, whatever
    // was declared after it and the modifiers/return type of mergeMatches are fused into one line.
    // Restore it from the upstream source before relying on this file.
    //
    // Purpose of the body: collapse overlapping matches so that every region is represented by a
    // single PassageMatch.
    //
    // Pair up the parallel start/end arrays. The input order is kept as-is; nothing is sorted here
    // despite the variable name.
    Deque sortedMatches = IntStream.range(0, numMatches)
        .mapToObj(idx -> new PassageMatch(matchStarts[idx], matchEnds[idx]))
        .collect(Collectors.toCollection(ArrayDeque::new));
    Deque mergedMatches = new ArrayDeque<>();
    // Seed the result with the first match. removeFirst() throws NoSuchElementException on an empty
    // deque, i.e. when numMatches is 0; the caller visible in this file only calls with numMatches > 0.
    mergedMatches.add(sortedMatches.removeFirst());
    while (!sortedMatches.isEmpty()) {
      PassageMatch candidate = sortedMatches.removeFirst();
      // Only the most recently added match is compared against the candidate.
      if (mergedMatches.peekLast().overlaps(candidate)) {
        // Grows the last merged match in place.
        mergedMatches.peekLast().merge(candidate);
      } else {
        mergedMatches.add(candidate);
      }
    }
    // Copy into a list, preserving the deque order.
    return new ArrayList<>(mergedMatches);
  }

  /**
   * Turn every passage into an {@link OcrSnippet}.
   *
   * <p>A passage whose formatting fails with an {@link IndexOutOfBoundsException} is logged as an
   * error and its slot in the result stays {@code null}; the remaining passages are still processed.
   *
   * @param passages passages in the document text that contain highlighted text
   * @param content content of the OCR field, implemented as an {@link IterableCharSequence}
   * @return one entry per passage, in passage order; an entry is {@code null} if no snippet was
   *         produced for that passage
   */
  public OcrSnippet[] format(Passage[] passages, IterableCharSequence content) {
    final OcrSnippet[] result = new OcrSnippet[passages.length];
    int nextSlot = 0;
    for (Passage current : passages) {
      final int slot = nextSlot++;
      try {
        result[slot] = format(current, content);
      } catch (IndexOutOfBoundsException e) {
        final String baseMsg = String.format(
            "Could not create snippet (start=%d, end=%d) from content at '%s' due to an out-of-bounds error.",
            current.getStartOffset(), current.getEndOffset(), content.getIdentifier());
        // The hint differs slightly depending on how the offsets are interpreted.
        final String hint = content.getOffsetType() == OffsetType.BYTES
            ? "\nDoes the file on disk correspond to the document that was used for determining the offsets during indexing?"
            : "\nDoes the file on disk correspond to the document that was used during indexing?";
        logger.error(baseMsg + hint, e);
      }
    }
    return result;
  }

  /**
   * Format a single passage into an {@link OcrSnippet}.
   *
   * <p>The passage is cut out of {@code content}, every (merged) match is wrapped in the
   * highlighting start/end tags, the start page is determined and the resulting fragment is
   * parsed into a snippet that carries the passage's score.
   *
   * @param passage passage to format; its offsets are resolved against {@code content}
   * @param content content of the OCR field
   * @return the snippet, or {@code null} if {@link #parseFragment(String, String)} returned none
   */
  private OcrSnippet format(Passage passage, IterableCharSequence content) {
    final int passageStart = passage.getStartOffset();
    StringBuilder sb = new StringBuilder(content.subSequence(passageStart, passage.getEndOffset()));
    // Number of characters inserted so far (highlighting tags); shifts all later positions.
    int extraChars = 0;
    if (passage.getNumMatches() > 0) {
      List<PassageMatch> matches =
          mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
      for (PassageMatch match : matches) {
        // Match offsets are in the offset unit of `content`; positions inside `sb` are characters.
        // Convert by measuring the character length of the text between passage start and offset.
        int matchStart = content.subSequence(passageStart, match.start).toString().length();
        sb.insert(extraChars + matchStart, startHlTag);
        extraChars += startHlTag.length();
        int matchEnd = content.subSequence(passageStart, match.end).toString().length();
        String matchText = sb.substring(extraChars + matchStart, extraChars + matchEnd);
        if (matchText.trim().endsWith(">")) {
          // Set the end of the match to the position before the last inner closing tag inside of the match.
          Matcher m = LAST_INNER_TAG_PAT.matcher(matchText);
          int idx = -1;
          while (m.find()) {
            idx = m.start() + 1;
          }
          if (idx > -1) {
            // `idx` is a character index relative to the start of the match text. The previous
            // computation subtracted it from `match.end - match.start`, which is expressed in the
            // offset unit of `content` and is too large when one character spans several offset units.
            matchEnd = matchStart + idx;
          }
        }
        // Clamp so the end tag is appended at the latest at the end of the fragment.
        sb.insert(Math.min(extraChars + matchEnd, sb.length()), endHlTag);
        extraChars += endHlTag.length();
      }
    }
    String xmlFragment = sb.toString();
    String pageId = determineStartPage(xmlFragment, passageStart, content);
    OcrSnippet snip = parseFragment(xmlFragment, pageId);
    if (snip != null) {
      snip.setScore(passage.getScore());
    }
    return snip;
  }

  /**
   * Helper method to get plaintext from XML/HTML-like fragments.
   *
   * <p>The fragment is run through an {@link HTMLStripCharFilter} that receives the name of the
   * highlighting start tag (the tag without its first and last character) as its escaped-tag set,
   * XML entities are unescaped, line feeds are removed, whitespace runs are collapsed to a single
   * space and the result is trimmed.
   *
   * @param xmlFragment markup fragment to extract the text from
   * @return the extracted text, or {@code xmlFragment} unchanged if reading from the filter fails
   */
  protected String getTextFromXml(String xmlFragment) {
    StringReader fragmentReader = new StringReader(xmlFragment);
    String hlTagName = startHlTag.substring(1, startHlTag.length() - 1);
    HTMLStripCharFilter tagStripper = new HTMLStripCharFilter(fragmentReader, ImmutableSet.of(hlTagName));

    String stripped;
    try {
      stripped = CharStreams.toString(tagStripper);
    } catch (IOException e) {
      // Best effort: hand back the input as-is when the filter cannot be read.
      return xmlFragment;
    }

    String unescaped = StringEscapeUtils.unescapeXml(stripped);
    String withoutLineFeeds = unescaped.replaceAll("\n", "");
    String singleSpaced = withoutLineFeeds.replaceAll("\\s+", " ");
    return singleSpaced.trim();
  }

  /**
   * Determine the id of the page an OCR fragment resides on.
   *
   * @param ocrFragment OCR markup fragment; the caller in this class passes the passage text with
   *                    the highlighting tags already inserted
   * @param startOffset offset of the fragment's start within {@code content}; the caller in this
   *                    class passes the start offset of the passage
   * @param content the OCR content the fragment was taken from
   * @return the page id; the caller in this class hands it on to
   *         {@link #parseFragment(String, String)}
   */
  public abstract String determineStartPage(String ocrFragment, int startOffset, IterableCharSequence content);

  /**
   * Parse an {@link OcrSnippet} from an OCR fragment.
   *
   * <p>The word boxes of the fragment are grouped by page to compute one snippet region per page,
   * and consecutive highlighted word boxes are collected into highlight spans.
   *
   * @param ocrFragment OCR markup of the snippet, including the highlighting tags
   * @param pageId id of the page the fragment starts on, passed through to
   *               {@link #parseWords(String, String)}
   * @return the snippet, or {@code null} if the fragment contains no word boxes
   */
  protected OcrSnippet parseFragment(String ocrFragment, String pageId) {
    List<OcrBox> allBoxes = this.parseWords(ocrFragment, pageId);
    if (allBoxes.isEmpty()) {
      return null;
    }

    // LinkedHashMap keeps the pages in the order in which they first occur in the fragment.
    Map<String, List<OcrBox>> grouped = allBoxes.stream().collect(Collectors.groupingBy(
        OcrBox::getPageId, LinkedHashMap::new, Collectors.toList()));

    // Get highlighted spans: every run of consecutive highlighted boxes becomes one span.
    List<List<OcrBox>> hlBoxes = new ArrayList<>();
    List<OcrBox> currentHl = null;
    for (OcrBox wordBox : allBoxes) {
      if (wordBox.isHighlight()) {
        if (currentHl == null) {
          currentHl = new ArrayList<>();
        }
        currentHl.add(wordBox);
      } else if (currentHl != null) {
        // A non-highlighted box terminates the current span.
        hlBoxes.add(currentHl);
        currentHl = null;
      }
    }
    if (currentHl != null) {
      // The fragment ended inside a highlighted span.
      hlBoxes.add(currentHl);
    }

    List<OcrBox> snippetRegions = grouped.entrySet().stream()
        .map(e -> determineSnippetRegion(e.getValue(), e.getKey()))
        .collect(Collectors.toList());

    OcrSnippet snip = new OcrSnippet(getTextFromXml(ocrFragment), snippetRegions);
    this.addHighlightsToSnippet(hlBoxes, snip);
    return snip;
  }

  /**
   * Compute the bounding box that encloses all given word boxes.
   *
   * @param wordBoxes word boxes of a single page; must not be empty, otherwise the
   *                  {@code Optional.get()} calls below throw
   * @param pageId id of the page the boxes are on, set on the returned box
   * @return a box without text and without the highlight flag, spanning from the smallest upper
   *         left to the largest lower right coordinate of the word boxes
   */
  private OcrBox determineSnippetRegion(List<OcrBox> wordBoxes, String pageId) {
    float snipUlx = wordBoxes.stream().map(OcrBox::getUlx).min(Float::compareTo).get();
    float snipUly = wordBoxes.stream().map(OcrBox::getUly).min(Float::compareTo).get();
    float snipLrx = wordBoxes.stream().map(OcrBox::getLrx).max(Float::compareTo).get();
    float snipLry = wordBoxes.stream().map(OcrBox::getLry).max(Float::compareTo).get();
    return new OcrBox(null, pageId, snipUlx, snipUly, snipLrx, snipLry, false);
  }


  /**
   * Parse word boxes from an OCR fragment.
   *
   * @param ocrFragment OCR markup to parse; {@link #parseFragment(String, String)} passes the
   *                    fragment with the highlighting tags inserted
   * @param startPage id of the page the fragment starts on
   * @return the word boxes of the fragment; {@link #parseFragment(String, String)} treats the
   *         elements as {@link OcrBox} instances, requires a non-null list and interprets an empty
   *         list as "no snippet"
   */
  // NOTE(review): the raw List return type looks like a lost type argument (presumably OcrBox) --
  // confirm against the upstream source.
  protected abstract List parseWords(String ocrFragment, String startPage);

  protected void addHighlightsToSnippet(List> hlBoxes, OcrSnippet snippet) {
    for (OcrBox region : snippet.getSnippetRegions()) {
      final float xOffset = this.absoluteHighlights ? 0 : region.getUlx();
      final float yOffset = this.absoluteHighlights ? 0 : region.getUly();
      hlBoxes.stream()
          .map(bs -> bs.stream()
              .filter(b -> b.getPageId().equals(region.getPageId()))
              .map(b -> new OcrBox(b.getText(), b.getPageId(), b.getUlx() - xOffset, b.getUly() - yOffset,
                                   b.getLrx() - xOffset, b.getLry() - yOffset, b.isHighlight()))
              .collect(Collectors.toList()))
          .forEach(bs -> snippet.addHighlightRegion(this.mergeBoxes(bs)));
    }
  }


  /**
   * Merge adjacent OCR boxes into a single one, taking line breaks into account.
   *
   * <p>Boxes are combined until a box is found whose upper edge differs from the current merged
   * box's upper edge by more than 75% of the current box's height; that box starts a new merged
   * box. The text of merged boxes is joined with single spaces.
   *
   * <p>The first box of every line is modified in place (text and coordinates) and reused as the
   * merged box.
   *
   * @param boxes boxes in reading order
   * @return {@code boxes} itself if it has fewer than two elements, otherwise a new list with one
   *         merged box per detected line
   */
  protected List<OcrBox> mergeBoxes(List<OcrBox> boxes) {
    if (boxes.size() < 2) {
      return boxes;
    }
    List<OcrBox> out = new ArrayList<>();
    Iterator<OcrBox> it = boxes.iterator();
    OcrBox curBox = it.next();
    StringBuilder curText = new StringBuilder(curBox.getText());
    // Combine word boxes into a single new OCR box until we hit a linebreak
    while (it.hasNext()) {
      OcrBox nextBox = it.next();
      // We consider a box on a new line if its vertical distance from the current box is close to the line height
      float lineHeight = curBox.getLry() - curBox.getUly();
      float yDiff = Math.abs(nextBox.getUly() - curBox.getUly());
      if (yDiff > (0.75 * lineHeight)) {
        // Line break: finish the current merged box and start over with the next box.
        curBox.setText(curText.toString());
        out.add(curBox);
        curBox = nextBox;
        curText = new StringBuilder(curBox.getText());
        continue;
      }
      curText.append(" ");
      curText.append(nextBox.getText());
      // Grow the merged box to the right, downwards and upwards as needed.
      if (nextBox.getLrx() > curBox.getLrx()) {
        curBox.setLrx(nextBox.getLrx());
      }
      if (nextBox.getLry() > curBox.getLry()) {
        curBox.setLry(nextBox.getLry());
      }
      if (nextBox.getUly() < curBox.getUly()) {
        curBox.setUly(nextBox.getUly());
      }
    }
    // Flush the box of the last line.
    curBox.setText(curText.toString());
    out.add(curBox);
    return out;
  }

  /**
   * Convenience implementation to format document text that is available as a {@link String}.
   *
   * <p>Wraps the {@link String} in a {@link IterableCharSequence} implementation and calls
   * {@link #format(Passage[], IterableCharSequence)}.
   *
   * @param passages passages in the document text that contain highlighted text
   * @param content content of the OCR field as a plain {@link String}
   * @return a {@code String[]} with the text of each snippet, in passage order; an entry is
   *         {@code null} if no snippet could be created for the corresponding passage
   */
  @Override
  public Object format(Passage[] passages, String content) {
    OcrSnippet[] snips = this.format(passages, IterableCharSequence.fromString(content));
    return Arrays.stream(snips)
        // The delegate leaves slots empty for passages that failed or yielded no word boxes;
        // pass those through as null instead of failing with a NullPointerException.
        .map(snip -> snip == null ? null : snip.getText())
        .toArray(String[]::new);
  }

  /** A match inside a passage, described by its start and end offset; mutable so it can be merged. */
  private static class PassageMatch {
    public int start;
    public int end;

    /**
     * @param start start offset of the match
     * @param end end offset of the match
     */
    public PassageMatch(int start, int end) {
      this.start = start;
      this.end = end;
    }

    /**
     * Check whether this match and {@code other} share at least one offset. Matches that merely
     * touch (the end of one equals the start of the other) count as overlapping.
     *
     * @param other match to compare against
     * @return {@code true} if the two matches overlap or touch
     */
    public boolean overlaps(PassageMatch other) {
      int s1 = this.start;
      int e1 = this.end;
      int s2 = other.start;
      int e2 = other.end;
      // The other match starts inside this one.
      boolean otherStartsInside = s1 <= s2 && s2 <= e1;
      // The other match ends inside this one.
      boolean otherEndsInside = s1 <= e2 && e2 <= e1;
      // This match lies completely inside the other one.
      boolean enclosedByOther = s2 <= s1 && s1 <= e2 && s2 <= e1 && e1 <= e2;
      return otherStartsInside || otherEndsInside || enclosedByOther;
    }

    /**
     * Grow this match so that it also covers {@code other}.
     *
     * <p>Both boundaries are checked independently: a match that extends beyond this one on both
     * sides has to widen the start as well as the end.
     *
     * @param other match to absorb; not modified
     */
    public void merge(PassageMatch other) {
      if (other.start < this.start) {
        this.start = other.start;
      }
      if (other.end > this.end) {
        this.end = other.end;
      }
    }

    @Override
    public String toString() {
      return String.format("PassageMatch{start=%d, end=%d}", start, end);
    }
  }
}