All downloads are free. Search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.formats.OcrParser Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!
package de.digitalcollections.solrocr.formats;

import com.ctc.wstx.api.WstxInputProperties;
import com.ctc.wstx.exc.WstxLazyException;
import com.ctc.wstx.stax.WstxInputFactory;
import com.google.common.collect.ImmutableMap;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.reader.PeekingReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.UUID;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLStreamReader2;

/** Base class for OCR parsers operating on XML markup */
public abstract class OcrParser implements Iterator, Iterable {

  /** Set of features that can be turned on/off depending on the downstream needs */
  public enum ParsingFeature {
    /** Parse text, i.e. "default", alternatives and hyphenated forms */
    TEXT,
    /** Parse offsets for texts */
    OFFSETS,
    /** Parse coordinates */
    COORDINATES,
    /** Track highlight spans during parsing */
    HIGHLIGHTS,
    /** Parse confidence (0..1) if available */
    CONFIDENCE,
    /** Parse alternatives */
    ALTERNATIVES,
    /** Track page changes */
    PAGES,
  }

  // Named XML character entities that are used in hOCR
  public static final ImmutableMap ENTITIES =
      ImmutableMap.builder()
          .put("shy", "\u00ad")
          .put("nbsp", "\u00a0")
          .put("ensp", "\u2002")
          .put("emsp", "\u2003")
          .put("thinsp", "\u2009")
          .put("zwnj", "\u200c")
          .put("zwj", "\u200d")
          .build();
  public static final String START_HL = "\uD83D\uDD25"; // 🔥
  public static final String END_HL = "\uD83E\uDDEF"; // 🧯

  private static final WstxInputFactory xmlInputFactory = new WstxInputFactory();

  protected final PeekingReader input;
  protected UUID currentHighlightSpan;
  protected boolean terminateHighlightSpanAfterNext = false;

  private final XMLStreamReader2 xmlReader;
  private final Set features = new HashSet<>();

  private OcrBox nextWord;

  public OcrParser(Reader input, ParsingFeature... features) throws XMLStreamException {
    if (input instanceof PeekingReader) {
      this.input = (PeekingReader) input;
    } else {
      this.input = new PeekingReader(input, 2048, 16384);
    }
    if (features.length == 0) {
      features =
          new ParsingFeature[] {
            ParsingFeature.TEXT,
            ParsingFeature.OFFSETS,
            ParsingFeature.COORDINATES,
            ParsingFeature.HIGHLIGHTS,
            ParsingFeature.CONFIDENCE,
            ParsingFeature.ALTERNATIVES,
            ParsingFeature.PAGES
          };
    }
    this.features.addAll(Arrays.asList(features));

    // Woodstax sometimes splits long text nodes, this option forces it to merge them together
    // before passing them to us
    xmlInputFactory.getConfig().doCoalesceText(true);
    // This parsing mode allows us to read multiple "concatenated" XML documents in a single pass
    xmlInputFactory.getConfig().setInputParsingMode(WstxInputProperties.PARSING_MODE_DOCUMENTS);
    // Ignore DTDs since they cause lookups to external URLs
    xmlInputFactory.getConfig().doSupportDTDs(false);
    // Register custom named entities used by hOCR
    xmlInputFactory.getConfig().setCustomInternalEntities(ENTITIES);
    // Fallback for unknown undeclared entities: just output them verbatim
    xmlInputFactory
        .getConfig()
        .setUndeclaredEntityResolver(
            (publicID, systemID, baseURI, namespace) -> String.format("&%s;", namespace));
    this.xmlReader = (XMLStreamReader2) xmlInputFactory.createXMLStreamReader(this.input);
    try {
      this.nextWord = this.readNext(this.xmlReader, this.features);
    } catch (XMLStreamException e) {
      throw new RuntimeException(
          String.format(
              "Failed to parse the OCR markup, make sure your files are well-formed and your regions start/end on "
                  + "complete tags! (Source was: %s)",
              this.input.getSource().orElse("[unknown]")),
          e);
    }
  }

  @Override
  public Iterator iterator() {
    return this;
  }

  public Stream stream() {
    return StreamSupport.stream(
        Spliterators.spliteratorUnknownSize(this, Spliterator.ORDERED), false);
  }

  @Override
  public boolean hasNext() {
    return this.nextWord != null;
  }

  @Override
  public OcrBox next() {
    if (!hasNext()) {
      throw new IllegalStateException("No more words in input");
    }
    OcrBox out = this.nextWord;
    try {
      do {
        this.nextWord = readNext(xmlReader, features);
      } while (hasNext() && this.nextWord == null);
    } catch (XMLStreamException | WstxLazyException e) {
      throw new RuntimeException(
          String.format(
              "Failed to parse the OCR markup, make sure your files are well-formed and your regions start/end on "
                  + "complete tags! (Source was: %s)",
              this.input.getSource().orElse("[unknown]")),
          e);
    }
    return out;
  }

  /**
   * "Peek" at the next word from the parse without advancing the parse to the word after it (i.e.
   * calling this does not influence the result of the `next()` call *
   */
  public Optional peek() {
    if (!hasNext()) {
      return Optional.empty();
    }
    return Optional.of(this.nextWord);
  }

  /**
   * Keep track of highlighted box spans encountered during parsing.
   *
   * 

Implements should always call this method when they encounter OCR text, since it might * contain highlighting markers that we need to track. * *

Returns the identifier of the box's highlighting span, if present, else null. */ protected UUID trackHighlightSpan(String text, OcrBox box) { if (this.currentHighlightSpan == null && text.contains(OcrParser.START_HL)) { this.currentHighlightSpan = UUID.randomUUID(); } if (this.currentHighlightSpan != null && (terminateHighlightSpanAfterNext || text.contains(OcrParser.END_HL))) { // Highlight spans that end on the start of a hyphenation should stretch as far as the end of // the hyphenation, // so we toggle a flag to delay the termination if (terminateHighlightSpanAfterNext) { terminateHighlightSpanAfterNext = false; } else if (box.isHyphenated() && box.isHyphenStart() && text.contains(OcrParser.END_HL)) { terminateHighlightSpanAfterNext = true; return this.currentHighlightSpan; } UUID out = this.currentHighlightSpan; this.currentHighlightSpan = null; return out; } return this.currentHighlightSpan; } /** Get the underlying peeking input reader. */ public PeekingReader getInput() { return input; } /** * Read the next OCR box in the input stream. * *

Implementers should take care to enable/disable various parsing steps depending on the set * of features passed in. */ protected abstract OcrBox readNext(XMLStreamReader2 xmlReader, Set features) throws XMLStreamException; /** * Helper method to convert a list of OCR boxes to a text string. * *

Includes smart handling of partial hyphenations as well as handling of alternative tokens * that are at the end and/or beginning of a highlighted span. In these cases the highlighted * alternative will be used in the output string instead of the default form of the box. This is * only possible if the alternative is at the beginning or end, since we otherwise don't have any * information available to us if the default form or an alternative matched. */ public static String boxesToString(List boxes) { StringBuilder sb = new StringBuilder(); int idx = 0; Iterator it = boxes.iterator(); while (it.hasNext()) { OcrBox b = it.next(); if (b.isHyphenated() && b.isHyphenStart()) { boolean wordIsCompleteHyphenation = (idx < boxes.size() - 1 && boxes.get(idx + 1).isHyphenated() && !boxes.get(idx + 1).isHyphenStart()); if (wordIsCompleteHyphenation) { // Both parts of the hyphenation are present, put the dehyphenated form in the text OcrBox next = it.next(); sb.append(next.getDehyphenatedForm()); b.setTrailingChars(next.getTrailingChars()); idx += 1; } else { // An isolated hyphen start without its corresponding ending, denote the hyphenation // explicitly String text = b.getText().trim(); if (!text.endsWith("-")) { text += "-"; } sb.append(text); } } else if (!b.getAlternatives().isEmpty()) { Optional alternativeWithHighlight = b.getAlternatives().stream() .filter(a -> a.contains(START_HL) || a.contains(END_HL)) .findFirst(); // If the highlight is on an alternative, output that alternative instead of the default // token if (alternativeWithHighlight.isPresent()) { sb.append(alternativeWithHighlight.get()); } else { sb.append(b.getText()); } } else { sb.append(b.getText()); } if (b.getTrailingChars() != null) { sb.append(b.getTrailingChars()); } idx += 1; } return sb.toString().trim(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy