/*
 * de.digitalcollections.solrocr.formats.hocr.HocrParser
 *
 * Part of solr-ocrhighlighting: a Solr plugin to add support for highlighting directly from
 * various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
 */
package de.digitalcollections.solrocr.formats.hocr;

import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.lang3.StringUtils;
import org.codehaus.stax2.XMLStreamReader2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Streaming hOCR parser that emits one {@link OcrBox} per {@code <span class="ocrx_word">}.
 *
 * <p>Which parts of a word get populated (text, coordinates, confidence, page, alternatives,
 * highlights, offsets) is controlled by the {@code ParsingFeature} flags passed to {@link
 * #readNext}. Words that are joined by a soft hyphen (U+00AD) are read together so that both parts
 * can be annotated with the dehyphenated form before they are handed out.
 */
public class HocrParser extends OcrParser {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Soft hyphen (U+00AD); a word ending in it, or followed by it, is treated as hyphenated. */
  private static final String SOFT_HYPHEN = "\u00ad";

  /** Set once the XML stream has been read to its end without finding another word. */
  private boolean noMoreWords;

  /** Most recently seen {@code ocr_page}; only updated when page tracking is requested. */
  private OcrPage currentPage;

  /**
   * Boxes that were already parsed while resolving a hyphenation and still have to be returned, in
   * document order. Allocated on first use.
   */
  private Deque<OcrBox> pendingBoxes;

  public HocrParser(Reader input, ParsingFeature... features) throws XMLStreamException {
    super(input, features);
  }

  /**
   * Reads the next word box from the stream.
   *
   * @param xmlReader reader positioned anywhere in the hOCR document
   * @param features parts of the word that should be parsed
   * @return the next box, or {@code null} if the document has no more words
   * @throws XMLStreamException if the underlying XML cannot be read
   */
  @Override
  protected OcrBox readNext(XMLStreamReader2 xmlReader, Set<ParsingFeature> features)
      throws XMLStreamException {
    // For hyphenated tokens, we parse all parts first and then output them one after the other
    if (pendingBoxes != null && !pendingBoxes.isEmpty()) {
      return pendingBoxes.pollFirst();
    }

    if (noMoreWords) {
      return null;
    }

    boolean withText = features.contains(ParsingFeature.TEXT);
    boolean trackPages = features.contains(ParsingFeature.PAGES);

    // Advance parser to the next word if necessary
    if (xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT
        || !"span".equals(xmlReader.getLocalName())
        || !"ocrx_word".equals(xmlReader.getAttributeValue("", "class"))) {
      this.seekToNextWord(xmlReader, trackPages);
    }
    if (xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) {
      // No words in this document, nothing to do
      return null;
    }

    OcrBox box = new OcrBox();
    // The title has to be read before parseText moves the reader past the start tag
    Map<String, String> props = parseTitle(xmlReader.getAttributeValue("", "title"));
    if (withText) {
      this.parseText(
          xmlReader,
          box,
          features.contains(ParsingFeature.HIGHLIGHTS),
          features.contains(ParsingFeature.OFFSETS),
          features.contains(ParsingFeature.ALTERNATIVES));
    }
    if (features.contains(ParsingFeature.COORDINATES)) {
      this.parseCoordinates(box, props.get("bbox"));
    }
    if (features.contains(ParsingFeature.CONFIDENCE) && props.containsKey("x_wconf")) {
      box.setConfidence(Double.parseDouble(props.get("x_wconf")));
    }
    if (trackPages && this.currentPage != null) {
      box.setPage(this.currentPage);
    }

    String trailingChars = this.seekToNextWord(xmlReader, trackPages);
    if (withText && !trailingChars.isEmpty()) {
      box.setTrailingChars(trailingChars);
    }

    boolean isHyphenated = false;
    String boxText = box.getText();
    if (boxText != null && boxText.replace(END_HL, "").endsWith(SOFT_HYPHEN)) {
      isHyphenated = true;
      box.setText(boxText.replace(SOFT_HYPHEN, ""));
      // Preliminary hyphenation info, no dehyphenated form available yet
      box.setHyphenInfo(true, null);
    } else if (trailingChars.startsWith(SOFT_HYPHEN)) {
      isHyphenated = true;
    }
    if (isHyphenated) {
      box.setTrailingChars(null);
      // The nested call may itself queue boxes if the continuation is hyphenated as well
      OcrBox hyphenEnd = this.readNext(xmlReader, features);
      if (hyphenEnd != null) {
        String startText = box.getText();
        String endText = hyphenEnd.getText();
        // Without text (e.g. TEXT not requested) there is no dehyphenated form to build
        String dehyphenated =
            (startText != null && endText != null) ? startText + endText : null;
        box.setHyphenInfo(true, dehyphenated);
        hyphenEnd.setHyphenInfo(false, dehyphenated);
        if (pendingBoxes == null) {
          pendingBoxes = new ArrayDeque<>();
        }
        // Goes in front of anything the nested call queued, to keep document order
        pendingBoxes.addFirst(hyphenEnd);
      } else {
        // No hyphen end, add hyphen character if needed and strip hyphenation info
        if (box.getText() != null && !box.getText().endsWith("-")) {
          box.setText(box.getText() + "-");
        }
        box.setHyphenInfo(null, null);
      }
    }

    // Boxes without text or coordinates (if either is requested with a feature flag) are ignored
    // since they break things downstream. Skip the current box and continue with next one.
    boolean ignoreBox =
        (withText && (box.getText() == null || box.getText().isEmpty()))
            || (features.contains(ParsingFeature.COORDINATES)
                && (box.getLrx() < 0 && box.getLry() < 0 && box.getUlx() < 0 && box.getUly() < 0));
    if (ignoreBox) {
      box = this.readNext(xmlReader, features);
    }
    return box;
  }

  /**
   * Splits an hOCR {@code title} attribute such as {@code "bbox 0 0 10 20; x_wconf 95"} into a
   * map from property name to (trimmed) property value.
   *
   * <p>Empty fragments (e.g. after a trailing {@code ;}) and properties without a value are
   * skipped instead of failing.
   *
   * @param title raw attribute value, may be {@code null}
   * @return mutable map of the properties found; empty if {@code title} is {@code null}
   */
  private Map<String, String> parseTitle(String title) {
    Map<String, String> props = new HashMap<>();
    if (title == null) {
      return props;
    }
    for (String part : title.split(";")) {
      String prop = part.trim();
      if (prop.isEmpty()) {
        continue;
      }
      // The first run of whitespace separates the property name from its value
      String[] nameAndValue = prop.split("\\s+", 2);
      if (nameAndValue.length < 2) {
        continue;
      }
      props.put(nameAndValue[0], nameAndValue[1]);
    }
    return props;
  }

  /**
   * Applies the values of a {@code bbox} property ("ulx uly lrx lry") to the box.
   *
   * <p>A missing or blank value leaves the box untouched; an incomplete one sets the coordinates
   * that are present and logs a warning.
   *
   * @param box box to update
   * @param bboxStr value of the {@code bbox} property, may be {@code null}
   * @throws NumberFormatException if a coordinate is not an integer
   */
  private void parseCoordinates(OcrBox box, String bboxStr) {
    if (StringUtils.isBlank(bboxStr)) {
      log.debug("Word has no bbox property, leaving its coordinates untouched.");
      return;
    }
    String[] parts = bboxStr.trim().split("\\s+");
    if (parts.length > 0) {
      box.setUlx(Integer.parseInt(parts[0]));
    }
    if (parts.length > 1) {
      box.setUly(Integer.parseInt(parts[1]));
    }
    if (parts.length > 2) {
      box.setLrx(Integer.parseInt(parts[2]));
    }
    if (parts.length > 3) {
      box.setLry(Integer.parseInt(parts[3]));
    } else {
      log.warn("bbox attribute '{}' is incomplete.", bboxStr);
    }
  }

  /**
   * Reads the content of the current {@code ocrx_word} span into the box.
   *
   * <p>The word text is the first text node encountered, or the content of an {@code <ins>}
   * element if there is one. {@code <del>} elements are recorded as alternatives when requested.
   * The method returns once the end of the word has been reached.
   *
   * @param xmlReader reader positioned on the start tag of the word
   * @param box box to populate
   * @param withHighlights whether highlight spans should be tracked
   * @param withOffsets whether character offsets should be recorded
   * @param withAlternatives whether {@code <del>} alternatives should be recorded
   * @throws XMLStreamException if the underlying XML cannot be read
   * @throws IllegalStateException if an {@code <ins>} or a recorded {@code <del>} element has
   *     anything but a single text node as its content
   */
  private void parseText(
      XMLStreamReader2 xmlReader,
      OcrBox box,
      boolean withHighlights,
      boolean withOffsets,
      boolean withAlternatives)
      throws XMLStreamException {
    String txt = null;
    int txtOffset = -1;
    boolean inAlternatives = false;
    while (xmlReader.hasNext()) {
      int nextEvent = xmlReader.next();
      if (nextEvent == XMLStreamConstants.CHARACTERS && txt == null) {
        if (withOffsets) {
          txtOffset = Math.toIntExact(xmlReader.getLocationInfo().getStartingCharOffset());
        }
        txt = xmlReader.getText();
        continue;
      } else if (nextEvent == XMLStreamConstants.END_ELEMENT) {
        if (inAlternatives) {
          // End of the alternatives span, the end of the word is still to come
          inAlternatives = false;
          continue;
        }
        // We assume that we're dealing with valid hOCR, and in this case this is the event for the
        // end of the ocrx_word span, i.e. we have all the text we needed from the box and can
        // terminate and return
        box.setText(txt);
        if (withOffsets) {
          box.setTextOffset(txtOffset);
        }
        if (txt != null && txt.replace(END_HL, "").endsWith(SOFT_HYPHEN)) {
          // Preliminary hyphenation info
          box.setHyphenInfo(true, null);
        }
        // Make sure we don't overwrite highlight spans tracked from alternatives
        if (withHighlights && box.getHighlightSpan() == null) {
          box.setHighlightSpan(this.trackHighlightSpan(txt, box));
        }
        return;
      } else if (nextEvent != XMLStreamConstants.START_ELEMENT) {
        // Nothing of interest
        continue;
      }
      // We're on a START_ELEMENT event now
      String tag = xmlReader.getLocalName();
      if ("span".equals(tag) && "alternatives".equals(xmlReader.getAttributeValue("", "class"))) {
        inAlternatives = true;
        continue;
      }
      if ("ins".equals(tag)) {
        requireNext(xmlReader, XMLStreamConstants.CHARACTERS, tag);
        if (withOffsets) {
          txtOffset = Math.toIntExact(xmlReader.getLocationInfo().getStartingCharOffset());
        }
        txt = xmlReader.getText();
        requireNext(xmlReader, XMLStreamConstants.END_ELEMENT, tag);
      } else if (withAlternatives && "del".equals(tag)) {
        requireNext(xmlReader, XMLStreamConstants.CHARACTERS, tag);
        String altText = xmlReader.getText();
        Integer altOffset =
            withOffsets
                ? Math.toIntExact(xmlReader.getLocationInfo().getStartingCharOffset())
                : null;
        if (withHighlights && box.getHighlightSpan() == null) {
          box.setHighlightSpan(this.trackHighlightSpan(altText, box));
        }
        box.addAlternative(altText, altOffset);
        requireNext(xmlReader, XMLStreamConstants.END_ELEMENT, tag);
      } else if (inAlternatives && "del".equals(tag)) {
        // Alternatives are not requested: consume the element completely, so that its end tag is
        // not mistaken for the end of the alternatives span or of the word, which would make the
        // text of later alternatives leak into the trailing characters of this word.
        skipElement(xmlReader);
      }
    }
  }

  /** Advances the reader by one event and fails if it is not the expected one. */
  private static void requireNext(XMLStreamReader2 xmlReader, int expectedEvent, String tag)
      throws XMLStreamException {
    if (xmlReader.next() != expectedEvent) {
      throw new IllegalStateException(
          "<" + tag + "> elements must have a text node as its sole child");
    }
  }

  /** Moves a reader positioned on a start tag to the matching end tag. */
  private static void skipElement(XMLStreamReader2 xmlReader) throws XMLStreamException {
    int depth = 1;
    while (depth > 0 && xmlReader.hasNext()) {
      int event = xmlReader.next();
      if (event == XMLStreamConstants.START_ELEMENT) {
        depth++;
      } else if (event == XMLStreamConstants.END_ELEMENT) {
        depth--;
      }
    }
  }

  /**
   * Advances the reader to the start tag of the next {@code ocrx_word} span and collects the text
   * found on the way.
   *
   * <p>Runs of blank text and line starts are collapsed into single spaces. If no further word
   * exists, the reader ends up at the end of the stream and {@link #noMoreWords} is set.
   *
   * @param xmlReader reader to advance
   * @param trackPages whether {@code ocr_page} elements should update {@link #currentPage}
   * @return the characters between the previous position and the next word, possibly empty
   * @throws XMLStreamException if the underlying XML cannot be read
   */
  private String seekToNextWord(XMLStreamReader2 xmlReader, boolean trackPages)
      throws XMLStreamException {
    boolean foundWord = false;
    StringBuilder trailingChars = new StringBuilder();
    while (xmlReader.hasNext()) {
      int nextEvent = xmlReader.next();
      if (nextEvent == XMLStreamConstants.START_ELEMENT) {
        String localName = xmlReader.getLocalName();
        String hocrClass = xmlReader.getAttributeValue("", "class");
        if ("span".equals(localName) && "ocrx_word".equals(hocrClass)) {
          foundWord = true;
          break;
        } else if ("span".equals(localName)
            && "ocr_line".equals(hocrClass)
            && trailingChars.lastIndexOf(" ") < 0) {
          // Line breaks result in a trailing whitespace character
          trailingChars.append(' ');
        } else if (trackPages && "div".equals(localName) && "ocr_page".equals(hocrClass)) {
          // Page break
          Map<String, String> pageProps =
              this.parseTitle(xmlReader.getAttributeValue("", "title"));
          Dimension pageDims = null;
          String pageBbox = pageProps.get("bbox");
          if (pageBbox != null) {
            String[] bboxParts = pageBbox.trim().split("\\s+");
            if (bboxParts.length >= 4) {
              pageDims =
                  new Dimension(Integer.parseInt(bboxParts[2]), Integer.parseInt(bboxParts[3]));
            } else {
              log.warn("bbox attribute '{}' is incomplete.", pageBbox);
            }
          }
          // Prefer the element id, then fall back to the title properties
          String pageId = xmlReader.getAttributeValue("", "id");
          if (pageId == null) {
            pageId = pageProps.get("x_source");
          }
          if (pageId == null) {
            pageId = pageProps.get("ppageno");
          }
          this.currentPage = new OcrPage(pageId, pageDims);
        }
      } else if (nextEvent == XMLStreamConstants.CHARACTERS
          || nextEvent == XMLStreamConstants.SPACE) {
        String txt = xmlReader.getText();
        boolean isBlank = StringUtils.isBlank(txt);
        if (isBlank
            && (trailingChars.length() == 0
                || trailingChars.lastIndexOf(" ") != (trailingChars.length() - 1))) {
          // Collapse blank text into one space, unless the buffer already ends with one
          trailingChars.append(' ');
        } else if (!isBlank) {
          trailingChars.append(txt);
        }
      }
    }
    noMoreWords = !foundWord;
    return trailingChars.toString();
  }
}