de.digitalcollections.solrocr.model.OcrFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
The newest version!
package de.digitalcollections.solrocr.model;

import com.google.common.collect.Range;
import com.google.common.collect.Sets;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.lucene.OcrPassageFormatter;
import de.digitalcollections.solrocr.lucene.filters.OcrCharFilter;
import de.digitalcollections.solrocr.reader.PeekingReader;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Set;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.search.uhighlight.PassageFormatter;

/**
 * Provides access to format-specific {@link BreakLocator}, {@link OcrParser} and {@link
 * OcrPassageFormatter} instances.
 */
public interface OcrFormat {
  /**
   * Get a {@link BreakLocator} that splits the content on a given block type.
   *
   * @param text the character sequence the returned locator operates on
   * @param blockTypes the type(s) of {@link OcrBlock} that the input document is split on
   * @return the {@link BreakLocator} instance
   */
  BreakLocator getBreakLocator(IterableCharSequence text, OcrBlock... blockTypes);

  /**
   * Get the parser for the format.
   *
   * @param input the input reader to parse {@link OcrBox}es from
   * @param features Desired features for the parsers
   * @return a parser instance configured with the requested parsing features
   */
  OcrParser getParser(Reader input, OcrParser.ParsingFeature... features);

  /**
   * Parse an {@link OcrPage} from a string fragment of the page markup.
   *
   * <p>Implementers are safe to assume that {@code pageFragment} begins with the opening tag of a
   * page, as determined by the format's {@link OcrFormat#getBreakLocator(IterableCharSequence,
   * OcrBlock[])} output for the {@link OcrBlock#PAGE} block type.
   *
   * @param pageFragment The beginning of a page's markup, i.e. a String starting with {@code
   *     <$pageElem}
   * @return the parsed {@link OcrPage}
   */
  OcrPage parsePageFragment(String pageFragment);

  /**
   * Get a {@link PassageFormatter} that builds OCR snippets from passages.
   *
   * <p>The default implementation forwards all arguments, together with this format, to the
   * {@link OcrPassageFormatter} constructor.
   *
   * @param preHighlightTag the tag to put in the snippet text before a highlighted region, e.g.
   *     {@code <em>}
   * @param postHighlightTag the tag to put in the snippet text after a highlighted region, e.g.
   *     {@code </em>}
   * @param absoluteHighlights whether the coordinates for highlights should be absolute, i.e.
   *     relative to the page and not the containing snippet
   * @param alignSpans whether the spans in the text and image should match precisely. If false, the
   *     text spans will be more precise than the image "spans", since the latter are restricted to
   *     the granularity of the OCR document.
   * @param trackPages passed through unchanged to the {@link OcrPassageFormatter}; NOTE(review):
   *     presumably controls whether page information is tracked for passages, confirm against
   *     {@link OcrPassageFormatter}
   * @return a formatter bound to this format
   */
  default OcrPassageFormatter getPassageFormatter(
      String preHighlightTag,
      String postHighlightTag,
      boolean absoluteHighlights,
      boolean alignSpans,
      boolean trackPages) {
    return new OcrPassageFormatter(
        preHighlightTag, postHighlightTag, absoluteHighlights, alignSpans, trackPages, this);
  }

  /**
   * Get a {@link CharFilter} implementation for the OCR format that outputs plaintext.
   *
   * <p>The default implementation wraps the format's parser in an {@link OcrCharFilter}. The
   * parser is always asked for {@link OcrParser.ParsingFeature#TEXT} and {@link
   * OcrParser.ParsingFeature#OFFSETS}; {@link OcrParser.ParsingFeature#ALTERNATIVES} is requested
   * in addition when {@code expandAlternatives} is set.
   *
   * @param input Input reader for OCR markup
   * @param expandAlternatives whether outputting alternatives from the OCR markup is desired.
   * @return a {@link CharFilter} implementation that outputs plaintext from the OCR.
   */
  default Reader filter(PeekingReader input, boolean expandAlternatives) {
    // The element type must be spelled out: on a raw Set, toArray(T[]) is erased to return
    // Object[], which cannot be passed to the ParsingFeature varargs of getParser.
    Set<OcrParser.ParsingFeature> features =
        Sets.newHashSet(OcrParser.ParsingFeature.TEXT, OcrParser.ParsingFeature.OFFSETS);
    if (expandAlternatives) {
      features.add(OcrParser.ParsingFeature.ALTERNATIVES);
    }
    return new OcrCharFilter(getParser(input, features.toArray(new OcrParser.ParsingFeature[0])));
  }

  /**
   * Check if the string chunk contains data formatted according to the implementing format.
   *
   * @param ocrChunk a chunk of a file's content
   * @return whether the chunk is formatted according to the implementing format.
   */
  boolean hasFormat(String ocrChunk);

  /**
   * NOTE(review): undocumented in the original. Judging by the name only, this presumably returns
   * the index in {@code content} at which the last piece of textual content starts; confirm
   * against the implementations before relying on this.
   *
   * @param content the markup to inspect
   * @return an index into {@code content}
   */
  int getLastContentStartIdx(String content);

  /**
   * NOTE(review): undocumented in the original. Judging by the name only, this presumably returns
   * the index in {@code content} at which the first piece of textual content ends; confirm
   * against the implementations before relying on this.
   *
   * @param content the markup to inspect
   * @return an index into {@code content}
   */
  int getFirstContentEndIdx(String content);

  /**
   * Get the range of positions contained by the word containing the given position.
   *
   * <p>This default implementation is valid for OCR formats that encode word text as character
   * nodes inside of a containing element (like hOCR and MiniOCR). For other formats, override.
   *
   * <p>The range starts right after the closest {@code '>'} at or before {@code position} (or at
   * 0 if there is none) and ends at the first {@code '<'} after {@code position}. If no {@code
   * '<'} follows, the range ends at the end of the fragment.
   *
   * @param fragment the markup fragment that contains the word
   * @param position an index into {@code fragment} that lies inside of the word's text
   * @return the half-open range {@code [start, end)} of the word's text within {@code fragment}
   */
  default Range<Integer> getContainingWordLimits(String fragment, int position) {
    int start = fragment.lastIndexOf('>', position) + 1;
    int end = fragment.indexOf('<', position + 1);
    if (end < 0) {
      // No tag follows the position: indexOf yields -1, which would make Range.closedOpen throw
      // because the lower bound exceeds the upper one. Treat the fragment end as the word end.
      end = fragment.length();
    }
    return Range.closedOpen(start, end);
  }
}