All downloads are free. The search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.model.OcrFormat Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!
package de.digitalcollections.solrocr.model;

import com.google.common.collect.Range;
import com.google.common.collect.Sets;
import de.digitalcollections.solrocr.formats.OcrParser;
import de.digitalcollections.solrocr.iter.BreakLocator;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.lucene.OcrPassageFormatter;
import de.digitalcollections.solrocr.lucene.filters.OcrCharFilter;
import de.digitalcollections.solrocr.reader.PeekingReader;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Set;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.search.uhighlight.PassageFormatter;

/**
 * Provides access to format-specific {@link BreakIterator} and {@link OcrPassageFormatter}
 * instances.
 */
public interface OcrFormat {
  /**
   * Get a {@link BreakLocator} that splits the content on a given block type.
   *
   * @param text the text in which the breaks are to be located
   * @param blockTypes the type(s) of {@link OcrBlock} that the input document is split on
   * @return the {@link BreakLocator} instance
   */
  BreakLocator getBreakLocator(IterableCharSequence text, OcrBlock... blockTypes);

  /**
   * Get the parser for the format.
   *
   * @param input the input reader to parse {@link OcrBox}es from
   * @param features Desired features for the parsers
   * @return a parser instance configured with the requested parsing features
   */
  OcrParser getParser(Reader input, OcrParser.ParsingFeature... features);

  /**
   * Parse an {@link OcrPage} from a string fragment of the page markup.
   *
   * <p>Implementers are safe to assume that {@code pageFragment} begins with the opening tag of a
   * page, as determined by the format's {@link OcrFormat#getBreakLocator(IterableCharSequence,
   * OcrBlock[])} output for the {@link OcrBlock#PAGE} block type.
   *
   * @param pageFragment The beginning of a page's markup, i.e. a String starting with {@code
   *     <$pageElem}
   * @return the parsed {@link OcrPage}
   */
  OcrPage parsePageFragment(String pageFragment);

  /**
   * Get a {@link PassageFormatter} that builds OCR snippets from passages.
   *
   * <p>This default implementation constructs an {@link OcrPassageFormatter} from the given
   * arguments and this format instance.
   *
   * @param preHighlightTag the tag to put in the snippet text before a highlighted region, e.g.
   *     {@code <em>}
   * @param postHighlightTag the tag to put in the snippet text after a highlighted region, e.g.
   *     {@code </em>}
   * @param absoluteHighlights whether the coordinates for highlights should be absolute, i.e.
   *     relative to the page and not the containing snippet
   * @param alignSpans whether the spans in the text and image should match precisely. If false, the
   *     text spans will be more precise than the image "spans", since the latter are restricted to
   *     the granularity of the OCR document.
   * @param trackPages forwarded unchanged to the {@link OcrPassageFormatter} constructor;
   *     NOTE(review): presumably controls whether the formatter keeps track of the pages a passage
   *     is located on -- confirm against {@link OcrPassageFormatter}
   * @return the {@link OcrPassageFormatter} for this format
   */
  default OcrPassageFormatter getPassageFormatter(
      String preHighlightTag,
      String postHighlightTag,
      boolean absoluteHighlights,
      boolean alignSpans,
      boolean trackPages) {
    return new OcrPassageFormatter(
        preHighlightTag, postHighlightTag, absoluteHighlights, alignSpans, trackPages, this);
  }

  /**
   * Get a {@link CharFilter} implementation for the OCR format that outputs plaintext.
   *
   * <p>The parser backing the filter is always asked for the {@code TEXT} and {@code OFFSETS}
   * features; the {@code ALTERNATIVES} feature is additionally requested when {@code
   * expandAlternatives} is set. If the filter supports outputting alternatives, it must output
   * the alternatives in that case.
   *
   * @param input Input reader for OCR markup
   * @param expandAlternatives whether outputting alternatives from the OCR markup is desired.
   * @return a {@link CharFilter} implementation that outputs plaintext from the OCR.
   */
  default Reader filter(PeekingReader input, boolean expandAlternatives) {
    // The element type must be spelled out: with a raw Set, toArray(T[]) is erased to Object[],
    // which cannot be passed to the ParsingFeature varargs of getParser.
    Set<OcrParser.ParsingFeature> features =
        Sets.newHashSet(OcrParser.ParsingFeature.TEXT, OcrParser.ParsingFeature.OFFSETS);
    if (expandAlternatives) {
      features.add(OcrParser.ParsingFeature.ALTERNATIVES);
    }
    return new OcrCharFilter(getParser(input, features.toArray(new OcrParser.ParsingFeature[0])));
  }

  /**
   * Check if the string chunk contains data formatted according to the implementing format.
   *
   * @param ocrChunk a chunk of a file's content
   * @return whether the chunk is formatted according to the implementing format.
   */
  boolean hasFormat(String ocrChunk);

  /**
   * NOTE(review): undocumented in the original source. Judging by the name only, this presumably
   * returns the index in {@code content} at which the last piece of content starts -- confirm
   * against the implementations.
   *
   * @param content a string of OCR markup
   * @return an index into {@code content}
   */
  int getLastContentStartIdx(String content);

  /**
   * NOTE(review): undocumented in the original source. Judging by the name only, this presumably
   * returns the index in {@code content} at which the first piece of content ends -- confirm
   * against the implementations.
   *
   * @param content a string of OCR markup
   * @return an index into {@code content}
   */
  int getFirstContentEndIdx(String content);

  /**
   * Get the range of positions contained by the word containing the given position.
   *
   * <p>This default implementation is valid for OCR formats that encode word text as character
   * nodes inside of a containing element (like hOCR and MiniOCR). For other formats, override.
   *
   * <p>The range starts right after the closest {@code '>'} at or before {@code position} (or at
   * index 0 if there is none) and ends before the first {@code '<'} after {@code position}. If no
   * {@code '<'} follows, the range extends to the end of the fragment.
   *
   * @param fragment the markup fragment containing the word
   * @param position a position inside of the word
   * @return the half-open range {@code [start, end)} of the word's text within {@code fragment}
   */
  default Range<Integer> getContainingWordLimits(String fragment, int position) {
    int start = fragment.lastIndexOf('>', position) + 1;
    int end = fragment.indexOf('<', position + 1);
    if (end < 0) {
      // No tag opens after the position. Passing -1 as the upper bound would make
      // Range.closedOpen throw, since the lower bound is never negative.
      end = fragment.length();
    }
    return Range.closedOpen(start, end);
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy