
de.digitalcollections.solrocr.lucene.byteoffset.FieldByteOffsetStrategy


Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
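
For context, here is a minimal SolrJ sketch of how a client might request OCR highlighting from this plugin. The core name and field name (ocr_text) are assumptions to adapt to your own schema; the hl.ocr.fl parameter and the ocrHighlighting response section follow the plugin's documentation:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class OcrHighlightQuery {
  public static void main(String[] args) throws Exception {
    // Assumed core and field names; adjust to your setup.
    try (HttpSolrClient client =
        new HttpSolrClient.Builder("http://localhost:8983/solr/ocr").build()) {
      SolrQuery query = new SolrQuery("ocr_text:\"black cat\"");
      query.set("hl", "true");
      // hl.ocr.fl tells the plugin which OCR fields to highlight.
      query.set("hl.ocr.fl", "ocr_text");
      QueryResponse resp = client.query(query);
      // OCR snippets (with page and region coordinates) arrive in a
      // dedicated section of the response, separate from regular highlights.
      System.out.println(resp.getResponse().get("ocrHighlighting"));
    }
  }
}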

package de.digitalcollections.solrocr.lucene.byteoffset;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.highlight.TermVectorLeafReader;
import org.apache.lucene.search.uhighlight.OverlaySingleDocTermsLeafReader;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import de.digitalcollections.solrocr.lucene.OcrHComponents;
import de.digitalcollections.solrocr.lucene.vendor.TermVectorFilteredLeafReader;

/**
 * Customization of {@link org.apache.lucene.search.uhighlight.FieldOffsetStrategy} to load byte offsets from payloads.
 *
 * A lot of this code is copied straight from the above class.
 */
public abstract class FieldByteOffsetStrategy {
  protected final OcrHComponents components;

  public FieldByteOffsetStrategy(OcrHComponents components) {
    this.components = components;
  }

  public String getField() {
    return components.getField();
  }

  public abstract OffsetSource getOffsetSource();

  public abstract ByteOffsetsEnum getByteOffsetsEnum(LeafReader leafReader, int doc) throws IOException;

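  /**
   * Collects the {@link ByteOffsetsEnum}s for {@code doc} from up to three sources:
   * position-insensitive query terms, position-sensitive spans (via the phrase helper)
   * and multi-term query automata.
   */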
  protected ByteOffsetsEnum createByteOffsetsEnumFromReader(LeafReader leafReader, int doc) throws IOException {
    final Terms termsIndex = leafReader.terms(getField());
    if (termsIndex == null) {
      return ByteOffsetsEnum.EMPTY;
    }

    final List<ByteOffsetsEnum> byteOffsetsEnums = new ArrayList<>();

    // Handle position-insensitive terms (a subset of the query terms):
    final BytesRef[] insensitiveTerms;
    final ByteOffsetPhraseHelper phraseHelper = components.getByteOffsetPhraseHelper();
    final BytesRef[] terms = components.getTerms();
    if (phraseHelper.hasPositionSensitivity()) {
      insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms();
      assert insensitiveTerms.length <= terms.length : "insensitive terms should be a subset of all terms";
    } else {
      insensitiveTerms = terms;
    }
    if (insensitiveTerms.length > 0) {
      createByteOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, byteOffsetsEnums);
    }

    // Handle spans
    if (phraseHelper.hasPositionSensitivity()) {
      phraseHelper.createByteOffsetsEnumsForSpans(leafReader, doc, byteOffsetsEnums);
    }

    // Handle automata
    if (components.getAutomata().length > 0) {
      createByteOffsetsEnumsForAutomata(termsIndex, doc, byteOffsetsEnums);
    }

    switch (byteOffsetsEnums.size()) {
      case 0: return ByteOffsetsEnum.EMPTY;
      case 1: return byteOffsetsEnums.get(0);
      default: return new ByteOffsetsEnum.MultiByteOffsetsEnum(byteOffsetsEnums);
    }
  }

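  /**
   * Seeks every source term in the terms dictionary and collects its postings for {@code doc}.
   * Postings are requested with {@link PostingsEnum#PAYLOADS}, since the byte offsets are
   * stored in the payloads rather than in the regular character offsets.
   */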
  protected void createByteOffsetsEnumsForTerms(
      BytesRef[] sourceTerms, Terms termsIndex, int doc, List<ByteOffsetsEnum> results) throws IOException {
    TermsEnum termsEnum = termsIndex.iterator(); // does not return null
    for (BytesRef term : sourceTerms) {
      if (termsEnum.seekExact(term)) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.PAYLOADS);
        if (postingsEnum == null) {
          // no payloads or positions available
          throw new IllegalArgumentException(
              "field '" + getField() + "' was indexed without payloads, cannot highlight");
        }
        if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
          results.add(new ByteOffsetsEnum.OfPostings(term, postingsEnum));
        }
      }
    }
  }

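  /**
   * Runs every term in the terms dictionary through the query automata (e.g. from wildcard
   * or regexp queries) and collects postings for all matching terms, grouped per automaton.
   */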
  protected void createByteOffsetsEnumsForAutomata(
      Terms termsIndex, int doc, List<ByteOffsetsEnum> results) throws IOException {
    final CharacterRunAutomaton[] automata = components.getAutomata();
    List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
    for (int i = 0; i < automata.length; i++) {
      automataPostings.add(new ArrayList<>());
    }

    TermsEnum termsEnum = termsIndex.iterator();
    BytesRef term;

    CharsRefBuilder refBuilder = new CharsRefBuilder();
    while ((term = termsEnum.next()) != null) {
      for (int i = 0; i < automata.length; i++) {
        CharacterRunAutomaton automaton = automata[i];
        refBuilder.copyUTF8Bytes(term);
        if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
          PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS);
          if (doc == postings.advance(doc)) {
            automataPostings.get(i).add(postings);
          }
        }
      }
    }

    for (int i = 0; i < automata.length; i++) {
      CharacterRunAutomaton automaton = automata[i];
      List<PostingsEnum> postingsEnums = automataPostings.get(i);
      if (postingsEnums.isEmpty()) {
        continue;
      }
      // Expose the automaton's toString() as the term, with the freq summed across all matching terms
      BytesRef wildcardTerm = new BytesRef(automaton.toString());
      int sumFreq = 0;
      for (PostingsEnum postingsEnum : postingsEnums) {
        sumFreq += postingsEnum.freq();
      }
      for (PostingsEnum postingsEnum : postingsEnums) {
        results.add(new ByteOffsetsEnum.OfPostings(wildcardTerm, sumFreq, postingsEnum));
      }
    }
  }

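  /** Loads byte offsets from the document's term vector instead of the index's postings lists. */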
  public static class TermVectorByteOffsetStrategy extends FieldByteOffsetStrategy {
    public TermVectorByteOffsetStrategy(OcrHComponents components) {
      super(components);
    }

    @Override
    public OffsetSource getOffsetSource() {
      return OffsetSource.TERM_VECTORS;
    }

    @Override
    public ByteOffsetsEnum getByteOffsetsEnum(LeafReader reader, int docId) throws IOException {
      Terms tvTerms = reader.getTermVector(docId, getField());
      if (tvTerms == null) {
        return ByteOffsetsEnum.EMPTY;
      }

      LeafReader singleDocReader = new TermVectorLeafReader(getField(), tvTerms);
      return createByteOffsetsEnumFromReader(
          new OverlaySingleDocTermsLeafReader(
              reader,
              singleDocReader,
              getField(),
              docId),
          docId);
    }
  }

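  /** Loads byte offsets directly from the postings lists of the index. */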
  public static class PostingsByteOffsetStrategy extends FieldByteOffsetStrategy {
    public PostingsByteOffsetStrategy(OcrHComponents components) {
      super(components);
    }

    @Override
    public OffsetSource getOffsetSource() {
      return OffsetSource.POSTINGS;
    }

    @Override
    public ByteOffsetsEnum getByteOffsetsEnum(LeafReader leafReader, int doc) throws IOException {
      return createByteOffsetsEnumFromReader(leafReader, doc);
    }
  }

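  /** Loads byte offsets from postings, restricted to the terms present in the document's term vector. */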
  public static class PostingsWithTermVectorsByteOffsetStrategy extends FieldByteOffsetStrategy {

    public PostingsWithTermVectorsByteOffsetStrategy(OcrHComponents components) {
      super(components);
    }

    @Override
    public OffsetSource getOffsetSource() {
      return OffsetSource.POSTINGS_WITH_TERM_VECTORS;
    }

    @Override
    public ByteOffsetsEnum getByteOffsetsEnum(LeafReader leafReader, int doc) throws IOException {
      Terms docTerms = leafReader.getTermVector(doc, getField());
      if (docTerms == null) {
        return ByteOffsetsEnum.EMPTY;
      }
      leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, getField());

      return createByteOffsetsEnumFromReader(leafReader, doc);
    }
  }

}
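
The automaton matching performed in createByteOffsetsEnumsForAutomata can be illustrated in isolation with plain Lucene classes. The following is a standalone sketch; the regular expression is a made-up example, not something the plugin ships:

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class AutomatonMatchSketch {
  public static void main(String[] args) {
    // A wildcard or regexp query is compiled into a deterministic automaton...
    CharacterRunAutomaton automaton =
        new CharacterRunAutomaton(new RegExp("colou?r").toAutomaton());
    // ...and every indexed term is run through it, just like
    // createByteOffsetsEnumsForAutomata does before collecting postings.
    for (String term : new String[] {"color", "colour", "collar"}) {
      System.out.println(term + " -> " + automaton.run(term));
    }
  }
}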