/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Provides a base class for analysis-based offset strategies to extend from. Requires an Analyzer
 * and provides an overridable method for altering how the TokenStream is created.
 *
 * @lucene.internal
 */
public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {

  protected final Analyzer analyzer;

  public AnalysisOffsetStrategy(UHComponents components, Analyzer analyzer) {
    super(components);
    this.analyzer = analyzer;
    if (analyzer.getOffsetGap(getField()) != 1) { // note: 1 is the default. It is RARELY changed.
      throw new IllegalArgumentException(
          "offset gap of the provided analyzer should be 1 (field " + getField() + ")");
    }
  }
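  // Hedged note (not in the original source) on why the offset gap must be 1: tokenStream()
  // below joins multiple values with a single MULTIVAL_SEP_CHAR, so MultiValueTokenStream can
  // re-base each sub-stream's offsets by simply adding startValIdx. Hypothetical illustration:
  //   values ["ab", "cd"] are analyzed as "ab" + sep + "cd"  (length 5);
  //   token "cd" of the second value has local offsets [0,2), re-based to [3,5),
  //   because startValIdx == 3 == end of "ab" + exactly 1 separator char.
  // Any other offset gap would make these re-based offsets disagree with what was indexed.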

  @Override
  public final UnifiedHighlighter.OffsetSource getOffsetSource() {
    return UnifiedHighlighter.OffsetSource.ANALYSIS;
  }
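  // Sketch of the intended extension point (hypothetical subclass name; real subclasses in
  // Lucene include MemoryIndexOffsetStrategy and TokenStreamOffsetStrategy):
  //
  //   class MyOffsetStrategy extends AnalysisOffsetStrategy {
  //     MyOffsetStrategy(UHComponents components, Analyzer analyzer) {
  //       super(components, analyzer); // throws if the analyzer's offset gap != 1
  //     }
  //     // ... produce offsets by consuming tokenStream(content), and/or override
  //     // tokenStream(String) below to alter how the TokenStream is created ...
  //   }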

  protected TokenStream tokenStream(String content) throws IOException {
    // If there is no splitChar in content then we needn't wrap:
    int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
    if (splitCharIdx == -1) {
      return analyzer.tokenStream(getField(), content);
    }

    TokenStream subTokenStream =
        analyzer.tokenStream(getField(), content.substring(0, splitCharIdx));

    return new MultiValueTokenStream(
        subTokenStream,
        getField(),
        analyzer,
        content,
        UnifiedHighlighter.MULTIVAL_SEP_CHAR,
        splitCharIdx);
  }
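  // Hedged usage sketch (hypothetical values, not from this file): callers pass the field's
  // values pre-joined with UnifiedHighlighter.MULTIVAL_SEP_CHAR, and the returned stream's
  // offsets index into that joined string:
  //
  //   String content = "first value" + UnifiedHighlighter.MULTIVAL_SEP_CHAR + "second value";
  //   try (TokenStream ts = tokenStream(content)) {
  //     ts.reset();
  //     while (ts.incrementToken()) {
  //       // OffsetAttribute start/end here are relative to `content` as a whole
  //     }
  //     ts.end();
  //   }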

  /**
   * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a
   * specified character. This exposes a TokenStream that matches what would get indexed considering
   * the {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link
   * Analyzer#getOffsetGap(String)} is 1; an exception will be thrown if it isn't.
   *
   * <p>It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer
   * but doing so seems like more work. The underlying components see a Reader not a String -- and
   * the String is easy to split up without redundant buffering.
   *
   * @lucene.internal
   */
  // TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze
  // each value into the MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum
  // could incorporate this logic, albeit with less code, less hack.
  private static final class MultiValueTokenStream extends TokenFilter {

    private final String fieldName;
    private final Analyzer indexAnalyzer;
    private final String content;
    private final char splitChar;

    private final PositionIncrementAttribute posIncAtt =
        addAttribute(PositionIncrementAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private int startValIdx = 0;
    private int endValIdx;
    private int remainingPosInc = 0;

    private MultiValueTokenStream(
        TokenStream subTokenStream,
        String fieldName,
        Analyzer indexAnalyzer,
        String content,
        char splitChar,
        int splitCharIdx) {
      super(subTokenStream); // subTokenStream is already initialized to operate on the first value
      this.fieldName = fieldName;
      this.indexAnalyzer = indexAnalyzer;
      this.content = content;
      this.splitChar = splitChar;
      this.endValIdx = splitCharIdx;
    }

    @Override
    public void reset() throws IOException {
      if (startValIdx != 0) {
        throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
        // ... although we could if a need for it arises.
      }
      super.reset();
    }

    @Override
    public boolean incrementToken() throws IOException {
      while (true) {

        if (input.incrementToken()) {
          // Position tracking:
          if (remainingPosInc > 0) { // usually true for first token of additional values (not first val)
            posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
            remainingPosInc = 0; // reset
          }
          // Offset tracking:
          offsetAtt.setOffset(
              startValIdx + offsetAtt.startOffset(), startValIdx + offsetAtt.endOffset());
          return true;
        }

        if (endValIdx == content.length()) { // no more
          return false;
        }

        input.end(); // might adjust position increment
        remainingPosInc += posIncAtt.getPositionIncrement();
        input.close();
        remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);

        // Get new tokenStream based on next segment divided by the splitChar
        startValIdx = endValIdx + 1;
        endValIdx = content.indexOf(splitChar, startValIdx);
        if (endValIdx == -1) { // EOF
          endValIdx = content.length();
        }
        TokenStream tokenStream =
            indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
        if (tokenStream != input) { // (input is defined in TokenFilter, set in the constructor)
          // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going
          // to produce the very same tokenStream instance and thus have the same AttributeSource
          // as this wrapping TokenStream since we used it as our input in the constructor.
          // Were this not the case, we'd have to copy every attribute of interest since we can't
          // alter the AttributeSource of this wrapping TokenStream post-construction (it's all
          // private/final). If this is a problem, we could do that instead; maybe with a custom
          // CharTermAttribute that allows us to easily set the char[] reference without literally
          // copying char by char.
          throw new IllegalStateException(
              "Require TokenStream re-use. Unsupported re-use strategy?: "
                  + indexAnalyzer.getReuseStrategy());
        }
        tokenStream.reset();
      } // while loop to increment token of this new value
    }

    @Override
    public void end() throws IOException {
      super.end();
      // Offset tracking:
      offsetAtt.setOffset(
          startValIdx + offsetAtt.startOffset(), startValIdx + offsetAtt.endOffset());
    }
  }
}
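// Worked trace of the re-basing above (hypothetical analyzer that emits one token per value and
// reports a position increment gap of 100; not part of the original source):
//   content = "ab" + splitChar + "cd"                        // length 5
//   value 1: token "ab", offsets [0,2), posInc 1
//   sub-stream exhausted -> end() contributes any trailing posInc (0 here), then close();
//                           remainingPosInc += 0 + 100 (the position increment gap)
//   value 2: startValIdx = 3; the analyzer's re-used stream emits "cd" at local offsets [0,2),
//            re-based to [3,5); its posInc is bumped by remainingPosInc, which is then reset.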