org.apache.lucene.search.uhighlight.Passage Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-highlighter Show documentation
Apache Lucene (module: highlighter)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Represents a passage (typically a sentence of the document).
 *
 * <p>A passage contains {@link #getNumMatches} highlights from the query, and the offsets and query
 * terms that correspond with each match.
 *
 * @lucene.experimental
 */
public class Passage {
  /** Initial capacity of each of the parallel per-match arrays. */
  private static final int INITIAL_MATCH_CAPACITY = 8;

  // Passage bounds; -1 means "not set" (the initial state and the state after reset()).
  private int startOffset = -1;
  private int endOffset = -1;
  private float score = 0.0f;

  // Parallel arrays: index i describes the i-th recorded match. Only the first numMatches
  // entries are meaningful; the four arrays are always grown together to the same length.
  private int[] matchStarts = new int[INITIAL_MATCH_CAPACITY];
  private int[] matchEnds = new int[INITIAL_MATCH_CAPACITY];
  private BytesRef[] matchTerms = new BytesRef[INITIAL_MATCH_CAPACITY];
  private int[] matchTermFreqInDoc = new int[INITIAL_MATCH_CAPACITY];
  private int numMatches = 0;

  /**
   * Appends one match to this passage, growing the internal arrays when they are full.
   *
   * @param startOffset start offset of the match; asserted to lie within the current passage
   *     bounds
   * @param endOffset end offset of the match
   * @param term term recorded for the match; stored by reference (not copied)
   * @param termFreqInDoc value later exposed through {@link #getMatchTermFreqsInDoc()}
   * @lucene.internal
   */
  public void addMatch(int startOffset, int endOffset, BytesRef term, int termFreqInDoc) {
    assert startOffset >= this.startOffset && startOffset <= this.endOffset;
    if (numMatches == matchStarts.length) {
      growMatchArrays(ArrayUtil.oversize(numMatches + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
    }
    // All four parallel arrays must share one length, otherwise the stores below could overflow.
    assert matchStarts.length == matchEnds.length
        && matchEnds.length == matchTerms.length
        && matchTerms.length == matchTermFreqInDoc.length;
    matchStarts[numMatches] = startOffset;
    matchEnds[numMatches] = endOffset;
    matchTerms[numMatches] = term;
    matchTermFreqInDoc[numMatches] = termFreqInDoc;
    numMatches++;
  }

  /** Reallocates all four parallel match arrays to {@code newLength}, keeping existing matches. */
  private void growMatchArrays(int newLength) {
    int[] newMatchStarts = new int[newLength];
    int[] newMatchEnds = new int[newLength];
    BytesRef[] newMatchTerms = new BytesRef[newLength];
    int[] newMatchTermFreqInDoc = new int[newLength];
    System.arraycopy(matchStarts, 0, newMatchStarts, 0, numMatches);
    System.arraycopy(matchEnds, 0, newMatchEnds, 0, numMatches);
    System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
    System.arraycopy(matchTermFreqInDoc, 0, newMatchTermFreqInDoc, 0, numMatches);
    matchStarts = newMatchStarts;
    matchEnds = newMatchEnds;
    matchTerms = newMatchTerms;
    matchTermFreqInDoc = newMatchTermFreqInDoc;
  }

  /**
   * Returns this passage to its initial state: offsets become -1, the score becomes 0 and the
   * match count becomes 0. The match arrays themselves are kept (not cleared or shrunk), so
   * entries from earlier use remain in them but are no longer counted as valid.
   *
   * @lucene.internal
   */
  public void reset() {
    startOffset = endOffset = -1;
    score = 0.0f;
    numMatches = 0;
  }

  /**
   * For debugging. ex: Passage[0-22]{yin[0-3],yang[4-8],yin[10-13]}score=2.4964213
   *
   * <p>Match offsets are printed relative to the passage start. A match whose term is {@code
   * null} is printed as {@code null} rather than causing a failure.
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    buf.append("Passage[").append(startOffset).append('-').append(endOffset).append(']');
    buf.append('{');
    for (int i = 0; i < numMatches; i++) {
      if (i != 0) {
        buf.append(',');
      }
      BytesRef term = matchTerms[i];
      // A debugging aid must not throw: tolerate a null term instead of dereferencing it.
      buf.append(term == null ? "null" : term.utf8ToString());
      buf.append('[')
          .append(matchStarts[i] - startOffset)
          .append('-')
          .append(matchEnds[i] - startOffset)
          .append(']');
    }
    buf.append('}');
    buf.append("score=").append(score);
    return buf.toString();
  }

  /**
   * Start offset of this passage.
   *
   * @return start index (inclusive) of the passage in the original content: &gt;= 0 once it has
   *     been set, or -1 for a new or {@link #reset() reset} passage.
   */
  public int getStartOffset() {
    return startOffset;
  }

  /**
   * End offset of this passage.
   *
   * @return end index (exclusive) of the passage in the original content: expected to be &gt;=
   *     {@link #getStartOffset()}, or -1 for a new or {@link #reset() reset} passage.
   */
  public int getEndOffset() {
    return endOffset;
  }

  /**
   * Length of this passage.
   *
   * @return {@link #getEndOffset()} minus {@link #getStartOffset()}
   */
  public int getLength() {
    return endOffset - startOffset;
  }

  /** Passage's score. */
  public float getScore() {
    return score;
  }

  /**
   * Sets the passage's score.
   *
   * @param score the value subsequently returned by {@link #getScore()}
   */
  public void setScore(float score) {
    this.score = score;
  }

  /**
   * Number of term matches available in {@link #getMatchStarts}, {@link #getMatchEnds}, {@link
   * #getMatchTerms}
   */
  public int getNumMatches() {
    return numMatches;
  }

  /**
   * Start offsets of the term matches, in increasing order.
   *
   * <p>Only {@link #getNumMatches} are valid. Note that these offsets are absolute (not relative
   * to {@link #getStartOffset()}).
   */
  public int[] getMatchStarts() {
    return matchStarts;
  }

  /**
   * End offsets of the term matches, corresponding with {@link #getMatchStarts}.
   *
   * <p>Only {@link #getNumMatches} are valid. Note that it's possible that an end offset could
   * extend beyond the bounds of the passage ({@link #getEndOffset()}), if the Analyzer produced a
   * term which spans a passage boundary.
   */
  public int[] getMatchEnds() {
    return matchEnds;
  }

  /**
   * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}. The primary
   * purpose of this method is to expose the number of unique terms per passage for use in passage
   * scoring. The actual term byte content is not well defined by this highlighter, and thus use of
   * it is more subject to change.
   *
   * <p>The term might be simply the analyzed term at this position. Depending on the highlighter's
   * configuration, the match term may be a phrase (instead of a word), and in such a case might be
   * a series of space-separated analyzed terms. If the match is from a {@link
   * org.apache.lucene.search.MultiTermQuery} then the match term may be the toString() of that
   * query.
   *
   * <p>Only {@link #getNumMatches()} are valid.
   */
  public BytesRef[] getMatchTerms() {
    return matchTerms;
  }

  /**
   * The {@code termFreqInDoc} values passed to {@link #addMatch}, corresponding with {@link
   * #getMatchStarts()}.
   *
   * <p>Only {@link #getNumMatches()} are valid.
   */
  public int[] getMatchTermFreqsInDoc() {
    return matchTermFreqInDoc;
  }

  /**
   * Sets the start offset of this passage.
   *
   * @param startOffset the value subsequently returned by {@link #getStartOffset()}
   * @lucene.internal
   */
  public void setStartOffset(int startOffset) {
    this.startOffset = startOffset;
  }

  /**
   * Sets the end offset of this passage. When assertions are enabled, the new end offset is
   * checked to be no smaller than the current start offset.
   *
   * @param endOffset the value subsequently returned by {@link #getEndOffset()}
   * @lucene.internal
   */
  public void setEndOffset(int endOffset) {
    assert startOffset <= endOffset;
    this.endOffset = endOffset;
  }
}