All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.highlight.TokenSources Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Created on 28-Oct-2004
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;

/**
 * Convenience methods for obtaining a {@link TokenStream} for use with the {@link Highlighter} - can obtain from
 * term vectors with offsets and positions or from an Analyzer re-parsing the stored content.
 *
 * @see TokenStreamFromTermVector
 */
public class TokenSources {

  /** Value of {@code maxStartOffset} that disables the start-offset limit. */
  private static final int NO_MAX_START_OFFSET = -1;

  private TokenSources() {}

  /**
   * Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.
   *
   * <p>WARNING: Don't call this if there is more than one value for this field.  If there are, and if there are term
   * vectors, then there is a single tokenstream with offsets suggesting all the field values were concatenated.
   *
   * @param field The field to either get term vectors from or to analyze the text from.
   * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
   *                 be re-used for the same document (e.g. when highlighting multiple fields).
   * @param text the text to analyze, failing term vector un-inversion
   * @param analyzer the analyzer to analyze {@code text} with, failing term vector un-inversion
   * @param maxStartOffset Terms with a startOffset greater than this aren't returned.  Use -1 for no limit.
   *                       Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
   *
   * @return a token stream from either term vectors, or from analyzing the text. Never null.
   * @throws IOException if there is a low-level I/O error reading the term vector
   */
  public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
                                           int maxStartOffset) throws IOException {
    final TokenStream fromTermVector = getTermVectorTokenStreamOrNull(field, tvFields, maxStartOffset);
    if (fromTermVector != null) {
      return fromTermVector;
    }

    final TokenStream analyzed = analyzer.tokenStream(field, text);
    // The last possible start offset is text.length() - 1, so a limit at or beyond that can never
    // exclude a token and the filter would be pure overhead.
    if (maxStartOffset >= 0 && maxStartOffset < text.length() - 1) {
      return new LimitTokenOffsetFilter(analyzed, maxStartOffset);
    }
    return analyzed;
  }

  /**
   * Get a token stream by un-inverting the term vector. This method returns null if {@code tvFields} is null
   * or if the field has no term vector, or if the term vector doesn't have offsets.  Positions are recommended on the
   * term vector but it isn't strictly required.
   *
   * @param field The field to get term vectors from.
   * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
   *                 be re-used for the same document (e.g. when highlighting multiple fields).
   * @param maxStartOffset Terms with a startOffset greater than this aren't returned.  Use -1 for no limit.
   *                       Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1
   * @return a token stream from term vectors. Null if no term vectors with the right options.
   * @throws IOException if there is a low-level I/O error reading the term vector
   */
  public static TokenStream getTermVectorTokenStreamOrNull(String field, Fields tvFields, int maxStartOffset)
      throws IOException {
    if (tvFields == null) {
      return null;
    }
    final Terms tvTerms = tvFields.terms(field);
    if (tvTerms == null || !tvTerms.hasOffsets()) {
      return null;
    }
    return new TokenStreamFromTermVector(tvTerms, maxStartOffset);
  }

  /**
   * A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
   * specified docId, then, falls back to using the passed in
   * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
   * This is useful when you already have the document, but would prefer to use
   * the vector first.
   *
   * <p>The fallback is taken when the document has no term vectors, when the field has no term
   * vector, or when the field's term vector was indexed without offsets.
   *
   * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
   *        and get the vector from
   * @param docId The docId to retrieve.
   * @param field The field to retrieve on the document
   * @param document The document to fall back on
   * @param analyzer The analyzer to use for creating the TokenStream if the
   *        vector doesn't exist
   * @return The {@link org.apache.lucene.analysis.TokenStream} for the
   *         {@link org.apache.lucene.index.IndexableField} on the
   *         {@link org.apache.lucene.document.Document}
   * @throws IOException if there was an error loading
   * @throws IllegalArgumentException if the fallback is needed but {@code field} has no stored
   *         value in {@code document}
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTokenStream(String, Fields, String, Analyzer, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
      String field, Document document, Analyzer analyzer) throws IOException {
    final TokenStream fromTermVector = termVectorTokenStreamOrNull(reader, docId, field);
    if (fromTermVector != null) {
      return fromTermVector;
    }
    // No usable token info stored so fall back to analyzing raw content
    return getTokenStream(document, field, analyzer);
  }

  /**
   * A convenience method that tries a number of approaches to getting a token
   * stream. The cost of finding there are no termVectors in the index is
   * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
   * approach to coding is probably acceptable
   *
   * <p>A term vector with offsets is used when available; otherwise the stored document is loaded
   * from {@code reader} and the stored value of {@code field} is analyzed.
   *
   * @param reader the {@link IndexReader} to read term vectors and, if needed, the stored document from
   * @param docId the document to obtain a token stream for
   * @param field the field to obtain a token stream for
   * @param analyzer the analyzer to use if no term vector with offsets is available
   * @return a token stream for the field; never null
   * @throws IOException If there is a low-level I/O error
   * @throws IllegalArgumentException if the fallback is needed but {@code field} has no stored
   *         value in the document
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTokenStream(String, Fields, String, Analyzer, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
      String field, Analyzer analyzer) throws IOException {
    final TokenStream fromTermVector = termVectorTokenStreamOrNull(reader, docId, field);
    if (fromTermVector != null) {
      return fromTermVector;
    }
    // No usable token info stored so fall back to analyzing raw content
    return getTokenStream(reader, docId, field, analyzer);
  }

  /**
   * Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now.
   *
   * @param vector the term vector to un-invert; must have offsets
   * @param tokenPositionsGuaranteedContiguous ignored
   * @return a token stream built from {@code vector}
   * @throws IOException If there is a low-level I/O error
   * @throws IllegalArgumentException if no offsets are available
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTermVectorTokenStreamOrNull(String, Fields, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStream(Terms vector,
                                           boolean tokenPositionsGuaranteedContiguous) throws IOException {
    return getTokenStream(vector);
  }

  /**
   * Returns a token stream generated from a {@link Terms}. This
   * can be used to feed the highlighter with a pre-parsed token
   * stream.  The {@link Terms} must have offsets available. If there are no positions available,
   * all tokens will have position increments reflecting adjacent tokens, or coincident when terms
   * share a start offset. If there are stopwords filtered from the index, you probably want to ensure
   * term vectors have positions so that phrase queries won't match across stopwords.
   *
   * @param tpv the term vector to un-invert; must have offsets
   * @return a token stream built from {@code tpv}, with no start-offset limit
   * @throws IOException If there is a low-level I/O error
   * @throws IllegalArgumentException if no offsets are available
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTermVectorTokenStreamOrNull(String, Fields, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStream(final Terms tpv) throws IOException {
    // TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
    // highlighters require offsets, so we insist here.
    if (!tpv.hasOffsets()) {
      throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
    }

    // TODO propagate maxStartOffset; see LUCENE-6445
    return new TokenStreamFromTermVector(tpv, NO_MAX_START_OFFSET);
  }

  /**
   * Returns a {@link TokenStream} with positions and offsets constructed from
   * field termvectors.  If the field has no termvectors or offsets
   * are not included in the termvector, return null.  See {@link #getTokenStream(org.apache.lucene.index.Terms)}
   * for an explanation of what happens when positions aren't present.
   *
   * @param reader the {@link IndexReader} to retrieve term vectors from
   * @param docId the document to retrieve termvectors for
   * @param field the field to retrieve termvectors for
   * @return a {@link TokenStream}, or null if offsets are not available
   * @throws IOException If there is a low-level I/O error
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTermVectorTokenStreamOrNull(String, Fields, int)}.
   *
   * @see #getTokenStream(org.apache.lucene.index.Terms)
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
                                                      String field) throws IOException {
    return termVectorTokenStreamOrNull(reader, docId, field);
  }

  /**
   * Loads the document {@code docId} from {@code reader} and analyzes the stored value of {@code field}.
   *
   * @param reader the {@link IndexReader} to load the stored document from
   * @param docId the document to load
   * @param field the stored field whose value is analyzed
   * @param analyzer the analyzer to create the token stream with
   * @return the token stream produced by {@code analyzer} for the stored field value
   * @throws IOException If there is a low-level I/O error
   * @throws IllegalArgumentException if {@code field} has no stored value in the document
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTokenStream(String, Fields, String, Analyzer, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStream(IndexReader reader, int docId,
      String field, Analyzer analyzer) throws IOException {
    Document doc = reader.document(docId);
    return getTokenStream(doc, field, analyzer);
  }

  /**
   * Analyzes the stored value of {@code field} in {@code doc}.
   *
   * @param doc the document holding the stored field value
   * @param field the stored field whose value is analyzed
   * @param analyzer the analyzer to create the token stream with
   * @return the token stream produced by {@code analyzer} for the stored field value
   * @throws IllegalArgumentException if {@code doc.get(field)} returns null
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); prefer
   *             {@link #getTokenStream(String, Fields, String, Analyzer, int)}.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStream(Document doc, String field,
      Analyzer analyzer) {
    String contents = doc.get(field);
    if (contents == null) {
      throw new IllegalArgumentException("Field " + field
          + " in document is not stored and cannot be analyzed");
    }
    return getTokenStream(field, contents, analyzer);
  }

  /**
   * Simply calls {@link Analyzer#tokenStream(String, String)}.
   *
   * @param field the field name passed to the analyzer
   * @param contents the text to analyze
   * @param analyzer the analyzer to create the token stream with
   * @return the token stream produced by {@code analyzer}
   * @deprecated Kept for backward compatibility only (maintenance reasons, LUCENE-6445); call
   *             {@link Analyzer#tokenStream(String, String)} directly.
   */
  @Deprecated // maintenance reasons LUCENE-6445
  public static TokenStream getTokenStream(String field, String contents,
      Analyzer analyzer) {
    return analyzer.tokenStream(field, contents);
  }

  /**
   * Shared term-vector lookup for the reader-based legacy methods: fetches the term vectors of
   * {@code docId} and un-inverts the one for {@code field} with no start-offset limit. Returns
   * null when the document has no term vectors, the field has none, or they lack offsets.
   */
  private static TokenStream termVectorTokenStreamOrNull(IndexReader reader, int docId, String field)
      throws IOException {
    return getTermVectorTokenStreamOrNull(field, reader.getTermVectors(docId), NO_MAX_START_OFFSET);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy