org.apache.lucene.search.suggest.document.SuggestField Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-suggest Show documentation
Apache Lucene (module: suggest)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;

/**
 * Field that indexes a string value and a weight as a weighted completion against a named
 * suggester. Field is tokenized, not stored and stores documents, frequencies and positions. Field
 * can be used to provide near real time document suggestions.
 *
 * <p>Besides the usual {@link org.apache.lucene.analysis.Analyzer}s, {@link CompletionAnalyzer} can
 * be used to tune suggest field only parameters (e.g. preserving token separators, preserving
 * position increments when converting the token stream to an automaton)
 *
 * <p>Example indexing usage:
 *
 * <pre class="prettyprint">
 * document.add(new SuggestField(name, "suggestion", 4));
 * </pre>
 *
 * <p>To perform document suggestions based on this field, use {@link
 * SuggestIndexSearcher#suggest(CompletionQuery, int, boolean)}
 *
 * @lucene.experimental
 */
public class SuggestField extends Field {

  /** Default field type for suggest field */
  public static final FieldType FIELD_TYPE = new FieldType();

  static {
    // Tokenized, not stored, no term vectors, norms kept.
    FIELD_TYPE.setTokenized(true);
    FIELD_TYPE.setStored(false);
    FIELD_TYPE.setStoreTermVectors(false);
    FIELD_TYPE.setOmitNorms(false);
    // Index documents, term frequencies and positions.
    FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    FIELD_TYPE.freeze();
  }

  /** Type byte reported by {@link #type()} unless a subclass overrides it. */
  static final byte TYPE = 0;

  /** Bytes of the original value as passed to the constructor; written into the payload. */
  private final BytesRef surfaceForm;

  /** Non-negative weight of the suggestion; written into the payload as {@code weight + 1}. */
  private final int weight;

  /**
   * Creates a {@link SuggestField}
   *
   * @param name field name
   * @param value field value to get suggestions on
   * @param weight field weight
   * @throws IllegalArgumentException if either the name or value is null, if value is an empty
   *     string, if the weight is negative, if value contains any reserved characters
   */
  public SuggestField(String name, String value, int weight) {
    super(name, value, FIELD_TYPE);
    if (weight < 0) {
      throw new IllegalArgumentException("weight must be >= 0");
    }
    if (value.isEmpty()) {
      throw new IllegalArgumentException("value must have a length > 0");
    }
    // Reject the first reserved character found, reporting its code unit and position.
    final int valueLength = value.length();
    for (int pos = 0; pos < valueLength; pos++) {
      final char current = value.charAt(pos);
      if (!isReserved(current)) {
        continue;
      }
      throw new IllegalArgumentException(
          "Illegal input ["
              + value
              + "] UTF-16 codepoint [0x"
              + Integer.toHexString((int) current)
              + "] at position "
              + pos
              + " is a reserved character");
    }
    this.weight = weight;
    this.surfaceForm = new BytesRef(value);
  }

  /**
   * Returns the token stream produced by the superclass, wrapped via {@link
   * #wrapTokenStream(TokenStream)} and with the suggest payload of this field set on it.
   */
  @Override
  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
    TokenStream analyzed = super.tokenStream(analyzer, reuse);
    CompletionTokenStream wrapped = wrapTokenStream(analyzed);
    wrapped.setPayload(buildSuggestPayload());
    return wrapped;
  }

  /**
   * Wraps a stream with a CompletionTokenStream.
   *
   * <p>A stream that already is a {@link CompletionTokenStream} is returned as-is. Subclasses can
   * override this method to change the indexing pipeline.
   */
  protected CompletionTokenStream wrapTokenStream(TokenStream stream) {
    return stream instanceof CompletionTokenStream
        ? (CompletionTokenStream) stream
        : new CompletionTokenStream(stream);
  }

  /** Returns a byte to denote the type of the field */
  protected byte type() {
    return TYPE;
  }

  /**
   * Serializes the payload attached to the token stream. The layout is, in order: the surface form
   * length as a vInt, the surface form bytes, {@code weight + 1} as a vInt, and the {@link
   * #type()} byte.
   */
  private BytesRef buildSuggestPayload() {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (OutputStreamDataOutput out = new OutputStreamDataOutput(buffer)) {
      out.writeVInt(surfaceForm.length);
      out.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
      out.writeVInt(weight + 1);
      out.writeByte(type());
    } catch (IOException e) {
      throw new RuntimeException(e); // not possible, it's a ByteArrayOutputStream!
    }
    return new BytesRef(buffer.toByteArray());
  }

  /**
   * Returns {@code true} if {@code c} equals one of the reserved labels ({@link
   * ConcatenateGraphFilter#SEP_LABEL}, {@link CompletionAnalyzer#HOLE_CHARACTER} or {@link
   * NRTSuggesterBuilder#END_BYTE}).
   */
  private static boolean isReserved(char c) {
    return c == ConcatenateGraphFilter.SEP_LABEL
        || c == CompletionAnalyzer.HOLE_CHARACTER
        || c == NRTSuggesterBuilder.END_BYTE;
  }
}