All downloads are free. Search and download functionality uses the official Maven repository.

org.codelibs.elasticsearch.search.suggest.term.TermSuggestionBuilder Maven / Gradle / Ivy

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.codelibs.elasticsearch.search.suggest.term;

import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.codelibs.elasticsearch.ElasticsearchParseException;
import org.codelibs.elasticsearch.common.ParseFieldMatcher;
import org.codelibs.elasticsearch.common.ParsingException;
import org.codelibs.elasticsearch.common.io.stream.StreamInput;
import org.codelibs.elasticsearch.common.io.stream.StreamOutput;
import org.codelibs.elasticsearch.common.io.stream.Writeable;
import org.codelibs.elasticsearch.common.xcontent.XContentBuilder;
import org.codelibs.elasticsearch.common.xcontent.XContentParser;
import org.codelibs.elasticsearch.index.query.QueryParseContext;
import org.codelibs.elasticsearch.index.query.QueryShardContext;
import org.codelibs.elasticsearch.search.suggest.SortBy;
import org.codelibs.elasticsearch.search.suggest.SuggestionBuilder;
import org.codelibs.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;

import java.io.IOException;
import java.util.Locale;
import java.util.Objects;

import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_ACCURACY;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_EDITS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_INSPECTIONS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_TERM_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD;

/**
 * Defines the actual suggest command. Each command uses the global options
 * unless defined in the suggestion itself. All options are the same as the
 * global options, but are only applicable for this suggestion.
 */
public class TermSuggestionBuilder extends SuggestionBuilder {
    private static final String SUGGESTION_NAME = "term";

    private SuggestMode suggestMode = SuggestMode.MISSING;
    private float accuracy = DEFAULT_ACCURACY;
    private SortBy sort = SortBy.SCORE;
    private StringDistanceImpl stringDistance = StringDistanceImpl.INTERNAL;
    private int maxEdits = DEFAULT_MAX_EDITS;
    private int maxInspections = DEFAULT_MAX_INSPECTIONS;
    private float maxTermFreq = DEFAULT_MAX_TERM_FREQ;
    private int prefixLength = DEFAULT_PREFIX_LENGTH;
    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
    private float minDocFreq = DEFAULT_MIN_DOC_FREQ;

    public TermSuggestionBuilder(String field) {
        super(field);
    }

    /**
     * internal copy constructor that copies over all class field except field.
     */
    private TermSuggestionBuilder(String field, TermSuggestionBuilder in) {
        super(field, in);
        suggestMode = in.suggestMode;
        accuracy = in.accuracy;
        sort = in.sort;
        stringDistance = in.stringDistance;
        maxEdits = in.maxEdits;
        maxInspections = in.maxInspections;
        maxTermFreq = in.maxTermFreq;
        prefixLength = in.prefixLength;
        minWordLength = in.minWordLength;
        minDocFreq = in.minDocFreq;
    }

    /**
     * Read from a stream.
     */
    TermSuggestionBuilder(StreamInput in) throws IOException {
        super(in);
        suggestMode = SuggestMode.readFromStream(in);
        accuracy = in.readFloat();
        sort = SortBy.readFromStream(in);
        stringDistance = StringDistanceImpl.readFromStream(in);
        maxEdits = in.readVInt();
        maxInspections = in.readVInt();
        maxTermFreq = in.readFloat();
        prefixLength = in.readVInt();
        minWordLength = in.readVInt();
        minDocFreq = in.readFloat();
    }

    @Override
    public void doWriteTo(StreamOutput out) throws IOException {
        suggestMode.writeTo(out);
        out.writeFloat(accuracy);
        sort.writeTo(out);
        stringDistance.writeTo(out);
        out.writeVInt(maxEdits);
        out.writeVInt(maxInspections);
        out.writeFloat(maxTermFreq);
        out.writeVInt(prefixLength);
        out.writeVInt(minWordLength);
        out.writeFloat(minDocFreq);
    }

    /**
     * The global suggest mode controls what suggested terms are included or
     * controls for what suggest text tokens, terms should be suggested for.
     * Three possible values can be specified:
     * 
    *
  1. missing - Only suggest terms in the suggest text that * aren't in the index. This is the default. *
  2. popular - Only suggest terms that occur in more docs * then the original suggest text term. *
  3. always - Suggest any matching suggest terms based on * tokens in the suggest text. *
*/ public TermSuggestionBuilder suggestMode(SuggestMode suggestMode) { Objects.requireNonNull(suggestMode, "suggestMode must not be null"); this.suggestMode = suggestMode; return this; } /** * Get the suggest mode setting. */ public SuggestMode suggestMode() { return suggestMode; } /** * s how similar the suggested terms at least need to be compared to the * original suggest text tokens. A value between 0 and 1 can be specified. * This value will be compared to the string distance result of each * candidate spelling correction. *

* Default is 0.5 */ public TermSuggestionBuilder accuracy(float accuracy) { if (accuracy < 0.0f || accuracy > 1.0f) { throw new IllegalArgumentException("accuracy must be between 0 and 1"); } this.accuracy = accuracy; return this; } /** * Get the accuracy setting. */ public float accuracy() { return accuracy; } /** * Sets how to sort the suggest terms per suggest text token. Two possible * values: *

    *
  1. score - Sort should first be based on score, then * document frequency and then the term itself. *
  2. frequency - Sort should first be based on document * frequency, then score and then the term itself. *
*

* What the score is depends on the suggester being used. */ public TermSuggestionBuilder sort(SortBy sort) { Objects.requireNonNull(sort, "sort must not be null"); this.sort = sort; return this; } /** * Get the sort setting. */ public SortBy sort() { return sort; } /** * Sets what string distance implementation to use for comparing how similar * suggested terms are. Five possible values can be specified: *

    *
  1. internal - This is the default and is based on * damerau_levenshtein, but highly optimized for comparing * string distance for terms inside the index. *
  2. damerau_levenshtein - String distance algorithm based on * Damerau-Levenshtein algorithm. *
  3. levenstein - String distance algorithm based on * Levenstein edit distance algorithm. *
  4. jarowinkler - String distance algorithm based on * Jaro-Winkler algorithm. *
  5. ngram - String distance algorithm based on character * n-grams. *
*/ public TermSuggestionBuilder stringDistance(StringDistanceImpl stringDistance) { Objects.requireNonNull(stringDistance, "stringDistance must not be null"); this.stringDistance = stringDistance; return this; } /** * Get the string distance implementation setting. */ public StringDistanceImpl stringDistance() { return stringDistance; } /** * Sets the maximum edit distance candidate suggestions can have in order to * be considered as a suggestion. Can only be a value between 1 and 2. Any * other value result in an bad request error being thrown. Defaults to * 2. */ public TermSuggestionBuilder maxEdits(int maxEdits) { if (maxEdits < 1 || maxEdits > 2) { throw new IllegalArgumentException("maxEdits must be between 1 and 2"); } this.maxEdits = maxEdits; return this; } /** * Get the maximum edit distance setting. */ public int maxEdits() { return maxEdits; } /** * A factor that is used to multiply with the size in order to inspect more * candidate suggestions. Can improve accuracy at the cost of performance. * Defaults to 5. */ public TermSuggestionBuilder maxInspections(int maxInspections) { if (maxInspections < 0) { throw new IllegalArgumentException("maxInspections must be positive"); } this.maxInspections = maxInspections; return this; } /** * Get the factor for inspecting more candidate suggestions setting. */ public int maxInspections() { return maxInspections; } /** * Sets a maximum threshold in number of documents a suggest text token can * exist in order to be corrected. Can be a relative percentage number (e.g * 0.4) or an absolute number to represent document frequencies. If an value * higher than 1 is specified then fractional can not be specified. Defaults * to 0.01. *

* This can be used to exclude high frequency terms from being suggested. * High frequency terms are usually spelled correctly on top of this this * also improves the suggest performance. */ public TermSuggestionBuilder maxTermFreq(float maxTermFreq) { if (maxTermFreq < 0.0f) { throw new IllegalArgumentException("maxTermFreq must be positive"); } if (maxTermFreq > 1.0f && maxTermFreq != Math.floor(maxTermFreq)) { throw new IllegalArgumentException("if maxTermFreq is greater than 1, it must not be a fraction"); } this.maxTermFreq = maxTermFreq; return this; } /** * Get the maximum term frequency threshold setting. */ public float maxTermFreq() { return maxTermFreq; } /** * Sets the number of minimal prefix characters that must match in order be * a candidate suggestion. Defaults to 1. Increasing this number improves * suggest performance. Usually misspellings don't occur in the beginning of * terms. */ public TermSuggestionBuilder prefixLength(int prefixLength) { if (prefixLength < 0) { throw new IllegalArgumentException("prefixLength must be positive"); } this.prefixLength = prefixLength; return this; } /** * Get the minimum prefix length that must match setting. */ public int prefixLength() { return prefixLength; } /** * The minimum length a suggest text term must have in order to be * corrected. Defaults to 4. */ public TermSuggestionBuilder minWordLength(int minWordLength) { if (minWordLength < 1) { throw new IllegalArgumentException("minWordLength must be greater or equal to 1"); } this.minWordLength = minWordLength; return this; } /** * Get the minimum length of a text term to be corrected setting. */ public int minWordLength() { return minWordLength; } /** * Sets a minimal threshold in number of documents a suggested term should * appear in. This can be specified as an absolute number or as a relative * percentage of number of documents. This can improve quality by only * suggesting high frequency terms. Defaults to 0f and is not enabled. 
If a * value higher than 1 is specified then the number cannot be fractional. */ public TermSuggestionBuilder minDocFreq(float minDocFreq) { if (minDocFreq < 0.0f) { throw new IllegalArgumentException("minDocFreq must be positive"); } if (minDocFreq > 1.0f && minDocFreq != Math.floor(minDocFreq)) { throw new IllegalArgumentException("if minDocFreq is greater than 1, it must not be a fraction"); } this.minDocFreq = minDocFreq; return this; } /** * Get the minimal threshold for the frequency of a term appearing in the * document set setting. */ public float minDocFreq() { return minDocFreq; } @Override public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { builder.field(SUGGESTMODE_FIELD.getPreferredName(), suggestMode); builder.field(ACCURACY_FIELD.getPreferredName(), accuracy); builder.field(SORT_FIELD.getPreferredName(), sort); builder.field(STRING_DISTANCE_FIELD.getPreferredName(), stringDistance); builder.field(MAX_EDITS_FIELD.getPreferredName(), maxEdits); builder.field(MAX_INSPECTIONS_FIELD.getPreferredName(), maxInspections); builder.field(MAX_TERM_FREQ_FIELD.getPreferredName(), maxTermFreq); builder.field(PREFIX_LENGTH_FIELD.getPreferredName(), prefixLength); builder.field(MIN_WORD_LENGTH_FIELD.getPreferredName(), minWordLength); builder.field(MIN_DOC_FREQ_FIELD.getPreferredName(), minDocFreq); return builder; } static TermSuggestionBuilder innerFromXContent(QueryParseContext parseContext) throws IOException { XContentParser parser = parseContext.parser(); TermSuggestionBuilder tmpSuggestion = new TermSuggestionBuilder("_na_"); parseContext.getParseFieldMatcher(); XContentParser.Token token; String currentFieldName = null; String fieldname = null; while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { currentFieldName = parser.currentName(); } else if (token.isValue()) { if (SuggestionBuilder.ANALYZER_FIELD.match(currentFieldName)) { 
tmpSuggestion.analyzer(parser.text()); } else if (SuggestionBuilder.FIELDNAME_FIELD.match(currentFieldName)) { fieldname = parser.text(); } else if (SuggestionBuilder.SIZE_FIELD.match(currentFieldName)) { tmpSuggestion.size(parser.intValue()); } else if (SuggestionBuilder.SHARDSIZE_FIELD.match(currentFieldName)) { tmpSuggestion.shardSize(parser.intValue()); } else if (SUGGESTMODE_FIELD.match(currentFieldName)) { tmpSuggestion.suggestMode(SuggestMode.resolve(parser.text())); } else if (ACCURACY_FIELD.match(currentFieldName)) { tmpSuggestion.accuracy(parser.floatValue()); } else if (SORT_FIELD.match(currentFieldName)) { tmpSuggestion.sort(SortBy.resolve(parser.text())); } else if (STRING_DISTANCE_FIELD.match(currentFieldName)) { tmpSuggestion.stringDistance(StringDistanceImpl.resolve(parser.text())); } else if (MAX_EDITS_FIELD.match(currentFieldName)) { tmpSuggestion.maxEdits(parser.intValue()); } else if (MAX_INSPECTIONS_FIELD.match(currentFieldName)) { tmpSuggestion.maxInspections(parser.intValue()); } else if (MAX_TERM_FREQ_FIELD.match(currentFieldName)) { tmpSuggestion.maxTermFreq(parser.floatValue()); } else if (PREFIX_LENGTH_FIELD.match(currentFieldName)) { tmpSuggestion.prefixLength(parser.intValue()); } else if (MIN_WORD_LENGTH_FIELD.match(currentFieldName)) { tmpSuggestion.minWordLength(parser.intValue()); } else if (MIN_DOC_FREQ_FIELD.match(currentFieldName)) { tmpSuggestion.minDocFreq(parser.floatValue()); } else { throw new ParsingException(parser.getTokenLocation(), "suggester[term] doesn't support field [" + currentFieldName + "]"); } } else { throw new ParsingException(parser.getTokenLocation(), "suggester[term] parsing failed on [" + currentFieldName + "]"); } } // now we should have field name, check and copy fields over to the suggestion builder we return if (fieldname == null) { throw new ElasticsearchParseException( "the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing"); } return new TermSuggestionBuilder(fieldname, 
tmpSuggestion); } @Override public SuggestionContext build(QueryShardContext context) throws IOException { throw new UnsupportedOperationException("querybuilders does not support this operation."); } @Override public String getWriteableName() { return SUGGESTION_NAME; } @Override protected boolean doEquals(TermSuggestionBuilder other) { return Objects.equals(suggestMode, other.suggestMode) && Objects.equals(accuracy, other.accuracy) && Objects.equals(sort, other.sort) && Objects.equals(stringDistance, other.stringDistance) && Objects.equals(maxEdits, other.maxEdits) && Objects.equals(maxInspections, other.maxInspections) && Objects.equals(maxTermFreq, other.maxTermFreq) && Objects.equals(prefixLength, other.prefixLength) && Objects.equals(minWordLength, other.minWordLength) && Objects.equals(minDocFreq, other.minDocFreq); } @Override protected int doHashCode() { return Objects.hash(suggestMode, accuracy, sort, stringDistance, maxEdits, maxInspections, maxTermFreq, prefixLength, minWordLength, minDocFreq); } /** An enum representing the valid suggest modes. */ public enum SuggestMode implements Writeable { /** Only suggest terms in the suggest text that aren't in the index. This is the default. */ MISSING { @Override public org.apache.lucene.search.spell.SuggestMode toLucene() { return org.apache.lucene.search.spell.SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; } }, /** Only suggest terms that occur in more docs then the original suggest text term. */ POPULAR { @Override public org.apache.lucene.search.spell.SuggestMode toLucene() { return org.apache.lucene.search.spell.SuggestMode.SUGGEST_MORE_POPULAR; } }, /** Suggest any matching suggest terms based on tokens in the suggest text. 
*/ ALWAYS { @Override public org.apache.lucene.search.spell.SuggestMode toLucene() { return org.apache.lucene.search.spell.SuggestMode.SUGGEST_ALWAYS; } }; @Override public void writeTo(final StreamOutput out) throws IOException { out.writeVInt(ordinal()); } public static SuggestMode readFromStream(final StreamInput in) throws IOException { int ordinal = in.readVInt(); if (ordinal < 0 || ordinal >= values().length) { throw new IOException("Unknown SuggestMode ordinal [" + ordinal + "]"); } return values()[ordinal]; } public static SuggestMode resolve(final String str) { Objects.requireNonNull(str, "Input string is null"); return valueOf(str.toUpperCase(Locale.ROOT)); } public abstract org.apache.lucene.search.spell.SuggestMode toLucene(); } /** An enum representing the valid string edit distance algorithms for determining suggestions. */ public enum StringDistanceImpl implements Writeable { /** This is the default and is based on damerau_levenshtein, but highly optimized * for comparing string distance for terms inside the index. */ INTERNAL { @Override public StringDistance toLucene() { return DirectSpellChecker.INTERNAL_LEVENSHTEIN; } }, /** String distance algorithm based on Damerau-Levenshtein algorithm. */ DAMERAU_LEVENSHTEIN { @Override public StringDistance toLucene() { return new LuceneLevenshteinDistance(); } }, /** String distance algorithm based on Levenstein edit distance algorithm. */ LEVENSTEIN { @Override public StringDistance toLucene() { return new LevensteinDistance(); } }, /** String distance algorithm based on Jaro-Winkler algorithm. */ JAROWINKLER { @Override public StringDistance toLucene() { return new JaroWinklerDistance(); } }, /** String distance algorithm based on character n-grams. 
*/ NGRAM { @Override public StringDistance toLucene() { return new NGramDistance(); } }; @Override public void writeTo(final StreamOutput out) throws IOException { out.writeVInt(ordinal()); } public static StringDistanceImpl readFromStream(final StreamInput in) throws IOException { int ordinal = in.readVInt(); if (ordinal < 0 || ordinal >= values().length) { throw new IOException("Unknown StringDistanceImpl ordinal [" + ordinal + "]"); } return values()[ordinal]; } public static StringDistanceImpl resolve(final String str) { Objects.requireNonNull(str, "Input string is null"); final String distanceVal = str.toLowerCase(Locale.US); switch (distanceVal) { case "internal": return INTERNAL; case "damerau_levenshtein": case "damerauLevenshtein": return DAMERAU_LEVENSHTEIN; case "levenstein": return LEVENSTEIN; case "ngram": return NGRAM; case "jarowinkler": return JAROWINKLER; default: throw new IllegalArgumentException("Illegal distance option " + str); } } public abstract StringDistance toLucene(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy