// org.codelibs.elasticsearch.search.suggest.term.TermSuggestionBuilder (Maven / Gradle / Ivy)
//
// Go to download
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.codelibs.elasticsearch.search.suggest.term;

import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.codelibs.elasticsearch.ElasticsearchParseException;
import org.codelibs.elasticsearch.common.ParseFieldMatcher;
import org.codelibs.elasticsearch.common.ParsingException;
import org.codelibs.elasticsearch.common.io.stream.StreamInput;
import org.codelibs.elasticsearch.common.io.stream.StreamOutput;
import org.codelibs.elasticsearch.common.io.stream.Writeable;
import org.codelibs.elasticsearch.common.xcontent.XContentBuilder;
import org.codelibs.elasticsearch.common.xcontent.XContentParser;
import org.codelibs.elasticsearch.index.query.QueryParseContext;
import org.codelibs.elasticsearch.index.query.QueryShardContext;
import org.codelibs.elasticsearch.search.suggest.SortBy;
import org.codelibs.elasticsearch.search.suggest.SuggestionBuilder;
import org.codelibs.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;

import java.io.IOException;
import java.util.Locale;
import java.util.Objects;

import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_ACCURACY;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_EDITS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_INSPECTIONS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_TERM_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD;

/**
 * Defines the actual suggest command. Each command uses the global options
 * unless defined in the suggestion itself. All options are the same as the
 * global options, but are only applicable for this suggestion.
 */
public class TermSuggestionBuilder extends SuggestionBuilder<TermSuggestionBuilder> {
    // NOTE(review): the type argument above was restored because doEquals(TermSuggestionBuilder)
    // is declared with @Override, which implies a generic parent — confirm against SuggestionBuilder.

    /** Name returned by {@link #getWriteableName()}. */
    private static final String SUGGESTION_NAME = "term";

    private SuggestMode suggestMode = SuggestMode.MISSING;
    private float accuracy = DEFAULT_ACCURACY;
    private SortBy sort = SortBy.SCORE;
    private StringDistanceImpl stringDistance = StringDistanceImpl.INTERNAL;
    private int maxEdits = DEFAULT_MAX_EDITS;
    private int maxInspections = DEFAULT_MAX_INSPECTIONS;
    private float maxTermFreq = DEFAULT_MAX_TERM_FREQ;
    private int prefixLength = DEFAULT_PREFIX_LENGTH;
    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
    private float minDocFreq = DEFAULT_MIN_DOC_FREQ;

    /**
     * Creates a builder for the given field with every term-suggester option at its default.
     *
     * @param field the field name, handed to the parent constructor
     */
    public TermSuggestionBuilder(String field) {
        super(field);
    }

    /**
     * Internal copy constructor: takes every term-suggester option from {@code in} while
     * using the separately supplied {@code field}. Both are also handed to the parent
     * constructor.
     *
     * @param field the field name for the new builder
     * @param in    the builder whose options are copied
     */
    private TermSuggestionBuilder(String field, TermSuggestionBuilder in) {
        super(field, in);
        suggestMode = in.suggestMode;
        accuracy = in.accuracy;
        sort = in.sort;
        stringDistance = in.stringDistance;
        maxEdits = in.maxEdits;
        maxInspections = in.maxInspections;
        maxTermFreq = in.maxTermFreq;
        prefixLength = in.prefixLength;
        minWordLength = in.minWordLength;
        minDocFreq = in.minDocFreq;
    }

    /**
     * Read from a stream. The read order must mirror {@link #doWriteTo(StreamOutput)}.
     *
     * @param in the stream to read from
     * @throws IOException if reading fails or an enum ordinal is out of range
     */
    TermSuggestionBuilder(StreamInput in) throws IOException {
        super(in);
        suggestMode = SuggestMode.readFromStream(in);
        accuracy = in.readFloat();
        sort = SortBy.readFromStream(in);
        stringDistance = StringDistanceImpl.readFromStream(in);
        maxEdits = in.readVInt();
        maxInspections = in.readVInt();
        maxTermFreq = in.readFloat();
        prefixLength = in.readVInt();
        minWordLength = in.readVInt();
        minDocFreq = in.readFloat();
    }

    /**
     * Writes the term-suggester options to the stream, in the same order in which the
     * stream constructor reads them.
     *
     * @param out the stream to write to
     * @throws IOException if writing fails
     */
    @Override
    public void doWriteTo(StreamOutput out) throws IOException {
        suggestMode.writeTo(out);
        out.writeFloat(accuracy);
        sort.writeTo(out);
        stringDistance.writeTo(out);
        out.writeVInt(maxEdits);
        out.writeVInt(maxInspections);
        out.writeFloat(maxTermFreq);
        out.writeVInt(prefixLength);
        out.writeVInt(minWordLength);
        out.writeFloat(minDocFreq);
    }

    /**
     * The global suggest mode controls what suggested terms are included or
     * controls for what suggest text tokens, terms should be suggested for.
     * Three possible values can be specified:
     * <ol>
     * <li>{@code missing} - Only suggest terms in the suggest text that
     * aren't in the index. This is the default.</li>
     * <li>{@code popular} - Only suggest terms that occur in more docs
     * than the original suggest text term.</li>
     * <li>{@code always} - Suggest any matching suggest terms based on
     * tokens in the suggest text.</li>
     * </ol>
     *
     * @param suggestMode the mode to use
     * @return this builder
     * @throws NullPointerException if {@code suggestMode} is null
     */
    public TermSuggestionBuilder suggestMode(SuggestMode suggestMode) {
        Objects.requireNonNull(suggestMode, "suggestMode must not be null");
        this.suggestMode = suggestMode;
        return this;
    }

    /**
     * Get the suggest mode setting.
     */
    public SuggestMode suggestMode() {
        return suggestMode;
    }

    /**
     * Sets how similar the suggested terms at least need to be compared to the
     * original suggest text tokens. A value between 0 and 1 can be specified.
     * This value will be compared to the string distance result of each
     * candidate spelling correction.
     * <p>
     * Defaults to {@code DEFAULT_ACCURACY}.
     *
     * @param accuracy the minimum similarity, in the inclusive range 0 to 1
     * @return this builder
     * @throws IllegalArgumentException if {@code accuracy} is below 0 or above 1
     */
    public TermSuggestionBuilder accuracy(float accuracy) {
        if (accuracy < 0.0f || accuracy > 1.0f) {
            throw new IllegalArgumentException("accuracy must be between 0 and 1");
        }
        this.accuracy = accuracy;
        return this;
    }

    /**
     * Get the accuracy setting.
     */
    public float accuracy() {
        return accuracy;
    }

    /**
     * Sets how to sort the suggest terms per suggest text token. Two possible
     * values:
     * <ol>
     * <li>{@code score} - Sort should first be based on score, then
     * document frequency and then the term itself.</li>
     * <li>{@code frequency} - Sort should first be based on document
     * frequency, then score and then the term itself.</li>
     * </ol>
     * <p>
     * What the score is depends on the suggester being used.
     *
     * @param sort the sort order to use
     * @return this builder
     * @throws NullPointerException if {@code sort} is null
     */
    public TermSuggestionBuilder sort(SortBy sort) {
        Objects.requireNonNull(sort, "sort must not be null");
        this.sort = sort;
        return this;
    }

    /**
     * Get the sort setting.
     */
    public SortBy sort() {
        return sort;
    }

    /**
     * Sets what string distance implementation to use for comparing how similar
     * suggested terms are. Five possible values can be specified:
     * <ol>
     * <li>{@code internal} - This is the default and is based on
     * {@code damerau_levenshtein}, but highly optimized for comparing
     * string distance for terms inside the index.</li>
     * <li>{@code damerau_levenshtein} - String distance algorithm based on
     * Damerau-Levenshtein algorithm.</li>
     * <li>{@code levenstein} - String distance algorithm based on
     * Levenstein edit distance algorithm.</li>
     * <li>{@code jarowinkler} - String distance algorithm based on
     * Jaro-Winkler algorithm.</li>
     * <li>{@code ngram} - String distance algorithm based on character
     * n-grams.</li>
     * </ol>
     *
     * @param stringDistance the implementation to use
     * @return this builder
     * @throws NullPointerException if {@code stringDistance} is null
     */
    public TermSuggestionBuilder stringDistance(StringDistanceImpl stringDistance) {
        Objects.requireNonNull(stringDistance, "stringDistance must not be null");
        this.stringDistance = stringDistance;
        return this;
    }

    /**
     * Get the string distance implementation setting.
     */
    public StringDistanceImpl stringDistance() {
        return stringDistance;
    }

    /**
     * Sets the maximum edit distance candidate suggestions can have in order to
     * be considered as a suggestion. Can only be 1 or 2; any other value is
     * rejected. Defaults to {@code DEFAULT_MAX_EDITS}.
     *
     * @param maxEdits the maximum edit distance, 1 or 2
     * @return this builder
     * @throws IllegalArgumentException if {@code maxEdits} is not 1 or 2
     */
    public TermSuggestionBuilder maxEdits(int maxEdits) {
        if (maxEdits < 1 || maxEdits > 2) {
            throw new IllegalArgumentException("maxEdits must be between 1 and 2");
        }
        this.maxEdits = maxEdits;
        return this;
    }

    /**
     * Get the maximum edit distance setting.
     */
    public int maxEdits() {
        return maxEdits;
    }

    /**
     * A factor that is used to multiply with the size in order to inspect more
     * candidate suggestions. Can improve accuracy at the cost of performance.
     * Defaults to {@code DEFAULT_MAX_INSPECTIONS}.
     *
     * @param maxInspections the factor; zero is accepted, negative values are rejected
     * @return this builder
     * @throws IllegalArgumentException if {@code maxInspections} is negative
     */
    public TermSuggestionBuilder maxInspections(int maxInspections) {
        if (maxInspections < 0) {
            throw new IllegalArgumentException("maxInspections must be positive");
        }
        this.maxInspections = maxInspections;
        return this;
    }

    /**
     * Get the factor for inspecting more candidate suggestions setting.
     */
    public int maxInspections() {
        return maxInspections;
    }

    /**
     * Sets a maximum threshold in number of documents a suggest text token can
     * exist in order to be corrected. Can be a relative percentage number (e.g.
     * 0.4) or an absolute number to represent document frequencies. If a value
     * higher than 1 is specified then it cannot be fractional. Defaults
     * to {@code DEFAULT_MAX_TERM_FREQ}.
     * <p>
     * This can be used to exclude high frequency terms from being suggested.
     * High frequency terms are usually spelled correctly; on top of this, it
     * also improves the suggest performance.
     *
     * @param maxTermFreq the threshold; zero is accepted, negative values are rejected
     * @return this builder
     * @throws IllegalArgumentException if {@code maxTermFreq} is negative, or is greater
     *         than 1 and not a whole number
     */
    public TermSuggestionBuilder maxTermFreq(float maxTermFreq) {
        if (maxTermFreq < 0.0f) {
            throw new IllegalArgumentException("maxTermFreq must be positive");
        }
        if (maxTermFreq > 1.0f && maxTermFreq != Math.floor(maxTermFreq)) {
            throw new IllegalArgumentException("if maxTermFreq is greater than 1, it must not be a fraction");
        }
        this.maxTermFreq = maxTermFreq;
        return this;
    }

    /**
     * Get the maximum term frequency threshold setting.
     */
    public float maxTermFreq() {
        return maxTermFreq;
    }

    /**
     * Sets the number of minimal prefix characters that must match in order to be
     * a candidate suggestion. Defaults to {@code DEFAULT_PREFIX_LENGTH}. Increasing
     * this number improves suggest performance. Usually misspellings don't occur in
     * the beginning of terms.
     *
     * @param prefixLength the prefix length; zero is accepted, negative values are rejected
     * @return this builder
     * @throws IllegalArgumentException if {@code prefixLength} is negative
     */
    public TermSuggestionBuilder prefixLength(int prefixLength) {
        if (prefixLength < 0) {
            throw new IllegalArgumentException("prefixLength must be positive");
        }
        this.prefixLength = prefixLength;
        return this;
    }

    /**
     * Get the minimum prefix length that must match setting.
     */
    public int prefixLength() {
        return prefixLength;
    }

    /**
     * The minimum length a suggest text term must have in order to be
     * corrected. Defaults to {@code DEFAULT_MIN_WORD_LENGTH}.
     *
     * @param minWordLength the minimum term length, at least 1
     * @return this builder
     * @throws IllegalArgumentException if {@code minWordLength} is less than 1
     */
    public TermSuggestionBuilder minWordLength(int minWordLength) {
        if (minWordLength < 1) {
            throw new IllegalArgumentException("minWordLength must be greater or equal to 1");
        }
        this.minWordLength = minWordLength;
        return this;
    }

    /**
     * Get the minimum length of a text term to be corrected setting.
     */
    public int minWordLength() {
        return minWordLength;
    }

    /**
     * Sets a minimal threshold in number of documents a suggested term should
     * appear in. This can be specified as an absolute number or as a relative
     * percentage of number of documents. This can improve quality by only
     * suggesting high frequency terms. Defaults to {@code DEFAULT_MIN_DOC_FREQ}.
     * If a value higher than 1 is specified then the number cannot be fractional.
     *
     * @param minDocFreq the threshold; zero is accepted, negative values are rejected
     * @return this builder
     * @throws IllegalArgumentException if {@code minDocFreq} is negative, or is greater
     *         than 1 and not a whole number
     */
    public TermSuggestionBuilder minDocFreq(float minDocFreq) {
        if (minDocFreq < 0.0f) {
            throw new IllegalArgumentException("minDocFreq must be positive");
        }
        if (minDocFreq > 1.0f && minDocFreq != Math.floor(minDocFreq)) {
            throw new IllegalArgumentException("if minDocFreq is greater than 1, it must not be a fraction");
        }
        this.minDocFreq = minDocFreq;
        return this;
    }

    /**
     * Get the minimal threshold for the frequency of a term appearing in the
     * document set setting.
     */
    public float minDocFreq() {
        return minDocFreq;
    }

    /**
     * Emits every term-suggester option as a field on {@code builder}, keyed by the
     * preferred name of the corresponding parse field.
     *
     * @param builder the content builder to write to
     * @param params  serialization parameters (not consulted here)
     * @return the same {@code builder}
     * @throws IOException if writing a field fails
     */
    @Override
    public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
        builder.field(SUGGESTMODE_FIELD.getPreferredName(), suggestMode);
        builder.field(ACCURACY_FIELD.getPreferredName(), accuracy);
        builder.field(SORT_FIELD.getPreferredName(), sort);
        builder.field(STRING_DISTANCE_FIELD.getPreferredName(), stringDistance);
        builder.field(MAX_EDITS_FIELD.getPreferredName(), maxEdits);
        builder.field(MAX_INSPECTIONS_FIELD.getPreferredName(), maxInspections);
        builder.field(MAX_TERM_FREQ_FIELD.getPreferredName(), maxTermFreq);
        builder.field(PREFIX_LENGTH_FIELD.getPreferredName(), prefixLength);
        builder.field(MIN_WORD_LENGTH_FIELD.getPreferredName(), minWordLength);
        builder.field(MIN_DOC_FREQ_FIELD.getPreferredName(), minDocFreq);
        return builder;
    }

    /**
     * Parses a term suggestion from the parser held by {@code parseContext}, consuming
     * tokens until the closing {@code END_OBJECT}. Only scalar values are accepted; each
     * recognised option is applied through its validating setter, so out-of-range values
     * surface as the setter's exception.
     *
     * @param parseContext supplies the parser to read from
     * @return a new builder for the parsed field name carrying the parsed options
     * @throws IOException                 if the parser fails
     * @throws ParsingException            on an unknown field or a non-value token
     * @throws ElasticsearchParseException if the field-name option is absent
     */
    static TermSuggestionBuilder innerFromXContent(QueryParseContext parseContext) throws IOException {
        XContentParser parser = parseContext.parser();
        // Options are collected on a placeholder builder because the real field name may
        // appear anywhere in the object; they are copied over once it is known.
        TermSuggestionBuilder tmpSuggestion = new TermSuggestionBuilder("_na_");
        // NOTE(review): the result of this call is discarded; kept in case the getter has
        // side effects that are not visible from this file — confirm and remove if pure.
        parseContext.getParseFieldMatcher();
        XContentParser.Token token;
        String currentFieldName = null;
        String fieldname = null;
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
            } else if (token.isValue()) {
                if (SuggestionBuilder.ANALYZER_FIELD.match(currentFieldName)) {
                    tmpSuggestion.analyzer(parser.text());
                } else if (SuggestionBuilder.FIELDNAME_FIELD.match(currentFieldName)) {
                    fieldname = parser.text();
                } else if (SuggestionBuilder.SIZE_FIELD.match(currentFieldName)) {
                    tmpSuggestion.size(parser.intValue());
                } else if (SuggestionBuilder.SHARDSIZE_FIELD.match(currentFieldName)) {
                    tmpSuggestion.shardSize(parser.intValue());
                } else if (SUGGESTMODE_FIELD.match(currentFieldName)) {
                    tmpSuggestion.suggestMode(SuggestMode.resolve(parser.text()));
                } else if (ACCURACY_FIELD.match(currentFieldName)) {
                    tmpSuggestion.accuracy(parser.floatValue());
                } else if (SORT_FIELD.match(currentFieldName)) {
                    tmpSuggestion.sort(SortBy.resolve(parser.text()));
                } else if (STRING_DISTANCE_FIELD.match(currentFieldName)) {
                    tmpSuggestion.stringDistance(StringDistanceImpl.resolve(parser.text()));
                } else if (MAX_EDITS_FIELD.match(currentFieldName)) {
                    tmpSuggestion.maxEdits(parser.intValue());
                } else if (MAX_INSPECTIONS_FIELD.match(currentFieldName)) {
                    tmpSuggestion.maxInspections(parser.intValue());
                } else if (MAX_TERM_FREQ_FIELD.match(currentFieldName)) {
                    tmpSuggestion.maxTermFreq(parser.floatValue());
                } else if (PREFIX_LENGTH_FIELD.match(currentFieldName)) {
                    tmpSuggestion.prefixLength(parser.intValue());
                } else if (MIN_WORD_LENGTH_FIELD.match(currentFieldName)) {
                    tmpSuggestion.minWordLength(parser.intValue());
                } else if (MIN_DOC_FREQ_FIELD.match(currentFieldName)) {
                    tmpSuggestion.minDocFreq(parser.floatValue());
                } else {
                    throw new ParsingException(parser.getTokenLocation(),
                                                  "suggester[term] doesn't support field [" + currentFieldName + "]");
                }
            } else {
                // Nested objects and arrays are not part of the term suggester syntax.
                throw new ParsingException(parser.getTokenLocation(), "suggester[term] parsing failed on [" + currentFieldName + "]");
            }
        }

        // now we should have field name, check and copy fields over to the suggestion builder we return
        if (fieldname == null) {
            throw new ElasticsearchParseException(
                "the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
        }
        return new TermSuggestionBuilder(fieldname, tmpSuggestion);
    }

    /**
     * Not supported by this builder.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public SuggestionContext build(QueryShardContext context) throws IOException {
        throw new UnsupportedOperationException("querybuilders does not support this operation.");
    }

    /**
     * @return the suggestion type name, {@code "term"}
     */
    @Override
    public String getWriteableName() {
        return SUGGESTION_NAME;
    }

    /**
     * Compares the term-suggester options of this builder with those of {@code other}.
     * Floats are compared with {@link Float#compare(float, float)}, which matches the
     * boxed {@code Float.equals} semantics (NaN equals NaN, 0.0f differs from -0.0f)
     * and therefore stays consistent with {@link #doHashCode()}.
     *
     * @param other the builder to compare against
     * @return {@code true} if all options are equal
     */
    @Override
    protected boolean doEquals(TermSuggestionBuilder other) {
        return suggestMode == other.suggestMode
            && Float.compare(accuracy, other.accuracy) == 0
            && sort == other.sort
            && stringDistance == other.stringDistance
            && maxEdits == other.maxEdits
            && maxInspections == other.maxInspections
            && Float.compare(maxTermFreq, other.maxTermFreq) == 0
            && prefixLength == other.prefixLength
            && minWordLength == other.minWordLength
            && Float.compare(minDocFreq, other.minDocFreq) == 0;
    }

    /**
     * @return a hash over the same options that {@link #doEquals(TermSuggestionBuilder)} compares
     */
    @Override
    protected int doHashCode() {
        return Objects.hash(suggestMode, accuracy, sort, stringDistance, maxEdits, maxInspections,
                            maxTermFreq, prefixLength, minWordLength, minDocFreq);
    }

    /** An enum representing the valid suggest modes. */
    public enum SuggestMode implements Writeable {
        /** Only suggest terms in the suggest text that aren't in the index. This is the default. */
        MISSING {
            @Override
            public org.apache.lucene.search.spell.SuggestMode toLucene() {
                return org.apache.lucene.search.spell.SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
            }
        },
        /** Only suggest terms that occur in more docs than the original suggest text term. */
        POPULAR {
            @Override
            public org.apache.lucene.search.spell.SuggestMode toLucene() {
                return org.apache.lucene.search.spell.SuggestMode.SUGGEST_MORE_POPULAR;
            }
        },
        /** Suggest any matching suggest terms based on tokens in the suggest text. */
        ALWAYS {
            @Override
            public org.apache.lucene.search.spell.SuggestMode toLucene() {
                return org.apache.lucene.search.spell.SuggestMode.SUGGEST_ALWAYS;
            }
        };

        /**
         * Writes this constant as its ordinal; the declaration order is therefore part
         * of the wire format.
         */
        @Override
        public void writeTo(final StreamOutput out) throws IOException {
            out.writeVInt(ordinal());
        }

        /**
         * Reads a constant previously written by {@link #writeTo(StreamOutput)}.
         *
         * @throws IOException if reading fails or the ordinal does not name a constant
         */
        public static SuggestMode readFromStream(final StreamInput in) throws IOException {
            int ordinal = in.readVInt();
            if (ordinal < 0 || ordinal >= values().length) {
                throw new IOException("Unknown SuggestMode ordinal [" + ordinal + "]");
            }
            return values()[ordinal];
        }

        /**
         * Resolves a constant from its name, ignoring case.
         *
         * @throws NullPointerException     if {@code str} is null
         * @throws IllegalArgumentException if no constant has that name
         */
        public static SuggestMode resolve(final String str) {
            Objects.requireNonNull(str, "Input string is null");
            return valueOf(str.toUpperCase(Locale.ROOT));
        }

        /** @return the Lucene suggest mode corresponding to this constant */
        public abstract org.apache.lucene.search.spell.SuggestMode toLucene();
    }

    /** An enum representing the valid string edit distance algorithms for determining suggestions. */
    public enum StringDistanceImpl implements Writeable {
        /** This is the default and is based on damerau_levenshtein, but highly optimized
         * for comparing string distance for terms inside the index. */
        INTERNAL {
            @Override
            public StringDistance toLucene() {
                return DirectSpellChecker.INTERNAL_LEVENSHTEIN;
            }
        },
        /** String distance algorithm based on Damerau-Levenshtein algorithm. */
        DAMERAU_LEVENSHTEIN {
            @Override
            public StringDistance toLucene() {
                return new LuceneLevenshteinDistance();
            }
        },
        /** String distance algorithm based on Levenstein edit distance algorithm. */
        LEVENSTEIN {
            @Override
            public StringDistance toLucene() {
                return new LevensteinDistance();
            }
        },
        /** String distance algorithm based on Jaro-Winkler algorithm. */
        JAROWINKLER {
            @Override
            public StringDistance toLucene() {
                return new JaroWinklerDistance();
            }
        },
        /** String distance algorithm based on character n-grams. */
        NGRAM {
            @Override
            public StringDistance toLucene() {
                return new NGramDistance();
            }
        };

        /**
         * Writes this constant as its ordinal; the declaration order is therefore part
         * of the wire format.
         */
        @Override
        public void writeTo(final StreamOutput out) throws IOException {
            out.writeVInt(ordinal());
        }

        /**
         * Reads a constant previously written by {@link #writeTo(StreamOutput)}.
         *
         * @throws IOException if reading fails or the ordinal does not name a constant
         */
        public static StringDistanceImpl readFromStream(final StreamInput in) throws IOException {
            int ordinal = in.readVInt();
            if (ordinal < 0 || ordinal >= values().length) {
                throw new IOException("Unknown StringDistanceImpl ordinal [" + ordinal + "]");
            }
            return values()[ordinal];
        }

        /**
         * Resolves a constant from its option name, ignoring case. Both
         * {@code damerau_levenshtein} and {@code damerauLevenshtein} select
         * {@link #DAMERAU_LEVENSHTEIN}.
         *
         * @throws NullPointerException     if {@code str} is null
         * @throws IllegalArgumentException if the name is not a known option
         */
        public static StringDistanceImpl resolve(final String str) {
            Objects.requireNonNull(str, "Input string is null");
            final String distanceVal = str.toLowerCase(Locale.ROOT);
            switch (distanceVal) {
                case "internal":
                    return INTERNAL;
                case "damerau_levenshtein":
                // The input has been lower-cased, so the camel-case alias must be
                // matched in its lower-cased form to be reachable.
                case "dameraulevenshtein":
                    return DAMERAU_LEVENSHTEIN;
                case "levenstein":
                    return LEVENSTEIN;
                case "ngram":
                    return NGRAM;
                case "jarowinkler":
                    return JAROWINKLER;
                default: throw new IllegalArgumentException("Illegal distance option " + str);
            }
        }

        /** @return the Lucene string distance implementation for this constant */
        public abstract StringDistance toLucene();
    }

}