org.codelibs.elasticsearch.search.suggest.term.TermSuggestionBuilder Maven / Gradle / Ivy
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.codelibs.elasticsearch.search.suggest.term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.codelibs.elasticsearch.ElasticsearchParseException;
import org.codelibs.elasticsearch.common.ParseFieldMatcher;
import org.codelibs.elasticsearch.common.ParsingException;
import org.codelibs.elasticsearch.common.io.stream.StreamInput;
import org.codelibs.elasticsearch.common.io.stream.StreamOutput;
import org.codelibs.elasticsearch.common.io.stream.Writeable;
import org.codelibs.elasticsearch.common.xcontent.XContentBuilder;
import org.codelibs.elasticsearch.common.xcontent.XContentParser;
import org.codelibs.elasticsearch.index.query.QueryParseContext;
import org.codelibs.elasticsearch.index.query.QueryShardContext;
import org.codelibs.elasticsearch.search.suggest.SortBy;
import org.codelibs.elasticsearch.search.suggest.SuggestionBuilder;
import org.codelibs.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
import java.io.IOException;
import java.util.Locale;
import java.util.Objects;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_ACCURACY;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_EDITS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_INSPECTIONS;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MAX_TERM_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD;
import static org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD;
/**
* Defines the actual suggest command. Each command uses the global options
* unless defined in the suggestion itself. All options are the same as the
* global options, but are only applicable for this suggestion.
*/
public class TermSuggestionBuilder extends SuggestionBuilder {
/** Value returned by {@link #getWriteableName()}. */
private static final String SUGGESTION_NAME = "term";
// Term-specific options. Each one starts at the default shown in its
// initializer and is changed through the setter of the same name.
/** Suggest mode; starts as {@link SuggestMode#MISSING}. */
private SuggestMode suggestMode = SuggestMode.MISSING;
/** Accuracy threshold; starts as {@code DEFAULT_ACCURACY}. */
private float accuracy = DEFAULT_ACCURACY;
/** Sort order of the suggested terms; starts as {@code SortBy.SCORE}. */
private SortBy sort = SortBy.SCORE;
/** String distance implementation; starts as {@link StringDistanceImpl#INTERNAL}. */
private StringDistanceImpl stringDistance = StringDistanceImpl.INTERNAL;
/** Maximum edit distance; starts as {@code DEFAULT_MAX_EDITS}. */
private int maxEdits = DEFAULT_MAX_EDITS;
/** Inspection factor; starts as {@code DEFAULT_MAX_INSPECTIONS}. */
private int maxInspections = DEFAULT_MAX_INSPECTIONS;
/** Maximum term frequency threshold; starts as {@code DEFAULT_MAX_TERM_FREQ}. */
private float maxTermFreq = DEFAULT_MAX_TERM_FREQ;
/** Required matching prefix length; starts as {@code DEFAULT_PREFIX_LENGTH}. */
private int prefixLength = DEFAULT_PREFIX_LENGTH;
/** Minimum word length; starts as {@code DEFAULT_MIN_WORD_LENGTH}. */
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
/** Minimum document frequency threshold; starts as {@code DEFAULT_MIN_DOC_FREQ}. */
private float minDocFreq = DEFAULT_MIN_DOC_FREQ;
/**
 * Creates a term suggestion builder with every term-specific option left
 * at its initial value.
 *
 * @param field handed unchanged to the superclass constructor
 *              (presumably the name of the field to suggest on)
 */
public TermSuggestionBuilder(String field) {
super(field);
}
/**
 * Internal copy constructor. Every term-specific option is taken from
 * {@code in}; {@code field} and {@code in} are both forwarded to the
 * superclass constructor.
 *
 * @param field the field for the new builder
 * @param in    the builder whose option values are copied
 */
private TermSuggestionBuilder(String field, TermSuggestionBuilder in) {
    super(field, in);
    this.suggestMode = in.suggestMode;
    this.accuracy = in.accuracy;
    this.sort = in.sort;
    this.stringDistance = in.stringDistance;
    this.maxEdits = in.maxEdits;
    this.maxInspections = in.maxInspections;
    this.maxTermFreq = in.maxTermFreq;
    this.prefixLength = in.prefixLength;
    this.minWordLength = in.minWordLength;
    this.minDocFreq = in.minDocFreq;
}
/**
 * Deserializing constructor. After the superclass has consumed its part of
 * the stream, the term-specific options are read in the same order in
 * which {@link #doWriteTo(StreamOutput)} writes them.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
TermSuggestionBuilder(StreamInput in) throws IOException {
    super(in);
    // The read order below is the wire format; do not reorder.
    this.suggestMode = SuggestMode.readFromStream(in);
    this.accuracy = in.readFloat();
    this.sort = SortBy.readFromStream(in);
    this.stringDistance = StringDistanceImpl.readFromStream(in);
    this.maxEdits = in.readVInt();
    this.maxInspections = in.readVInt();
    this.maxTermFreq = in.readFloat();
    this.prefixLength = in.readVInt();
    this.minWordLength = in.readVInt();
    this.minDocFreq = in.readFloat();
}
/**
 * Serializes the term-specific options. The write order mirrors the read
 * order of the {@code StreamInput} constructor of this class.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void doWriteTo(StreamOutput out) throws IOException {
    // The write order below is the wire format; do not reorder.
    this.suggestMode.writeTo(out);
    out.writeFloat(this.accuracy);
    this.sort.writeTo(out);
    this.stringDistance.writeTo(out);
    out.writeVInt(this.maxEdits);
    out.writeVInt(this.maxInspections);
    out.writeFloat(this.maxTermFreq);
    out.writeVInt(this.prefixLength);
    out.writeVInt(this.minWordLength);
    out.writeFloat(this.minDocFreq);
}
/**
 * Sets the suggest mode, which controls what suggested terms are included
 * or for which suggest text tokens terms should be suggested. Three values
 * can be specified:
 * <ul>
 * <li>{@code missing} - Only suggest terms in the suggest text that
 * aren't in the index. This is the default.</li>
 * <li>{@code popular} - Only suggest terms that occur in more docs
 * than the original suggest text term.</li>
 * <li>{@code always} - Suggest any matching suggest terms based on
 * tokens in the suggest text.</li>
 * </ul>
 *
 * @param suggestMode the mode to use; must not be {@code null}
 * @return this builder
 * @throws NullPointerException if {@code suggestMode} is {@code null}
 */
public TermSuggestionBuilder suggestMode(SuggestMode suggestMode) {
    // requireNonNull hands back its argument, so check and assign in one step.
    this.suggestMode = Objects.requireNonNull(suggestMode, "suggestMode must not be null");
    return this;
}
/**
 * Returns the suggest mode currently configured on this builder.
 *
 * @return the suggest mode
 */
public SuggestMode suggestMode() {
    return this.suggestMode;
}
/**
 * Sets how similar the suggested terms at least need to be compared to the
 * original suggest text tokens. A value between 0 and 1 (both inclusive)
 * can be specified. This value will be compared to the string distance
 * result of each candidate spelling correction.
 *
 * The initial value is {@code DEFAULT_ACCURACY}.
 *
 * @param accuracy the minimum similarity, in the range {@code [0, 1]}
 * @return this builder
 * @throws IllegalArgumentException if {@code accuracy} is NaN or lies
 *         outside {@code [0, 1]}
 */
public TermSuggestionBuilder accuracy(float accuracy) {
    // NaN makes every ordered comparison false, so without the explicit
    // check it would slip through both range tests.
    if (Float.isNaN(accuracy) || accuracy < 0.0f || accuracy > 1.0f) {
        throw new IllegalArgumentException("accuracy must be between 0 and 1");
    }
    this.accuracy = accuracy;
    return this;
}
/**
 * Returns the accuracy currently configured on this builder.
 *
 * @return the accuracy
 */
public float accuracy() {
    return this.accuracy;
}
/**
 * Sets how to sort the suggest terms per suggest text token. Two possible
 * values:
 * <ul>
 * <li>{@code score} - Sort should first be based on score, then
 * document frequency and then the term itself.</li>
 * <li>{@code frequency} - Sort should first be based on document
 * frequency, then score and then the term itself.</li>
 * </ul>
 *
 * What the score is depends on the suggester being used.
 *
 * @param sort the sort order to use; must not be {@code null}
 * @return this builder
 * @throws NullPointerException if {@code sort} is {@code null}
 */
public TermSuggestionBuilder sort(SortBy sort) {
    // requireNonNull hands back its argument, so check and assign in one step.
    this.sort = Objects.requireNonNull(sort, "sort must not be null");
    return this;
}
/**
 * Returns the sort order currently configured on this builder.
 *
 * @return the sort order
 */
public SortBy sort() {
    return this.sort;
}
/**
 * Sets what string distance implementation to use for comparing how similar
 * suggested terms are. Five possible values can be specified:
 * <ul>
 * <li>{@code internal} - This is the default and is based on
 * {@code damerau_levenshtein}, but highly optimized for comparing
 * string distance for terms inside the index.</li>
 * <li>{@code damerau_levenshtein} - String distance algorithm based on
 * Damerau-Levenshtein algorithm.</li>
 * <li>{@code levenstein} - String distance algorithm based on
 * Levenstein edit distance algorithm.</li>
 * <li>{@code jarowinkler} - String distance algorithm based on
 * Jaro-Winkler algorithm.</li>
 * <li>{@code ngram} - String distance algorithm based on character
 * n-grams.</li>
 * </ul>
 *
 * @param stringDistance the implementation to use; must not be {@code null}
 * @return this builder
 * @throws NullPointerException if {@code stringDistance} is {@code null}
 */
public TermSuggestionBuilder stringDistance(StringDistanceImpl stringDistance) {
    // requireNonNull hands back its argument, so check and assign in one step.
    this.stringDistance = Objects.requireNonNull(stringDistance, "stringDistance must not be null");
    return this;
}
/**
 * Returns the string distance implementation currently configured on this
 * builder.
 *
 * @return the string distance implementation
 */
public StringDistanceImpl stringDistance() {
    return this.stringDistance;
}
/**
 * Sets the maximum edit distance candidate suggestions can have in order to
 * be considered as a suggestion. Only the values 1 and 2 are accepted; any
 * other value is rejected. The initial value is {@code DEFAULT_MAX_EDITS}.
 *
 * @param maxEdits the maximum edit distance, either 1 or 2
 * @return this builder
 * @throws IllegalArgumentException if {@code maxEdits} is neither 1 nor 2
 */
public TermSuggestionBuilder maxEdits(int maxEdits) {
    final boolean accepted = maxEdits == 1 || maxEdits == 2;
    if (!accepted) {
        throw new IllegalArgumentException("maxEdits must be between 1 and 2");
    }
    this.maxEdits = maxEdits;
    return this;
}
/**
 * Returns the maximum edit distance currently configured on this builder.
 *
 * @return the maximum edit distance
 */
public int maxEdits() {
    return this.maxEdits;
}
/**
 * Sets a factor that is used to multiply with the size in order to inspect
 * more candidate suggestions. Can improve accuracy at the cost of
 * performance. The initial value is {@code DEFAULT_MAX_INSPECTIONS}.
 *
 * @param maxInspections the inspection factor; zero is accepted, negative
 *                       values are not
 * @return this builder
 * @throws IllegalArgumentException if {@code maxInspections} is negative
 */
public TermSuggestionBuilder maxInspections(int maxInspections) {
    final boolean negative = maxInspections < 0;
    if (negative) {
        throw new IllegalArgumentException("maxInspections must be positive");
    }
    this.maxInspections = maxInspections;
    return this;
}
/**
 * Returns the factor for inspecting more candidate suggestions that is
 * currently configured on this builder.
 *
 * @return the inspection factor
 */
public int maxInspections() {
    return this.maxInspections;
}
/**
 * Sets a maximum threshold in number of documents a suggest text token can
 * exist in order to be corrected. Can be a relative percentage number (e.g.
 * 0.4) or an absolute number to represent document frequencies. If a value
 * higher than 1 is specified then it must not be fractional. The initial
 * value is {@code DEFAULT_MAX_TERM_FREQ}.
 *
 * This can be used to exclude high frequency terms from being suggested.
 * High frequency terms are usually spelled correctly; on top of this, it
 * also improves the suggest performance.
 *
 * @param maxTermFreq the threshold; a fraction in {@code [0, 1]} or a whole
 *                    number greater than 1
 * @return this builder
 * @throws IllegalArgumentException if {@code maxTermFreq} is NaN or
 *         negative, or is greater than 1 and has a fractional part
 */
public TermSuggestionBuilder maxTermFreq(float maxTermFreq) {
    // NaN makes every ordered comparison false, so without the explicit
    // check it would pass both validations below.
    if (Float.isNaN(maxTermFreq) || maxTermFreq < 0.0f) {
        throw new IllegalArgumentException("maxTermFreq must be positive");
    }
    if (maxTermFreq > 1.0f && maxTermFreq != Math.floor(maxTermFreq)) {
        throw new IllegalArgumentException("if maxTermFreq is greater than 1, it must not be a fraction");
    }
    this.maxTermFreq = maxTermFreq;
    return this;
}
/**
 * Returns the maximum term frequency threshold currently configured on this
 * builder.
 *
 * @return the maximum term frequency threshold
 */
public float maxTermFreq() {
    return this.maxTermFreq;
}
/**
 * Sets the number of minimal prefix characters that must match in order to
 * be a candidate suggestion. Increasing this number improves suggest
 * performance. Usually misspellings don't occur in the beginning of terms.
 * The initial value is {@code DEFAULT_PREFIX_LENGTH}.
 *
 * @param prefixLength the required prefix length; zero is accepted,
 *                     negative values are not
 * @return this builder
 * @throws IllegalArgumentException if {@code prefixLength} is negative
 */
public TermSuggestionBuilder prefixLength(int prefixLength) {
    final boolean negative = prefixLength < 0;
    if (negative) {
        throw new IllegalArgumentException("prefixLength must be positive");
    }
    this.prefixLength = prefixLength;
    return this;
}
/**
 * Returns the minimum prefix length that must match, as currently
 * configured on this builder.
 *
 * @return the prefix length
 */
public int prefixLength() {
    return this.prefixLength;
}
/**
 * Sets the minimum length a suggest text term must have in order to be
 * corrected. The initial value is {@code DEFAULT_MIN_WORD_LENGTH}.
 *
 * @param minWordLength the minimum term length, at least 1
 * @return this builder
 * @throws IllegalArgumentException if {@code minWordLength} is less than 1
 */
public TermSuggestionBuilder minWordLength(int minWordLength) {
    // For an int, "less than 1" and "zero or below" are the same set.
    if (minWordLength <= 0) {
        throw new IllegalArgumentException("minWordLength must be greater or equal to 1");
    }
    this.minWordLength = minWordLength;
    return this;
}
/**
 * Returns the minimum length of a text term to be corrected, as currently
 * configured on this builder.
 *
 * @return the minimum word length
 */
public int minWordLength() {
    return this.minWordLength;
}
/**
 * Sets a minimal threshold in number of documents a suggested term should
 * appear in. This can be specified as an absolute number or as a relative
 * percentage of number of documents. This can improve quality by only
 * suggesting high frequency terms. If a value higher than 1 is specified
 * then the number cannot be fractional. The initial value is
 * {@code DEFAULT_MIN_DOC_FREQ}.
 *
 * @param minDocFreq the threshold; a fraction in {@code [0, 1]} or a whole
 *                   number greater than 1
 * @return this builder
 * @throws IllegalArgumentException if {@code minDocFreq} is NaN or
 *         negative, or is greater than 1 and has a fractional part
 */
public TermSuggestionBuilder minDocFreq(float minDocFreq) {
    // NaN makes every ordered comparison false, so without the explicit
    // check it would pass both validations below.
    if (Float.isNaN(minDocFreq) || minDocFreq < 0.0f) {
        throw new IllegalArgumentException("minDocFreq must be positive");
    }
    if (minDocFreq > 1.0f && minDocFreq != Math.floor(minDocFreq)) {
        throw new IllegalArgumentException("if minDocFreq is greater than 1, it must not be a fraction");
    }
    this.minDocFreq = minDocFreq;
    return this;
}
/**
 * Returns the minimal threshold for the frequency of a term appearing in
 * the document set, as currently configured on this builder.
 *
 * @return the minimum document frequency threshold
 */
public float minDocFreq() {
    return this.minDocFreq;
}
/**
 * Writes every term-specific option to {@code builder}, each under the
 * preferred name of its {@code *_FIELD} constant.
 *
 * @param builder the builder to write the fields to
 * @param params  not used by this method
 * @return the same {@code builder} that was passed in
 * @throws IOException declared for the {@code builder.field} calls
 */
@Override
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
    builder.field(SUGGESTMODE_FIELD.getPreferredName(), this.suggestMode);
    builder.field(ACCURACY_FIELD.getPreferredName(), this.accuracy);
    builder.field(SORT_FIELD.getPreferredName(), this.sort);
    builder.field(STRING_DISTANCE_FIELD.getPreferredName(), this.stringDistance);
    builder.field(MAX_EDITS_FIELD.getPreferredName(), this.maxEdits);
    builder.field(MAX_INSPECTIONS_FIELD.getPreferredName(), this.maxInspections);
    builder.field(MAX_TERM_FREQ_FIELD.getPreferredName(), this.maxTermFreq);
    builder.field(PREFIX_LENGTH_FIELD.getPreferredName(), this.prefixLength);
    builder.field(MIN_WORD_LENGTH_FIELD.getPreferredName(), this.minWordLength);
    builder.field(MIN_DOC_FREQ_FIELD.getPreferredName(), this.minDocFreq);
    return builder;
}
/**
 * Parses a term suggestion from the parser of {@code parseContext}. Tokens
 * are consumed until the closing {@code END_OBJECT}; every value token is
 * dispatched on the name of the field that precedes it.
 *
 * The options are first collected on a temporary builder created with the
 * placeholder field {@code "_na_"}, because the real field name may appear
 * anywhere in the object. Once parsing is done, the collected options are
 * copied onto a new builder for the parsed field name.
 *
 * @param parseContext supplies the parser to read from
 * @return a builder for the parsed field carrying all parsed options
 * @throws ParsingException if an unknown field or a non-value token
 *         (other than a field name) is encountered
 * @throws ElasticsearchParseException if the required field option is missing
 * @throws IllegalArgumentException if a setter rejects a parsed value
 * @throws IOException declared for the parser calls
 */
static TermSuggestionBuilder innerFromXContent(QueryParseContext parseContext) throws IOException {
    XContentParser parser = parseContext.parser();
    TermSuggestionBuilder tmpSuggestion = new TermSuggestionBuilder("_na_");
    XContentParser.Token token;
    String currentFieldName = null;
    String fieldname = null;
    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
            currentFieldName = parser.currentName();
        } else if (token.isValue()) {
            if (SuggestionBuilder.ANALYZER_FIELD.match(currentFieldName)) {
                tmpSuggestion.analyzer(parser.text());
            } else if (SuggestionBuilder.FIELDNAME_FIELD.match(currentFieldName)) {
                fieldname = parser.text();
            } else if (SuggestionBuilder.SIZE_FIELD.match(currentFieldName)) {
                tmpSuggestion.size(parser.intValue());
            } else if (SuggestionBuilder.SHARDSIZE_FIELD.match(currentFieldName)) {
                tmpSuggestion.shardSize(parser.intValue());
            } else if (SUGGESTMODE_FIELD.match(currentFieldName)) {
                tmpSuggestion.suggestMode(SuggestMode.resolve(parser.text()));
            } else if (ACCURACY_FIELD.match(currentFieldName)) {
                tmpSuggestion.accuracy(parser.floatValue());
            } else if (SORT_FIELD.match(currentFieldName)) {
                tmpSuggestion.sort(SortBy.resolve(parser.text()));
            } else if (STRING_DISTANCE_FIELD.match(currentFieldName)) {
                tmpSuggestion.stringDistance(StringDistanceImpl.resolve(parser.text()));
            } else if (MAX_EDITS_FIELD.match(currentFieldName)) {
                tmpSuggestion.maxEdits(parser.intValue());
            } else if (MAX_INSPECTIONS_FIELD.match(currentFieldName)) {
                tmpSuggestion.maxInspections(parser.intValue());
            } else if (MAX_TERM_FREQ_FIELD.match(currentFieldName)) {
                tmpSuggestion.maxTermFreq(parser.floatValue());
            } else if (PREFIX_LENGTH_FIELD.match(currentFieldName)) {
                tmpSuggestion.prefixLength(parser.intValue());
            } else if (MIN_WORD_LENGTH_FIELD.match(currentFieldName)) {
                tmpSuggestion.minWordLength(parser.intValue());
            } else if (MIN_DOC_FREQ_FIELD.match(currentFieldName)) {
                tmpSuggestion.minDocFreq(parser.floatValue());
            } else {
                throw new ParsingException(parser.getTokenLocation(),
                        "suggester[term] doesn't support field [" + currentFieldName + "]");
            }
        } else {
            // Nested objects and arrays are not part of the term suggester syntax.
            throw new ParsingException(parser.getTokenLocation(), "suggester[term] parsing failed on [" + currentFieldName + "]");
        }
    }
    // now we should have field name, check and copy fields over to the suggestion builder we return
    if (fieldname == null) {
        throw new ElasticsearchParseException(
                "the required field option [" + SuggestionBuilder.FIELDNAME_FIELD.getPreferredName() + "] is missing");
    }
    return new TermSuggestionBuilder(fieldname, tmpSuggestion);
}
/**
 * Not supported by this class: the method throws on every call and never
 * returns a context.
 *
 * @param context not used
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public SuggestionContext build(QueryShardContext context) throws IOException {
throw new UnsupportedOperationException("querybuilders does not support this operation.");
}
/**
 * Returns the writeable name of this suggestion builder.
 *
 * @return the constant {@code SUGGESTION_NAME}, i.e. {@code "term"}
 */
@Override
public String getWriteableName() {
return SUGGESTION_NAME;
}
/**
 * Compares the term-specific options of this builder with those of
 * {@code other}.
 *
 * @param other the builder to compare with
 * @return {@code true} if all ten term-specific options are equal
 */
@Override
protected boolean doEquals(TermSuggestionBuilder other) {
    // The two nested enums are compared by identity, ints with ==, and floats
    // through Float.compare, which agrees with Float.equals (NaN equals NaN,
    // 0.0f differs from -0.0f). This matches boxed Objects.equals exactly.
    return suggestMode == other.suggestMode
            && Float.compare(accuracy, other.accuracy) == 0
            && Objects.equals(sort, other.sort)
            && stringDistance == other.stringDistance
            && maxEdits == other.maxEdits
            && maxInspections == other.maxInspections
            && Float.compare(maxTermFreq, other.maxTermFreq) == 0
            && prefixLength == other.prefixLength
            && minWordLength == other.minWordLength
            && Float.compare(minDocFreq, other.minDocFreq) == 0;
}
/**
 * Computes a hash code over the ten term-specific options, in the order
 * suggestMode, accuracy, sort, stringDistance, maxEdits, maxInspections,
 * maxTermFreq, prefixLength, minWordLength, minDocFreq.
 *
 * @return the combined hash code
 */
@Override
protected int doHashCode() {
    // Unrolled form of Objects.hash(...): start at 1 and fold each element in
    // with 31 * h + elementHash. A boxed Float hashes to floatToIntBits of its
    // value and a boxed Integer hashes to its value, so the result is the same.
    int h = 1;
    h = 31 * h + Objects.hashCode(suggestMode);
    h = 31 * h + Float.floatToIntBits(accuracy);
    h = 31 * h + Objects.hashCode(sort);
    h = 31 * h + Objects.hashCode(stringDistance);
    h = 31 * h + maxEdits;
    h = 31 * h + maxInspections;
    h = 31 * h + Float.floatToIntBits(maxTermFreq);
    h = 31 * h + prefixLength;
    h = 31 * h + minWordLength;
    h = 31 * h + Float.floatToIntBits(minDocFreq);
    return h;
}
/** An enum representing the valid suggest modes. */
public enum SuggestMode implements Writeable {
    /** Only suggest terms in the suggest text that aren't in the index. This is the default. */
    MISSING(org.apache.lucene.search.spell.SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX),
    /** Only suggest terms that occur in more docs than the original suggest text term. */
    POPULAR(org.apache.lucene.search.spell.SuggestMode.SUGGEST_MORE_POPULAR),
    /** Suggest any matching suggest terms based on tokens in the suggest text. */
    ALWAYS(org.apache.lucene.search.spell.SuggestMode.SUGGEST_ALWAYS);

    /** The Lucene suggest mode this constant maps to. */
    private final org.apache.lucene.search.spell.SuggestMode luceneMode;

    SuggestMode(final org.apache.lucene.search.spell.SuggestMode luceneMode) {
        this.luceneMode = luceneMode;
    }

    /**
     * Writes this constant to the stream as its ordinal.
     *
     * @param out the stream to write to
     * @throws IOException if writing fails
     */
    @Override
    public void writeTo(final StreamOutput out) throws IOException {
        out.writeVInt(ordinal());
    }

    /**
     * Reads an ordinal from the stream and returns the matching constant.
     *
     * @param in the stream to read from
     * @return the constant with the ordinal that was read
     * @throws IOException if reading fails or the ordinal matches no constant
     */
    public static SuggestMode readFromStream(final StreamInput in) throws IOException {
        final int ordinal = in.readVInt();
        final SuggestMode[] modes = values();
        if (ordinal >= 0 && ordinal < modes.length) {
            return modes[ordinal];
        }
        throw new IOException("Unknown SuggestMode ordinal [" + ordinal + "]");
    }

    /**
     * Resolves a constant from its name, ignoring case.
     *
     * @param str the name of the mode; must not be {@code null}
     * @return the matching constant
     * @throws NullPointerException if {@code str} is {@code null}
     * @throws IllegalArgumentException if no constant has that name
     */
    public static SuggestMode resolve(final String str) {
        final String upperCased = Objects.requireNonNull(str, "Input string is null").toUpperCase(Locale.ROOT);
        return valueOf(upperCased);
    }

    /**
     * Returns the Lucene suggest mode that corresponds to this constant.
     *
     * @return the Lucene suggest mode
     */
    public org.apache.lucene.search.spell.SuggestMode toLucene() {
        return luceneMode;
    }
}
/** An enum representing the valid string edit distance algorithms for determining suggestions. */
public enum StringDistanceImpl implements Writeable {
    /**
     * This is the default and is based on {@code damerau_levenshtein}, but highly optimized
     * for comparing string distance for terms inside the index.
     */
    INTERNAL {
        @Override
        public StringDistance toLucene() {
            return DirectSpellChecker.INTERNAL_LEVENSHTEIN;
        }
    },
    /** String distance algorithm based on Damerau-Levenshtein algorithm. */
    DAMERAU_LEVENSHTEIN {
        @Override
        public StringDistance toLucene() {
            return new LuceneLevenshteinDistance();
        }
    },
    /** String distance algorithm based on Levenstein edit distance algorithm. */
    LEVENSTEIN {
        @Override
        public StringDistance toLucene() {
            return new LevensteinDistance();
        }
    },
    /** String distance algorithm based on Jaro-Winkler algorithm. */
    JAROWINKLER {
        @Override
        public StringDistance toLucene() {
            return new JaroWinklerDistance();
        }
    },
    /** String distance algorithm based on character n-grams. */
    NGRAM {
        @Override
        public StringDistance toLucene() {
            return new NGramDistance();
        }
    };

    /**
     * Writes this constant to the stream as its ordinal.
     *
     * @param out the stream to write to
     * @throws IOException if writing fails
     */
    @Override
    public void writeTo(final StreamOutput out) throws IOException {
        out.writeVInt(ordinal());
    }

    /**
     * Reads an ordinal from the stream and returns the matching constant.
     *
     * @param in the stream to read from
     * @return the constant with the ordinal that was read
     * @throws IOException if reading fails or the ordinal matches no constant
     */
    public static StringDistanceImpl readFromStream(final StreamInput in) throws IOException {
        int ordinal = in.readVInt();
        if (ordinal < 0 || ordinal >= values().length) {
            throw new IOException("Unknown StringDistanceImpl ordinal [" + ordinal + "]");
        }
        return values()[ordinal];
    }

    /**
     * Resolves a constant from its option name, ignoring case. Accepted names
     * are {@code internal}, {@code damerau_levenshtein} (also spelled
     * {@code damerauLevenshtein}), {@code levenstein}, {@code ngram} and
     * {@code jarowinkler}.
     *
     * @param str the option name; must not be {@code null}
     * @return the matching constant
     * @throws NullPointerException if {@code str} is {@code null}
     * @throws IllegalArgumentException if the name is not one of the accepted options
     */
    public static StringDistanceImpl resolve(final String str) {
        Objects.requireNonNull(str, "Input string is null");
        final String distanceVal = str.toLowerCase(Locale.ROOT);
        // The input has been lower-cased, so every label must be lower case as
        // well; a mixed-case label could never match.
        switch (distanceVal) {
            case "internal":
                return INTERNAL;
            case "damerau_levenshtein":
            case "dameraulevenshtein":
                return DAMERAU_LEVENSHTEIN;
            case "levenstein":
                return LEVENSTEIN;
            case "ngram":
                return NGRAM;
            case "jarowinkler":
                return JAROWINKLER;
            default: throw new IllegalArgumentException("Illegal distance option " + str);
        }
    }

    /**
     * Returns the Lucene string distance that corresponds to this constant.
     *
     * @return the Lucene string distance implementation
     */
    public abstract StringDistance toLucene();
}
}