org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder Maven / Gradle / Ivy
Show all versions of elasticsearch Show documentation
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.search.suggest.SortBy;
import org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder.CandidateGenerator;
import java.io.IOException;
import java.util.Locale;
import java.util.Objects;
import java.util.function.Consumer;
public final class DirectCandidateGeneratorBuilder implements CandidateGenerator {
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(
Loggers.getLogger(DirectCandidateGeneratorBuilder.class));
private static final String TYPE = "direct_generator";
public static final ParseField DIRECT_GENERATOR_FIELD = new ParseField(TYPE);
public static final ParseField FIELDNAME_FIELD = new ParseField("field");
public static final ParseField PREFILTER_FIELD = new ParseField("pre_filter");
public static final ParseField POSTFILTER_FIELD = new ParseField("post_filter");
public static final ParseField SUGGESTMODE_FIELD = new ParseField("suggest_mode");
public static final ParseField MIN_DOC_FREQ_FIELD = new ParseField("min_doc_freq");
public static final ParseField ACCURACY_FIELD = new ParseField("accuracy");
public static final ParseField SIZE_FIELD = new ParseField("size");
public static final ParseField SORT_FIELD = new ParseField("sort");
public static final ParseField STRING_DISTANCE_FIELD = new ParseField("string_distance");
public static final ParseField MAX_EDITS_FIELD = new ParseField("max_edits");
public static final ParseField MAX_INSPECTIONS_FIELD = new ParseField("max_inspections");
public static final ParseField MAX_TERM_FREQ_FIELD = new ParseField("max_term_freq");
public static final ParseField PREFIX_LENGTH_FIELD = new ParseField("prefix_length");
public static final ParseField MIN_WORD_LENGTH_FIELD = new ParseField("min_word_length");
private final String field;
private String preFilter;
private String postFilter;
private String suggestMode;
private Float accuracy;
private Integer size;
private String sort;
private String stringDistance;
private Integer maxEdits;
private Integer maxInspections;
private Float maxTermFreq;
private Integer prefixLength;
private Integer minWordLength;
private Float minDocFreq;
/**
* @param field Sets from what field to fetch the candidate suggestions from.
*/
public DirectCandidateGeneratorBuilder(String field) {
this.field = field;
}
/**
* Read from a stream.
*/
public DirectCandidateGeneratorBuilder(StreamInput in) throws IOException {
field = in.readString();
suggestMode = in.readOptionalString();
accuracy = in.readOptionalFloat();
size = in.readOptionalVInt();
sort = in.readOptionalString();
stringDistance = in.readOptionalString();
maxEdits = in.readOptionalVInt();
maxInspections = in.readOptionalVInt();
maxTermFreq = in.readOptionalFloat();
prefixLength = in.readOptionalVInt();
minWordLength = in.readOptionalVInt();
minDocFreq = in.readOptionalFloat();
preFilter = in.readOptionalString();
postFilter = in.readOptionalString();
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(field);
out.writeOptionalString(suggestMode);
out.writeOptionalFloat(accuracy);
out.writeOptionalVInt(size);
out.writeOptionalString(sort);
out.writeOptionalString(stringDistance);
out.writeOptionalVInt(maxEdits);
out.writeOptionalVInt(maxInspections);
out.writeOptionalFloat(maxTermFreq);
out.writeOptionalVInt(prefixLength);
out.writeOptionalVInt(minWordLength);
out.writeOptionalFloat(minDocFreq);
out.writeOptionalString(preFilter);
out.writeOptionalString(postFilter);
}
String field() {
return this.field;
}
/**
* The global suggest mode controls what suggested terms are included or
* controls for what suggest text tokens, terms should be suggested for.
* Three possible values can be specified:
*
* missing
- Only suggest terms in the suggest text
* that aren't in the index. This is the default.
* popular
- Only suggest terms that occur in more docs
* then the original suggest text term.
* always
- Suggest any matching suggest terms based on
* tokens in the suggest text.
*
*/
public DirectCandidateGeneratorBuilder suggestMode(String suggestMode) {
this.suggestMode = suggestMode;
return this;
}
String suggestMode() {
return suggestMode;
}
/**
* Sets how similar the suggested terms at least need to be compared to
* the original suggest text tokens. A value between 0 and 1 can be
* specified. This value will be compared to the string distance result
* of each candidate spelling correction.
*
* Default is 0.5
*/
public DirectCandidateGeneratorBuilder accuracy(float accuracy) {
this.accuracy = accuracy;
return this;
}
Float accuracy() {
return this.accuracy;
}
/**
* Sets the maximum suggestions to be returned per suggest text term.
*/
public DirectCandidateGeneratorBuilder size(int size) {
if (size <= 0) {
throw new IllegalArgumentException("Size must be positive");
}
this.size = size;
return this;
}
Integer size() {
return size;
}
/**
* Sets how to sort the suggest terms per suggest text token. Two
* possible values:
*
* score
- Sort should first be based on score, then
* document frequency and then the term itself.
* frequency
- Sort should first be based on document
* frequency, then score and then the term itself.
*
*
* What the score is depends on the suggester being used.
*/
public DirectCandidateGeneratorBuilder sort(String sort) {
this.sort = sort;
return this;
}
String sort() {
return sort;
}
/**
* Sets what string distance implementation to use for comparing how
* similar suggested terms are. Four possible values can be specified:
*
* internal
- This is the default and is based on
* damerau_levenshtein
, but highly optimized for comparing
* string distance for terms inside the index.
* damerau_levenshtein
- String distance algorithm
* based on Damerau-Levenshtein algorithm.
* levenshtein
- String distance algorithm based on
* Levenshtein edit distance algorithm.
* jaro_winkler
- String distance algorithm based on
* Jaro-Winkler algorithm.
* ngram
- String distance algorithm based on character
* n-grams.
*
*/
public DirectCandidateGeneratorBuilder stringDistance(String stringDistance) {
this.stringDistance = stringDistance;
return this;
}
String stringDistance() {
return stringDistance;
}
/**
* Sets the maximum edit distance candidate suggestions can have in
* order to be considered as a suggestion. Can only be a value between 1
* and 2. Any other value result in an bad request error being thrown.
* Defaults to 2.
*/
public DirectCandidateGeneratorBuilder maxEdits(Integer maxEdits) {
if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("Illegal max_edits value " + maxEdits);
}
this.maxEdits = maxEdits;
return this;
}
Integer maxEdits() {
return maxEdits;
}
/**
* A factor that is used to multiply with the size in order to inspect
* more candidate suggestions. Can improve accuracy at the cost of
* performance. Defaults to 5.
*/
public DirectCandidateGeneratorBuilder maxInspections(Integer maxInspections) {
this.maxInspections = maxInspections;
return this;
}
Integer maxInspections() {
return maxInspections;
}
/**
* Sets a maximum threshold in number of documents a suggest text token
* can exist in order to be corrected. Can be a relative percentage
* number (e.g 0.4) or an absolute number to represent document
* frequencies. If an value higher than 1 is specified then fractional
* can not be specified. Defaults to 0.01.
*
* This can be used to exclude high frequency terms from being
* suggested. High frequency terms are usually spelled correctly on top
* of this this also improves the suggest performance.
*/
public DirectCandidateGeneratorBuilder maxTermFreq(float maxTermFreq) {
this.maxTermFreq = maxTermFreq;
return this;
}
Float maxTermFreq() {
return maxTermFreq;
}
/**
* Sets the number of minimal prefix characters that must match in order
* be a candidate suggestion. Defaults to 1. Increasing this number
* improves suggest performance. Usually misspellings don't occur in the
* beginning of terms.
*/
public DirectCandidateGeneratorBuilder prefixLength(int prefixLength) {
this.prefixLength = prefixLength;
return this;
}
Integer prefixLength() {
return prefixLength;
}
/**
* The minimum length a suggest text term must have in order to be
* corrected. Defaults to 4.
*/
public DirectCandidateGeneratorBuilder minWordLength(int minWordLength) {
this.minWordLength = minWordLength;
return this;
}
Integer minWordLength() {
return minWordLength;
}
/**
* Sets a minimal threshold in number of documents a suggested term
* should appear in. This can be specified as an absolute number or as a
* relative percentage of number of documents. This can improve quality
* by only suggesting high frequency terms. Defaults to 0f and is not
* enabled. If a value higher than 1 is specified then the number cannot
* be fractional.
*/
public DirectCandidateGeneratorBuilder minDocFreq(float minDocFreq) {
this.minDocFreq = minDocFreq;
return this;
}
Float minDocFreq() {
return minDocFreq;
}
/**
* Sets a filter (analyzer) that is applied to each of the tokens passed to this candidate generator.
* This filter is applied to the original token before candidates are generated.
*/
public DirectCandidateGeneratorBuilder preFilter(String preFilter) {
this.preFilter = preFilter;
return this;
}
String preFilter() {
return preFilter;
}
/**
* Sets a filter (analyzer) that is applied to each of the generated tokens
* before they are passed to the actual phrase scorer.
*/
public DirectCandidateGeneratorBuilder postFilter(String postFilter) {
this.postFilter = postFilter;
return this;
}
String postFilter() {
return postFilter;
}
/**
* gets the type identifier of this {@link CandidateGenerator}
*/
@Override
public String getType() {
return TYPE;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
outputFieldIfNotNull(field, FIELDNAME_FIELD, builder);
outputFieldIfNotNull(accuracy, ACCURACY_FIELD, builder);
outputFieldIfNotNull(maxEdits, MAX_EDITS_FIELD, builder);
outputFieldIfNotNull(maxInspections, MAX_INSPECTIONS_FIELD, builder);
outputFieldIfNotNull(maxTermFreq, MAX_TERM_FREQ_FIELD, builder);
outputFieldIfNotNull(minWordLength, MIN_WORD_LENGTH_FIELD, builder);
outputFieldIfNotNull(minDocFreq, MIN_DOC_FREQ_FIELD, builder);
outputFieldIfNotNull(preFilter, PREFILTER_FIELD, builder);
outputFieldIfNotNull(prefixLength, PREFIX_LENGTH_FIELD, builder);
outputFieldIfNotNull(postFilter, POSTFILTER_FIELD, builder);
outputFieldIfNotNull(suggestMode, SUGGESTMODE_FIELD, builder);
outputFieldIfNotNull(size, SIZE_FIELD, builder);
outputFieldIfNotNull(sort, SORT_FIELD, builder);
outputFieldIfNotNull(stringDistance, STRING_DISTANCE_FIELD, builder);
builder.endObject();
return builder;
}
private static void outputFieldIfNotNull(T value, ParseField field, XContentBuilder builder) throws IOException {
if (value != null) {
builder.field(field.getPreferredName(), value);
}
}
public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>(
TYPE, args -> new DirectCandidateGeneratorBuilder((String) args[0]));
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), FIELDNAME_FIELD);
PARSER.declareString(DirectCandidateGeneratorBuilder::preFilter, PREFILTER_FIELD);
PARSER.declareString(DirectCandidateGeneratorBuilder::postFilter, POSTFILTER_FIELD);
PARSER.declareString(DirectCandidateGeneratorBuilder::suggestMode, SUGGESTMODE_FIELD);
PARSER.declareFloat(DirectCandidateGeneratorBuilder::minDocFreq, MIN_DOC_FREQ_FIELD);
PARSER.declareFloat(DirectCandidateGeneratorBuilder::accuracy, ACCURACY_FIELD);
PARSER.declareInt(DirectCandidateGeneratorBuilder::size, SIZE_FIELD);
PARSER.declareString(DirectCandidateGeneratorBuilder::sort, SORT_FIELD);
PARSER.declareString(DirectCandidateGeneratorBuilder::stringDistance, STRING_DISTANCE_FIELD);
PARSER.declareInt(DirectCandidateGeneratorBuilder::maxInspections, MAX_INSPECTIONS_FIELD);
PARSER.declareFloat(DirectCandidateGeneratorBuilder::maxTermFreq, MAX_TERM_FREQ_FIELD);
PARSER.declareInt(DirectCandidateGeneratorBuilder::maxEdits, MAX_EDITS_FIELD);
PARSER.declareInt(DirectCandidateGeneratorBuilder::minWordLength, MIN_WORD_LENGTH_FIELD);
PARSER.declareInt(DirectCandidateGeneratorBuilder::prefixLength, PREFIX_LENGTH_FIELD);
}
@Override
public PhraseSuggestionContext.DirectCandidateGenerator build(MapperService mapperService) throws IOException {
PhraseSuggestionContext.DirectCandidateGenerator generator = new PhraseSuggestionContext.DirectCandidateGenerator();
generator.setField(this.field);
transferIfNotNull(this.size, generator::size);
if (this.preFilter != null) {
generator.preFilter(mapperService.getNamedAnalyzer(this.preFilter));
if (generator.preFilter() == null) {
throw new IllegalArgumentException("Analyzer [" + this.preFilter + "] doesn't exists");
}
}
if (this.postFilter != null) {
generator.postFilter(mapperService.getNamedAnalyzer(this.postFilter));
if (generator.postFilter() == null) {
throw new IllegalArgumentException("Analyzer [" + this.postFilter + "] doesn't exists");
}
}
transferIfNotNull(this.accuracy, generator::accuracy);
if (this.suggestMode != null) {
generator.suggestMode(resolveSuggestMode(this.suggestMode));
}
if (this.sort != null) {
generator.sort(SortBy.resolve(this.sort));
}
if (this.stringDistance != null) {
generator.stringDistance(resolveDistance(this.stringDistance));
}
transferIfNotNull(this.maxEdits, generator::maxEdits);
if (generator.maxEdits() < 1 || generator.maxEdits() > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("Illegal max_edits value " + generator.maxEdits());
}
transferIfNotNull(this.maxInspections, generator::maxInspections);
transferIfNotNull(this.maxTermFreq, generator::maxTermFreq);
transferIfNotNull(this.prefixLength, generator::prefixLength);
transferIfNotNull(this.minWordLength, generator::minWordLength);
transferIfNotNull(this.minDocFreq, generator::minDocFreq);
return generator;
}
private static SuggestMode resolveSuggestMode(String suggestMode) {
suggestMode = suggestMode.toLowerCase(Locale.US);
if ("missing".equals(suggestMode)) {
return SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
} else if ("popular".equals(suggestMode)) {
return SuggestMode.SUGGEST_MORE_POPULAR;
} else if ("always".equals(suggestMode)) {
return SuggestMode.SUGGEST_ALWAYS;
} else {
throw new IllegalArgumentException("Illegal suggest mode " + suggestMode);
}
}
static StringDistance resolveDistance(String distanceVal) {
distanceVal = distanceVal.toLowerCase(Locale.US);
if ("internal".equals(distanceVal)) {
return DirectSpellChecker.INTERNAL_LEVENSHTEIN;
} else if ("damerau_levenshtein".equals(distanceVal) || "damerauLevenshtein".equals(distanceVal)) {
return new LuceneLevenshteinDistance();
} else if ("levenstein".equals(distanceVal)) {
DEPRECATION_LOGGER.deprecated("Deprecated distance [levenstein] used, replaced by [levenshtein]");
return new LevensteinDistance();
} else if ("levenshtein".equals(distanceVal)) {
return new LevensteinDistance();
} else if ("jarowinkler".equals(distanceVal)) {
DEPRECATION_LOGGER.deprecated("Deprecated distance [jarowinkler] used, replaced by [jaro_winkler]");
return new JaroWinklerDistance();
} else if ("jaro_winkler".equals(distanceVal)) {
return new JaroWinklerDistance();
} else if ("ngram".equals(distanceVal)) {
return new NGramDistance();
} else {
throw new IllegalArgumentException("Illegal distance option " + distanceVal);
}
}
private static void transferIfNotNull(T value, Consumer consumer) {
if (value != null) {
consumer.accept(value);
}
}
@Override
public String toString() {
try {
XContentBuilder builder = XContentFactory.jsonBuilder();
builder.prettyPrint();
toXContent(builder, EMPTY_PARAMS);
return builder.string();
} catch (Exception e) {
return "{ \"error\" : \"" + ExceptionsHelper.detailedMessage(e) + "\"}";
}
}
@Override
public int hashCode() {
return Objects.hash(field, preFilter, postFilter, suggestMode, accuracy,
size, sort, stringDistance, maxEdits, maxInspections,
maxTermFreq, prefixLength, minWordLength, minDocFreq);
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null || getClass() != obj.getClass()) {
return false;
}
DirectCandidateGeneratorBuilder other = (DirectCandidateGeneratorBuilder) obj;
return Objects.equals(field, other.field) &&
Objects.equals(preFilter, other.preFilter) &&
Objects.equals(postFilter, other.postFilter) &&
Objects.equals(suggestMode, other.suggestMode) &&
Objects.equals(accuracy, other.accuracy) &&
Objects.equals(size, other.size) &&
Objects.equals(sort, other.sort) &&
Objects.equals(stringDistance, other.stringDistance) &&
Objects.equals(maxEdits, other.maxEdits) &&
Objects.equals(maxInspections, other.maxInspections) &&
Objects.equals(maxTermFreq, other.maxTermFreq) &&
Objects.equals(prefixLength, other.prefixLength) &&
Objects.equals(minWordLength, other.minWordLength) &&
Objects.equals(minDocFreq, other.minDocFreq);
}
}