// org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder (Maven / Gradle / Ivy)
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.suggest.SuggestBuilder.SuggestionBuilder;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
/**
* Defines the actual suggest command for phrase suggestions ({@code phrase}).
*/
public final class PhraseSuggestionBuilder extends SuggestionBuilder {
private Float maxErrors;
private String separator;
private Float realWordErrorLikelihood;
private Float confidence;
private final Map> generators = new HashMap>();
private Integer gramSize;
private SmoothingModel model;
private Boolean forceUnigrams;
private Integer tokenLimit;
private String preTag;
private String postTag;
/**
* Creates a phrase suggestion builder by passing the given name together
* with the literal {@code "phrase"} to the {@link SuggestionBuilder}
* constructor.
*
* @param name
*            forwarded unchanged to the superclass; presumably the name the
*            suggestion is reported under (confirm against SuggestionBuilder)
*/
public PhraseSuggestionBuilder(String name) {
super(name, "phrase");
}
/**
 * Configures the size of the n-grams used by the n-gram model of this
 * suggester. The default value is 1, which corresponds to unigrams; use 2
 * for bigrams and 3 for trigrams.
 *
 * @param gramSize
 *            the n-gram size, at least 1
 * @return this builder, for chaining
 * @throws ElasticSearchIllegalArgumentException
 *             if {@code gramSize} is smaller than 1
 */
public PhraseSuggestionBuilder gramSize(int gramSize) {
    if (gramSize >= 1) {
        this.gramSize = gramSize;
        return this;
    }
    throw new ElasticSearchIllegalArgumentException("gramSize must be >= 1");
}
/**
* Sets the maximum share of the terms that may be considered misspellings
* in order to form a correction. The value is either a float in the range
* [0..1), interpreted as a fraction of the actual query terms, or a number
* of 1 or more, interpreted as an absolute number of query terms.
*
* The default is 1.0, which means that only corrections with at most 1
* misspelled term are returned.
*
* @param maxErrors
*            the limit to use; stored as given, so {@code null} leaves the
*            option out of the serialized request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder maxErrors(Float maxErrors) {
this.maxErrors = maxErrors;
return this;
}
/**
* Sets the separator that is used to separate terms in the bigram field. If
* not set, the whitespace character is used as the separator.
*
* @param separator
*            the separator string; stored as given, so {@code null} leaves
*            the option out of the serialized request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder separator(String separator) {
this.separator = separator;
return this;
}
/**
* Sets the likelihood of a term being misspelled even if the term exists in
* the dictionary. The default is 0.95, corresponding to 5% of the real
* words being misspelled.
*
* @param realWordErrorLikelihood
*            the likelihood to use; stored as given, so {@code null} leaves
*            the option out of the serialized request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder realWordErrorLikelihood(Float realWordErrorLikelihood) {
this.realWordErrorLikelihood = realWordErrorLikelihood;
return this;
}
/**
* Sets the confidence level for this suggester. The confidence level
* defines a factor applied to the input phrase's score, which is then used
* as a threshold for other suggest candidates. Only candidates that score
* higher than the threshold are included in the result. For instance, a
* confidence level of 1.0 only returns suggestions that score higher than
* the input phrase. If set to 0.0 the top N candidates are returned. The
* default is 1.0.
*
* @param confidence
*            the confidence factor; stored as given, so {@code null} leaves
*            the option out of the serialized request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder confidence(Float confidence) {
this.confidence = confidence;
return this;
}
/**
 * Adds a {@link CandidateGenerator} to this suggester. The
 * {@link CandidateGenerator} is used to draw candidates for each individual
 * phrase term before the candidates are scored.
 *
 * Generators are grouped by {@link CandidateGenerator#getType()}; within a
 * group they keep the order in which they were added.
 *
 * @param generator
 *            the generator to register; must not be {@code null} because
 *            its type is read immediately
 * @return this builder, for chaining
 */
public PhraseSuggestionBuilder addCandidateGenerator(CandidateGenerator generator) {
    final String type = generator.getType();
    List<CandidateGenerator> list = this.generators.get(type);
    if (list == null) {
        // First generator of this type: create its group lazily.
        list = new ArrayList<CandidateGenerator>();
        this.generators.put(type, list);
    }
    list.add(generator);
    return this;
}
/**
 * Removes every candidate generator that has been added to this builder.
 *
 * @return this builder, for chaining
 */
public PhraseSuggestionBuilder clearCandidateGenerators() {
    generators.clear();
    return this;
}
/**
* If set to {@code true} the phrase suggester will fail if the analyzer
* only produces ngrams. The default is {@code true}.
*
* @param forceUnigrams
*            the flag value to send with the request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder forceUnigrams(boolean forceUnigrams) {
this.forceUnigrams = forceUnigrams;
return this;
}
/**
* Sets an explicit smoothing model used for this suggester. The default is
* {@link StupidBackoff}.
*
* @param model
*            the smoothing model; stored as given, so {@code null} leaves
*            the {@code smoothing} object out of the serialized request
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder smoothingModel(SmoothingModel model) {
this.model = model;
return this;
}
/**
* Sets the token limit that is sent with the request as
* {@code token_limit}. The value is stored without validation.
*
* NOTE(review): the exact meaning of the limit is not visible in this
* class - presumably it caps the number of tokens the phrase suggester
* processes; confirm against the server-side parser.
*
* @param tokenLimit
*            the limit to send
* @return this builder, for chaining
*/
public PhraseSuggestionBuilder tokenLimit(int tokenLimit) {
this.tokenLimit = tokenLimit;
return this;
}
/**
 * Configures highlighting for suggestions. Once configured, a highlight
 * field is returned with the suggestions in which changed tokens are
 * wrapped in {@code preTag} and {@code postTag}.
 *
 * The two tags have to be supplied together: either both are
 * {@code null} (no {@code highlight} object is serialized) or both are
 * non-null.
 *
 * @param preTag
 *            text placed in front of a changed token
 * @param postTag
 *            text placed behind a changed token
 * @return this builder, for chaining
 * @throws ElasticSearchIllegalArgumentException
 *             if exactly one of the two tags is {@code null}
 */
public PhraseSuggestionBuilder highlight(String preTag, String postTag) {
    final boolean preMissing = preTag == null;
    final boolean postMissing = postTag == null;
    // XOR is true exactly when one tag is given and the other is not.
    if (preMissing ^ postMissing) {
        throw new ElasticSearchIllegalArgumentException("Pre and post tag must both be null or both not be null.");
    }
    this.preTag = preTag;
    this.postTag = postTag;
    return this;
}
@Override
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
if (realWordErrorLikelihood != null) {
builder.field("real_word_error_likelihood", realWordErrorLikelihood);
}
if (confidence != null) {
builder.field("confidence", confidence);
}
if (separator != null) {
builder.field("separator", separator);
}
if (maxErrors != null) {
builder.field("max_errors", maxErrors);
}
if (gramSize != null) {
builder.field("gram_size", gramSize);
}
if (forceUnigrams != null) {
builder.field("force_unigrams", forceUnigrams);
}
if (tokenLimit != null) {
builder.field("token_limit", tokenLimit);
}
if (!generators.isEmpty()) {
Set>> entrySet = generators.entrySet();
for (Entry> entry : entrySet) {
builder.startArray(entry.getKey());
for (CandidateGenerator generator : entry.getValue()) {
generator.toXContent(builder, params);
}
builder.endArray();
}
}
if (model != null) {
builder.startObject("smoothing");
model.toXContent(builder, params);
builder.endObject();
}
if (preTag != null) {
builder.startObject("highlight");
builder.field("pre_tag", preTag);
builder.field("post_tag", postTag);
builder.endObject();
}
return builder;
}
/**
 * Factory method for a new {@link DirectCandidateGenerator}.
 *
 * @param field
 *            the field the candidate generator operates on
 * @return a freshly created generator bound to {@code field}
 */
public static DirectCandidateGenerator candidateGenerator(String field) {
    final DirectCandidateGenerator generator = new DirectCandidateGenerator(field);
    return generator;
}
/**
 * A "stupid-backoff" smoothing model similar to Katz's Backoff. This model
 * is used as the default if no model is configured.
 *
 * See the literature on n-gram smoothing for details.
 */
public static final class StupidBackoff extends SmoothingModel {

    private final double discount;

    /**
     * Creates a Stupid-Backoff smoothing model that is serialized under
     * the type name {@code stupid_backoff}.
     *
     * @param discount
     *            the discount given to lower order ngrams if the higher
     *            order ngram doesn't exist
     */
    public StupidBackoff(double discount) {
        super("stupid_backoff");
        this.discount = discount;
    }

    /**
     * Writes the {@code discount} value into the model's object.
     */
    @Override
    protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
        return builder.field("discount", discount);
    }
}
/**
 * An additive smoothing model.
 *
 * See the literature on n-gram smoothing for details.
 */
public static final class Laplace extends SmoothingModel {

    private final double alpha;

    /**
     * Creates a Laplace smoothing model that is serialized under the type
     * name {@code laplace}.
     *
     * @param alpha
     *            the value written to the {@code alpha} field; presumably
     *            the additive smoothing constant (confirm against the
     *            server-side implementation)
     */
    public Laplace(double alpha) {
        super("laplace");
        this.alpha = alpha;
    }

    /**
     * Writes the {@code alpha} value into the model's object.
     */
    @Override
    protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
        return builder.field("alpha", alpha);
    }
}
/**
 * Base class of the smoothing models. It takes care of wrapping the
 * model specific fields in an object named after the model type;
 * subclasses only contribute those fields.
 */
public abstract static class SmoothingModel implements ToXContent {

    private final String type;

    /**
     * @param type
     *            name of the object the model's fields are wrapped in
     */
    protected SmoothingModel(String type) {
        this.type = type;
    }

    /**
     * Opens an object named after the model type, lets the subclass write
     * its fields and closes the object again.
     */
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(type);
        innerToXContent(builder, params);
        return builder.endObject();
    }

    /**
     * Writes the model specific fields. Called between the start and the
     * end of the model's object.
     */
    protected abstract XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException;
}
/**
 * Linear interpolation smoothing model.
 *
 * See the literature on n-gram smoothing for details.
 */
public static final class LinearInterpolation extends SmoothingModel {

    private final double trigramLambda;
    private final double bigramLambda;
    private final double unigramLambda;

    /**
     * Creates a linear interpolation smoothing model that is serialized
     * under the type name {@code linear}.
     *
     * Note: the lambdas must sum up to one. This constructor does not
     * check that.
     *
     * @param trigramLambda
     *            the trigram lambda
     * @param bigramLambda
     *            the bigram lambda
     * @param unigramLambda
     *            the unigram lambda
     */
    public LinearInterpolation(double trigramLambda, double bigramLambda, double unigramLambda) {
        super("linear");
        this.trigramLambda = trigramLambda;
        this.bigramLambda = bigramLambda;
        this.unigramLambda = unigramLambda;
    }

    /**
     * Writes the three lambdas, trigram first, into the model's object.
     */
    @Override
    protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
        return builder
                .field("trigram_lambda", trigramLambda)
                .field("bigram_lambda", bigramLambda)
                .field("unigram_lambda", unigramLambda);
    }
}
/**
 * Base class of all candidate generators. A generator carries an
 * immutable type string and knows how to serialize itself.
 */
public abstract static class CandidateGenerator implements ToXContent {

    private final String type;

    /**
     * @param type
     *            the type string reported by {@link #getType()}
     */
    public CandidateGenerator(String type) {
        this.type = type;
    }

    /**
     * @return the type string given at construction time
     */
    public String getType() {
        return type;
    }
}
/**
* A {@link CandidateGenerator} of type {@code direct_generator}. Its options
* are configured through chainable setters; every option that was never set
* stays {@code null} and is left out when the generator is serialized.
*/
public static final class DirectCandidateGenerator extends CandidateGenerator {
// The field is fixed at construction time; all other options are optional.
private final String field;
private String preFilter;
private String postFilter;
private String suggestMode;
private Float accuracy;
private Integer size;
private String sort;
private String stringDistance;
private Integer maxEdits;
private Integer maxInspections;
private Float maxTermFreq;
private Integer prefixLength;
private Integer minWordLength;
private Float minDocFreq;
/**
* Creates a generator that fetches its candidate suggestions from the
* given field. The field can only be supplied here; this class has no
* setter for it.
*
* @param field
*            the field to fetch the candidate suggestions from
*/
public DirectCandidateGenerator(String field) {
super("direct_generator");
this.field = field;
}
/**
* The global suggest mode controls what suggested terms are included or
* controls for what suggest text tokens, terms should be suggested for.
* Three possible values can be specified:
* <ul>
* <li>{@code missing} - Only suggest terms in the suggest text that
* aren't in the index. This is the default.</li>
* <li>{@code popular} - Only suggest terms that occur in more docs than
* the original suggest text term.</li>
* <li>{@code always} - Suggest any matching suggest terms based on
* tokens in the suggest text.</li>
* </ul>
* The value is stored as given; it is not validated here.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator suggestMode(String suggestMode) {
this.suggestMode = suggestMode;
return this;
}
/**
* Sets how similar the suggested terms at least need to be compared to
* the original suggest text tokens. A value between 0 and 1 can be
* specified. This value will be compared to the string distance result
* of each candidate spelling correction.
*
* Default is 0.5.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator accuracy(float accuracy) {
this.accuracy = accuracy;
return this;
}
/**
* Sets the maximum suggestions to be returned per suggest text term.
*
* @return this generator, for chaining
* @throws ElasticSearchIllegalArgumentException
*             if {@code size} is zero or negative
*/
public DirectCandidateGenerator size(int size) {
if (size <= 0) {
throw new ElasticSearchIllegalArgumentException("Size must be positive");
}
this.size = size;
return this;
}
/**
* Sets how to sort the suggest terms per suggest text token. Two
* possible values:
* <ul>
* <li>{@code score} - Sort should first be based on score, then
* document frequency and then the term itself.</li>
* <li>{@code frequency} - Sort should first be based on document
* frequency, then score and then the term itself.</li>
* </ul>
* What the score is depends on the suggester being used. The value is
* stored as given; it is not validated here.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator sort(String sort) {
this.sort = sort;
return this;
}
/**
* Sets what string distance implementation to use for comparing how
* similar suggested terms are. Five possible values can be specified:
* <ul>
* <li>{@code internal} - This is the default and is based on
* {@code damerau_levenshtein}, but highly optimized for comparing
* string distance for terms inside the index.</li>
* <li>{@code damerau_levenshtein} - String distance algorithm based on
* Damerau-Levenshtein algorithm.</li>
* <li>{@code levenstein} - String distance algorithm based on
* Levenstein edit distance algorithm.</li>
* <li>{@code jarowinkler} - String distance algorithm based on
* Jaro-Winkler algorithm.</li>
* <li>{@code ngram} - String distance algorithm based on character
* n-grams.</li>
* </ul>
* The value is stored as given; it is not validated here.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator stringDistance(String stringDistance) {
this.stringDistance = stringDistance;
return this;
}
/**
* Sets the maximum edit distance candidate suggestions can have in
* order to be considered as a suggestion. Can only be a value between 1
* and 2. Any other value results in a bad request error being thrown;
* that check does not happen in this setter, which stores the value as
* given. Defaults to 2.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator maxEdits(Integer maxEdits) {
this.maxEdits = maxEdits;
return this;
}
/**
* A factor that is used to multiply with the size in order to inspect
* more candidate suggestions. Can improve accuracy at the cost of
* performance. Defaults to 5.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator maxInspections(Integer maxInspections) {
this.maxInspections = maxInspections;
return this;
}
/**
* Sets a maximum threshold in number of documents a suggest text token
* can exist in order to be corrected. Can be a relative percentage
* number (e.g. 0.4) or an absolute number to represent document
* frequencies. If a value higher than 1 is specified then it cannot be
* fractional. Defaults to 0.01.
*
* This can be used to exclude high frequency terms from being
* suggested. High frequency terms are usually spelled correctly; on top
* of this, it also improves the suggest performance.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator maxTermFreq(float maxTermFreq) {
this.maxTermFreq = maxTermFreq;
return this;
}
/**
* Sets the number of minimal prefix characters that must match in order
* to be a candidate suggestion. Defaults to 1. Increasing this number
* improves suggest performance. Usually misspellings don't occur in the
* beginning of terms.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator prefixLength(int prefixLength) {
this.prefixLength = prefixLength;
return this;
}
/**
* The minimum length a suggest text term must have in order to be
* corrected. Defaults to 4.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator minWordLength(int minWordLength) {
this.minWordLength = minWordLength;
return this;
}
/**
* Sets a minimal threshold in number of documents a suggested term
* should appear in. This can be specified as an absolute number or as a
* relative percentage of number of documents. This can improve quality
* by only suggesting high frequency terms. Defaults to 0f and is not
* enabled. If a value higher than 1 is specified then the number cannot
* be fractional.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator minDocFreq(float minDocFreq) {
this.minDocFreq = minDocFreq;
return this;
}
/**
* Sets a filter (analyzer) that is applied to each of the tokens passed to this candidate generator.
* This filter is applied to the original token before candidates are generated.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator preFilter(String preFilter) {
this.preFilter = preFilter;
return this;
}
/**
* Sets a filter (analyzer) that is applied to each of the generated tokens
* before they are passed to the actual phrase scorer.
*
* @return this generator, for chaining
*/
public DirectCandidateGenerator postFilter(String postFilter) {
this.postFilter = postFilter;
return this;
}
/**
* Writes this generator as a single anonymous object. Only options that
* are non-null are written, always in the order of the checks below.
*
* @return the builder that was passed in
*/
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
if (field != null) {
builder.field("field", field);
}
if (suggestMode != null) {
builder.field("suggest_mode", suggestMode);
}
if (accuracy != null) {
builder.field("accuracy", accuracy);
}
if (size != null) {
builder.field("size", size);
}
if (sort != null) {
builder.field("sort", sort);
}
if (stringDistance != null) {
builder.field("string_distance", stringDistance);
}
if (maxEdits != null) {
builder.field("max_edits", maxEdits);
}
if (maxInspections != null) {
builder.field("max_inspections", maxInspections);
}
if (maxTermFreq != null) {
builder.field("max_term_freq", maxTermFreq);
}
// Note the abbreviated key names for the two length options.
if (prefixLength != null) {
builder.field("prefix_len", prefixLength);
}
if (minWordLength != null) {
builder.field("min_word_len", minWordLength);
}
if (minDocFreq != null) {
builder.field("min_doc_freq", minDocFreq);
}
if (preFilter != null) {
builder.field("pre_filter", preFilter);
}
if (postFilter != null) {
builder.field("post_filter", postFilter);
}
builder.endObject();
return builder;
}
}
}