querqy.elasticsearch.rewriter.WordBreakCompoundRewriterFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of querqy-elasticsearch Show documentation
Show all versions of querqy-elasticsearch Show documentation
Querqy library for query rewriting: Querqy for Elasticsearch
package querqy.elasticsearch.rewriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.WordBreakSpellChecker;
import org.elasticsearch.index.shard.IndexShard;
import querqy.elasticsearch.ConfigUtils;
import querqy.elasticsearch.DismaxSearchEngineRequestAdapter;
import querqy.elasticsearch.ESRewriterFactory;
import querqy.lucene.contrib.rewrite.wordbreak.MorphologicalWordBreaker;
import querqy.lucene.contrib.rewrite.wordbreak.Morphology;
import querqy.lucene.contrib.rewrite.wordbreak.SpellCheckerCompounder;
import querqy.lucene.contrib.rewrite.wordbreak.WordBreakCompoundRewriter;
import querqy.model.ExpandedQuery;
import querqy.model.Term;
import querqy.rewrite.QueryRewriter;
import querqy.rewrite.RewriterFactory;
import querqy.rewrite.SearchEngineRequestAdapter;
import querqy.trie.TrieMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
/**
 * Elasticsearch rewriter factory that configures and creates {@link WordBreakCompoundRewriter} instances.
 *
 * <p>{@link #configure(Map)} reads the rewriter settings and builds the shared {@link WordBreakSpellChecker},
 * {@link SpellCheckerCompounder} and {@link MorphologicalWordBreaker}. {@link #createRewriterFactory(IndexShard)}
 * then returns a {@link RewriterFactory} that creates one rewriter per request, using the index reader obtained
 * from that request's search execution context.
 *
 * <p>NOTE(review): the raw {@code Map}, {@code List}, {@code Set} and {@code TrieMap} types in the signatures below
 * look like they have lost their type arguments. They are kept as they are so that the overridden methods keep
 * matching their supertypes; confirm against {@code ESRewriterFactory}, {@code RewriterFactory} and
 * {@code ConfigUtils} before parameterizing them.
 */
public class WordBreakCompoundRewriterFactory extends ESRewriterFactory {

    // MAX_CHANGES controls the behaviour of the Lucene WordBreakSpellChecker.
    //
    // For compounds: maximum distance between the leftmost and the rightmost term index,
    //   e.g. max_changes = 1 for A B C D will check AB BC CD,
    //        max_changes = 2 for A B C D will check AB ABC BC BCD CD
    // For decompounds: maximum number of splits performed,
    //   e.g. max_changes = 1 for ABCD will check A BCD, AB CD, ABC D,
    //        max_changes = 2 for ABCD will check A BCD, A B CD, A BC D, AB CD, AB C D, ABC D
    //
    // As we currently only send 2-grams to the WordBreakSpellChecker for compounding, only max_changes = 1 is
    // correctly supported.
    static final int MAX_CHANGES = 1;

    // Evaluation limit applied to both the spell checker and the morphological word breaker.
    static final int MAX_EVALUATIONS = 100;

    static final int DEFAULT_MIN_SUGGESTION_FREQ = 1;
    static final int DEFAULT_MAX_COMBINE_LENGTH = 30;
    static final int DEFAULT_MIN_BREAK_LENGTH = 3;
    static final int DEFAULT_MAX_DECOMPOUND_EXPANSIONS = 3;
    static final boolean DEFAULT_LOWER_CASE_INPUT = false;
    static final boolean DEFAULT_ALWAYS_ADD_REVERSE_COMPOUNDS = false;
    static final boolean DEFAULT_VERIFY_DECOMPOUND_COLLATION = false;

    // Shared by configure() and validateConfiguration() so that both report the problem identically.
    private static final String MISSING_DICTIONARY_FIELD = "Missing config: dictionaryField";

    private String dictionaryField;
    private boolean lowerCaseInput = DEFAULT_LOWER_CASE_INPUT;
    private boolean alwaysAddReverseCompounds = DEFAULT_ALWAYS_ADD_REVERSE_COMPOUNDS;
    private WordBreakSpellChecker spellChecker;
    private SpellCheckerCompounder compounder;
    private MorphologicalWordBreaker wordBreaker;
    private TrieMap reverseCompoundTriggerWords;
    private TrieMap protectedWords;
    private int maxDecompoundExpansions = DEFAULT_MAX_DECOMPOUND_EXPANSIONS;
    private boolean verifyDecompoundCollation = DEFAULT_VERIFY_DECOMPOUND_COLLATION;

    /**
     * Creates an unconfigured factory; {@link #configure(Map)} populates its settings.
     *
     * @param rewriterId the rewriter id passed on to {@link ESRewriterFactory}
     */
    public WordBreakCompoundRewriterFactory(final String rewriterId) {
        super(rewriterId);
    }

    /**
     * Reads the rewriter settings from {@code config} and builds the spell checker, compounder and word breaker.
     *
     * <p>Keys read from {@code config}: {@code minSuggestionFreq}, {@code maxCombineLength},
     * {@code minBreakLength}, {@code dictionaryField} (required), {@code lowerCaseInput},
     * {@code alwaysAddReverseCompounds}, {@code morphology}, {@code reverseCompoundTriggerWords},
     * {@code protectedWords} and the nested map {@code decompound} with the keys {@code maxExpansions} and
     * {@code verifyCollation}. Keys that are absent fall back to the {@code DEFAULT_*} constants of this class
     * (or to {@link Morphology#DEFAULT} for {@code morphology}).
     *
     * @param config the rewriter configuration
     * @throws IllegalArgumentException if {@code dictionaryField} is missing or empty after trimming
     */
    @Override
    public void configure(final Map config) {
        final int minSuggestionFreq = ConfigUtils.getArg(config, "minSuggestionFreq", DEFAULT_MIN_SUGGESTION_FREQ);
        final int maxCombineLength = ConfigUtils.getArg(config, "maxCombineLength", DEFAULT_MAX_COMBINE_LENGTH);
        final int minBreakLength = ConfigUtils.getArg(config, "minBreakLength", DEFAULT_MIN_BREAK_LENGTH);

        dictionaryField = readDictionaryField(config)
                .orElseThrow(() -> new IllegalArgumentException(MISSING_DICTIONARY_FIELD));

        lowerCaseInput = ConfigUtils.getArg(config, "lowerCaseInput", DEFAULT_LOWER_CASE_INPUT);
        alwaysAddReverseCompounds = ConfigUtils.getArg(config, "alwaysAddReverseCompounds",
                DEFAULT_ALWAYS_ADD_REVERSE_COMPOUNDS);

        spellChecker = new WordBreakSpellChecker();
        spellChecker.setMaxChanges(MAX_CHANGES);
        spellChecker.setMinSuggestionFrequency(minSuggestionFreq);
        spellChecker.setMaxCombineWordLength(maxCombineLength);
        spellChecker.setMinBreakWordLength(minBreakLength);
        // Use the shared constant (not a literal) so this limit cannot drift from the word breaker's limit below.
        spellChecker.setMaxEvaluations(MAX_EVALUATIONS);

        compounder = new SpellCheckerCompounder(spellChecker, dictionaryField, lowerCaseInput);

        final Morphology morphology = ConfigUtils.getEnumArg(config, "morphology", Morphology.class)
                .orElse(Morphology.DEFAULT);
        wordBreaker = new MorphologicalWordBreaker(morphology, dictionaryField, lowerCaseInput, minSuggestionFreq,
                minBreakLength, MAX_EVALUATIONS);

        reverseCompoundTriggerWords = ConfigUtils.getTrieSetArg(config, "reverseCompoundTriggerWords");
        protectedWords = ConfigUtils.getTrieSetArg(config, "protectedWords");

        // The "decompound" section is optional; an empty map makes both lookups below return their defaults.
        Map decompoundConf = (Map) config.get("decompound");
        if (decompoundConf == null) {
            decompoundConf = Collections.emptyMap();
        }
        maxDecompoundExpansions = ConfigUtils.getArg(decompoundConf, "maxExpansions",
                DEFAULT_MAX_DECOMPOUND_EXPANSIONS);
        verifyDecompoundCollation = ConfigUtils.getArg(decompoundConf, "verifyCollation",
                DEFAULT_VERIFY_DECOMPOUND_COLLATION);
    }

    /**
     * Checks {@code config} without changing the state of this factory.
     *
     * <p>Two problems are reported: a {@code dictionaryField} that is missing or empty after trimming, and a
     * {@code morphology} value that does not equal the name of any {@link Morphology} constant.
     *
     * @param config the rewriter configuration to check
     * @return the error messages found; empty if none of the checks failed
     */
    @Override
    public List validateConfiguration(final Map config) {
        final List<String> errors = new ArrayList<>();

        if (!readDictionaryField(config).isPresent()) {
            errors.add(MISSING_DICTIONARY_FIELD);
        }

        ConfigUtils.getStringArg(config, "morphology").ifPresent(morphologyName -> {
            if (Arrays.stream(Morphology.values()).map(Enum::name).noneMatch(name -> name.equals(morphologyName))) {
                errors.add("Unknown morphology: " + morphologyName);
            }
        });

        return errors;
    }

    /**
     * Returns a {@link RewriterFactory} that creates a new {@link WordBreakCompoundRewriter} for each request.
     *
     * <p>The returned factory reads the settings of this instance when a rewriter is created and reports
     * {@link QueryRewriter#EMPTY_GENERABLE_TERMS} as its generable terms. The {@code indexShard} argument is not
     * used; the index reader is taken from the request adapter instead.
     *
     * @param indexShard the shard the factory is created for (unused)
     * @return the rewriter factory
     */
    @Override
    public RewriterFactory createRewriterFactory(final IndexShard indexShard) {
        return new RewriterFactory(getRewriterId()) {

            @Override
            public QueryRewriter createRewriter(final ExpandedQuery input,
                                                final SearchEngineRequestAdapter searchEngineRequestAdapter) {
                // The cast fails with a ClassCastException for any adapter other than
                // DismaxSearchEngineRequestAdapter.
                final IndexReader indexReader =
                        getShardIndexReader((DismaxSearchEngineRequestAdapter) searchEngineRequestAdapter);
                return new WordBreakCompoundRewriter(wordBreaker, compounder, indexReader, lowerCaseInput,
                        alwaysAddReverseCompounds, reverseCompoundTriggerWords, maxDecompoundExpansions,
                        verifyDecompoundCollation, protectedWords);
            }

            @Override
            public Set getGenerableTerms() {
                return QueryRewriter.EMPTY_GENERABLE_TERMS;
            }
        };
    }

    /** @return the spell checker built by {@link #configure(Map)}, or {@code null} before configuration */
    public WordBreakSpellChecker getSpellChecker() {
        return spellChecker;
    }

    /** @return the trimmed {@code dictionaryField} setting, or {@code null} before configuration */
    public String getDictionaryField() {
        return dictionaryField;
    }

    /** @return the {@code lowerCaseInput} setting */
    public boolean isLowerCaseInput() {
        return lowerCaseInput;
    }

    /** @return the {@code alwaysAddReverseCompounds} setting */
    public boolean isAlwaysAddReverseCompounds() {
        return alwaysAddReverseCompounds;
    }

    /** @return the value read for {@code reverseCompoundTriggerWords} */
    public TrieMap getReverseCompoundTriggerWords() {
        return reverseCompoundTriggerWords;
    }

    /** @return the value read for {@code protectedWords} */
    public TrieMap getProtectedWords() {
        return protectedWords;
    }

    /** @return the {@code decompound.maxExpansions} setting */
    public int getMaxDecompoundExpansions() {
        return maxDecompoundExpansions;
    }

    /** @return the {@code decompound.verifyCollation} setting */
    public boolean isVerifyDecompoundCollation() {
        return verifyDecompoundCollation;
    }

    /**
     * Looks up the index reader for the current request.
     *
     * @param searchEngineRequestAdapter the adapter of the request being rewritten
     * @return the reader of the top reader context of the request's searcher
     */
    private IndexReader getShardIndexReader(final DismaxSearchEngineRequestAdapter searchEngineRequestAdapter) {
        return searchEngineRequestAdapter.getSearchExecutionContext().searcher().getTopReaderContext().reader();
    }

    /** @return the compounder built by {@link #configure(Map)}, or {@code null} before configuration */
    public SpellCheckerCompounder getCompounder() {
        return compounder;
    }

    /** @return the word breaker built by {@link #configure(Map)}, or {@code null} before configuration */
    public MorphologicalWordBreaker getWordBreaker() {
        return wordBreaker;
    }

    /**
     * Reads {@code dictionaryField} from {@code config}, trimmed; an empty value counts as absent.
     */
    private static Optional<String> readDictionaryField(final Map config) {
        return ConfigUtils.getStringArg(config, "dictionaryField")
                .map(String::trim)
                .filter(s -> !s.isEmpty());
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy