// Source listing: org.apache.solr.spelling.WordBreakSolrSpellChecker (Maven / Gradle / Ivy)
// Artifact: solr-core
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.CombineSuggestion;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.WordBreakSpellChecker;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A spellchecker that breaks and combines words.
 *
 * <p>This will not combine adjacent tokens that do not have the same required status (prohibited,
 * required, optional). However, this feature depends on incoming term flags being properly set
 * ({@link QueryConverter#PROHIBITED_TERM_FLAG}, {@link QueryConverter#REQUIRED_TERM_FLAG}, {@link
 * QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and {@link
 * QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG}). This feature breaks completely if the
 * upstream analyzer or query converter sets flags with the same values but different meanings. The
 * default query converter (if not using "spellcheck.q") is {@link SpellingQueryConverter}, which
 * properly sets these flags.
 */
public class WordBreakSolrSpellChecker extends SolrSpellChecker {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Try to combine multiple words into one? [true|false] */
  public static final String PARAM_COMBINE_WORDS = "combineWords";

  /** Try to break words into multiples? [true|false] */
  public static final String PARAM_BREAK_WORDS = "breakWords";

  /** See {@link WordBreakSpellChecker#setMaxChanges} */
  public static final String PARAM_MAX_CHANGES = "maxChanges";

  /** See {@link WordBreakSpellChecker#setMaxCombineWordLength} */
  public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength";

  /** See {@link WordBreakSpellChecker#setMinBreakWordLength} */
  public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";

  /**
   * Misspelled legacy name of {@link #PARAM_BREAK_SUGGESTION_TIE_BREAKER}. See {@link
   * BreakSuggestionTieBreaker} for options.
   *
   * @deprecated Only used for backwards compatibility. It will be removed in 10.x.
   */
  @Deprecated(since = "9.6")
  private static final String PARAM_BREAK_SUGESTION_TIE_BREAKER = "breakSugestionTieBreaker";

  /** See {@link BreakSuggestionTieBreaker} for options. */
  public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSuggestionTieBreaker";

  /** See {@link WordBreakSpellChecker#setMaxEvaluations} */
  public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";

  /** See {@link WordBreakSpellChecker#setMinSuggestionFrequency} */
  public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq";

  /** Specify a value on the "breakSuggestionTieBreaker" parameter. The default is MAX_FREQ. */
  public enum BreakSuggestionTieBreaker {
    /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY} */
    MAX_FREQ,
    /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY} */
    SUM_FREQ
  }

  private WordBreakSpellChecker wbsp = null;
  private boolean combineWords = false;
  private boolean breakWords = false;
  private BreakSuggestionSortMethod sortMethod =
      BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  private static final Pattern spacePattern = Pattern.compile("\\s+");

  /**
   * Reads this checker's configuration and creates the underlying {@link WordBreakSpellChecker}.
   *
   * <p>Numeric limits are only pushed to the underlying checker when they are configured with a
   * value greater than zero; otherwise the checker keeps whatever it was constructed with.
   *
   * @param config the spellchecker configuration
   * @param core the owning core, passed through to the superclass
   * @return the value returned by the superclass {@code init}
   * @throws IllegalArgumentException if the tie-breaker value is not one of {@link
   *     BreakSuggestionTieBreaker} or a numeric parameter is not a valid integer
   */
  // NOTE(review): the parameter type is kept exactly as it appears in this listing so that the
  // method stays a valid override; confirm against the superclass declaration.
  @Override
  public String init(NamedList config, SolrCore core) {
    String name = super.init(config, core);
    combineWords = boolParam(config, PARAM_COMBINE_WORDS);
    breakWords = boolParam(config, PARAM_BREAK_WORDS);
    wbsp = new WordBreakSpellChecker();

    // Prefer the correctly spelled parameter; fall back to the deprecated misspelling.
    String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
    if (bstb == null) {
      bstb = strParam(config, PARAM_BREAK_SUGESTION_TIE_BREAKER);
      if (bstb != null) {
        log.warn(
            "Parameter '{}' is deprecated and will be removed in Solr 10.x. Please use '{}' instead.",
            PARAM_BREAK_SUGESTION_TIE_BREAKER,
            PARAM_BREAK_SUGGESTION_TIE_BREAKER);
      }
    }
    sortMethod = resolveSortMethod(bstb);

    int mc = intParam(config, PARAM_MAX_CHANGES);
    if (mc > 0) {
      wbsp.setMaxChanges(mc);
    }
    int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH);
    if (mcl > 0) {
      wbsp.setMaxCombineWordLength(mcl);
    }
    int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH);
    if (mbwl > 0) {
      wbsp.setMinBreakWordLength(mbwl);
    }
    int me = intParam(config, PARAM_MAX_EVALUATIONS);
    if (me > 0) {
      wbsp.setMaxEvaluations(me);
    }
    int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY);
    if (msf > 0) {
      wbsp.setMinSuggestionFrequency(msf);
    }
    return name;
  }

  /**
   * Maps a configured tie-breaker name (case-insensitive) to the matching sort method.
   *
   * @param tieBreaker the configured value, or {@code null} when not configured
   * @return the sort method; {@code NUM_CHANGES_THEN_MAX_FREQUENCY} when {@code tieBreaker} is null
   * @throws IllegalArgumentException if the value names no {@link BreakSuggestionTieBreaker}
   */
  private static BreakSuggestionSortMethod resolveSortMethod(String tieBreaker) {
    if (tieBreaker == null) {
      return BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    }
    String normalized = tieBreaker.toUpperCase(Locale.ROOT);
    if (normalized.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
      return BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY;
    }
    if (normalized.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) {
      return BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    }
    throw new IllegalArgumentException(
        "Invalid value for parameter " + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + normalized);
  }

  /** Returns the named config value as a string, or {@code null} when it is absent. */
  private String strParam(NamedList<?> config, String paramName) {
    Object o = config.get(paramName);
    return o == null ? null : o.toString();
  }

  /** Returns true only when the named config value is "true" or "on" (case-insensitive). */
  private boolean boolParam(NamedList<?> config, String paramName) {
    String s = strParam(config, paramName);
    return "true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s);
  }

  /**
   * Returns the named config value parsed as an int, or 0 when it is absent.
   *
   * @throws IllegalArgumentException if the value is present but is not a valid integer
   */
  private int intParam(NamedList<?> config, String paramName) {
    Object o = config.get(paramName);
    if (o == null) {
      return 0;
    }
    try {
      return Integer.parseInt(o.toString());
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException("Invalid integer for parameter " + paramName + " : " + o);
    }
  }

  /**
   * Produces word-break suggestions for each incoming token and word-combination suggestions for
   * runs of adjacent tokens, then interleaves both lists into a single result.
   *
   * <p>A separator term is inserted between two adjacent tokens whenever their prohibited/required
   * flags differ, or the earlier token is flagged as preceding a new boolean operator, so that the
   * underlying checker does not combine across that boundary.
   *
   * @param options the tokens to check, the index reader, the suggestion count and suggest mode
   * @return the interleaved break and combine suggestions
   * @throws IOException if reading from the index fails
   */
  @Override
  public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;

    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[0]);
    // Parallel lists: index i in termArr corresponds to index i in tokenArrWithSeparators, with a
    // null token standing in wherever a separator term was inserted.
    List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
    List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);

    List<ResultEntry> breakSuggestionList = new ArrayList<>();
    List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOnePrecedesNewBooleanOp = false;

    for (int i = 0; i < tokenArr.length; i++) {
      Token current = tokenArr[i];
      boolean prohibited = hasFlag(current, QueryConverter.PROHIBITED_TERM_FLAG);
      boolean required = hasFlag(current, QueryConverter.REQUIRED_TERM_FLAG);
      boolean precedesNewBooleanOp =
          hasFlag(current, QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG);

      if (i > 0
          && (prohibited != lastOneProhibited
              || required != lastOneRequired
              || lastOnePrecedesNewBooleanOp)) {
        termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
        tokenArrWithSeparators.add(null);
      }
      lastOneProhibited = prohibited;
      lastOneRequired = required;
      lastOnePrecedesNewBooleanOp = precedesNewBooleanOp;

      Term thisTerm = new Term(field, current.toString());
      termArr.add(thisTerm);
      tokenArrWithSeparators.add(current);

      if (breakWords) {
        SuggestWord[][] breakSuggestions =
            wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
        if (breakSuggestions.length == 0) {
          noBreakSuggestionList.add(new ResultEntry(current, null, 0));
        }
        for (SuggestWord[] breakSuggestion : breakSuggestions) {
          sb.setLength(0);
          int freq = 0;
          for (SuggestWord word : breakSuggestion) {
            if (sb.length() > 0) {
              sb.append(" ");
            }
            sb.append(word.string);
            // Aggregate the per-word frequencies the same way the sort method ranks them.
            if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
              freq = Math.max(freq, word.freq);
            } else {
              freq += word.freq;
            }
          }
          breakSuggestionList.add(new ResultEntry(current, sb.toString(), freq));
        }
      }
    }
    // Tokens without any break suggestion go last, after all real break suggestions.
    breakSuggestionList.addAll(noBreakSuggestionList);

    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    if (combineWords) {
      // Only consult the index for combinations when combining is actually enabled.
      CombineSuggestion[] combineSuggestions =
          wbsp.suggestWordCombinations(
              termArr.toArray(new Term[0]), numSuggestions, ir, options.suggestMode);
      combineSuggestionList = new ArrayList<>(combineSuggestions.length);
      for (CombineSuggestion cs : combineSuggestions) {
        int firstTermIndex = cs.originalTermIndexes[0];
        int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
        sb.setLength(0);
        for (int i = firstTermIndex; i <= lastTermIndex; i++) {
          if (i > firstTermIndex) {
            sb.append(" ");
          }
          sb.append(tokenArrWithSeparators.get(i).toString());
        }
        // The combined token spans from the first original token's start to the last one's end.
        Token combined =
            new Token(
                sb.toString(),
                tokenArrWithSeparators.get(firstTermIndex).startOffset(),
                tokenArrWithSeparators.get(lastTermIndex).endOffset());
        combineSuggestionList.add(
            new ResultEntry(combined, cs.suggestion.string, cs.suggestion.freq));
      }
    }

    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
      boolean takeCombine;
      if (lastBreak == null) {
        takeCombine = true;
      } else if (lastCombine == null) {
        takeCombine = false;
      } else if (lastBreak.freq != lastCombine.freq) {
        // The pending entry with the higher frequency is emitted first.
        takeCombine = lastBreak.freq < lastCombine.freq;
      } else {
        // Equal frequencies: alternate based on how far each iterator has advanced.
        takeCombine = breakCount >= combineCount; // TODO: Should reverse >= to < ??
      }

      if (takeCombine) {
        addCombineToResult(result, ir, lastCombine);
        lastCombine = null;
      } else {
        addBreakToResult(result, ir, lastBreak);
        lastBreak = null;
      }

      if (lastBreak == null && breakIter.hasNext()) {
        lastBreak = breakIter.next();
        breakCount++;
      }
      if (lastCombine == null && combineIter.hasNext()) {
        lastCombine = combineIter.next();
        combineCount++;
      }
    }
    return result;
  }

  /** Returns true when every bit of {@code flag} is set in the token's flags. */
  private static boolean hasFlag(Token token, int flag) {
    return (token.getFlags() & flag) == flag;
  }

  /** Adds a break entry, using the original token's document frequency as its frequency. */
  private void addBreakToResult(SpellingResult result, IndexReader ir, ResultEntry entry)
      throws IOException {
    addToResult(
        result,
        entry.token,
        ir.docFreq(new Term(field, entry.token.toString())),
        entry.suggestion,
        entry.freq);
  }

  /** Adds a combine entry, using {@link #getCombineFrequency} as the original token frequency. */
  private void addCombineToResult(SpellingResult result, IndexReader ir, ResultEntry entry)
      throws IOException {
    addToResult(
        result,
        entry.token,
        getCombineFrequency(ir, entry.token),
        entry.suggestion,
        entry.freq);
  }

  /**
   * Records one suggestion (or an empty suggestion list when {@code suggestion} is null) for the
   * token, together with the token's own frequency.
   */
  private void addToResult(
      SpellingResult result,
      Token token,
      int tokenFrequency,
      String suggestion,
      int suggestionFrequency) {
    if (suggestion == null) {
      result.add(token, Collections.emptyList());
    } else {
      result.add(token, suggestion, suggestionFrequency);
    }
    result.addFrequency(token, tokenFrequency);
  }

  /**
   * Computes the frequency reported for a multi-word original token: the maximum of the words'
   * document frequencies under {@code NUM_CHANGES_THEN_MAX_FREQUENCY}, otherwise their sum.
   *
   * @throws IOException if reading from the index fails
   */
  private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
    String[] words = spacePattern.split(token.toString());
    boolean useMax = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    int result = 0;
    for (String word : words) {
      int docFreq = ir.docFreq(new Term(field, word));
      result = useMax ? Math.max(result, docFreq) : result + docFreq;
    }
    return result;
  }

  /** No-op: nothing is built by this method. */
  @Override
  public void build(SolrCore core, SolrIndexSearcher searcher) {
    /* no-op */
  }

  /** No-op: nothing is reloaded by this method. */
  @Override
  public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {
    /* no-op */
  }

  /** Always returns true. */
  @Override
  public boolean isSuggestionsMayOverlap() {
    return true;
  }
}