All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.spelling.WordBreakSolrSpellChecker Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.spelling;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.CombineSuggestion;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.WordBreakSpellChecker;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A spellchecker that breaks and combines words.
 *
 * 

This will not combine adjacent tokens that do not have the same required status (prohibited, * required, optional). However, this feature depends on incoming term flags being properly set. * ({@link QueryConverter#PROHIBITED_TERM_FLAG}, {@link QueryConverter#REQUIRED_TERM_FLAG}, {@link * QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and {@link * QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG} ) This feature breaks completely if the * upstream analyzer or query converter sets flags with the same values but different meanings. The * default query converter (if not using "spellcheck.q") is {@link SpellingQueryConverter}, which * properly sets these flags. */ public class WordBreakSolrSpellChecker extends SolrSpellChecker { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); /** Try to combine multiple words into one? [true|false] */ public static final String PARAM_COMBINE_WORDS = "combineWords"; /** Try to break words into multiples? [true|false] */ public static final String PARAM_BREAK_WORDS = "breakWords"; /** See {@link WordBreakSpellChecker#setMaxChanges} */ public static final String PARAM_MAX_CHANGES = "maxChanges"; /** See {@link WordBreakSpellChecker#setMaxCombineWordLength} */ public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength"; /** See {@link WordBreakSpellChecker#setMinBreakWordLength} */ public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength"; /** * See {@link BreakSuggestionTieBreaker} for options. * * @deprecated Only used for backwards compatibility. It will be removed in 10.x. */ @Deprecated(since = "9.6") private static final String PARAM_BREAK_SUGESTION_TIE_BREAKER = "breakSugestionTieBreaker"; /** See {@link BreakSuggestionTieBreaker} for options. */ public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSuggestionTieBreaker"; /** See {@link WordBreakSpellChecker#setMaxEvaluations} */ public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations"; /** See {@link WordBreakSpellChecker#setMinSuggestionFrequency} */ public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq"; /** Specify a value on the "breakSuggestionTieBreaker" parameter. The default is MAX_FREQ. */ public enum BreakSuggestionTieBreaker { /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY} # */ MAX_FREQ, /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY} */ SUM_FREQ }; private WordBreakSpellChecker wbsp = null; private boolean combineWords = false; private boolean breakWords = false; private BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; private static final Pattern spacePattern = Pattern.compile("\\s+"); @Override public String init(NamedList config, SolrCore core) { String name = super.init(config, core); combineWords = boolParam(config, PARAM_COMBINE_WORDS); breakWords = boolParam(config, PARAM_BREAK_WORDS); wbsp = new WordBreakSpellChecker(); String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER); if (bstb == null) { bstb = strParam(config, PARAM_BREAK_SUGESTION_TIE_BREAKER); if (bstb != null && log.isWarnEnabled()) { log.warn( "Parameter '" + PARAM_BREAK_SUGESTION_TIE_BREAKER + "' is deprecated and will be removed in Solr 10.x. Please use '" + PARAM_BREAK_SUGGESTION_TIE_BREAKER + "' instead."); // nowarn } } if (bstb != null) { bstb = bstb.toUpperCase(Locale.ROOT); if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) { sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY; } else if (bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) { sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; } else { throw new IllegalArgumentException( "Invalid value for parameter " + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb); } } else { sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; } int mc = intParam(config, PARAM_MAX_CHANGES); if (mc > 0) { wbsp.setMaxChanges(mc); } int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH); if (mcl > 0) { wbsp.setMaxCombineWordLength(mcl); } int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH); if (mbwl > 0) { wbsp.setMinBreakWordLength(mbwl); } int me = intParam(config, PARAM_MAX_EVALUATIONS); if (me > 0) { wbsp.setMaxEvaluations(me); } int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY); if (msf > 0) { wbsp.setMinSuggestionFrequency(msf); } return name; } private String strParam(NamedList config, String paramName) { Object o = config.get(paramName); return o == null ? null : o.toString(); } private boolean boolParam(NamedList config, String paramName) { String s = strParam(config, paramName); if ("true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s)) { return true; } return false; } private int intParam(NamedList config, String paramName) { Object o = config.get(paramName); if (o == null) { return 0; } try { return Integer.parseInt(o.toString()); } catch (NumberFormatException nfe) { throw new IllegalArgumentException("Invalid integer for parameter " + paramName + " : " + o); } } @Override public SpellingResult getSuggestions(SpellingOptions options) throws IOException { IndexReader ir = options.reader; int numSuggestions = options.count; StringBuilder sb = new StringBuilder(); Token[] tokenArr = options.tokens.toArray(new Token[0]); List tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2); List termArr = new ArrayList<>(options.tokens.size() + 2); List breakSuggestionList = new ArrayList<>(); List noBreakSuggestionList = new ArrayList<>(); boolean lastOneProhibited = false; boolean lastOneRequired = false; boolean lastOneprocedesNewBooleanOp = false; for (int i = 0; i < tokenArr.length; i++) { boolean prohibited = (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) == QueryConverter.PROHIBITED_TERM_FLAG; boolean required = (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) == QueryConverter.REQUIRED_TERM_FLAG; boolean procedesNewBooleanOp = (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) { termArr.add(WordBreakSpellChecker.SEPARATOR_TERM); tokenArrWithSeparators.add(null); } lastOneProhibited = prohibited; lastOneRequired = required; lastOneprocedesNewBooleanOp = procedesNewBooleanOp; Term thisTerm = new Term(field, tokenArr[i].toString()); termArr.add(thisTerm); tokenArrWithSeparators.add(tokenArr[i]); if (breakWords) { SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod); if (breakSuggestions.length == 0) { noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0)); } for (SuggestWord[] breakSuggestion : breakSuggestions) { sb.delete(0, sb.length()); boolean firstOne = true; int freq = 0; for (SuggestWord word : breakSuggestion) { if (!firstOne) { sb.append(" "); } firstOne = false; sb.append(word.string); if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) { freq = Math.max(freq, word.freq); } else { freq += word.freq; } } breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq)); } } } breakSuggestionList.addAll(noBreakSuggestionList); List combineSuggestionList = Collections.emptyList(); CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations( termArr.toArray(new Term[0]), numSuggestions, ir, options.suggestMode); if (combineWords) { combineSuggestionList = new ArrayList<>(combineSuggestions.length); for (CombineSuggestion cs : combineSuggestions) { int firstTermIndex = cs.originalTermIndexes[0]; int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1]; sb.delete(0, sb.length()); for (int i = firstTermIndex; i <= lastTermIndex; i++) { if (i > firstTermIndex) { sb.append(" "); } sb.append(tokenArrWithSeparators.get(i).toString()); } Token token = new Token( sb.toString(), tokenArrWithSeparators.get(firstTermIndex).startOffset(), tokenArrWithSeparators.get(lastTermIndex).endOffset()); combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq)); } } // Interleave the two lists of suggestions into one SpellingResult SpellingResult result = new SpellingResult(); Iterator breakIter = breakSuggestionList.iterator(); Iterator combineIter = combineSuggestionList.iterator(); ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null; ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null; int breakCount = 0; int combineCount = 0; while (lastBreak != null || lastCombine != null) { if (lastBreak == null) { addToResult( result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); lastCombine = null; } else if (lastCombine == null) { addToResult( result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); lastBreak = null; } else if (lastBreak.freq < lastCombine.freq) { addToResult( result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); lastCombine = null; } else if (lastCombine.freq < lastBreak.freq) { addToResult( result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); lastBreak = null; } else if (breakCount >= combineCount) { // TODO: Should reverse >= to < ??S addToResult( result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); lastCombine = null; } else { addToResult( result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); lastBreak = null; } if (lastBreak == null && breakIter.hasNext()) { lastBreak = breakIter.next(); breakCount++; } if (lastCombine == null && combineIter.hasNext()) { lastCombine = combineIter.next(); combineCount++; } } return result; } private void addToResult( SpellingResult result, Token token, int tokenFrequency, String suggestion, int suggestionFrequency) { if (suggestion == null) { result.add(token, Collections.emptyList()); result.addFrequency(token, tokenFrequency); } else { result.add(token, suggestion, suggestionFrequency); result.addFrequency(token, tokenFrequency); } } private int getCombineFrequency(IndexReader ir, Token token) throws IOException { String[] words = spacePattern.split(token.toString()); int result = 0; if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) { for (String word : words) { result = Math.max(result, ir.docFreq(new Term(field, word))); } } else { for (String word : words) { result += ir.docFreq(new Term(field, word)); } } return result; } @Override public void build(SolrCore core, SolrIndexSearcher searcher) { /* no-op */ } @Override public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException { /* no-op */ } @Override public boolean isSuggestionsMayOverlap() { return true; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy