org.apache.solr.spelling.WordBreakSolrSpellChecker Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.spelling;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.CombineSuggestion;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.WordBreakSpellChecker;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A spellchecker that breaks and combines words.
 *
 * <p>This will not combine adjacent tokens that do not have the same required status (prohibited,
 * required, optional). However, this feature depends on incoming term flags being properly set.
 * ({@link QueryConverter#PROHIBITED_TERM_FLAG}, {@link QueryConverter#REQUIRED_TERM_FLAG}, {@link
 * QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and {@link
 * QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG} ) This feature breaks completely if the
 * upstream analyzer or query converter sets flags with the same values but different meanings. The
 * default query converter (if not using "spellcheck.q") is {@link SpellingQueryConverter}, which
 * properly sets these flags.
 */
public class WordBreakSolrSpellChecker extends SolrSpellChecker {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Try to combine multiple words into one? [true|false] */
  public static final String PARAM_COMBINE_WORDS = "combineWords";

  /** Try to break words into multiples? [true|false] */
  public static final String PARAM_BREAK_WORDS = "breakWords";

  /** See {@link WordBreakSpellChecker#setMaxChanges} */
  public static final String PARAM_MAX_CHANGES = "maxChanges";

  /** See {@link WordBreakSpellChecker#setMaxCombineWordLength} */
  public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength";

  /** See {@link WordBreakSpellChecker#setMinBreakWordLength} */
  public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";

  /**
   * Misspelled legacy name of {@link #PARAM_BREAK_SUGGESTION_TIE_BREAKER}. See {@link
   * BreakSuggestionTieBreaker} for options.
   *
   * @deprecated Only used for backwards compatibility. It will be removed in 10.x.
   */
  @Deprecated(since = "9.6")
  private static final String PARAM_BREAK_SUGESTION_TIE_BREAKER = "breakSugestionTieBreaker";

  /** See {@link BreakSuggestionTieBreaker} for options. */
  public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSuggestionTieBreaker";

  /** See {@link WordBreakSpellChecker#setMaxEvaluations} */
  public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";

  /** See {@link WordBreakSpellChecker#setMinSuggestionFrequency} */
  public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq";

  /** Specify a value on the "breakSuggestionTieBreaker" parameter. The default is MAX_FREQ. */
  public enum BreakSuggestionTieBreaker {
    /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY} */
    MAX_FREQ,
    /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY} */
    SUM_FREQ
  }

  private WordBreakSpellChecker wbsp = null;
  private boolean combineWords = false;
  private boolean breakWords = false;
  private BreakSuggestionSortMethod sortMethod =
      BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  private static final Pattern spacePattern = Pattern.compile("\\s+");

  /**
   * Reads this spellchecker's configuration and creates the underlying {@link
   * WordBreakSpellChecker}.
   *
   * <p>Numeric parameters are only applied when they are present and greater than zero; otherwise
   * the {@link WordBreakSpellChecker} keeps whatever value it was constructed with.
   *
   * @param config the configuration entries for this spellchecker
   * @param core the core this spellchecker belongs to
   * @return the name returned by the superclass initialization
   * @throws IllegalArgumentException if the tie-breaker value is not a {@link
   *     BreakSuggestionTieBreaker} name, or a numeric parameter is not a valid integer
   */
  @Override
  public String init(NamedList<?> config, SolrCore core) {
    String name = super.init(config, core);
    combineWords = boolParam(config, PARAM_COMBINE_WORDS);
    breakWords = boolParam(config, PARAM_BREAK_WORDS);
    wbsp = new WordBreakSpellChecker();

    // The correctly spelled parameter wins; the misspelled legacy one is only a fallback.
    String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
    if (bstb == null) {
      bstb = strParam(config, PARAM_BREAK_SUGESTION_TIE_BREAKER);
      if (bstb != null) {
        log.warn(
            "Parameter '{}' is deprecated and will be removed in Solr 10.x. Please use '{}' instead.",
            PARAM_BREAK_SUGESTION_TIE_BREAKER,
            PARAM_BREAK_SUGGESTION_TIE_BREAKER);
      }
    }

    // Reset to the default first so a missing parameter yields MAX_FREQ behavior.
    sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    if (bstb != null) {
      bstb = bstb.toUpperCase(Locale.ROOT);
      if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
        sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY;
      } else if (!bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) {
        throw new IllegalArgumentException(
            "Invalid value for parameter " + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb);
      }
    }

    int mc = intParam(config, PARAM_MAX_CHANGES);
    if (mc > 0) {
      wbsp.setMaxChanges(mc);
    }
    int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH);
    if (mcl > 0) {
      wbsp.setMaxCombineWordLength(mcl);
    }
    int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH);
    if (mbwl > 0) {
      wbsp.setMinBreakWordLength(mbwl);
    }
    int me = intParam(config, PARAM_MAX_EVALUATIONS);
    if (me > 0) {
      wbsp.setMaxEvaluations(me);
    }
    int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY);
    if (msf > 0) {
      wbsp.setMinSuggestionFrequency(msf);
    }
    return name;
  }

  /** Returns the named config value as a string, or {@code null} when it is absent. */
  private String strParam(NamedList<?> config, String paramName) {
    Object o = config.get(paramName);
    return o == null ? null : o.toString();
  }

  /**
   * Returns {@code true} only when the named config value is "true" or "on" (case-insensitive);
   * a missing or any other value yields {@code false}.
   */
  private boolean boolParam(NamedList<?> config, String paramName) {
    String s = strParam(config, paramName);
    return "true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s);
  }

  /**
   * Returns the named config value parsed as an int, or 0 when it is absent.
   *
   * @throws IllegalArgumentException if the value is present but is not a valid integer
   */
  private int intParam(NamedList<?> config, String paramName) {
    Object o = config.get(paramName);
    if (o == null) {
      return 0;
    }
    try {
      return Integer.parseInt(o.toString());
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException("Invalid integer for parameter " + paramName + " : " + o);
    }
  }

  /** Returns true when every bit of {@code flag} is set in the token's flags. */
  private static boolean hasFlag(Token token, int flag) {
    return (token.getFlags() & flag) == flag;
  }

  /**
   * Produces word-break suggestions (when "breakWords" is enabled) and word-combination suggestions
   * (when "combineWords" is enabled) for the supplied tokens and interleaves both lists into a
   * single result.
   *
   * @param options the tokens to check plus the reader, suggestion count and suggest mode to use
   * @return the interleaved break and combine suggestions; tokens for which no break was found are
   *     reported with an empty suggestion list
   * @throws IOException if reading from the index fails
   */
  @Override
  public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;

    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[0]);
    // tokenArrWithSeparators and termArr are kept index-aligned: wherever termArr holds a
    // SEPARATOR_TERM, tokenArrWithSeparators holds a null placeholder. This lets the term indexes
    // reported in a CombineSuggestion be mapped straight back to the original tokens.
    List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
    List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
    List<ResultEntry> breakSuggestionList = new ArrayList<>();
    List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOnePrecedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
      Token currentToken = tokenArr[i];
      boolean prohibited = hasFlag(currentToken, QueryConverter.PROHIBITED_TERM_FLAG);
      boolean required = hasFlag(currentToken, QueryConverter.REQUIRED_TERM_FLAG);
      boolean precedesNewBooleanOp =
          hasFlag(currentToken, QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG);
      // Insert a separator whenever the required/prohibited status changes, or the previous token
      // was flagged as preceding a new boolean operator, so such neighbors are never combined.
      if (i > 0
          && (prohibited != lastOneProhibited
              || required != lastOneRequired
              || lastOnePrecedesNewBooleanOp)) {
        termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
        tokenArrWithSeparators.add(null);
      }
      lastOneProhibited = prohibited;
      lastOneRequired = required;
      lastOnePrecedesNewBooleanOp = precedesNewBooleanOp;

      Term thisTerm = new Term(field, currentToken.toString());
      termArr.add(thisTerm);
      tokenArrWithSeparators.add(currentToken);
      if (breakWords) {
        SuggestWord[][] breakSuggestions =
            wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
        if (breakSuggestions.length == 0) {
          noBreakSuggestionList.add(new ResultEntry(currentToken, null, 0));
        }
        for (SuggestWord[] breakSuggestion : breakSuggestions) {
          sb.setLength(0);
          int freq = 0;
          for (SuggestWord word : breakSuggestion) {
            if (sb.length() > 0) {
              sb.append(" ");
            }
            sb.append(word.string);
            // Aggregate the per-word frequencies the same way the sort method ranks them.
            if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
              freq = Math.max(freq, word.freq);
            } else {
              freq += word.freq;
            }
          }
          breakSuggestionList.add(new ResultEntry(currentToken, sb.toString(), freq));
        }
      }
    }
    // Tokens without any break suggestion go last so real suggestions are interleaved first.
    breakSuggestionList.addAll(noBreakSuggestionList);

    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    if (combineWords) {
      // Only query the index for combinations when combining is enabled.
      CombineSuggestion[] combineSuggestions =
          wbsp.suggestWordCombinations(
              termArr.toArray(new Term[0]), numSuggestions, ir, options.suggestMode);
      combineSuggestionList = new ArrayList<>(combineSuggestions.length);
      for (CombineSuggestion cs : combineSuggestions) {
        int firstTermIndex = cs.originalTermIndexes[0];
        int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
        sb.setLength(0);
        // NOTE(review): assumes a combination never spans a SEPARATOR_TERM; otherwise a null
        // placeholder would be dereferenced below.
        for (int i = firstTermIndex; i <= lastTermIndex; i++) {
          if (i > firstTermIndex) {
            sb.append(" ");
          }
          sb.append(tokenArrWithSeparators.get(i).toString());
        }
        // The synthetic token covers the original text from the first to the last combined token.
        Token token =
            new Token(
                sb.toString(),
                tokenArrWithSeparators.get(firstTermIndex).startOffset(),
                tokenArrWithSeparators.get(lastTermIndex).endOffset());
        combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
      }
    }

    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
      boolean takeCombine;
      if (lastBreak == null) {
        takeCombine = true;
      } else if (lastCombine == null) {
        takeCombine = false;
      } else if (lastBreak.freq != lastCombine.freq) {
        // The entry with the higher suggestion frequency goes first.
        takeCombine = lastBreak.freq < lastCombine.freq;
      } else {
        // Equal frequency: decide by how many entries each list has advanced past so far.
        takeCombine = breakCount >= combineCount; // TODO: Should reverse >= to < ??
      }

      if (takeCombine) {
        addToResult(
            result,
            lastCombine.token,
            getCombineFrequency(ir, lastCombine.token),
            lastCombine.suggestion,
            lastCombine.freq);
        lastCombine = null;
      } else {
        addToResult(
            result,
            lastBreak.token,
            ir.docFreq(new Term(field, lastBreak.token.toString())),
            lastBreak.suggestion,
            lastBreak.freq);
        lastBreak = null;
      }

      if (lastBreak == null && breakIter.hasNext()) {
        lastBreak = breakIter.next();
        breakCount++;
      }
      if (lastCombine == null && combineIter.hasNext()) {
        lastCombine = combineIter.next();
        combineCount++;
      }
    }
    return result;
  }

  /**
   * Records one entry in the result: the suggestion with its frequency, or an empty suggestion list
   * when {@code suggestion} is null. The token's own frequency is recorded in both cases.
   */
  private void addToResult(
      SpellingResult result,
      Token token,
      int tokenFrequency,
      String suggestion,
      int suggestionFrequency) {
    if (suggestion == null) {
      result.add(token, Collections.emptyList());
    } else {
      result.add(token, suggestion, suggestionFrequency);
    }
    result.addFrequency(token, tokenFrequency);
  }

  /**
   * Computes the frequency of a multi-word token by splitting its text on whitespace and
   * aggregating the document frequency of each word: the maximum when the sort method is
   * NUM_CHANGES_THEN_MAX_FREQUENCY, the sum otherwise.
   *
   * @throws IOException if reading document frequencies from the index fails
   */
  private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
    boolean useMax = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    int result = 0;
    for (String word : spacePattern.split(token.toString())) {
      int wordFreq = ir.docFreq(new Term(field, word));
      result = useMax ? Math.max(result, wordFreq) : result + wordFreq;
    }
    return result;
  }

  @Override
  public void build(SolrCore core, SolrIndexSearcher searcher) {
    /* no-op */
  }

  @Override
  public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {
    /* no-op */
  }

  /** Always returns true. */
  @Override
  public boolean isSuggestionsMayOverlap() {
    return true;
  }
}