All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.solr.spelling.DirectSolrSpellChecker Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.spelling;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.SpellCheckMergeData;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Spellchecker implementation that uses {@link DirectSpellChecker}
 *
 * 

Requires no auxiliary index or data structure. * *

Supported options: * *

    *
  • field: Used as the source of terms. *
  • distanceMeasure: Sets {@link DirectSpellChecker#setDistance(StringDistance)}. Note: to set * the default {@link DirectSpellChecker#INTERNAL_LEVENSHTEIN}, use "internal". *
  • accuracy: Sets {@link DirectSpellChecker#setAccuracy(float)}. *
  • maxEdits: Sets {@link DirectSpellChecker#setMaxEdits(int)}. *
  • minPrefix: Sets {@link DirectSpellChecker#setMinPrefix(int)}. *
  • maxInspections: Sets {@link DirectSpellChecker#setMaxInspections(int)}. *
  • comparatorClass: Sets {@link DirectSpellChecker#setComparator(Comparator)}. Note: * score-then-frequency can be specified as "score" and frequency-then-score can be specified * as "freq". *
  • thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}. *
  • minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}. *
  • maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}. *
  • maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}. *
* * @see DirectSpellChecker */ public class DirectSolrSpellChecker extends SolrSpellChecker { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); // configuration params shared with other spellcheckers public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS; public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP; public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP; public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE; public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY; public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY; public static final String INTERNAL_DISTANCE = "internal"; public static final float DEFAULT_ACCURACY = 0.5f; public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f; public static final String MAXEDITS = "maxEdits"; public static final int DEFAULT_MAXEDITS = 2; // params specific to this implementation public static final String MINPREFIX = "minPrefix"; public static final int DEFAULT_MINPREFIX = 1; public static final String MAXINSPECTIONS = "maxInspections"; public static final int DEFAULT_MAXINSPECTIONS = 5; public static final String MINQUERYLENGTH = "minQueryLength"; public static final int DEFAULT_MINQUERYLENGTH = 4; public static final String MAXQUERYLENGTH = "maxQueryLength"; public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE; public static final String MAXQUERYFREQUENCY = "maxQueryFrequency"; public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f; private DirectSpellChecker checker = new DirectSpellChecker(); @Override public String init(NamedList config, SolrCore core) { SolrParams params = config.toSolrParams(); log.info("init: {}", config); String name = super.init(config, core); Comparator comp = SuggestWordQueue.DEFAULT_COMPARATOR; String compClass = (String) 
config.get(COMPARATOR_CLASS); if (compClass != null) { if (compClass.equalsIgnoreCase(SCORE_COMP)) comp = SuggestWordQueue.DEFAULT_COMPARATOR; else if (compClass.equalsIgnoreCase(FREQ_COMP)) comp = new SuggestWordFrequencyComparator(); else { // must be a FQCN @SuppressWarnings({"unchecked"}) Comparator temp = (Comparator) core.getResourceLoader().newInstance(compClass, Comparator.class); comp = temp; } } StringDistance sd = DirectSpellChecker.INTERNAL_LEVENSHTEIN; String distClass = (String) config.get(STRING_DISTANCE); if (distClass != null && !distClass.equalsIgnoreCase(INTERNAL_DISTANCE)) sd = core.getResourceLoader().newInstance(distClass, StringDistance.class); float minAccuracy = DEFAULT_ACCURACY; Float accuracy = params.getFloat(ACCURACY); if (accuracy != null) minAccuracy = accuracy; int maxEdits = DEFAULT_MAXEDITS; Integer edits = params.getInt(MAXEDITS); if (edits != null) maxEdits = edits; int minPrefix = DEFAULT_MINPREFIX; Integer prefix = params.getInt(MINPREFIX); if (prefix != null) minPrefix = prefix; int maxInspections = DEFAULT_MAXINSPECTIONS; Integer inspections = params.getInt(MAXINSPECTIONS); if (inspections != null) maxInspections = inspections; float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY; Float threshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY); if (threshold != null) minThreshold = threshold; int minQueryLength = DEFAULT_MINQUERYLENGTH; Integer queryLength = params.getInt(MINQUERYLENGTH); if (queryLength != null) minQueryLength = queryLength; int maxQueryLength = DEFAULT_MAXQUERYLENGTH; Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH); if (overriddenMaxQueryLength != null) maxQueryLength = overriddenMaxQueryLength; float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY; Float queryFreq = params.getFloat(MAXQUERYFREQUENCY); if (queryFreq != null) maxQueryFrequency = queryFreq; checker.setComparator(comp); checker.setDistance(sd); checker.setMaxEdits(maxEdits); checker.setMinPrefix(minPrefix); 
checker.setAccuracy(minAccuracy); checker.setThresholdFrequency(minThreshold); checker.setMaxInspections(maxInspections); checker.setMinQueryLength(minQueryLength); checker.setMaxQueryLength(maxQueryLength); checker.setMaxQueryFrequency(maxQueryFrequency); checker.setLowerCaseTerms(false); return name; } @Override public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {} @Override public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {} @Override public SpellingResult getSuggestions(SpellingOptions options) throws IOException { log.debug("getSuggestions: {}", options.tokens); SpellingResult result = new SpellingResult(); float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy; for (Token token : options.tokens) { if (token.length() == 0) { result.add(token, Collections.emptyList()); continue; } String tokenText = token.toString(); Term term = new Term(field, tokenText); int freq = options.reader.docFreq(term); int count = (options.alternativeTermCount > 0 && freq > 0) ? options.alternativeTermCount : options.count; SuggestWord[] suggestions = checker.suggestSimilar(term, count, options.reader, options.suggestMode, accuracy); result.addFrequency(token, freq); // If considering alternatives to "correctly-spelled" terms, then add the // original as a viable suggestion. 
if (options.alternativeTermCount > 0 && freq > 0) { boolean foundOriginal = false; SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1]; for (int i = 0; i < suggestions.length; i++) { if (suggestions[i].string.equals(tokenText)) { foundOriginal = true; break; } suggestionsWithOrig[i + 1] = suggestions[i]; } if (!foundOriginal) { SuggestWord orig = new SuggestWord(); orig.freq = freq; orig.string = tokenText; suggestionsWithOrig[0] = orig; suggestions = suggestionsWithOrig; } } if (suggestions.length == 0 && freq == 0) { List empty = Collections.emptyList(); result.add(token, empty); } else { for (SuggestWord suggestion : suggestions) { result.add(token, suggestion.string, suggestion.freq); } } } return result; } @Override public void modifyRequest(ResponseBuilder rb, ShardRequest sreq) { if (1.0F <= checker.getMaxQueryFrequency()) { // we need extended results in order to prune things individual shards might think are below // max freq sreq.params.set(SpellingParams.SPELLCHECK_EXTENDED_RESULTS, true); sreq.params.set(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, true); } } @Override public SpellingResult mergeSuggestions( SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) { for (String original : new ArrayList<>(mergeData.origVsSuggested.keySet())) { if (mergeData.origVsFreq.containsKey(original)) { if (1.0F <= checker.getMaxQueryFrequency()) { // absolute maxQueryFreq threshold if (checker.getMaxQueryFrequency() < mergeData.origVsFreq.get(original)) { // one or more shards thought the word needed suggestions because it's // (per-shard) origFreq was too low, but the aggregate sum of origFreq // is above our threshold, so ignore those suggestions. mergeData.removeOriginal(original); } } else { // percentage maxQueryFreq threshold // This situation is also problematic, but in the reverse situation of // the absolute maxQueryFreq threshold. 
// // An individual shard may have found that it's (per-shard) origFreq // was higher then the computed max for that shard (relative to the // per-shard maxDoc) and said it's "correctlySpelled" even though // it's cumulative origFreq may not meet the computed max across the // entire collection. // // But we don't have a straightforward way to determine that, so for // now it's just a documented deficiency in the ref-guide. } } } return super.mergeSuggestions(mergeData, numSug, count, extendedResults); } @Override public float getAccuracy() { return checker.getAccuracy(); } @Override public StringDistance getStringDistance() { return checker.getDistance(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy