org.apache.lucene.analysis.hunspell.Suggester Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-common Show documentation
Show all versions of lucene-analysis-common Show documentation
Apache Lucene (module: common)
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.CharsRef;
/**
 * A generator for misspelled word corrections based on Hunspell flags. The suggestions are searched
 * for in two main ways:
 *
 * <ol>
 *   <li>Modification: trying to insert/remove/delete/swap parts of the word to get something
 *       acceptable. The performance of this part depends heavily on the contents of TRY, MAP, REP,
 *       KEY directives in the .aff file. To speed up this part, consider using {@link
 *       #withFragmentChecker}.
 *   <li>Enumeration: if the modification hasn't produced "good enough" suggestions, the whole
 *       dictionary is scanned and simple affixes are added onto the entries to check if that
 *       produces anything similar to the given misspelled word. This depends on the dictionary size
 *       and the affix count, and it can take a noticeable amount of time. To speed this up, {@link
 *       #withSuggestibleEntryCache()} can be used.
 * </ol>
 */
public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
public Suggester(Dictionary dictionary) {
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
}
private Suggester(
Dictionary dictionary,
SuggestibleEntryCache suggestibleCache,
FragmentChecker checker,
boolean proceedPastRep) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
this.fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}
/**
* Returns a copy of this suggester instance with better "Enumeration" phase performance (see
* {@link Suggester} documentation), but using more memory. With this option, the dictionary
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
}
/**
* Returns a copy of this suggester instance with {@link FragmentChecker} hint that can improve
* the performance of the "Modification" phase performance.
*/
public Suggester withFragmentChecker(FragmentChecker checker) {
return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
}
/**
* Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
* after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
* not always be desirable, e.g., if we have "REP i ea", "tims" be replaced only by "teams" and
* not "times", which could also be meant.
*/
public Suggester proceedPastRep() {
return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
}
/**
* Compute suggestions for the given misspelled word
*
* @param word the misspelled word to calculate suggestions for
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
* generation by throwing an exception
*/
public List suggestNoTimeout(String word, Runnable checkCanceled) {
LinkedHashSet suggestions = new LinkedHashSet<>();
return suggest(word, suggestions, handleCustomTimeoutException(checkCanceled, suggestions));
}
private Runnable handleCustomTimeoutException(
Runnable checkCanceled, LinkedHashSet suggestions) {
return () -> {
try {
checkCanceled.run();
} catch (SuggestionTimeoutException e) {
if (e.getPartialResult() != null) {
throw e;
}
throw new SuggestionTimeoutException(e.getMessage(), postprocess(suggestions));
}
};
}
/**
* @param word the misspelled word to calculate suggestions for
* @param timeLimitMs the duration limit in milliseconds after which the computation is interruped
* by an exception
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
* generation by throwing an exception
* @throws SuggestionTimeoutException if the computation takes too long. Use {@link
* SuggestionTimeoutException#getPartialResult()} to get the suggestions computed up to that
* point
*/
public List suggestWithTimeout(String word, long timeLimitMs, Runnable checkCanceled)
throws SuggestionTimeoutException {
LinkedHashSet suggestions = new LinkedHashSet<>();
Runnable checkTime = checkTimeLimit(word, suggestions, timeLimitMs, checkCanceled);
return suggest(word, suggestions, handleCustomTimeoutException(checkTime, suggestions));
}
private List suggest(
String word, LinkedHashSet suggestions, Runnable checkCanceled)
throws SuggestionTimeoutException {
checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
Hunspell suggestionSpeller =
new Hunspell(dictionary, NO_TIMEOUT, checkCanceled) {
// Cache for expensive "findStem" requests issued when trying to split a compound word.
// The suggestion algorithm issues many of them, often with the same text.
// The cache can be large, but will be GC-ed after the "suggest" call.
final Map>> compoundCache = new HashMap<>();
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}
@Override
Root findStem(
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
if (context == COMPOUND_BEGIN && originalCase == null) {
return compoundCache
.computeIfAbsent(
new String(chars, offset, length),
__ ->
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
.orElse(null);
}
return super.findStem(chars, offset, length, originalCase, context);
}
};
WordCase wordCase = WordCase.caseOf(word);
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
String title = dictionary.toTitleCase(word);
if (suggestionSpeller.spell(title)) {
return Collections.singletonList(title);
}
}
boolean hasGoodSuggestions =
new ModifyingSuggester(
suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
.suggest();
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
List generated =
new GeneratingSuggester(suggestionSpeller, suggestibleCache)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
for (String raw : generated) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
for (String raw : modifyChunksBetweenDashes(word, suggestionSpeller, checkCanceled)) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}
return postprocess(suggestions);
}
private Runnable checkTimeLimit(
String word, Set suggestions, long timeLimitMs, Runnable checkCanceled) {
return new Runnable() {
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
int invocationCounter = 100;
@Override
public void run() {
checkCanceled.run();
if (--invocationCounter <= 0) {
if (System.nanoTime() - deadline > 0) {
stop();
}
invocationCounter = 100;
}
}
private void stop() {
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
throw new SuggestionTimeoutException(message, postprocess(suggestions));
}
};
}
private List postprocess(Collection suggestions) {
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
}
private List modifyChunksBetweenDashes(
String word, Hunspell speller, Runnable checkCanceled) {
List result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}
if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!speller.spell(chunk)) {
for (String chunkSug : suggestNoTimeout(chunk, checkCanceled)) {
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
if (speller.spell(replaced)) {
result.add(replaced);
}
}
}
}
chunkStart = chunkEnd + 1;
}
return result;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy