// org.apache.lucene.analysis.hunspell.Hunspell (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
/**
 * A spell checker based on Hunspell dictionaries. This class can be used in place of native
 * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
 * languages are supported yet. For example:
 *
 * <ul>
 *   <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the
 *       source code)
 *   <li>Languages with Unicode characters outside of the Basic Multilingual Plane
 *   <li>PHONE affix file option for suggestions
 * </ul>
 *
 * <p>The objects of this class are thread-safe.
 */
public class Hunspell {
/** Default time limit, in milliseconds, applied by {@link #suggest(String)}. */
static final long SUGGEST_TIME_LIMIT = 250;
/** The dictionary whose entries, flags and options drive all checks in this class. */
final Dictionary dictionary;
/** Stemmer created over {@link #dictionary}; used to look up roots of candidate words. */
final Stemmer stemmer;
/** Decides what happens when suggestion generation runs past its time limit. */
private final TimeoutPolicy policy;
/** Invoked repeatedly during checks; it may throw to abort the current operation. */
final Runnable checkCanceled;
/**
 * Creates a spell checker that returns partial results on timeout and has a no-op cancellation
 * check.
 *
 * @param dictionary the dictionary to check words against
 */
public Hunspell(Dictionary dictionary) {
this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
}
/**
 * Creates a spell checker over the given dictionary and builds a {@link Stemmer} for it.
 *
 * @param dictionary the dictionary to check words against
 * @param policy a strategy determining what to do when API calls take too much time
 * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
 *     or suggestion generation by throwing an exception
 */
public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
this.dictionary = dictionary;
this.policy = policy;
this.checkCanceled = checkCanceled;
stemmer = new Stemmer(dictionary);
}
/**
 * Checks the spelling of a single word.
 *
 * <p>The empty string is always accepted. The word is first passed through the dictionary's
 * input cleaning when the dictionary asks for it; words ending with a dot get the dedicated
 * trailing-dot treatment.
 *
 * @param word the word to check
 * @return whether the given word's spelling is considered correct according to Hunspell rules
 */
public boolean spell(String word) {
  checkCanceled.run();
  if (word.isEmpty()) {
    return true;
  }

  String cleaned =
      dictionary.needsInputCleaning(word)
          ? dictionary.cleanInput(word, new StringBuilder()).toString()
          : word;
  return cleaned.endsWith(".") ? spellWithTrailingDots(cleaned) : spellClean(cleaned);
}
/**
 * Checks an already cleaned word. The stages are tried in order and the first conclusive one
 * wins: number check, simple dictionary word, compound word, case variations (only for
 * upper-case and title-case words) and finally splitting at the dictionary's break strings.
 *
 * @param word the cleaned word
 * @return whether the word is accepted by any of the stages
 */
private boolean spellClean(String word) {
  if (isNumber(word)) {
    return true;
  }

  char[] chars = word.toCharArray();
  int len = chars.length;

  // A non-null answer is final: the word was found and is either allowed or forbidden.
  Boolean asSimpleWord = checkSimpleWord(chars, len, null);
  if (asSimpleWord != null) {
    return asSimpleWord;
  }

  if (checkCompounds(chars, len, null)) {
    return true;
  }

  WordCase wordCase = stemmer.caseOf(chars, len);
  boolean capitalized = wordCase == WordCase.UPPER || wordCase == WordCase.TITLE;
  if (capitalized) {
    // The processor answers "true" for every variant that is NOT a known word.
    Stemmer.CaseVariationProcessor rejectsVariant =
        (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
    boolean allVariantsRejected = stemmer.varyCase(chars, len, wordCase, rejectsVariant);
    if (!allVariantsRejected) {
      return true;
    }
  }

  boolean mayTryBreaks = dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word);
  return mayTryBreaks && tryBreaks(word);
}
/**
 * Checks a word that ends with one or more dots. All trailing dots are stripped and the word is
 * accepted if either the bare form or the bare form followed by a single dot is accepted.
 *
 * @param word a word whose last character is {@code '.'}
 */
private boolean spellWithTrailingDots(String word) {
  // Index of the first trailing dot, i.e. the length of the word without its trailing dots.
  int bareLength = word.length() - 1;
  while (bareLength > 0 && word.charAt(bareLength - 1) == '.') {
    bareLength--;
  }

  if (spellClean(word.substring(0, bareLength))) {
    return true;
  }
  return spellClean(word.substring(0, bareLength + 1));
}
/**
 * Checks the word as a simple word or a compound, without any knowledge of its original case.
 *
 * @param word the word to check
 */
boolean checkWord(String word) {
  char[] chars = word.toCharArray();
  return checkWord(chars, chars.length, null);
}
/**
 * Looks the word up as a simple (non-compound) word.
 *
 * @param wordChars buffer holding the word, starting at index 0
 * @param length number of characters of {@code wordChars} that make up the word
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @return {@code null} if no root was found; otherwise whether the found entry lacks the
 *     dictionary's "forbidden word" flag
 */
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
  Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
  if (entry == null) {
    return null;
  }
  return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
}
/**
 * Checks the word first as a simple word and, when that lookup is inconclusive, as a compound.
 *
 * @param wordChars buffer holding the word, starting at index 0
 * @param length number of characters of {@code wordChars} that make up the word
 * @param originalCase the case of the word this one was derived from, or {@code null}
 */
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
  Boolean simpleAnswer = checkSimpleWord(wordChars, length, originalCase);
  if (simpleAnswer == null) {
    return checkCompounds(wordChars, length, originalCase);
  }
  return simpleAnswer;
}
/**
 * Checks whether the word is a valid compound, first via the dictionary's compound rules (if
 * any) and then via flag-based compounding (if a compound-begin or compound flag is set).
 *
 * @param wordChars buffer holding the word, starting at index 0
 * @param length number of characters of {@code wordChars} that make up the word
 * @param originalCase the case of the word this one was derived from, or {@code null}
 */
private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
  boolean matchesCompoundRule =
      dictionary.compoundRules != null
          && checkCompoundRules(wordChars, 0, length, new ArrayList<>());
  if (matchesCompoundRule) {
    return true;
  }

  boolean flagCompoundingEnabled =
      dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET;
  return flagCompoundingEnabled
      && checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null);
}
/**
 * Asks the stemmer for roots of the given character range and returns the last one accepted by
 * the case check and by {@link #acceptsStem}.
 *
 * @param wordChars buffer holding the word
 * @param offset index of the first character to analyze
 * @param length number of characters to analyze
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @param context where the analyzed range sits within the word (simple word, compound part…)
 * @return the accepted root, or {@code null} if there is none
 */
private Root<CharsRef> findStem(
    char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
  checkCanceled.run();
  // The case check is skipped for middle and end parts of a compound.
  boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END;
  // One-element holder so that the lambda below can publish its result.
  // Generic array creation is impossible in Java, hence the raw "new Root[1]".
  @SuppressWarnings({"rawtypes", "unchecked"})
  Root<CharsRef>[] result = new Root[1];
  stemmer.doStem(
      wordChars,
      offset,
      length,
      context,
      (stem, formID, morphDataId) -> {
        if (checkCase && !acceptCase(originalCase, formID, stem)) {
          // NOTE(review): the return value presumably tells the stemmer whether to keep
          // looking; here that happens only for hidden entries. Confirm against Stemmer.
          return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
        }
        if (acceptsStem(formID)) {
          result[0] = new Root<>(stem, formID);
        }
        return false;
      });
  return result[0];
}
/**
 * Decides whether a dictionary entry is acceptable given the case of the checked word.
 *
 * <p>Without an original case, any entry not carrying the hidden flag is fine. With an original
 * case (i.e. when a case variation is being checked), entries with the "keep case" flag are
 * rejected, except for title-case words containing 'ß' in dictionaries with sharp-s checking.
 *
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @param entryId the id of the dictionary entry
 * @param root the root text of the entry
 */
private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) {
  if (originalCase == null) {
    return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
  }

  if (!dictionary.hasFlag(entryId, dictionary.keepcase)) {
    return true;
  }

  // A "keep case" entry: only the sharp-s exception can still let it through.
  return dictionary.checkSharpS
      && originalCase == WordCase.TITLE
      && containsSharpS(root.chars, root.offset, root.length);
}
/**
 * Tells whether the given range of the buffer contains the character 'ß'.
 *
 * @param word the buffer to scan
 * @param offset index of the first character to look at
 * @param length number of characters to look at
 */
private boolean containsSharpS(char[] word, int offset, int length) {
  int end = offset + length;
  int pos = offset;
  while (pos < end) {
    if (word[pos] == 'ß') {
      return true;
    }
    pos++;
  }
  return false;
}
/**
 * Decides whether the dictionary entry with the given id may be used as a stem. This base
 * implementation accepts every entry; the speller created in {@code doSuggest} overrides it to
 * reject entries carrying the dictionary's {@code noSuggest} or {@code subStandard} flags.
 *
 * @param formID the id of the dictionary entry
 */
boolean acceptsStem(int formID) {
return true;
}
/**
 * Tries to split {@code word} into a compound: every allowed break position is tried, the part
 * before the break is stemmed, and the remainder is checked via {@link #checkCompoundsAfter}.
 * Compound-pattern replacements are also tried at every break position.
 *
 * @param word the (remaining) text to split; for the first part its offset must be 0
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @param prev the compound part found before {@code word}, or {@code null} for the first part
 * @return whether {@code word} can be completed into a valid compound
 * @throws IllegalArgumentException if {@code prev} is {@code null} but the word offset isn't 0
 */
private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
  // Stop once the number of parts would exceed the dictionary's limit.
  if (prev != null && prev.index > dictionary.compoundMax - 2) return false;
  if (prev == null && word.offset != 0) {
    // we check the word's beginning for FORCEUCASE and expect to find it at 0
    throw new IllegalArgumentException(
        "The first compound part must start at offset 0, got " + word.offset);
  }

  // Doesn't depend on the break position, so it's computed once.
  WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
  // Both sides of the break must have at least compoundMin characters.
  int limit = word.length - dictionary.compoundMin + 1;
  for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
    int breakOffset = word.offset + breakPos;
    if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
      Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
      if (stem == null
          && dictionary.simplifiedTriple
          && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
        // With simplified triples, the doubled letter may belong to the first part as well.
        stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
      }
      if (stem != null
          && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
          && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
        CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
        if (checkCompoundsAfter(originalCase, part)) {
          return true;
        }
      }
    }

    if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) {
      return true;
    }
  }

  return false;
}
/**
 * Tries every compound pattern of the dictionary that can expand a replacement at {@code pos},
 * and checks whether the expanded text forms a valid compound.
 *
 * @param word the (remaining) text being split
 * @param pos the break position inside {@code word}, relative to its offset
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @param prev the compound part found before {@code word}, or {@code null} for the first part
 */
private boolean checkCompoundPatternReplacements(
    CharsRef word, int pos, WordCase originalCase, CompoundPart prev) {
  // Doesn't depend on the pattern, so it's computed once.
  WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
  for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
    CharsRef expanded = pattern.expandReplacement(word, pos);
    if (expanded == null) {
      continue;
    }

    int breakPos = pos + pattern.endLength();
    Root<CharsRef> stem =
        findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
    if (stem != null) {
      CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
      if (checkCompoundsAfter(originalCase, part)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Checks the text following the given compound part: either it is a valid last part of the
 * compound as a whole, or it's recursively split into further parts.
 *
 * @param originalCase the case of the word this one was derived from, or {@code null}
 * @param prev the most recently accepted compound part
 */
private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) {
  CharsRef word = prev.tail;
  int breakPos = prev.length;
  int remainingLength = word.length - breakPos;
  int breakOffset = word.offset + breakPos;

  Root<CharsRef> lastRoot =
      findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
  if (lastRoot != null
      && !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
      && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
      && !hasForceUCaseProblem(lastRoot, originalCase, word.chars)
      && prev.mayCompound(lastRoot, remainingLength, originalCase)) {
    return true;
  }

  // The remainder isn't a valid last part as a whole: try to split it further.
  CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
  return checkCompounds(tail, originalCase, prev);
}
private boolean hasForceUCaseProblem(Root> root, WordCase originalCase, char[] wordChars) {
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
if (originalCase == null && Character.isUpperCase(wordChars[0])) return false;
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
}
/**
 * Find all roots that could result in the given word after case conversion and adding affixes.
 * This corresponds to the original {@code hunspell -s} (stemming) functionality.
 *
 * <p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are
 * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic
 * file (but differ from some existing entries in case). No roots are returned for compound words.
 *
 * <p>The returned roots may be used to retrieve morphological data via {@link
 * Dictionary#lookupEntries}.
 *
 * @param word the word to find the roots of
 * @return the distinct roots, in the order the stemmer produced them
 */
public List<String> getRoots(String word) {
  return stemmer.stem(word).stream()
      .map(CharsRef::toString)
      .distinct()
      .collect(Collectors.toList());
}
/**
 * One already accepted part of a compound word, linked to the parts before it. Instances are
 * tied to the enclosing speller because they consult its dictionary and stemming methods.
 */
private class CompoundPart {
  /** The part before this one, or {@code null} if this is the first part. */
  final CompoundPart prev;

  /** {@code index} is the 1-based position of this part; {@code length} is its length. */
  final int index, length;

  /** The text starting at this part and running to the end of the word. */
  final CharsRef tail;

  /** The dictionary root found for this part. */
  final Root<CharsRef> root;

  /** The compound pattern whose replacement produced {@link #tail}, or {@code null}. */
  final CheckCompoundPattern enablingPattern;

  CompoundPart(
      CompoundPart prev,
      CharsRef tail,
      int length,
      Root<CharsRef> root,
      CheckCompoundPattern enabler) {
    this.prev = prev;
    this.tail = tail;
    this.length = length;
    this.root = root;
    index = prev == null ? 1 : prev.index + 1;
    enablingPattern = enabler;
  }

  /** Renders the chain of parts found so far, separated by {@code '+'}. */
  @Override
  public String toString() {
    return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
  }

  /**
   * Tells whether this part may be followed by a part with the given root.
   *
   * @param nextRoot the root of the candidate next part
   * @param nextPartLength the length of the candidate next part
   * @param originalCase the case of the word this one was derived from, or {@code null}
   */
  boolean mayCompound(Root<CharsRef> nextRoot, int nextPartLength, WordCase originalCase) {
    // A part produced by a pattern replacement is judged by that very pattern alone;
    // otherwise no pattern of the dictionary may prohibit the combination.
    boolean patternsOk =
        enablingPattern != null
            ? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
            : dictionary.checkCompoundPatterns.stream()
                .noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
    if (!patternsOk) {
      return false;
    }

    if (dictionary.checkCompoundRep
        && isMisspelledSimpleWord(length + nextPartLength, originalCase)) {
      return false;
    }

    // Reject the compound if the two parts separated by a space form an accepted entry.
    char[] spaceSeparated = new char[length + nextPartLength + 1];
    System.arraycopy(tail.chars, tail.offset, spaceSeparated, 0, length);
    System.arraycopy(
        tail.chars, tail.offset + length, spaceSeparated, length + 1, nextPartLength);
    spaceSeparated[length] = ' ';
    return !Boolean.TRUE.equals(checkSimpleWord(spaceSeparated, spaceSeparated.length, null));
  }

  /**
   * Tells whether the first {@code length} characters of {@link #tail} turn into a known simple
   * word after applying one of the dictionary's "middle" replacement entries.
   */
  private boolean isMisspelledSimpleWord(int length, WordCase originalCase) {
    String word = new String(tail.chars, tail.offset, length);
    for (RepEntry entry : dictionary.repTable) {
      if (entry.isMiddle()) {
        for (String sug : entry.substitute(word)) {
          if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) {
            return true;
          }
        }
      }
    }
    return false;
  }
}
/**
 * Tells whether the dictionary options allow a compound boundary at the given position.
 *
 * @param chars the buffer holding the word
 * @param offset index in {@code chars} where the checked word (or tail) starts
 * @param length number of characters in the checked word (or tail)
 * @param breakPos absolute index in {@code chars} of the first character after the boundary
 */
private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
  if (dictionary.checkCompoundCase) {
    char a = chars[breakPos - 1];
    char b = chars[breakPos];
    // An upper-case letter next to the boundary is only allowed around a dash.
    if ((Character.isUpperCase(a) || Character.isUpperCase(b)) && a != '-' && b != '-') {
      return false;
    }
  }
  if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
    // breakPos is an absolute index, so both bounds must take the offset into account.
    // The right-hand bound used to be "length - 1", which silently skipped the check for
    // tails that don't start at index 0.
    int end = offset + length;
    //noinspection RedundantIfStatement
    if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
        || breakPos < end - 1 && chars[breakPos] == chars[breakPos + 1]) {
      return false;
    }
  }
  return true;
}
/**
 * Recursively splits the given range into dictionary words and checks whether the sequence of
 * their entries matches one of the dictionary's compound rules.
 *
 * @param wordChars the buffer holding the word
 * @param offset index of the first character still to be split
 * @param length number of characters still to be split
 * @param words the entries of the parts found so far; modified during the search and restored
 *     before returning {@code false}
 */
private boolean checkCompoundRules(
    char[] wordChars, int offset, int length, List<IntsRef> words) {
  // Hard limit on the number of parts, to keep the recursion bounded.
  if (words.size() >= 100) return false;

  int limit = length - dictionary.compoundMin + 1;
  for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
    checkCanceled.run();
    IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
    if (forms != null) {
      words.add(forms);

      // Only go deeper if some rule may still match the parts found so far.
      if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
        if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
          return true;
        }

        if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
          return true;
        }
      }

      words.remove(words.size() - 1);
    }
  }

  return false;
}
/**
 * Checks whether the given range can serve as the last part of a rule-based compound, i.e.
 * whether one of its stems makes some compound rule fully match the sequence of parts.
 *
 * @param wordChars the buffer holding the word
 * @param start index of the first character of the last part
 * @param length number of characters in the last part
 * @param words the entries of the preceding parts; left unchanged on return
 */
private boolean checkLastCompoundPart(
    char[] wordChars, int start, int length, List<IntsRef> words) {
  // A one-element placeholder whose content is replaced for every stem the stemmer offers.
  IntsRef ref = new IntsRef(new int[1], 0, 1);
  words.add(ref);

  Stemmer.RootProcessor stopOnMatching =
      (stem, formID, morphDataId) -> {
        ref.ints[0] = formID;
        return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
      };
  boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
  words.remove(words.size() - 1);
  return found;
}
/**
 * Tells whether the string consists of ASCII digits, optionally separated by single {@code '.'},
 * {@code ','} or {@code '-'} characters. A separator may neither start nor end the string and
 * must be directly followed by a digit. The empty string is accepted.
 */
private static boolean isNumber(String s) {
  int length = s.length();
  for (int pos = 0; pos < length; pos++) {
    char current = s.charAt(pos);
    if (isDigit(current)) {
      continue;
    }

    boolean separator = current == '.' || current == ',' || current == '-';
    if (!separator) {
      return false;
    }

    boolean digitFollows = pos > 0 && pos < length - 1 && isDigit(s.charAt(pos + 1));
    if (!digitFollows) {
      return false;
    }
    // Skip the digit that was just verified; the loop header then moves past it.
    pos++;
  }
  return true;
}

/** Tells whether the character is one of the ASCII digits {@code '0'}–{@code '9'}. */
private static boolean isDigit(char c) {
  return '0' <= c && c <= '9';
}
/**
 * Tries to accept the word by splitting it at the dictionary's break strings: a leading break
 * string is stripped, a trailing one is stripped, or the word is cut at the first or second
 * occurrence of a "middle" break string, with all the resulting pieces spell-checked.
 *
 * @param word the word that failed all the other checks
 */
private boolean tryBreaks(String word) {
  int wordLength = word.length();

  for (String prefix : dictionary.breaks.starting) {
    boolean strippable = wordLength > prefix.length() && word.startsWith(prefix);
    if (strippable && spell(word.substring(prefix.length()))) {
      return true;
    }
  }

  for (String suffix : dictionary.breaks.ending) {
    boolean strippable = wordLength > suffix.length() && word.endsWith(suffix);
    if (strippable && spell(word.substring(0, wordLength - suffix.length()))) {
      return true;
    }
  }

  for (String separator : dictionary.breaks.middle) {
    int first = word.indexOf(separator);
    if (canBeBrokenAt(word, separator, first)) {
      return true;
    }

    // try to break at the second occurrence
    // to recognize dictionary words with a word break
    if (first > 0) {
      int second = word.indexOf(separator, first + 1);
      if (canBeBrokenAt(word, separator, second)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Tells whether the word contains 10 or more occurrences of the dictionary's "middle" break
 * strings in total (non-overlapping occurrences, counted per break string).
 */
private boolean hasTooManyBreakOccurrences(String word) {
  int seen = 0;
  for (String separator : dictionary.breaks.middle) {
    int step = separator.length();
    for (int at = word.indexOf(separator); at >= 0; at = word.indexOf(separator, at + step)) {
      seen++;
      if (seen >= 10) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Tells whether the break string found at {@code breakPos} splits the word into two non-empty
 * pieces that are both spelled correctly.
 *
 * @param word the word to split
 * @param breakStr the break string
 * @param breakPos index of {@code breakStr} in {@code word}; negative if it wasn't found
 */
private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
  if (breakPos <= 0) {
    return false;
  }
  int afterBreak = breakPos + breakStr.length();
  if (afterBreak >= word.length()) {
    return false;
  }
  return spell(word.substring(0, breakPos)) && spell(word.substring(afterBreak));
}
/**
 * Calculates suggestions using the default time limit of {@link #SUGGEST_TIME_LIMIT}
 * milliseconds.
 *
 * @param word the misspelled word to calculate suggestions for
 * @return suggestions for the given misspelled word
 * @throws SuggestionTimeoutException if the computation takes too long and {@link
 *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
 */
public List<String> suggest(String word) throws SuggestionTimeoutException {
  return suggest(word, SUGGEST_TIME_LIMIT);
}
/**
 * Calculates suggestions for a misspelled word. Words of 100 or more characters get no
 * suggestions at all.
 *
 * @param word the misspelled word to calculate suggestions for
 * @param timeLimitMs the duration limit in milliseconds, after which the associated {@link
 *     TimeoutPolicy}'s effects (exception or partial result) may kick in
 * @return the suggestions, with their case adjusted to the case of {@code word}
 * @throws SuggestionTimeoutException if the computation takes too long and {@link
 *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
 */
public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
  checkCanceled.run();
  if (word.length() >= 100) return Collections.emptyList();

  if (dictionary.needsInputCleaning(word)) {
    word = dictionary.cleanInput(word, new StringBuilder()).toString();
  }

  WordCase wordCase = WordCase.caseOf(word);
  if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
    // With FORCEUCASE in use, a correctly spelled title-case form is the only suggestion.
    String title = dictionary.toTitleCase(word);
    if (spell(title)) {
      return Collections.singletonList(title);
    }
  }

  // Insertion-ordered, so that earlier (better) suggestions come first.
  LinkedHashSet<String> suggestions = new LinkedHashSet<>();
  Runnable cancellationCheck =
      policy == NO_TIMEOUT
          ? this.checkCanceled
          : checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
  try {
    doSuggest(word, wordCase, suggestions, cancellationCheck);
  } catch (SuggestionTimeoutException e) {
    if (policy != RETURN_PARTIAL_RESULT) {
      throw e;
    }
    // Otherwise fall through and return whatever has been collected so far.
  }

  return postprocess(word, wordCase, suggestions);
}
/**
 * Fills {@code suggestions} with candidates for the given word: first those obtained by
 * modifying the word, then (if these aren't good enough and n-gram suggestions are enabled)
 * generated ones, and finally ones obtained by fixing the individual chunks between dashes.
 *
 * @param word the misspelled word
 * @param wordCase the case of {@code word}
 * @param suggestions the collection receiving the candidates
 * @param checkCanceled the cancellation check used by the speller that validates candidates
 */
private void doSuggest(
    String word,
    WordCase wordCase,
    LinkedHashSet<String> suggestions,
    Runnable checkCanceled) {
  // A speller that never accepts words marked as "no suggest" or "substandard".
  Hunspell suggestionSpeller =
      new Hunspell(dictionary, policy, checkCanceled) {
        @Override
        boolean acceptsStem(int formID) {
          return !dictionary.hasFlag(formID, dictionary.noSuggest)
              && !dictionary.hasFlag(formID, dictionary.subStandard);
        }
      };
  ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
  boolean hasGoodSuggestions = modifier.suggest(word, wordCase);

  if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
    suggestions.addAll(
        new GeneratingSuggester(suggestionSpeller)
            .suggest(dictionary.toLowerCase(word), wordCase, suggestions));
  }

  if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
    suggestions.addAll(modifyChunksBetweenDashes(word));
  }
}
/**
 * Creates a cancellation check that, in addition to running this speller's own check, throws
 * {@link SuggestionTimeoutException} once the time limit has passed. The clock is consulted
 * only on every 100th invocation.
 *
 * @param word the word suggestions are calculated for (used in the exception)
 * @param wordCase the case of {@code word}
 * @param suggestions the live collection of suggestions found so far
 * @param timeLimitMs the time limit in milliseconds, counted from the moment of this call
 */
private Runnable checkTimeLimit(
    String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
  return new Runnable() {
    final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
    int invocationCounter = 100;

    @Override
    public void run() {
      checkCanceled.run();
      if (--invocationCounter <= 0) {
        // Subtraction-based comparison, as recommended for System.nanoTime values.
        if (System.nanoTime() - deadline > 0) {
          stop();
        }
        invocationCounter = 100;
      }
    }

    private void stop() {
      // With RETURN_PARTIAL_RESULT the caller builds the result itself after catching.
      List<String> partialResult =
          policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
      String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
      throw new SuggestionTimeoutException(message, partialResult);
    }
  };
}
/**
 * Turns raw candidates into the final suggestion list: adjusts their case to the case of the
 * original word, removes duplicates while keeping the order, and applies the dictionary's
 * output conversion.
 *
 * @param word the original misspelled word
 * @param wordCase the case of {@code word}
 * @param suggestions the raw candidates
 */
private List<String> postprocess(
    String word, WordCase wordCase, Collection<String> suggestions) {
  Set<String> result = new LinkedHashSet<>();
  for (String candidate : suggestions) {
    result.add(adjustSuggestionCase(candidate, wordCase, word));
    // Upper-casing loses 'ß', so the candidate is additionally offered as it is.
    if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
      result.add(candidate);
    }
  }
  return result.stream().map(this::cleanOutput).collect(Collectors.toList());
}
/**
 * Adapts the case of a suggestion to the case of the misspelled word: an upper-case word gets an
 * upper-case suggestion, and a word starting with an upper-case letter gets a capitalized one,
 * provided that the changed suggestion contains a space or is itself spelled correctly.
 *
 * @param candidate the suggestion to adjust
 * @param originalCase the case of the misspelled word
 * @param original the misspelled word
 * @return the adjusted suggestion, or {@code candidate} itself if no adjustment applies
 */
private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
  // There is nothing to adjust in an empty candidate, and charAt(0) below would throw.
  if (candidate.isEmpty()) {
    return candidate;
  }

  if (originalCase == WordCase.UPPER) {
    String upper = candidate.toUpperCase(Locale.ROOT);
    if (upper.contains(" ") || spell(upper)) {
      return upper;
    }
  }
  // The emptiness guard keeps suggestions for an empty word from failing with an exception.
  if (!original.isEmpty() && Character.isUpperCase(original.charAt(0))) {
    String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
    if (title.contains(" ") || spell(title)) {
      return title;
    }
  }
  return candidate;
}
/**
 * Produces suggestions for a dash-separated word by replacing each misspelled chunk with its own
 * suggestions, keeping only the replacements that make the whole word spelled correctly.
 *
 * @param word the misspelled word containing dashes
 */
private List<String> modifyChunksBetweenDashes(String word) {
  List<String> result = new ArrayList<>();
  int chunkStart = 0;
  while (chunkStart < word.length()) {
    int chunkEnd = word.indexOf('-', chunkStart);
    if (chunkEnd < 0) {
      chunkEnd = word.length();
    }

    // Empty chunks (adjacent dashes, a leading dash) are skipped.
    if (chunkEnd > chunkStart) {
      String chunk = word.substring(chunkStart, chunkEnd);
      if (!spell(chunk)) {
        for (String chunkSug : suggest(chunk)) {
          String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
          if (spell(replaced)) {
            result.add(replaced);
          }
        }
      }
    }

    chunkStart = chunkEnd + 1;
  }
  return result;
}
/**
 * Applies the dictionary's output conversion mappings to the string, if the dictionary has any.
 *
 * @param s the string to convert
 * @return the converted string, or {@code s} itself when there is no output conversion
 */
private String cleanOutput(String s) {
  if (dictionary.oconv != null) {
    StringBuilder converted = new StringBuilder(s);
    dictionary.oconv.applyMappings(converted);
    return converted.toString();
  }
  return s;
}
}