All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.hunspell.Hunspell Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

/**
 * A spell checker based on Hunspell dictionaries. This class can be used in place of native
 * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
 * languages are supported yet. For example:
 *
 * 
    *
  • Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the * source code *
  • Languages with Unicode characters outside of the Basic Multilingual Plane *
  • PHONE affix file option for suggestions *
* *

The objects of this class are thread-safe. */ public class Hunspell { static final long SUGGEST_TIME_LIMIT = 250; final Dictionary dictionary; final Stemmer stemmer; private final TimeoutPolicy policy; final Runnable checkCanceled; public Hunspell(Dictionary dictionary) { this(dictionary, RETURN_PARTIAL_RESULT, () -> {}); } /** * @param policy a strategy determining what to do when API calls take too much time * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking * or suggestion generation by throwing an exception */ public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) { this.dictionary = dictionary; this.policy = policy; this.checkCanceled = checkCanceled; this.stemmer = new Stemmer(dictionary); } /** * @return whether the given word's spelling is considered correct according to Hunspell rules */ public boolean spell(String word) { checkCanceled.run(); if (word.isEmpty()) return true; if (dictionary.needsInputCleaning(word)) { word = dictionary.cleanInput(word, new StringBuilder()).toString(); } if (word.endsWith(".")) { return spellWithTrailingDots(word); } return spellClean(word); } private boolean spellClean(String word) { if (isNumber(word)) { return true; } char[] wordChars = word.toCharArray(); Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null); if (simpleResult != null) { return simpleResult; } if (checkCompounds(wordChars, wordChars.length, null)) { return true; } WordCase wc = stemmer.caseOf(wordChars, wordChars.length); if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) { Stemmer.CaseVariationProcessor variationProcessor = (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase); if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) { return true; } } if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { return tryBreaks(word); } return false; } private boolean spellWithTrailingDots(String word) { int length = word.length() - 1; while (length > 0 && word.charAt(length - 1) == '.') { length--; } return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1)); } boolean checkWord(String word) { return checkWord(word.toCharArray(), word.length(), null); } Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { Root entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD); if (entry != null) { return !dictionary.hasFlag(entry.entryId(), dictionary.forbiddenword); } return null; } private boolean checkWord(char[] wordChars, int length, WordCase originalCase) { Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase); if (simpleResult != null) { return simpleResult; } return checkCompounds(wordChars, length, originalCase); } private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) { if (dictionary.compoundRules != null && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { return true; } if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) { return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null); } return false; } Root findStem( char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { checkCanceled.run(); WordCase toCheck = context != COMPOUND_MIDDLE && context != COMPOUND_END ? originalCase : null; @SuppressWarnings({"rawtypes", "unchecked"}) Root[] result = new Root[1]; stemmer.doStem( wordChars, offset, length, context, (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> { if (!acceptCase(toCheck, formID, stem)) { return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG); } if (acceptsStem(formID)) { result[0] = new Root<>(stem, formID); } return false; }); return result[0]; } private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) { boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase); if (originalCase != null) { if (keepCase && dictionary.checkSharpS && originalCase == WordCase.TITLE && containsSharpS(root.chars, root.offset, root.length)) { return true; } return !keepCase; } return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG); } private boolean containsSharpS(char[] word, int offset, int length) { for (int i = 0; i < length; i++) { if (word[i + offset] == 'ß') { return true; } } return false; } boolean acceptsStem(int formID) { return true; } private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) { if (prev != null && prev.index > dictionary.compoundMax - 2) return false; if (prev == null && word.offset != 0) { // we check the word's beginning for FORCEUCASE and expect to find it at 0 throw new IllegalArgumentException(); } int limit = word.length - dictionary.compoundMin + 1; for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakOffset = word.offset + breakPos; if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { Root stem = findStem(word.chars, word.offset, breakPos, originalCase, context); if (stem == null && dictionary.simplifiedTriple && word.chars[breakOffset - 1] == word.chars[breakOffset]) { stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); } if (stem != null && !dictionary.hasFlag(stem.entryId(), dictionary.forbiddenword) && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); if (checkCompoundsAfter(originalCase, part)) { return true; } } } if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) { return true; } } return false; } private boolean checkCompoundPatternReplacements( CharsRef word, int pos, WordCase originalCase, CompoundPart prev) { for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) { CharsRef expanded = pattern.expandReplacement(word, pos); if (expanded != null) { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakPos = pos + pattern.endLength(); Root stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); if (stem != null) { CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern); if (checkCompoundsAfter(originalCase, part)) { return true; } } } } return false; } private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) { CharsRef word = prev.tail; int breakPos = prev.length; int remainingLength = word.length - breakPos; int breakOffset = word.offset + breakPos; Root lastRoot = findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); if (lastRoot != null && !dictionary.hasFlag(lastRoot.entryId(), dictionary.forbiddenword) && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot)) && !hasForceUCaseProblem(lastRoot, originalCase, word.chars) && prev.mayCompound(lastRoot, remainingLength, originalCase)) { return true; } CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength); return checkCompounds(tail, originalCase, prev); } private boolean hasForceUCaseProblem(Root root, WordCase originalCase, char[] wordChars) { if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false; if (originalCase == null && Character.isUpperCase(wordChars[0])) return false; return dictionary.hasFlag(root.entryId(), dictionary.forceUCase); } /** * Find all roots that could result in the given word after case conversion and adding affixes. * This corresponds to the original {@code hunspell -s} (stemming) functionality. * *

Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic * file (but differ from some existing entries in case). No roots are returned for compound words. * *

The returned roots may be used to retrieve morphological data via {@link * Dictionary#lookupEntries}. */ public List getRoots(String word) { return stemmer.stem(word).stream().map(CharsRef::toString).distinct().toList(); } /** * @return all possible analyses of the given word with stems, prefixes, suffixed and * morphological data. Note that the order of the returned objects might not correspond to the * *.dic file order! */ public List analyzeSimpleWord(String word) { List result = new ArrayList<>(); stemmer.analyze( word.toCharArray(), word.length(), (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> { List prefixes = new ArrayList<>(); List suffixes = new ArrayList<>(); if (outerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, outerPrefix)); if (innerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, innerPrefix)); if (outerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, outerSuffix)); if (innerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, innerSuffix)); DictEntry entry = dictionary.dictEntry(stem.toString(), formID, morphDataId); result.add(new AffixedWord(word, entry, prefixes, suffixes)); return true; }); return result; } /** * Generate all word forms for all dictionary entries with the given root word. The result order * is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package. * * @see WordFormGenerator for finer-grained APIs */ public List getAllWordForms(String root) { return new WordFormGenerator(dictionary).getAllWordForms(root, checkCanceled); } /** * Given a list of words, try to produce a smaller set of dictionary entries (with some flags) * that would generate these words. This is equivalent to "munch" from the "hunspell-tools" * package. * * @see WordFormGenerator#compress for more details and control */ public EntrySuggestion compress(List words) { return new WordFormGenerator(dictionary).compress(words, Set.of(), checkCanceled); } private class CompoundPart { final CompoundPart prev; final int index, length; final CharsRef tail; final Root root; final CheckCompoundPattern enablingPattern; CompoundPart( CompoundPart prev, CharsRef tail, int length, Root root, CheckCompoundPattern enabler) { this.prev = prev; this.tail = tail; this.length = length; this.root = root; index = prev == null ? 1 : prev.index + 1; enablingPattern = enabler; } @Override public String toString() { return (prev == null ? "" : prev + "+") + tail.subSequence(0, length); } boolean mayCompound(Root nextRoot, int nextPartLength, WordCase originalCase) { boolean patternsOk = enablingPattern != null ? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot) : dictionary.checkCompoundPatterns.stream() .noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot)); if (!patternsOk) { return false; } if (dictionary.checkCompoundRep && isMisspelledSimpleWord(length + nextPartLength, originalCase)) { return false; } char[] spaceSeparated = new char[length + nextPartLength + 1]; System.arraycopy(tail.chars, tail.offset, spaceSeparated, 0, length); System.arraycopy( tail.chars, tail.offset + length, spaceSeparated, length + 1, nextPartLength); spaceSeparated[length] = ' '; return !Boolean.TRUE.equals(checkSimpleWord(spaceSeparated, spaceSeparated.length, null)); } private boolean isMisspelledSimpleWord(int length, WordCase originalCase) { String word = new String(tail.chars, tail.offset, length); for (RepEntry entry : dictionary.repTable) { if (entry.isMiddle()) { for (String sug : entry.substitute(word)) { if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) { return true; } } } } return false; } } private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) { if (dictionary.checkCompoundCase) { char a = chars[breakPos - 1]; char b = chars[breakPos]; if ((Character.isUpperCase(a) || Character.isUpperCase(b)) && a != '-' && b != '-') { return false; } } if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) { //noinspection RedundantIfStatement if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1] || breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) { return false; } } return true; } private boolean checkCompoundRules( char[] wordChars, int offset, int length, List words) { if (words.size() >= 100) return false; checkCanceled.run(); int limit = length - dictionary.compoundMin + 1; for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos); if (forms != null) { words.add(forms); if (mayHaveCompoundRule(words)) { if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { return true; } if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) { return true; } } words.remove(words.size() - 1); } } return false; } private boolean mayHaveCompoundRule(List words) { for (CompoundRule rule : dictionary.compoundRules) { if (rule.mayMatch(words)) { return true; } } return false; } private boolean checkLastCompoundPart( char[] wordChars, int start, int length, List words) { IntsRef ref = new IntsRef(new int[1], 0, 1); words.add(ref); Stemmer.RootProcessor stopOnMatching = (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> { ref.ints[0] = formID; for (CompoundRule r : dictionary.compoundRules) { if (r.fullyMatches(words)) { return false; } } return true; }; boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching); words.remove(words.size() - 1); return found; } private static boolean isNumber(String s) { int i = 0; while (i < s.length()) { char c = s.charAt(i); if (isDigit(c)) { i++; } else if (c == '.' || c == ',' || c == '-') { if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) { return false; } i += 2; } else { return false; } } return true; } private static boolean isDigit(char c) { return c >= '0' && c <= '9'; } private boolean tryBreaks(String word) { for (String br : dictionary.breaks.starting) { if (word.length() > br.length() && word.startsWith(br)) { if (spell(word.substring(br.length()))) { return true; } } } for (String br : dictionary.breaks.ending) { if (word.length() > br.length() && word.endsWith(br)) { if (spell(word.substring(0, word.length() - br.length()))) { return true; } } } for (String br : dictionary.breaks.middle) { int pos = word.indexOf(br); if (canBeBrokenAt(word, br, pos)) { return true; } // try to break at the second occurrence // to recognize dictionary words with a word break if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { return true; } } return false; } private boolean hasTooManyBreakOccurrences(String word) { int occurrences = 0; for (String br : dictionary.breaks.middle) { int pos = 0; while ((pos = word.indexOf(br, pos)) >= 0) { if (++occurrences >= 10) return true; pos += br.length(); } } return false; } private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { return breakPos > 0 && breakPos < word.length() - breakStr.length() && spell(word.substring(0, breakPos)) && spell(word.substring(breakPos + breakStr.length())); } /** * @return suggestions for the given misspelled word * @throws SuggestionTimeoutException if the computation takes too long and {@link * TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor * @see Suggester for finer-grained APIs and performance optimizations */ public List suggest(String word) throws SuggestionTimeoutException { return suggest(word, SUGGEST_TIME_LIMIT); } /** * @param word the misspelled word to calculate suggestions for * @param timeLimitMs the duration limit in milliseconds, after which the associated {@link * TimeoutPolicy}'s effects (exception or partial result) may kick in * @throws SuggestionTimeoutException if the computation takes too long and {@link * TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor * @see Suggester for finer-grained APIs and performance optimizations */ public List suggest(String word, long timeLimitMs) throws SuggestionTimeoutException { Suggester suggester = new Suggester(dictionary); if (policy == NO_TIMEOUT) return suggester.suggestNoTimeout(word, checkCanceled); try { return suggester.suggestWithTimeout(word, timeLimitMs, checkCanceled); } catch (SuggestionTimeoutException e) { if (policy == RETURN_PARTIAL_RESULT) { return e.getPartialResult(); } throw e; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy