All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.hunspell.Hunspell Maven / Gradle / Ivy

There is a newer version: 8.11.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

/**
 * A spell checker based on Hunspell dictionaries. This class can be used in place of native
 * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
 * languages are supported yet. For example:
 *
 * 
    *
  • Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the * source code *
  • Languages with Unicode characters outside of the Basic Multilingual Plane *
  • PHONE affix file option for suggestions *
* *

The objects of this class are thread-safe. */ public class Hunspell { static final long SUGGEST_TIME_LIMIT = 250; final Dictionary dictionary; final Stemmer stemmer; private final TimeoutPolicy policy; final Runnable checkCanceled; public Hunspell(Dictionary dictionary) { this(dictionary, RETURN_PARTIAL_RESULT, () -> {}); } /** * @param policy a strategy determining what to do when API calls take too much time * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking * or suggestion generation by throwing an exception */ public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) { this.dictionary = dictionary; this.policy = policy; this.checkCanceled = checkCanceled; stemmer = new Stemmer(dictionary); } /** @return whether the given word's spelling is considered correct according to Hunspell rules */ public boolean spell(String word) { checkCanceled.run(); if (word.isEmpty()) return true; if (dictionary.needsInputCleaning(word)) { word = dictionary.cleanInput(word, new StringBuilder()).toString(); } if (word.endsWith(".")) { return spellWithTrailingDots(word); } return spellClean(word); } private boolean spellClean(String word) { if (isNumber(word)) { return true; } char[] wordChars = word.toCharArray(); Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null); if (simpleResult != null) { return simpleResult; } if (checkCompounds(wordChars, wordChars.length, null)) { return true; } WordCase wc = stemmer.caseOf(wordChars, wordChars.length); if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) { Stemmer.CaseVariationProcessor variationProcessor = (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase); if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) { return true; } } if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { return tryBreaks(word); } return false; } private boolean spellWithTrailingDots(String word) 
{ int length = word.length() - 1; while (length > 0 && word.charAt(length - 1) == '.') { length--; } return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1)); } boolean checkWord(String word) { return checkWord(word.toCharArray(), word.length(), null); } Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { Root entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD); if (entry != null) { return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword); } return null; } private boolean checkWord(char[] wordChars, int length, WordCase originalCase) { Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase); if (simpleResult != null) { return simpleResult; } return checkCompounds(wordChars, length, originalCase); } private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) { if (dictionary.compoundRules != null && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { return true; } if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) { return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null); } return false; } private Root findStem( char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { checkCanceled.run(); boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END; @SuppressWarnings({"rawtypes", "unchecked"}) Root[] result = new Root[1]; stemmer.doStem( wordChars, offset, length, context, (stem, formID, morphDataId) -> { if (checkCase && !acceptCase(originalCase, formID, stem)) { return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG); } if (acceptsStem(formID)) { result[0] = new Root<>(stem, formID); } return false; }); return result[0]; } private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) { boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase); if (originalCase != null) { if (keepCase && 
dictionary.checkSharpS && originalCase == WordCase.TITLE && containsSharpS(root.chars, root.offset, root.length)) { return true; } return !keepCase; } return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG); } private boolean containsSharpS(char[] word, int offset, int length) { for (int i = 0; i < length; i++) { if (word[i + offset] == 'ß') { return true; } } return false; } boolean acceptsStem(int formID) { return true; } private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) { if (prev != null && prev.index > dictionary.compoundMax - 2) return false; if (prev == null && word.offset != 0) { // we check the word's beginning for FORCEUCASE and expect to find it at 0 throw new IllegalArgumentException(); } int limit = word.length - dictionary.compoundMin + 1; for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakOffset = word.offset + breakPos; if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { Root stem = findStem(word.chars, word.offset, breakPos, originalCase, context); if (stem == null && dictionary.simplifiedTriple && word.chars[breakOffset - 1] == word.chars[breakOffset]) { stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); } if (stem != null && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword) && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); if (checkCompoundsAfter(originalCase, part)) { return true; } } } if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) { return true; } } return false; } private boolean checkCompoundPatternReplacements( CharsRef word, int pos, WordCase originalCase, CompoundPart prev) { for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) { CharsRef expanded = pattern.expandReplacement(word, pos); if 
(expanded != null) { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakPos = pos + pattern.endLength(); Root stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); if (stem != null) { CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern); if (checkCompoundsAfter(originalCase, part)) { return true; } } } } return false; } private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) { CharsRef word = prev.tail; int breakPos = prev.length; int remainingLength = word.length - breakPos; int breakOffset = word.offset + breakPos; Root lastRoot = findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); if (lastRoot != null && !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword) && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot)) && !hasForceUCaseProblem(lastRoot, originalCase, word.chars) && prev.mayCompound(lastRoot, remainingLength, originalCase)) { return true; } CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength); return checkCompounds(tail, originalCase, prev); } private boolean hasForceUCaseProblem(Root root, WordCase originalCase, char[] wordChars) { if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false; if (originalCase == null && Character.isUpperCase(wordChars[0])) return false; return dictionary.hasFlag(root.entryId, dictionary.forceUCase); } /** * Find all roots that could result in the given word after case conversion and adding affixes. * This corresponds to the original {@code hunspell -s} (stemming) functionality. * *

Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic * file (but differ from some existing entries in case). No roots are returned for compound words. * *

The returned roots may be used to retrieve morphological data via {@link * Dictionary#lookupEntries}. */ public List getRoots(String word) { return stemmer.stem(word).stream() .map(CharsRef::toString) .distinct() .collect(Collectors.toList()); } private class CompoundPart { final CompoundPart prev; final int index, length; final CharsRef tail; final Root root; final CheckCompoundPattern enablingPattern; CompoundPart( CompoundPart prev, CharsRef tail, int length, Root root, CheckCompoundPattern enabler) { this.prev = prev; this.tail = tail; this.length = length; this.root = root; index = prev == null ? 1 : prev.index + 1; enablingPattern = enabler; } @Override public String toString() { return (prev == null ? "" : prev + "+") + tail.subSequence(0, length); } boolean mayCompound(Root nextRoot, int nextPartLength, WordCase originalCase) { boolean patternsOk = enablingPattern != null ? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot) : dictionary.checkCompoundPatterns.stream() .noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot)); if (!patternsOk) { return false; } if (dictionary.checkCompoundRep && isMisspelledSimpleWord(length + nextPartLength, originalCase)) { return false; } char[] spaceSeparated = new char[length + nextPartLength + 1]; System.arraycopy(tail.chars, tail.offset, spaceSeparated, 0, length); System.arraycopy( tail.chars, tail.offset + length, spaceSeparated, length + 1, nextPartLength); spaceSeparated[length] = ' '; return !Boolean.TRUE.equals(checkSimpleWord(spaceSeparated, spaceSeparated.length, null)); } private boolean isMisspelledSimpleWord(int length, WordCase originalCase) { String word = new String(tail.chars, tail.offset, length); for (RepEntry entry : dictionary.repTable) { if (entry.isMiddle()) { for (String sug : entry.substitute(word)) { if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) { return true; } } } } return false; } } private boolean mayBreakIntoCompounds(char[] 
chars, int offset, int length, int breakPos) { if (dictionary.checkCompoundCase) { char a = chars[breakPos - 1]; char b = chars[breakPos]; if ((Character.isUpperCase(a) || Character.isUpperCase(b)) && a != '-' && b != '-') { return false; } } if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) { //noinspection RedundantIfStatement if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1] || breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) { return false; } } return true; } private boolean checkCompoundRules( char[] wordChars, int offset, int length, List words) { if (words.size() >= 100) return false; int limit = length - dictionary.compoundMin + 1; for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { checkCanceled.run(); IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos); if (forms != null) { words.add(forms); if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) { if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { return true; } if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) { return true; } } words.remove(words.size() - 1); } } return false; } private boolean checkLastCompoundPart( char[] wordChars, int start, int length, List words) { IntsRef ref = new IntsRef(new int[1], 0, 1); words.add(ref); Stemmer.RootProcessor stopOnMatching = (stem, formID, morphDataId) -> { ref.ints[0] = formID; return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words)); }; boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching); words.remove(words.size() - 1); return found; } private static boolean isNumber(String s) { int i = 0; while (i < s.length()) { char c = s.charAt(i); if (isDigit(c)) { i++; } else if (c == '.' 
|| c == ',' || c == '-') { if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) { return false; } i += 2; } else { return false; } } return true; } private static boolean isDigit(char c) { return c >= '0' && c <= '9'; } private boolean tryBreaks(String word) { for (String br : dictionary.breaks.starting) { if (word.length() > br.length() && word.startsWith(br)) { if (spell(word.substring(br.length()))) { return true; } } } for (String br : dictionary.breaks.ending) { if (word.length() > br.length() && word.endsWith(br)) { if (spell(word.substring(0, word.length() - br.length()))) { return true; } } } for (String br : dictionary.breaks.middle) { int pos = word.indexOf(br); if (canBeBrokenAt(word, br, pos)) { return true; } // try to break at the second occurrence // to recognize dictionary words with a word break if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { return true; } } return false; } private boolean hasTooManyBreakOccurrences(String word) { int occurrences = 0; for (String br : dictionary.breaks.middle) { int pos = 0; while ((pos = word.indexOf(br, pos)) >= 0) { if (++occurrences >= 10) return true; pos += br.length(); } } return false; } private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { return breakPos > 0 && breakPos < word.length() - breakStr.length() && spell(word.substring(0, breakPos)) && spell(word.substring(breakPos + breakStr.length())); } /** * @return suggestions for the given misspelled word * @throws SuggestionTimeoutException if the computation takes too long and {@link * TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor */ public List suggest(String word) throws SuggestionTimeoutException { return suggest(word, SUGGEST_TIME_LIMIT); } /** * @param word the misspelled word to calculate suggestions for * @param timeLimitMs the duration limit in milliseconds, after which the associated {@link * TimeoutPolicy}'s effects (exception or partial result) may kick in * @throws 
SuggestionTimeoutException if the computation takes too long and {@link * TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor */ public List suggest(String word, long timeLimitMs) throws SuggestionTimeoutException { checkCanceled.run(); if (word.length() >= 100) return Collections.emptyList(); if (dictionary.needsInputCleaning(word)) { word = dictionary.cleanInput(word, new StringBuilder()).toString(); } WordCase wordCase = WordCase.caseOf(word); if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) { String title = dictionary.toTitleCase(word); if (spell(title)) { return Collections.singletonList(title); } } LinkedHashSet suggestions = new LinkedHashSet<>(); Runnable checkCanceled = policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, wordCase, suggestions, timeLimitMs); try { doSuggest(word, wordCase, suggestions, checkCanceled); } catch (SuggestionTimeoutException e) { if (policy == RETURN_PARTIAL_RESULT) { return postprocess(word, wordCase, suggestions); } throw e; } return postprocess(word, wordCase, suggestions); } private void doSuggest( String word, WordCase wordCase, LinkedHashSet suggestions, Runnable checkCanceled) { Hunspell suggestionSpeller = new Hunspell(dictionary, policy, checkCanceled) { @Override boolean acceptsStem(int formID) { return !dictionary.hasFlag(formID, dictionary.noSuggest) && !dictionary.hasFlag(formID, dictionary.subStandard); } }; ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions); boolean hasGoodSuggestions = modifier.suggest(word, wordCase); if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) { suggestions.addAll( new GeneratingSuggester(suggestionSpeller) .suggest(dictionary.toLowerCase(word), wordCase, suggestions)); } if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) { suggestions.addAll(modifyChunksBetweenDashes(word)); } } private Runnable checkTimeLimit( String word, WordCase wordCase, Set suggestions, long 
timeLimitMs) { return new Runnable() { final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs); int invocationCounter = 100; @Override public void run() { checkCanceled.run(); if (--invocationCounter <= 0) { if (System.nanoTime() - deadline > 0) { stop(); } invocationCounter = 100; } } private void stop() { List partialResult = policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions); String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word; throw new SuggestionTimeoutException(message, partialResult); } }; } private List postprocess(String word, WordCase wordCase, Collection suggestions) { Set result = new LinkedHashSet<>(); for (String candidate : suggestions) { result.add(adjustSuggestionCase(candidate, wordCase, word)); if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) { result.add(candidate); } } return result.stream().map(this::cleanOutput).collect(Collectors.toList()); } private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) { if (originalCase == WordCase.UPPER) { String upper = candidate.toUpperCase(Locale.ROOT); if (upper.contains(" ") || spell(upper)) { return upper; } } if (Character.isUpperCase(original.charAt(0))) { String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1); if (title.contains(" ") || spell(title)) { return title; } } return candidate; } private List modifyChunksBetweenDashes(String word) { List result = new ArrayList<>(); int chunkStart = 0; while (chunkStart < word.length()) { int chunkEnd = word.indexOf('-', chunkStart); if (chunkEnd < 0) { chunkEnd = word.length(); } if (chunkEnd > chunkStart) { String chunk = word.substring(chunkStart, chunkEnd); if (!spell(chunk)) { for (String chunkSug : suggest(chunk)) { String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd); if (spell(replaced)) { result.add(replaced); } } } } chunkStart = 
chunkEnd + 1; } return result; } private String cleanOutput(String s) { if (dictionary.oconv == null) return s; StringBuilder sb = new StringBuilder(s); dictionary.oconv.applyMappings(sb); return sb.toString(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy