org.apache.lucene.analysis.hunspell.Hunspell Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-common Show documentation
Additional Analyzers
There is a newer version: 8.11.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

/**
 * A spell checker based on Hunspell dictionaries. This class can be used in place of native
 * Hunspell for many languages for spell-checking and suggesting purposes. Note that not all
 * languages are supported yet. For example:
 *
 * <ul>
 *   <li>Hungarian (as it doesn't only rely on dictionaries, but has some logic directly in the
 *       source code)
 *   <li>Languages with Unicode characters outside of the Basic Multilingual Plane
 *   <li>PHONE affix file option for suggestions
 * </ul>
 *
 * <p>The objects of this class are thread-safe.
 */
public class Hunspell {
  static final long SUGGEST_TIME_LIMIT = 250;

  final Dictionary dictionary;
  final Stemmer stemmer;
  private final TimeoutPolicy policy;
  final Runnable checkCanceled;

  public Hunspell(Dictionary dictionary) {
    this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
  }

  /**
   * @param policy a strategy determining what to do when API calls take too much time
   * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
   *     or suggestion generation by throwing an exception
   */
  public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
    this.dictionary = dictionary;
    this.policy = policy;
    this.checkCanceled = checkCanceled;
    stemmer = new Stemmer(dictionary);
  }

  /** @return whether the given word's spelling is considered correct according to Hunspell rules */
  public boolean spell(String word) {
    checkCanceled.run();
    if (word.isEmpty()) return true;

    if (dictionary.needsInputCleaning(word)) {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

    if (word.endsWith(".")) {
      return spellWithTrailingDots(word);
    }

    return spellClean(word);
  }

  private boolean spellClean(String word) {
    if (isNumber(word)) {
      return true;
    }

    char[] wordChars = word.toCharArray();
    Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null);
    if (simpleResult != null) {
      return simpleResult;
    }

    if (checkCompounds(wordChars, wordChars.length, null)) {
      return true;
    }

    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
    if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) {
      Stemmer.CaseVariationProcessor variationProcessor =
          (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
      if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) {
        return true;
      }
    }

    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
      return tryBreaks(word);
    }

    return false;
  }

  private boolean spellWithTrailingDots(String word) {
    int length = word.length() - 1;
    while (length > 0 && word.charAt(length - 1) == '.') {
      length--;
    }
    return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
  }

  boolean checkWord(String word) {
    return checkWord(word.toCharArray(), word.length(), null);
  }

  Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
    Root entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
    if (entry != null) {
      return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
    }

    return null;
  }

  private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
    Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
    if (simpleResult != null) {
      return simpleResult;
    }

    return checkCompounds(wordChars, length, originalCase);
  }

  private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
    if (dictionary.compoundRules != null
        && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
      return true;
    }

    if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
      return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, null);
    }

    return false;
  }

  private Root findStem(
      char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
    checkCanceled.run();
    boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END;
    @SuppressWarnings({"rawtypes", "unchecked"})
    Root[] result = new Root[1];
    stemmer.doStem(
        wordChars,
        offset,
        length,
        context,
        (stem, formID, morphDataId) -> {
          if (checkCase && !acceptCase(originalCase, formID, stem)) {
            return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
          }
          if (acceptsStem(formID)) {
            result[0] = new Root<>(stem, formID);
          }
          return false;
        });
    return result[0];
  }

  private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) {
    boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase);
    if (originalCase != null) {
      if (keepCase
          && dictionary.checkSharpS
          && originalCase == WordCase.TITLE
          && containsSharpS(root.chars, root.offset, root.length)) {
        return true;
      }
      return !keepCase;
    }
    return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
  }

  private boolean containsSharpS(char[] word, int offset, int length) {
    for (int i = 0; i < length; i++) {
      if (word[i + offset] == 'ß') {
        return true;
      }
    }
    return false;
  }

  boolean acceptsStem(int formID) {
    return true;
  }

  private boolean checkCompounds(CharsRef word, WordCase originalCase, CompoundPart prev) {
    if (prev != null && prev.index > dictionary.compoundMax - 2) return false;
    if (prev == null && word.offset != 0) {
      // we check the word's beginning for FORCEUCASE and expect to find it at 0
      throw new IllegalArgumentException();
    }

    int limit = word.length - dictionary.compoundMin + 1;
    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
      WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
      int breakOffset = word.offset + breakPos;
      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
        Root stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
        if (stem == null
            && dictionary.simplifiedTriple
            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
          stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
        }
        if (stem != null
            && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
            && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
          CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
          if (checkCompoundsAfter(originalCase, part)) {
            return true;
          }
        }
      }

      if (checkCompoundPatternReplacements(word, breakPos, originalCase, prev)) {
        return true;
      }
    }

    return false;
  }

  private boolean checkCompoundPatternReplacements(
      CharsRef word, int pos, WordCase originalCase, CompoundPart prev) {
    for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
      CharsRef expanded = pattern.expandReplacement(word, pos);
      if (expanded != null) {
        WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
        int breakPos = pos + pattern.endLength();
        Root stem =
            findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
        if (stem != null) {
          CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
          if (checkCompoundsAfter(originalCase, part)) {
            return true;
          }
        }
      }
    }
    return false;
  }

  private boolean checkCompoundsAfter(WordCase originalCase, CompoundPart prev) {
    CharsRef word = prev.tail;
    int breakPos = prev.length;
    int remainingLength = word.length - breakPos;
    int breakOffset = word.offset + breakPos;
    Root lastRoot =
        findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
    if (lastRoot != null
        && !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
        && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
        && !hasForceUCaseProblem(lastRoot, originalCase, word.chars)
        && prev.mayCompound(lastRoot, remainingLength, originalCase)) {
      return true;
    }

    CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
    return checkCompounds(tail, originalCase, prev);
  }

  private boolean hasForceUCaseProblem(Root root, WordCase originalCase, char[] wordChars) {
    if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
    if (originalCase == null && Character.isUpperCase(wordChars[0])) return false;
    return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
  }

  /**
   * Find all roots that could result in the given word after case conversion and adding affixes.
   * This corresponds to the original {@code hunspell -s} (stemming) functionality.
   *
   * 
Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are
   * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic
   * file (but differ from some existing entries in case). No roots are returned for compound words.
   *
   * The returned roots may be used to retrieve morphological data via {@link
   * Dictionary#lookupEntries}.
   */
  public List getRoots(String word) {
    return stemmer.stem(word).stream()
        .map(CharsRef::toString)
        .distinct()
        .collect(Collectors.toList());
  }

  private class CompoundPart {
    final CompoundPart prev;
    final int index, length;
    final CharsRef tail;
    final Root root;
    final CheckCompoundPattern enablingPattern;

    CompoundPart(
        CompoundPart prev,
        CharsRef tail,
        int length,
        Root root,
        CheckCompoundPattern enabler) {
      this.prev = prev;
      this.tail = tail;
      this.length = length;
      this.root = root;
      index = prev == null ? 1 : prev.index + 1;
      enablingPattern = enabler;
    }

    @Override
    public String toString() {
      return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
    }

    boolean mayCompound(Root nextRoot, int nextPartLength, WordCase originalCase) {
      boolean patternsOk =
          enablingPattern != null
              ? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
              : dictionary.checkCompoundPatterns.stream()
                  .noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
      if (!patternsOk) {
        return false;
      }

      if (dictionary.checkCompoundRep
          && isMisspelledSimpleWord(length + nextPartLength, originalCase)) {
        return false;
      }

      char[] spaceSeparated = new char[length + nextPartLength + 1];
      System.arraycopy(tail.chars, tail.offset, spaceSeparated, 0, length);
      System.arraycopy(
          tail.chars, tail.offset + length, spaceSeparated, length + 1, nextPartLength);
      spaceSeparated[length] = ' ';
      return !Boolean.TRUE.equals(checkSimpleWord(spaceSeparated, spaceSeparated.length, null));
    }

    private boolean isMisspelledSimpleWord(int length, WordCase originalCase) {
      String word = new String(tail.chars, tail.offset, length);
      for (RepEntry entry : dictionary.repTable) {
        if (entry.isMiddle()) {
          for (String sug : entry.substitute(word)) {
            if (findStem(sug.toCharArray(), 0, sug.length(), originalCase, SIMPLE_WORD) != null) {
              return true;
            }
          }
        }
      }
      return false;
    }
  }

  private boolean mayBreakIntoCompounds(char[] chars, int offset, int length, int breakPos) {
    if (dictionary.checkCompoundCase) {
      char a = chars[breakPos - 1];
      char b = chars[breakPos];
      if ((Character.isUpperCase(a) || Character.isUpperCase(b)) && a != '-' && b != '-') {
        return false;
      }
    }
    if (dictionary.checkCompoundTriple && chars[breakPos - 1] == chars[breakPos]) {
      //noinspection RedundantIfStatement
      if (breakPos > offset + 1 && chars[breakPos - 2] == chars[breakPos - 1]
          || breakPos < length - 1 && chars[breakPos] == chars[breakPos + 1]) {
        return false;
      }
    }
    return true;
  }

  private boolean checkCompoundRules(
      char[] wordChars, int offset, int length, List words) {
    if (words.size() >= 100) return false;

    int limit = length - dictionary.compoundMin + 1;
    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
      checkCanceled.run();
      IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
      if (forms != null) {
        words.add(forms);

        if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
          if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }

          if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }
        }

        words.remove(words.size() - 1);
      }
    }

    return false;
  }

  private boolean checkLastCompoundPart(
      char[] wordChars, int start, int length, List words) {
    IntsRef ref = new IntsRef(new int[1], 0, 1);
    words.add(ref);

    Stemmer.RootProcessor stopOnMatching =
        (stem, formID, morphDataId) -> {
          ref.ints[0] = formID;
          return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
        };
    boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
    words.remove(words.size() - 1);
    return found;
  }

  private static boolean isNumber(String s) {
    int i = 0;
    while (i < s.length()) {
      char c = s.charAt(i);
      if (isDigit(c)) {
        i++;
      } else if (c == '.' || c == ',' || c == '-') {
        if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) {
          return false;
        }
        i += 2;
      } else {
        return false;
      }
    }
    return true;
  }

  private static boolean isDigit(char c) {
    return c >= '0' && c <= '9';
  }

  private boolean tryBreaks(String word) {
    for (String br : dictionary.breaks.starting) {
      if (word.length() > br.length() && word.startsWith(br)) {
        if (spell(word.substring(br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.ending) {
      if (word.length() > br.length() && word.endsWith(br)) {
        if (spell(word.substring(0, word.length() - br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.middle) {
      int pos = word.indexOf(br);
      if (canBeBrokenAt(word, br, pos)) {
        return true;
      }

      // try to break at the second occurrence
      // to recognize dictionary words with a word break
      if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) {
        return true;
      }
    }
    return false;
  }

  private boolean hasTooManyBreakOccurrences(String word) {
    int occurrences = 0;
    for (String br : dictionary.breaks.middle) {
      int pos = 0;
      while ((pos = word.indexOf(br, pos)) >= 0) {
        if (++occurrences >= 10) return true;
        pos += br.length();
      }
    }
    return false;
  }

  private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
    return breakPos > 0
        && breakPos < word.length() - breakStr.length()
        && spell(word.substring(0, breakPos))
        && spell(word.substring(breakPos + breakStr.length()));
  }

  /**
   * @return suggestions for the given misspelled word
   * @throws SuggestionTimeoutException if the computation takes too long and {@link
   *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
   */
  public List suggest(String word) throws SuggestionTimeoutException {
    return suggest(word, SUGGEST_TIME_LIMIT);
  }

  /**
   * @param word the misspelled word to calculate suggestions for
   * @param timeLimitMs the duration limit in milliseconds, after which the associated {@link
   *     TimeoutPolicy}'s effects (exception or partial result) may kick in
   * @throws SuggestionTimeoutException if the computation takes too long and {@link
   *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
   */
  public List suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
    checkCanceled.run();
    if (word.length() >= 100) return Collections.emptyList();

    if (dictionary.needsInputCleaning(word)) {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }

    WordCase wordCase = WordCase.caseOf(word);
    if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
      String title = dictionary.toTitleCase(word);
      if (spell(title)) {
        return Collections.singletonList(title);
      }
    }

    LinkedHashSet suggestions = new LinkedHashSet<>();
    Runnable checkCanceled =
        policy == NO_TIMEOUT
            ? this.checkCanceled
            : checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
    try {
      doSuggest(word, wordCase, suggestions, checkCanceled);
    } catch (SuggestionTimeoutException e) {
      if (policy == RETURN_PARTIAL_RESULT) {
        return postprocess(word, wordCase, suggestions);
      }
      throw e;
    }

    return postprocess(word, wordCase, suggestions);
  }

  private void doSuggest(
      String word, WordCase wordCase, LinkedHashSet suggestions, Runnable checkCanceled) {
    Hunspell suggestionSpeller =
        new Hunspell(dictionary, policy, checkCanceled) {
          @Override
          boolean acceptsStem(int formID) {
            return !dictionary.hasFlag(formID, dictionary.noSuggest)
                && !dictionary.hasFlag(formID, dictionary.subStandard);
          }
        };
    ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
    boolean hasGoodSuggestions = modifier.suggest(word, wordCase);

    if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
      suggestions.addAll(
          new GeneratingSuggester(suggestionSpeller)
              .suggest(dictionary.toLowerCase(word), wordCase, suggestions));
    }

    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
      suggestions.addAll(modifyChunksBetweenDashes(word));
    }
  }

  private Runnable checkTimeLimit(
      String word, WordCase wordCase, Set suggestions, long timeLimitMs) {
    return new Runnable() {
      final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
      int invocationCounter = 100;

      @Override
      public void run() {
        checkCanceled.run();
        if (--invocationCounter <= 0) {
          if (System.nanoTime() - deadline > 0) {
            stop();
          }
          invocationCounter = 100;
        }
      }

      private void stop() {
        List partialResult =
            policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
        String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
        throw new SuggestionTimeoutException(message, partialResult);
      }
    };
  }

  private List postprocess(String word, WordCase wordCase, Collection suggestions) {
    Set result = new LinkedHashSet<>();
    for (String candidate : suggestions) {
      result.add(adjustSuggestionCase(candidate, wordCase, word));
      if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
        result.add(candidate);
      }
    }
    return result.stream().map(this::cleanOutput).collect(Collectors.toList());
  }

  private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
    if (originalCase == WordCase.UPPER) {
      String upper = candidate.toUpperCase(Locale.ROOT);
      if (upper.contains(" ") || spell(upper)) {
        return upper;
      }
    }
    if (Character.isUpperCase(original.charAt(0))) {
      String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
      if (title.contains(" ") || spell(title)) {
        return title;
      }
    }
    return candidate;
  }

  private List modifyChunksBetweenDashes(String word) {
    List result = new ArrayList<>();
    int chunkStart = 0;
    while (chunkStart < word.length()) {
      int chunkEnd = word.indexOf('-', chunkStart);
      if (chunkEnd < 0) {
        chunkEnd = word.length();
      }

      if (chunkEnd > chunkStart) {
        String chunk = word.substring(chunkStart, chunkEnd);
        if (!spell(chunk)) {
          for (String chunkSug : suggest(chunk)) {
            String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
            if (spell(replaced)) {
              result.add(replaced);
            }
          }
        }
      }

      chunkStart = chunkEnd + 1;
    }
    return result;
  }

  private String cleanOutput(String s) {
    if (dictionary.oconv == null) return s;

    StringBuilder sb = new StringBuilder(s);
    dictionary.oconv.applyMappings(sb);
    return sb.toString();
  }
}