morfologik.speller.Speller Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of morfologik-speller Show documentation
Morfologik Speller
The newest version!
package morfologik.speller;

import static morfologik.fsa.MatchResult.EXACT_MATCH;
import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.*;

import morfologik.fsa.FSA;
import morfologik.fsa.ByteSequenceIterator;
import morfologik.fsa.FSATraversal;
import morfologik.fsa.MatchResult;
import morfologik.stemming.BufferUtils;
import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.DictionaryMetadata;
import morfologik.stemming.UnmappableInputException;

/**
 * Finds spelling suggestions. Implements K. Oflazer's algorithm as described
 * in: Oflazer, Kemal. 1996.
 * "Error-Tolerant Finite-State Recognition with Applications to Morphological Analysis and Spelling Correction."
 * Computational Linguistics 22 (1): 73–89.
 *
 * <p>
 * See also Jan Daciuk's <code>s_fsa</code> package.
 */
public class Speller {
  /**
   * Maximum length of the word to be checked.
   */
  public static final int MAX_WORD_LENGTH = 120;
  static final int FREQ_RANGES = 'Z' - 'A' + 1;
  static final int FIRST_RANGE_CODE = 'A'; // less frequent words

  //FIXME: this is an upper limit for replacement searches, we need
  //proper tree traversal instead of generation of all possible candidates
  static final int UPPER_SEARCH_LIMIT = 15;
  private static final int MIN_WORD_LENGTH = 4;
  private static final int MAX_RECURSION_LEVEL = 6;

  private final int editDistance;
  private int effectEditDistance; // effective edit distance

  private final HMatrix hMatrix;

  private char[] candidate; /* current replacement */
  private int candLen;
  private int wordLen; /* length of word being processed */
  private char[] wordProcessed; /* word being processed */

  private Map> replacementsAnyToOne = new HashMap<>();
  private Map> replacementsAnyToTwo = new HashMap<>();
  private Map> replacementsTheRest = new HashMap<>();

  private boolean containsSeparators = true;

  /**
   * Internal reusable buffer for encoding words into byte arrays using
   * {@link #encoder}.
   */
  private ByteBuffer byteBuffer = ByteBuffer.allocate(MAX_WORD_LENGTH);

  /**
   * Internal reusable buffer for encoding words into byte arrays using
   * {@link #encoder}.
   */
  private CharBuffer charBuffer = CharBuffer.allocate(MAX_WORD_LENGTH);

  /**
   * Reusable match result.
   */
  private final MatchResult matchResult = new MatchResult();

  /**
   * Features of the compiled dictionary.
   * 
   * @see DictionaryMetadata
   */
  private final DictionaryMetadata dictionaryMetadata;

  /**
   * Charset encoder for the FSA.
   */
  private final CharsetEncoder encoder;

  /**
   * Charset decoder for the FSA.
   */
  private final CharsetDecoder decoder;

  /** An FSA used for lookups. */
  private final FSATraversal matcher;

  /** FSA's root node. */
  private final int rootNode;

  /**
   * The FSA we are using.
   */
  private final FSA fsa;

  /** An iterator for walking along the final states of {@link #fsa}. */
  private final ByteSequenceIterator finalStatesIterator;

  /**
   * Creates a speller over the given dictionary with an edit distance of 1.
   *
   * @param dictionary the dictionary used for lookups and suggestions
   */
  public Speller(final Dictionary dictionary) {
    this(dictionary, 1);
  }

  public Speller(final Dictionary dictionary, final int editDistance) {
    this.editDistance = editDistance;
    this.hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH);

    this.dictionaryMetadata = dictionary.metadata;
    this.rootNode = dictionary.fsa.getRootNode();
    this.fsa = dictionary.fsa;
    this.matcher = new FSATraversal(fsa);
    this.finalStatesIterator = new ByteSequenceIterator(fsa, rootNode);

    if (rootNode == 0) {
      throw new IllegalArgumentException("Dictionary must have at least the root node.");
    }

    if (dictionaryMetadata == null) {
      throw new IllegalArgumentException("Dictionary metadata must not be null.");
    }

    encoder = dictionaryMetadata.getEncoder();
    decoder = dictionaryMetadata.getDecoder();

    // Multibyte separator will result in an exception here.
    dictionaryMetadata.getSeparatorAsChar();

    this.createReplacementsMaps();
  }

  private void createReplacementsMaps() {
    for (Map.Entry> entry : dictionaryMetadata.getReplacementPairs().entrySet()) {
      for (String s : entry.getValue()) {
        // replacements any to one
        // the new key is the target of the replacement pair
        if (s.length() == 1) {
          if (!replacementsAnyToOne.containsKey(s.charAt(0))) {
            List charList = new ArrayList<>();
            charList.add(entry.getKey().toCharArray());
            replacementsAnyToOne.put(s.charAt(0), charList);
          } else {
            replacementsAnyToOne.get(s.charAt(0)).add(entry.getKey().toCharArray());
          }
        }
        // replacements any to two
        // the new key is the target of the replacement pair
        else if (s.length() == 2) {
          if (!replacementsAnyToTwo.containsKey(s)) {
            List charList = new ArrayList<>();
            charList.add(entry.getKey().toCharArray());
            replacementsAnyToTwo.put(s, charList);
          } else {
            replacementsAnyToTwo.get(s).add(entry.getKey().toCharArray());
          }
        } else {
          if (!replacementsTheRest.containsKey(entry.getKey())) {
            List charList = new ArrayList<>();
            charList.add(s);
            replacementsTheRest.put(entry.getKey(), charList);
          } else {
            replacementsTheRest.get(entry.getKey()).add(s);
          }
        }
      }
    }
  }

  /**
   * Encodes a word into bytes using the same encoding as the FSA.
   *
   * @param word the characters to encode
   * @return the byte buffer returned by the encoding helper
   * @throws UnmappableInputException if the word cannot be represented in the
   *         dictionary's encoding
   */
  private ByteBuffer charSequenceToBytes(final CharSequence word) throws UnmappableInputException {
    final int length = word.length();
    charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, length);
    // Copy the whole sequence in one call, then switch the buffer to read mode.
    charBuffer.append(word);
    charBuffer.flip();

    return BufferUtils.charsToBytes(encoder, charBuffer, byteBuffer);
  }

  /**
   * Checks whether the word is misspelled, by performing a series of checks
   * according to properties of the dictionary.
   *
   * <ul>
   * <li>If the flag {@code fsa.dict.speller.ignore-punctuation} is set, then
   * all non-alphabetic characters are considered to be correctly spelled.</li>
   * <li>If the flag {@code fsa.dict.speller.ignore-numbers} is set, then all
   * words containing decimal digits are considered to be correctly spelled.</li>
   * <li>If the flag {@code fsa.dict.speller.ignore-camel-case} is set, then
   * all CamelCase words are considered to be correctly spelled.</li>
   * <li>If the flag {@code fsa.dict.speller.ignore-all-uppercase} is set, then
   * all alphabetic words composed of only uppercase characters are considered
   * to be correctly spelled.</li>
   * </ul>
   *
   * Otherwise, the word is checked in the dictionary. If that lookup fails and
   * the dictionary does not perform case conversions (flag
   * {@code fsa.dict.speller.convert-case}), the word is reported as misspelled.
   * With case conversions, a non-mixed-case word is accepted if its lowercase
   * version is in the dictionary, and an all-uppercase word is also accepted if
   * it is found with only the initial letter in uppercase.
   *
   * @param word
   *          - the word to be checked
   * @return true if the word is misspelled
   **/
  public boolean isMisspelled(final String word) {
    final String wordToCheck;
    if (dictionaryMetadata.getInputConversionPairs().isEmpty()) {
      wordToCheck = word;
    } else {
      wordToCheck = DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
    }

    // An empty word is never reported.
    if (wordToCheck.isEmpty()) {
      return false;
    }

    // Only single-character words can be classified as non-alphabetic here;
    // dictionaries usually do not contain punctuation.
    final boolean isAlphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0));

    if (dictionaryMetadata.isIgnoringPunctuation() && !isAlphabetic) {
      return false;
    }
    if (dictionaryMetadata.isIgnoringNumbers() && !containsNoDigit(wordToCheck)) {
      return false;
    }
    if (dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck)) {
      return false;
    }
    if (dictionaryMetadata.isIgnoringAllUppercase() && isAlphabetic && isAllUppercase(wordToCheck)) {
      return false;
    }
    if (isInDictionary(wordToCheck)) {
      return false;
    }

    // Not found as typed: without case conversion that is the final answer,
    // and mixed-case words are never case-converted.
    if (!dictionaryMetadata.isConvertingCase() || isMixedCase(wordToCheck)) {
      return true;
    }
    if (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale()))) {
      return false;
    }
    final boolean foundCapitalized = isAllUppercase(wordToCheck) && isInDictionary(initialUppercase(wordToCheck));
    return !foundCapitalized;
  }

  /**
   * Keeps the first character of the word as it is and lowercases the rest
   * using the dictionary's locale.
   *
   * @param wordToCheck a non-empty word
   * @return the word with everything after the first character in lowercase
   */
  private CharSequence initialUppercase(final String wordToCheck) {
    final String head = wordToCheck.substring(0, 1);
    final String tail = wordToCheck.substring(1).toLowerCase(dictionaryMetadata.getLocale());
    return head.concat(tail);
  }

  /**
   * Test whether the word is found in the dictionary.
   * 
   * @param word
   *          the word to be tested
   * @return True if it is found.
   */
  public boolean isInDictionary(final CharSequence word) {
    try {
      byteBuffer = charSequenceToBytes(word);
    } catch (UnmappableInputException e) {
      return false;
    }

    // Try to find a partial match in the dictionary.
    final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);

    // Make sure the word doesn't contain a separator if there is an exact match
    if (containsSeparators && match.kind == EXACT_MATCH) {
      containsSeparators = false;
      for (int i=0; i 0
        && fsa.getArc(match.node, dictionaryMetadata.getSeparator()) != 0;
  }

  /**
   * Get the frequency value for a word form. It is taken from the first entry
   * with this word form.
   *
   * @param word
   *          the word to be tested
   * @return frequency value in range: 0..FREQ_RANGE-1 (0: less frequent);
   *         0 is also returned when the dictionary carries no frequencies, the
   *         word cannot be encoded, or no entry is found.
   */
  public int getFrequency(final CharSequence word) {
    if (!dictionaryMetadata.isFrequencyIncluded()) {
      return 0;
    }

    final byte separator = dictionaryMetadata.getSeparator();
    try {
      byteBuffer = charSequenceToBytes(word);
    } catch (UnmappableInputException e) {
      return 0;
    }

    final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
    if (match.kind != SEQUENCE_IS_A_PREFIX) {
      return 0;
    }

    final int separatorArc = fsa.getArc(match.node, separator);
    if (separatorArc == 0 || fsa.isArcFinal(separatorArc)) {
      return 0;
    }

    finalStatesIterator.restartFrom(fsa.getEndNode(separatorArc));
    if (!finalStatesIterator.hasNext()) {
      return 0;
    }

    final ByteBuffer entry = finalStatesIterator.next();
    final byte[] entryBytes = entry.array();
    // the last byte contains the frequency after a separator
    return entryBytes[entry.remaining() - 1] - FIRST_RANGE_CODE;
  }

  /**
   * Propose suggestions for misspelled run-on words. This algorithm is inspired
   * by spell.cc in s_fsa package by Jan Daciuk.
   *
   * @param original
   *          The original misspelled word.
   * @return The list of suggested pairs, as CandidateData with space-concatenated strings.
   *         Empty if the word is in the dictionary or the dictionary does not
   *         support run-on words.
   */
  public List<CandidateData> replaceRunOnWordCandidates(final String original) {
    final List<CandidateData> candidates = new ArrayList<>();
    String wordToCheck = original;
    if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
      wordToCheck = DictionaryLookup.applyReplacements(original, dictionaryMetadata.getInputConversionPairs());
    }
    if (!isInDictionary(wordToCheck) && dictionaryMetadata.isSupportingRunOnWords()) {
      final Locale locale = dictionaryMetadata.getLocale();
      for (int i = 1; i < wordToCheck.length(); i++) {
        // chop from left to right
        final String prefix = wordToCheck.substring(0, i);
        final String suffix = wordToCheck.substring(i);
        final boolean suffixKnown = isInDictionary(suffix)
            // camel case words: e.g. GreatElephant
            || (!isNotCapitalizedWord(suffix) && isInDictionary(suffix.toLowerCase(locale)));
        if (!suffixKnown) {
          continue;
        }
        final boolean prefixKnown = isInDictionary(prefix)
            // a word that's uppercase just because used at sentence start
            || (Character.isUpperCase(prefix.charAt(0)) && isInDictionary(prefix.toLowerCase(locale)));
        if (prefixKnown) {
          addReplacement(candidates, prefix + " " + suffix);
        }
      }
    }
    return candidates;
  }

  /**
   * Propose suggestions for misspelled run-on words. This algorithm is inspired
   * by spell.cc in s_fsa package by Jan Daciuk.
   *
   * @param original
   *          The original misspelled word.
   * @return The list of suggested pairs, as space-concatenated strings.
   */
  public List<String> replaceRunOnWords(final String original) {
    final List<CandidateData> candidateData = replaceRunOnWordCandidates(original);
    final List<String> candidates = new ArrayList<>(candidateData.size());
    for (CandidateData candidate : candidateData) {
      candidates.add(candidate.getWord());
    }
    return candidates;
  }

  /**
   * Appends a replacement with distance 1 to the candidate list, applying the
   * dictionary's output conversion pairs when there are any.
   *
   * @param candidates the list to append to
   * @param replacement the suggested replacement text
   */
  private void addReplacement(List<CandidateData> candidates, String replacement) {
    if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
      candidates.add(new CandidateData(replacement, 1));
    } else {
      candidates.add(new CandidateData(DictionaryLookup.applyReplacements(replacement,
          dictionaryMetadata.getOutputConversionPairs()), 1));
    }
  }

  /**
   * Find similar words even if the original word is a correct word that exists in the dictionary
   * 
   * @param word The original word.
   * @return A list of suggested candidate replacements.
   */
  public ArrayList<CandidateData> findSimilarWordCandidates(String word) {
    return findReplacementCandidates(word, true);
  }
  
  /**
   * Find similar words even if the original word is a correct word that exists
   * in the dictionary. Same search as {@link #findSimilarWordCandidates}, but
   * only the suggested words are returned.
   *
   * @param word The original word.
   * @return A list of suggested replacements.
   */
  public ArrayList<String> findSimilarWords(String word) {
    final List<CandidateData> result = findSimilarWordCandidates(word);
    final ArrayList<String> resultSuggestions = new ArrayList<>(result.size());
    for (CandidateData cd : result) {
      resultSuggestions.add(cd.getWord());
    }
    return resultSuggestions;
  }
  
  
  /**
   * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa
   * package, spell.cc for further explanation.
   * 
   * @param word The original misspelled word.
   * @return A list of suggested replacements.
   */
  public ArrayList<String> findReplacements(String word) {
    final List<CandidateData> result = findReplacementCandidates(word);

    final ArrayList<String> resultSuggestions = new ArrayList<>(result.size());
    for (CandidateData cd : result) {
      resultSuggestions.add(cd.getWord());
    }
    return resultSuggestions;
  }

  
  /**
   * Find and return suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa
   * package, spell.cc for further explanation. This method is identical to
   * {@link #findReplacements}, but returns candidate terms with their edit distance scores.
   *
   * @param word The original misspelled word.
   * @return A list of suggested candidate replacements.
   */
  public ArrayList<CandidateData> findReplacementCandidates(String word) {
    return findReplacementCandidates(word, false);
  }
  
  /**
   * Collects replacement candidates for a word: first by applying the
   * dictionary's multi-character replacement pairs, then by an
   * edit-distance-bounded search of the automaton for each resulting variant.
   *
   * @param word the word to find replacements for (input conversion pairs are applied first)
   * @param evenIfWordInDictionary if {@code true}, search even when the word itself is in the dictionary
   * @return candidates sorted by their score, without duplicates and without the word itself
   */
  private ArrayList<CandidateData> findReplacementCandidates(String word, boolean evenIfWordInDictionary) {
    if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
      word = DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
    }

    // candidate strings, including some additional data such as edit distance from the original word.
    List<CandidateData> candidates = new ArrayList<>();

    if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && (!isInDictionary(word) || evenIfWordInDictionary)) {
      List<String> wordsToCheck = new ArrayList<>();
      if (replacementsTheRest != null && word.length() > 1) {
        for (final String wordChecked : getAllReplacements(word, 0, 0)) {
          if (isInDictionary(wordChecked)) {
            candidates.add(new CandidateData(wordChecked, 0));
          } else {
            String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale());
            String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale());
            if (isInDictionary(lowerWord)) {
              //add the word as it is in the dictionary, not mixed-case versions of it
              candidates.add(new CandidateData(lowerWord, 0));
            }
            if (isInDictionary(upperWord)) {
              candidates.add(new CandidateData(upperWord, 0));
            }
            if (lowerWord.length() > 1) {
              String firstUpperWord = Character.toUpperCase(lowerWord.charAt(0)) + lowerWord.substring(1);
              if (isInDictionary(firstUpperWord)) {
                candidates.add(new CandidateData(firstUpperWord, 0));
              }
            }
          }
          wordsToCheck.add(wordChecked);
        }
      } else {
        wordsToCheck.add(word);
      }

      // Even if a candidate was found with the replacement pairs (which are usual errors),
      // there might be more good candidates (see issue #94):
      int i = 1;
      for (final String wordChecked : wordsToCheck) {
        i++;
        if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply
          break;
        }
        wordProcessed = wordChecked.toCharArray();
        wordLen = wordProcessed.length;
        if (wordLen < MIN_WORD_LENGTH && i > 2) { // three-letter replacements make little sense anyway
          break;
        }
        candidate = new char[MAX_WORD_LENGTH];
        candLen = candidate.length;
        effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance;
        charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH);
        byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH);
        final byte[] prevBytes = new byte[0];
        findRepl(candidates, 0, fsa.getRootNode(), prevBytes, 0, 0);
      }
    }

    Collections.sort(candidates);

    // Apply replacements, prune duplicates while preserving the candidate order.
    final Set<String> words = new HashSet<>();
    final ArrayList<CandidateData> result = new ArrayList<>(candidates.size());
    for (final CandidateData cd : candidates) {
      String replaced = DictionaryLookup.applyReplacements(cd.getWord(), dictionaryMetadata.getOutputConversionPairs());
      // Add only the first occurrence of a given word.
      if (words.add(replaced) && !replaced.equals(word)) {
        result.add(new CandidateData(replaced, cd.origDistance));
      }
    }

    return result;
  }

  /**
   * Recursively walks the arcs leaving {@code node}, extending the current
   * {@link #candidate} one decoded character at a time and adding every
   * candidate that ends within {@link #effectEditDistance} of the processed word.
   *
   * @param candidates list receiving the accepted candidates
   * @param depth index into {@link #hMatrix} for the current step; it advances only on the general step
   * @param node the FSA node whose outgoing arcs are explored
   * @param prevBytes bytes collected from previous arcs that did not yet decode to a character
   * @param wordIndex current index into {@link #wordProcessed}
   * @param candIndex index in {@link #candidate} where the next decoded character is stored
   */
  private void findRepl(List candidates, final int depth, final int node, final byte[] prevBytes, final int wordIndex, final int candIndex) {
    int dist = 0;
    for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
      // Rebuild the byte sequence: pending bytes followed by this arc's label.
      byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1);
      byteBuffer.put(prevBytes);
      byteBuffer.put(fsa.getArcLabel(arc));
      final int bufPos = byteBuffer.position();
      byteBuffer.flip();
      decoder.reset();
      // FIXME: this isn't correct -- no checks for overflows, no decoder flush. I don't think this should be in here
      // too, the decoder should run once on accumulated temporary byte buffer (current path) only when there's
      // a potential that this buffer can become a replacement candidate (isEndOfCandidate). Because we assume candidates
      // are valid input strings (this is verified when building the dictionary), it'd save a lot of conversions.
      final CoderResult c = decoder.decode(byteBuffer, charBuffer, true);
      if (c.isMalformed()) { // assume that only valid
        // encodings are there, so a malformed result is treated as an
        // incomplete character: carry the bytes over to the next arc
        final byte[] prev = new byte[bufPos];
        byteBuffer.position(0);
        byteBuffer.get(prev);
        if (!fsa.isArcTerminal(arc)) {
          findRepl(candidates, depth, fsa.getEndNode(arc), prev, wordIndex, candIndex); // note: depth is not incremented
        }
        byteBuffer.clear();
      } else if (!c.isError()) { // unmappable characters are silently discarded
        charBuffer.flip();
        candidate[candIndex] = charBuffer.get();
        charBuffer.clear();
        byteBuffer.clear();

        int lengthReplacement;
        // replacement "any to two"
        if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth - 1, depth - 1)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2));
            }
            if (dist <= effectEditDistance) {
              candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist));
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            // Temporarily overwrite the matrix cell for the recursive call, then restore it.
            int x = hMatrix.get(depth, depth);
            hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1));
            findRepl(candidates, Math.max(0, depth), fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement - 1,
                candIndex + 1);
            hMatrix.set(depth, depth, x);
          }
        }
        //replacement "any to one"
        if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) {
          // the replacement takes place at the end of the candidate
          if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth, depth)) <= effectEditDistance) {
            if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) {
              // there are extra letters in the word after the replacement
              dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1));
            }
            if (dist <= effectEditDistance) {
              candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist));
            }
          }
          if (isArcNotTerminal(arc, candIndex)) {
            findRepl(candidates, depth, fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement, candIndex + 1);
          }
        }
        //general: continue only while the cut-off edit distance stays within the limit
        if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) {
          if ((isEndOfCandidate(arc, wordIndex))
              && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex)) <= effectEditDistance) {
            candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist));
          }
          if (isArcNotTerminal(arc, candIndex)) {
            findRepl(candidates, depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1);
          }
        }
      }
    }
  }

  /**
   * Tells whether the search may continue past the given arc: the arc must not
   * be terminal and, while separators are in play, the character just stored in
   * the candidate must not be the separator.
   *
   * @param arc the arc just followed
   * @param candIndex index of the last character stored in the candidate
   * @return {@code true} if the search may descend further
   */
  private boolean isArcNotTerminal(final int arc, final int candIndex) {
    if (fsa.isArcTerminal(arc)) {
      return false;
    }
    final boolean atSeparator = containsSeparators
        && candidate[candIndex] == dictionaryMetadata.getSeparatorAsChar();
    return !atSeparator;
  }

  /**
   * Tells whether the candidate may end at the given arc: the arc is final or
   * is followed by a separator, and the remaining length difference to the
   * processed word is within the effective edit distance.
   *
   * @param arc the arc just followed
   * @param wordIndex current index into the processed word
   * @return {@code true} if the candidate built so far may be a complete word
   */
  private boolean isEndOfCandidate(final int arc, final int wordIndex) {
    final boolean endsEntry = fsa.isArcFinal(arc) || isBeforeSeparator(arc);
    if (!endsEntry) {
      return false;
    }
    // candidate has proper length
    final int lengthGap = Math.abs(wordLen - 1 - wordIndex);
    return lengthGap <= effectEditDistance;
  }

  /**
   * Tells whether the node the arc leads to has a non-terminal outgoing arc
   * labelled with the dictionary separator. Always {@code false} once
   * separators are no longer considered.
   *
   * @param arc the arc just followed
   * @return {@code true} if a separator arc follows
   */
  private boolean isBeforeSeparator(final int arc) {
    if (!containsSeparators) {
      return false;
    }
    final int separatorArc = fsa.getArc(fsa.getEndNode(arc), dictionaryMetadata.getSeparator());
    return separatorArc != 0 && !fsa.isArcTerminal(separatorArc);
  }

  /**
   * Calculates edit distance and stores it in the distance matrix at
   * {@code (i + 1, j + 1)}.
   *
   * @param i length of first word (here: misspelled) - 1;
   * @param j length of second word (here: candidate) - 1.
   * @param wordIndex index of the compared character in the processed word
   * @param candIndex index of the compared character in the candidate
   * @return Edit distance between the two words. Remarks: See Oflazer.
   */
  public int ed(final int i, final int j, final int wordIndex, final int candIndex) {
    final int distance;

    if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) {
      // last characters are the same
      distance = hMatrix.get(i, j);
    } else {
      // last two characters are transposed, e.g. ababab, ababba
      final boolean transposed = wordIndex > 0 && candIndex > 0
          && wordProcessed[wordIndex] == candidate[candIndex - 1]
          && wordProcessed[wordIndex - 1] == candidate[candIndex];
      // transposition looks two cells back; otherwise plain replacement, e.g. ababa, ababb
      final int diagonal = transposed ? hMatrix.get(i - 1, j - 1) : hMatrix.get(i, j);
      final int deletion = hMatrix.get(i + 1, j); // e.g. abab, aba
      final int insertion = hMatrix.get(i, j + 1); // e.g. aba, abab
      distance = 1 + min(diagonal, deletion, insertion);
    }

    hMatrix.set(i + 1, j + 1, distance);
    return distance;
  }

  // by Jaume Ortola
  /**
   * Compares two characters, treating them as equal when they are identical,
   * when the dictionary lists {@code y} as an equivalent of {@code x}, or, if
   * the dictionary ignores diacritics, when the first characters of their NFD
   * forms match (optionally ignoring case when the dictionary converts case).
   */
  private boolean areEqual(final char x, final char y) {
    if (x == y) {
      return true;
    }
    if (dictionaryMetadata.getEquivalentChars() != null) {
      final List<Character> equivalents = dictionaryMetadata.getEquivalentChars().get(x);
      if (equivalents != null && equivalents.contains(y)) {
        return true;
      }
    }
    if (!dictionaryMetadata.isIgnoringDiacritics()) {
      return false;
    }

    final char baseX = Normalizer.normalize(Character.toString(x), Form.NFD).charAt(0);
    final char baseY = Normalizer.normalize(Character.toString(y), Form.NFD).charAt(0);
    if (baseX == baseY) { // avoid case conversion, if possible
      return true;
    }
    // Case conversion only when needed -- single characters are compared, so
    // the cheaper Character methods are enough.
    return dictionaryMetadata.isConvertingCase()
        && Character.isLetter(baseX)
        && Character.isLowerCase(baseX) != Character.isLowerCase(baseY)
        && Character.toLowerCase(baseX) == Character.toLowerCase(baseY);
  }

  /**
   * Calculates cut-off edit distance: the smallest edit distance between the
   * current candidate and the prefixes of the processed word whose lengths lie
   * within the effective edit distance of {@code depth}.
   *
   * @param depth current length of candidates.
   * @param wordIndex current index into the processed word
   * @param candIndex index of the last character of the candidate
   * @return Cut-off edit distance. Remarks: See Oflazer.
   */
  public int cuted(final int depth, final int wordIndex, final int candIndex) {
    // min chars from word to consider - 1
    final int lower = Math.max(0, depth - effectEditDistance);
    // max chars from word to consider - 1
    final int upper = Math.min(wordLen - 1 - (wordIndex - depth), depth + effectEditDistance);

    int best = effectEditDistance + 1; // what is to be computed
    int wordPos = wordIndex + lower - depth;
    for (int row = lower; row <= upper; row++) {
      best = Math.min(best, ed(row, depth, wordPos, candIndex));
      wordPos++;
    }
    return best;
  }

  /**
   * Matches the last letter of the candidate against the replacement sources
   * registered for it, looking for one that occurs in the processed word at
   * {@code wordIndex}.
   *
   * @return the length of the first matching source, or 0 if none matches
   */
  private int matchAnyToOne(final int wordIndex, final int candIndex) {
    final List<char[]> sources = replacementsAnyToOne.get(candidate[candIndex]);
    if (sources == null) {
      return 0;
    }
    for (final char[] source : sources) {
      int matched = 0;
      while (matched < source.length
          && wordIndex + matched < wordLen
          && source[matched] == wordProcessed[wordIndex + matched]) {
        matched++;
      }
      if (matched == source.length) {
        return matched;
      }
    }
    return 0;
  }

  /**
   * Matches the last two letters of the candidate against the replacement
   * sources registered for that pair, looking for one that occurs in the
   * processed word starting at {@code wordIndex - 1}.
   *
   * @return the length of the first matching source, or 0 if none matches or
   *         the replacement would be unnecessary
   */
  private int matchAnyToTwo(final int wordIndex, final int candIndex) {
    if (candIndex <= 0 || candIndex >= candidate.length || wordIndex <= 0) {
      return 0;
    }
    final String lastTwo = new String(new char[] { candidate[candIndex - 1], candidate[candIndex] });
    final List<char[]> sources = replacementsAnyToTwo.get(lastTwo);
    if (sources == null) {
      return 0;
    }

    // The word already carries exactly these two letters at this position.
    final boolean sameAsWord = wordIndex < wordLen
        && candidate[candIndex - 1] == wordProcessed[wordIndex - 1]
        && candidate[candIndex] == wordProcessed[wordIndex];
    final int start = wordIndex - 1;

    for (final char[] source : sources) {
      if (source.length == 2 && sameAsWord) {
        return 0; //unnecessary replacements
      }
      int matched = 0;
      while (matched < source.length
          && start + matched < wordLen
          && source[matched] == wordProcessed[start + matched]) {
        matched++;
      }
      if (matched == source.length) {
        return matched;
      }
    }
    return 0;
  }

  /** Returns the smallest of the three values. */
  private static int min(final int a, final int b, final int c) {
    int smallest = a;
    if (b < smallest) {
      smallest = b;
    }
    if (c < smallest) {
      smallest = c;
    }
    return smallest;
  }

  /**
   * Tells whether a code point belongs to one of the letter categories
   * (uppercase, lowercase, titlecase, modifier, other) or is a letter number.
   * Hand-written variant of Character.isAlphabetic() kept for old runtimes.
   * 
   * @param codePoint
   *          The input character.
   * @return True if the character is a Unicode alphabetic character.
   */
  static boolean isAlphabetic(final int codePoint) {
    switch (Character.getType(codePoint)) {
      case Character.UPPERCASE_LETTER:
      case Character.LOWERCASE_LETTER:
      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
      case Character.LETTER_NUMBER:
        return true;
      default:
        return false;
    }
  }

  /**
   * Checks that a string contains no digit. Used for ignoring words with
   * numbers.
   * 
   * @param s
   *          Word to be checked.
   * @return True if there is no digit inside the word (including when the
   *         word is empty), false as soon as one digit is found.
   */
  static boolean containsNoDigit(final String s) {
    for (int k = 0; k < s.length(); k++) {
      if (Character.isDigit(s.charAt(k))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if str contains no lowercase letter, i.e. it is made up of
   * all-uppercase characters (ignoring characters for which no
   * upper-/lowercase distinction exists).
   */
  boolean isAllUppercase(final String str) {
    for (final char ch : str.toCharArray()) {
      if (Character.isLetter(ch) && Character.isLowerCase(ch)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if str contains at least one letter that is not lowercase
   * (characters that are not letters are ignored). Returns false for strings
   * whose letters are all lowercase, including the empty string.
   */
  boolean isNotAllLowercase(final String str) {
    for (int i = 0; i < str.length(); i++) {
      final char c = str.charAt(i);
      if (Character.isLetter(c) && !Character.isLowerCase(c)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tells whether a string is anything other than a capitalized word, i.e. an
   * uppercase first character followed only by lowercase letters (non-letters
   * are ignored).
   *
   * @param str
   *          input string
   * @return false for a capitalized word; true otherwise, including for empty
   *         or null input
   */
  boolean isNotCapitalizedWord(final String str) {
    if (!isNotEmpty(str) || !Character.isUpperCase(str.charAt(0))) {
      return true;
    }
    for (int pos = 1; pos < str.length(); pos++) {
      final char ch = str.charAt(pos);
      if (Character.isLetter(ch) && !Character.isLowerCase(ch)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Helper method to replace calls to !"".equals().
   * 
   * @param str
   *          String to check
   * @return true if string is neither null nor empty
   */
  static boolean isNotEmpty(final String str) {
    return str != null && str.length() != 0;
  }

  /**
   * A string is mixed case when it is neither all uppercase, nor a capitalized
   * word, nor all lowercase.
   *
   * @param str
   *          input str
   * @return Returns true if str is MixedCase.
   */
  boolean isMixedCase(final String str) {
    if (isAllUppercase(str)) {
      return false;
    }
    if (!isNotCapitalizedWord(str)) {
      return false;
    }
    return isNotAllLowercase(str);
  }

  /**
   * @param str The string to check.
   * @return Returns true if str is CamelCase: it starts with an uppercase
   *         character followed by a lowercase one and has another
   *         non-lowercase letter later on. Note that German compounds with a dash
   *         (like "Waschmaschinen-Test") are also considered camel case by this method.
   */
  public boolean isCamelCase(final String str) {
    if (!isNotEmpty(str) || isAllUppercase(str) || !isNotCapitalizedWord(str)) {
      return false;
    }
    if (!Character.isUpperCase(str.charAt(0))) {
      return false;
    }
    if (str.length() > 1 && !Character.isLowerCase(str.charAt(1))) {
      return false;
    }
    return isNotAllLowercase(str);
  }

  /**
   * Used to determine whether the dictionary supports case conversions.
   * 
   * @return the value of the dictionary metadata's case-conversion setting
   * @since 1.9
   */
  public boolean convertsCase() {
    return dictionaryMetadata.isConvertingCase();
  }

  /**
   * Generates variants of a string by applying the dictionary's replacement
   * pairs, branching at each position into a variant without the replacement
   * and one variant per replacement.
   *
   * @param str
   *          The string to find the replacements for.
   * @param fromIndex
   *          The index from which replacements are found.
   * @param level
   *          The recursion level. The search stops if level is &gt; MAX_RECURSION_LEVEL.
   * @return A list of all possible replacements of the given string {@code str}
   */
  public List<String> getAllReplacements(final String str, final int fromIndex, final int level) {
    List<String> replaced = new ArrayList<>();
    if (level > MAX_RECURSION_LEVEL) { // Stop searching at some point
      replaced.add(str);
      return replaced;
    }
    StringBuilder sb = new StringBuilder();
    sb.append(str);
    int index = MAX_WORD_LENGTH;
    String key = "";
    int keyLength = 0;
    boolean found = false;
    // find first possible replacement after fromIndex position
    for (final String auxKey : replacementsTheRest.keySet()) {
      int auxIndex = sb.indexOf(auxKey, fromIndex);
      if (auxIndex > -1 && (auxIndex < index || (auxIndex == index && !(auxKey.length() < keyLength)))) { //select the longest possible key
        index = auxIndex;
        key = auxKey;
        keyLength = auxKey.length();
      }
    }
    if (index < MAX_WORD_LENGTH) {
      for (final String rep : replacementsTheRest.get(key)) {
        // start a branch without replacement (only once per key)
        if (!found) {
          replaced.addAll(getAllReplacements(str, index + key.length(), level + 1));
          found = true;
        }
        // avoid unnecessary replacements (ex. don't replace L by L·L when L·L already present)
        int ind = sb.indexOf(rep, fromIndex - rep.length() + 1);
        if (rep.length() > key.length() && ind > -1 && (ind == index || ind == index - rep.length() + 1)) {
          continue;
        }
        // start a branch with replacement
        sb.replace(index, index + key.length(), rep);
        replaced.addAll(getAllReplacements(sb.toString(), index + rep.length(), level + 1));
        // restore the working copy for the next replacement
        sb.setLength(0);
        sb.append(str);
      }
    }
    if (!found) {
      replaced.add(sb.toString());
    }
    return replaced;
  }

  /**
   * Installs the processed word and the candidate and recomputes the effective
   * edit distance. Used only to test the edit distance in JUnit tests.
   * 
   * @param word
   *          the first word
   * @param candidate
   *          the second word used for edit distance calculation
   */
  void setWordAndCandidate(final String word, final String candidate) {
    final char[] wordChars = word.toCharArray();
    this.wordProcessed = wordChars;
    this.wordLen = wordChars.length;

    final char[] candidateChars = candidate.toCharArray();
    this.candidate = candidateChars;
    this.candLen = candidateChars.length;

    // Words not longer than the edit distance get a reduced limit.
    if (wordLen <= editDistance) {
      effectEditDistance = wordLen - 1;
    } else {
      effectEditDistance = editDistance;
    }
  }

  /**
   * @return the length of the word currently being processed
   */
  public final int getWordLen() {
    return wordLen;
  }

  /**
   * @return the length of the current candidate array
   */
  public final int getCandLen() {
    return candLen;
  }

  /**
   * @return the effective edit distance computed for the word being processed
   */
  public final int getEffectiveED() {
    return effectEditDistance;
  }

  /**
   * Used to sort candidates according to edit distance, and possibly according
   * to their frequency in the future.
   *
   * <p>
   * The sort key combines the edit distance with the word's frequency class so
   * that, within the same edit distance, more frequent words sort first. Note
   * that the ordering looks only at this key, not at the word.
   */
  public final class CandidateData implements Comparable<CandidateData> {
    private final String word;

    /** Edit distance as passed to the constructor. */
    private final int origDistance;

    /** Sort key: edit distance scaled by the number of frequency classes, adjusted by frequency. */
    private final int distance;

    CandidateData(final String word, final int distance) {
      this.word = word;
      this.origDistance = distance;
      this.distance = distance * FREQ_RANGES + FREQ_RANGES - getFrequency(word) - 1;
    }

    /**
     * @return the candidate word
     */
    public final String getWord() {
      return word;
    }

    /**
     * @return the combined distance/frequency sort key (not the plain edit distance)
     */
    public final int getDistance() {
      return distance;
    }

    @Override
    public int compareTo(final CandidateData cd) {
      return Integer.compare(this.distance, cd.getDistance());
    }

    @Override
    public String toString() {
      return word + '/' + distance;
    }
  }
}