com.swabunga.spell.engine.SpellDictionaryASpell Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jazzy Show documentation
This is a fork of the jazzy dictionary
The newest version!
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
/* Created by bgalbs on Jan 30, 2003 at 11:45:25 PM */
package com.swabunga.spell.engine;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
 * Container for various methods that any SpellDictionary will use.
 * This class is based on the original Jazzy aspell port.
 * 
 * Derived classes will need words list files as spell checking reference. Words
 * list file is a dictionary with one word per line. There are many open source
 * dictionary files, see: 
 * http://wordlist.sourceforge.net/
 * 

 * You can choose words lists form aspell many
 * differents languages dictionaries. To grab some, install aspell
 * and the dictionaries you require. Then run aspell specifying the name of the
 * dictionary and the words list file to dump it into, for example:
 * 
 * 
 * aspell --master=fr-40 dump master > fr-40.txt
 * 
 * 
 * Note: the number following the language is the size indicator. A bigger
 * number gives a more extensive language coverage. Size 40 is more than
 * adequate for many usages.
 * 
 * For some languages, Aspell can also supply you with the phonetic file. On
 * Windows, go into aspell data directory and copy the phonetic
 * file corresponding to your language, for example the
 * fr_phonet.dat for the fr language. The phonetic
 * file should be in directory /usr/share/aspell on Unix.
 * 
 * @see GenericTransformator GenericTransformator for information on phonetic
 *      files.
 */
public abstract class SpellDictionaryASpell implements SpellDictionary {

    /**
     * The reference to a Transformator, used to transform a word into it's
     * phonetic code.
     */
    protected Transformator tf;

    /**
     * Constructs a new SpellDictionaryASpell
     * 
     * @param phonetic The file to use for phonetic transformation of the words
     *            list. If phonetic is null, the the transformation
     *            uses {@link DoubleMeta} transformation.
     * @throws java.io.IOException indicates problems reading the phonetic
     *             information
     */
    public SpellDictionaryASpell(File phonetic) throws IOException {
        if (phonetic == null)
            tf = new DoubleMeta();
        else
            tf = new GenericTransformator(phonetic);
    }

    /**
     * Constructs a new SpellDictionaryASpell
     * 
     * @param phonetic The file to use for phonetic transformation of the words
     *            list. If phonetic is null, the the transformation
     *            uses {@link DoubleMeta} transformation.
     * @param encoding Uses the character set encoding specified
     * @throws java.io.IOException indicates problems reading the phonetic
     *             information
     */
    public SpellDictionaryASpell(File phonetic, String encoding)
            throws IOException {
        if (phonetic == null)
            tf = new DoubleMeta();
        else
            tf = new GenericTransformator(phonetic, encoding);
    }

    /**
     * Constructs a new SpellDictionaryASpell
     * 
     * @param phonetic The Reader to use for phonetic transformation of the
     *            words list. If phonetic is null, the the
     *            transformation uses {@link DoubleMeta} transformation.
     * @throws java.io.IOException indicates problems reading the phonetic
     *             information
     */
    public SpellDictionaryASpell(Reader phonetic) throws IOException {
        if (phonetic == null)
            tf = new DoubleMeta();
        else
            tf = new GenericTransformator(phonetic);
    }

    /**
     * Returns a list of Word objects that are the suggestions to an incorrect
     * word.
     * 

     * This method is only needed to provide backward compatibility.
     * 
     * @see #getSuggestions(String, int, int[][])
     * @param word Suggestions for given misspelt word
     * @param threshold The lower boundary of similarity to misspelt word
     * @return Vector a List of suggestions
     */
    public List getSuggestions(String word, int threshold) {

        return getSuggestions(word, threshold, null);

    }

    /**
     * Returns a list of Word objects that are the suggestions to an incorrect
     * word.
     * 

     * 
     * @param word Suggestions for given misspelt word
     * @param threshold The lower boundary of similarity to misspelt word
     * @param matrix Two dimensional int array used to calculate edit distance.
     *            Allocating this memory outside of the function will greatly
     *            improve efficiency.
     * @return Vector a List of suggestions
     */
    public List getSuggestions(String word, int threshold, int[][] matrix) {

        int i;
        int j;

        if (matrix == null)
            matrix = new int[0][0];

        Map nearmisscodes = new HashMap();
        String code = getCode(word);

        // add all words that have the same phonetics
        nearmisscodes.put(code, code);
        List phoneticList = getWordsFromCode(word, nearmisscodes);

        // do some tranformations to pick up more results
        // interchange
        nearmisscodes = new HashMap();
        char[] charArray = word.toCharArray();
        char a;
        char b;

        for (i = 0; i < word.length() - 1; i++) {
            a = charArray[i];
            b = charArray[i + 1];
            charArray[i] = b;
            charArray[i + 1] = a;
            String s = getCode(new String(charArray));
            nearmisscodes.put(s, s);
            charArray[i] = a;
            charArray[i + 1] = b;
        }

        char[] replacelist = tf.getReplaceList();

        // change
        charArray = word.toCharArray();
        char original;
        for (i = 0; i < word.length(); i++) {
            original = charArray[i];
            for (j = 0; j < replacelist.length; j++) {
                charArray[i] = replacelist[j];
                String s = getCode(new String(charArray));
                nearmisscodes.put(s, s);
            }
            charArray[i] = original;
        }

        // add
        charArray = (word += " ").toCharArray();
        int iy = charArray.length - 1;
        while (true) {
            for (j = 0; j < replacelist.length; j++) {
                charArray[iy] = replacelist[j];
                String s = getCode(new String(charArray));
                nearmisscodes.put(s, s);
            }
            if (iy == 0)
                break;
            charArray[iy] = charArray[iy - 1];
            --iy;
        }

        // delete
        word = word.trim();
        charArray = word.toCharArray();
        char[] charArray2 = new char[charArray.length - 1];
        for (int ix = 0; ix < charArray2.length; ix++) {
            charArray2[ix] = charArray[ix];
        }

        a = charArray[charArray.length - 1];
        int ii = charArray2.length;
        while (true) {
            String s = getCode(new String(charArray));
            nearmisscodes.put(s, s);
            if (ii == 0)
                break;
            b = a;
            a = charArray2[ii - 1];
            charArray2[ii - 1] = b;
            --ii;
        }

        nearmisscodes.remove(code); // already accounted for in phoneticList

        List wordlist = getWordsFromCode(word, nearmisscodes);

        if (wordlist.size() == 0 && phoneticList.size() == 0)
            addBestGuess(word, phoneticList, matrix);

        // We sort a Vector at the end instead of maintaining a
        // continously sorted TreeSet because everytime you add a collection
        // to a treeset it has to be resorted. It's better to do this operation
        // once at the end.

        Collections.sort(phoneticList, new Word()); // always sort phonetic
                                                    // matches along the top
        Collections.sort(wordlist, new Word()); // the non-phonetic matches can
                                                // be listed below

        phoneticList.addAll(wordlist);
        return phoneticList;
    }

    /**
     * When we don't come up with any suggestions (probably because the
     * threshold was too strict), then pick the best guesses from the those
     * words that have the same phonetic code.
     * 
     * This method is only needed to provide backward compatibility.
     * 
     * @see addBestGuess(String word, Vector wordList, int[][] matrix)
     * @param word - the word we are trying spell correct
     * @param wordList - the linked list that will get the best guess
     */
//    private void addBestGuess(String word, List wordList) {
//        addBestGuess(word, wordList, null);
//    }

    /**
     * When we don't come up with any suggestions (probably because the
     * threshold was too strict), then pick the best guesses from the those
     * words that have the same phonetic code.
     * 
     * @param word - the word we are trying spell correct
     * @param Two dimensional array of int used to calculate edit distance.
     *            Allocating this memory outside of the function will greatly
     *            improve efficiency.
     * @param wordList - the linked list that will get the best guess
     */
    private void addBestGuess(String word, List wordList, int[][] matrix) {
        if (matrix == null)
            matrix = new int[0][0];

        if (wordList.size() != 0)
            throw new InvalidParameterException(
                    "the wordList vector must be empty");

        int bestScore = Integer.MAX_VALUE;

        String code = getCode(word);
        List simwordlist = getWords(code);

        List candidates = new LinkedList();

        for (Iterator j = simwordlist.iterator(); j.hasNext();) {
            String similar = j.next();
            int distance = EditDistance.getDistance(word, similar, matrix);
            if (distance <= bestScore) {
                bestScore = distance;
                Word goodGuess = new Word(similar, distance);
                candidates.add(goodGuess);
            }
        }

        // now, only pull out the guesses that had the best score
        for (Iterator iter = candidates.iterator(); iter.hasNext();) {
            Word candidate = iter.next();
            if (candidate.getCost() == bestScore)
                wordList.add(candidate);
        }

    }

    private List getWordsFromCode(String word, Map codes) {
        Configuration config = Configuration.getConfiguration();
        List result = new ArrayList();
        int[][] matrix = new int[0][0];
        final int configDistance = config
                .getInteger(Configuration.SPELL_THRESHOLD);

        Iterator> it = codes.entrySet().iterator();
        while (it.hasNext()) {
            Entry entry = it.next();
            String code = entry.getKey();
            List simwordlist = getWords(code);
            for (Iterator iter = simwordlist.iterator(); iter.hasNext();) {
                String similar = iter.next();
                int distance = EditDistance.getDistance(word, similar, matrix);
                if (distance < configDistance) {
                    Word w = new Word(similar, distance);
                    result.add(w);
                }
            }
        }
        return result;
    }

    /**
     * Returns the phonetic code representing the word.
     * 
     * @param word The word we want the phonetic code.
     * @return The value of the phonetic code for the word.
     */
    public String getCode(String word) {
        return tf.transform(word);
    }

    /**
     * Returns a list of words that have the same phonetic code.
     * 
     * @param phoneticCode The phonetic code common to the list of words
     * @return A list of words having the same phonetic code
     */
    protected abstract List getWords(String phoneticCode);

    /**
     * Returns true if the word is correctly spelled against the current word
     * list.
     */
    public boolean isCorrect(String word) {
        List possible = getWords(getCode(word));
        if (possible.contains(word))
            return true;
        // JMH should we always try the lowercase version. If I dont then
        // capitalised
        // words are always returned as incorrect.
        else if (possible.contains(word.toLowerCase()))
            return true;
        return false;
    }
}