com.swabunga.spell.engine.GenericTransformator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jazzy Show documentation
This is a fork of the jazzy dictionary
The newest version!
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package com.swabunga.spell.engine;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.swabunga.spell.util.StringUtility;

/**
 * A generic implementation of a transformator that takes an aspell phonetics
 * file and constructs a transformation table using the inner class
 * TransformationRule. Each transformation rule represents one line of the
 * phonetic file. A line contains two groups of characters separated by
 * white space. The first group is the match expression, which describes the
 * letters to associate with a syllable. The second group is the replacement
 * expression, giving the phonetic equivalent of the match expression.
 * 
 * @see SpellDictionaryASpell SpellDictionaryASpell for information on getting
 *      phonetic files for aspell.
 * 
 * @author Robert Gustavsson ([email protected])
 */
public class GenericTransformator implements Transformator {

    /**
     * Alphabet used as the initial value of the replace list. It stays in
     * effect unless the phonetic file defines its own alphabet with the
     * {@link GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET} keyword.
     */
    private static final char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D',
            'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
            'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

    /**
     * The alphabet start marker.
     * 
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_START = '[';
    /**
     * The alphabet end marker.
     * 
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_END = ']';
    /**
     * Phonetic file keyword indicating that a different alphabet is used for
     * this language. The keyword must be followed by an
     * {@link GenericTransformator#ALPHABET_START ALPHABET_START} marker, a list
     * of characters defining the alphabet and an
     * {@link GenericTransformator#ALPHABET_END ALPHABET_END} marker.
     */
    public static final String KEYWORD_ALPHBET = "alphabet";
    /**
     * Phonetic file lines starting with one of these keywords are skipped.
     * The keywords are: version, followup, collapse_result. Comments, which
     * start with '#' and run to the end of the line, are skipped as well.
     */
    public static final String[] IGNORED_KEYWORDS = { "version", "followup",
            "collapse_result" };

    /**
     * Starts a group of characters which can be appended to the match
     * expression of the phonetic file.
     */
    public static final char STARTMULTI = '(';
    /**
     * Ends a group of characters which can be appended to the match
     * expression of the phonetic file.
     */
    public static final char ENDMULTI = ')';
    /**
     * During phonetic transformation of a word each numeric character is
     * replaced by this DIGITCODE.
     */
    public static final String DIGITCODE = "0";
    /**
     * Phonetic file character code indicating that the replace expression is
     * empty.
     */
    public static final String REPLACEVOID = "_";

    // Rules parsed from the phonetic file, in the order they were read.
    // Stays null until buildRules has run.
    private TransformationRule[] ruleArray = null;
    // Alphabet handed out by getReplaceList(). Starts as the default English
    // alphabet, is replaced when an alphabet line is read, and is overwritten
    // by the constructors with the washed alphabet.
    private char[] alphabetString = defaultEnglishAlphabet;

    /**
     * Constructs a transformation table from the phonetic file. The file is
     * read with the platform default character set and is closed before the
     * constructor returns, whether or not reading succeeds.
     * 
     * @param phonetic the phonetic file as specified in aspell
     * @throws java.io.IOException indicates a problem while reading the
     *             phonetic file
     */
    public GenericTransformator(File phonetic) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(phonetic));
        try {
            buildRules(in);
        } finally {
            // This constructor opened the file, so it is responsible for
            // releasing the file handle.
            in.close();
        }
        // The rules must be loaded before washing, because washing runs
        // every letter of the alphabet through transform().
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Constructs a transformation table from the phonetic file, decoding it
     * with the given character set. The file is closed before the constructor
     * returns, whether or not reading succeeds.
     * 
     * @param phonetic the phonetic file as specified in aspell
     * @param encoding the name of the character set the file is written in
     * @throws java.io.IOException indicates a problem while reading the
     *             phonetic file, including an unsupported encoding name
     */
    public GenericTransformator(File phonetic, String encoding)
            throws IOException {
        FileInputStream stream = new FileInputStream(phonetic);
        try {
            // The reader is created inside the try block so that the stream
            // is released even when the encoding name is rejected.
            buildRules(new BufferedReader(new InputStreamReader(stream,
                    encoding)));
        } finally {
            stream.close();
        }
        // The rules must be loaded before washing, because washing runs
        // every letter of the alphabet through transform().
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Construct a transformation table from the phonetic file
     * 
     * @param phonetic the phonetic file as specified in aspell. The file is
     *            supplied as a reader.
     * @throws java.io.IOException indicates a problem while reading the
     *             phonetic information
     */
    public GenericTransformator(Reader phonetic) throws IOException {
        buildRules(new BufferedReader(phonetic));
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Reduces an alphabet to one letter per phonetic code. Every letter is
     * run through {@link #transform(String)}; for letters that produce the
     * same code only the first one encountered is kept.
     * 
     * This is done to improve speed in the getSuggestion method.
     * 
     * @param alphabet The complete alphabet to wash.
     * @return The washed alphabet to be used as replace list. The letters come
     *         back in the iteration order of the backing hash map, not in
     *         alphabet order.
     */
    private char[] washAlphabetIntoReplaceList(char[] alphabet) {
        // Phonetic code -> first letter of the alphabet that produced it.
        Map<String, Character> firstLetterByCode = new HashMap<String, Character>(
                alphabet.length);

        for (int i = 0; i < alphabet.length; i++) {
            String code = transform(String.valueOf(alphabet[i]));
            if (!firstLetterByCode.containsKey(code)) {
                firstLetterByCode.put(code, Character.valueOf(alphabet[i]));
            }
        }

        char[] washed = new char[firstLetterByCode.size()];
        int next = 0;
        for (Character letter : firstLetterByCode.values()) {
            washed[next++] = letter.charValue();
        }
        return washed;
    }

    /**
     * Collects the replacement of every rule whose replace expression is
     * exactly one character long. The resulting array can later be used for
     * adding or changing letters in getSuggestion().
     * 
     * @return char[] the single-character replacements in rule order
     *         (duplicates are kept), or null when no rules have been loaded
     */
    public char[] getCodeReplaceList() {
        if (ruleArray == null) {
            return null;
        }

        StringBuilder singles = new StringBuilder();
        for (TransformationRule rule : ruleArray) {
            if (rule.getReplaceExp().length() == 1) {
                singles.append(rule.getReplaceExp().charAt(0));
            }
        }
        return singles.toString().toCharArray();
    }

    /**
     * Returns the alphabet currently held by this transformator.
     * <p>
     * The field starts out as the default English alphabet, is replaced when
     * the phonetic file contains an alphabet line, and is overwritten by the
     * constructors with the washed version. The array returned is the
     * internal one, not a copy.
     * 
     * @return char[] An array of chars representing the alphabet.
     *         NOTE(review): earlier documentation mentioned a null return
     *         when no alphabet was available; no assignment of null to the
     *         field is visible in this part of the file - confirm.
     */
    public char[] getReplaceList() {
        return alphabetString;
    }

    /**
     * Builds the phonetic code of the word. The word is upper-cased and then
     * scanned from left to right: a digit is replaced by {@link #DIGITCODE},
     * any other position is rewritten by the first rule in the table that
     * matches there. When no rule matches the character is left as it is.
     * 
     * @param word the word to transform
     * @return the phonetic transformation of the word, or null when no rules
     *         have been loaded
     */
    public String transform(String word) {
        if (ruleArray == null) {
            return null;
        }

        final StringBuilder buffer = new StringBuilder(word.toUpperCase());
        // Tracked by hand because replacements change the buffer size.
        int length = buffer.length();
        int pos = 0;

        while (pos < length) {
            if (Character.isDigit(buffer.charAt(pos))) {
                StringUtility.replace(buffer, pos, pos + DIGITCODE.length(),
                        DIGITCODE);
                pos++;
                continue;
            }

            // How far to move on; one character unless a rule says otherwise.
            int advance = 1;
            for (TransformationRule rule : ruleArray) {
                // Skip rules flagged by startsWithExp() unless at position 0
                // (presumably rules written with '^' - confirm), and rules
                // whose match would run past the end of the word.
                if ((rule.startsWithExp() && pos > 0)
                        || pos + rule.lengthOfMatch() > length) {
                    continue;
                }
                if (!rule.isMatching(buffer, pos)) {
                    continue;
                }

                String replacement = rule.getReplaceExp();
                advance = replacement.length();
                StringUtility.replace(buffer, pos, pos + rule.getTakeOut(),
                        replacement);
                length = length - rule.getTakeOut() + advance;
                // Only the first matching rule is applied at a position.
                break;
            }
            pos += advance;
        }
        return buffer.toString();
    }

    /**
     * Builds the transformation table. Every line of the reader is stripped
     * of comments and surrounding white space and handed to buildRule; the
     * rules collected that way replace the current rule array. The reader is
     * read to the end but not closed here.
     * 
     * @param in the phonetic information to parse
     * @throws IOException indicates a problem while reading a line
     */
    private void buildRules(BufferedReader in) throws IOException {
        // The list is typed so that toArray yields a TransformationRule[];
        // with a raw List the call returns Object[] and cannot be assigned.
        List<TransformationRule> ruleList = new ArrayList<TransformationRule>();
        String line;
        while ((line = in.readLine()) != null) {
            buildRule(realTrimmer(line), ruleList);
        }
        ruleArray = ruleList.toArray(new TransformationRule[0]);
    }

    /**
     * Parses one trimmed line of the phonetic file. This is where the real
     * work of reading the phonetics file is done.
     * <ul>
     * <li>Empty lines and lines starting with one of the
     * {@link #IGNORED_KEYWORDS} are skipped.</li>
     * <li>A line starting with {@link #KEYWORD_ALPHBET} replaces the alphabet
     * with the characters between the first {@link #ALPHABET_START} and the
     * last {@link #ALPHABET_END}. A line without a well-formed bracket pair
     * is ignored.</li>
     * <li>Any other line is read as a match expression followed by white space
     * and a replacement expression, and is added to the list as a rule.</li>
     * </ul>
     * 
     * @param str the line to parse, already stripped of comments and trimmed
     * @param ruleList the list the parsed rule is appended to
     */
    private void buildRule(String str, List<TransformationRule> ruleList) {
        if (str.length() < 1) {
            return;
        }
        for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
            if (str.startsWith(IGNORED_KEYWORDS[i])) {
                return;
            }
        }

        // A different alphabet is used for this language; it is read into
        // the alphabetString variable.
        if (str.startsWith(KEYWORD_ALPHBET)) {
            int open = str.indexOf(ALPHABET_START);
            int close = str.lastIndexOf(ALPHABET_END);
            // Both markers must be present and in order; a closing marker in
            // front of the opening one would make substring throw.
            if (open != -1 && close > open) {
                alphabetString = str.substring(open + 1, close).toCharArray();
            }
            return;
        }

        StringBuilder matchExp = new StringBuilder();
        StringBuilder replaceExp = new StringBuilder();
        boolean start = false, end = false;
        int takeOutPart = 0, matchLength = 0;
        // match: still inside the first group. inMulti: inside a (...) group.
        boolean match = true, inMulti = false;
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (Character.isWhitespace(ch)) {
                // The first white space ends the match expression for good.
                match = false;
                continue;
            }
            if (!match) {
                replaceExp.append(ch);
                continue;
            }
            if (!isReservedChar(ch)) {
                matchExp.append(ch);
                // A (...) group counts as one position: the opening marker is
                // counted, its content and the closing marker are not.
                if (!inMulti) {
                    takeOutPart++;
                    matchLength++;
                }
                if (ch == STARTMULTI || ch == ENDMULTI) {
                    inMulti = !inMulti;
                }
            }
            if (ch == '-') {
                // Each '-' shortens the part of the match that gets replaced.
                takeOutPart--;
            }
            if (ch == '^') {
                start = true;
            }
            if (ch == '$') {
                end = true;
            }
        }

        String replacement = replaceExp.toString();
        if (replacement.equals(REPLACEVOID)) {
            replacement = "";
        }
        ruleList.add(new TransformationRule(matchExp.toString(), replacement,
                takeOutPart, matchLength, start, end));
    }

    // Tells whether ch is one of the characters this parser keeps out of the
    // match expression: '<', '>', '^', '$', '-' and the digits. These have a
    // special meaning to aspell; not all of them are implemented here.
    private boolean isReservedChar(char ch) {
        switch (ch) {
        case '<':
        case '>':
        case '^':
        case '$':
        case '-':
            return true;
        default:
            return Character.isDigit(ch);
        }
    }

    // Drops a trailing '#' comment, if there is one, and then the white space
    // around whatever is left of the row.
    private String realTrimmer(String row) {
        int commentStart = row.indexOf('#');
        String content = (commentStart == -1) ? row : row.substring(0,
                commentStart);
        return content.trim();
    }

    // Inner Classes
    /*
     * Holds the match string and the replace string and all the rule
     * attributes. Is responsible for indicating matches.
     */
    private class TransformationRule {

        private String replace;
        private char[] match;
        // takeOut=number of chars to replace;
        // matchLength=length of matching string counting multies as one.
        private int takeOut, matchLength;
        private boolean start, end;

        // Construktor
        public TransformationRule(String match, String replace, int takeout,
                int matchLength, boolean start, boolean end) {
            this.match = match.toCharArray();
            this.replace = replace;
            this.takeOut = takeout;
            this.matchLength = matchLength;
            this.start = start;
            this.end = end;
        }

        /*
         * Returns true if word from pos and forward matches the match string.
         * Precondition: wordPos+matchLength