All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.swabunga.spell.engine.GenericTransformator Maven / Gradle / Ivy

The newest version!
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package com.swabunga.spell.engine;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.swabunga.spell.util.StringUtility;

/**
 * A generic implementation of a transformator that takes an aspell phonetics
 * file and constructs a transformation table using the inner class
 * TransformationRule.
 * <p>
 * Basically, each transformation rule represents a line in the phonetic file.
 * One line contains two groups of characters separated by white space(s).
 * The first group is the <em>match expression</em>. The match expression
 * describes letters to associate with a syllable. The second group is the
 * <em>replacement expression</em> giving the phonetic equivalent of the
 * match expression.
 *
 * @see SpellDictionaryASpell SpellDictionaryASpell for information on getting
 * phonetic files for aspell.
 *
 * @author Robert Gustavsson ([email protected])
 */
public class GenericTransformator implements Transformator {

    /**
     * Default alphabet (upper-case A-Z). It is the initial value of the
     * replace list and stays in effect when the phonetic file does not
     * declare an alphabet of its own.
     */
    private static final char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

    /**
     * The alphabet start marker.
     *
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_START = '[';

    /**
     * The alphabet end marker.
     *
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_END = ']';

    /**
     * Phonetic file keyword indicating that a different alphabet is used for
     * this language. The keyword must be followed by an
     * {@link GenericTransformator#ALPHABET_START ALPHABET_START} marker, a list
     * of characters defining the alphabet and an
     * {@link GenericTransformator#ALPHABET_END ALPHABET_END} marker.
     */
    public static final String KEYWORD_ALPHBET = "alphabet";

    /**
     * Phonetic file lines starting with one of these keywords are skipped.
     * The keywords are: version, followup, collapse_result. Comments, starting
     * with '#', are also skipped to the end of the line.
     */
    public static final String[] IGNORED_KEYWORDS = { "version", "followup", "collapse_result" };

    /**
     * Start a group of characters which can be appended to the match expression
     * of the phonetic file.
*/ public static final char STARTMULTI = '('; /** * End a group of characters which can be appended to the match expression * of the phonetic file. */ public static final char ENDMULTI = ')'; /** * During phonetic transformation of a word each numeric character is * replaced by this DIGITCODE. */ public static final String DIGITCODE = "0"; /** * Phonetic file character code indicating that the replace expression is * empty. */ public static final String REPLACEVOID = "_"; private TransformationRule[] ruleArray = null; private char[] alphabetString = defaultEnglishAlphabet; /** * Construct a transformation table from the phonetic file * * @param phonetic the phonetic file as specified in aspell * @throws java.io.IOException indicates a problem while reading the * phonetic file */ public GenericTransformator(File phonetic) throws IOException { buildRules(new BufferedReader(new FileReader(phonetic))); alphabetString = washAlphabetIntoReplaceList(getReplaceList()); } /** * Construct a transformation table from the phonetic file * * @param phonetic the phonetic file as specified in aspell * @param encoding the character set required * @throws java.io.IOException indicates a problem while reading the * phonetic file */ public GenericTransformator(File phonetic, String encoding) throws IOException { buildRules(new BufferedReader(new InputStreamReader( new FileInputStream(phonetic), encoding))); alphabetString = washAlphabetIntoReplaceList(getReplaceList()); } /** * Construct a transformation table from the phonetic file * * @param phonetic the phonetic file as specified in aspell. The file is * supplied as a reader. 
* @throws java.io.IOException indicates a problem while reading the * phonetic information */ public GenericTransformator(Reader phonetic) throws IOException { buildRules(new BufferedReader(phonetic)); alphabetString = washAlphabetIntoReplaceList(getReplaceList()); } /** * Goes through an alphabet and makes sure that only one of those letters * that are coded equally will be in the replace list. In other words, it * removes any letters in the alphabet that are redundant phonetically. * * This is done to improve speed in the getSuggestion method. * * @param alphabet The complete alphabet to wash. * @return The washed alphabet to be used as replace list. */ private char[] washAlphabetIntoReplaceList(char[] alphabet) { Map letters = new HashMap(alphabet.length); for (int i = 0; i < alphabet.length; i++) { String tmp = String.valueOf(alphabet[i]); String code = transform(tmp); if (!letters.containsKey(code)) { letters.put(code, new Character(alphabet[i])); } } Object[] tmpCharacters = letters.values().toArray(); char[] washedArray = new char[tmpCharacters.length]; for (int i = 0; i < tmpCharacters.length; i++) { washedArray[i] = ((Character) tmpCharacters[i]).charValue(); } return washedArray; } /** * Takes out all single character replacements and put them in a char array. * This array can later be used for adding or changing letters in * getSuggestion(). 
*
     * @return char[] An array of chars with replacements characters, or
     * null if no rules have been built
     */
    public char[] getCodeReplaceList() {
        if (ruleArray == null) {
            return null;
        }
        // Collect the replace expression of every rule whose replacement is
        // exactly one character long.
        List<String> singleCharReplacements = new ArrayList<String>();
        for (int i = 0; i < ruleArray.length; i++) {
            String replaceExp = ruleArray[i].getReplaceExp();
            if (replaceExp.length() == 1) {
                singleCharReplacements.add(replaceExp);
            }
        }
        char[] replacements = new char[singleCharReplacements.size()];
        for (int i = 0; i < replacements.length; i++) {
            replacements[i] = singleCharReplacements.get(i).charAt(0);
        }
        return replacements;
    }

    /**
     * Returns the current replace list. The field starts out as the default
     * English alphabet, is replaced by the alphabet line of the phonetic
     * file when one is present, and is reduced by the constructors to the
     * phonetically distinct letters. The internal array itself is returned,
     * not a copy.
     *
     * @return char[] An array of chars representing the replace list.
     */
    public char[] getReplaceList() {
        return alphabetString;
    }

    /**
     * Builds the phonetic code of the word. The word is upper-cased (using
     * the default locale), every digit is replaced by {@link #DIGITCODE},
     * and at every other position the first matching rule, in rule order,
     * is applied.
     *
     * @param word the word to transform
     * @return the phonetic transformation of the word, or null if no rules
     * have been built
     */
    public String transform(String word) {
        if (ruleArray == null) {
            return null;
        }
        StringBuilder str = new StringBuilder(word.toUpperCase());
        int strLength = str.length();
        int startPos = 0;
        while (startPos < strLength) {
            // Number of positions to advance; one unless a rule fires.
            int add = 1;
            if (Character.isDigit(str.charAt(startPos))) {
                StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
                startPos += add;
                continue;
            }
            for (int i = 0; i < ruleArray.length; i++) {
                TransformationRule rule = ruleArray[i];
                // Rules flagged by startsWithExp() are only tried at the
                // very first position of the word.
                if (rule.startsWithExp() && startPos > 0) {
                    continue;
                }
                // Not enough characters left for this rule to match.
                if (startPos + rule.lengthOfMatch() > strLength) {
                    continue;
                }
                if (rule.isMatching(str, startPos)) {
                    String replaceExp = rule.getReplaceExp();
                    // Skip over the inserted replacement. For an empty
                    // replacement this is 0, so the same position is
                    // examined again in the changed string.
                    add = replaceExp.length();
                    StringUtility.replace(str, startPos, startPos + rule.getTakeOut(), replaceExp);
                    strLength -= rule.getTakeOut();
                    strLength += add;
                    break;
                }
            }
            startPos += add;
        }
        return str.toString();
    }

    /**
     * Builds the transformation table: reads the phonetic data line by line,
     * strips comments and surrounding white space, and turns every remaining
     * rule line into a TransformationRule.
     *
     * @param in the phonetic data
     * @throws IOException if reading a line fails
     */
    private void buildRules(BufferedReader in) throws IOException {
        List<TransformationRule> ruleList = new ArrayList<TransformationRule>();
        String read;
        while ((read = in.readLine()) != null) {
            buildRule(realTrimmer(read), ruleList);
        }
        ruleArray = ruleList.toArray(new TransformationRule[0]);
    }

    /**
     * Here is where the real work of reading the phonetics file is done.
     * Handles one trimmed line: empty lines and lines starting with one of
     * the {@link #IGNORED_KEYWORDS} are skipped, an alphabet line replaces
     * the current alphabet, and any other line is parsed into a rule that is
     * appended to {@code ruleList}.
     *
     * @param str one line of the phonetic file, already stripped of comments
     * and surrounding white space
     * @param ruleList receives the rule built from the line, if any
     */
    private void buildRule(String str, List<TransformationRule> ruleList) {
        if (str.length() < 1) {
            return;
        }
        for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
            if (str.startsWith(IGNORED_KEYWORDS[i])) {
                return;
            }
        }

        // A different alphabet is used for this language, will be read into
        // the alphabetString variable.
        if (str.startsWith(KEYWORD_ALPHBET)) {
            int open = str.indexOf(ALPHABET_START);
            int close = str.lastIndexOf(ALPHABET_END);
            // Both markers must be present and in the right order; a
            // malformed line is ignored, like a line with a missing marker.
            if (open != -1 && close > open) {
                alphabetString = str.substring(open + 1, close).toCharArray();
            }
            return;
        }

        // str contains two groups of characters separated by white space(s).
        // The first group is the "match expression". The second group is the
        // "replacement expression" giving the phonetic equivalent of the
        // "match expression".
        StringBuilder matchExp = new StringBuilder();
        StringBuilder replaceExp = new StringBuilder();
        boolean start = false, end = false;
        int takeOutPart = 0, matchLength = 0;
        boolean match = true, inMulti = false;
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (Character.isWhitespace(ch)) {
                // The first white space ends the match expression.
                match = false;
            } else if (match) {
                if (!isReservedChar(ch)) {
                    matchExp.append(ch);
                    // A whole (...) group counts as a single character: the
                    // opening bracket is counted, its content and the
                    // closing bracket are not.
                    if (!inMulti) {
                        takeOutPart++;
                        matchLength++;
                    }
                    if (ch == STARTMULTI || ch == ENDMULTI) {
                        inMulti = !inMulti;
                    }
                }
                // Each '-' reduces the number of characters taken out.
                if (ch == '-') {
                    takeOutPart--;
                }
                if (ch == '^') {
                    start = true;
                }
                if (ch == '$') {
                    end = true;
                }
            } else {
                replaceExp.append(ch);
            }
        }
        String replacement = replaceExp.toString();
        if (replacement.equals(REPLACEVOID)) {
            // "_" stands for an empty replacement.
            replacement = "";
        }
        ruleList.add(new TransformationRule(matchExp.toString(), replacement, takeOutPart, matchLength, start, end));
    }

    // Chars with special meaning to aspell. Not everyone is implemented here.
    private boolean isReservedChar(char ch) {
        return ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch);
    }

    // Trims off everything we don't care about: a trailing '#' comment and
    // surrounding white space.
    private String realTrimmer(String row) {
        int pos = row.indexOf('#');
        if (pos != -1) {
            row = row.substring(0, pos);
        }
        return row.trim();
    }

    // Inner Classes
    /*
     * Holds the match string and the replace string and all the rule
     * attributes. Is responsible for indicating matches.
     */
    private class TransformationRule {

        private String replace;
        private char[] match;
        // takeOut=number of chars to replace;
        // matchLength=length of matching string counting multies as one.
private int takeOut, matchLength; private boolean start, end; // Construktor public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) { this.match = match.toCharArray(); this.replace = replace; this.takeOut = takeout; this.matchLength = matchLength; this.start = start; this.end = end; } /* * Returns true if word from pos and forward matches the match string. * Precondition: wordPos+matchLength




© 2015 - 2024 Weber Informatics LLC | Privacy Policy