com.swabunga.spell.engine.GenericTransformator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jazzy Show documentation
Show all versions of jazzy Show documentation
This is a fork of the jazzy dictionary
The newest version!
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.swabunga.spell.engine;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.swabunga.spell.util.StringUtility;
/**
* A generic implementation of a transformator that takes an aspell phonetics
* file and constructs a transformation table using the inner
* class TransformationRule. Each transformation rule represents
* a line in the phonetic file. One line contains two groups of characters
* separated by white space(s). The first group is the match expression,
* which describes the letters to associate with a
* syllable. The second group is the replacement expression giving the
* phonetic equivalent of the match expression.
*
* @see SpellDictionaryASpell SpellDictionaryASpell for information on getting
* phonetic files for aspell.
*
* @author Robert Gustavsson ([email protected])
*/
public class GenericTransformator implements Transformator {
/**
* This replace list is used if no phonetic file is supplied or it doesn't
* contain the alphabet.
*/
private static final char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D',
'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };
/**
* The alphabet start marker.
*
* @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
*/
public static final char ALPHABET_START = '[';
/**
* The alphabet end marker.
*
* @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
*/
public static final char ALPHABET_END = ']';
/**
* Phonetic file keyword indicating that a different alphabet is used for
* this language. The keyword must be followed by an
* {@link GenericTransformator#ALPHABET_START ALPHABET_START} marker, a list
* of characters defining the alphabet and a
* {@link GenericTransformator#ALPHABET_END ALPHABET_END} marker.
*/
public static final String KEYWORD_ALPHBET = "alphabet";
/**
* Phonetic file lines starting with one of these keywords are skipped. The
* keywords are: version, followup, collapse_result. Comments, starting with
* '#', are also skipped to the end of the line.
*/
public static final String[] IGNORED_KEYWORDS = { "version", "followup",
"collapse_result" };
/**
* Start a group of characters which can be appended to the match expression
* of the phonetic file.
*/
public static final char STARTMULTI = '(';
/**
* End a group of characters which can be appended to the match expression
* of the phonetic file.
*/
public static final char ENDMULTI = ')';
/**
* During phonetic transformation of a word each numeric character is
* replaced by this DIGITCODE.
*/
public static final String DIGITCODE = "0";
/**
* Phonetic file character code indicating that the replace expression is
* empty.
*/
public static final String REPLACEVOID = "_";
// Transformation rules in the order their lines were read from the phonetic
// definition. Stays null until buildRules has run; transform() and
// getCodeReplaceList() return null while it is null.
private TransformationRule[] ruleArray = null;
// Alphabet handed out by getReplaceList(). Starts as the default English
// alphabet, is overwritten by an "alphabet" line in the phonetic definition,
// and is finally replaced by its washed form at the end of each constructor.
private char[] alphabetString = defaultEnglishAlphabet;
/**
 * Constructs a transformation table from an aspell phonetic file.
 *
 * The file is decoded with the JVM's default charset, because that is what
 * {@link FileReader} does; use the constructor that takes an encoding to
 * name the charset explicitly. The file is closed before this constructor
 * returns, whether or not reading succeeded.
 *
 * @param phonetic the phonetic file as specified in aspell
 * @throws java.io.IOException indicates a problem while reading the
 *         phonetic file
 */
public GenericTransformator(File phonetic) throws IOException {
    BufferedReader in = new BufferedReader(new FileReader(phonetic));
    try {
        buildRules(in);
    } finally {
        // The reader is opened here, so it is released here instead of
        // being left for the garbage collector to clean up.
        in.close();
    }
    // Keep only one letter per distinct phonetic code.
    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
}
/**
 * Constructs a transformation table from an aspell phonetic file that is
 * decoded with the given character set.
 *
 * The file is closed before this constructor returns, whether or not
 * reading succeeded.
 *
 * @param phonetic the phonetic file as specified in aspell
 * @param encoding the name of the character set used to decode the file
 * @throws java.io.IOException indicates a problem while reading the
 *         phonetic file; this includes an unsupported encoding name
 */
public GenericTransformator(File phonetic, String encoding)
        throws IOException {
    FileInputStream stream = new FileInputStream(phonetic);
    try {
        buildRules(new BufferedReader(
                new InputStreamReader(stream, encoding)));
    } finally {
        // Closing the raw stream releases the file even when the charset
        // name is rejected before any reader could be created.
        stream.close();
    }
    // Keep only one letter per distinct phonetic code.
    alphabetString = washAlphabetIntoReplaceList(getReplaceList());
}
/**
* Construct a transformation table from the phonetic file
*
* @param phonetic the phonetic file as specified in aspell. The file is
* supplied as a reader.
* @throws java.io.IOException indicates a problem while reading the
* phonetic information
*/
public GenericTransformator(Reader phonetic) throws IOException {
buildRules(new BufferedReader(phonetic));
alphabetString = washAlphabetIntoReplaceList(getReplaceList());
}
/**
 * Goes through an alphabet and makes sure that only one of those letters
 * that are coded equally will be in the replace list. In other words, it
 * removes any letters in the alphabet that are redundant phonetically.
 *
 * Each letter is run through {@link #transform(String)} on its own; for
 * every distinct code the first letter that produced it is kept. The
 * letters come back in the iteration order of a {@link HashMap}, so the
 * order of the result is not specified.
 *
 * This is done to improve speed in the getSuggestion method.
 *
 * @param alphabet The complete alphabet to wash.
 * @return The washed alphabet to be used as replace list.
 */
private char[] washAlphabetIntoReplaceList(char[] alphabet) {
    // phonetic code -> first letter of the alphabet that produced it
    Map<String, Character> letters =
            new HashMap<String, Character>(alphabet.length);
    for (char letter : alphabet) {
        String code = transform(String.valueOf(letter));
        if (!letters.containsKey(code)) {
            letters.put(code, Character.valueOf(letter));
        }
    }
    char[] washedArray = new char[letters.size()];
    int next = 0;
    for (Character kept : letters.values()) {
        washedArray[next++] = kept.charValue();
    }
    return washedArray;
}
/**
 * Takes out all single character replacements and puts them in a char
 * array. This array can later be used for adding or changing letters in
 * getSuggestion().
 *
 * The characters appear in rule order, and a character is repeated if
 * several rules share the same single character replacement.
 *
 * @return char[] An array of chars with replacement characters, or null
 *         if no rules have been loaded
 */
public char[] getCodeReplaceList() {
    if (ruleArray == null) {
        return null;
    }
    StringBuilder singles = new StringBuilder();
    for (int i = 0; i < ruleArray.length; i++) {
        String replaceExp = ruleArray[i].getReplaceExp();
        // Longer (or empty) replacements are of no use as single letters.
        if (replaceExp.length() == 1) {
            singles.append(replaceExp);
        }
    }
    return singles.toString().toCharArray();
}
/**
 * Returns the alphabet of the language held by this transformator.
 *
 * The alphabet starts out as the default English alphabet and is replaced
 * when the phonetic information contains an alphabet line. Once a
 * constructor has finished it is the washed alphabet, i.e. one letter per
 * distinct phonetic code.
 *
 * A copy is returned, so changes made by the caller do not affect this
 * transformator.
 *
 * @return char[] An array of chars representing the alphabet; the
 *         assignments in this class never set it to null
 */
public char[] getReplaceList() {
    return alphabetString.clone();
}
/**
 * Builds the phonetic code of the word.
 *
 * The word is upper-cased and then scanned from left to right. A digit is
 * replaced by {@link #DIGITCODE}. At any other position the rules are
 * tried in order and the first one that matches rewrites the text at that
 * position; scanning resumes after the inserted replacement. A character
 * that no rule matches is kept as it is.
 *
 * @param word the word to transform
 * @return the phonetic transformation of the word, or null if no rules
 *         have been loaded
 */
public String transform(String word) {
    if (ruleArray == null) {
        return null;
    }
    StringBuilder buffer = new StringBuilder(word.toUpperCase());
    int length = buffer.length();
    int pos = 0;
    while (pos < length) {
        if (Character.isDigit(buffer.charAt(pos))) {
            StringUtility.replace(buffer, pos, pos + DIGITCODE.length(),
                    DIGITCODE);
            pos++;
            continue;
        }
        // Without a matching rule the character is kept and stepped over.
        int advance = 1;
        for (TransformationRule candidate : ruleArray) {
            // A rule reporting startsWithExp() is only tried at position 0,
            // and a rule is skipped when its match length would run past
            // the end of the text.
            boolean usableHere = !(candidate.startsWithExp() && pos > 0)
                    && pos + candidate.lengthOfMatch() <= length;
            if (usableHere && candidate.isMatching(buffer, pos)) {
                String replacement = candidate.getReplaceExp();
                int removed = candidate.getTakeOut();
                StringUtility.replace(buffer, pos, pos + removed,
                        replacement);
                advance = replacement.length();
                // Track the new text length after swapping the pieces.
                length = length - removed + advance;
                break;
            }
        }
        pos += advance;
    }
    return buffer.toString();
}
// Used to build up the transformation table. Reads the phonetic information
// line by line until the reader is exhausted, strips comments and
// surrounding white space from each line and hands the rest to buildRule.
// The collected rules are stored in ruleArray in reading order. The reader
// is not closed here.
private void buildRules(BufferedReader in) throws IOException {
    // Typed list: toArray on a raw List would only yield an Object[].
    List<TransformationRule> ruleList = new ArrayList<TransformationRule>();
    String read;
    while ((read = in.readLine()) != null) {
        buildRule(realTrimmer(read), ruleList);
    }
    ruleArray = ruleList.toArray(new TransformationRule[0]);
}
// Here is where the real work of reading the phonetics file is done.
// Interprets one trimmed, comment-free line:
//  - an empty line, or a line starting with one of IGNORED_KEYWORDS, is
//    skipped;
//  - a line starting with KEYWORD_ALPHBET replaces alphabetString with the
//    characters between ALPHABET_START and the last ALPHABET_END; a line
//    without a well-formed bracket pair is ignored;
//  - any other line becomes a TransformationRule appended to ruleList.
private void buildRule(String str, List<TransformationRule> ruleList) {
    if (str.length() < 1) {
        return;
    }
    for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
        if (str.startsWith(IGNORED_KEYWORDS[i])) {
            return;
        }
    }
    // A different alphabet is used for this language, will be read into
    // the alphabetString variable.
    if (str.startsWith(KEYWORD_ALPHBET)) {
        int open = str.indexOf(ALPHABET_START);
        int close = str.lastIndexOf(ALPHABET_END);
        // The end marker has to come after the start marker; otherwise
        // substring would throw on a malformed line such as "alphabet ][".
        if (open != -1 && close > open) {
            alphabetString = str.substring(open + 1, close).toCharArray();
        }
        return;
    }
    // str contains two groups of characters separated by white space(s).
    // The first group is the "match expression". The second group is the
    // "replacement expression" giving the phonetic equivalent of the
    // "match expression".
    StringBuilder matchExp = new StringBuilder();
    StringBuilder replaceExp = new StringBuilder();
    boolean anchoredAtStart = false;
    boolean anchoredAtEnd = false;
    int takeOutPart = 0;
    int matchLength = 0;
    boolean inMatch = true;
    boolean inMulti = false;
    for (int i = 0; i < str.length(); i++) {
        char ch = str.charAt(i);
        if (Character.isWhitespace(ch)) {
            // Everything after the first white space is replacement text.
            inMatch = false;
            continue;
        }
        if (!inMatch) {
            replaceExp.append(ch);
            continue;
        }
        if (!isReservedChar(ch)) {
            matchExp.append(ch);
            // A whole (...) group counts as a single position: the opening
            // parenthesis is counted, its content and the closing
            // parenthesis are not.
            if (!inMulti) {
                takeOutPart++;
                matchLength++;
            }
            if (ch == STARTMULTI || ch == ENDMULTI) {
                inMulti = !inMulti;
            }
        }
        if (ch == '-') {
            // Each '-' shortens the part of the match that gets replaced.
            takeOutPart--;
        }
        if (ch == '^') {
            anchoredAtStart = true;
        }
        if (ch == '$') {
            anchoredAtEnd = true;
        }
    }
    String replacement = replaceExp.toString();
    if (replacement.equals(REPLACEVOID)) {
        // REPLACEVOID stands for an empty replacement.
        replacement = "";
    }
    ruleList.add(new TransformationRule(matchExp.toString(), replacement,
            takeOutPart, matchLength, anchoredAtStart, anchoredAtEnd));
}
// Tells whether ch is one of the characters treated as having a special
// meaning in a match expression: '<', '>', '^', '$', '-' or any digit.
// Not every special character known to aspell is handled here.
private boolean isReservedChar(char ch) {
    switch (ch) {
        case '<':
        case '>':
        case '^':
        case '$':
        case '-':
            return true;
        default:
            return Character.isDigit(ch);
    }
}
// Trims off everything we don't care about: a comment running from the
// first '#' to the end of the row, plus leading and trailing white space.
private String realTrimmer(String row) {
    int commentStart = row.indexOf('#');
    String content = (commentStart == -1)
            ? row
            : row.substring(0, commentStart);
    return content.trim();
}
// Inner Classes
/*
* Holds the match string and the replace string and all the rule
* attributes. Is responsible for indicating matches.
*/
private class TransformationRule {
private String replace;
private char[] match;
// takeOut=number of chars to replace;
// matchLength=length of matching string counting multies as one.
private int takeOut, matchLength;
private boolean start, end;
// Construktor
public TransformationRule(String match, String replace, int takeout,
int matchLength, boolean start, boolean end) {
this.match = match.toCharArray();
this.replace = replace;
this.takeOut = takeout;
this.matchLength = matchLength;
this.start = start;
this.end = end;
}
/*
* Returns true if word from pos and forward matches the match string.
* Precondition: wordPos+matchLength