All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.swabunga.spell.engine.DoubleMeta Maven / Gradle / Ivy

The newest version!
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package com.swabunga.spell.engine;

/**
 * A phonetic encoding algorithm that takes an English word and computes a
 * phonetic version of it. This allows for phonetic matches in a spell checker.
 * This class is a port of the C++ DoubleMetaphone() class, which was intended
 * to return two possible phonetic translations for certain words, although the
 * Java version only seems to be concerned with one, making the "double" part
 * erroneous. 
* source code for the original C++ can be found here: http://aspell.sourceforge.net/metaphone/ DoubleMetaphone does some * processing, such as uppercasing, on the input string first to normalize it. * Then, to create the key, the function traverses the input string in a while * loop, sending successive characters into a giant switch statement. Before * determining the appropriate pronunciation, the algorithm considers the * context surrounding each character within the input string. *

* Things that were changed:
* The alternate flag could be set to true but was never checked so why bother * with it. REMOVED
* Why was this class serializable?
* The primary, in, length and last variables could be initialized and local to * the process method and references passed around the appropriate methods. As * such there are no class variables and this class becomes firstly threadsafe * and secondly could be static final.
* The function call SlavoGermaic was called repeatedly in the process function, * it is now only called once. * */ public class DoubleMeta implements Transformator { /** * The replace list is used in the getSuggestions method. All of the letters * in the misspelled word are replaced with the characters from this list to * try and generate more suggestions, which implies l*n tries, if l is the * size of the string, and n is the size of this list. * * In addition to that, each of these letters is added to the misspelled * word. */ private static char[] replaceList = { 'A', 'B', 'X', 'S', 'K', 'J', 'T', 'F', 'H', 'L', 'M', 'N', 'P', 'R', '0' }; private static final String[] myList = { "GN", "KN", "PN", "WR", "PS", "" }; private static final String[] list1 = { "ACH", "" }; private static final String[] list2 = { "BACHER", "MACHER", "" }; private static final String[] list3 = { "CAESAR", "" }; private static final String[] list4 = { "CHIA", "" }; private static final String[] list5 = { "CH", "" }; private static final String[] list6 = { "CHAE", "" }; private static final String[] list7 = { "HARAC", "HARIS", "" }; private static final String[] list8 = { "HOR", "HYM", "HIA", "HEM", "" }; private static final String[] list9 = { "CHORE", "" }; private static final String[] list10 = { "VAN ", "VON ", "" }; private static final String[] list11 = { "SCH", "" }; private static final String[] list12 = { "ORCHES", "ARCHIT", "ORCHID", "" }; private static final String[] list13 = { "T", "S", "" }; private static final String[] list14 = { "A", "O", "U", "E", "" }; private static final String[] list15 = { "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "" }; private static final String[] list16 = { "MC", "" }; private static final String[] list17 = { "CZ", "" }; private static final String[] list18 = { "WICZ", "" }; private static final String[] list19 = { "CIA", "" }; private static final String[] list20 = { "CC", "" }; private static final String[] list21 = { "I", "E", "H", "" }; private static final String[] list22 = { "HU", "" }; private static final String[] list23 = { "UCCEE", "UCCES", "" }; private static final String[] list24 = { "CK", "CG", "CQ", "" }; private static final String[] list25 = { "CI", "CE", "CY", "" }; // DMV: used by the original code which returned two phonetic code, but not // the current code // private static final String[] list26 = { // "CIO", "CIE", "CIA", "" // }; private static final String[] list27 = { " C", " Q", " G", "" }; private static final String[] list28 = { "C", "K", "Q", "" }; private static final String[] list29 = { "CE", "CI", "" }; private static final String[] list30 = { "DG", "" }; private static final String[] list31 = { "I", "E", "Y", "" }; private static final String[] list32 = { "DT", "DD", "" }; private static final String[] list33 = { "B", "H", "D", "" }; private static final String[] list34 = { "B", "H", "D", "" }; private static final String[] list35 = { "B", "H", "" }; private static final String[] list36 = { "C", "G", "L", "R", "T", "" }; private static final String[] list37 = { "EY", "" }; private static final String[] list38 = { "LI", "" }; private static final String[] list39 = { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER", "" }; private static final String[] list40 = { "ER", "" }; private static final String[] list41 = { "DANGER", "RANGER", "MANGER", "" }; private static final String[] list42 = { "E", "I", "" }; private static final String[] list43 = { "RGY", "OGY", "" }; private static final String[] list44 = { "E", "I", "Y", "" }; private static final String[] list45 = { "AGGI", "OGGI", "" }; private static final String[] list46 = { "VAN ", "VON ", "" }; private static final String[] list47 = { "SCH", "" }; private static final String[] list48 = { "ET", "" }; // DMV: used by the original code which returned two phonetic code, but not // the current code // private static final String[] list49 = { // "IER ", "" // }; private static final String[] list50 = { "JOSE", "" }; private static final String[] list51 = { "SAN ", "" }; private static final String[] list52 = { "SAN ", "" }; private static final String[] list53 = { "JOSE", "" }; private static final String[] list54 = { "L", "T", "K", "S", "N", "M", "B", "Z", "" }; private static final String[] list55 = { "S", "K", "L", "" }; private static final String[] list56 = { "ILLO", "ILLA", "ALLE", "" }; private static final String[] list57 = { "AS", "OS", "" }; private static final String[] list58 = { "A", "O", "" }; private static final String[] list59 = { "ALLE", "" }; private static final String[] list60 = { "UMB", "" }; private static final String[] list61 = { "ER", "" }; private static final String[] list62 = { "P", "B", "" }; private static final String[] list63 = { "IE", "" }; private static final String[] list64 = { "ME", "MA", "" }; private static final String[] list65 = { "ISL", "YSL", "" }; private static final String[] list66 = { "SUGAR", "" }; private static final String[] list67 = { "SH", "" }; private static final String[] list68 = { "HEIM", "HOEK", "HOLM", "HOLZ", "" }; private static final String[] list69 = { "SIO", "SIA", "" }; private static final String[] list70 = { "SIAN", "" }; private static final String[] list71 = { "M", "N", "L", "W", "" }; private static final String[] list72 = { "Z", "" }; private static final String[] list73 = { "Z", "" }; private static final String[] list74 = { "SC", "" }; private static final String[] list75 = { "OO", "ER", "EN", "UY", "ED", "EM", "" }; private static final String[] list76 = { "ER", "EN", "" }; private static final String[] list77 = { "I", "E", "Y", "" }; private static final String[] list78 = { "AI", "OI", "" }; private static final String[] list79 = { "S", "Z", "" }; private static final String[] list80 = { "TION", "" }; private static final String[] list81 = { "TIA", "TCH", "" }; private static final String[] list82 = { "TH", "" }; private static final String[] list83 = { "TTH", "" }; private static final String[] list84 = { "OM", "AM", "" }; private static final String[] list85 = { "VAN ", "VON ", "" }; private static final String[] list86 = { "SCH", "" }; private static final String[] list87 = { "T", "D", "" }; private static final String[] list88 = { "WR", "" }; private static final String[] list89 = { "WH", "" }; private static final String[] list90 = { "EWSKI", "EWSKY", "OWSKI", "OWSKY", "" }; private static final String[] list91 = { "SCH", "" }; private static final String[] list92 = { "WICZ", "WITZ", "" }; private static final String[] list93 = { "IAU", "EAU", "" }; private static final String[] list94 = { "AU", "OU", "" }; private static final String[] list95 = { "C", "X", "" }; // DMV: used by the original code which returned two phonetic code, but not // the current code // private static final String[] list96 = { // "ZO", "ZI", "ZA", "" // }; /** * @return */ private final static boolean SlavoGermanic(String in) { if ((in.indexOf("W") > -1) || (in.indexOf("K") > -1) || (in.indexOf("CZ") > -1) || (in.indexOf("WITZ") > -1)) return true; return false; } /** * put your documentation comment here * * @param main */ private final static void MetaphAdd(StringBuilder primary, String main) { if (main != null) { primary.append(main); } } private final static void MetaphAdd(StringBuilder primary, char main) { primary.append(main); } /** * put your documentation comment here * * @param at * @return */ private final static boolean isVowel(String in, int at, int length) { if ((at < 0) || (at >= length)) return false; char it = in.charAt(at); if ((it == 'A') || (it == 'E') || (it == 'I') || (it == 'O') || (it == 'U') || (it == 'Y')) return true; return false; } /** * put your documentation comment here * * @param string * @param start * @param length * @param list * @return */ private final static boolean stringAt(String string, int start, int length, String[] list) { if ((start < 0) || (start >= string.length()) || list.length == 0) return false; String substr = string.substring(start, start + length); for (int i = 0; i < list.length; i++) { if (list[i].equals(substr)) return true; } return false; } /** * Take the given word, and return the best phonetic hash for it. Vowels are * minimized as much as possible, and consenants that have similiar sounds * are converted to the same consenant for example, 'v' and 'f' are both * converted to 'f' * * @param word the texte to transform * @return the result of the phonetic transformation */ public final String transform(String word) { StringBuilder primary = new StringBuilder(word.length() + 5); String in = word.toUpperCase() + " "; int current = 0; int length = in.length(); if (length < 1) return ""; int last = length - 1; boolean isSlavoGermaic = SlavoGermanic(in); if (stringAt(in, 0, 2, myList)) current += 1; if (in.charAt(0) == 'X') { MetaphAdd(primary, 'S'); current += 1; } while (current < length) { switch (in.charAt(current)) { case 'A': case 'E': case 'I': case 'O': case 'U': case 'Y': if (current == 0) MetaphAdd(primary, 'A'); current += 1; break; case 'B': MetaphAdd(primary, 'P'); if (in.charAt(current + 1) == 'B') current += 2; else current += 1; break; case '\u00C7': MetaphAdd(primary, 'S'); current += 1; break; case 'C': if ((current > 1) && !isVowel(in, current - 2, length) && stringAt(in, (current - 1), 3, list1) && (in.charAt(current + 2) != 'I') && (in.charAt(current + 2) != 'E') || stringAt(in, (current - 2), 6, list2)) { MetaphAdd(primary, 'K'); current += 2; break; } if ((current == 0) && stringAt(in, current, 6, list3)) { MetaphAdd(primary, 'S'); current += 2; break; } if (stringAt(in, current, 4, list4)) { MetaphAdd(primary, 'K'); current += 2; break; } if (stringAt(in, current, 2, list5)) { if ((current > 0) && stringAt(in, current, 4, list6)) { MetaphAdd(primary, 'K'); current += 2; break; } if ((current == 0) && stringAt(in, (current + 1), 5, list7) || stringAt(in, current + 1, 3, list8) && !stringAt(in, 0, 5, list9)) { MetaphAdd(primary, 'K'); current += 2; break; } if (stringAt(in, 0, 4, list10) || stringAt(in, 0, 3, list11) || stringAt(in, current - 2, 6, list12) || stringAt(in, current + 2, 1, list13) || (stringAt(in, current - 1, 1, list14) || (current == 0)) && stringAt(in, current + 2, 1, list15)) { MetaphAdd(primary, 'K'); } else { if (current > 0) { if (stringAt(in, 0, 2, list16)) MetaphAdd(primary, 'K'); else MetaphAdd(primary, 'X'); } else { MetaphAdd(primary, 'X'); } } current += 2; break; } if (stringAt(in, current, 2, list17) && !stringAt(in, current, 4, list18)) { MetaphAdd(primary, 'S'); current += 2; break; } if (stringAt(in, current, 2, list19)) { MetaphAdd(primary, 'X'); current += 2; break; } if (stringAt(in, current, 2, list20) && !((current == 1) && in.charAt(0) == 'M')) { if (stringAt(in, current + 2, 1, list21) && !stringAt(in, current + 2, 2, list22)) { if (((current == 1) && (in.charAt(current - 1) == 'A')) || stringAt(in, (current - 1), 5, list23)) MetaphAdd(primary, "KS"); else MetaphAdd(primary, 'X'); current += 3; break; } else { MetaphAdd(primary, 'K'); current += 2; break; } } if (stringAt(in, current, 2, list24)) { MetaphAdd(primary, 'K'); current += 2; break; } else if (stringAt(in, current, 2, list25)) { MetaphAdd(primary, 'S'); current += 2; break; } MetaphAdd(primary, 'K'); if (stringAt(in, current + 1, 2, list27)) current += 3; else if (stringAt(in, current + 1, 1, list28) && !stringAt(in, current + 1, 2, list29)) current += 2; else current += 1; break; case 'D': if (stringAt(in, current, 2, list30)) { if (stringAt(in, current + 2, 1, list31)) { MetaphAdd(primary, 'J'); current += 3; break; } else { MetaphAdd(primary, "TK"); current += 2; break; } } MetaphAdd(primary, 'T'); if (stringAt(in, current, 2, list32)) { current += 2; } else { current += 1; } break; case 'F': if (in.charAt(current + 1) == 'F') current += 2; else current += 1; MetaphAdd(primary, 'F'); break; case 'G': if (in.charAt(current + 1) == 'H') { if ((current > 0) && !isVowel(in, current - 1, length)) { MetaphAdd(primary, 'K'); current += 2; break; } if (current < 3) { if (current == 0) { if (in.charAt(current + 2) == 'I') MetaphAdd(primary, 'J'); else MetaphAdd(primary, 'K'); current += 2; break; } } if ((current > 1) && stringAt(in, current - 2, 1, list33) || ((current > 2) && stringAt(in, current - 3, 1, list34)) || ((current > 3) && stringAt(in, current - 4, 1, list35))) { current += 2; break; } else { if ((current > 2) && (in.charAt(current - 1) == 'U') && stringAt(in, current - 3, 1, list36)) { MetaphAdd(primary, 'F'); } else { if ((current > 0) && (in.charAt(current - 1) != 'I')) MetaphAdd(primary, 'K'); } current += 2; break; } } if (in.charAt(current + 1) == 'N') { if ((current == 1) && isVowel(in, 0, length) && !isSlavoGermaic) { MetaphAdd(primary, "KN"); } else { if (!stringAt(in, current + 2, 2, list37) && (in.charAt(current + 1) != 'Y') && !isSlavoGermaic) { MetaphAdd(primary, "N"); } else { MetaphAdd(primary, "KN"); } } current += 2; break; } if (stringAt(in, current + 1, 2, list38) && !isSlavoGermaic) { MetaphAdd(primary, "KL"); current += 2; break; } if ((current == 0) && ((in.charAt(current + 1) == 'Y') || stringAt(in, current + 1, 2, list39))) { MetaphAdd(primary, 'K'); current += 2; break; } if ((stringAt(in, current + 1, 2, list40) || (in .charAt(current + 1) == 'Y')) && !stringAt(in, 0, 6, list41) && !stringAt(in, current - 1, 1, list42) && !stringAt(in, current - 1, 3, list43)) { MetaphAdd(primary, 'K'); current += 2; break; } if (stringAt(in, current + 1, 1, list44) || stringAt(in, current - 1, 4, list45)) { if (stringAt(in, 0, 4, list46) || stringAt(in, 0, 3, list47) || stringAt(in, current + 1, 2, list48)) { MetaphAdd(primary, 'K'); } else { MetaphAdd(primary, 'J'); } current += 2; break; } if (in.charAt(current + 1) == 'G') current += 2; else current += 1; MetaphAdd(primary, 'K'); break; case 'H': if (((current == 0) || isVowel(in, current - 1, length)) && isVowel(in, current + 1, length)) { MetaphAdd(primary, 'H'); current += 2; } else { current += 1; } break; case 'J': if (stringAt(in, current, 4, list50) || stringAt(in, 0, 4, list51)) { if ((current == 0) && (in.charAt(current + 4) == ' ') || stringAt(in, 0, 4, list52)) { MetaphAdd(primary, 'H'); } else { MetaphAdd(primary, 'J'); } current += 1; break; } if ((current == 0) && !stringAt(in, current, 4, list53)) { MetaphAdd(primary, 'J'); } else { if (isVowel(in, current - 1, length) && !isSlavoGermaic && ((in.charAt(current + 1) == 'A') || in .charAt(current + 1) == 'O')) { MetaphAdd(primary, 'J'); } else { if (current == last) { MetaphAdd(primary, 'J'); } else { if (!stringAt(in, current + 1, 1, list54) && !stringAt(in, current - 1, 1, list55)) { MetaphAdd(primary, 'J'); } } } } if (in.charAt(current + 1) == 'J') current += 2; else current += 1; break; case 'K': if (in.charAt(current + 1) == 'K') current += 2; else current += 1; MetaphAdd(primary, 'K'); break; case 'L': if (in.charAt(current + 1) == 'L') { if (((current == (length - 3)) && stringAt(in, current - 1, 4, list56)) || ((stringAt(in, last - 1, 2, list57) || stringAt( in, last, 1, list58)) && stringAt(in, current - 1, 4, list59))) { MetaphAdd(primary, 'L'); current += 2; break; } current += 2; } else current += 1; MetaphAdd(primary, 'L'); break; case 'M': if ((stringAt(in, current - 1, 3, list60) && (((current + 1) == last) || stringAt( in, current + 2, 2, list61))) || (in.charAt(current + 1) == 'M')) current += 2; else current += 1; MetaphAdd(primary, 'M'); break; case 'N': if (in.charAt(current + 1) == 'N') current += 2; else current += 1; MetaphAdd(primary, 'N'); break; case '\u00D1': current += 1; MetaphAdd(primary, 'N'); break; case 'P': if (in.charAt(current + 1) == 'N') { MetaphAdd(primary, 'F'); current += 2; break; } if (stringAt(in, current + 1, 1, list62)) current += 2; else current += 1; MetaphAdd(primary, 'P'); break; case 'Q': if (in.charAt(current + 1) == 'Q') current += 2; else current += 1; MetaphAdd(primary, 'K'); break; case 'R': if ((current == last) && !isSlavoGermaic && stringAt(in, current - 2, 2, list63) && !stringAt(in, current - 4, 2, list64)) { // MetaphAdd(primary, ""); } else MetaphAdd(primary, 'R'); if (in.charAt(current + 1) == 'R') current += 2; else current += 1; break; case 'S': if (stringAt(in, current - 1, 3, list65)) { current += 1; break; } if ((current == 0) && stringAt(in, current, 5, list66)) { MetaphAdd(primary, 'X'); current += 1; break; } if (stringAt(in, current, 2, list67)) { if (stringAt(in, current + 1, 4, list68)) MetaphAdd(primary, 'S'); else MetaphAdd(primary, 'X'); current += 2; break; } if (stringAt(in, current, 3, list69) || stringAt(in, current, 4, list70)) { MetaphAdd(primary, 'S'); current += 3; break; } if (((current == 0) && stringAt(in, current + 1, 1, list71)) || stringAt(in, current + 1, 1, list72)) { MetaphAdd(primary, 'S'); if (stringAt(in, current + 1, 1, list73)) current += 2; else current += 1; break; } if (stringAt(in, current, 2, list74)) { if (in.charAt(current + 2) == 'H') if (stringAt(in, current + 3, 2, list75)) { if (stringAt(in, current + 3, 2, list76)) { MetaphAdd(primary, "X"); } else { MetaphAdd(primary, "SK"); } current += 3; break; } else { MetaphAdd(primary, 'X'); current += 3; break; } if (stringAt(in, current + 2, 1, list77)) { MetaphAdd(primary, 'S'); current += 3; break; } MetaphAdd(primary, "SK"); current += 3; break; } if ((current == last) && stringAt(in, current - 2, 2, list78)) { // MetaphAdd(primary, ""); } else MetaphAdd(primary, 'S'); if (stringAt(in, current + 1, 1, list79)) current += 2; else current += 1; break; case 'T': if (stringAt(in, current, 4, list80)) { MetaphAdd(primary, 'X'); current += 3; break; } if (stringAt(in, current, 3, list81)) { MetaphAdd(primary, 'X'); current += 3; break; } if (stringAt(in, current, 2, list82) || stringAt(in, current, 3, list83)) { if (stringAt(in, (current + 2), 2, list84) || stringAt(in, 0, 4, list85) || stringAt(in, 0, 3, list86)) { MetaphAdd(primary, 'T'); } else { MetaphAdd(primary, '0'); } current += 2; break; } if (stringAt(in, current + 1, 1, list87)) { current += 2; } else current += 1; MetaphAdd(primary, 'T'); break; case 'V': if (in.charAt(current + 1) == 'V') current += 2; else current += 1; MetaphAdd(primary, 'F'); break; case 'W': if (stringAt(in, current, 2, list88)) { MetaphAdd(primary, 'R'); current += 2; break; } if ((current == 0) && (isVowel(in, current + 1, length) || stringAt(in, current, 2, list89))) { MetaphAdd(primary, 'A'); } if (((current == last) && isVowel(in, current - 1, length)) || stringAt(in, current - 1, 5, list90) || stringAt(in, 0, 3, list91)) { MetaphAdd(primary, 'F'); current += 1; break; } if (stringAt(in, current, 4, list92)) { MetaphAdd(primary, "TS"); current += 4; break; } current += 1; break; case 'X': if (!((current == last) && (stringAt(in, current - 3, 3, list93) || stringAt( in, current - 2, 2, list94)))) MetaphAdd(primary, "KS"); if (stringAt(in, current + 1, 1, list95)) current += 2; else current += 1; break; case 'Z': if (in.charAt(current + 1) == 'H') { MetaphAdd(primary, 'J'); current += 2; break; } else { MetaphAdd(primary, 'S'); } if (in.charAt(current + 1) == 'Z') current += 2; else current += 1; break; default: current += 1; } } return primary.toString(); } /** * @see com.swabunga.spell.engine.Transformator#getReplaceList() */ public char[] getReplaceList() { return replaceList; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy