All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.Metaphone Maven / Gradle / Ivy

Go to download

The codec package contains simple encoder and decoders for various formats such as Base64 and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities.

There is a newer version: 20041127.091804
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a Metaphone value. 
 * 

* Initial Java implementation by William B. Brogden. December, 1997. * Permission given by wbrogden for code to be used anywhere. *

*

* Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p * 39. *

*

* Note, that this does not match the algorithm that ships with PHP, or the algorithm * found in the Perl Text:Metaphone-1.96. * They have had undocumented changes from the originally published algorithm. * For more information, see CODEC-57. *

* * @author Apache Software Foundation * @version $Id: Metaphone.java 1157192 2011-08-12 17:27:38Z ggregory $ */ public class Metaphone implements StringEncoder { /** * Five values in the English language */ private static final String VOWELS = "AEIOU" ; /** * Variable used in Metaphone algorithm */ private static final String FRONTV = "EIY" ; /** * Variable used in Metaphone algorithm */ private static final String VARSON = "CSPTG" ; /** * The max code length for metaphone is 4 */ private int maxCodeLen = 4 ; /** * Creates an instance of the Metaphone encoder */ public Metaphone() { super(); } /** * Find the metaphone value of a String. This is similar to the * soundex algorithm, but better at finding similar sounding words. * All input is converted to upper case. * Limitations: Input format is expected to be a single ASCII word * with only characters in the A - Z range, no punctuation or numbers. * * @param txt String to find the metaphone code for * @return A metaphone code corresponding to the String supplied */ public String metaphone(String txt) { boolean hard = false ; if ((txt == null) || (txt.length() == 0)) { return "" ; } // single character is itself if (txt.length() == 1) { return txt.toUpperCase(java.util.Locale.ENGLISH) ; } char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ; StringBuffer local = new StringBuffer(40); // manipulate StringBuffer code = new StringBuffer(10) ; // output // handle initial 2 characters exceptions switch(inwd[0]) { case 'K' : case 'G' : case 'P' : /* looking for KN, etc*/ if (inwd[1] == 'N') { local.append(inwd, 1, inwd.length - 1); } else { local.append(inwd); } break; case 'A': /* looking for AE */ if (inwd[1] == 'E') { local.append(inwd, 1, inwd.length - 1); } else { local.append(inwd); } break; case 'W' : /* looking for WR or WH */ if (inwd[1] == 'R') { // WR -> R local.append(inwd, 1, inwd.length - 1); break ; } if (inwd[1] == 'H') { local.append(inwd, 1, inwd.length - 1); local.setCharAt(0, 'W'); // WH -> W } else { local.append(inwd); } break; case 'X' : /* initial X becomes S */ inwd[0] = 'S'; local.append(inwd); break ; default : local.append(inwd); } // now local has working string with initials fixed int wdsz = local.length(); int n = 0 ; while ((code.length() < this.getMaxCodeLen()) && (n < wdsz) ) { // max code size of 4 works well char symb = local.charAt(n) ; // remove duplicate letters except C if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { n++ ; } else { // not dup switch(symb) { case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : if (n == 0) { code.append(symb); } break ; // only use vowel if leading char case 'B' : if ( isPreviousChar(local, n, 'M') && isLastChar(wdsz, n) ) { // B is silent if word ends in MB break; } code.append(symb); break; case 'C' : // lots of C special cases /* discard if SCI, SCE or SCY */ if ( isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { break; } if (regionMatch(local, n, "CIA")) { // "CIA" -> X code.append('X'); break; } if (!isLastChar(wdsz, n) && (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) { code.append('S'); break; // CI,CE,CY -> S } if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H') ) { // SCH->sk code.append('K') ; break ; } if (isNextChar(local, n, 'H')) { // detect CH if ((n == 0) && (wdsz >= 3) && isVowel(local,2) ) { // CH consonant -> K consonant code.append('K'); } else { code.append('X'); // CHvowel -> X } } else { code.append('K'); } break ; case 'D' : if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J code.append('J'); n += 2 ; } else { code.append('T'); } break ; case 'G' : // GH silent at end or before consonant if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) { break; } if (!isLastChar(wdsz, n + 1) && isNextChar(local,n,'H') && !isVowel(local,n+2)) { break; } if ((n > 0) && ( regionMatch(local, n, "GN") || regionMatch(local, n, "GNED") ) ) { break; // silent G } if (isPreviousChar(local, n, 'G')) { // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true hard = true ; } else { hard = false ; } if (!isLastChar(wdsz, n) && (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && (!hard)) { code.append('J'); } else { code.append('K'); } break ; case 'H': if (isLastChar(wdsz, n)) { break ; // terminal H } if ((n > 0) && (VARSON.indexOf(local.charAt(n - 1)) >= 0)) { break; } if (isVowel(local,n+1)) { code.append('H'); // Hvowel } break; case 'F': case 'J' : case 'L' : case 'M': case 'N' : case 'R' : code.append(symb); break; case 'K' : if (n > 0) { // not initial if (!isPreviousChar(local, n, 'C')) { code.append(symb); } } else { code.append(symb); // initial K } break ; case 'P' : if (isNextChar(local,n,'H')) { // PH -> F code.append('F'); } else { code.append(symb); } break ; case 'Q' : code.append('K'); break; case 'S' : if (regionMatch(local,n,"SH") || regionMatch(local,n,"SIO") || regionMatch(local,n,"SIA")) { code.append('X'); } else { code.append('S'); } break; case 'T' : if (regionMatch(local,n,"TIA") || regionMatch(local,n,"TIO")) { code.append('X'); break; } if (regionMatch(local,n,"TCH")) { // Silent if in "TCH" break; } // substitute numeral 0 for TH (resembles theta after all) if (regionMatch(local,n,"TH")) { code.append('0'); } else { code.append('T'); } break ; case 'V' : code.append('F'); break ; case 'W' : case 'Y' : // silent if not followed by vowel if (!isLastChar(wdsz,n) && isVowel(local,n+1)) { code.append(symb); } break ; case 'X' : code.append('K'); code.append('S'); break ; case 'Z' : code.append('S'); break ; } // end switch n++ ; } // end else from symb != 'C' if (code.length() > this.getMaxCodeLen()) { code.setLength(this.getMaxCodeLen()); } } return code.toString(); } private boolean isVowel(StringBuffer string, int index) { return VOWELS.indexOf(string.charAt(index)) >= 0; } private boolean isPreviousChar(StringBuffer string, int index, char c) { boolean matches = false; if( index > 0 && index < string.length() ) { matches = string.charAt(index - 1) == c; } return matches; } private boolean isNextChar(StringBuffer string, int index, char c) { boolean matches = false; if( index >= 0 && index < string.length() - 1 ) { matches = string.charAt(index + 1) == c; } return matches; } private boolean regionMatch(StringBuffer string, int index, String test) { boolean matches = false; if( index >= 0 && (index + test.length() - 1) < string.length() ) { String substring = string.substring( index, index + test.length()); matches = substring.equals( test ); } return matches; } private boolean isLastChar(int wdsz, int n) { return n + 1 == wdsz; } /** * Encodes an Object using the metaphone algorithm. This method * is provided in order to satisfy the requirements of the * Encoder interface, and will throw an EncoderException if the * supplied object is not of type java.lang.String. * * @param pObject Object to encode * @return An object (or type java.lang.String) containing the * metaphone code which corresponds to the String supplied. * @throws EncoderException if the parameter supplied is not * of type java.lang.String */ public Object encode(Object pObject) throws EncoderException { if (!(pObject instanceof String)) { throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); } return metaphone((String) pObject); } /** * Encodes a String using the Metaphone algorithm. * * @param pString String object to encode * @return The metaphone code corresponding to the String supplied */ public String encode(String pString) { return metaphone(pString); } /** * Tests is the metaphones of two strings are identical. * * @param str1 First of two strings to compare * @param str2 Second of two strings to compare * @return true if the metaphones of these strings are identical, * false otherwise. */ public boolean isMetaphoneEqual(String str1, String str2) { return metaphone(str1).equals(metaphone(str2)); } /** * Returns the maxCodeLen. * @return int */ public int getMaxCodeLen() { return this.maxCodeLen; } /** * Sets the maxCodeLen. * @param maxCodeLen The maxCodeLen to set */ public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy