com.groupbyinc.common.apache.commons.codec.language.MatchRatingApproachEncoder Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language;
import java.util.Locale;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
* Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977.
*
* This class is immutable and thread-safe.
*
* @see Wikipedia - Match Rating Approach
* @since 1.8
*/
public class MatchRatingApproachEncoder implements StringEncoder {
private static final String SPACE = " ";
private static final String EMPTY = "";
/**
* Constants used mainly for the min rating value.
*/
private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7,
ELEVEN = 11, TWELVE = 12;
/**
* The plain letter equivalent of the accented letters.
*/
private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
"AaEeIiOoUuYy" + // acute
"AaEeIiOoUuYy" + // circumflex
"AaOoNn" + // tilde
"AaEeIiOoUuYy" + // umlaut
"Aa" + // ring
"Cc" + // cedilla
"OoUu"; // double acute
/**
* Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
*/
private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
"\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
"\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
"\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
"\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
"\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
private static final String[] DOUBLE_CONSONANT =
new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
"TT", "VV", "WW", "XX", "YY", "ZZ" };
/**
* Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
* spaces.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param name
* The name to be cleaned
* @return The cleaned name
*/
String cleanName(final String name) {
String upperName = name.toUpperCase(Locale.ENGLISH);
final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
for (final String str : charsToTrim) {
upperName = upperName.replaceAll(str, EMPTY);
}
upperName = removeAccents(upperName);
upperName = upperName.replaceAll("\\s+", EMPTY);
return upperName;
}
/**
* Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
* Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
*
* @param pObject
* Object to encode
* @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
* String supplied.
* @throws EncoderException
* if the parameter supplied is not of type java.lang.String
*/
@Override
public final Object encode(final Object pObject) throws EncoderException {
if (!(pObject instanceof String)) {
throw new EncoderException(
"Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
}
return encode((String) pObject);
}
/**
* Encodes a String using the Match Rating Approach (MRA) algorithm.
*
* @param name
* String object to encode
* @return The MRA code corresponding to the String supplied
*/
@Override
public final String encode(String name) {
// Bulletproof for trivial input - NINO
if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
return EMPTY;
}
// Preprocessing
name = cleanName(name);
// BEGIN: Actual encoding part of the algorithm...
// 1. Delete all vowels unless the vowel begins the word
name = removeVowels(name);
// 2. Remove second consonant from any double consonant
name = removeDoubleConsonants(name);
// 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
name = getFirst3Last3(name);
return name;
}
/**
* Gets the first and last 3 letters of a name (if > 6 characters) Else just returns the name.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param name
* The string to get the substrings from
* @return Annexed first and last 3 letters of input word.
*/
String getFirst3Last3(final String name) {
final int nameLength = name.length();
if (nameLength > SIX) {
final String firstThree = name.substring(0, THREE);
final String lastThree = name.substring(nameLength - THREE, nameLength);
return firstThree + lastThree;
}
return name;
}
/**
* Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
* min rating. Values strictly from documentation.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param sumLength
* The length of 2 strings sent down
* @return The min rating value
*/
int getMinRating(final int sumLength) {
int minRating = 0;
if (sumLength <= FOUR) {
minRating = FIVE;
} else if (sumLength <= SEVEN) { // aready know it is at least 5
minRating = FOUR;
} else if (sumLength <= ELEVEN) { // aready know it is at least 8
minRating = THREE;
} else if (sumLength == TWELVE) {
minRating = TWO;
} else {
minRating = ONE; // docs said little here.
}
return minRating;
}
/**
* Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
* strings are cleaned in the same way as {@link #encode(String)}.
*
* @param name1
* First of the 2 strings (names) to compare
* @param name2
* Second of the 2 names to compare
* @return true
if the encodings are identical false
otherwise.
*/
public boolean isEncodeEquals(String name1, String name2) {
// Bulletproof for trivial input - NINO
if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
return false;
} else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
return false;
} else if (name1.length() == 1 || name2.length() == 1) {
return false;
} else if (name1.equalsIgnoreCase(name2)) {
return true;
}
// Preprocessing
name1 = cleanName(name1);
name2 = cleanName(name2);
// Actual MRA Algorithm
// 1. Remove vowels
name1 = removeVowels(name1);
name2 = removeVowels(name2);
// 2. Remove double consonants
name1 = removeDoubleConsonants(name1);
name2 = removeDoubleConsonants(name2);
// 3. Reduce down to 3 letters
name1 = getFirst3Last3(name1);
name2 = getFirst3Last3(name2);
// 4. Check for length difference - if 3 or greater then no similarity
// comparison is done
if (Math.abs(name1.length() - name2.length()) >= THREE) {
return false;
}
// 5. Obtain the minimum rating value by calculating the length sum of the
// encoded Strings and sending it down.
final int sumLength = Math.abs(name1.length() + name2.length());
int minRating = 0;
minRating = getMinRating(sumLength);
// 6. Process the encoded Strings from left to right and remove any
// identical characters found from both Strings respectively.
final int count = leftToRightThenRightToLeftProcessing(name1, name2);
// 7. Each PNI item that has a similarity rating equal to or greater than
// the min is considered to be a good candidate match
return count >= minRating;
}
/**
* Determines if a letter is a vowel.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param letter
* The letter under investiagtion
* @return True if a vowel, else false
*/
boolean isVowel(final String letter) {
return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
}
/**
* Processes the names from left to right (first) then right to left removing identical letters in same positions.
* Then subtracts the longer string that remains from 6 and returns this.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param name1
* name2
* @return the length as above
*/
int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
final char[] name1Char = name1.toCharArray();
final char[] name2Char = name2.toCharArray();
final int name1Size = name1.length() - 1;
final int name2Size = name2.length() - 1;
String name1LtRStart = EMPTY;
String name1LtREnd = EMPTY;
String name2RtLStart = EMPTY;
String name2RtLEnd = EMPTY;
for (int i = 0; i < name1Char.length; i++) {
if (i > name2Size) {
break;
}
name1LtRStart = name1.substring(i, i + 1);
name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
name2RtLStart = name2.substring(i, i + 1);
name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
// Left to right...
if (name1LtRStart.equals(name2RtLStart)) {
name1Char[i] = ' ';
name2Char[i] = ' ';
}
// Right to left...
if (name1LtREnd.equals(name2RtLEnd)) {
name1Char[name1Size - i] = ' ';
name2Char[name2Size - i] = ' ';
}
}
// Char arrays -> string & remove extraneous space
final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
// Final bit - subtract longest string from 6 and return this int value
if (strA.length() > strB.length()) {
return Math.abs(SIX - strA.length());
}
return Math.abs(SIX - strB.length());
}
/**
* Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
* http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
*
* @param accentedWord
* The word that may have accents in it.
* @return De-accented word
*/
String removeAccents(final String accentedWord) {
if (accentedWord == null) {
return null;
}
final StringBuilder sb = new StringBuilder();
final int n = accentedWord.length();
for (int i = 0; i < n; i++) {
final char c = accentedWord.charAt(i);
final int pos = UNICODE.indexOf(c);
if (pos > -1) {
sb.append(PLAIN_ASCII.charAt(pos));
} else {
sb.append(c);
}
}
return sb.toString();
}
/**
* Replaces any double consonant pair with the single letter equivalent.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param name
* String to have double consonants removed
* @return Single consonant word
*/
String removeDoubleConsonants(final String name) {
String replacedName = name.toUpperCase(Locale.ENGLISH);
for (final String dc : DOUBLE_CONSONANT) {
if (replacedName.contains(dc)) {
final String singleLetter = dc.substring(0, 1);
replacedName = replacedName.replace(dc, singleLetter);
}
}
return replacedName;
}
/**
* Deletes all vowels unless the vowel begins the word.
*
* API Usage
*
* Consider this method private, it is package protected for unit testing only.
*
*
* @param name
* The name to have vowels removed
* @return De-voweled word
*/
String removeVowels(String name) {
// Extract first letter
final String firstLetter = name.substring(0, 1);
name = name.replaceAll("A", EMPTY);
name = name.replaceAll("E", EMPTY);
name = name.replaceAll("I", EMPTY);
name = name.replaceAll("O", EMPTY);
name = name.replaceAll("U", EMPTY);
name = name.replaceAll("\\s{2,}\\b", SPACE);
// return isVowel(firstLetter) ? (firstLetter + name) : name;
if (isVowel(firstLetter)) {
return firstLetter + name;
}
return name;
}
}