All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.Nysiis Maven / Gradle / Ivy

Go to download

The Apache Commons Codec package contains simple encoder and decoders for various formats such as Base64 and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities.

There is a newer version: 20041127.091804
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.regex.Pattern;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
 * general purpose scheme to find word with similar phonemes.
 * 

* NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm. *

* Algorithm description: *

 * 1. Transcode first characters of name
 *   1a. MAC ->   MCC
 *   1b. KN  ->   NN
 *   1c. K   ->   C
 *   1d. PH  ->   FF
 *   1e. PF  ->   FF
 *   1f. SCH ->   SSS
 * 2. Transcode last characters of name
 *   2a. EE, IE          ->   Y
 *   2b. DT,RT,RD,NT,ND  ->   D
 * 3. First character of key = first character of name
 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
 *   4a. EV  ->   AF  else A,E,I,O,U -> A
 *   4b. Q   ->   G
 *   4c. Z   ->   S
 *   4d. M   ->   N
 *   4e. KN  ->   N   else K -> C
 *   4f. SCH ->   SSS
 *   4g. PH  ->   FF
 *   4h. H   ->   If previous or next is nonvowel, previous
 *   4i. W   ->   If previous is vowel, previous
 *   4j. Add current to key if current != last key character
 * 5. If last character is S, remove it
 * 6. If last characters are AY, replace with Y
 * 7. If last character is A, remove it
 * 8. Collapse all strings of repeated characters
 * 9. Add original first character of name as first character of key
 * 
*

* This class is immutable and thread-safe. * * @see NYSIIS on Wikipedia * @see NYSIIS on dropby.com * @see Soundex * @since 1.7 * @version $Id: Nysiis.java 1429868 2013-01-07 16:08:05Z ggregory $ */ public class Nysiis implements StringEncoder { private static final char[] CHARS_A = new char[] { 'A' }; private static final char[] CHARS_AF = new char[] { 'A', 'F' }; private static final char[] CHARS_C = new char[] { 'C' }; private static final char[] CHARS_FF = new char[] { 'F', 'F' }; private static final char[] CHARS_G = new char[] { 'G' }; private static final char[] CHARS_N = new char[] { 'N' }; private static final char[] CHARS_NN = new char[] { 'N', 'N' }; private static final char[] CHARS_S = new char[] { 'S' }; private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' }; private static final Pattern PAT_MAC = Pattern.compile("^MAC"); private static final Pattern PAT_KN = Pattern.compile("^KN"); private static final Pattern PAT_K = Pattern.compile("^K"); private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); private static final Pattern PAT_SCH = Pattern.compile("^SCH"); private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); private static final char SPACE = ' '; private static final int TRUE_LENGTH = 6; /** * Tests if the given character is a vowel. * * @param c * the character to test * @return {@code true} if the character is a vowel, {@code false} otherwise */ private static boolean isVowel(final char c) { return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; } /** * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at * a time: [i-1, i, i+1, i+2]. * * @param prev * the previous character * @param curr * the current character * @param next * the next character * @param aNext * the after next character * @return a transcoded array of characters, starting from the current position */ private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { // 1. EV -> AF if (curr == 'E' && next == 'V') { return CHARS_AF; } // A, E, I, O, U -> A if (isVowel(curr)) { return CHARS_A; } // 2. Q -> G, Z -> S, M -> N if (curr == 'Q') { return CHARS_G; } else if (curr == 'Z') { return CHARS_S; } else if (curr == 'M') { return CHARS_N; } // 3. KN -> NN else K -> C if (curr == 'K') { if (next == 'N') { return CHARS_NN; } else { return CHARS_C; } } // 4. SCH -> SSS if (curr == 'S' && next == 'C' && aNext == 'H') { return CHARS_SSS; } // PH -> FF if (curr == 'P' && next == 'H') { return CHARS_FF; } // 5. H -> If previous or next is a non vowel, previous. if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { return new char[] { prev }; } // 6. W -> If previous is vowel, previous. if (curr == 'W' && isVowel(prev)) { return new char[] { prev }; } return new char[] { curr }; } /** Indicates the strict mode. */ private final boolean strict; /** * Creates an instance of the {@link Nysiis} encoder with strict mode (original form), * i.e. encoded strings have a maximum length of 6. */ public Nysiis() { this(true); } /** * Create an instance of the {@link Nysiis} encoder with the specified strict mode: * *

    *
  • {@code true}: encoded strings have a maximum length of 6
  • *
  • {@code false}: encoded strings may have arbitrary length
  • *
* * @param strict * the strict mode */ public Nysiis(final boolean strict) { this.strict = strict; } /** * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type * {@link String}. * * @param obj * Object to encode * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String. * @throws EncoderException * if the parameter supplied is not of a {@link String} * @throws IllegalArgumentException * if a character is not mapped */ @Override public Object encode(final Object obj) throws EncoderException { if (!(obj instanceof String)) { throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); } return this.nysiis((String) obj); } /** * Encodes a String using the NYSIIS algorithm. * * @param str * A String object to encode * @return A Nysiis code corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ @Override public String encode(final String str) { return this.nysiis(str); } /** * Indicates the strict mode for this {@link Nysiis} encoder. * * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise */ public boolean isStrict() { return this.strict; } /** * Retrieves the NYSIIS code for a given String object. * * @param str * String to encode using the NYSIIS algorithm * @return A NYSIIS code for the String supplied */ public String nysiis(String str) { if (str == null) { return null; } // Use the same clean rules as Soundex str = SoundexUtils.clean(str); if (str.length() == 0) { return str; } // Translate first characters of name: // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS str = PAT_MAC.matcher(str).replaceFirst("MCC"); str = PAT_KN.matcher(str).replaceFirst("NN"); str = PAT_K.matcher(str).replaceFirst("C"); str = PAT_PH_PF.matcher(str).replaceFirst("FF"); str = PAT_SCH.matcher(str).replaceFirst("SSS"); // Translate last characters of name: // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D str = PAT_EE_IE.matcher(str).replaceFirst("Y"); str = PAT_DT_ETC.matcher(str).replaceFirst("D"); // First character of key = first character of name. final StringBuilder key = new StringBuilder(str.length()); key.append(str.charAt(0)); // Transcode remaining characters, incrementing by one character each time final char[] chars = str.toCharArray(); final int len = chars.length; for (int i = 1; i < len; i++) { final char next = i < len - 1 ? chars[i + 1] : SPACE; final char aNext = i < len - 2 ? chars[i + 2] : SPACE; final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); System.arraycopy(transcoded, 0, chars, i, transcoded.length); // only append the current char to the key if it is different from the last one if (chars[i] != chars[i - 1]) { key.append(chars[i]); } } if (key.length() > 1) { char lastChar = key.charAt(key.length() - 1); // If last character is S, remove it. if (lastChar == 'S') { key.deleteCharAt(key.length() - 1); lastChar = key.charAt(key.length() - 1); } if (key.length() > 2) { final char last2Char = key.charAt(key.length() - 2); // If last characters are AY, replace with Y. if (last2Char == 'A' && lastChar == 'Y') { key.deleteCharAt(key.length() - 2); } } // If last character is A, remove it. if (lastChar == 'A') { key.deleteCharAt(key.length() - 1); } } final String string = key.toString(); return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy