All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.Nysiis Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.regex.Pattern;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
 * general purpose scheme to find word with similar phonemes.
 * 

* NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm. *

*

* Algorithm description: *

*
 * 1. Transcode first characters of name
 *   1a. MAC ->   MCC
 *   1b. KN  ->   NN
 *   1c. K   ->   C
 *   1d. PH  ->   FF
 *   1e. PF  ->   FF
 *   1f. SCH ->   SSS
 * 2. Transcode last characters of name
 *   2a. EE, IE          ->   Y
 *   2b. DT,RT,RD,NT,ND  ->   D
 * 3. First character of key = first character of name
 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
 *   4a. EV  ->   AF  else A,E,I,O,U -> A
 *   4b. Q   ->   G
 *   4c. Z   ->   S
 *   4d. M   ->   N
 *   4e. KN  ->   N   else K -> C
 *   4f. SCH ->   SSS
 *   4g. PH  ->   FF
 *   4h. H   ->   If previous or next is non-vowel, previous
 *   4i. W   ->   If previous is vowel, previous
 *   4j. Add current to key if current != last key character
 * 5. If last character is S, remove it
 * 6. If last characters are AY, replace with Y
 * 7. If last character is A, remove it
 * 8. Collapse all strings of repeated characters
 * 9. Add original first character of name as first character of key
 * 
*

* This class is immutable and thread-safe. *

* * @see NYSIIS on Wikipedia * @see NYSIIS on dropby.com * @see Soundex * @since 1.7 */ public class Nysiis implements StringEncoder { private static final char[] CHARS_A = { 'A' }; private static final char[] CHARS_AF = { 'A', 'F' }; private static final char[] CHARS_C = { 'C' }; private static final char[] CHARS_FF = { 'F', 'F' }; private static final char[] CHARS_G = { 'G' }; private static final char[] CHARS_N = { 'N' }; private static final char[] CHARS_NN = { 'N', 'N' }; private static final char[] CHARS_S = { 'S' }; private static final char[] CHARS_SSS = { 'S', 'S', 'S' }; private static final Pattern PAT_MAC = Pattern.compile("^MAC"); private static final Pattern PAT_KN = Pattern.compile("^KN"); private static final Pattern PAT_K = Pattern.compile("^K"); private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); private static final Pattern PAT_SCH = Pattern.compile("^SCH"); private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); private static final char SPACE = ' '; private static final int TRUE_LENGTH = 6; /** * Tests if the given character is a vowel. * * @param c * the character to test * @return {@code true} if the character is a vowel, {@code false} otherwise */ private static boolean isVowel(final char c) { return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; } /** * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at * a time: [i-1, i, i+1, i+2]. * * @param prev * the previous character * @param curr * the current character * @param next * the next character * @param aNext * the after next character * @return a transcoded array of characters, starting from the current position */ private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { // 1. EV -> AF if (curr == 'E' && next == 'V') { return CHARS_AF; } // A, E, I, O, U -> A if (isVowel(curr)) { return CHARS_A; } // 2. Q -> G, Z -> S, M -> N // 3. KN -> NN else K -> C switch (curr) { case 'Q': return CHARS_G; case 'Z': return CHARS_S; case 'M': return CHARS_N; case 'K': if (next == 'N') { return CHARS_NN; } return CHARS_C; default: break; } // 4. SCH -> SSS if (curr == 'S' && next == 'C' && aNext == 'H') { return CHARS_SSS; } // PH -> FF if (curr == 'P' && next == 'H') { return CHARS_FF; } // 5. H -> If previous or next is a non vowel, previous. if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { return new char[] { prev }; } // 6. W -> If previous is vowel, previous. if (curr == 'W' && isVowel(prev)) { return new char[] { prev }; } return new char[] { curr }; } /** Indicates the strict mode. */ private final boolean strict; /** * Creates an instance of the {@link Nysiis} encoder with strict mode (original form), * i.e. encoded strings have a maximum length of 6. */ public Nysiis() { this(true); } /** * Create an instance of the {@link Nysiis} encoder with the specified strict mode: * *
    *
  • {@code true}: encoded strings have a maximum length of 6
  • *
  • {@code false}: encoded strings may have arbitrary length
  • *
* * @param strict * the strict mode */ public Nysiis(final boolean strict) { this.strict = strict; } /** * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type * {@link String}. * * @param obj * Object to encode * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String. * @throws EncoderException * if the parameter supplied is not of a {@link String} * @throws IllegalArgumentException * if a character is not mapped */ @Override public Object encode(final Object obj) throws EncoderException { if (!(obj instanceof String)) { throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); } return this.nysiis((String) obj); } /** * Encodes a String using the NYSIIS algorithm. * * @param str * A String object to encode * @return A Nysiis code corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ @Override public String encode(final String str) { return this.nysiis(str); } /** * Indicates the strict mode for this {@link Nysiis} encoder. * * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise */ public boolean isStrict() { return this.strict; } /** * Retrieves the NYSIIS code for a given String object. * * @param str * String to encode using the NYSIIS algorithm * @return A NYSIIS code for the String supplied */ public String nysiis(String str) { if (str == null) { return null; } // Use the same clean rules as Soundex str = SoundexUtils.clean(str); if (str.isEmpty()) { return str; } // Translate first characters of name: // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS str = PAT_MAC.matcher(str).replaceFirst("MCC"); str = PAT_KN.matcher(str).replaceFirst("NN"); str = PAT_K.matcher(str).replaceFirst("C"); str = PAT_PH_PF.matcher(str).replaceFirst("FF"); str = PAT_SCH.matcher(str).replaceFirst("SSS"); // Translate last characters of name: // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D str = PAT_EE_IE.matcher(str).replaceFirst("Y"); str = PAT_DT_ETC.matcher(str).replaceFirst("D"); // First character of key = first character of name. final StringBuilder key = new StringBuilder(str.length()); key.append(str.charAt(0)); // Transcode remaining characters, incrementing by one character each time final char[] chars = str.toCharArray(); final int len = chars.length; for (int i = 1; i < len; i++) { final char next = i < len - 1 ? chars[i + 1] : SPACE; final char aNext = i < len - 2 ? chars[i + 2] : SPACE; final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); System.arraycopy(transcoded, 0, chars, i, transcoded.length); // only append the current char to the key if it is different from the last one if (chars[i] != chars[i - 1]) { key.append(chars[i]); } } if (key.length() > 1) { char lastChar = key.charAt(key.length() - 1); // If last character is S, remove it. if (lastChar == 'S') { key.deleteCharAt(key.length() - 1); lastChar = key.charAt(key.length() - 1); } if (key.length() > 2) { final char last2Char = key.charAt(key.length() - 2); // If last characters are AY, replace with Y. if (last2Char == 'A' && lastChar == 'Y') { key.deleteCharAt(key.length() - 2); } } // If last character is A, remove it. if (lastChar == 'A') { key.deleteCharAt(key.length() - 1); } } final String string = key.toString(); return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy