All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.DaitchMokotoffSoundex Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.codec.language;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.apache.commons.codec.CharEncoding;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.Resources;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a Daitch-Mokotoff Soundex value.
 * 

* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. *

*

* The main differences compared to the other soundex variants are: *

*
    *
  • coded names are 6 digits long *
  • the initial character of the name is coded *
  • rules to encoded multi-character n-grams *
  • multiple possible encodings for the same name (branching) *
*

* This implementation supports branching, depending on the used method: *

    *
  • {@link #encode(String)} - branching disabled, only the first code will be returned *
  • {@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' *
*

* Note: this implementation has additional branching rules compared to the original description of the algorithm. The * rules can be customized by overriding the default rules contained in the resource file * {@code org/apache/commons/codec/language/dmrules.txt}. *

*

* This class is thread-safe. *

* * @see Soundex * @see Wikipedia - Daitch-Mokotoff Soundex * @see Avotaynu - Soundexing and Genealogy * * @since 1.10 */ public class DaitchMokotoffSoundex implements StringEncoder { /** * Inner class representing a branch during DM soundex encoding. */ private static final class Branch { private final StringBuilder builder; private String cachedString; private String lastReplacement; private Branch() { builder = new StringBuilder(); lastReplacement = null; cachedString = null; } /** * Creates a new branch, identical to this branch. * * @return a new, identical branch */ public Branch createBranch() { final Branch branch = new Branch(); branch.builder.append(toString()); branch.lastReplacement = this.lastReplacement; return branch; } @Override public boolean equals(final Object other) { if (this == other) { return true; } if (!(other instanceof Branch)) { return false; } return toString().equals(((Branch) other).toString()); } /** * Finish this branch by appending '0's until the maximum code length has been reached. */ public void finish() { while (builder.length() < MAX_LENGTH) { builder.append('0'); cachedString = null; } } @Override public int hashCode() { return toString().hashCode(); } /** * Process the next replacement to be added to this branch. * * @param replacement * the next replacement to append * @param forceAppend * indicates if the default processing shall be overridden */ public void processNextReplacement(final String replacement, final boolean forceAppend) { final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; if (append && builder.length() < MAX_LENGTH) { builder.append(replacement); // remove all characters after the maximum length if (builder.length() > MAX_LENGTH) { builder.delete(MAX_LENGTH, builder.length()); } cachedString = null; } lastReplacement = replacement; } @Override public String toString() { if (cachedString == null) { cachedString = builder.toString(); } return cachedString; } } /** * Inner class for storing rules. */ private static final class Rule { private final String pattern; private final String[] replacementAtStart; private final String[] replacementBeforeVowel; private final String[] replacementDefault; protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, final String replacementDefault) { this.pattern = pattern; this.replacementAtStart = replacementAtStart.split("\\|"); this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); this.replacementDefault = replacementDefault.split("\\|"); } public int getPatternLength() { return pattern.length(); } public String[] getReplacements(final String context, final boolean atStart) { if (atStart) { return replacementAtStart; } final int nextIndex = getPatternLength(); final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; if (nextCharIsVowel) { return replacementBeforeVowel; } return replacementDefault; } private boolean isVowel(final char ch) { return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; } public boolean matches(final String context) { return context.startsWith(pattern); } @Override public String toString() { return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); } } private static final String COMMENT = "//"; private static final String DOUBLE_QUOTE = "\""; private static final String MULTILINE_COMMENT_END = "*/"; private static final String MULTILINE_COMMENT_START = "/*"; /** The resource file containing the replacement and folding rules */ private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; /** The code length of a DM soundex value. */ private static final int MAX_LENGTH = 6; /** Transformation rules indexed by the first character of their pattern. */ private static final Map> RULES = new HashMap<>(); /** Folding rules. */ private static final Map FOLDINGS = new HashMap<>(); static { try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); } // sort RULES by pattern length in descending order for (final Map.Entry> rule : RULES.entrySet()) { final List ruleList = rule.getValue(); Collections.sort(ruleList, new Comparator() { @Override public int compare(final Rule rule1, final Rule rule2) { return rule2.getPatternLength() - rule1.getPatternLength(); } }); } } private static void parseRules(final Scanner scanner, final String location, final Map> ruleMapping, final Map asciiFoldings) { int currentLine = 0; boolean inMultilineComment = false; while (scanner.hasNextLine()) { currentLine++; final String rawLine = scanner.nextLine(); String line = rawLine; if (inMultilineComment) { if (line.endsWith(MULTILINE_COMMENT_END)) { inMultilineComment = false; } continue; } if (line.startsWith(MULTILINE_COMMENT_START)) { inMultilineComment = true; } else { // discard comments final int cmtI = line.indexOf(COMMENT); if (cmtI >= 0) { line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.length() == 0) { continue; // empty lines can be safely skipped } if (line.contains("=")) { // folding final String[] parts = line.split("="); if (parts.length != 2) { throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location); } final String leftCharacter = parts[0]; final String rightCharacter = parts[1]; if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { throw new IllegalArgumentException("Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location); } asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); } else { // rule final String[] parts = line.split("\\s+"); if (parts.length != 4) { throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); } try { final String pattern = stripQuotes(parts[0]); final String replacement1 = stripQuotes(parts[1]); final String replacement2 = stripQuotes(parts[2]); final String replacement3 = stripQuotes(parts[3]); final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); final char patternKey = r.pattern.charAt(0); List rules = ruleMapping.get(patternKey); if (rules == null) { rules = new ArrayList<>(); ruleMapping.put(patternKey, rules); } rules.add(r); } catch (final IllegalArgumentException e) { throw new IllegalStateException( "Problem parsing line '" + currentLine + "' in " + location, e); } } } } } private static String stripQuotes(String str) { if (str.startsWith(DOUBLE_QUOTE)) { str = str.substring(1); } if (str.endsWith(DOUBLE_QUOTE)) { str = str.substring(0, str.length() - 1); } return str; } /** Whether to use ASCII folding prior to encoding. */ private final boolean folding; /** * Creates a new instance with ASCII-folding enabled. */ public DaitchMokotoffSoundex() { this(true); } /** * Creates a new instance. *

* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. * è -> e. *

* * @param folding * if ASCII-folding shall be performed before encoding */ public DaitchMokotoffSoundex(final boolean folding) { this.folding = folding; } /** * Performs a cleanup of the input string before the actual soundex transformation. *

* Removes all whitespace characters and performs ASCII folding if enabled. *

* * @param input * the input string to cleanup * @return a cleaned up string */ private String cleanup(final String input) { final StringBuilder sb = new StringBuilder(); for (char ch : input.toCharArray()) { if (Character.isWhitespace(ch)) { continue; } ch = Character.toLowerCase(ch); if (folding && FOLDINGS.containsKey(ch)) { ch = FOLDINGS.get(ch); } sb.append(ch); } return sb.toString(); } /** * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. *

* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an * EncoderException if the supplied object is not of type java.lang.String. *

* * @see #soundex(String) * * @param obj * Object to encode * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String * supplied. * @throws EncoderException * if the parameter supplied is not of type java.lang.String * @throws IllegalArgumentException * if a character is not mapped */ @Override public Object encode(final Object obj) throws EncoderException { if (!(obj instanceof String)) { throw new EncoderException( "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); } return encode((String) obj); } /** * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. * * @see #soundex(String) * * @param source * A String object to encode * @return A DM Soundex code corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ @Override public String encode(final String source) { if (source == null) { return null; } return soundex(source, false)[0]; } /** * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. *

* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, * separated by '|'. *

*

* Example: the name "AUERBACH" is encoded as both *

*
    *
  • 097400
  • *
  • 097500
  • *
*

* Thus the result will be "097400|097500". *

* * @param source * A String object to encode * @return A string containing a set of DM Soundex codes corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */ public String soundex(final String source) { final String[] branches = soundex(source, true); final StringBuilder sb = new StringBuilder(); int index = 0; for (final String branch : branches) { sb.append(branch); if (++index < branches.length) { sb.append('|'); } } return sb.toString(); } /** * Perform the actual DM Soundex algorithm on the input string. * * @param source * A String object to encode * @param branching * If branching shall be performed * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the * selected branching mode */ private String[] soundex(final String source, final boolean branching) { if (source == null) { return null; } final String input = cleanup(source); final Set currentBranches = new LinkedHashSet<>(); currentBranches.add(new Branch()); char lastChar = '\0'; for (int index = 0; index < input.length(); index++) { final char ch = input.charAt(index); // ignore whitespace inside a name if (Character.isWhitespace(ch)) { continue; } final String inputContext = input.substring(index); final List rules = RULES.get(ch); if (rules == null) { continue; } // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access final List nextBranches = branching ? new ArrayList() : Collections.emptyList(); for (final Rule rule : rules) { if (rule.matches(inputContext)) { if (branching) { nextBranches.clear(); } final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); final boolean branchingRequired = replacements.length > 1 && branching; for (final Branch branch : currentBranches) { for (final String nextReplacement : replacements) { // if we have multiple replacements, always create a new branch final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; // special rule: occurrences of mn or nm are treated differently final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); nextBranch.processNextReplacement(nextReplacement, force); if (branching) { nextBranches.add(nextBranch); } else { break; } } } if (branching) { currentBranches.clear(); currentBranches.addAll(nextBranches); } index += rule.getPatternLength() - 1; break; } } lastChar = ch; } final String[] result = new String[currentBranches.size()]; int index = 0; for (final Branch branch : currentBranches) { branch.finish(); result[index++] = branch.toString(); } return result; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy