// org.apache.commons.codec.language.bm.BeiderMorseEncoder (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language.bm;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
*
* Encodes strings into their Beider-Morse phonetic encoding.
*
*
* Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range of words.
*
*
* This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it may not be
* thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
*
*
* Encoding overview
*
*
* Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
* language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French. Next,
* the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
* can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
* different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
* this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
* into account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
* representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
* pronounced in several ways in the source language have only one way to represent them in this average phonetic
* language, so the result is again a set of phonetic spellings.
*
*
*
* Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
* this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
* Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As sometimes it is
* ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
* and once without it. The resulting encoding contains one and then the other result.
*
*
*
* Encoding format
*
* Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
* are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple hyphenated
* words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and these blocks
* are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form without prefix encodes to
* "{@code ortlaj|ortlej}", while the form with prefix encodes to "{@code dortlaj|dortlej}". Thus, the full, combined encoding is
* "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
*
*
* The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
* potential phonetic interpretations. For example, "{@code Renault}" encodes to
* "{@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger
* encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
* Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
* splitting on pipe ({@code |}) and indexing under each of these alternatives.
*
*
* @author Apache Software Foundation
* @since 1.6
*/
public class BeiderMorseEncoder implements StringEncoder {

    // Implementation note: this class is a bean-style (spring-friendly) facade over PhoneticEngine.
    // Every setter swaps in a freshly constructed PhoneticEngine carrying the new setting, and all
    // encoding work is delegated to whichever engine instance is currently held.

    /** The engine currently delegated to; starts as GENERIC / APPROX with concatenation enabled. */
    private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);

    /**
     * Encodes an object, which must be a {@link String}, by delegating to {@link #encode(String)}.
     *
     * @param source
     *            the object to encode
     * @return the encoding produced by {@link #encode(String)}
     * @throws EncoderException
     *             if {@code source} is not a {@code String} (a {@code null} argument is rejected as well)
     */
    public Object encode(Object source) throws EncoderException {
        if (source instanceof String) {
            return encode((String) source);
        }
        throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
    }

    /**
     * Encodes a string using the currently configured engine.
     *
     * @param source
     *            the string to encode, may be {@code null}
     * @return the result of the engine's {@code encode} call, or {@code null} if {@code source} is {@code null}
     * @throws EncoderException
     *             declared by the {@link StringEncoder} contract
     */
    public String encode(String source) throws EncoderException {
        return source == null ? null : engine.encode(source);
    }

    /**
     * Reports the name type of the engine currently in use.
     *
     * @return the NameType currently being used
     */
    public NameType getNameType() {
        return engine.getNameType();
    }

    /**
     * Reports the rule type of the engine currently in use.
     *
     * @return the RuleType currently being used
     */
    public RuleType getRuleType() {
        return engine.getRuleType();
    }

    /**
     * Reports whether multiple possible encodings are concatenated.
     *
     * @return true if multiple encodings are concatenated, false if just the first one is returned
     */
    public boolean isConcat() {
        return engine.isConcat();
    }

    /**
     * Chooses how multiple possible phonetic encodings are combined. The name type and rule type of the
     * current engine are carried over into the replacement engine.
     *
     * @param concat
     *            true if multiple encodings are to be combined with a '|', false if just the first one is to be considered
     */
    public void setConcat(boolean concat) {
        final PhoneticEngine current = this.engine;
        final NameType keptNameType = current.getNameType();
        final RuleType keptRuleType = current.getRuleType();
        this.engine = new PhoneticEngine(keptNameType, keptRuleType, concat);
    }

    /**
     * Chooses the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
     * optimized for Ashkenazi or Sephardic Jewish family names. The rule type and concat setting of the current
     * engine are carried over into the replacement engine.
     *
     * @param nameType
     *            the NameType in use
     */
    public void setNameType(NameType nameType) {
        final PhoneticEngine current = this.engine;
        final RuleType keptRuleType = current.getRuleType();
        final boolean keptConcat = current.isConcat();
        this.engine = new PhoneticEngine(nameType, keptRuleType, keptConcat);
    }

    /**
     * Chooses the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
     * The name type and concat setting of the current engine are carried over into the replacement engine.
     *
     * @param ruleType
     *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
     */
    public void setRuleType(RuleType ruleType) {
        final PhoneticEngine current = this.engine;
        final NameType keptNameType = current.getNameType();
        final boolean keptConcat = current.isConcat();
        this.engine = new PhoneticEngine(keptNameType, ruleType, keptConcat);
    }
}