com.aliasi.tokenizer.SoundexTokenizerFactory Maven / Gradle / Ivy
Show all versions of aliasi-lingpipe Show documentation
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.tokenizer;
import com.aliasi.util.Strings;
import java.io.ObjectInput;
import java.io.Serializable;
/**
* A {@code SoundexTokenizerFactory} modifies the output of a base
* tokenizer factory to produce tokens in soundex representation.
* Soundex replaces sequences of characters with a crude
* four-character approximation of their pronunciation plus initial
* letter.
*
* Soundex Representations
*
* The process for converting an input to its Soundex
* representation is fairly straightforward for inputs that are all
* ASCII letters. Soundex is case insensitive, but is only defined
* for strings of ASCII letters. Thus to begin, all characters
* that are not Latin1 letters are removed, and all Latin1 characters
* are stripped of their diacritics. The algorithm then proceeds
* according to its standard definition:
*
*
* - Normalize input by removing all characters that are not
* Latin1 letters, and converting all other characters to uppercase
* ASCII after first removing any diacritics.
*
- If the input is empty, return "0000"
*
* - Set the first letter of the output to the first letter of the input.
*
* - While there are less than four letters of output do:
*
* - If the next letter is a vowel, unset the last letter's code.
*
* - If the next letter is {@code A}, {@code E}, {@code I}, {@code O},
* {@code U}, {@code H}, {@code W}, or {@code Y}, continue.
* - If the next letter's code is equal to the previous letter's
* code, continue.
*
- Set the next letter of output to the current letter's code.
*
*
* - If there are fewer than four characters of output, pad
* the output with zeros ({@code 0}).
* - Return the output string.
*
*
* The table of individual character encodings is as follows:
*
*
* Characters Code
* B, F, P, V 1
* C, G, J, K, Q, S, X, Z 2
* D, T 3
* L 4
* M, N 5
* R 6
*
*
* Here are some examples of translations from the unit tests,
* drawn from the sources cited below.
*
*
* Tokens Soundex Encoding Notes
* Gutierrez G362
* Pfister P236
* Jackson J250
* Tymczak T522
* Ashcraft A261
* Robert, Rupert R163
* Euler, Ellery E460
* Gauss, Ghosh G200
* Hilbert, Heilbronn H416
* Knuth, Kant K530
* Lloyd, Liddy L300
* Lukasiewicz, Lissajous L222
* Wachs, Waugh W200
*
*
* As a tokenizer filter, the {@code SoundexTokenizerFactory}
* simply replaces each token with its Soundex equivalent. Note that
* this may produce very many {@code 0000} outputs if it is fed
* standard text with punctuation, numbers, etc.
*
*
Note: In order to produce a deterministic tokenizer filter,
* names with prefixes are coded with the prefix. Recall that
* Soundex considers the following set of words prefixes, and suggests
* providing both the Soundex computed with the prefix and the
* Soundex encoding computed without the prefix:
*
*
* Van, Con, De, Di, La, Le
*
* These are not accorded any special treatment by this
* implementation.
*
*
*
Thread Safety
*
* A Soundex tokenizer factory is thread safe if its
* base tokenizer factory is thread safe.
*
* Serialization
*
* A {@code SoundexTokenizerFactory} is serializable if its
* base tokenizer factory is serializable.
*
*
References and Historical Notes
*
* Soundex was invented and patented by Robert C. Russell in 1918.
* The original version involved eight categories, including one for
* vowels, without the initial character being treated specially as to
* coding. The first vowel was retained in the original Soundex.
* Furthermore, some positional information was added, such as the
* deletion of final {@code s} and {@code z}.
*
* The version in this class is the one described by Donald Knuth
* in The Art of Computer Programming and the one described by
* the United States National Archives and Records Administration
* version, which has been used for the United States Census.
*
*
*
* - Knuth, D. 1973. The Art of Computer Programming Volume 3: Sorting and Searching. Addison-Wesley. 2nd Edition Pages 394-395.
*
* - Wikipedia. Soundex.
*
*
- United States National Archives and Records Administration.
* Using the Census Soundex.
* General Information Leaflet 55.
*
*
* - Robert C. Russell. 1918. United States Patent
* 1,261,167.
* - Robert C. Russell. 1922.
* United States Patent 1,435,663.
*
*
*
* @author Bob Carpenter
* @version 4.0.1
* @since Lingpipe3.8
*/
public class SoundexTokenizerFactory
    extends ModifyTokenTokenizerFactory
    implements Serializable {

    static final long serialVersionUID = -7062805184862100578L;

    /** Number of characters in every encoding: one letter plus three codes. */
    private static final int ENCODING_LENGTH = 4;

    /** Size of the lookup tables; only characters below this value are encoded. */
    private static final int TABLE_SIZE = 256;

    /**
     * Value assigned to {@code lastCode} after a vowel.  It is not one of
     * the codes returned by {@link #soundexCode(char)}, so the consonant
     * following a vowel is always emitted even if it repeats the previous
     * code.
     */
    private static final char AFTER_VOWEL_CODE = '7';

    /**
     * Sentinel stored in the lookup tables for characters that have no
     * Soundex code (non-letters in both tables; uncoded letters such as
     * vowels, {@code H}, {@code W} and {@code Y} in {@link #CODES}).
     *
     * <p>This is a constant: the tables are filled with it once during
     * class initialization and it is compared against on every call to
     * {@link #soundexEncoding(String)}, so it must never change.
     */
    static final char NON_CHAR_CODE = (char) 0xFF;

    /**
     * For each character below 256: the de-accented, upper-cased form of
     * the character if it is a letter, otherwise {@link #NON_CHAR_CODE}.
     */
    static final char[] INITIAL_CODES = new char[TABLE_SIZE];

    /**
     * For each character below 256: the Soundex code of its normalized
     * form, or {@link #NON_CHAR_CODE} if it has none.
     */
    static final char[] CODES = new char[TABLE_SIZE];

    /**
     * For each character below 256: {@code true} if its normalized form
     * is one of {@code A}, {@code E}, {@code I}, {@code O}, {@code U}.
     */
    static final boolean[] VOWELS = new boolean[TABLE_SIZE];

    static {
        for (int i = 0; i < TABLE_SIZE; ++i) {
            char c = (char) i;
            if (Character.isLetter(c)) {
                char normalized
                    = Character.toUpperCase(Strings.deAccentLatin1(c));
                INITIAL_CODES[i] = normalized;
                CODES[i] = soundexCode(normalized);
            } else {
                INITIAL_CODES[i] = NON_CHAR_CODE;
                CODES[i] = NON_CHAR_CODE;
            }
            // Non-letters hold NON_CHAR_CODE here, which is none of the
            // five vowels, so they are never flagged.
            char initCode = INITIAL_CODES[i];
            VOWELS[i] = initCode == 'A'
                || initCode == 'E'
                || initCode == 'I'
                || initCode == 'O'
                || initCode == 'U';
        }
    }

    /**
     * Construct a Soundex-based tokenizer factory that converts
     * tokens produced by the specified base factory into their
     * soundex representations.
     *
     * @param factory Base tokenizer factory.
     */
    public SoundexTokenizerFactory(TokenizerFactory factory) {
        super(factory);
    }

    /**
     * Returns the Soundex encoding of the specified token by
     * delegating to {@link #soundexEncoding(String)}.
     *
     * @param token Input token.
     * @return The soundex encoding of the input token.
     */
    public String modifyToken(String token) {
        return soundexEncoding(token);
    }

    /**
     * Returns a description consisting of this object's class followed,
     * on a new line, by the description of the base tokenizer factory.
     * Newlines inside the base factory's description are followed by an
     * extra space so nested descriptions appear indented.
     *
     * @return String description of this factory.
     */
    @Override
    public String toString() {
        String baseDescription
            = baseTokenizerFactory().toString().replace("\n","\n ");
        return getClass().toString()
            + "\n base factory="
            + baseDescription;
    }

    /**
     * Serialization hook: substitutes a {@link Serializer} wrapping this
     * factory for the factory itself when it is written out.
     *
     * @return Serialization proxy for this factory.
     */
    Object writeReplace() {
        return new Serializer(this);
    }

    /**
     * Returns the Soundex encoding of the specified token.
     *
     * <p>The result is always four characters long.  Characters with a
     * value of 256 or above, and characters below 256 that are not
     * letters, are ignored.  If the token contains no usable letter the
     * result is {@code "0000"}.
     *
     * @param token Token to be encoded.
     * @return The Soundex encoding of the specified token.
     * @throws NullPointerException If the token is {@code null}.
     */
    public static String soundexEncoding(String token) {
        final int length = token.length();

        // Locate the first character the tables recognize as a letter.
        int pos = 0;
        while (pos < length) {
            char c = token.charAt(pos);
            if (c < TABLE_SIZE && INITIAL_CODES[c] != NON_CHAR_CODE) {
                break;
            }
            ++pos;
        }
        if (pos == length) {
            return "0000"; // nothing
        }

        char first = token.charAt(pos);
        ++pos;

        char[] encoding = new char[ENCODING_LENGTH];
        encoding[0] = INITIAL_CODES[first];
        int filled = 1;

        // The initial letter is emitted as a letter, but its code still
        // suppresses an immediately following letter with the same code.
        char lastCode = CODES[first];

        while (filled < ENCODING_LENGTH && pos < length) {
            char c = token.charAt(pos);
            ++pos;
            if (c >= TABLE_SIZE) {
                continue;
            }
            char code = CODES[c];
            if (code == NON_CHAR_CODE) {
                // Vowels separate repeated codes; H, W, Y and non-letters
                // leave lastCode untouched, so repeats across them merge.
                if (VOWELS[c]) {
                    lastCode = AFTER_VOWEL_CODE;
                }
            } else if (code != lastCode) {
                encoding[filled] = code;
                ++filled;
                lastCode = code;
            }
        }

        // Pad short encodings with zeros.
        for (; filled < ENCODING_LENGTH; ++filled) {
            encoding[filled] = '0';
        }
        return new String(encoding);
    }

    /**
     * Returns the Soundex code for the specified upper-case ASCII
     * letter, or {@link #NON_CHAR_CODE} for any character without a
     * code.
     *
     * @param upperCaseLetter Character to look up.
     * @return One of {@code '1'} through {@code '6'}, or
     * {@link #NON_CHAR_CODE}.
     */
    static char soundexCode(char upperCaseLetter) {
        switch (upperCaseLetter) {
        case 'B':
        case 'F':
        case 'P':
        case 'V':
            return '1';
        case 'C':
        case 'G':
        case 'J':
        case 'K':
        case 'Q':
        case 'S':
        case 'X':
        case 'Z':
            return '2';
        case 'D':
        case 'T':
            return '3';
        case 'L':
            return '4';
        case 'M':
        case 'N':
            return '5';
        case 'R':
            return '6';
        // ignore A, E, I, O, U, H, W, Y & all else
        default:
            return NON_CHAR_CODE;
        }
    }

    /**
     * Serialization proxy for {@link SoundexTokenizerFactory}.  The
     * factory has no state of its own beyond its base factory, so
     * reading back only needs the base factory.
     */
    static class Serializer
        extends AbstractSerializer {

        static final long serialVersionUID = 2496844521092643488L;

        /**
         * Construct a proxy for the specified factory.
         *
         * @param factory Factory to be serialized.
         */
        public Serializer(SoundexTokenizerFactory factory) {
            super(factory);
        }

        /**
         * Construct an empty proxy with a {@code null} factory.
         * NOTE(review): presumably required as the public no-argument
         * constructor used during deserialization; confirm against
         * {@code AbstractSerializer}.
         */
        public Serializer() {
            this(null);
        }

        /**
         * Returns a new Soundex tokenizer factory wrapping the specified
         * base factory.  Nothing is read from the input in this method.
         *
         * @param in Object input (unused here).
         * @param baseFactory Base tokenizer factory.
         * @return The reconstituted Soundex tokenizer factory.
         */
        public Object read(ObjectInput in,
                           TokenizerFactory baseFactory) {
            return new SoundexTokenizerFactory(baseFactory);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy