// com.aliasi.spell.EditDistance Maven / Gradle / Ivy
// Show all versions of aliasi-lingpipe Show documentation
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.spell;
import com.aliasi.util.Distance;
import com.aliasi.util.Proximity;
/**
* The {@code EditDistance} class implements the standard notion
* of edit distance, with or without transposition. The distance
* without transposition is known as Levenshtein distance, and
* with transposition as Damerau-Levenshtein distance (see below about
* distance-like metric properties).
*
* The edit distance between two strings is defined to be the
* minimum number of non-matching substring edits that is required to
* turn one string into the other. The available edits and their
* corresponding input and output substrings are:
*
* <pre>
* Operation   Input  Output  Notation
* Match       "a"    "a"     m(a)
* Insert      ""     "a"     i(a)
* Delete      "a"    ""      d(a)
* Substitute  "a"    "b"     s(b,a)
* Transpose   "ab"   "ba"    t(ab)
* </pre>
*
* Examples of minimal edit sequences are as follows, with distances
* as indicated
*
* <pre>
* Input   Output  Edit Sequence [w. Transp]                  Dist [w. Transp]
* it      it      m(i) m(t)                                  0
* gage    gauge   m(g) m(a) i(u) m(g) m(e)                   1
* thhe    the     m(t) d(h) m(h) m(e)                        1
* tenser  tensor  m(t) m(e) m(n) m(s) s(o,e) m(r)            1
* hte     the     d(h) m(t) i(h) m(e) [t(ht) m(e)]           2 [1]
* htne    then    d(h) m(t) s(h,n) m(e) i(n) [t(ht) t(ne)]   3 [2]
* pwn4g   ownage  s(o,p) m(w) m(n) s(a,4) m(g) i(e)          3
* </pre>
*
* Note that, in general, there will be more than one way to edit a
* string into another string. For instance, a delete and insert may
* replace a transposition, so that "hte" becomes "the" through edits
* "d(h) m(t) i(h) m(e)" or "i(t) m(h) d(t) m(e)", as well as many
* longer, non-minimal sequences such as "d(h) d(t) d(e) i(t) i(h) i(e)".
*
* Distance as Metric
*
* Edit distance without transposition defines a proper metric.
* Recall that a distance measure {@code d(x,y)} forms a metric
* if for all {@code x, y, z}, we have (1) {@code d(x,y) >= 0},
* (2) {@code d(x,y) = d(y,x)}, (3) {@code d(x,x) = 0}, and
* (4) {@code d(x,y) + d(y,z) >= d(x,z)}. All
* of these properties are easy to verify. But with transposition, the
* triangle inequality (4) can fail. Consider the strings {@code AB},
* {@code BA} and {@code BCA}. With transposition, {@code d(AB,BA) = 1}
* and {@code d(BA,BCA) = 1}, but
* {@code d(AB,BCA) = 3 > d(AB,BA) + d(BA,BCA) = 1 + 1 = 2}.
*
*
* <p><i>Implementation Note:</i> This class implements edit distance
* using dynamic programming in time {@code O(n*m)} where
* {@code n} and {@code m} are the lengths of the sequences
* being compared. Using a sliding window of lattice slices (two without
* transposition, three with transposition) rather than allocating the
* entire lattice at once, the space required is that for at most three
* arrays of integers, each one element longer than the shorter of the
* two character sequences being compared. For details, see section
* 12.1.1 of:
*
*
* - Dan Gusfield (1997) Algorithms on Strings, Trees, and Sequences.
* Cambridge University Press.
*
*
* @author Bob Carpenter
* @version 3.0
* @since LingPipe2.0
*/
public class EditDistance
    implements Distance, Proximity {

    // NOTE(review): Distance and Proximity are used without type arguments
    // here; if they are declared generic in com.aliasi.util, they presumably
    // should be parameterized with CharSequence -- confirm against the
    // interface declarations.

    private final boolean mAllowTransposition;

    /**
     * Creates an edit distance that either permits or forbids
     * transposition edits, as selected by the flag.
     *
     * @param allowTransposition {@code true} if swapping two adjacent
     * characters should be available as a single edit.
     */
    public EditDistance(boolean allowTransposition) {
        mAllowTransposition = allowTransposition;
    }

    /**
     * Computes the edit distance between the two character sequences,
     * using the transposition setting chosen at construction time.
     * The computation only reads an immutable field and works on
     * locally allocated arrays, so it may be called concurrently
     * without synchronization.
     *
     * @param cSeq1 First character sequence.
     * @param cSeq2 Second character sequence.
     * @return Edit distance between the character sequences.
     */
    public double distance(CharSequence cSeq1, CharSequence cSeq2) {
        return editDistance(cSeq1, cSeq2, mAllowTransposition);
    }

    /**
     * Computes the proximity between the two character sequences,
     * which is simply the negated distance:
     *
     * <pre>
     * proximity(cs1,cs2) = -distance(cs1,cs2)
     * </pre>
     *
     * Consequently every proximity is negative or zero.
     *
     * @param cSeq1 First character sequence.
     * @param cSeq2 Second character sequence.
     * @return Proximity between the character sequences.
     */
    public double proximity(CharSequence cSeq1, CharSequence cSeq2) {
        return -distance(cSeq1, cSeq2);
    }

    /**
     * Describes this edit distance, including its transposition flag.
     *
     * @return A string representation of this edit distance.
     */
    @Override
    public String toString() {
        return "EditDistance(" + mAllowTransposition + ")";
    }

    /**
     * Computes the edit distance between the two character sequences,
     * with transposition edits enabled or disabled as requested.
     * The result does not depend on argument order. The method keeps
     * no shared state and may be called concurrently.
     *
     * @param cSeq1 First character sequence.
     * @param cSeq2 Second character sequence.
     * @param allowTransposition {@code true} to allow transposition
     * edits.
     * @return Edit distance between the character sequences.
     */
    public static int editDistance(CharSequence cSeq1,
                                   CharSequence cSeq2,
                                   boolean allowTransposition) {
        // Put the longer sequence first so that the lattice slices are
        // sized by the shorter one.
        final CharSequence longer;
        final CharSequence shorter;
        if (cSeq1.length() < cSeq2.length()) {
            longer = cSeq2;
            shorter = cSeq1;
        } else {
            longer = cSeq1;
            shorter = cSeq2;
        }

        // Trivial cases that need no lattice at all.
        if (shorter.length() == 0) {
            return longer.length(); // everything is inserted/deleted
        }
        if (shorter.length() == 1) {
            final char only = shorter.charAt(0);
            int pos = 0;
            while (pos < longer.length() && longer.charAt(pos) != only) {
                ++pos;
            }
            return pos < longer.length()
                ? longer.length() - 1   // one match, the rest deleted
                : longer.length();      // one substitution, the rest deleted
        }

        return allowTransposition
            ? editDistanceTranspose(longer, shorter)
            : editDistanceNonTranspose(longer, shorter);
    }

    /**
     * Value of a single lattice cell when only match, substitute,
     * delete and insert are considered.
     *
     * @param same Whether the two characters at this cell are equal.
     * @param diagonal Cost of the cell one slice back and one position up
     * (extended by a match or a substitution).
     * @param fromDelete Cost of the cell one slice back at the same position.
     * @param fromInsert Cost of the cell one position up in the same slice.
     * @return The cheapest cost of reaching this cell.
     */
    private static int cheapestStep(boolean same,
                                    int diagonal,
                                    int fromDelete,
                                    int fromInsert) {
        final int viaDiagonal = same ? diagonal : 1 + diagonal;
        final int viaGap = 1 + Math.min(fromDelete, fromInsert);
        return Math.min(viaDiagonal, viaGap);
    }

    /**
     * Levenshtein distance using two rotating lattice slices.
     * Called from {@code editDistance} with
     * {@code cSeq1.length() >= cSeq2.length() > 1}.
     */
    private static int editDistanceNonTranspose(CharSequence cSeq1,
                                                CharSequence cSeq2) {
        final int sliceCount = cSeq1.length() + 1;
        final int sliceSize = cSeq2.length() + 1;
        int[] previous = new int[sliceSize];
        int[] current = new int[sliceSize];

        // Slice 0: reaching position k costs k inserts.
        for (int row = 0; row < sliceSize; ++row) {
            current[row] = row;
        }

        for (int col = 1; col < sliceCount; ++col) {
            // Recycle the older slice as the one being filled in.
            final int[] recycled = previous;
            previous = current;
            current = recycled;

            final char colChar = cSeq1.charAt(col - 1);
            current[0] = col; // col deletes along the first row
            for (int row = 1; row < sliceSize; ++row) {
                current[row] = cheapestStep(colChar == cSeq2.charAt(row - 1),
                                            previous[row - 1],
                                            previous[row],
                                            current[row - 1]);
            }
        }
        return current[sliceSize - 1];
    }

    /**
     * Edit distance with adjacent transposition, using three rotating
     * lattice slices so that the cell two slices back and two positions
     * up is available. Called from {@code editDistance} with
     * {@code cSeq1.length() >= cSeq2.length() > 1}.
     */
    private static int editDistanceTranspose(CharSequence cSeq1,
                                             CharSequence cSeq2) {
        final int sliceCount = cSeq1.length() + 1;
        final int sliceSize = cSeq2.length() + 1;
        int[] beforePrevious = new int[sliceSize];
        int[] previous = new int[sliceSize];
        int[] current = new int[sliceSize];

        // Slice 0: reaching position k costs k inserts.
        for (int row = 0; row < sliceSize; ++row) {
            previous[row] = row;
        }

        // Slice 1: too early for any transposition.
        char colChar = cSeq1.charAt(0);
        current[0] = 1;
        for (int row = 1; row < sliceSize; ++row) {
            current[row] = cheapestStep(colChar == cSeq2.charAt(row - 1),
                                        previous[row - 1],
                                        previous[row],
                                        current[row - 1]);
        }

        // Slices 2 and beyond: transposition becomes possible.
        final char firstRowChar = cSeq2.charAt(0);
        for (int col = 2; col < sliceCount; ++col) {
            final char prevColChar = colChar;
            colChar = cSeq1.charAt(col - 1);

            // Rotate the three slices; the oldest one gets overwritten.
            final int[] recycled = beforePrevious;
            beforePrevious = previous;
            previous = current;
            current = recycled;

            current[0] = col; // col deletes along the first row

            // Position 1 has no predecessor pair, so no transposition.
            current[1] = cheapestStep(colChar == firstRowChar,
                                      previous[0],
                                      previous[1],
                                      current[0]);

            char rowChar = firstRowChar;
            for (int row = 2; row < sliceSize; ++row) {
                final char prevRowChar = rowChar;
                rowChar = cSeq2.charAt(row - 1);

                int best = cheapestStep(colChar == rowChar,
                                        previous[row - 1],
                                        previous[row],
                                        current[row - 1]);
                // The last two characters of each sequence are swapped
                // versions of each other: one transposition from the cell
                // two slices back and two positions up.
                if (colChar == prevRowChar && rowChar == prevColChar) {
                    best = Math.min(best, 1 + beforePrevious[row - 2]);
                }
                current[row] = best;
            }
        }
        return current[sliceSize - 1];
    }

    /**
     * Edit distance allowing transposition. The implementation is
     * thread safe and may be accessed concurrently.
     */
    public static final Distance TRANSPOSING
        = new EditDistance(true);

    /**
     * Edit distance disallowing transposition. The implementation is
     * thread safe and may be accessed concurrently.
     */
    public static final Distance NON_TRANSPOSING
        = new EditDistance(false);
}