// water.util.comparison.string.H2OJaroWinklerComparator Maven / Gradle / Ivy
/**
* This class is identical to
* https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/JaroWinkler.java
*
* However, that fixed version is not included in the latest available Duke release. Once a new Duke version is
* released with the fixed class included, this class can be removed.
*
*/
package water.util.comparison.string;
import java.util.ArrayList;
import java.util.List;
/**
* An implementation of the Jaro-Winkler string similarity measure.
* The implementation follows the description in the paper "Evaluating
* String Comparator Performance for Record Linkage", by William
* E. Yancey, RESEARCH REPORT SERIES (Statistics #2005-05), US Bureau
* of the Census. http://www.census.gov/srd/papers/pdf/rrs2005-05.pdf
*/
public class H2OJaroWinklerComparator implements StringComparator {
public double compare(String s1, String s2) {
return similarity(s1, s2);
}
public boolean isTokenized() {
return true; // I guess?
}
/**
* Returns normalized score, with 0.0 meaning no similarity at all,
* and 1.0 meaning full equality.
*/
public static double similarity(String s1, String s2) {
if (s1.equals(s2))
return 1.0;
// ensure that s1 is shorter than or same length as s2
if (s1.length() > s2.length()) {
String tmp = s2;
s2 = s1;
s1 = tmp;
}
/*
* this list of Boolean values is used for avoiding duplicated count of
* common characters in S2
*/
List isCommonCharInS2 = new ArrayList();
for (int i=0; i= 5 && // both strings at least 5 characters long
// c - p >= 2 && // at least two common characters besides prefix
// c - p >= ((s1.length() - p) / 2)) // fairly rich in common chars
// {
// System.out.println("ADJUSTED!");
// score = score + ((1 - score) * ((c - (p + 1)) /
// ((double) ((s1.length() + s2.length())
// - (2 * (p - 1))))));
// }
// (4) similar characters adjustment
// the same holds for this as for (3) above.
return score;
}
}