All Downloads are FREE. Search and download functionalities are using the official Maven repository.

water.util.comparison.string.H2OJaroWinklerComparator Maven / Gradle / Ivy

There is a newer version: 3.46.0.5
Show newest version
/**
 * This class is equal to
 * https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/JaroWinkler.java
 *
 * however it is not included in the last available Duke release. Once a new Duke version is released with this
 * fixed class included, we can remove this class.
 *
 */
package water.util.comparison.string;

import java.util.ArrayList;
import java.util.List;

/**
 * An implementation of the Jaro-Winkler string similarity measure.
 * The implementation follows the description in the paper "Evaluating
 * String Comparator Performance for Record Linkage", by William
 * E. Yancey, RESEARCH REPORT SERIES (Statistics #2005-05), US Bureau
 * of the Census. http://www.census.gov/srd/papers/pdf/rrs2005-05.pdf
 */
public class H2OJaroWinklerComparator implements StringComparator {

  public double compare(String s1, String s2) {
    return similarity(s1, s2);
  }

  public boolean isTokenized() {
    return true; // I guess?
  }

  /**
   * Returns normalized score, with 0.0 meaning no similarity at all,
   * and 1.0 meaning full equality.
   */
  public static double similarity(String s1, String s2) {
    if (s1.equals(s2))
      return 1.0;

    // ensure that s1 is shorter than or same length as s2
    if (s1.length() > s2.length()) {
      String tmp = s2;
      s2 = s1;
      s1 = tmp;
    }
    /*
     * this list of Boolean values is used for avoiding duplicated count of
     * common characters in S2
     */
    List isCommonCharInS2 = new ArrayList();
    for (int i=0; i= 5 && // both strings at least 5 characters long
    //       c - p >= 2 && // at least two common characters besides prefix
    //       c - p >= ((s1.length() - p) / 2)) // fairly rich in common chars
    //     {
    //     System.out.println("ADJUSTED!");
    //     score = score + ((1 - score) * ((c - (p + 1)) /
    //                                     ((double) ((s1.length() + s2.length())
    //                                                - (2 * (p - 1))))));
    // }

    // (4) similar characters adjustment
    // the same holds for this as for (3) above.

    return score;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy