All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.simmetrics.metrics.Jaro Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * SimMetrics - SimMetrics is a java library of Similarity or Distance Metrics,
 * e.g. Levenshtein Distance, that provide float based similarity measures
 * between String Data. All metrics return consistent measures rather than
 * unbounded similarity scores.
 * 
 * Copyright (C) 2014 SimMetrics authors
 * 
 * This file is part of SimMetrics. This program is free software: you can
 * redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * SimMetrics. If not, see .
 */
package org.simmetrics.metrics;

import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.util.Arrays.copyOf;

import java.util.Arrays;

import org.simmetrics.StringMetric;

/**
 * Jaro algorithm providing a similarity measure between two strings.
 * 
 * 

* This class is immutable and thread-safe. * * @see Wikipedia * - Jaro-Winkler distance * @see JaroWinkler * * * */ public class Jaro implements StringMetric { @Override public float compare(final String a, final String b) { if (a.isEmpty() && b.isEmpty()) { return 1.0f; } if (a.isEmpty() || b.isEmpty()) { return 0.0f; } // Intentional integer division to round down. final int halfLength = max(0, max(a.length(), b.length()) / 2 - 1); final char[] commonA = getCommonCharacters(a, b, halfLength); final char[] commonB = getCommonCharacters(b, a, halfLength); if (commonA.length == 0 || commonB.length == 0) { return 0.0f; } // Only need to check a single array for length. commonA and commonB // will always contain the same multi-set of characters float transpositions = 0; for (int i = 0, length = commonA.length; i < length; i++) { if (commonA[i] != commonB[i]) { transpositions++; } } transpositions /= 2.0f; float aCommonRatio = commonA.length / (float) a.length(); float bCommonRatio = commonB.length / (float) b.length(); float transpositionRatio = (commonA.length - transpositions) / commonA.length; return (aCommonRatio + bCommonRatio + transpositionRatio) / 3.0f; } /* * Returns a string of characters from a within b A character in b is * counted as common when it is within separation distance from the position * in a. */ private static char[] getCommonCharacters(final String a, final String b, final int separation) { final char[] charsA = a.toCharArray(); final char[] charsB = b.toCharArray(); final char[] common = new char[min(charsA.length,charsB.length)]; // Iterate of string a and find all characters that occur in b within // the separation distance. Zero out any matches found to avoid // duplicate matchings. int commonIndex = 0; for (int i = 0, length = charsA.length; i < length; i++) { final char character = charsA[i]; int index = indexOf(character, charsB, i - separation, i + separation + 1); if (index > -1) { common[commonIndex++] = character; charsB[index] = (char) 0; } } return copyOf(common, commonIndex); } /* * Search for character in buffer starting at fromIndex to toIndex - 1. * * Returns -1 when not found. */ private static int indexOf(char character, char[] buffer, int fromIndex, int toIndex) { // compare char with range of characters to either side for (int j = max(0, fromIndex), length = min(toIndex, buffer.length); j < length; j++) { // check if found if (buffer[j] == character) { return j; } } return -1; } @Override public String toString() { return "Jaro"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy