All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.simmetrics.metrics.Jaro Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2016 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

package org.simmetrics.metrics;

import static java.lang.Math.max;
import static java.lang.Math.min;

import org.simmetrics.StringDistance;
import org.simmetrics.StringMetric;

/**
 * Calculates the Jaro distance (similarity) over two strings.
 * 

* * similarity(a,b) = jaro(a,b) *
* distance(a,b) = 1 - similarity(a,b) *
*

* This class is immutable and thread-safe. * * @see Wikipedia * - Jaro-Winkler distance * @see JaroWinkler * * * */ public final class Jaro implements StringMetric, StringDistance { @Override public float distance(String a, String b) { return 1.0f - compare(a, b); } @Override public float compare(final String a, final String b) { if (a.isEmpty() && b.isEmpty()) { return 1.0f; } if (a.isEmpty() || b.isEmpty()) { return 0.0f; } // Intentional integer division to round down. final int halfLength = max(0, max(a.length(), b.length()) / 2 - 1); final char[] charsA = a.toCharArray(); final char[] charsB = b.toCharArray(); final int[] commonA = getCommonCharacters(charsA, charsB, halfLength); final int[] commonB = getCommonCharacters(charsB, charsA, halfLength); // commonA and commonB will always contain the same multi-set of // characters. Because getCommonCharacters has been optimized, commonA // and commonB are -1-padded. So in this loop we count transposition // and use commonCharacters to determine the length of the multi-set. float transpositions = 0; int commonCharacters = 0; for (int length = commonA.length; commonCharacters < length && commonA[commonCharacters] > -1; commonCharacters++) { if (commonA[commonCharacters] != commonB[commonCharacters]) { transpositions++; } } if (commonCharacters == 0) { return 0.0f; } float aCommonRatio = commonCharacters / (float) a.length(); float bCommonRatio = commonCharacters / (float) b.length(); float transpositionRatio = (commonCharacters - transpositions / 2.0f) / commonCharacters; return (aCommonRatio + bCommonRatio + transpositionRatio) / 3.0f; } /* * Returns an array of characters from a within b. A character in b is * counted as common when it is within separation distance from the position * in a. */ private static int[] getCommonCharacters(final char[] charsA, final char[] charsB, final int separation) { final int[] common = new int[min(charsA.length, charsB.length)]; final boolean[] matched = new boolean[charsB.length]; // Iterate of string a and find all characters that occur in b within // the separation distance. Mark any matches found to avoid // duplicate matchings. int commonIndex = 0; for (int i = 0, length = charsA.length; i < length; i++) { final char character = charsA[i]; final int index = indexOf(character, charsB, i - separation, i + separation + 1, matched); if (index > -1) { common[commonIndex++] = character; matched[index] = true; } } if (commonIndex < common.length) { common[commonIndex] = -1; } // Both invocations will yield the same multi-set terminated by -1, so // they can be compared for transposition without making a copy. return common; } /* * Search for character in buffer starting at fromIndex to toIndex - 1. * * Returns -1 when not found. */ private static int indexOf(char character, char[] buffer, int fromIndex, int toIndex, boolean[] matched) { // compare char with range of characters to either side for (int j = max(0, fromIndex), length = min(toIndex, buffer.length); j < length; j++) { // check if found if (buffer[j] == character && !matched[j]) { return j; } } return -1; } @Override public String toString() { return "Jaro"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy