// de.uni_jena.cs.fusion.similarity.jarowinkler.JaroWinklerSimilarity (Maven / Gradle / Ivy)
package de.uni_jena.cs.fusion.similarity.jarowinkler;
/*-
* #%L
* Jaro Winkler Similarity
* %%
* Copyright (C) 2018 Heinz Nixdorf Chair for Distributed Information Systems, Friedrich Schiller University Jena
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.function.Function;
/**
*
* {@link Function} to calculate the Jaro Winkler similarity of a given
* {@link String} to the value {@link String}s of a {@link Collection} or the
* key {@link String}s of a {@link Map} and return the ranked values with a Jaro
* Winkler similarity meeting a threshold.
*
*
* Acknowledgments: The development of this Jaro Winkler Similarity
* implementation was funded by DFG in the scope of the LakeBase project within
* the Scientific Library Services and Information Systems (LIS) program.
*
*
* @param
* Type of the returned ranked values
*
* @author Jan Martin Keil
* @since 0.1
*/
public class JaroWinklerSimilarity implements Function> {
public final static int COMMON_PREFIX_LENGTH_LIMIT = 4;
public final static double BOOST_THRESHOLD = 0.7;
public final static double BOOST_FACTOR = 0.1;
private static int equalInRange(boolean[] array, boolean expected, int lowerBound, int upperBound) {
int result = 0;
for (int i = lowerBound; i <= upperBound; i++) {
if (array[i] == expected) {
result++;
}
}
return result;
}
/**
* @param commonCharacters
* characters in common in pair of strings
* @param length1
* length of first string
* @param length2
* length of second string
* @param halfTranspositions
* number of half transpositions
* @return
*/
private static double jaroSimilarity(double commonCharacters, int length1, int length2, double halfTranspositions) {
if (commonCharacters > 0) {
return (commonCharacters * commonCharacters * 2 / length1
+ commonCharacters * commonCharacters * 2 / length2 + commonCharacters * 2 - halfTranspositions)
/ (3 * commonCharacters * 2);
} else {
return 0;
}
}
private static double jaroWinklerSimilarity(double commonCharacters, int length1, int length2,
double halfTranspositions, int commonPrefixLength) {
double jaroSimilarity = jaroSimilarity(commonCharacters, length1, length2, halfTranspositions);
if (jaroSimilarity >= BOOST_THRESHOLD) {
return jaroSimilarity + commonPrefixLength * BOOST_FACTOR * (1 - jaroSimilarity);
} else {
return jaroSimilarity;
}
}
/**
* @param threshold
* Minimum similarity of matching terms.
* @param termTrie
* Current node of the term trie to process.
* @param query
* Characters of the query string.
* @param queryLength
* Length of the query.
* @param termTargetLength
* Total length of the term to process.
* @param windowSize
* Window size to search for common characters.
* @param minCommonCharacters
* Min number of characters in common in term and query.
* @param minHalfTranspositions
* Min number of half transpositions
* @param maxCommonPrefixSize
* Max number of characters in common in pair of strings in
* emphasized beginning.
* @param saveCommonCharsQuery
* Assigned characters of the query whose predecessors are already
* outside of the matching window.
* @param assignedQuery
* array of booleans stating which characters of first string have
* been assigned (TRUE = assigned)
* @param assignedTerm
* array of booleans stating which characters of second string have
* been assigned (TRUE = assigned)
* @param commonCharsTerm
* Assigned characters of the term.
*/
private static void match(Trie termTrie, double threshold, String query, int queryLength,
int termTargetLength, int windowSize, int minCommonCharacters, int minHalfTranspositions,
int maxCommonPrefixSize, int saveCommonCharsQuery, boolean[] assignedQuery, boolean[] assignedTerm,
char[] commonCharsTerm, Map results) {
if (termTrie.containsLength(termTargetLength)) {
// current branch contains string of target length
// get current position on term string
final int termCurrentNodeDepth = termTrie.depth();
final int termCurrentNodeLength = termTrie.keyLength();
// iterate new characters
for (int termCurrentLength = termCurrentNodeDepth
+ 1; termCurrentLength <= termCurrentNodeLength; termCurrentLength++) {
// get character at current position
final char currentTermChar = termTrie.symbol().charAt(termCurrentLength - 1 - termCurrentNodeDepth);
// get window on query string
/**
* First character of query that can still become assigned.
*/
final int assignableQueryWindowLowerBoundIndex = Math.max(termCurrentLength - 1 - windowSize, 0);
/**
* Last character of query that can still become assigned by the current
* character of term.
*/
final int assignableQueryCurrentWindowUpperBoundIndex = Math.min(termCurrentLength + windowSize,
queryLength) - 1;
// update maxCommonPrefixSize
if (termCurrentLength <= maxCommonPrefixSize && query.charAt(termCurrentLength - 1) != currentTermChar)
// currently in the prefix and characters at current position
// do not match
{
// reduce maxCommonPrefixSize to current depth
maxCommonPrefixSize = termCurrentLength - 1;
}
// search matching char for current term char in window
for (int i = assignableQueryWindowLowerBoundIndex; i <= assignableQueryCurrentWindowUpperBoundIndex; i++) {
if (!assignedQuery[i] && query.charAt(i) == currentTermChar) {
// unassigned common character was found
assignedQuery[i] = true;
assignedTerm[termCurrentLength - 1] = true;
commonCharsTerm[minCommonCharacters] = currentTermChar;
minCommonCharacters++;
break;
}
}
// update minHalfTranspositions
if (windowSize < termCurrentLength && termCurrentLength - windowSize <= queryLength) {
// window lower bound inside of query string
if (assignedQuery[assignableQueryWindowLowerBoundIndex]) {
// character at window lower bound is assigned
if (query.charAt(
assignableQueryWindowLowerBoundIndex) != commonCharsTerm[saveCommonCharsQuery]) {
// common characters at last save position not equal
minHalfTranspositions++;
}
saveCommonCharsQuery++;
}
}
}
// get window bounds
int assignableQueryWindowLowerBoundIndex = Math.max(termCurrentNodeLength - 1 - windowSize, 0);
/**
* Last character of query that can still become assigned by any character of
* term.
*/
int assignableQueryTotalWindowUpperBoundIndex = Math.min(termTargetLength + windowSize, queryLength) - 1;
// get number of characters that can still become assigned
int assignableQuery = equalInRange(assignedQuery, false, assignableQueryWindowLowerBoundIndex,
assignableQueryTotalWindowUpperBoundIndex);
/**
* Number of characters of term that can still become assigned after processing
* current character.
*/
int assignableTerm = termTargetLength - termCurrentNodeLength;
// get maximum number of common characters
double maxCommonCharacters = Math.min(assignableQuery, assignableTerm) + minCommonCharacters;
// get remaining half transpositions
if (termCurrentNodeLength == termTargetLength) {
// termString has been completed
// iterate assignments not covered by minHalfTransposition yet
for (int i = Math.max(termCurrentNodeLength - windowSize,
0); i <= assignableQueryTotalWindowUpperBoundIndex; i++) {
if (assignedQuery[i]) {
// position is assigned
if (query.charAt(i) != commonCharsTerm[saveCommonCharsQuery]) {
// common characters at current position not equal
minHalfTranspositions++;
}
saveCommonCharsQuery++;
}
}
}
// calculate max similarity
double maxSimilarity = jaroWinklerSimilarity(maxCommonCharacters, queryLength, termTargetLength,
minHalfTranspositions, maxCommonPrefixSize);
// check against threshold
if (maxSimilarity >= threshold) {
// threshold is meet
if (termTargetLength == termCurrentNodeLength) {
// current node has target depth
if (termTrie.isPopulated()) {
// current node is contained
// add object of current node to results
results.put(termTrie.value(), maxSimilarity);
}
} else {
// iterate children
Iterator extends Trie> children = termTrie.childrenIterator();
while (children.hasNext()) {
boolean[] termAssignedCopy = Arrays.copyOf(assignedTerm, termTargetLength);
boolean[] queryAssignedCopy = Arrays.copyOf(assignedQuery, queryLength);
Trie child = children.next();
// traverse child
match(child, threshold, query, queryLength, termTargetLength, windowSize, minCommonCharacters,
minHalfTranspositions, maxCommonPrefixSize, saveCommonCharsQuery, queryAssignedCopy,
termAssignedCopy, commonCharsTerm, results);
}
}
}
}
}
/**
* Returns the Jaro Winkler similarity of two given {@link String}s.
*
* Note: Use {@link #apply(String)} to calculate the similarity of one
* {@link String} to many {@link String}s for performance reasons.
*
* @param first
* First {@link String} to match.
* @param second
* Second {@link String} to match.
* @param threshold
* Minimum similarity of the strings.
* @return Jaro Winkler similarity of {@code first} and {@code second} or
* {@code null} if they do not meet the threshold.
*
* @author Jan Martin Keil
* @since 1.0
*/
public static Double of(String first, String second, double threshold) {
// initialize result
Map results = new HashMap<>();
// get lengths
int firstLength = first.length();
int secondLenght = second.length();
// calculate window size for common characters
int windowSize = windowSize(secondLenght, firstLength);
// max value of l = the size of the emphasized first few characters
int maxCommonPrefixSize = Math.min(COMMON_PREFIX_LENGTH_LIMIT, Math.min(secondLenght, firstLength));
// recursive traverse of the trie to get matching strings of length2
match(Tries.singletonTrieSet(first), threshold, second, secondLenght, firstLength, windowSize, 0 // minCommonCharacters
, 0 // minHalfTranspositions
, maxCommonPrefixSize, 0 // saveCommonCharsQuery
, new boolean[secondLenght] // assignedQuery
, new boolean[firstLength] // assignedTerm
, new char[Math.min(secondLenght, firstLength)] // commonCharsTerm
, results);
return results.get(first);
}
private static int windowSize(int length1, int length2) {
return Math.max(0, Math.max(length1, length2) / 2 - 1);
}
/**
* Prepares a {@link JaroWinklerSimilarity} instance to match the content of a
* given {@link Collection} considering a given threshold. The created
* {@link JaroWinklerSimilarity} is not backed by the {@link Collection}, so it
* will not reflect changes of the {@link Collection}.
*
* @param terms
* {@link Collection} of matched and returned terms.
* @param threshold
* Minimum similarity of matching terms.
* @return A {@link JaroWinklerSimilarity} instance to match the content of the
* given {@link Collection} considering the given threshold.
*
* @author Jan Martin Keil
* @since 1.0
*/
public static JaroWinklerSimilarity with(Collection terms, double threshold) {
return new JaroWinklerSimilarity(new LinkedNodeTrieSet(terms), threshold);
}
/**
*
* Prepares a {@link JaroWinklerSimilarity} instance to match the content of a
* given {@link Map} considering a given threshold. The matching will search for
* similar keys, but return the corresponding values. The created
* {@link JaroWinklerSimilarity} is not backed by the {@link Map}, so it will
* not reflect changes of the {@link Map}.
*
* @param terms
* {@link Map} of matched terms and returned values.
* @param threshold
* Minimum similarity of matching terms.
* @return A {@link JaroWinklerSimilarity} instance to match the content of the
* given {@link Map} considering the given threshold.
*
* @param
* Type of the map values and returned values by the matching.
*
* @author Jan Martin Keil
* @since 1.0
*/
public static JaroWinklerSimilarity with(Map terms, double threshold) {
return new JaroWinklerSimilarity(new LinkedListTrieMap(terms), threshold);
}
private final Trie trie;
private double threshold;
private JaroWinklerSimilarity(Trie trie, double threshold) {
this.trie = trie;
this.threshold = threshold;
}
/**
* Matches a {@link String} against the terms of this
* {@link JaroWinklerSimilarity} instance.
*
* @param query
* {@link String} that will be compared to the terms to calculate the
* similarity.
* @return {@link Map} of the matching values and their ranking.
*
* @author Jan Martin Keil
* @since 1.0
*/
@Override
public Map apply(String query) {
// initialize result
Map results = new HashMap<>();
// get length of query
int queryLength = query.length();
// iterate possible lengths of terms
for (Integer termTargetLength : this.trie.containedLengths()) {
// calculate window size for common characters
int windowSize = windowSize(queryLength, termTargetLength);
// max value of l = the size of the emphasized first few characters
int maxCommonPrefixSize = Math.min(COMMON_PREFIX_LENGTH_LIMIT, Math.min(queryLength, termTargetLength));
// recursive traverse of the trie to get matching strings of length2
match(this.trie, this.threshold, query, queryLength, termTargetLength, windowSize, 0 // minCommonCharacters
, 0 // minHalfTranspositions
, maxCommonPrefixSize, 0 // saveCommonCharsQuery
, new boolean[queryLength] // assignedQuery
, new boolean[termTargetLength] // assignedTerm
, new char[Math.min(queryLength, termTargetLength)] // commonCharsTerm
, results);
}
return results;
}
/**
* Changes the used threshold.
*
* @param threshold
* Minimum similarity of matching terms.
*/
public void setThreshold(double threshold) {
this.threshold = threshold;
}
}