// org.simmetrics.metrics.MatchingCoefficient Maven / Gradle / Ivy
/*
* SimMetrics - SimMetrics is a java library of Similarity or Distance Metrics,
* e.g. Levenshtein Distance, that provide float based similarity measures
* between String Data. All metrics return consistent measures rather than
* unbounded similarity scores.
*
* Copyright (C) 2014 SimMetrics authors
*
* This file is part of SimMetrics. This program is free software: you can
* redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* SimMetrics. If not, see <http://www.gnu.org/licenses/>.
*/
package org.simmetrics.metrics;
import static com.google.common.collect.HashMultiset.create;
import java.util.List;
import org.simmetrics.ListMetric;
import com.google.common.collect.Multiset;
/**
* Matching coefficient algorithm providing a similarity measure between two
* lists.
*
* The matching coefficient between two lists is defined as ratio of elements
* that occur in both lists and elements that exclusively occur in either list.
* This metric is identical to Jaccard similarity. However repeated elements are
* considered as distinct occurrences.
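* <p>
* <code>
* similarity(a,b) = |a A b| / |a or b|
* </code>
* <p>
* The A operation takes the list intersection of <code>a</code> and
* <code>b</code>. This is a list <code>c</code> such that each element in
* <code>c</code> has a 1-to-1 relation to an element in both <code>a</code>
* and <code>b</code>. E.g. the list intersection of
* <code>[ab,ab,ab,ac]</code> and <code>[ab,ab,ad]</code> is
* <code>[ab,ab]</code>. The or operation takes the list union, whose size
* is <code>|a| + |b| - |a A b|</code>.
* <p>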
* Unlike Jaccard similarity, this metric is sensitive to repeated tokens.
* The list <code>["a","a","b"]</code> is not identical to
* <code>["a","b","b"]</code>; their similarity is 0.5.
*
* This class is immutable and thread-safe.
*
* @see JaccardSimilarity
* @see <a href="http://en.wikipedia.org/wiki/Simple_matching_coefficient">Wikipedia
* - Simple Matching Coefficient</a>
*
*
*
* @param <T>
* type of the token
*
*/
public class MatchingCoefficient<T> implements ListMetric<T> {

    /**
     * Computes the matching coefficient of two token lists.
     *
     * The result is the size of the list intersection divided by the size
     * of the list union. Repeated tokens are treated as distinct
     * occurrences: a token in {@code a} is matched against at most one
     * occurrence of an equal token in {@code b}.
     *
     * @param a
     *            first list of tokens
     * @param b
     *            second list of tokens
     * @return {@code 1.0f} when both lists are empty, {@code 0.0f} when
     *         exactly one of them is empty, otherwise
     *         {@code intersection / (a.size() + b.size() - intersection)}
     */
    @Override
    public float compare(List<T> a, List<T> b) {

        if (a.isEmpty() && b.isEmpty()) {
            return 1.0f;
        }

        if (a.isEmpty() || b.isEmpty()) {
            return 0.0f;
        }

        // Count elements in the list intersection.
        // Every occurrence in b can be matched by at most one occurrence in a.
        // E.g. the intersection of [ab,ab,ab] and [ab,ab,ac,ad] is [ab,ab].
        // Note: this is not the same as b.retainAll(a).size()
        int intersection = 0;

        // Work on a multiset copy of b so that matched occurrences can be
        // removed without modifying the caller's list.
        Multiset<T> bCopy = create(b);
        for (T token : a) {
            // remove() takes out a single occurrence and reports whether
            // one was present.
            if (bCopy.remove(token)) {
                intersection++;
            }
        }

        // Implementation note: the size of the union of the two lists is
        // the size of both lists minus the elements they have in common,
        // which would otherwise be counted twice.
        return intersection / (float) (a.size() + b.size() - intersection);
    }

    @Override
    public String toString() {
        return "MatchingCoefficient";
    }

}