// org.simmetrics.metrics.SimonWhite - Maven / Gradle / Ivy
/*
* #%L
* Simmetrics Core
* %%
* Copyright (C) 2014 - 2015 Simmetrics Authors
* %%
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
* #L%
*/
package org.simmetrics.metrics;
import static com.google.common.collect.Multisets.intersection;
import org.simmetrics.MultisetDistance;
import org.simmetrics.MultisetMetric;
import com.google.common.collect.Multiset;
/**
 * Calculates the Dice similarity coefficient and distance over two multisets.
 * Also known as the quantitative version of the Dice similarity coefficient.
 * The similarity is defined as twice the shared information (intersection)
 * divided by the sum of cardinalities.
 * <p>
 * <code>
 * similarity(a,b) = 2 * ∣a ∩ b∣ / (∣a∣ + ∣b∣)
 * <br>
 * distance(a,b) = 1 - similarity(a,b)
 * </code>
 * <p>
 * Implementation based on the ideas as outlined in <i>How to Strike a
 * Match</i> by Simon White. To create the described metric use:
 *
 * <pre>
 * {@code
 * import static org.simmetrics.StringMetricBuilder.with;
 *
 * ...
 *
 * with(new SimonWhite<String>())
 *     .tokenize(Tokenizers.qGram(2))
 *     .build();
 * }
 * </pre>
 * <p>
 * The Dice similarity coefficient is identical to Simon White, but unlike Simon
 * White the occurrence (cardinality) of an entry is not taken into account.
 * E.g. {@code [hello, world]} and {@code [hello, world, hello, world]} would be
 * identical when compared with Dice but are dissimilar when Simon White is
 * used.
 * <p>
 * This class is immutable and thread-safe.
 *
 * @see Dice
 * @see "Wikipedia - Sørensen–Dice coefficient"
 *
 * @param <T>
 *            type of the token
 */
public final class SimonWhite<T> implements MultisetMetric<T>, MultisetDistance<T> {

    /**
     * Computes the similarity {@code 2 * ∣a ∩ b∣ / (∣a∣ + ∣b∣)} of two
     * multisets.
     *
     * @param a
     *            the first multiset of tokens
     * @param b
     *            the second multiset of tokens
     * @return {@code 1.0f} when both multisets are empty, {@code 0.0f} when
     *         exactly one of them is empty, otherwise twice the size of the
     *         intersection divided by the sum of both sizes
     */
    @Override
    public float compare(Multiset<T> a, Multiset<T> b) {
        if (a.isEmpty() && b.isEmpty()) {
            return 1.0f;
        }
        if (a.isEmpty() || b.isEmpty()) {
            return 0.0f;
        }

        // Smaller set first for performance improvement.
        // See: MultisetIntersectionSize benchmark
        final Multiset<T> smaller;
        final Multiset<T> larger;
        if (a.size() > b.size()) {
            smaller = b;
            larger = a;
        } else {
            smaller = a;
            larger = b;
        }

        // Sum in long arithmetic: adding the two int sizes directly can wrap
        // around to a negative value and flip the sign of the result.
        final long totalSize = (long) a.size() + b.size();

        // 2 * ∣a ∩ b∣ / (∣a∣ + ∣b∣)
        return (2.0f * intersection(smaller, larger).size()) / totalSize;
    }

    /**
     * Computes the distance between two multisets as
     * {@code 1 - compare(a, b)}.
     *
     * @param a
     *            the first multiset of tokens
     * @param b
     *            the second multiset of tokens
     * @return one minus the similarity of {@code a} and {@code b}
     */
    @Override
    public float distance(Multiset<T> a, Multiset<T> b) {
        return 1.0f - compare(a, b);
    }

    /**
     * Returns the fixed name of this metric.
     *
     * @return the string {@code "SimonWhite"}
     */
    @Override
    public String toString() {
        return "SimonWhite";
    }
}