// org.simmetrics.metrics.StringMetrics (Maven / Gradle / Ivy)
/*
* #%L
* Simmetrics Core
* %%
* Copyright (C) 2014 - 2016 Simmetrics Authors
* %%
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
* #L%
*/
package org.simmetrics.metrics;
import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.builders.StringMetricBuilder.with;
import static org.simmetrics.simplifiers.Simplifiers.chain;
import static org.simmetrics.tokenizers.Tokenizers.qGram;
import static org.simmetrics.tokenizers.Tokenizers.qGramWithPadding;
import static org.simmetrics.tokenizers.Tokenizers.whitespace;
import java.util.List;
import java.util.Set;
import org.simmetrics.Metric;
import org.simmetrics.StringMetric;
import org.simmetrics.builders.StringMetricBuilder;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Soundex;
import org.simmetrics.tokenizers.Tokenizer;
import com.google.common.collect.Multiset;
/**
 * Utility class for string similarity metrics.
 * <p>
 * Consists of well known string similarity metrics and methods to create
 * string similarity metrics from list, set or multiset metrics. All metrics
 * are set up with sensible defaults; to customize a metric use
 * {@link StringMetricBuilder}.
 * <p>
 * The created similarity metrics are immutable and thread-safe provided all
 * their components are also immutable and thread-safe.
 */
public final class StringMetrics {
/**
 * Returns a cosine similarity metric over tokens in a string. The tokens
 * are created by splitting the string on whitespace.
 *
 * @return a cosine similarity metric
 *
 * @see CosineSimilarity
 */
public static StringMetric cosineSimilarity() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new CosineSimilarity<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a block distance similarity metric over tokens in a string. The
 * tokens are created by splitting the string on whitespace.
 *
 * @return a block distance metric
 *
 * @see BlockDistance
 */
public static StringMetric blockDistance() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new BlockDistance<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a Damerau-Levenshtein string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Damerau-Levenshtein string similarity metric
 *
 * @see DamerauLevenshtein
 */
public static StringMetric damerauLevenshtein() {
return new DamerauLevenshtein();
}
/**
 * Returns a Dice similarity metric over tokens in a string. The tokens are
 * created by splitting the string on whitespace.
 *
 * @return a Dice similarity metric
 *
 * @see Dice
 */
public static StringMetric dice() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new Dice<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a Euclidean distance similarity metric over tokens in a string.
 * The tokens are created by splitting the string on whitespace.
 *
 * @return a Euclidean distance similarity metric
 *
 * @see EuclideanDistance
 */
public static StringMetric euclideanDistance() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new EuclideanDistance<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a generalized Jaccard similarity metric over tokens in a string.
 * The tokens are created by splitting the string on whitespace.
 *
 * @return a generalized Jaccard index metric
 *
 * @see GeneralizedJaccard
 */
public static StringMetric generalizedJaccard() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new GeneralizedJaccard<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns an identity string similarity metric. The metric returns 1.0 when
 * the inputs are equal, and 0.0 when they are not.
 *
 * The returned metric delegates both {@code compare} and {@code toString}
 * to an {@link Identity} instance created for this call.
 *
 * @return an identity similarity metric
 *
 * @see Identity
 */
public static StringMetric identity() {
    return new StringMetric() {
        // Typed as Identity<String> (not raw) so the delegated
        // compare(String, String) call is checked by the compiler.
        private final Identity<String> metric = new Identity<>();

        @Override
        public float compare(String a, String b) {
            return metric.compare(a, b);
        }

        @Override
        public String toString() {
            return metric.toString();
        }
    };
}
/**
 * Returns a Jaccard similarity metric over tokens in a string. The tokens
 * are created by splitting the string on whitespace.
 *
 * @return a Jaccard similarity metric
 *
 * @see Jaccard
 */
public static StringMetric jaccard() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new Jaccard<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a Jaro string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Jaro string similarity metric
 *
 * @see Jaro
 */
public static StringMetric jaro() {
return new Jaro();
}
/**
 * Returns a Jaro-Winkler string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Jaro-Winkler string similarity metric
 *
 * @see JaroWinkler
 */
public static StringMetric jaroWinkler() {
return new JaroWinkler();
}
/**
 * Returns a Levenshtein string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Levenshtein string similarity metric
 *
 * @see Levenshtein
 */
public static StringMetric levenshtein() {
return new Levenshtein();
}
/**
 * Returns a normalized Monge-Elkan metric over tokens in a string. The
 * strings are split into tokens on whitespace, and the tokens are compared
 * with a Smith-Waterman-Gotoh metric.
 *
 * @return a normalized Monge-Elkan metric
 *
 * @see MongeElkan
 */
public static StringMetric mongeElkan() {
    final MongeElkan tokenListMetric = new MongeElkan(new SmithWatermanGotoh());
    return with(tokenListMetric)
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a Needleman-Wunsch string similarity metric. (The method and the
 * {@link NeedlemanWunch} class keep the library's "Wunch" spelling.)
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Needleman-Wunsch string similarity metric
 *
 * @see NeedlemanWunch
 */
public static StringMetric needlemanWunch() {
return new NeedlemanWunch();
}
/**
 * Returns an overlap coefficient similarity metric over tokens in a string.
 * The tokens are created by splitting the string on whitespace.
 *
 * @return an overlap coefficient metric
 *
 * @see OverlapCoefficient
 */
public static StringMetric overlapCoefficient() {
    // Explicit <String> type argument: the compared tokens are strings.
    return with(new OverlapCoefficient<String>())
            .tokenize(whitespace())
            .build();
}
/**
 * Returns a q-grams distance similarity metric. Q-grams distance applies a
 * block distance similarity metric over all tri-grams in a string. The
 * tri-grams are created by the padding q-gram tokenizer with q = 3.
 *
 * @return a q-grams distance similarity metric
 *
 * @see BlockDistance
 */
public static StringMetric qGramsDistance() {
    // Explicit <String> type argument: the compared q-grams are strings.
    return with(new BlockDistance<String>())
            .tokenize(qGramWithPadding(3))
            .build();
}
/**
 * Returns a Simon White similarity metric. Simon White applies the
 * quantitative version of Dice similarity over tokens in a string. The
 * tokens are created by splitting the string on whitespace and taking
 * bi-grams of the created tokens.
 *
 * Implementation based on the ideas as outlined in "How to Strike a Match"
 * by Simon White.
 *
 * @return a Simon White similarity metric
 *
 * @see SimonWhite
 */
public static StringMetric simonWhite() {
    // Explicit <String> type argument: the compared bi-grams are strings.
    return with(new SimonWhite<String>())
            .tokenize(whitespace())
            .tokenize(qGram(2))
            .build();
}
/**
 * Returns a Smith-Waterman string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Smith-Waterman string similarity metric
 *
 * @see SmithWaterman
 */
public static StringMetric smithWaterman() {
return new SmithWaterman();
}
/**
 * Returns a Smith-Waterman-Gotoh string similarity metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a Smith-Waterman-Gotoh string similarity metric
 *
 * @see SmithWatermanGotoh
 */
public static StringMetric smithWatermanGotoh() {
return new SmithWatermanGotoh();
}
/**
 * Returns a soundex similarity metric. Both inputs are first simplified
 * with {@link Soundex}; the simplified strings are then compared with the
 * Jaro-Winkler similarity metric.
 *
 * @return a soundex similarity metric
 *
 * @see Soundex
 * @see JaroWinkler
 *
 * @deprecated will be removed due to a lack of a good use case
 */
@Deprecated
public static StringMetric soundex() {
    final JaroWinkler jaroWinkler = new JaroWinkler();
    return with(jaroWinkler)
            .simplify(new Soundex())
            .build();
}
/**
 * Returns a string similarity metric that uses the
 * {@link LongestCommonSubsequence} metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a longest common subsequence metric
 *
 * @see LongestCommonSubsequence
 */
public static StringMetric longestCommonSubsequence() {
return new LongestCommonSubsequence();
}
/**
 * Returns a string similarity metric that uses the
 * {@link LongestCommonSubstring} metric.
 *
 * The metric is returned as constructed; this method does not add a
 * tokenizer or simplifier. A new instance is created on every call.
 *
 * @return a longest common substring metric
 *
 * @see LongestCommonSubstring
 */
public static StringMetric longestCommonSubstring() {
return new LongestCommonSubstring();
}
/**
 * Returns the given metric as a {@link StringMetric}. When the metric
 * already is a {@code StringMetric} it is returned unchanged; otherwise it
 * is wrapped in an adapter that delegates to it.
 *
 * @param metric
 *            a metric for strings
 *
 * @return a similarity metric.
 *
 * @throws NullPointerException
 *             when metric is null
 *
 * @deprecated Use {@link StringMetricBuilder} in favor of directly
 *             constructing a metric.
 */
@Deprecated
public static StringMetric create(Metric<String> metric) {
    // Fail here rather than on the first compare()/toString() of the
    // returned wrapper.
    checkNotNull(metric);
    if (metric instanceof StringMetric) {
        return (StringMetric) metric;
    }
    return new ForString(metric);
}
/**
 * Constructs a new composite string similarity metric. The simplifier will
 * be applied before the metric compares the strings.
 *
 * When the metric is one of the composites created by this class, it is
 * taken apart and rebuilt around the same underlying metric (and tokenizer,
 * if any) instead of being wrapped once more. If the composite already has
 * a simplifier, the given simplifier is chained in front of it.
 *
 * @param metric
 *            a metric for strings
 * @param simplifier
 *            a simplifier
 * @return a new composite similarity metric
 *
 * @throws NullPointerException
 *             when either metric or simplifier are null
 *
 * @see StringMetricBuilder
 *
 * @deprecated Use {@link StringMetricBuilder} in favor of directly
 *             constructing a metric.
 */
@Deprecated
public static StringMetric create(Metric<String> metric,
        Simplifier simplifier) {
    if (metric instanceof ForString) {
        ForString fs = (ForString) metric;
        return new ForStringWithSimplifier(fs.getMetric(), simplifier);
    }
    if (metric instanceof ForStringWithSimplifier) {
        ForStringWithSimplifier fsws = (ForStringWithSimplifier) metric;
        return new ForStringWithSimplifier(fsws.getMetric(),
                chain(simplifier, fsws.getSimplifier()));
    }
    if (metric instanceof ForList) {
        ForList fl = (ForList) metric;
        return createForListMetric(fl.getMetric(), simplifier,
                fl.getTokenizer());
    }
    if (metric instanceof ForListWithSimplifier) {
        ForListWithSimplifier flws = (ForListWithSimplifier) metric;
        return createForListMetric(flws.getMetric(),
                chain(simplifier, flws.getSimplifier()),
                flws.getTokenizer());
    }
    if (metric instanceof ForSet) {
        ForSet fs = (ForSet) metric;
        return createForSetMetric(fs.getMetric(), simplifier,
                fs.getTokenizer());
    }
    if (metric instanceof ForSetWithSimplifier) {
        ForSetWithSimplifier fsws = (ForSetWithSimplifier) metric;
        return createForSetMetric(fsws.getMetric(),
                chain(simplifier, fsws.getSimplifier()),
                fsws.getTokenizer());
    }
    // Multiset composites are unwrapped the same way as the list and set
    // composites above, rather than falling through to the generic wrapper.
    if (metric instanceof ForMultiset) {
        ForMultiset fm = (ForMultiset) metric;
        return createForMultisetMetric(fm.getMetric(), simplifier,
                fm.getTokenizer());
    }
    if (metric instanceof ForMultisetWithSimplifier) {
        ForMultisetWithSimplifier fmws = (ForMultisetWithSimplifier) metric;
        return createForMultisetMetric(fmws.getMetric(),
                chain(simplifier, fmws.getSimplifier()),
                fmws.getTokenizer());
    }
    // Any other metric (including null, which the constructor rejects).
    return new ForStringWithSimplifier(metric, simplifier);
}
/**
* Creates a new composite string similarity metric.The tokenizer is used to
* tokenize the simplified strings. The list metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite list metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForListMetric(Metric> metric,
Simplifier simplifier, Tokenizer tokenizer) {
return new ForListWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string similarity metric. The tokenizer is used
* to tokenize the strings. The list metric compares the the tokens.
*
* @param metric
* a list metric
* @param tokenizer
* a tokenizer
* @return a new composite similarity metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForListMetric(Metric> metric,
Tokenizer tokenizer) {
return new ForList(metric, tokenizer);
}
/**
* Creates a new composite string similarity metric.The tokenizer is used to
* tokenize the simplified strings. The set metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite similarity metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForSetMetric(Metric> metric,
Simplifier simplifier, Tokenizer tokenizer) {
return new ForSetWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string similarity metric. The tokenizer is used
* to tokenize the strings. The set metric compares the the tokens.
*
* @param metric
* a set metric
*
* @param tokenizer
* a tokenizer
* @return a new composite similarity metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForSetMetric(Metric> metric,
Tokenizer tokenizer) {
return new ForSet(metric, tokenizer);
}
/**
* Creates a new composite string similarity metric.The tokenizer is used to
* tokenize the simplified strings. The set metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite similarity metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForMultisetMetric(
Metric> metric, Simplifier simplifier,
Tokenizer tokenizer) {
return new ForMultisetWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string similarity metric. The tokenizer is used
* to tokenize the strings. The set metric compares the the tokens.
*
* @param metric
* a set metric
*
* @param tokenizer
* a tokenizer
* @return a new composite similarity metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*
* @deprecated Use {@link StringMetricBuilder} in favor of directly
* constructing a metric.
*/
@Deprecated
public static StringMetric createForMultisetMetric(
Metric> metric, Tokenizer tokenizer) {
return new ForMultiset(metric, tokenizer);
}
static final class ForList implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForList(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToList(a),
tokenizer.tokenizeToList(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForListWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForListWithSimplifier(Metric> metric,
Simplifier simplifier, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(
tokenizer.tokenizeToList(simplifier.simplify(a)),
tokenizer.tokenizeToList(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
static final class ForSet implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForSet(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToSet(a),
tokenizer.tokenizeToSet(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForSetWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForSetWithSimplifier(Metric> metric, Simplifier simplifier,
Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(
tokenizer.tokenizeToSet(simplifier.simplify(a)),
tokenizer.tokenizeToSet(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
static final class ForMultiset implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForMultiset(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToMultiset(a),
tokenizer.tokenizeToMultiset(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForMultisetWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForMultisetWithSimplifier(Metric> metric,
Simplifier simplifier, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(
tokenizer.tokenizeToMultiset(simplifier.simplify(a)),
tokenizer.tokenizeToMultiset(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
/**
 * Presents a {@code Metric<String>} as a {@link StringMetric} by delegating
 * {@code compare} and {@code toString} to it.
 */
static final class ForString implements StringMetric {

    private final Metric<String> metric;

    /**
     * @throws NullPointerException
     *             when metric is null
     */
    ForString(Metric<String> metric) {
        // Reject null at construction, as the other wrappers in this file
        // do, instead of failing on the first compare() or toString().
        checkNotNull(metric);
        this.metric = metric;
    }

    @Override
    public float compare(String a, String b) {
        return metric.compare(a, b);
    }

    @Override
    public String toString() {
        return metric.toString();
    }

    /** Returns the wrapped metric. */
    Metric<String> getMetric() {
        return metric;
    }
}
/**
 * String metric that simplifies both inputs and compares the simplified
 * strings with the wrapped metric.
 */
static final class ForStringWithSimplifier implements StringMetric {

    private final Metric<String> metric;
    private final Simplifier simplifier;

    /**
     * @throws NullPointerException
     *             when either metric or simplifier are null
     */
    ForStringWithSimplifier(Metric<String> metric, Simplifier simplifier) {
        checkNotNull(metric);
        checkNotNull(simplifier);
        this.metric = metric;
        this.simplifier = simplifier;
    }

    @Override
    public float compare(String a, String b) {
        return metric.compare(simplifier.simplify(a),
                simplifier.simplify(b));
    }

    /** Returns the wrapped metric. */
    Metric<String> getMetric() {
        return metric;
    }

    /** Returns the simplifier applied to both inputs. */
    Simplifier getSimplifier() {
        return simplifier;
    }

    @Override
    public String toString() {
        return metric + " [" + simplifier + "]";
    }
}
/**
 * Private constructor: this class only exposes static members and is not
 * meant to be instantiated.
 */
private StringMetrics() {
// Utility class; intentionally empty.
}
}