// org.simmetrics.metrics.StringMetrics Maven / Gradle / Ivy
/*
* #%L
* Simmetrics Core
* %%
* Copyright (C) 2014 - 2015 Simmetrics Authors
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package org.simmetrics.metrics;
import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.simplifiers.Simplifiers.chain;
import static org.simmetrics.tokenizers.Tokenizers.chain;
import static org.simmetrics.tokenizers.Tokenizers.qGram;
import static org.simmetrics.tokenizers.Tokenizers.whitespace;
import java.util.List;
import java.util.Set;
import org.simmetrics.Metric;
import org.simmetrics.StringMetric;
import org.simmetrics.builders.StringMetricBuilder;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Soundex;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.tokenizers.Tokenizers;
import com.google.common.collect.Multiset;
/**
 * Utility class for StringMetrics.
 * <p>
 * Consists of well known metrics and methods to create string metrics from
 * list, set or multiset metrics. All metrics are set up with sensible
 * defaults; to customize metrics use {@link StringMetricBuilder}.
 * <p>
 * The available metrics are:
 *
 * <ul>
 * <li>Block Distance
 * <li>Cosine Similarity
 * <li>Damerau Levenshtein
 * <li>Dice
 * <li>SimonWhite (Quantitative Dice)
 * <li>Euclidean Distance
 * <li>Jaccard
 * <li>Generalized Jaccard
 * <li>Identity
 * <li>Jaro
 * <li>Jaro-Winkler
 * <li>Levenshtein
 * <li>Monge-Elkan
 * <li>Needleman-Wunch
 * <li>Overlap Coefficient
 * <li>q-Grams Distance
 * <li>Smith-Waterman
 * <li>Smith-Waterman-Gotoh
 * <li>Soundex
 * </ul>
 *
 * <p>
 * All methods return immutable objects provided the arguments are also
 * immutable.
 */
public final class StringMetrics {
/**
 * Creates the default cosine similarity metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link CosineSimilarity} metric.
 *
 * @return a cosine similarity metric
 */
public static StringMetric cosineSimilarity() {
    return createForMultisetMetric(new CosineSimilarity(), Tokenizers.whitespace());
}
/**
 * Creates the default block distance metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link BlockDistance} metric.
 *
 * @return a block distance metric
 */
public static StringMetric blockDistance() {
    return createForMultisetMetric(new BlockDistance(), Tokenizers.whitespace());
}
/**
 * Creates a new {@link DamerauLevenshtein} metric, which compares the
 * strings directly.
 *
 * @return a Damerau-Levenshtein metric
 */
public static StringMetric damerauLevenshtein() {
    final StringMetric damerauLevenshtein = new DamerauLevenshtein();
    return damerauLevenshtein;
}
/**
 * Creates the default Dice metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link Dice} metric.
 *
 * @return a Dice metric
 */
public static StringMetric dice() {
    return createForSetMetric(new Dice(), Tokenizers.whitespace());
}
/**
 * Creates the default Euclidean distance metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to an {@link EuclideanDistance} metric.
 *
 * @return a Euclidean distance similarity metric
 */
public static StringMetric euclideanDistance() {
    return createForMultisetMetric(new EuclideanDistance(), Tokenizers.whitespace());
}
/**
 * Creates the default generalized Jaccard metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link GeneralizedJaccard} metric.
 *
 * @return a generalized Jaccard index metric
 */
public static StringMetric generalizedJaccard() {
    return createForMultisetMetric(new GeneralizedJaccard(), Tokenizers.whitespace());
}
/**
 * Creates a string metric backed by the {@link Identity} metric.
 *
 * The metric is adapted to a string metric through {@link #create(Metric)}.
 *
 * @return an identity string metric
 */
public static StringMetric identity() {
    final StringMetric identity = create(new Identity());
    return identity;
}
/**
 * Creates the default Jaccard metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link Jaccard} metric.
 *
 * @return a Jaccard similarity metric
 */
public static StringMetric jaccard() {
    return createForSetMetric(new Jaccard(), Tokenizers.whitespace());
}
/**
 * Creates a new {@link Jaro} metric, which compares the strings directly.
 *
 * @return a Jaro metric
 */
public static StringMetric jaro() {
    final StringMetric jaro = new Jaro();
    return jaro;
}
/**
 * Creates a new {@link JaroWinkler} metric, which compares the strings
 * directly.
 *
 * @return a Jaro-Winkler metric
 */
public static StringMetric jaroWinkler() {
    final StringMetric jaroWinkler = new JaroWinkler();
    return jaroWinkler;
}
/**
 * Creates a new {@link Levenshtein} metric, which compares the strings
 * directly.
 *
 * @return a Levenshtein metric
 */
public static StringMetric levenshtein() {
    final StringMetric levenshtein = new Levenshtein();
    return levenshtein;
}
/**
 * Creates the default Monge-Elkan metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to a {@link MongeElkan} metric that is
 * constructed around a {@link SmithWatermanGotoh} metric.
 *
 * @return a Monge-Elkan metric
 */
public static StringMetric mongeElkan() {
    final SmithWatermanGotoh tokenMetric = new SmithWatermanGotoh();
    return createForListMetric(new MongeElkan(tokenMetric), Tokenizers.whitespace());
}
/**
 * Creates a new {@link NeedlemanWunch} metric, which compares the strings
 * directly.
 *
 * @return a Needleman-Wunch metric
 */
public static StringMetric needlemanWunch() {
    final StringMetric needlemanWunch = new NeedlemanWunch();
    return needlemanWunch;
}
/**
 * Creates the default overlap coefficient metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#whitespace()} and the
 * resulting tokens are handed to an {@link OverlapCoefficient} metric.
 *
 * @return an overlap coefficient metric
 */
public static StringMetric overlapCoefficient() {
    return createForSetMetric(new OverlapCoefficient(), Tokenizers.whitespace());
}
/**
 * Creates the default q-grams distance metric for strings.
 *
 * The strings are tokenized with {@link Tokenizers#qGramWithPadding(int)}
 * using {@code q=3} and the resulting tokens are handed to a
 * {@link BlockDistance} metric.
 *
 * @return a q-grams distance metric
 */
public static StringMetric qGramsDistance() {
    final int q = 3;
    return createForMultisetMetric(new BlockDistance(), Tokenizers.qGramWithPadding(q));
}
/**
 * Creates the default Simon White metric for strings.
 *
 * The tokenizer is a chain of {@link Tokenizers#whitespace()} followed by
 * {@link Tokenizers#qGram(int)} for {@code q=2} (the q-gram tokenizer
 * without padding). The resulting tokens are handed to a
 * {@link SimonWhite} metric.
 *
 * @return a Simon White metric
 */
public static StringMetric simonWhite() {
    final int q = 2;
    return createForMultisetMetric(new SimonWhite(),
            Tokenizers.chain(Tokenizers.whitespace(), Tokenizers.qGram(q)));
}
/**
 * Creates a new {@link SmithWaterman} metric, which compares the strings
 * directly.
 *
 * @return a Smith-Waterman metric
 */
public static StringMetric smithWaterman() {
    final StringMetric smithWaterman = new SmithWaterman();
    return smithWaterman;
}
/**
 * Creates a new {@link SmithWatermanGotoh} metric, which compares the
 * strings directly.
 *
 * @return a Smith-Waterman-Gotoh metric
 */
public static StringMetric smithWatermanGotoh() {
    final StringMetric smithWatermanGotoh = new SmithWatermanGotoh();
    return smithWatermanGotoh;
}
/**
 * Creates the default Soundex metric for strings.
 *
 * The metric combines a {@link Soundex} simplifier with a
 * {@link JaroWinkler} metric through {@link #create(Metric, Simplifier)}.
 *
 * @return a Soundex metric
 */
public static StringMetric soundex() {
    final StringMetric comparator = new JaroWinkler();
    final Simplifier encoder = new Soundex();
    return create(comparator, encoder);
}
/**
 * Adapts a metric over strings to the {@link StringMetric} interface.
 *
 * A metric that already is a {@code StringMetric} is returned as is; any
 * other metric is wrapped in a new string metric that delegates to it.
 *
 * @param metric
 *            a metric for strings
 *
 * @return a string metric
 */
public static StringMetric create(Metric<String> metric) {
    if (metric instanceof StringMetric) {
        // Already the right type; avoid a needless wrapper.
        return (StringMetric) metric;
    }
    return new ForString(metric);
}
/**
 * Constructs a new composite string metric. The simplifier will be applied
 * before the metric compares the strings.
 *
 * When the metric is one of the composite metrics created by this class it
 * is taken apart and rebuilt rather than wrapped a second time. If it
 * already carries a simplifier, the given simplifier and the existing one
 * are combined with {@code chain(simplifier, existing)}.
 *
 * @param metric
 *            a metric for strings
 * @param simplifier
 *            a simplifier
 * @return a new composite string metric
 *
 * @throws NullPointerException
 *             when either metric or simplifier are null
 *
 * @see StringMetricBuilder
 */
public static StringMetric create(Metric<String> metric, Simplifier simplifier) {
    // Fail fast so the documented NullPointerException does not depend on
    // which branch below happens to be taken.
    checkNotNull(metric);
    checkNotNull(simplifier);

    if (metric instanceof ForString) {
        ForString forString = (ForString) metric;
        return new ForStringWithSimplifier(forString.getMetric(), simplifier);
    } else if (metric instanceof ForStringWithSimplifier) {
        ForStringWithSimplifier fsws = (ForStringWithSimplifier) metric;
        return new ForStringWithSimplifier(fsws.getMetric(), chain(simplifier, fsws.getSimplifier()));
    } else if (metric instanceof ForList) {
        ForList fl = (ForList) metric;
        return createForListMetric(fl.getMetric(), simplifier, fl.getTokenizer());
    } else if (metric instanceof ForListWithSimplifier) {
        ForListWithSimplifier fl = (ForListWithSimplifier) metric;
        return createForListMetric(fl.getMetric(), chain(simplifier, fl.getSimplifier()), fl.getTokenizer());
    } else if (metric instanceof ForSet) {
        ForSet fs = (ForSet) metric;
        return createForSetMetric(fs.getMetric(), simplifier, fs.getTokenizer());
    } else if (metric instanceof ForSetWithSimplifier) {
        ForSetWithSimplifier fs = (ForSetWithSimplifier) metric;
        return createForSetMetric(fs.getMetric(), chain(simplifier, fs.getSimplifier()), fs.getTokenizer());
    } else if (metric instanceof ForMultiset) {
        // Multiset composites are unwrapped the same way as list and set
        // composites instead of being wrapped a second time.
        ForMultiset fm = (ForMultiset) metric;
        return createForMultisetMetric(fm.getMetric(), simplifier, fm.getTokenizer());
    } else if (metric instanceof ForMultisetWithSimplifier) {
        ForMultisetWithSimplifier fm = (ForMultisetWithSimplifier) metric;
        return createForMultisetMetric(fm.getMetric(), chain(simplifier, fm.getSimplifier()),
                fm.getTokenizer());
    }
    return new ForStringWithSimplifier(metric, simplifier);
}
/**
* Creates a new composite string metric.The tokenizer is used to tokenize
* the simplified strings. The list metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite list metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForListMetric(Metric> metric, Simplifier simplifier,
Tokenizer tokenizer) {
return new ForListWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string metric. The tokenizer is used to tokenize
* the strings. The list metric compares the the tokens.
*
* @param metric
* a list metric
* @param tokenizer
* a tokenizer
* @return a new composite string metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForListMetric(Metric> metric, Tokenizer tokenizer) {
return new ForList(metric, tokenizer);
}
/**
* Creates a new composite string metric.The tokenizer is used to tokenize
* the simplified strings. The set metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite string metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForSetMetric(Metric> metric, Simplifier simplifier,
Tokenizer tokenizer) {
return new ForSetWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string metric. The tokenizer is used to tokenize
* the strings. The set metric compares the the tokens.
*
* @param metric
* a set metric
*
* @param tokenizer
* a tokenizer
* @return a new composite string metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForSetMetric(Metric> metric, Tokenizer tokenizer) {
return new ForSet(metric, tokenizer);
}
/**
* Creates a new composite string metric.The tokenizer is used to tokenize
* the simplified strings. The set metric compares the the tokens.
*
* @param metric
* a list metric
* @param simplifier
* a simplifier
* @param tokenizer
* a tokenizer
* @return a new composite string metric
*
* @throws NullPointerException
* when either metric, simplifier or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForMultisetMetric(Metric> metric, Simplifier simplifier,
Tokenizer tokenizer) {
return new ForMultisetWithSimplifier(metric, simplifier, tokenizer);
}
/**
* Creates a new composite string metric. The tokenizer is used to tokenize
* the strings. The set metric compares the the tokens.
*
* @param metric
* a set metric
*
* @param tokenizer
* a tokenizer
* @return a new composite string metric
*
* @throws NullPointerException
* when either metric or tokenizer are null
*
* @see StringMetricBuilder
*/
public static StringMetric createForMultisetMetric(Metric> metric, Tokenizer tokenizer) {
return new ForMultiset(metric, tokenizer);
}
static final class ForList implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForList(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToList(a), tokenizer.tokenizeToList(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForListWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForListWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToList(simplifier.simplify(a)),
tokenizer.tokenizeToList(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
static final class ForSet implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForSet(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToSet(a), tokenizer.tokenizeToSet(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForSetWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForSetWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToSet(simplifier.simplify(a)),
tokenizer.tokenizeToSet(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
static final class ForMultiset implements StringMetric {
private final Metric> metric;
private final Tokenizer tokenizer;
ForMultiset(Metric> metric, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(tokenizer);
this.metric = metric;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToMultiset(a), tokenizer.tokenizeToMultiset(b));
}
Metric> getMetric() {
return metric;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + tokenizer + "]";
}
}
static final class ForMultisetWithSimplifier implements StringMetric {
private final Metric> metric;
private final Simplifier simplifier;
private final Tokenizer tokenizer;
ForMultisetWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) {
checkNotNull(metric);
checkNotNull(simplifier);
checkNotNull(tokenizer);
this.metric = metric;
this.simplifier = simplifier;
this.tokenizer = tokenizer;
}
@Override
public float compare(String a, String b) {
return metric.compare(tokenizer.tokenizeToMultiset(simplifier.simplify(a)),
tokenizer.tokenizeToMultiset(simplifier.simplify(b)));
}
Metric> getMetric() {
return metric;
}
Simplifier getSimplifier() {
return simplifier;
}
Tokenizer getTokenizer() {
return tokenizer;
}
@Override
public String toString() {
return metric + " [" + simplifier + " -> " + tokenizer + "]";
}
}
/**
 * String metric that delegates directly to a metric over strings, without
 * any simplification or tokenization.
 */
static final class ForString implements StringMetric {

    private final Metric<String> metric;

    /**
     * @param metric
     *            the metric that compares the strings
     * @throws NullPointerException
     *             when metric is null
     */
    ForString(Metric<String> metric) {
        // Reject null here, like the other wrappers in this file, rather
        // than failing later inside compare or toString.
        checkNotNull(metric);
        this.metric = metric;
    }

    @Override
    public float compare(String a, String b) {
        return metric.compare(a, b);
    }

    @Override
    public String toString() {
        return metric.toString();
    }

    /** Returns the wrapped metric. */
    Metric<String> getMetric() {
        return metric;
    }
}
/**
 * String metric that simplifies both strings and delegates the comparison
 * of the simplified strings to a metric over strings.
 */
static final class ForStringWithSimplifier implements StringMetric {

    private final Metric<String> metric;
    private final Simplifier simplifier;

    /**
     * @param metric
     *            the metric that compares the simplified strings
     * @param simplifier
     *            the simplifier applied to both strings
     * @throws NullPointerException
     *             when either argument is null
     */
    ForStringWithSimplifier(Metric<String> metric, Simplifier simplifier) {
        checkNotNull(metric);
        checkNotNull(simplifier);
        this.metric = metric;
        this.simplifier = simplifier;
    }

    @Override
    public float compare(String a, String b) {
        return metric.compare(simplifier.simplify(a), simplifier.simplify(b));
    }

    /** Returns the wrapped metric. */
    Metric<String> getMetric() {
        return metric;
    }

    /** Returns the simplifier applied to the strings. */
    Simplifier getSimplifier() {
        return simplifier;
    }

    @Override
    public String toString() {
        return metric + " [" + simplifier + "]";
    }
}
/** Private because this class only exposes static members. */
private StringMetrics() {
    // Utility class; not meant to be instantiated.
}
}