All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.simmetrics.metrics.StringMetrics Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2015 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

package org.simmetrics.metrics;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.simplifiers.Simplifiers.chain;
import static org.simmetrics.tokenizers.Tokenizers.chain;
import static org.simmetrics.tokenizers.Tokenizers.qGram;
import static org.simmetrics.tokenizers.Tokenizers.whitespace;

import java.util.List;
import java.util.Set;

import org.simmetrics.Metric;
import org.simmetrics.StringMetric;
import org.simmetrics.builders.StringMetricBuilder;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Soundex;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.tokenizers.Tokenizers;

import com.google.common.collect.Multiset;

/**
 * Utility class for StringMetrics.
 * 

* Consists of well known metrics and methods to create string metrics from * list- or set metrics. All metrics are setup with sensible defaults, to * customize metrics use {@link StringMetricBuilder}. *

* The available metrics are: * *

    *
  • Block Distance *
  • Cosine Similarity *
  • Damerau Levenshtein *
  • Dice *
  • SimonWhite (Quantitative Dice) *
  • Euclidean Distance *
  • Jaccard *
  • Generalized Jaccard *
  • Jaro *
  • Jaro-Winkler *
  • LevenShtein *
  • Monge-Elkan *
  • NeedleMan Wunch *
  • Overlap Coefficient *
  • q-Grams Distance *
  • Smith-Waterman *
  • Smith-Waterman-Gotoh *
  • Soundex *
* *

* All methods return immutable objects provided the arguments are also * immutable. */ public final class StringMetrics { /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link CosineSimilarity} metric. * * @return a cosine similarity metric */ public static StringMetric cosineSimilarity() { return createForMultisetMetric(new CosineSimilarity(), whitespace()); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link BlockDistance} metric. * * @return a block distance metric */ public static StringMetric blockDistance() { return createForMultisetMetric(new BlockDistance(), whitespace()); } /** * Returns a string metric that uses a {@link DamerauLevenshtein} metric. * * @return a damerau levenshtein metric */ public static StringMetric damerauLevenshtein() { return new DamerauLevenshtein(); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link Dice} metric. * * @return a dice metric */ public static StringMetric dice() { return createForSetMetric(new Dice(), whitespace()); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link EuclideanDistance} metric. * * @return a Euclidean distance similarity metric */ public static StringMetric euclideanDistance() { return createForMultisetMetric(new EuclideanDistance(), whitespace()); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link GeneralizedJaccard} metric. * * @return a generalized jaccard index metric */ public static StringMetric generalizedJaccard() { return createForMultisetMetric(new GeneralizedJaccard(), whitespace()); } /** * Returns an string metric that uses the {@link Identity} metric. * * @return an identity string metric */ public static StringMetric identity() { return create(new Identity()); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link Jaccard} metric. 
* * @return a Jaccard similarity metric */ public static StringMetric jaccard() { return createForSetMetric(new Jaccard(), whitespace()); } /** * Returns a string metric that uses the {@link Jaro} metric. * * @return a Jaro metric */ public static StringMetric jaro() { return new Jaro(); } /** * Returns a string metric that uses the {@link JaroWinkler} metric. * * @return a Jaro-Winkler metric */ public static StringMetric jaroWinkler() { return new JaroWinkler(); } /** * Returns a string metric that uses the {@link Levenshtein} metric. * * @return a Levenshtein metric */ public static StringMetric levenshtein() { return new Levenshtein(); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link MongeElkan} metric with an internal {@link SmithWatermanGotoh} * metric. * * @return a Monge-Elkan metric */ public static StringMetric mongeElkan() { return createForListMetric(new MongeElkan(new SmithWatermanGotoh()), whitespace()); } /** * Returns a string metric that uses the {@link NeedlemanWunch} metric. * * @return a Needleman-Wunch metric */ public static StringMetric needlemanWunch() { return new NeedlemanWunch(); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} and * the {@link OverlapCoefficient} metric. * * @return a overlap coefficient metric */ public static StringMetric overlapCoefficient() { return createForSetMetric(new OverlapCoefficient(), whitespace()); } /** * Returns a string metric that uses a * {@link Tokenizers#qGramWithPadding(int)} for {@code q=3} and the * {@link BlockDistance} metric. * * @return a q-grams distance metric */ public static StringMetric qGramsDistance() { return createForMultisetMetric(new BlockDistance(), Tokenizers.qGramWithPadding(3)); } /** * Returns a string metric that uses a {@link Tokenizers#whitespace()} * followed by a {@link Tokenizers#qGramWithPadding(int)} for {@code q=2} * and the {@link SimonWhite} metric. 
* * @return a Simon White metric */ public static StringMetric simonWhite() { return createForMultisetMetric(new SimonWhite(), chain(whitespace(), qGram(2))); } /** * Returns a string metric that uses the {@link SmithWaterman} metric. * * @return a Smith-Waterman metric */ public static StringMetric smithWaterman() { return new SmithWaterman(); } /** * Returns a string metric that uses the {@link SmithWatermanGotoh} metric. * * @return a Smith-Waterman-Gotoh metric */ public static StringMetric smithWatermanGotoh() { return new SmithWatermanGotoh(); } /** * Returns a string metric that uses a {@link Soundex} and * {@link JaroWinkler} metric. * * @return a Soundex metric */ public static StringMetric soundex() { return create(new JaroWinkler(), new Soundex()); } /** * Either constructs a new string metric or returns the original metric. * * @param metric * a metric for strings * * @return a string metric. */ public static StringMetric create(Metric metric) { if (metric instanceof StringMetric) { return (StringMetric) metric; } return new ForString(metric); } /** * Constructs a new composite string metric. The simplifier will be applied * before the metric compares the strings. 
* * @param metric * a list metric * @param simplifier * a simplifier * @return a new composite string metric * * @throws NullPointerException * when either metric or simplifier are null * * @see StringMetricBuilder */ public static StringMetric create(Metric metric, Simplifier simplifier) { if (metric instanceof ForString) { ForString forString = (ForString) metric; return new ForStringWithSimplifier(forString.getMetric(), simplifier); } else if (metric instanceof ForStringWithSimplifier) { ForStringWithSimplifier fsws = (ForStringWithSimplifier) metric; return new ForStringWithSimplifier(fsws.getMetric(), chain(simplifier, fsws.getSimplifier())); } else if (metric instanceof ForList) { ForList fl = (ForList) metric; return createForListMetric(fl.getMetric(), simplifier, fl.getTokenizer()); } else if (metric instanceof ForListWithSimplifier) { ForListWithSimplifier fl = (ForListWithSimplifier) metric; return createForListMetric(fl.getMetric(), chain(simplifier, fl.getSimplifier()), fl.getTokenizer()); } else if (metric instanceof ForSet) { ForSet fl = (ForSet) metric; return createForSetMetric(fl.getMetric(), simplifier, fl.getTokenizer()); } else if (metric instanceof ForSetWithSimplifier) { ForSetWithSimplifier fl = (ForSetWithSimplifier) metric; return createForSetMetric(fl.getMetric(), chain(simplifier, fl.getSimplifier()), fl.getTokenizer()); } return new ForStringWithSimplifier(metric, simplifier); } /** * Creates a new composite string metric.The tokenizer is used to tokenize * the simplified strings. The list metric compares the the tokens. 
* * @param metric * a list metric * @param simplifier * a simplifier * @param tokenizer * a tokenizer * @return a new composite list metric * * @throws NullPointerException * when either metric, simplifier or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForListMetric(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return new ForListWithSimplifier(metric, simplifier, tokenizer); } /** * Creates a new composite string metric. The tokenizer is used to tokenize * the strings. The list metric compares the the tokens. * * @param metric * a list metric * @param tokenizer * a tokenizer * @return a new composite string metric * * @throws NullPointerException * when either metric or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForListMetric(Metric> metric, Tokenizer tokenizer) { return new ForList(metric, tokenizer); } /** * Creates a new composite string metric.The tokenizer is used to tokenize * the simplified strings. The set metric compares the the tokens. * * @param metric * a list metric * @param simplifier * a simplifier * @param tokenizer * a tokenizer * @return a new composite string metric * * @throws NullPointerException * when either metric, simplifier or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForSetMetric(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return new ForSetWithSimplifier(metric, simplifier, tokenizer); } /** * Creates a new composite string metric. The tokenizer is used to tokenize * the strings. The set metric compares the the tokens. 
* * @param metric * a set metric * * @param tokenizer * a tokenizer * @return a new composite string metric * * @throws NullPointerException * when either metric or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForSetMetric(Metric> metric, Tokenizer tokenizer) { return new ForSet(metric, tokenizer); } /** * Creates a new composite string metric.The tokenizer is used to tokenize * the simplified strings. The set metric compares the the tokens. * * @param metric * a list metric * @param simplifier * a simplifier * @param tokenizer * a tokenizer * @return a new composite string metric * * @throws NullPointerException * when either metric, simplifier or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForMultisetMetric(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return new ForMultisetWithSimplifier(metric, simplifier, tokenizer); } /** * Creates a new composite string metric. The tokenizer is used to tokenize * the strings. The set metric compares the the tokens. 
* * @param metric * a set metric * * @param tokenizer * a tokenizer * @return a new composite string metric * * @throws NullPointerException * when either metric or tokenizer are null * * @see StringMetricBuilder */ public static StringMetric createForMultisetMetric(Metric> metric, Tokenizer tokenizer) { return new ForMultiset(metric, tokenizer); } static final class ForList implements StringMetric { private final Metric> metric; private final Tokenizer tokenizer; ForList(Metric> metric, Tokenizer tokenizer) { checkNotNull(metric); checkNotNull(tokenizer); this.metric = metric; this.tokenizer = tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToList(a), tokenizer.tokenizeToList(b)); } Metric> getMetric() { return metric; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + tokenizer + "]"; } } static final class ForListWithSimplifier implements StringMetric { private final Metric> metric; private final Simplifier simplifier; private final Tokenizer tokenizer; ForListWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { checkNotNull(metric); checkNotNull(simplifier); checkNotNull(tokenizer); this.metric = metric; this.simplifier = simplifier; this.tokenizer = tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToList(simplifier.simplify(a)), tokenizer.tokenizeToList(simplifier.simplify(b))); } Metric> getMetric() { return metric; } Simplifier getSimplifier() { return simplifier; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + simplifier + " -> " + tokenizer + "]"; } } static final class ForSet implements StringMetric { private final Metric> metric; private final Tokenizer tokenizer; ForSet(Metric> metric, Tokenizer tokenizer) { checkNotNull(metric); checkNotNull(tokenizer); this.metric = metric; this.tokenizer = 
tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToSet(a), tokenizer.tokenizeToSet(b)); } Metric> getMetric() { return metric; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + tokenizer + "]"; } } static final class ForSetWithSimplifier implements StringMetric { private final Metric> metric; private final Simplifier simplifier; private final Tokenizer tokenizer; ForSetWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { checkNotNull(metric); checkNotNull(simplifier); checkNotNull(tokenizer); this.metric = metric; this.simplifier = simplifier; this.tokenizer = tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToSet(simplifier.simplify(a)), tokenizer.tokenizeToSet(simplifier.simplify(b))); } Metric> getMetric() { return metric; } Simplifier getSimplifier() { return simplifier; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + simplifier + " -> " + tokenizer + "]"; } } static final class ForMultiset implements StringMetric { private final Metric> metric; private final Tokenizer tokenizer; ForMultiset(Metric> metric, Tokenizer tokenizer) { checkNotNull(metric); checkNotNull(tokenizer); this.metric = metric; this.tokenizer = tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToMultiset(a), tokenizer.tokenizeToMultiset(b)); } Metric> getMetric() { return metric; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + tokenizer + "]"; } } static final class ForMultisetWithSimplifier implements StringMetric { private final Metric> metric; private final Simplifier simplifier; private final Tokenizer tokenizer; ForMultisetWithSimplifier(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { 
checkNotNull(metric); checkNotNull(simplifier); checkNotNull(tokenizer); this.metric = metric; this.simplifier = simplifier; this.tokenizer = tokenizer; } @Override public float compare(String a, String b) { return metric.compare(tokenizer.tokenizeToMultiset(simplifier.simplify(a)), tokenizer.tokenizeToMultiset(simplifier.simplify(b))); } Metric> getMetric() { return metric; } Simplifier getSimplifier() { return simplifier; } Tokenizer getTokenizer() { return tokenizer; } @Override public String toString() { return metric + " [" + simplifier + " -> " + tokenizer + "]"; } } static final class ForString implements StringMetric { private final Metric metric; ForString(Metric metric) { this.metric = metric; } @Override public float compare(String a, String b) { return metric.compare(a, b); } @Override public String toString() { return metric.toString(); } Metric getMetric() { return metric; } } static final class ForStringWithSimplifier implements StringMetric { private final Metric metric; private final Simplifier simplifier; ForStringWithSimplifier(Metric metric, Simplifier simplifier) { checkNotNull(metric); checkNotNull(simplifier); this.metric = metric; this.simplifier = simplifier; } @Override public float compare(String a, String b) { return metric.compare(simplifier.simplify(a), simplifier.simplify(b)); } Metric getMetric() { return metric; } Simplifier getSimplifier() { return simplifier; } @Override public String toString() { return metric + " [" + simplifier + "]"; } } private StringMetrics() { // Utility class. } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy