All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.simmetrics.StringMetricBuilder Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * SimMetrics - SimMetrics is a java library of Similarity or Distance Metrics,
 * e.g. Levenshtein Distance, that provide float based similarity measures
 * between String Data. All metrics return consistent measures rather than
 * unbounded similarity scores.
 * 
 * Copyright (C) 2014 SimMetrics authors
 * 
 * This file is part of SimMetrics. This program is free software: you can
 * redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * SimMetrics. If not, see <http://www.gnu.org/licenses/>.
 */
package org.simmetrics;

import static com.google.common.base.Preconditions.checkNotNull;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.utils.CachingSimplifier;
import org.simmetrics.utils.CachingTokenizer;
import org.simmetrics.utils.CompositeSimplifier;
import org.simmetrics.utils.CompositeStringMetric;
import org.simmetrics.utils.CompositeListMetric;
import org.simmetrics.utils.CompositeSetMetric;
import org.simmetrics.utils.FilteringTokenizer;
import org.simmetrics.utils.CompositeTokenizer;
import org.simmetrics.utils.PassThroughSimplifier;
import org.simmetrics.utils.SimplifyingSimplifier;
import org.simmetrics.utils.TokenizingTokenizer;

import com.google.common.base.Predicate;
import com.google.common.base.Predicates;

/**
 * Convenience tool to build string metrics. Any class implementing
 * {@link StringMetric}, {@link ListMetric} or {@link SetMetric} can be used to
 * build a string metric. Supports the addition of simplification, tokenization,
 * filtering and caching to a metric.
 * 
 * 

Metrics

* * A metric is used to measure the similarity between strings. Metrics can work * on strings, lists or sets tokens. To compare strings with a metric that works * on a collection of tokens a tokenizer is required. * *

* By adding simplifiers, tokenizers and filters the effectiveness of a metric * can be improved. The exact combination is generally domain specific. This * builder supports these domain specific customizations. *

* *

 * 
 * {@code
 * 	StringMetric metric = new StringMetricBuilder()
 * 		.with(new CosineSimilarity())
 * 		.simplify(new NonWordCharacter())
 * 		.simplify(new Case.Lower())
 * 		.tokenize(new Whitespace())
 * 		.build();
 * }
 * 
 * 
* *

Simplification

* * Simplification increases the effectiveness of a metric by removing noise and * reducing the dimensionality of the problem. The process maps a a complex * string such as Chilpéric II son of Childeric II to a simpler * format chilperic ii son of childeric ii. This allows string from * different sources to be compared in the same normal form. * *

* Simplification can be done by any class implementing the {@link Simplifier} * interface. * *

Tokenization

* * Tokenization cuts up a string into tokens e.g. * chilperic ii son of childeric ii is tokenized into * [chilperic, ii, son, of, * childeric, ii]. Tokenization can also be done repeatedly by tokenizing * the individual tokens e.g. * [ch,hi,il,il,lp,pe,er,ri,ic, ii, so,on, of, ch,hi,il,ld,de,er,ri,ic, ii] *

* *

 * 
 * {@code
 * 	return new StringMetricBuilder()
 * 			.with(new SimonWhite())
 * 			.tokenize(new Whitespace())
 * 			.tokenize(new QGram(2))
 * 			.build();
 * }
 * 
 * 
* * * * *

* The method of tokenization changes the space in which strings are compared. * The effectiveness depends on the context. A whitespace tokenizer might be * more useful to measure similarity between large bodies of texts whiles a * q-gram tokenizer will work more effectively for matching words. * *

* Tokenization can be done by any class implementing the {@link Tokenizer} * interface and is required for all metrics that work on collections of tokens * rather then whole strings; {@link ListMetric}s and {@link SetMetric}s * *

Filtering

* * * Filtering removes tokens that should not be considered for comparison. For * example removing all tokens with a size less then three from `[chilperic, ii, * son, of, childeric, ii]` results in `[chilperic, son, childeric]`. * * A Filter can be implemented by implementing a the Predicate interface. * *
 * 
 * {@code
 * 				with(new CosineSimilarity())
 * 				.simplify(new Case.Lower())
 * 				.simplify(new WordCharacter())
 * 				.tokenize(new Whitespace())
 * 				.filter(new MinimumLenght(3))
 * 				.build();
 * }
 * 
 * 
* * By chaining predicates more complicated filters can be build. * *
 * 
 * {@code
 * 		Set commonWords = ...;
 * 		
 * 				with(new CosineSimilarity())
 * 				.simplify(new Case.Lower())
 * 				.simplify(new NonWordCharacter())
 * 				.tokenize(new Whitespace())
 * 				.filter(Predicates.not(Predicates.in(commonWords)))
 * 				.build();
 * }
 * 
 * 
* * *

Caching

* * Simplification and tokenization can be complex and expensive operations. When * comparing one string against a collection of strings these two operations are * done repeatedly for a single string - a common use case when searching for a * match. With a simple caching mechanism this overhead can be reduced. * * *
 * 
 * {@code
 * 				with(new CosineSimilarity())
 * 				.simplify(new Case.Lower())
 * 				.simplifierCache()
 * 				.tokenize(new QGram(2))
 * 				.tokenizerCache()
 * 				.build();
 * }
 * 
 * 
* * When a cache is set it applies to the whole simplification or tokenization * chain. The default cache has a size of two for use with * `StringMetrics.compare(StringMetric, String, List)` and friends. * * * @See {@link StringMetrics} * @See {@link Predicates} * * * */ public class StringMetricBuilder { /** * Starts building a metric with a string metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CompositeStringMetricBuilder with(StringMetric metric) { return new CompositeStringMetricBuilder(metric); } /** * Starts building a metric with a list metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CompositeListMetricBuilder with(ListMetric metric) { return new CompositeListMetricBuilder(metric); } /** * Starts building a metric with a set metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CompositeSetMetricBuilder with(SetMetric metric) { return new CompositeSetMetricBuilder(metric); } @SuppressWarnings("javadoc") public interface BuildStep { /** * Builds a metric with the given steps. * * @return a metric */ StringMetric build(); } @SuppressWarnings("javadoc") public interface StringMetricSimplifierStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ StringMetricSimplifierCacheStep simplify(Simplifier simplifier); /** * Builds a metric with the given simplifier. * * @return a metric */ StringMetric build(); } @SuppressWarnings("javadoc") public interface StringMetricSimplifierCacheStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ StringMetricSimplifierCacheStep simplify(Simplifier simplifier); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. 
The cache will be provided with a * simplifier through * {@link SimplifyingSimplifier#setSimplifier(Simplifier)}. * * @param cache * a cache to add * @return this for fluent chaining */ BuildStep simplifierCache(SimplifyingSimplifier cache); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. * * @param initialCapacity * initial cache size * @param maximumSize * maximum cache size * * @return this for fluent chaining */ BuildStep simplifierCache(int initialCapacity, int maximumSize); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. The cache will have a size of 2. * * @return this for fluent chaining */ BuildStep simplifierCache(); /** * Builds a metric with the given simplifier. * * @return a metric */ StringMetric build(); } @SuppressWarnings("javadoc") public interface CollectionMetricStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ CollectionMetricSimplifierStep simplify(Simplifier simplifier); /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ CollectionMetricTokenizerCacheStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricSimplifierStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ CollectionMetricSimplifierStep simplify(Simplifier simplifier); /** * Sets a cache for tokenization chain. The cache will store the result * of all tokenization steps. The cache will be provided with a * tokenizer through {@link TokenizingTokenizer#setTokenizer(Tokenizer)} * . . * * @param cache * a cache to add * @return this for fluent chaining */ CollectionMetricTokenizerStep simplifierCache( SimplifyingSimplifier cache); /** * Sets a cache for simplification chain. 
The cache will store the * result of all simplification steps. * * @param initialCapacity * initial cache size * @param maximumSize * maximum cache size * * @return this for fluent chaining */ CollectionMetricTokenizerStep simplifierCache(int initialCapacity, int maximumSize); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. The cache will have a size of 2. * * @return this for fluent chaining */ CollectionMetricTokenizerStep simplifierCache(); /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return this for fluent chaining */ CollectionMetricTokenizerCacheStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricTokenizerStep { /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ CollectionMetricTokenizerCacheStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricTokenizerCacheStep { /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ CollectionMetricTokenizerCacheStep tokenize(Tokenizer tokenizer); /** * Adds a filter step to the metric. All tokens that match the predicate * are kept. * * @param predicate * a predicate for tokens to keep * @return this for fluent chaining */ CollectionMetricTokenizerCacheStep filter(Predicate predicate); /** * Sets a cache for tokenization chain. The cache will store the result * of all tokenization steps. The cache will be provided with a * tokenizer through {@link TokenizingTokenizer#setTokenizer(Tokenizer)} * . . * * @param cache * a cache to add * @return this for fluent chaining */ BuildStep tokenizerCache(TokenizingTokenizer cache); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. 
* * @param initialCapacity * initial cache size * @param maximumSize * maximum cache size * * @return this for fluent chaining */ BuildStep tokenizerCache(int initialCapacity, int maximumSize); /** * Sets a cache for simplification chain. The cache will store the * result of all simplification steps. The cache will have a size of 2. * * @return this for fluent chaining */ BuildStep tokenizerCache(); /** * Builds a string metric that will use the given simplification, * tokenization and filtering steps. * * @return a string metric. */ StringMetric build(); } @SuppressWarnings("javadoc") public static final class CompositeStringMetricBuilder implements StringMetricSimplifierStep, StringMetricSimplifierCacheStep, BuildStep { private final StringMetric metric; private static final int CACHE_SIZE = 2; private final List simplifiers = new ArrayList<>(); private SimplifyingSimplifier cache; CompositeStringMetricBuilder(StringMetric metric) { checkNotNull(metric); this.metric = metric; } @Override public StringMetric build() { if (simplifiers.isEmpty()) { return metric; } Simplifier simplifier; if (simplifiers.size() == 1) { simplifier = simplifiers.get(0); } else { simplifier = new CompositeSimplifier(simplifiers); } if (cache != null) { cache.setSimplifier(simplifier); return new CompositeStringMetric(metric, cache); } else { return new CompositeStringMetric(metric, simplifier); } } @Override public BuildStep simplifierCache(SimplifyingSimplifier cache) { checkNotNull(cache); this.cache = cache; return this; } @Override public BuildStep simplifierCache(int initialCapacity, int maximumSize) { return simplifierCache(new CachingSimplifier(initialCapacity, maximumSize)); } @Override public BuildStep simplifierCache() { return simplifierCache(CACHE_SIZE, CACHE_SIZE); } @Override public StringMetricSimplifierCacheStep simplify(Simplifier simplifier) { checkNotNull(simplifier); this.simplifiers.add(simplifier); return this; } } @SuppressWarnings("javadoc") public static 
abstract class CompositeCollectionMetricBuilder> implements BuildStep, CollectionMetricStep, CollectionMetricSimplifierStep, CollectionMetricTokenizerStep, CollectionMetricTokenizerCacheStep { private final Metric metric; private static final int CACHE_SIZE = 2; private final List simplifiers = new ArrayList<>(); private final List tokenizers = new ArrayList<>(); private SimplifyingSimplifier stringCache; private TokenizingTokenizer tokenCache; CompositeCollectionMetricBuilder(Metric metric) { checkNotNull(metric); this.metric = metric; } public StringMetric build() { Simplifier simplifier; if (simplifiers.isEmpty()) { simplifier = new PassThroughSimplifier(); } else if (simplifiers.size() == 1) { simplifier = simplifiers.get(0); } else { simplifier = new CompositeSimplifier(simplifiers); } if (stringCache != null) { stringCache.setSimplifier(simplifier); simplifier = stringCache; } Tokenizer tokenizer; if (tokenizers.size() == 1) { tokenizer = tokenizers.get(0); } else { tokenizer = new CompositeTokenizer(tokenizers); } if (tokenCache != null) { tokenCache.setTokenizer(tokenizer); tokenizer = tokenCache; } return build(metric, simplifier, tokenizer); } abstract StringMetric build(Metric metric, Simplifier simplifier, Tokenizer tokenizer); @Override public BuildStep tokenizerCache(TokenizingTokenizer cache) { checkNotNull(cache); this.tokenCache = cache; return this; } @Override public BuildStep tokenizerCache(int initialCapacity, int maximumSize) { return tokenizerCache(new CachingTokenizer(initialCapacity, maximumSize)); } @Override public BuildStep tokenizerCache() { return tokenizerCache(CACHE_SIZE, CACHE_SIZE); } @Override public CollectionMetricTokenizerStep simplifierCache( SimplifyingSimplifier cache) { checkNotNull(cache); this.stringCache = cache; return this; } @Override public CollectionMetricTokenizerStep simplifierCache( int initialCapacity, int maximumSize) { return simplifierCache(new CachingSimplifier(initialCapacity, maximumSize)); } @Override 
public CollectionMetricTokenizerStep simplifierCache() { return simplifierCache(CACHE_SIZE, CACHE_SIZE); } @Override public CollectionMetricSimplifierStep simplify(Simplifier simplifier) { checkNotNull(simplifier); simplifiers.add(simplifier); return this; } @Override public CollectionMetricTokenizerCacheStep tokenize(Tokenizer tokenizer) { checkNotNull(tokenizer); tokenizers.add(tokenizer); return this; } @Override public CollectionMetricTokenizerCacheStep filter( Predicate predicate) { checkNotNull(predicate); Tokenizer tokenizer; if (tokenizers.size() == 1) { tokenizer = tokenizers.get(0); } else { tokenizer = new CompositeTokenizer(new ArrayList<>(tokenizers)); } tokenizers.clear(); FilteringTokenizer filter = new FilteringTokenizer(tokenizer, predicate); tokenizers.add(filter); return this; } } @SuppressWarnings("javadoc") public static final class CompositeListMetricBuilder extends CompositeCollectionMetricBuilder> { CompositeListMetricBuilder(Metric> metric) { super(metric); } @Override StringMetric build(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return new CompositeListMetric(metric, simplifier, tokenizer); } } @SuppressWarnings("javadoc") public static final class CompositeSetMetricBuilder extends CompositeCollectionMetricBuilder> { CompositeSetMetricBuilder(Metric> metric) { super(metric); } @Override StringMetric build(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return new CompositeSetMetric(metric, simplifier, tokenizer); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy