// org.simmetrics.builders.StringDistanceBuilder (Maven / Gradle / Ivy)
/*
* #%L
* Simmetrics Core
* %%
* Copyright (C) 2014 - 2016 Simmetrics Authors
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package org.simmetrics.builders;
import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.builders.StringDistances.create;
import static org.simmetrics.builders.StringDistances.createForListDistance;
import static org.simmetrics.builders.StringDistances.createForMultisetDistance;
import static org.simmetrics.builders.StringDistances.createForSetDistance;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import org.simmetrics.ListDistance;
import org.simmetrics.Distance;
import org.simmetrics.MultisetDistance;
import org.simmetrics.SetDistance;
import org.simmetrics.StringDistance;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Simplifiers;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.tokenizers.Tokenizers;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.cache.Cache;
import com.google.common.collect.Multiset;
/**
 * Convenience tool to build string distance metrics. Any class implementing
 * {@link StringDistance}, {@link ListDistance}, {@link SetDistance} or
 * {@link MultisetDistance} can be used to build a string distance metric.
 * Supports the addition of simplification, tokenization, token-filtering,
 * token-transformation and caching to a distance.
 * <p>
 * The created distance metrics are immutable and thread-safe provided all
 * their components are also immutable and thread-safe.
 * <p>
 * For usage examples see the simmetrics-example module.
 */
public final class StringDistanceBuilder {
/**
 * Private so that this class cannot be instantiated from outside; the
 * entry points are the static {@code with} methods.
 */
private StringDistanceBuilder() {
// Utility class
}
/**
 * Begins a fluent build chain that uses a plain string distance as its
 * base.
 *
 * @param distance
 *            the string distance the chain is built around
 * @return the first step of the chain, from which simplifiers can be added
 *         or the distance can be built directly
 */
public static StringDistanceInitialSimplifierStep with(StringDistance distance) {
    final StringDistanceInitialSimplifierStep firstStep = new CompositeStringDistanceBuilder(distance);
    return firstStep;
}
/**
* Starts building a distance with a list distance.
*
* @param distance
* the distance to use as a base
* @return a builder for fluent chaining
*/
public static CollectionDistanceInitialSimplifierStep> with(ListDistance distance) {
return new CompositeListDistanceBuilder(distance);
}
/**
* Starts building a distance with a set distance.
*
* @param distance
* the distance to use as a base
* @return a builder for fluent chaining
*/
public static CollectionDistanceInitialSimplifierStep> with(SetDistance distance) {
return new CompositeSetDistanceBuilder(distance);
}
/**
* Starts building a distance with a multiset distance.
*
* @param distance
* the distance to use as a base
* @return a builder for fluent chaining
*/
public static CollectionDistanceInitialSimplifierStep> with(MultisetDistance distance) {
return new CompositeMultisetDistanceBuilder(distance);
}
/**
 * Terminal step of a builder chain. Every chain in this class ends in a
 * step that is, or extends, this interface.
 */
@SuppressWarnings("javadoc")
public interface BuildStep {
/**
 * Builds a distance from the steps that were configured on the chain so
 * far.
 *
 * @return a distance
 */
StringDistance build();
}
/**
 * First step of a chain that is based on a {@link StringDistance}. From
 * here a simplifier can be added or the distance can be built as is.
 */
@SuppressWarnings("javadoc")
public interface StringDistanceInitialSimplifierStep extends BuildStep {
/**
 * Adds a simplifier to the distance.
 *
 * @param simplifier
 *            a simplifier to add
 * @return the next step, which allows more simplifiers or a string cache
 *         to be added
 */
StringDistanceSimplifierStep simplify(Simplifier simplifier);
/**
 * Builds a distance with the simplifiers added so far, if any.
 *
 * @return a distance
 */
@Override
StringDistance build();
}
/**
 * Step of a {@link StringDistance} based chain that is available once at
 * least one simplifier has been added. More simplifiers can be added, the
 * simplification chain can be cached, or the distance can be built.
 */
@SuppressWarnings("javadoc")
public interface StringDistanceSimplifierStep extends StringDistanceInitialSimplifierStep {

    /**
     * Adds a simplifier to the distance.
     *
     * @param simplifier
     *            a simplifier to add
     * @return this for fluent chaining
     */
    @Override
    StringDistanceSimplifierStep simplify(Simplifier simplifier);

    /**
     * Sets a cache for the simplification chain. The cache will store the
     * result of all previous simplification steps, keyed by the
     * unsimplified input string.
     *
     * @param cache
     *            a cache that maps input strings to simplified strings
     * @return this for fluent chaining
     */
    BuildStep cacheStrings(Cache<String, String> cache);

    /**
     * Builds a distance with the given simplifiers.
     *
     * @return a distance
     */
    @Override
    StringDistance build();
}
@SuppressWarnings("javadoc")
public interface CollectionDistanceInitialSimplifierStep> {
/**
* Adds a simplifier to the distance.
*
* @param simplifier
* a simplifier to add
* @return this for fluent chaining
*/
CollectionDistanceSimplifierStep simplify(Simplifier simplifier);
/**
* Adds a tokenization step to the distance.
*
* @param tokenizer
* a tokenizer to add
* @return this for fluent chaining
*/
CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer);
}
@SuppressWarnings("javadoc")
public interface CollectionDistanceSimplifierStep>
extends CollectionDistanceInitialSimplifierStep {
/**
* Adds a simplifier to the distance.
*
* @param simplifier
* a simplifier to add
* @return this for fluent chaining
*/
@Override
CollectionDistanceSimplifierStep simplify(Simplifier simplifier);
/**
* Sets a cache for simplification chain. The cache will store the
* result of all previous simplification steps.
*
* @param cache
* a cache to add
* @return this for fluent chaining
*/
CollectionDistanceInitialTokenizerStep cacheStrings(Cache cache);
/**
* Adds a tokenization step to the distance.
*
* @param tokenizer
* a tokenizer to add
* @return this for fluent chaining
*/
@Override
CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer);
}
@SuppressWarnings("javadoc")
public interface CollectionDistanceInitialTokenizerStep> {
/**
* Adds a tokenization step to the distance.
*
* @param tokenizer
* a tokenizer to add
* @return a builder for fluent chaining
*/
CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer);
}
@SuppressWarnings("javadoc")
public interface CollectionDistanceTokenizerStep>
extends BuildStep, CollectionDistanceInitialTokenizerStep {
/**
* Adds a tokenization step to the distance.
*
* @param tokenizer
* a tokenizer to add
* @return a builder for fluent chaining
*/
@Override
CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer);
/**
* Adds a filter step to the distance. All tokens that match the
* predicate are kept.
*
* @param predicate
* a predicate for tokens to keep
* @return this for fluent chaining
*/
CollectionDistanceTokenizerStep filter(Predicate predicate);
/**
* Adds a transform step to the distance. All tokens are transformed by
* the function. The function may not return null.
*
* @param function
* a function to transform tokens
* @return this for fluent chaining
*/
CollectionDistanceTokenizerStep transform(Function function);
/**
* Sets a cache for tokenization chain. The cache will store the result
* of all previous tokenization steps.
*
* @param cache
* a cache to add
* @return this for fluent chaining
*
*/
BuildStep cacheTokens(Cache cache);
/**
* Builds a string distance metric that will use the given
* simplification, tokenization and filtering steps.
*
* @return a string distance metric
*/
@Override
StringDistance build();
}
/**
 * Builder behind every step of a {@link StringDistance} based chain. It
 * collects simplifiers and, on {@link #build()}, combines them with the
 * base distance.
 */
private static final class CompositeStringDistanceBuilder implements StringDistanceSimplifierStep {

    private final Distance<String> distance;

    /** Simplifiers in the order in which they were added. */
    private final List<Simplifier> simplifiers = new ArrayList<>();

    /**
     * @param distance
     *            the base distance, must not be null
     */
    CompositeStringDistanceBuilder(Distance<String> distance) {
        checkNotNull(distance);
        this.distance = distance;
    }

    /**
     * Builds the distance. Without simplifiers the base distance is wrapped
     * as is, otherwise the chained simplifiers are applied first.
     *
     * @return a distance
     */
    @Override
    public StringDistance build() {
        if (simplifiers.isEmpty()) {
            return create(distance);
        }
        // Chain a copy instead of draining the list. Draining would make a
        // second build() on the same builder silently return a distance
        // without any simplification.
        return create(distance, Simplifiers.chain(new ArrayList<>(simplifiers)));
    }

    /**
     * Combines all simplifiers added so far into a single simplifier and
     * empties the list, so that the caller can put one replacement (the
     * caching wrapper) in their place.
     */
    private Simplifier chainSimplifiers() {
        final Simplifier simplifier = Simplifiers.chain(simplifiers);
        simplifiers.clear();
        return simplifier;
    }

    /**
     * Replaces the simplifiers added so far by a single caching simplifier
     * that wraps their chain.
     *
     * @param cache
     *            the cache to store simplified strings in, must not be null
     * @return this for fluent chaining
     */
    @Override
    public BuildStep cacheStrings(Cache<String, String> cache) {
        checkNotNull(cache);
        CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers());
        this.simplifiers.add(cachingSimplifier);
        return this;
    }

    /**
     * Appends a simplifier to the chain.
     *
     * @param simplifier
     *            the simplifier to add, must not be null
     * @return this for fluent chaining
     */
    @Override
    public StringDistanceSimplifierStep simplify(Simplifier simplifier) {
        checkNotNull(simplifier);
        this.simplifiers.add(simplifier);
        return this;
    }
}
private static abstract class CompositeCollectionDistanceBuilder>
implements CollectionDistanceSimplifierStep, CollectionDistanceTokenizerStep {
private final Distance distance;
private final List simplifiers = new ArrayList<>();
private final List tokenizers = new ArrayList<>();
CompositeCollectionDistanceBuilder(Distance distance) {
checkNotNull(distance);
this.distance = distance;
}
@Override
public final StringDistance build() {
Tokenizer tokenizer = chainTokenizers();
if (simplifiers.isEmpty()) {
return build(distance, tokenizer);
}
return build(distance, chainSimplifiers(), tokenizer);
}
abstract StringDistance build(Distance distance, Simplifier simplifier, Tokenizer tokenizer);
abstract StringDistance build(Distance distance, Tokenizer tokenizer);
@Override
public final BuildStep cacheTokens(Cache cache) {
checkNotNull(cache);
tokenizers.add(createCachingTokenizer(cache, chainTokenizers()));
return this;
}
protected abstract Tokenizer createCachingTokenizer(Cache cache, Tokenizer tokenizer);
@Override
public final CollectionDistanceInitialTokenizerStep cacheStrings(Cache cache) {
checkNotNull(cache);
CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers());
this.simplifiers.add(cachingSimplifier);
return this;
}
@Override
public final CollectionDistanceSimplifierStep simplify(Simplifier simplifier) {
checkNotNull(simplifier);
simplifiers.add(simplifier);
return this;
}
@Override
public final CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer) {
checkNotNull(tokenizer);
tokenizers.add(tokenizer);
return this;
}
@Override
public final CollectionDistanceTokenizerStep filter(Predicate predicate) {
checkNotNull(predicate);
final Tokenizer filter = Tokenizers.filter(chainTokenizers(), predicate);
tokenizers.add(filter);
return this;
}
@Override
public final CollectionDistanceTokenizerStep transform(Function function) {
checkNotNull(function);
final Tokenizer transform = Tokenizers.transform(chainTokenizers(), function);
tokenizers.add(transform);
return this;
}
private Tokenizer chainTokenizers() {
final Tokenizer tokenizer = Tokenizers.chain(tokenizers);
tokenizers.clear();
return tokenizer;
}
private Simplifier chainSimplifiers() {
final Simplifier simplifier = Simplifiers.chain(simplifiers);
simplifiers.clear();
return simplifier;
}
}
private static final class CompositeListDistanceBuilder extends CompositeCollectionDistanceBuilder> {
CompositeListDistanceBuilder(Distance> distance) {
super(distance);
}
@Override
StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) {
return createForListDistance(distance, simplifier, tokenizer);
}
@Override
StringDistance build(Distance> distance, Tokenizer tokenizer) {
return createForListDistance(distance, tokenizer);
}
@Override
protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) {
return new CachingListTokenizer(cache, tokenizer);
}
}
private static final class CompositeSetDistanceBuilder extends CompositeCollectionDistanceBuilder> {
CompositeSetDistanceBuilder(Distance> distance) {
super(distance);
}
@Override
StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) {
return createForSetDistance(distance, simplifier, tokenizer);
}
@Override
StringDistance build(Distance> distance, Tokenizer tokenizer) {
return createForSetDistance(distance, tokenizer);
}
@Override
protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) {
return new CachingSetTokenizer(cache, tokenizer);
}
}
private static final class CompositeMultisetDistanceBuilder
extends CompositeCollectionDistanceBuilder> {
CompositeMultisetDistanceBuilder(Distance> distance) {
super(distance);
}
@Override
StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) {
return createForMultisetDistance(distance, simplifier, tokenizer);
}
@Override
StringDistance build(Distance> distance, Tokenizer tokenizer) {
return createForMultisetDistance(distance, tokenizer);
}
@Override
protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) {
return new CachingMultisetTokenizer(cache, tokenizer);
}
}
/**
 * Simplifier that looks up the simplified form of a string in a cache and
 * only delegates to the wrapped simplifier when the cache has no entry for
 * that input.
 */
static final class CachingSimplifier implements Simplifier {

    private final Cache<String, String> cache;
    final Simplifier simplifier;

    /**
     * @param cache
     *            cache that maps input strings to their simplified form
     * @param simplifier
     *            simplifier used to compute values missing from the cache
     */
    CachingSimplifier(Cache<String, String> cache, Simplifier simplifier) {
        this.cache = cache;
        this.simplifier = simplifier;
    }

    /**
     * Returns the cached simplification of {@code input}, computing and
     * storing it through the wrapped simplifier when absent.
     *
     * @throws IllegalStateException
     *             if the cache reports an {@link ExecutionException}
     */
    @Override
    public String simplify(final String input) {
        try {
            return cache.get(input, new Callable<String>() {
                @Override
                public String call() throws Exception {
                    return simplifier.simplify(input);
                }
            });
        } catch (ExecutionException e) {
            // Can't happen. Simplifier may not throw checked exceptions
            throw new IllegalStateException(e);
        }
    }

    @Override
    public String toString() {
        return "CachingSimplifier [" + simplifier + "]";
    }
}
static final class CachingMultisetTokenizer implements Tokenizer {
private final Cache> cache;
final Tokenizer tokenizer;
CachingMultisetTokenizer(Cache> cache, Tokenizer tokenizer) {
this.cache = cache;
this.tokenizer = tokenizer;
}
@Override
public List tokenizeToList(final String input) {
throw new UnsupportedOperationException();
}
@Override
public Set tokenizeToSet(final String input) {
throw new UnsupportedOperationException();
}
@Override
public Multiset tokenizeToMultiset(final String input) {
try {
return cache.get(input, new Callable>() {
@Override
public Multiset call() {
return tokenizer.tokenizeToMultiset(input);
}
});
} catch (ExecutionException e) {
// Can't happen. Tokenizer may not throw checked exceptions
throw new IllegalStateException(e);
}
}
@Override
public String toString() {
return "CachingMultisetTokenizer [" + cache + ", " + tokenizer + "]";
}
}
static final class CachingSetTokenizer implements Tokenizer {
private final Cache> cache;
final Tokenizer tokenizer;
CachingSetTokenizer(Cache> cache, Tokenizer tokenizer) {
this.cache = cache;
this.tokenizer = tokenizer;
}
@Override
public List tokenizeToList(final String input) {
throw new UnsupportedOperationException();
}
@Override
public Set tokenizeToSet(final String input) {
try {
return cache.get(input, new Callable>() {
@Override
public Set call() {
return tokenizer.tokenizeToSet(input);
}
});
} catch (ExecutionException e) {
// Can't happen. Tokenizer may not throw checked exceptions
throw new IllegalStateException(e);
}
}
@Override
public Multiset tokenizeToMultiset(final String input) {
throw new UnsupportedOperationException();
}
@Override
public String toString() {
return "CachingSetTokenizer [" + cache + ", " + tokenizer + "]";
}
}
static final class CachingListTokenizer implements Tokenizer {
private final Cache> cache;
final Tokenizer tokenizer;
CachingListTokenizer(Cache> cache, Tokenizer tokenizer) {
this.cache = cache;
this.tokenizer = tokenizer;
}
@Override
public List tokenizeToList(final String input) {
try {
return cache.get(input, new Callable>() {
@Override
public List call() {
return tokenizer.tokenizeToList(input);
}
});
} catch (ExecutionException e) {
// Can't happen. Tokenizer may not throw checked exceptions
throw new IllegalStateException(e);
}
}
@Override
public Set tokenizeToSet(final String input) {
throw new UnsupportedOperationException();
}
@Override
public Multiset tokenizeToMultiset(final String input) {
throw new UnsupportedOperationException();
}
@Override
public String toString() {
return "CachingListTokenizer [" + cache + ", " + tokenizer + "]";
}
}
}