All downloads are free. Search and download functionality uses the official Maven repository.

org.simmetrics.builders.StringDistanceBuilder Maven / Gradle / Ivy

The newest version!
/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2016 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

package org.simmetrics.builders;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.builders.StringDistances.create;
import static org.simmetrics.builders.StringDistances.createForListDistance;
import static org.simmetrics.builders.StringDistances.createForMultisetDistance;
import static org.simmetrics.builders.StringDistances.createForSetDistance;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import org.simmetrics.ListDistance;
import org.simmetrics.Distance;
import org.simmetrics.MultisetDistance;
import org.simmetrics.SetDistance;
import org.simmetrics.StringDistance;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Simplifiers;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.tokenizers.Tokenizers;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.cache.Cache;
import com.google.common.collect.Multiset;

/**
 * Convenience tool to build string distance metrics. Any class implementing
 * {@link StringDistance}, {@link ListDistance}, {@link SetDistance} or
 * {@link MultisetDistance} can be used to build a string distance metric.
 * Supports the addition of simplification, tokenization, token-filtering,
 * token-transformation and caching to a distance.
 * 

* The created distance metrics are immutable and thread-safe provided all their components * are also immutable and thread-safe. *

* For usage examples see the simmetrics-example module. */ public final class StringDistanceBuilder { private StringDistanceBuilder() { // Utility class } /** * Starts building a distance metric with a string distance metric. * * @param distance * the distance to use as a base * @return a builder for fluent chaining */ public static StringDistanceInitialSimplifierStep with(StringDistance distance) { return new CompositeStringDistanceBuilder(distance); } /** * Starts building a distance with a list distance. * * @param distance * the distance to use as a base * @return a builder for fluent chaining */ public static CollectionDistanceInitialSimplifierStep> with(ListDistance distance) { return new CompositeListDistanceBuilder(distance); } /** * Starts building a distance with a set distance. * * @param distance * the distance to use as a base * @return a builder for fluent chaining */ public static CollectionDistanceInitialSimplifierStep> with(SetDistance distance) { return new CompositeSetDistanceBuilder(distance); } /** * Starts building a distance with a multiset distance. * * @param distance * the distance to use as a base * @return a builder for fluent chaining */ public static CollectionDistanceInitialSimplifierStep> with(MultisetDistance distance) { return new CompositeMultisetDistanceBuilder(distance); } @SuppressWarnings("javadoc") public interface BuildStep { /** * Builds a distance with the given steps. * * @return a distance */ StringDistance build(); } @SuppressWarnings("javadoc") public interface StringDistanceInitialSimplifierStep extends BuildStep { /** * Adds a simplifier to the distance. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ StringDistanceSimplifierStep simplify(Simplifier simplifier); /** * Builds a distance with the given simplifier. 
* * @return a distance */ @Override StringDistance build(); } @SuppressWarnings("javadoc") public interface StringDistanceSimplifierStep extends StringDistanceInitialSimplifierStep { /** * Adds a simplifier to the distance. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ @Override StringDistanceSimplifierStep simplify(Simplifier simplifier); /** * Sets a cache for simplification chain. The cache will store the * result of all previous simplification steps. * * @param cache * a cache to add * @return this for fluent chaining */ BuildStep cacheStrings(Cache cache); /** * Builds a distance with the given simplifier. * * @return a distance */ @Override StringDistance build(); } @SuppressWarnings("javadoc") public interface CollectionDistanceInitialSimplifierStep> { /** * Adds a simplifier to the distance. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ CollectionDistanceSimplifierStep simplify(Simplifier simplifier); /** * Adds a tokenization step to the distance. * * @param tokenizer * a tokenizer to add * @return this for fluent chaining */ CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionDistanceSimplifierStep> extends CollectionDistanceInitialSimplifierStep { /** * Adds a simplifier to the distance. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ @Override CollectionDistanceSimplifierStep simplify(Simplifier simplifier); /** * Sets a cache for simplification chain. The cache will store the * result of all previous simplification steps. * * @param cache * a cache to add * @return this for fluent chaining */ CollectionDistanceInitialTokenizerStep cacheStrings(Cache cache); /** * Adds a tokenization step to the distance. 
* * @param tokenizer * a tokenizer to add * @return this for fluent chaining */ @Override CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionDistanceInitialTokenizerStep> { /** * Adds a tokenization step to the distance. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionDistanceTokenizerStep> extends BuildStep, CollectionDistanceInitialTokenizerStep { /** * Adds a tokenization step to the distance. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ @Override CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer); /** * Adds a filter step to the distance. All tokens that match the * predicate are kept. * * @param predicate * a predicate for tokens to keep * @return this for fluent chaining */ CollectionDistanceTokenizerStep filter(Predicate predicate); /** * Adds a transform step to the distance. All tokens are transformed by * the function. The function may not return null. * * @param function * a function to transform tokens * @return this for fluent chaining */ CollectionDistanceTokenizerStep transform(Function function); /** * Sets a cache for tokenization chain. The cache will store the result * of all previous tokenization steps. * * @param cache * a cache to add * @return this for fluent chaining * */ BuildStep cacheTokens(Cache cache); /** * Builds a string distance metric that will use the given * simplification, tokenization and filtering steps. 
* * @return a string distance metric */ @Override StringDistance build(); } private static final class CompositeStringDistanceBuilder implements StringDistanceSimplifierStep { private final Distance distance; private final List simplifiers = new ArrayList<>(); CompositeStringDistanceBuilder(Distance distance) { checkNotNull(distance); this.distance = distance; } @Override public StringDistance build() { if (simplifiers.isEmpty()) { return create(distance); } return create(distance, chainSimplifiers()); } private Simplifier chainSimplifiers() { final Simplifier simplifier = Simplifiers.chain(simplifiers); simplifiers.clear(); return simplifier; } @Override public BuildStep cacheStrings(Cache cache) { checkNotNull(cache); CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers()); this.simplifiers.add(cachingSimplifier); return this; } @Override public StringDistanceSimplifierStep simplify(Simplifier simplifier) { checkNotNull(simplifier); this.simplifiers.add(simplifier); return this; } } private static abstract class CompositeCollectionDistanceBuilder> implements CollectionDistanceSimplifierStep, CollectionDistanceTokenizerStep { private final Distance distance; private final List simplifiers = new ArrayList<>(); private final List tokenizers = new ArrayList<>(); CompositeCollectionDistanceBuilder(Distance distance) { checkNotNull(distance); this.distance = distance; } @Override public final StringDistance build() { Tokenizer tokenizer = chainTokenizers(); if (simplifiers.isEmpty()) { return build(distance, tokenizer); } return build(distance, chainSimplifiers(), tokenizer); } abstract StringDistance build(Distance distance, Simplifier simplifier, Tokenizer tokenizer); abstract StringDistance build(Distance distance, Tokenizer tokenizer); @Override public final BuildStep cacheTokens(Cache cache) { checkNotNull(cache); tokenizers.add(createCachingTokenizer(cache, chainTokenizers())); return this; } protected abstract Tokenizer 
createCachingTokenizer(Cache cache, Tokenizer tokenizer); @Override public final CollectionDistanceInitialTokenizerStep cacheStrings(Cache cache) { checkNotNull(cache); CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers()); this.simplifiers.add(cachingSimplifier); return this; } @Override public final CollectionDistanceSimplifierStep simplify(Simplifier simplifier) { checkNotNull(simplifier); simplifiers.add(simplifier); return this; } @Override public final CollectionDistanceTokenizerStep tokenize(Tokenizer tokenizer) { checkNotNull(tokenizer); tokenizers.add(tokenizer); return this; } @Override public final CollectionDistanceTokenizerStep filter(Predicate predicate) { checkNotNull(predicate); final Tokenizer filter = Tokenizers.filter(chainTokenizers(), predicate); tokenizers.add(filter); return this; } @Override public final CollectionDistanceTokenizerStep transform(Function function) { checkNotNull(function); final Tokenizer transform = Tokenizers.transform(chainTokenizers(), function); tokenizers.add(transform); return this; } private Tokenizer chainTokenizers() { final Tokenizer tokenizer = Tokenizers.chain(tokenizers); tokenizers.clear(); return tokenizer; } private Simplifier chainSimplifiers() { final Simplifier simplifier = Simplifiers.chain(simplifiers); simplifiers.clear(); return simplifier; } } private static final class CompositeListDistanceBuilder extends CompositeCollectionDistanceBuilder> { CompositeListDistanceBuilder(Distance> distance) { super(distance); } @Override StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) { return createForListDistance(distance, simplifier, tokenizer); } @Override StringDistance build(Distance> distance, Tokenizer tokenizer) { return createForListDistance(distance, tokenizer); } @Override protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) { return new CachingListTokenizer(cache, tokenizer); } } private static final class 
CompositeSetDistanceBuilder extends CompositeCollectionDistanceBuilder> { CompositeSetDistanceBuilder(Distance> distance) { super(distance); } @Override StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) { return createForSetDistance(distance, simplifier, tokenizer); } @Override StringDistance build(Distance> distance, Tokenizer tokenizer) { return createForSetDistance(distance, tokenizer); } @Override protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) { return new CachingSetTokenizer(cache, tokenizer); } } private static final class CompositeMultisetDistanceBuilder extends CompositeCollectionDistanceBuilder> { CompositeMultisetDistanceBuilder(Distance> distance) { super(distance); } @Override StringDistance build(Distance> distance, Simplifier simplifier, Tokenizer tokenizer) { return createForMultisetDistance(distance, simplifier, tokenizer); } @Override StringDistance build(Distance> distance, Tokenizer tokenizer) { return createForMultisetDistance(distance, tokenizer); } @Override protected Tokenizer createCachingTokenizer(Cache> cache, Tokenizer tokenizer) { return new CachingMultisetTokenizer(cache, tokenizer); } } static final class CachingSimplifier implements Simplifier { private final Cache cache; final Simplifier simplifier; CachingSimplifier(Cache cache, Simplifier simplifier) { this.cache = cache; this.simplifier = simplifier; } @Override public String simplify(final String input) { try { return cache.get(input, new Callable() { @Override public String call() throws Exception { return simplifier.simplify(input); } }); } catch (ExecutionException e) { // Can't happen. 
Simplifier may not throw checked exceptions throw new IllegalStateException(e); } } @Override public String toString() { return "CachingSimplifier [" + simplifier + "]"; } } static final class CachingMultisetTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingMultisetTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { throw new UnsupportedOperationException(); } @Override public Set tokenizeToSet(final String input) { throw new UnsupportedOperationException(); } @Override public Multiset tokenizeToMultiset(final String input) { try { return cache.get(input, new Callable>() { @Override public Multiset call() { return tokenizer.tokenizeToMultiset(input); } }); } catch (ExecutionException e) { // Can't happen. Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public String toString() { return "CachingMultisetTokenizer [" + cache + ", " + tokenizer + "]"; } } static final class CachingSetTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingSetTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { throw new UnsupportedOperationException(); } @Override public Set tokenizeToSet(final String input) { try { return cache.get(input, new Callable>() { @Override public Set call() { return tokenizer.tokenizeToSet(input); } }); } catch (ExecutionException e) { // Can't happen. 
Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public Multiset tokenizeToMultiset(final String input) { throw new UnsupportedOperationException(); } @Override public String toString() { return "CachingSetTokenizer [" + cache + ", " + tokenizer + "]"; } } static final class CachingListTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingListTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { try { return cache.get(input, new Callable>() { @Override public List call() { return tokenizer.tokenizeToList(input); } }); } catch (ExecutionException e) { // Can't happen. Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public Set tokenizeToSet(final String input) { throw new UnsupportedOperationException(); } @Override public Multiset tokenizeToMultiset(final String input) { throw new UnsupportedOperationException(); } @Override public String toString() { return "CachingListTokenizer [" + cache + ", " + tokenizer + "]"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy