All downloads are free. Search and download functionality uses the official Maven repository.

org.simmetrics.builders.StringMetricBuilder Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2016 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

package org.simmetrics.builders;

import static com.google.common.base.Preconditions.checkNotNull;
import static org.simmetrics.builders.StringMetrics.create;
import static org.simmetrics.builders.StringMetrics.createForListMetric;
import static org.simmetrics.builders.StringMetrics.createForMultisetMetric;
import static org.simmetrics.builders.StringMetrics.createForSetMetric;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import org.simmetrics.ListMetric;
import org.simmetrics.Metric;
import org.simmetrics.MultisetMetric;
import org.simmetrics.SetMetric;
import org.simmetrics.StringMetric;
import org.simmetrics.simplifiers.Simplifier;
import org.simmetrics.simplifiers.Simplifiers;
import org.simmetrics.tokenizers.Tokenizer;
import org.simmetrics.tokenizers.Tokenizers;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.cache.Cache;
import com.google.common.collect.Multiset;

/**
 * Convenience tool to build string similarity metrics. Any class implementing
 * {@link StringMetric}, {@link ListMetric}, {@link SetMetric} or
 * {@link MultisetMetric} can be used to build a string similarity metric. Supports the
 * addition of simplification, tokenization, token-filtering,
 * token-transformation and caching to a metric.
 * 

* For usage examples see the simmetrics-example module. */ public final class StringMetricBuilder { private StringMetricBuilder() { // Utility class } /** * Starts building a metric with a string similarity metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static StringMetricInitialSimplifierStep with(StringMetric metric) { return new CompositeStringMetricBuilder(metric); } /** * Starts building a metric with a list metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CollectionMetricInitialSimplifierStep> with( ListMetric metric) { return new CompositeListMetricBuilder(metric); } /** * Starts building a metric with a set metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CollectionMetricInitialSimplifierStep> with( SetMetric metric) { return new CompositeSetMetricBuilder(metric); } /** * Starts building a metric with a multiset metric. * * @param metric * the metric to use as a base * @return a builder for fluent chaining */ public static CollectionMetricInitialSimplifierStep> with( MultisetMetric metric) { return new CompositeMultisetMetricBuilder(metric); } @SuppressWarnings("javadoc") public interface BuildStep { /** * Builds a metric with the given steps. * * @return a metric */ StringMetric build(); } @SuppressWarnings("javadoc") public interface StringMetricInitialSimplifierStep extends BuildStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ StringMetricSimplifierStep simplify(Simplifier simplifier); /** * Builds a metric with the given simplifier. * * @return a metric */ @Override StringMetric build(); } @SuppressWarnings("javadoc") public interface StringMetricSimplifierStep extends StringMetricInitialSimplifierStep { /** * Adds a simplifier to the metric. 
* * @param simplifier * a simplifier to add * @return this for fluent chaining */ @Override StringMetricSimplifierStep simplify(Simplifier simplifier); /** * Sets a cache for simplification chain. The cache will store the * result of all previous simplification steps. * * @param cache * a cache to add * @return this for fluent chaining */ BuildStep cacheStrings(Cache cache); /** * Builds a metric with the given simplifier. * * @return a metric */ @Override StringMetric build(); } @SuppressWarnings("javadoc") public interface CollectionMetricInitialSimplifierStep> { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ CollectionMetricSimplifierStep simplify(Simplifier simplifier); /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return this for fluent chaining */ CollectionMetricTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricSimplifierStep> extends CollectionMetricInitialSimplifierStep { /** * Adds a simplifier to the metric. * * @param simplifier * a simplifier to add * @return this for fluent chaining */ @Override CollectionMetricSimplifierStep simplify(Simplifier simplifier); /** * Sets a cache for simplification chain. The cache will store the * result of all previous simplification steps. * * @param cache * a cache to add * @return this for fluent chaining */ CollectionMetricInitialTokenizerStep cacheStrings( Cache cache); /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return this for fluent chaining */ @Override CollectionMetricTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricInitialTokenizerStep> { /** * Adds a tokenization step to the metric. 
* * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ CollectionMetricTokenizerStep tokenize(Tokenizer tokenizer); } @SuppressWarnings("javadoc") public interface CollectionMetricTokenizerStep> extends BuildStep, CollectionMetricInitialTokenizerStep { /** * Adds a tokenization step to the metric. * * @param tokenizer * a tokenizer to add * @return a builder for fluent chaining */ @Override CollectionMetricTokenizerStep tokenize(Tokenizer tokenizer); /** * Adds a filter step to the metric. All tokens that match the predicate * are kept. * * @param predicate * a predicate for tokens to keep * @return this for fluent chaining */ CollectionMetricTokenizerStep filter(Predicate predicate); /** * Adds a transform step to the metric. All tokens are transformed by * the function. The function may not return null. * * @param function * a function to transform tokens * @return this for fluent chaining */ CollectionMetricTokenizerStep transform( Function function); /** * Sets a cache for tokenization chain. The cache will store the result * of all previous tokenization steps. * * @param cache * a cache to add * @return this for fluent chaining * */ BuildStep cacheTokens(Cache cache); /** * Builds a similarity metric that will use the given simplification, * tokenization and filtering steps. * * @return a string similarity metric. 
*/ @Override StringMetric build(); } private static final class CompositeStringMetricBuilder implements StringMetricSimplifierStep { private final Metric metric; private final List simplifiers = new ArrayList<>(); CompositeStringMetricBuilder(Metric metric) { checkNotNull(metric); this.metric = metric; } @Override public StringMetric build() { if (simplifiers.isEmpty()) { return create(metric); } return create(metric, chainSimplifiers()); } private Simplifier chainSimplifiers() { final Simplifier simplifier = Simplifiers.chain(simplifiers); simplifiers.clear(); return simplifier; } @Override public BuildStep cacheStrings(Cache cache) { checkNotNull(cache); CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers()); this.simplifiers.add(cachingSimplifier); return this; } @Override public StringMetricSimplifierStep simplify(Simplifier simplifier) { checkNotNull(simplifier); this.simplifiers.add(simplifier); return this; } } private static abstract class CompositeCollectionMetricBuilder> implements CollectionMetricSimplifierStep, CollectionMetricTokenizerStep { private final Metric metric; private final List simplifiers = new ArrayList<>(); private final List tokenizers = new ArrayList<>(); CompositeCollectionMetricBuilder(Metric metric) { checkNotNull(metric); this.metric = metric; } @Override public final StringMetric build() { Tokenizer tokenizer = chainTokenizers(); if (simplifiers.isEmpty()) { return build(metric, tokenizer); } return build(metric, chainSimplifiers(), tokenizer); } abstract StringMetric build(Metric metric, Simplifier simplifier, Tokenizer tokenizer); abstract StringMetric build(Metric metric, Tokenizer tokenizer); @Override public final BuildStep cacheTokens(Cache cache) { checkNotNull(cache); tokenizers.add(createCachingTokenizer(cache, chainTokenizers())); return this; } protected abstract Tokenizer createCachingTokenizer( Cache cache, Tokenizer tokenizer); @Override public final CollectionMetricInitialTokenizerStep 
cacheStrings( Cache cache) { checkNotNull(cache); CachingSimplifier cachingSimplifier = new CachingSimplifier(cache, chainSimplifiers()); this.simplifiers.add(cachingSimplifier); return this; } @Override public final CollectionMetricSimplifierStep simplify( Simplifier simplifier) { checkNotNull(simplifier); simplifiers.add(simplifier); return this; } @Override public final CollectionMetricTokenizerStep tokenize( Tokenizer tokenizer) { checkNotNull(tokenizer); tokenizers.add(tokenizer); return this; } @Override public final CollectionMetricTokenizerStep filter( Predicate predicate) { checkNotNull(predicate); final Tokenizer filter = Tokenizers.filter(chainTokenizers(), predicate); tokenizers.add(filter); return this; } @Override public final CollectionMetricTokenizerStep transform( Function function) { checkNotNull(function); final Tokenizer transform = Tokenizers.transform(chainTokenizers(), function); tokenizers.add(transform); return this; } private Tokenizer chainTokenizers() { final Tokenizer tokenizer = Tokenizers.chain(tokenizers); tokenizers.clear(); return tokenizer; } private Simplifier chainSimplifiers() { final Simplifier simplifier = Simplifiers.chain(simplifiers); simplifiers.clear(); return simplifier; } } private static final class CompositeListMetricBuilder extends CompositeCollectionMetricBuilder> { CompositeListMetricBuilder(Metric> metric) { super(metric); } @Override StringMetric build(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return createForListMetric(metric, simplifier, tokenizer); } @Override StringMetric build(Metric> metric, Tokenizer tokenizer) { return createForListMetric(metric, tokenizer); } @Override protected Tokenizer createCachingTokenizer( Cache> cache, Tokenizer tokenizer) { return new CachingListTokenizer(cache, tokenizer); } } private static final class CompositeSetMetricBuilder extends CompositeCollectionMetricBuilder> { CompositeSetMetricBuilder(Metric> metric) { super(metric); } @Override StringMetric 
build(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return createForSetMetric(metric, simplifier, tokenizer); } @Override StringMetric build(Metric> metric, Tokenizer tokenizer) { return createForSetMetric(metric, tokenizer); } @Override protected Tokenizer createCachingTokenizer( Cache> cache, Tokenizer tokenizer) { return new CachingSetTokenizer(cache, tokenizer); } } private static final class CompositeMultisetMetricBuilder extends CompositeCollectionMetricBuilder> { CompositeMultisetMetricBuilder(Metric> metric) { super(metric); } @Override StringMetric build(Metric> metric, Simplifier simplifier, Tokenizer tokenizer) { return createForMultisetMetric(metric, simplifier, tokenizer); } @Override StringMetric build(Metric> metric, Tokenizer tokenizer) { return createForMultisetMetric(metric, tokenizer); } @Override protected Tokenizer createCachingTokenizer( Cache> cache, Tokenizer tokenizer) { return new CachingMultisetTokenizer(cache, tokenizer); } } static final class CachingSimplifier implements Simplifier { private final Cache cache; final Simplifier simplifier; CachingSimplifier(Cache cache, Simplifier simplifier) { this.cache = cache; this.simplifier = simplifier; } @Override public String simplify(final String input) { try { return cache.get(input, new Callable() { @Override public String call() throws Exception { return simplifier.simplify(input); } }); } catch (ExecutionException e) { // Can't happen. 
Simplifier may not throw checked exceptions throw new IllegalStateException(e); } } @Override public String toString() { return "CachingSimplifier [" + simplifier + "]"; } } static final class CachingMultisetTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingMultisetTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { throw new UnsupportedOperationException(); } @Override public Set tokenizeToSet(final String input) { throw new UnsupportedOperationException(); } @Override public Multiset tokenizeToMultiset(final String input) { try { return cache.get(input, new Callable>() { @Override public Multiset call() { return tokenizer.tokenizeToMultiset(input); } }); } catch (ExecutionException e) { // Can't happen. Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public String toString() { return "CachingMultisetTokenizer [" + cache + ", " + tokenizer + "]"; } } static final class CachingSetTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingSetTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { throw new UnsupportedOperationException(); } @Override public Set tokenizeToSet(final String input) { try { return cache.get(input, new Callable>() { @Override public Set call() { return tokenizer.tokenizeToSet(input); } }); } catch (ExecutionException e) { // Can't happen. 
Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public Multiset tokenizeToMultiset(final String input) { throw new UnsupportedOperationException(); } @Override public String toString() { return "CachingSetTokenizer [" + cache + ", " + tokenizer + "]"; } } static final class CachingListTokenizer implements Tokenizer { private final Cache> cache; final Tokenizer tokenizer; CachingListTokenizer(Cache> cache, Tokenizer tokenizer) { this.cache = cache; this.tokenizer = tokenizer; } @Override public List tokenizeToList(final String input) { try { return cache.get(input, new Callable>() { @Override public List call() { return tokenizer.tokenizeToList(input); } }); } catch (ExecutionException e) { // Can't happen. Tokenizer may not throw checked exceptions throw new IllegalStateException(e); } } @Override public Set tokenizeToSet(final String input) { throw new UnsupportedOperationException(); } @Override public Multiset tokenizeToMultiset(final String input) { throw new UnsupportedOperationException(); } @Override public String toString() { return "CachingListTokenizer [" + cache + ", " + tokenizer + "]"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy