All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.simmetrics.utils.CompositeTokenizer Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
package org.simmetrics.utils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.simmetrics.tokenizers.Tokenizer;

import com.google.common.base.Joiner;

/**
 * Tokenizer composed of multiple tokenizers. Applies the tokenizers in their
 * iteration order.
 * 
 * 

* This class is immutable and thread-safe if its components are. * */ public final class CompositeTokenizer implements Tokenizer { private final Iterable tokenizers; /** * Constructs a new composite tokenizer. * * @param tokenizers * an iteration of tokenizers */ public CompositeTokenizer(Iterable tokenizers) { this.tokenizers = tokenizers; } @Override public List tokenizeToList(final String input) { List tokens = new ArrayList<>(1); tokens.add(input); List newTokens = new ArrayList<>(input.length()); for (Tokenizer t : tokenizers) { for (String token : tokens) { newTokens.addAll(t.tokenizeToList(token)); } tokens = newTokens; newTokens = new ArrayList<>(input.length()); } return tokens; } @Override public Set tokenizeToSet(final String input) { // tokenizeToArray is not reused here on purpose. Removing duplicate // words early means these don't have to be tokenized multiple // times. Increases performance. Set tokens = new HashSet<>(1); tokens.add(input); Set newTokens = new HashSet<>(input.length()); for (Tokenizer t : tokenizers) { for (String token : tokens) { newTokens.addAll(t.tokenizeToList(token)); } tokens = newTokens; newTokens = new HashSet<>(input.length()); } return tokens; } @Override public String toString() { return Joiner.on(" -> ").join(tokenizers); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy