/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2016 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 * #L%
 */
package org.simmetrics.tokenizers;

import static com.google.common.base.Functions.compose;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Predicates.and;
import static com.google.common.base.Strings.repeat;
import static com.google.common.collect.Lists.asList;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Sets.newHashSet;
import static java.util.Arrays.asList;
import static java.util.Arrays.copyOfRange;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import com.google.common.collect.Sets;

/**
 * Constructs simple tokenizers, chains multiple tokenizers into a single
 * tokenizer, or creates tokenizers that apply filters and transforms to
 * tokens.
 * <p>
 * The created tokenizers are immutable and thread-safe provided all their
 * components are also immutable and thread-safe.
 */
public final class Tokenizers {

    /**
     * Returns a tokenizer that splits a string into tokens around the pattern
     * as if calling {@code pattern.split(input, -1)}.
     * 
     * @param pattern
     *            to split the string around
     * @return a pattern tokenizer
     */
    public static Tokenizer pattern(Pattern pattern) {
        return new Split(pattern);
    }

    /**
     * Returns a tokenizer that splits a string into tokens around the pattern
     * as if calling {@code Pattern.compile(regex).split(input, -1)}.
     * 
     * @param regex
     *            to split the string around
     * @return a pattern tokenizer
     */
    public static Tokenizer pattern(String regex) {
        return pattern(Pattern.compile(regex));
    }
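    // Illustrative usage (not part of the original source; the results follow
    // the documented pattern.split(input, -1) semantics, which keep trailing
    // empty tokens):
    //
    //   Tokenizer csv = Tokenizers.pattern(",");
    //   csv.tokenizeToList("a,b,c"); // -> [a, b, c]
    //   csv.tokenizeToList("a,b,");  // -> [a, b, ]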
    /**
     * Returns a q-gram tokenizer for a variable {@code q}. The tokenizer will
     * return an empty collection if the input is empty. A collection with the
     * original input is returned for tokens shorter than {@code q}.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * 
     * @param q
     *            size of the tokens
     * @return a q-gram tokenizer
     */
    public static Tokenizer qGram(int q) {
        return new QGram(q);
    }
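    // Illustrative usage (a sketch; the expected tokens are taken from the
    // QGram class documentation below):
    //
    //   Tokenizers.qGram(2).tokenizeToList("HelloWorld");
    //   // -> [He, el, ll, lo, oW, Wo, or, rl, ld]
    //   Tokenizers.qGram(2).tokenizeToList("A");
    //   // -> [A], since inputs shorter than q are returned as-is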
    /**
     * Returns a q-gram tokenizer for a variable {@code q}. The tokenizer will
     * return an empty collection if the input is empty or shorter than
     * {@code q}.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * 
     * @param q
     *            size of the tokens
     * @return a q-gram tokenizer
     */
    public static Tokenizer qGramWithFilter(int q) {
        return new QGram(q, true);
    }
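    // Illustrative contrast with qGram (a sketch based on the documented
    // behaviour for inputs shorter than q):
    //
    //   Tokenizers.qGramWithFilter(2).tokenizeToList("A"); // -> []
    //   Tokenizers.qGram(2).tokenizeToList("A");           // -> [A]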
    /**
     * Returns a q-gram tokenizer for a variable {@code q}. The input is padded
     * with {@code q-1} special characters before being tokenized. Uses
     * {@code #} as the default padding.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * 
     * @param q
     *            size of the tokens
     * @return a q-gram tokenizer
     */
    public static Tokenizer qGramWithPadding(int q) {
        return new QGramExtended(q);
    }
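    // Illustrative usage (a sketch; the expected tokens are taken from the
    // QGramExtended class documentation below):
    //
    //   Tokenizers.qGramWithPadding(2).tokenizeToList("HelloWorld");
    //   // -> [#H, He, el, ll, lo, oW, Wo, or, rl, ld, d#]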
    /**
     * Returns a q-gram tokenizer for a variable {@code q}. The q-gram is
     * extended beyond the length of the string with padding.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * 
     * @param q
     *            size of the tokens
     * @param padding
     *            padding to pad start and end of string with
     * @return a q-gram tokenizer
     */
    public static Tokenizer qGramWithPadding(int q, String padding) {
        return qGramWithPadding(q, padding, padding);
    }
    /**
     * Returns a q-gram tokenizer for a variable {@code q}. The q-gram is
     * extended beyond the length of the string with padding.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * 
     * @param q
     *            size of the tokens
     * @param startPadding
     *            padding to pad start of string with
     * @param endPadding
     *            padding to pad end of string with
     * @return a q-gram tokenizer
     */
    public static Tokenizer qGramWithPadding(int q, String startPadding,
            String endPadding) {
        return new QGramExtended(q, startPadding, endPadding);
    }
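    // Illustrative usage with distinct start and end padding (a sketch; the
    // padding characters ^ and $ are arbitrary choices, and the tokens follow
    // from the q-1 padding rule):
    //
    //   Tokenizers.qGramWithPadding(2, "^", "$").tokenizeToList("Hi");
    //   // -> [^H, Hi, i$]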
    /**
     * Returns a tokenizer that splits a string into tokens around whitespace.
     * Does not return leading or trailing empty tokens.
     * <p>
     * To create a tokenizer that returns leading and trailing empty tokens use
     * {@code Tokenizers.pattern("\\s+")}.
     * 
     * @return a whitespace tokenizer
     */
    public static Tokenizer whitespace() {
        return new Whitespace();
    }

    /**
     * Constructs a new transforming tokenizer. After tokenization, all tokens
     * are transformed by the function.
     * 
     * @param tokenizer
     *            delegate tokenizer
     * @param function
     *            to transform tokens
     * @return a new transforming tokenizer
     */
    public static Tokenizer transform(Tokenizer tokenizer,
            Function<String, String> function) {
        if (tokenizer instanceof Transform) {
            return Transform.createCombined((Transform) tokenizer, function);
        } else if (tokenizer instanceof Filter) {
            return Transform.createCombined((Filter) tokenizer, function);
        }
        return new Transform(tokenizer, function);
    }
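    // Illustrative usage (a sketch; the lower-casing Function is an arbitrary
    // example of a transform):
    //
    //   Tokenizer lowerCaseWords = Tokenizers.transform(
    //           Tokenizers.whitespace(),
    //           new Function<String, String>() {
    //               @Override
    //               public String apply(String token) {
    //                   return token.toLowerCase();
    //               }
    //           });
    //   lowerCaseWords.tokenizeToList("Hello World"); // -> [hello, world]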
    /**
     * Chains tokenizers together. The output of each tokenizer is tokenized by
     * the next. The tokenizers are applied in order.
     * <p>
     * If only a single tokenizer is provided, that tokenizer is returned.
     * 
     * @param tokenizers
     *            a non-empty list of tokenizers
     * @return a chain of tokenizers
     */
    public static Tokenizer chain(List<Tokenizer> tokenizers) {
        if (tokenizers.size() == 1) {
            return tokenizers.get(0);
        }
        return new Recursive(flatten(tokenizers));
    }

    private static List<Tokenizer> flatten(List<Tokenizer> tokenizers) {
        Builder<Tokenizer> flattened = ImmutableList.builder();
        for (Tokenizer t : tokenizers) {
            if (t instanceof Recursive) {
                // Tokenizers controls the creation of recursive tokenizers.
                // All recursive tokenizers are flat, so we don't have to
                // flatten recursively.
                final Recursive r = (Recursive) t;
                flattened.addAll(r.getTokenizers());
            } else {
                flattened.add(t);
            }
        }
        return flattened.build();
    }
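    // Illustrative usage (a sketch): split into words first, then break every
    // word into 3-grams.
    //
    //   Tokenizer chained = Tokenizers.chain(
    //           asList(Tokenizers.whitespace(), Tokenizers.qGram(3)));
    //   chained.tokenizeToList("Hello World");
    //   // -> [Hel, ell, llo, Wor, orl, rld]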
    /**
     * Chains tokenizers together. The output of each tokenizer is tokenized by
     * the next. The tokenizers are applied in order.
     * <p>
     * If only a single tokenizer is provided, that tokenizer is returned.
     * 
     * @param tokenizer
     *            the first tokenizer
     * @param tokenizers
     *            the other tokenizers
     * @return a chain of tokenizers
     */
    public static Tokenizer chain(Tokenizer tokenizer, Tokenizer... tokenizers) {
        checkNotNull(tokenizer);
        if (tokenizers.length == 0) {
            return tokenizer;
        }
        return new Recursive(flatten(asList(tokenizer, tokenizers)));
    }

    /**
     * Constructs a new filtering tokenizer. After tokenization, all tokens
     * that don't match {@code predicate} are removed.
     * 
     * @param tokenizer
     *            delegate tokenizer
     * @param predicate
     *            for tokens to keep
     * @return a new filtering tokenizer
     */
    public static Tokenizer filter(Tokenizer tokenizer,
            Predicate<String> predicate) {
        if (tokenizer instanceof Filter) {
            return Filter.createCombined((Filter) tokenizer, predicate);
        } else if (tokenizer instanceof Transform) {
            return Filter.createCombined((Transform) tokenizer, predicate);
        }
        return new Filter(tokenizer, predicate);
    }
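    // Illustrative usage (a sketch; the minimum-length Predicate is an
    // arbitrary example of a filter):
    //
    //   Tokenizer longWords = Tokenizers.filter(
    //           Tokenizers.whitespace(),
    //           new Predicate<String>() {
    //               @Override
    //               public boolean apply(String token) {
    //                   return token.length() >= 4;
    //               }
    //           });
    //   longWords.tokenizeToList("the quick fox"); // -> [quick]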
    static class Filter implements Tokenizer {

        static final class TransformFilter extends Filter {

            private final Transform tokenizer;

            TransformFilter(Transform tokenizer, Predicate<String> predicate) {
                super(tokenizer, predicate);
                this.tokenizer = tokenizer;
            }

            @Override
            Transform getTokenizer() {
                return tokenizer;
            }

            @Override
            Collection<String> tokenizeToFilteredList(String input) {
                return Collections2.filter(
                        tokenizer.tokenizeToTransformedList(input), predicate);
            }

            @Override
            Collection<String> tokenizeToFilteredMultiset(String input) {
                return Collections2.filter(
                        tokenizer.tokenizeToTransformedMultiset(input),
                        predicate);
            }

            @Override
            Collection<String> tokenizeToFilteredSet(String input) {
                return Collections2.filter(
                        tokenizer.tokenizeToTransformedSet(input), predicate);
            }

            @Override
            public List<String> tokenizeToList(String input) {
                return newArrayList(Collections2.filter(
                        tokenizer.tokenizeToTransformedList(input), predicate));
            }

            @Override
            public Multiset<String> tokenizeToMultiset(String input) {
                return HashMultiset.create(Collections2.filter(
                        tokenizer.tokenizeToTransformedMultiset(input),
                        predicate));
            }

            @Override
            public Set<String> tokenizeToSet(String input) {
                return newHashSet(Collections2.filter(
                        tokenizer.tokenizeToTransformedSet(input), predicate));
            }
        }

        static Tokenizer createCombined(Filter tokenizer,
                Predicate<String> predicate) {
            if (tokenizer instanceof TransformFilter) {
                TransformFilter tft = (TransformFilter) tokenizer;
                return new TransformFilter(tft.getTokenizer(), and(
                        tft.getPredicate(), predicate));
            }
            return new Filter(tokenizer.getTokenizer(), and(
                    tokenizer.getPredicate(), predicate));
        }

        static Tokenizer createCombined(Transform tokenizer,
                Predicate<String> predicate) {
            return new TransformFilter(tokenizer, predicate);
        }

        protected final Predicate<String> predicate;

        private final Tokenizer tokenizer;

        Filter(Tokenizer tokenizer, Predicate<String> predicate) {
            checkNotNull(tokenizer);
            checkNotNull(predicate);
            this.predicate = predicate;
            this.tokenizer = tokenizer;
        }

        Predicate<String> getPredicate() {
            return predicate;
        }

        Tokenizer getTokenizer() {
            return tokenizer;
        }

        Collection<String> tokenizeToFilteredList(String input) {
            return Collections2.filter(tokenizer.tokenizeToList(input),
                    predicate);
        }

        Collection<String> tokenizeToFilteredMultiset(String input) {
            return Collections2.filter(tokenizer.tokenizeToMultiset(input),
                    predicate);
        }

        Collection<String> tokenizeToFilteredSet(String input) {
            return Sets.filter(tokenizer.tokenizeToSet(input), predicate);
        }

        @Override
        public List<String> tokenizeToList(String input) {
            return new ArrayList<>(Collections2.filter(
                    tokenizer.tokenizeToList(input), predicate));
        }

        @Override
        public Multiset<String> tokenizeToMultiset(String input) {
            return HashMultiset.create(Multisets.filter(
                    tokenizer.tokenizeToMultiset(input), predicate));
        }

        @Override
        public Set<String> tokenizeToSet(String input) {
            return new HashSet<>(Collections2.filter(
                    tokenizer.tokenizeToSet(input), predicate));
        }

        @Override
        public final String toString() {
            return Joiner.on(" -> ").join(tokenizer, predicate);
        }
    }

    static final class Recursive implements Tokenizer {

        private final List<Tokenizer> tokenizers;

        Recursive(List<Tokenizer> tokenizers) {
            this.tokenizers = ImmutableList.copyOf(tokenizers);
        }

        List<Tokenizer> getTokenizers() {
            return tokenizers;
        }

        @Override
        public List<String> tokenizeToList(final String input) {
            List<String> tokens = new ArrayList<>(input.length());
            tokens.add(input);

            List<String> newTokens = new ArrayList<>(input.length());
            for (Tokenizer t : tokenizers) {
                for (String token : tokens) {
                    newTokens.addAll(t.tokenizeToList(token));
                }
                List<String> swap = tokens;
                tokens = newTokens;
                newTokens = swap;
                newTokens.clear();
            }

            return tokens;
        }

        @Override
        public Multiset<String> tokenizeToMultiset(String input) {
            // tokenizeToList is not reused here on purpose. Removing duplicate
            // words early means these don't have to be tokenized multiple
            // times. Increases performance.
            Multiset<String> tokens = HashMultiset.create(input.length());
            tokens.add(input);

            Multiset<String> newTokens = HashMultiset.create(input.length());
            for (Tokenizer t : tokenizers) {
                for (String token : tokens) {
                    newTokens.addAll(t.tokenizeToList(token));
                }
                Multiset<String> swap = tokens;
                tokens = newTokens;
                newTokens = swap;
                newTokens.clear();
            }

            return tokens;
        }

        @Override
        public Set<String> tokenizeToSet(final String input) {
            // tokenizeToList is not reused here on purpose. Removing duplicate
            // words early means these don't have to be tokenized multiple
            // times. Increases performance.
            Set<String> tokens = new HashSet<>(input.length());
            tokens.add(input);

            Set<String> newTokens = new HashSet<>(input.length());
            for (Tokenizer t : tokenizers) {
                for (String token : tokens) {
                    // Use tokenizeToList here to avoid building an
                    // intermediate set for each token before adding it to
                    // newTokens.
                    newTokens.addAll(t.tokenizeToList(token));
                }
                Set<String> swap = tokens;
                tokens = newTokens;
                newTokens = swap;
                newTokens.clear();
            }

            return tokens;
        }

        @Override
        public String toString() {
            return Joiner.on(" -> ").join(tokenizers);
        }
    }

    static final class Split extends AbstractTokenizer {

        private final Pattern pattern;

        public Split(Pattern pattern) {
            this.pattern = pattern;
        }

        @Override
        public List<String> tokenizeToList(final String input) {
            return asList(pattern.split(input, -1));
        }

        @Override
        public String toString() {
            return "Split[" + pattern + "]";
        }

        Pattern getPattern() {
            return pattern;
        }
    }
    static class Transform implements Tokenizer {

        static final class FilterTransform extends Transform {

            private final Filter tokenizer;

            FilterTransform(Filter tokenizer, Function<String, String> function) {
                super(tokenizer, function);
                this.tokenizer = tokenizer;
            }

            @Override
            Filter getTokenizer() {
                return tokenizer;
            }

            @Override
            public List<String> tokenizeToList(String input) {
                return newArrayList(Collections2.transform(
                        tokenizer.tokenizeToFilteredList(input), function));
            }

            @Override
            public Multiset<String> tokenizeToMultiset(String input) {
                return HashMultiset.create(Collections2.transform(
                        tokenizer.tokenizeToFilteredMultiset(input), function));
            }

            @Override
            public Set<String> tokenizeToSet(String input) {
                return newHashSet(Collections2.transform(
                        tokenizer.tokenizeToFilteredSet(input), function));
            }

            @Override
            Collection<String> tokenizeToTransformedList(String input) {
                return Collections2.transform(
                        tokenizer.tokenizeToFilteredList(input), function);
            }

            @Override
            Collection<String> tokenizeToTransformedMultiset(String input) {
                return Collections2.transform(
                        tokenizer.tokenizeToFilteredMultiset(input), function);
            }

            @Override
            Collection<String> tokenizeToTransformedSet(String input) {
                return Collections2.transform(
                        tokenizer.tokenizeToFilteredSet(input), function);
            }
        }

        static Tokenizer createCombined(Filter tokenizer,
                Function<String, String> function) {
            return new FilterTransform(tokenizer, function);
        }

        static Tokenizer createCombined(Transform tokenizer,
                Function<String, String> function) {
            if (tokenizer instanceof FilterTransform) {
                FilterTransform ftt = (FilterTransform) tokenizer;
                return new FilterTransform(ftt.getTokenizer(), compose(
                        function, tokenizer.getFunction()));
            }
            return new Transform(tokenizer.getTokenizer(), compose(function,
                    tokenizer.getFunction()));
        }

        protected final Function<String, String> function;

        private final Tokenizer tokenizer;

        Transform(Tokenizer tokenizer, Function<String, String> function) {
            checkNotNull(tokenizer);
            checkNotNull(function);
            this.function = function;
            this.tokenizer = tokenizer;
        }

        Function<String, String> getFunction() {
            return function;
        }

        Tokenizer getTokenizer() {
            return tokenizer;
        }

        @Override
        public List<String> tokenizeToList(String input) {
            return newArrayList(Collections2.transform(
                    tokenizer.tokenizeToList(input), function));
        }

        @Override
        public Multiset<String> tokenizeToMultiset(String input) {
            return HashMultiset.create(Collections2.transform(
                    tokenizer.tokenizeToMultiset(input), function));
        }

        @Override
        public Set<String> tokenizeToSet(String input) {
            return newHashSet(Collections2.transform(
                    tokenizer.tokenizeToSet(input), function));
        }

        Collection<String> tokenizeToTransformedList(String input) {
            return Lists.transform(tokenizer.tokenizeToList(input), function);
        }

        Collection<String> tokenizeToTransformedMultiset(String input) {
            return Collections2.transform(tokenizer.tokenizeToMultiset(input),
                    function);
        }

        Collection<String> tokenizeToTransformedSet(String input) {
            return Collections2.transform(tokenizer.tokenizeToSet(input),
                    function);
        }

        @Override
        public final String toString() {
            return Joiner.on(" -> ").join(tokenizer, function);
        }
    }

    static final class Whitespace extends AbstractTokenizer {

        private final Pattern pattern = Pattern.compile("\\s+");

        Whitespace() {
        }

        @Override
        public List<String> tokenizeToList(final String input) {
            if (input.isEmpty()) {
                return emptyList();
            }

            String[] tokens = pattern.split(input);

            // Remove leading empty token if any
            if (tokens.length > 0 && tokens[0].isEmpty()) {
                tokens = copyOfRange(tokens, 1, tokens.length);
            }

            return asList(tokens);
        }

        @Override
        public String toString() {
            return "Whitespace";
        }
    }
    /**
     * Q-gram tokenizer for a variable {@code q}.
     * <p>
     * Input strings are tokenized into overlapping substrings of length
     * {@code q}. E.g. for {@code q=2} the string {@code "HelloWorld"} is
     * tokenized into {@code [He, el, ll, lo, oW, Wo, or, rl, ld]}.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * <p>
     * This class is immutable and thread-safe.
     */
    static class QGram extends AbstractTokenizer {

        private final int q;
        private final boolean filter;

        QGram(int q, boolean filter) {
            checkArgument(q > 0, "q must be greater than 0");
            this.q = q;
            this.filter = filter;
        }

        public QGram(int q) {
            this(q, false);
        }

        int getQ() {
            return q;
        }

        boolean isFilter() {
            return filter;
        }

        @Override
        public List<String> tokenizeToList(final String input) {
            if (input.isEmpty()) {
                return emptyList();
            }

            // Minor optimization. A string has at least as many characters as
            // code points, so an input with fewer than q characters also has
            // fewer than q code points.
            if (filter && input.length() < q) {
                return new ArrayList<>();
            } else if (input.length() < q) {
                return singletonList(input);
            }

            // To create a q-gram set of tokens we move a q-code-points-wide
            // sliding window across the string. So the final index of the
            // left side of the window lies q code points to the left of the
            // end of the string.
            final int lastQGramStart;
            try {
                lastQGramStart = input.offsetByCodePoints(input.length(), -q);
            } catch (IndexOutOfBoundsException e) {
                // When the window doesn't fit, act according to the filter
                // setting.
                if (filter) {
                    return new ArrayList<>();
                }
                return singletonList(input);
            }

            final List<String> ret = new ArrayList<>(input.length());

            for (int qGramStart = 0; qGramStart <= lastQGramStart;
                    qGramStart = input.offsetByCodePoints(qGramStart, 1)) {
                ret.add(input.substring(qGramStart,
                        input.offsetByCodePoints(qGramStart, q)));
            }

            return ret;
        }

        @Override
        public String toString() {
            return "QGram [q=" + q + "]";
        }
    }
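    // Worked example of the code-point handling above (a sketch; derived from
    // the sliding-window logic in QGram): for q = 2 the input
    // "a\uD83D\uDE00b" is four chars but three code points, because
    // \uD83D\uDE00 is a surrogate pair. It yields two tokens, each keeping
    // the pair intact:
    //
    //   new QGram(2).tokenizeToList("a\uD83D\uDE00b");
    //   // -> [a\uD83D\uDE00, \uD83D\uDE00b]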
    /**
     * Q-gram tokenizer for a variable {@code q}. The q-gram is extended beyond
     * the length of the string with padding.
     * <p>
     * Input strings are tokenized into overlapping substrings of length
     * {@code q}. E.g. for {@code q=2} and padding {@code #} the string
     * {@code "HelloWorld"} is tokenized into
     * {@code [#H, He, el, ll, lo, oW, Wo, or, rl, ld, d#]}.
     * <p>
     * The tokenizer takes care to split the string on Unicode code points, not
     * separating valid surrogate pairs.
     * <p>
     * This class is immutable and thread-safe.
     */
    static class QGramExtended extends AbstractTokenizer {

        private static final String DEFAULT_START_PADDING = "#";
        private static final String DEFAULT_END_PADDING = "#";

        private final String endPadding;
        private final String startPadding;

        private final QGram tokenizer;

        /**
         * Constructs a q-gram tokenizer with the given {@code q} and padding.
         * 
         * @param q
         *            size of the tokens
         * @param startPadding
         *            padding to apply at the start of short tokens
         * @param endPadding
         *            padding to apply at the end of short tokens
         */
        public QGramExtended(int q, String startPadding, String endPadding) {
            checkArgument(!startPadding.isEmpty(),
                    "startPadding may not be empty");
            checkArgument(!endPadding.isEmpty(), "endPadding may not be empty");

            this.tokenizer = new QGram(q);
            this.startPadding = repeat(startPadding, q - 1);
            this.endPadding = repeat(endPadding, q - 1);
        }

        /**
         * Constructs a q-gram tokenizer with the given {@code q} and default
         * padding.
         * 
         * @param q
         *            size of the tokens
         */
        public QGramExtended(int q) {
            this(q, DEFAULT_START_PADDING, DEFAULT_END_PADDING);
        }

        @Override
        public List<String> tokenizeToList(String input) {
            if (input.isEmpty()) {
                return emptyList();
            }

            return tokenizer.tokenizeToList(startPadding + input + endPadding);
        }

        @Override
        public String toString() {
            return "QGramExtended [startPadding=" + startPadding
                    + ", endPadding=" + endPadding + ", q=" + tokenizer.getQ()
                    + "]";
        }

        int getQ() {
            return tokenizer.getQ();
        }

        String getStartPadding() {
            return startPadding;
        }

        String getEndPadding() {
            return endPadding;
        }
    }

    private Tokenizers() {
        // Utility class
    }
}