// Source listing: com.force.i18n.commons.text.GenericTrieMatcher, from the grammaticus artifact (Maven / Gradle / Ivy).
/*
* Copyright (c) 2017, salesforce.com, inc.
* All rights reserved.
* Licensed under the BSD 3-Clause license.
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
package com.force.i18n.commons.text;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.google.common.annotations.Beta;
/**
* A trie matcher that uses tokens instead of strings. It's not nearly
* as efficient, since it uses a regular hashmap instead of a nice IntHashMap
* to store the trie matches, but it's similar. If you want to use this
* for an AST or an enum, you probably want to pass the token class to
* {@link #compile(List, List, Class)}; for enum tokens the trie is then
* backed by an {@code EnumMap} instead of a {@code HashMap}.
*
* The type parameter is the token type; it can be an enum or an Id.
*
* Beta class. Classes under com.force.i18n.commons package will be moved into a dedicated project.
*
* @author stamm
*/
@Beta
public class GenericTrieMatcher<T> {
    private static final int DEFAULT_CAPACITY = 1; // trading initialization time for a small memory footprint

    /**
     * Builds a matcher backed by plain hash maps. This is not the cheapest of operations.
     *
     * @param <T> the type of object being matched
     * @param searches this is the list of words that make up the Trie.
     *        It is assumed that the lists are not modified once passed into the Trie
     * @param replacements the list of words that can be used to replace those words
     *        (same size and order as {@code searches}).
     *        It is assumed that the lists are not modified once passed into the Trie
     * @return a new GenericTrieMatcher
     * @throws NullPointerException if either list is {@code null}
     * @throws IllegalArgumentException if the two lists differ in size
     */
    public static <T> GenericTrieMatcher<T> compile(List<? extends List<T>> searches,
            List<? extends List<T>> replacements) {
        return compile(searches, replacements, null);
    }

    /**
     * Builds a matcher. This is not the cheapest of operations.
     *
     * @param <T> the type of object being matched
     * @param searches this is the list of words that make up the Trie.
     *        It is assumed that the lists are not modified once passed into the Trie
     * @param replacements the list of words that can be used to replace those words
     *        (same size and order as {@code searches}).
     *        It is assumed that the lists are not modified once passed into the Trie
     * @param tokenClass based on the class, a more efficient trie map can be generated
     *        (an {@code EnumMap} when it is an enum); may be {@code null}
     * @return a new GenericTrieMatcher
     * @throws NullPointerException if either list is {@code null}
     * @throws IllegalArgumentException if the two lists differ in size
     */
    public static <T> GenericTrieMatcher<T> compile(List<? extends List<T>> searches,
            List<? extends List<T>> replacements, Class<T> tokenClass) {
        return new GenericTrieMatcher<T>(searches, replacements, tokenClass);
    }

    /**
     * Search and replace multiple words in {@code s} given the words and replacements
     * held by {@code trieMatcher}.
     *
     * This is best used when 1) you will reuse the Trie many times 2) you have a large set
     * of words you are searching on.
     *
     * Note, regexes aren't supported by this.
     *
     * @param <T> the type of object being matched
     * @param s the text you are searching in
     * @param trieMatcher the trie representing the words to search and replace for
     * @return the text with the search words swapped by the replacements; {@code s} itself
     *         when {@code s} or {@code trieMatcher} is {@code null} or nothing matched
     */
    public static final <T> List<T> replaceMultiple(List<T> s, GenericTrieMatcher<T> trieMatcher) {
        return replaceMultiple(s, trieMatcher, null);
    }

    /**
     * Search and replace multiple words in {@code s} given the words and replacements
     * held by {@code trieMatcher}, and a validation strategy.
     *
     * @param <T> the type of object being matched
     * @param s the text you are searching in
     * @param trieMatcher the trie representing the words to search and replace for
     * @param validator the optional code that validates whether a match should be accepted or not.
     *        It is consulted once per candidate match; a rejected candidate is skipped and the
     *        search resumes right after the position where it started.
     * @return the text with the search words swapped by the replacements; {@code s} itself
     *         when {@code s} or {@code trieMatcher} is {@code null} or nothing (valid) matched
     */
    public static final <T> List<T> replaceMultiple(List<T> s, GenericTrieMatcher<T> trieMatcher,
            MatchValidator<T> validator) {
        if (s == null || trieMatcher == null) {
            return s;
        }
        final int length = s.size();
        // Allocated on the first accepted match only: it's likely all or nothing,
        // and the untouched input is handed back as-is when nothing matches.
        List<T> result = null;
        int pos = 0;
        while (pos < length) {
            GenericTrieMatch<T> match = trieMatcher.match(s, pos);
            if (validator != null) {
                // Skip rejected candidates. Restarting just past the rejected position
                // avoids re-finding (and re-validating) the very same match.
                while (match != null && !validator.isValidMatch(match, s)) {
                    match = trieMatcher.match(s, match.getPosition() + 1);
                }
            }
            if (match == null) {
                if (result == null) {
                    return s;
                }
                // No more matches, so copy the rest and get gone
                result.addAll(s.subList(pos, length));
                break;
            }
            if (result == null) {
                result = new ArrayList<T>(length + 16);
            }
            // Copy up to the match position
            if (match.getPosition() > pos) {
                result.addAll(s.subList(pos, match.getPosition()));
            }
            // Append the replacement
            result.addAll(match.getReplacement());
            // Advance our current position
            pos = match.getPosition() + match.getWord().size();
        }
        // result is still null only for an empty input; a fresh list is returned in that case
        return result != null ? result : new ArrayList<T>(16);
    }

    /**
     * An interface that represents whether a match for a given list is "valid".
     *
     * @param <T> the type of object being matched
     */
    public interface MatchValidator<T> {
        /**
         * @param match the match found
         * @param src the original source list being modified (NOTE: the positions may be different.)
         * @return {@code true} if the match in the given src list is valid
         */
        boolean isValidMatch(GenericTrieMatch<T> match, List<T> src);
    }

    /**
     * @param s the term to search for the terms of the trie in
     * @return true if any of the terms are contained in s
     */
    public boolean containedIn(List<T> s) {
        return match(s) != null;
    }

    /**
     * @param s the term to see if it starts with any terms of the trie
     * @return whether the list begins with any of the matches in this trie
     */
    public boolean begins(List<T> s) {
        return s != null && !s.isEmpty() && contains(s, 0) != null;
    }

    /**
     * Find the next match in {@code s}.
     *
     * @param s the term to search for the terms of the trie in
     * @param start the 0-based position to start the search from.
     * @return the matched word, or null if no match found
     */
    public List<T> findIn(List<T> s, int start) {
        GenericTrieMatch<T> match = match(s, start);
        return match == null ? null : match.getWord();
    }

    /** One trie node: the word ending here (if any), its replacement, and the child nodes. */
    private static class GenericTrieData<T> {
        List<T> word;
        List<T> replacement;
        /** Index in the input list of the first search word ending at this node; lower wins. */
        int priority;
        final Map<T, GenericTrieData<T>> nextChars;

        GenericTrieData(Map<T, GenericTrieData<T>> next) {
            this.nextChars = next;
        }
    }

    private final Map<T, GenericTrieData<T>> root;
    private final int minWordLength;

    /**
     * Use the factory {@link #compile(List, List, Class)} instead.
     */
    private GenericTrieMatcher(List<? extends List<T>> strings, List<? extends List<T>> replacements,
            Class<T> tokenClass) {
        if (strings == null) throw new NullPointerException("searches");
        if (replacements == null) throw new NullPointerException("replacements");
        if (strings.size() != replacements.size()) {
            throw new IllegalArgumentException("Replacements must have same size, "+ replacements.size()
                + ", as search strings " + strings.size());
        }

        this.root = makeMap(tokenClass);
        int minWordLen = Integer.MAX_VALUE;
        int wordIndex = 0;
        for (List<T> s : strings) {
            Map<T, GenericTrieData<T>> current = this.root;
            int len = s.size();
            minWordLen = Math.min(minWordLen, len);
            for (int i = 0; i < len; i++) {
                T ch = s.get(i);
                GenericTrieData<T> next = current.get(ch);
                if (next == null) {
                    next = new GenericTrieData<T>(makeMap(tokenClass));
                    current.put(ch, next);
                }
                current = next.nextChars;

                // if we're at the last char, store it and its replacement...
                if (i + 1 == len) {
                    if (next.word == null) {
                        // priority follows the first time this word appears in the input
                        next.priority = wordIndex;
                    }
                    next.word = s;
                    next.replacement = replacements.get(wordIndex);
                }
            }
            wordIndex++;
        }
        this.minWordLength = minWordLen;
    }

    /** Creates the child map for a node: an EnumMap for enum tokens, a small HashMap otherwise. */
    @SuppressWarnings({ "unchecked", "rawtypes" }) // Conversion to enum can't be done in a way that is safe
    private Map<T, GenericTrieData<T>> makeMap(Class<T> tokenClass) {
        if (tokenClass != null && tokenClass.isEnum()) {
            return new EnumMap(tokenClass);
        }
        return new HashMap<T, GenericTrieData<T>>(DEFAULT_CAPACITY);
    }

    /**
     * See if the given list matches any of the given words in the Trie
     *
     * @param s the list of objects to search
     * @return null if none are found.
     */
    GenericTrieMatch<T> match(List<T> s) {
        return match(s, 0);
    }

    /**
     * See if the given list matches any of the given words in the Trie
     *
     * @param s the list of objects to search
     * @param offset where to start looking inside of the given list.
     * @return null if none are found (also for a null or empty list, or a negative offset).
     */
    public GenericTrieMatch<T> match(List<T> s, int offset) {
        if (s == null || s.isEmpty() || offset < 0) return null;

        final int len = s.size();
        for (int i = offset; i < len; i++) {
            // optimize the case when we don't have enough room left to contain any matches
            // (written as a subtraction so an empty trie's Integer.MAX_VALUE cannot overflow)
            if (len - i < this.minWordLength) break;

            GenericTrieData<T> data = contains(s, i);
            if (data != null) return new GenericTrieMatch<T>(i, data.word, data.replacement);
        }
        return null;
    }

    /**
     * Walks the trie along {@code s} starting at {@code offset} and picks, among all words
     * ending on that path, the one that came first in the input given to the trie.
     *
     * @return null if not found
     */
    private GenericTrieData<T> contains(List<T> s, int offset) {
        Map<T, GenericTrieData<T>> current = this.root;
        GenericTrieData<T> best = null;
        final int len = s.size();
        for (int i = offset; i < len; i++) {
            GenericTrieData<T> node = current.get(s.get(i));
            if (node == null) break;

            if (node.word != null && (best == null || node.priority < best.priority)) {
                best = node;
            }
            current = node.nextChars;
        }
        return best;
    }

    /**
     * Struct returned by {@link GenericTrieMatcher#match(List)} to represent a match.
     *
     * @param <T> the type of object being matched
     * @author koliver
     * @see TrieMatcher
     */
    public static class GenericTrieMatch<T> {
        private final int position;
        private final List<T> word;
        private final List<T> replacement;

        GenericTrieMatch(int position, List<T> word, List<T> replacement) {
            if (position < 0) throw new IllegalArgumentException(Integer.toString(position));
            if (word == null) throw new NullPointerException();
            if (replacement == null) throw new NullPointerException();

            this.position = position;
            this.word = Collections.unmodifiableList(word);
            this.replacement = Collections.unmodifiableList(replacement);
        }

        /**
         * @return The position of where the match was in the source.
         *         Eg, for a trie built from the word {@code [x]} with replacement {@code [Y]},
         *         matching against {@code [a, b, c, x, d, e, f]} gives position {@code 3}.
         */
        public int getPosition() {
            return this.position;
        }

        /**
         * @return The word in the trie that matched, as an unmodifiable list.
         *         Eg, for a trie built from the word {@code [x]} with replacement {@code [Y]},
         *         matching against {@code [a, b, c, x, d, e, f]} gives {@code [x]}.
         */
        public List<T> getWord() {
            return this.word;
        }

        /**
         * @return The replacement for word in the trie that matched, as an unmodifiable list.
         *         Eg, for a trie built from the word {@code [x]} with replacement {@code [Y]},
         *         matching against {@code [a, b, c, x, d, e, f]} gives {@code [Y]}.
         */
        public List<T> getReplacement() {
            return this.replacement;
        }
    }
}