// com.force.i18n.commons.text.TrieMatcher Maven / Gradle / Ivy
// Show all versions of grammaticus Show documentation
/*
* Copyright (c) 2017, salesforce.com, inc.
* All rights reserved.
* Licensed under the BSD 3-Clause license.
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
package com.force.i18n.commons.text;
import java.util.*;
import com.force.i18n.commons.util.collection.IntHashMap;
import com.google.common.annotations.Beta;
/**
* An immutable trie used for fast multiple string search and replace.
*
* Its set of words and replacements is populated at initialization,
* and the data structure creation is not the cheapest of operations,
* so it is best used when the object will be used multiple times.
*
* Beta class. Classes under com.force.i18n.commons package will be moved into a dedicated project.
*
* @author koliver
* @see TrieMatcher#replaceMultiple(String, TrieMatcher)
*/
@Beta
public class TrieMatcher {
    private static final int DEFAULT_CAPACITY = 1; // trading initialization time for a small memory footprint

    /** Nodes reachable from the start of a word, keyed by the word's first character. */
    private final IntHashMap<TrieData> root;

    /**
     * Length of the shortest non-empty search word, or {@link Integer#MAX_VALUE} when there is none.
     * Used by {@link #match(CharSequence, int)} to stop scanning once the remaining input is too short
     * to hold any word.
     */
    private final int minWordLength;

    /**
     * One node of the trie. {@code word} is non-null only on nodes where a search word ends.
     */
    private static final class TrieData {
        /** The search word ending at this node, or null if no word ends here. */
        String word;
        /** The replacement paired with {@code word}; for a repeated word, the last one supplied. */
        String replacement;
        /** Index of the first occurrence of {@code word} in the input list; a lower value wins ties. */
        int priority;
        /** Child nodes keyed by the next character. */
        final IntHashMap<TrieData> nextChars;

        TrieData(IntHashMap<TrieData> next) {
            this.nextChars = next;
        }
    }

    /**
     * This is not the cheapest of operations.
     *
     * @return a new TrieMatcher
     * @param strings the words that make up the Trie, in priority order (earlier entries win when
     *        several words match at the same position). The array is only read while compiling.
     * @param replacements the replacement for each word, parallel to {@code strings}.
     *        The array is only read while compiling.
     * @throws NullPointerException if either array is null or {@code strings} contains null
     * @throws IllegalArgumentException if the arrays differ in length
     */
    public static TrieMatcher compile(String[] strings, String[] replacements) {
        return TrieMatcher.compile(Arrays.asList(strings), Arrays.asList(replacements));
    }

    /**
     * This is not the cheapest of operations.
     *
     * @return a new TrieMatcher
     * @param strings the words that make up the Trie, in priority order (earlier entries win when
     *        several words match at the same position). The list is only read while compiling.
     * @param replacements the replacement for each word, parallel to {@code strings}.
     *        The list is only read while compiling.
     * @throws NullPointerException if either list is null or {@code strings} contains null
     * @throws IllegalArgumentException if the lists differ in size
     */
    public static TrieMatcher compile(List<String> strings, List<String> replacements) {
        return new TrieMatcher(strings, replacements);
    }

    /**
     * Use the factory {@link #compile(String[], String[])} instead.
     *
     * @param strings strings
     * @param replacements parallel replacements
     */
    private TrieMatcher(List<String> strings, List<String> replacements) {
        Objects.requireNonNull(strings, "strings");
        Objects.requireNonNull(replacements, "replacements");
        if (strings.size() != replacements.size()) {
            throw new IllegalArgumentException("Replacements must have same size, "+ replacements.size()
                + ", as search strings " + strings.size());
        }

        this.root = new IntHashMap<TrieData>(DEFAULT_CAPACITY);
        int minWordLen = Integer.MAX_VALUE;
        int wordIndex = 0;
        for (String s : strings) {
            int len = s.length();
            // An empty word can never be matched, so it is neither stored nor allowed to
            // drag the minimum length down to zero (which would disable the early exit in match()).
            if (len > 0) {
                minWordLen = Math.min(minWordLen, len);
                insert(this.root, s, replacements.get(wordIndex), wordIndex);
            }
            wordIndex++;
        }
        this.minWordLength = minWordLen;
    }

    /**
     * Adds one non-empty word to the trie rooted at {@code root}, creating nodes as needed.
     * If the word is already present its replacement is overwritten but its priority is kept,
     * so a repeated word ranks at its first position and uses its last replacement.
     */
    private static void insert(IntHashMap<TrieData> root, String word, String replacement, int priority) {
        IntHashMap<TrieData> current = root;
        TrieData node = null;
        for (int i = 0, len = word.length(); i < len; i++) {
            int ch = word.charAt(i);
            node = current.get(ch);
            if (node == null) {
                node = new TrieData(new IntHashMap<TrieData>(DEFAULT_CAPACITY));
                current.put(ch, node);
            }
            current = node.nextChars;
        }
        // word is non-empty, so node is the node of its last character
        if (node.word == null) {
            node.priority = priority;
        }
        node.word = word;
        node.replacement = replacement;
    }

    /**
     * @return whether the string begins with any of the terms in the trie
     * @param s the term to see if it starts with any terms of the trie
     */
    public boolean begins(CharSequence s) {
        TrieData match = begins(s, 0);
        return match != null;
    }

    /**
     * @return the winning word that starts exactly at {@code offset}, or null if there is none
     *         or the arguments are null, empty or negative
     */
    private TrieData begins(CharSequence s, int offset) {
        if (s == null || s.length() == 0 || offset < 0) return null;
        return contains(s, offset);
    }

    /**
     * Walks the trie along the characters of {@code s} starting at {@code offset} and collects
     * every word that ends along that path. When several do, the one that appeared earliest
     * in the list given to {@code compile} wins.
     *
     * @return null if not found
     * @param s the string to check
     * @param offset the offset in the string
     */
    private TrieData contains(CharSequence s, int offset) {
        IntHashMap<TrieData> current = this.root;
        TrieData best = null;
        for (int i = offset, len = s.length(); i < len; i++) {
            int ch = s.charAt(i);
            TrieData node = current.get(ch);
            if (node == null) break;
            if (node.word != null && (best == null || node.priority < best.priority)) {
                best = node;
            }
            current = node.nextChars;
        }
        return best;
    }

    /**
     * Search and replace multiple strings in {@code s} given the words and replacements
     * held by the {@code TrieMatcher}.
     *
     * Note, this is best used when 1) you will reuse the Trie many times 2) you have a large set of
     * strings you are searching on
     *
     * Note, regexes aren't supported by this
     *
     * @param s
     *            the text you are searching in
     * @param trieMatcher
     *            the trie representing the words to search and replace for
     * @return the text with the search words swapped by the replacements; {@code s} itself
     *         (same instance) when it is null or empty, {@code trieMatcher} is null, or nothing matches
     */
    public static final String replaceMultiple(String s, TrieMatcher trieMatcher) {
        if (s == null || trieMatcher == null || s.length() == 0) {
            return s;
        }

        // Don't allocate the buffer until the first match is found: it's likely all or nothing.
        StringBuilder result = null;
        int length = s.length();
        int pos = 0;
        while (pos < length) {
            TrieMatch match = trieMatcher.match(s, pos);
            if (match == null) {
                break;
            }
            if (result == null) {
                result = new StringBuilder(length + 16);
            }
            int start = match.getPosition();
            // Copy the untouched text before the match, then the replacement
            result.append(s, pos, start);
            result.append(match.getReplacement());
            // Resume right after the matched word
            pos = start + match.getWord().length();
        }

        if (result == null) {
            return s;
        }
        // Copy whatever follows the last match (nothing when pos == length)
        result.append(s, pos, length);
        return result.toString();
    }

    /**
     * See if the given string matches any of the given words in the Trie
     *
     * @param s the string to look for
     * @param offset where to start looking inside of the given String.
     * @return the match closest to {@code offset}, or null if none are found or the arguments
     *         are null, empty or negative.
     */
    public TrieMatch match(CharSequence s, int offset) {
        if (s == null || s.length() == 0 || offset < 0) return null;

        // Last index at which the shortest word could still fit. Computed by subtraction so it
        // cannot overflow, even when minWordLength is Integer.MAX_VALUE (no words at all).
        // minWordLength >= 1, hence lastStart < s.length().
        int lastStart = s.length() - this.minWordLength;
        for (int i = offset; i <= lastStart; i++) {
            TrieData data = contains(s, i);
            if (data != null) return new TrieMatch(i, data.word, data.replacement);
        }
        return null;
    }
}