All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.force.i18n.commons.text.GenericTrieMatcher Maven / Gradle / Ivy

There is a newer version: 1.2.30
Show newest version
/*
 * Copyright (c) 2017, salesforce.com, inc.
 * All rights reserved.
 * Licensed under the BSD 3-Clause license.
 * For full license text, see LICENSE.txt file in the repo root  or https://opensource.org/licenses/BSD-3-Clause
 */

package com.force.i18n.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.google.common.annotations.Beta;

/**
 * A trie matcher that uses tokens instead of strings.  It's not nearly
 * as efficient, since it uses a regular hashmap instead of a nice IntHashMap
 * to store the trie matches, but it's similar.  If you want to use this
 * for an AST or an enum, you probably want to pass the token class to
 * {@link #compile(List, List, Class)} so a more suitable map can be chosen.
 *
 * The type parameter {@code T} is the token type; it can be an enum or an Id.
 *
 * Beta class. Classes under com.force.i18n.commons package will be moved into a dedicated project.
 *
 * @author stamm
 */
@Beta
public class GenericTrieMatcher<T> {
    private static final int DEFAULT_CAPACITY = 1; // trading initialization time for a small memory footprint

    /**
     * This is not the cheapest of operations.
     *
     * @param <T> the type of object being matched
     * @param searches this is the list of words that make up the Trie.
     *      It is assumed that the lists are not modified once passed into the Trie
     * @param replacements the list of words that can be used to replace those words.
     *      It is assumed that the lists are not modified once passed into the Trie
     * @return a new GenericTrieMatcher
     */
    public static <T> GenericTrieMatcher<T> compile(List<List<T>> searches, List<List<T>> replacements) {
        return compile(searches, replacements, null);
    }

    /**
     * This is not the cheapest of operations.
     *
     * @param <T> the type of object being matched
     * @param searches this is the list of words that make up the Trie.
     *      It is assumed that the lists are not modified once passed into the Trie
     * @param replacements the list of words that can be used to replace those words.
     *      It is assumed that the lists are not modified once passed into the Trie
     * @param tokenClass based on the class, a more efficient trie map can be generated
     *      (an {@link EnumMap} is used when the class is an enum); may be {@code null}
     * @return a new GenericTrieMatcher
     * @throws NullPointerException if {@code searches} or {@code replacements} is null
     * @throws IllegalArgumentException if the two lists differ in size
     */
    public static <T> GenericTrieMatcher<T> compile(List<List<T>> searches, List<List<T>> replacements,
            Class<T> tokenClass) {
        return new GenericTrieMatcher<T>(searches, replacements, tokenClass);
    }

    /**
     * Search and replace multiple strings in s given the words and replacements given in
     * TrieMatcher.
     * <p>
     * This is best used when 1) you will reuse the Trie many times 2) you have a large set of
     * strings you are searching on
     * <p>
     * Note, regexes aren't supported by this
     *
     * @param <T> the type of object being matched
     * @param s
     *        the text you are searching in
     * @param trieMatcher
     *        the trie representing the words to search and replace for
     * @return the text with the search words swapped by the replacements
     */
    public static final <T> List<T> replaceMultiple(List<T> s, GenericTrieMatcher<T> trieMatcher) {
        return replaceMultiple(s, trieMatcher, null);
    }

    /**
     * Search and replace multiple strings in s given the words and replacements given in
     * TrieMatcher and a validation strategy.
     * <p>
     * If nothing is replaced (or either {@code s} or {@code trieMatcher} is null), the very same
     * list instance {@code s} is returned; otherwise a new list is returned.
     *
     * @param <T> the type of object being matched
     * @param s
     *        the text you are searching in
     * @param trieMatcher
     *        the trie representing the words to search and replace for
     * @param validator
     *        the optional code that validates whether a match should be accepted or not.
     * @return the text with the search words swapped by the replacements
     */
    public static final <T> List<T> replaceMultiple(List<T> s, GenericTrieMatcher<T> trieMatcher,
            MatchValidator<T> validator) {
        if (s == null || trieMatcher == null) {
            return s;
        }

        // we don't use a DeferredStringBuilder because we don't expect to
        // reuse much of the original string. it's likely all or nothing.
        List<T> dsb = new ArrayList<T>(s.size() + 16);
        int pos = 0;
        int length = s.size();
        boolean foundMatch = false;

        while (pos < length) {
            GenericTrieMatch<T> match = trieMatcher.match(s, pos);

            // Try to find a valid match: skip past every candidate the validator rejects.
            // Resuming right after the rejected match's position (rather than stepping one
            // token at a time from pos) avoids re-finding and re-validating the same match.
            if (validator != null) {
                while (match != null && !validator.isValidMatch(match, s)) {
                    match = trieMatcher.match(s, match.getPosition() + 1);
                }
            }

            if (match == null) {
                if (!foundMatch) {
                    return s;
                }
                // No more matches, so copy the rest and get gone
                dsb.addAll(s.subList(pos, length));
                break;
            }
            foundMatch = true;

            // Copy up to the match position (this includes any rejected candidates)
            if (match.getPosition() > pos) {
                dsb.addAll(s.subList(pos, match.getPosition()));
            }

            // Append the replacement
            dsb.addAll(match.getReplacement());

            // Advance our current position past the matched word
            pos = match.getPosition() + match.getWord().size();
        }
        return dsb;
    }

    /**
     * An interface that represents whether a match for a given string is "valid"
     *
     * @param <T> the type of object being matched
     */
    public interface MatchValidator<T> {
        /**
         * @param match the match found
         * @param src the original source string being modified (NOTE: the positions may be different.)
         * @return {@code true} if the match in the given src string is valid
         */
        public boolean isValidMatch(GenericTrieMatch<T> match, List<T> src);
    }

    /**
     * @param s the term to search for the terms of the trie in
     * @return true if any of the terms are contained in s
     */
    public boolean containedIn(List<T> s) {
        GenericTrieMatch<T> match = match(s);
        return match != null;
    }

    /**
     * @param s the term to see if it starts with any terms of the trie
     * @return whether the list begins with any of the matches in this trie
     */
    public boolean begins(List<T> s) {
        GenericTrieData<T> match = begins(s, 0);
        return match != null;
    }

    /**
     * Find the next match in s.
     *
     * @param s the term to search for the terms of the trie in
     * @param start the 0-based position to start the search from.
     * @return null if no match found
     */
    public List<T> findIn(List<T> s, int start) {
        GenericTrieMatch<T> match = match(s, start);
        if (match == null) {
            return null;
        }
        return match.getWord();
    }

    /**
     * A node of the trie. {@code word} and {@code replacement} are only set on nodes
     * that terminate one of the search words.
     */
    private static class GenericTrieData<T> {
        List<T> word;
        List<T> replacement;
        final Map<T, GenericTrieData<T>> nextChars;

        GenericTrieData(Map<T, GenericTrieData<T>> next) {
            this.nextChars = next;
        }
    }

    private final List<List<T>> words;
    private final Map<T, GenericTrieData<T>> root;
    private final int minWordLength;

    /**
     * Use the factory {@link #compile(List, List, Class)} instead.
     */
    private GenericTrieMatcher(List<List<T>> strings, List<List<T>> replacements, Class<T> tokenClass) {
        if (strings == null) {
            throw new NullPointerException();
        }
        if (replacements == null) {
            throw new NullPointerException();
        }
        if (strings.size() != replacements.size()) {
            throw new IllegalArgumentException("Replacements must have same size, " + replacements.size()
                + ", as search strings " + strings.size());
        }

        this.words = Collections.unmodifiableList(strings);
        this.root = makeMap(tokenClass);

        // Stays at Integer.MAX_VALUE when there are no search words at all.
        int minWordLen = Integer.MAX_VALUE;
        int wordIndex = 0;
        for (List<T> s : strings) {
            Map<T, GenericTrieData<T>> current = this.root;

            int len = s.size();
            minWordLen = Math.min(minWordLen, len);
            for (int i = 0; i < len; i++) {
                T ch = s.get(i);
                GenericTrieData<T> next = current.get(ch);
                if (next == null) {
                    next = new GenericTrieData<T>(makeMap(tokenClass));
                    current.put(ch, next);
                }
                current = next.nextChars;

                // if we're at the last char, store it and its replacement...
                if (i + 1 == len) {
                    next.word = s;
                    next.replacement = replacements.get(wordIndex);
                }
            }
            wordIndex++;
        }

        this.minWordLength = minWordLen;
    }

    /**
     * Creates the child map for a trie node: an {@link EnumMap} when the token class is an
     * enum, a small {@link HashMap} otherwise.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })  // Conversion to enum can't be done in a way that is safe
    private Map<T, GenericTrieData<T>> makeMap(Class<T> tokenClass) {
        if (tokenClass != null && tokenClass.isEnum()) {
            return new EnumMap(tokenClass);
        }
        return new HashMap<T, GenericTrieData<T>>(DEFAULT_CAPACITY);
    }

    /**
     * See if the given string matches any of the given words in the Trie
     *
     * @param s the list of objects to search
     * @return null if none are found.
     */
    GenericTrieMatch<T> match(List<T> s) {
        return match(s, 0);
    }

    /**
     * See if the given string matches any of the given words in the Trie
     *
     * @param s the list of objects to search
     * @param offset where to start looking inside of the given String.
     * @return null if none are found.
     */
    public GenericTrieMatch<T> match(List<T> s, int offset) {
        if (s == null || s.size() == 0 || offset < 0) {
            return null;
        }

        int len = s.size();
        for (int i = offset; i < len; i++) {
            // optimize the case when we don't have enough room left to contain any matches.
            // Written as a subtraction so it cannot overflow when minWordLength is
            // Integer.MAX_VALUE (empty trie).
            if (len - i < this.minWordLength) {
                break;
            }

            GenericTrieData<T> data = contains(s, i);
            if (data != null) {
                return new GenericTrieMatch<T>(i, data.word, data.replacement);
            }
        }
        return null;
    }

    private GenericTrieData<T> begins(List<T> s, int offset) {
        if (s == null || s.size() == 0 || offset < 0) {
            return null;
        }
        return contains(s, offset);
    }

    /**
     * Walks the trie along {@code s} starting at {@code offset} and collects every search
     * word that ends on the way.
     *
     * @return null if not found
     */
    private GenericTrieData<T> contains(List<T> s, int offset) {
        Map<T, GenericTrieData<T>> current = this.root;
        int len = s.size();
        LinkedList<GenericTrieData<T>> matches = null;

        for (int i = offset; i < len; i++) {
            T ch = s.get(i);
            GenericTrieData<T> nextData = current.get(ch);
            if (nextData == null) {
                break;
            }
            if (nextData.word != null) {
                if (matches == null) {
                    matches = new LinkedList<GenericTrieData<T>>();
                }
                matches.add(nextData);
            }
            current = nextData.nextChars;
        }

        if (matches != null) {
            // only 1 match, so we know that's the one
            if (matches.size() == 1) {
                return matches.getFirst();
            }

            // else, we need to find the "highest" priority order word
            // as specified by the input to the trie
            for (List<T> word : this.words) {
                for (GenericTrieData<T> td : matches) {
                    if (word.equals(td.word)) {
                        return td;
                    }
                }
            }
        }

        return null;
    }

    /**
     * Struct returned by {@link GenericTrieMatcher#match(List)} to represent a match.
     *
     * @param <T> the type of object being matched
     * @author koliver
     * @see TrieMatcher
     */
    public static class GenericTrieMatch<T> {
        private final int position;
        private final List<T> word;
        private final List<T> replacement;

        GenericTrieMatch(int position, List<T> word, List<T> replacement) {
            if (position < 0) {
                throw new IllegalArgumentException(Integer.toString(position));
            }
            if (word == null) {
                throw new NullPointerException();
            }
            if (replacement == null) {
                throw new NullPointerException();
            }

            this.position = position;
            this.word = Collections.unmodifiableList(word);
            this.replacement = Collections.unmodifiableList(replacement);
        }

        /**
         * @return The position of where the match was in the source.
         * Eg,
         * <pre>
         *    Trie trie = new Trie(String[]{"x"}, String[]{"Y"});
         *    TrieMatch match = trie.match("abcxdef");
         *    Assert.assertEquals(3, match.getPosition());
         * </pre>
         */
        public int getPosition() {
            return this.position;
        }

        /**
         * @return The word in the trie that matched (as an unmodifiable list).
         * Eg,
         * <pre>
         *    Trie trie = new Trie(String[]{"x"}, String[]{"Y"});
         *    TrieMatch match = trie.match("abcxdef");
         *    Assert.assertEquals("x", match.getWord());
         * </pre>
         */
        public List<T> getWord() {
            return this.word;
        }

        /**
         * @return The replacement for word in the trie that matched (as an unmodifiable list).
         * Eg,
         * <pre>
         *    Trie trie = new Trie(String[]{"x"}, String[]{"Y"});
         *    TrieMatch match = trie.match("abcxdef");
         *    Assert.assertEquals("Y", match.getReplacement());
         * </pre>
         */
        public List<T> getReplacement() {
            return this.replacement;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy