// com.davidbracewell.collection.trie.PatriciaTrie Maven / Gradle / Ivy
// Show all versions of mango Show documentation
/*
* Taken from Apache Commons with modifications
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.davidbracewell.collection.trie;
import com.davidbracewell.collection.trie.analyzer.StringKeyAnalyzer;
import com.davidbracewell.conversion.Convert;
import com.davidbracewell.io.CSV;
import com.davidbracewell.io.resource.Resource;
import com.davidbracewell.io.structured.csv.CSVReader;
import com.davidbracewell.string.StringUtils;
import com.google.common.base.CharMatcher;
import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* Implementation of a PATRICIA Trie (Practical Algorithm to Retrieve Information
* Coded in Alphanumeric).
*
* A PATRICIA {@link Trie} is a compressed {@link Trie}. Instead of storing
* all data at the edges of the {@link Trie} (and having empty internal nodes),
* PATRICIA stores data in every node. This allows for very efficient traversal,
* insert, delete, predecessor, successor, prefix, range, and {@link #select(Object)}
* operations. All operations are performed at worst in O(K) time, where K
* is the number of bits in the largest item in the tree. In practice,
* operations actually take O(A(K)) time, where A(K) is the average number of
* bits of all items in the tree.
*
* Most importantly, PATRICIA requires very few comparisons to keys while
* doing any operation. While performing a lookup, each comparison (at most
* K of them, described above) will perform a single bit comparison against
* the given key, instead of comparing the entire key to another key.
*
* The {@link Trie} can return operations in lexicographical order using the
* 'prefixMap', 'submap', or 'iterator' methods. The {@link Trie} can also
* scan for items that are 'bitwise' (using an XOR metric) by the 'select' method.
* Bitwise closeness is determined by the {@link KeyAnalyzer} returning true or
* false for a bit being set or not in a given key.
*
* This PATRICIA {@link Trie} supports both variable length & fixed length
* keys. Some methods, such as {@link #prefixMap(Object)} are suited only
* to variable length keys.
*
* @param <E> the type of the values stored in the trie
* @version $Id : PatriciaTrie.java 1543928 2013-11-20 20:15:35Z tn $
* @see "Radix Tree"
* @see "PATRICIA"
* @see "Crit-Bit Tree"
* @since 4.0
*/
public class PatriciaTrie extends AbstractPatriciaTrie {
private static final long serialVersionUID = 4446367780901817838L;
/**
* Instantiates a new Patricia trie.
*/
public PatriciaTrie() {
super(new StringKeyAnalyzer());
}
/**
* Instantiates a new Patricia trie.
*
* @param m the m
*/
public PatriciaTrie(final Map extends String, ? extends E> m) {
super(new StringKeyAnalyzer(), m);
}
/**
* Instantiates a new Patricia trie.
*
* @param m the m
* @param suffix the suffix
*/
protected PatriciaTrie(final Map extends String, ? extends E> m, boolean suffix) {
super(new StringKeyAnalyzer(), m);
}
/**----------------------------------------------------------
Illuminating knowledge contribution
----------------------------------------------------------**/
/**
* Constructs a trie from a csv file where the first column is the string and the second column is the value.
*
* @param the type parameter
* @param resource The csv resource
* @param valueType Class information for the value
* @return A ByteTrie from the csv
* @throws IOException the io exception
*/
public static PatriciaTrie loadCSV(Resource resource, Class valueType) throws IOException {
return loadCSV(resource, valueType, Functions.identity());
}
/**
* Constructs a trie from a csv file where the first column is the string and the second column is the value.
*
* @param the type parameter
* @param resource The csv resource
* @param valueType Class information for the value
* @param keyTransform function to transform the keys in some fashion, e.g. lower case
* @return A ByteTrie from the csv
* @throws IOException the io exception
*/
@SuppressWarnings("unchecked")
public static PatriciaTrie loadCSV(Resource resource, Class valueType, Function keyTransform) throws IOException {
Preconditions.checkNotNull(resource, "Resource cannot be null");
Preconditions.checkNotNull(valueType, "valueType cannot be null");
Preconditions.checkNotNull(keyTransform, "keyTransform cannot be null");
PatriciaTrie trie = new PatriciaTrie<>();
try (CSVReader csv = CSV.builder().reader(resource)) {
List row;
while ((row = csv.nextRow()) != null) {
if (row.size() >= 2) {
String key = keyTransform.apply(row.get(0));
V value = Convert.convert(row.get(1), valueType);
trie.put(key, value);
} else if (row.size() == 1 && valueType == String.class) {
String key = keyTransform.apply(row.get(0));
trie.put(key, (V) key);
}
}
}
return trie;
}
/**
* Finds all occurrences of keys in the ByteTrie in the input text. Uses a default character matcher that
* matches anything t hat is not alphanumeric. Does not allow prefix matches.
*
* @param text The text to search in
* @return A list of Tuple3s indicating [start, end) and the value associated with the match.
*/
public List> findOccurrencesIn(String text) {
return findOccurrencesIn(text, false, StringUtils.NOT_LETTER_OR_DIGIT);
}
/**
* Finds all occurrences of keys in the ByteTrie in the input text. Uses a default character matcher that
* matches anything t hat is not alphanumeric.
*
* @param text The text to search in
* @param prefixMatch True if allow prefix matches
* @return A list of Tuple3s indicating [start, end) and the value associated with the match.
*/
public List> findOccurrencesIn(String text, boolean prefixMatch) {
return findOccurrencesIn(text, prefixMatch, StringUtils.NOT_LETTER_OR_DIGIT);
}
/**
* Finds all occurrences of keys in the ByteTrie in the input text.
*
* @param text The text to search in
* @param prefixMatch True if allow prefix matches
* @param matcher The character matcher to use to mark end of word
* @return A list of Tuple3s indicating [start, end) and the value associated with the match.
*/
public List> findOccurrencesIn(String text, boolean prefixMatch, CharMatcher matcher) {
List> rval = Lists.newArrayList();
int len = text.length();
StringBuilder key = new StringBuilder();
int start = 0;
int lastMatch = -1;
for (int i = 0; i < len; i++) {
key.append(text.charAt(i));
//We have a key match
if (containsKey(key.toString())) {
int nextI = i + 1;
lastMatch = i + 1;
//There is something longer!
if (nextI < len && !prefixMap(key.toString() + text.charAt(i + 1)).isEmpty()) {
continue;
}
lastMatch = -1;
//check if we accept
if (nextI >= len || prefixMatch || matcher.matches(text.charAt(nextI))) {
E value = get(key.toString());
if (prefixMatch) {
while ((i + 1) < text.length() && !matcher.matches(text.charAt(i + 1))) {
i++;
key.append(text.charAt(i));
}
}
rval.add(new TrieMatch<>(start, i + 1, value));
start = i + 1;
continue;
}
}
if (prefixMap(key.toString()).isEmpty()) {
if (lastMatch != -1) {
int nextI = lastMatch;
if (nextI >= len || prefixMatch || matcher.matches(text.charAt(nextI))) {
key = new StringBuilder(text.substring(start, nextI));
E value = get(key.toString());
rval.add(new TrieMatch<>(start, nextI, value));
i = lastMatch;
lastMatch = -1;
}
}
start = i;
if (key.length() > 1) {
key.setLength(1);
key.setCharAt(0, text.charAt(i));
} else {
key.setLength(0);
}
}
}
return rval;
}
}