// it.unimi.dsi.util.ImmutableBinaryTrie Maven / Gradle / Ivy
// Show all versions of dsi-utils Show documentation
package it.unimi.dsi.util;
/*
* DSI utilities
*
* Copyright (C) 2005-2009 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.booleans.BooleanIterator;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.lang.MutableString;
import java.io.Serializable;
import java.util.Iterator;
import java.util.ListIterator;
import cern.colt.bitvector.QuickBitVector;
/** An immutable implementation of binary tries.
*
* Instance of this class are built starting from a lexicographically ordered
* list of {@link BitVector}s representing binary words. Each word
* is assigned its position (starting from 0) in the list. The words are then organised in a
* binary trie with path compression.
*
*
Once the trie has been
* built, it is possible to ask whether a word w is {@linkplain #get(BooleanIterator) contained in the trie}
* (getting back its position in the list), the {@linkplain #getInterval(BooleanIterator) interval given by the words extending w} and the
* {@linkplain #getApproximatedInterval(BooleanIterator) approximated interval defined by w}.
* @author Sebastiano Vigna
* @since 0.9.2
*/
public class ImmutableBinaryTrie extends AbstractObject2LongFunction implements Serializable {
private final static boolean ASSERTS = false;
public static final long serialVersionUID = 1L;
/** A node in the trie. */
protected static class Node implements Serializable {
private static final long serialVersionUID = 1L;
public Node left, right;
/** An array containing the path compacted in this node (null
if there is no compaction at this node). */
final public long[] path;
/** The length of the path compacted in this node (0 if there is no compaction at this node). */
final public int pathLength;
/** If nonnegative, this node represent the word
-th word. */
final public int word ;
/** Creates a node representing a word.
*
* Note that the long array contained in path
will be stored inside the node.
*
* @param path the path compacted in this node, or null
for the empty path.
* @param word the index of the word represented by this node.
*/
public Node( final BitVector path, final int word ) {
if ( path == null ) {
this.path = null;
this.pathLength = 0;
}
else {
this.path = path.bits();
this.pathLength = path.size();
}
this.word = word;
}
/** Creates a node that does not represent a word.
*
* @param path the path compacted in this node, or null
for the empty path.
*/
public Node( final BitVector path ) {
this( path, -1 );
}
/** Returns true if this node is a leaf.
*
* @return true if this node is a leaf.
*/
public boolean isLeaf() {
return right == null && left == null;
}
public String toString() {
return "[" + path + ", " + word + "]";
}
}
/** The root of the trie. */
protected final Node root;
/** The number of words in this trie. */
private int size;
private final TransformationStrategy super T> transformationStrategy;
/** Creates a trie from a set of elements.
*
* @param elements a set of elements
* @param transformationStrategy a transformation strategy that must turn elements
into a list of
* distinct, lexicographically increasing (in iteration order) binary words.
*/
public ImmutableBinaryTrie( final Iterable extends T> elements, final TransformationStrategy super T> transformationStrategy ) {
this.transformationStrategy = transformationStrategy;
defRetValue = -1;
// Check order
final Iterator extends T> iterator = elements.iterator();
final ObjectList words = new ObjectArrayList();
int cmp;
if ( iterator.hasNext() ) {
final LongArrayBitVector prev = LongArrayBitVector.copy( transformationStrategy.toBitVector( iterator.next() ) );
words.add( prev.copy() );
BitVector curr;
while( iterator.hasNext() ) {
curr = transformationStrategy.toBitVector( iterator.next() );
cmp = prev.compareTo( curr );
if ( cmp == 0 ) throw new IllegalArgumentException( "The trie elements are not unique" );
if ( cmp > 0 ) throw new IllegalArgumentException( "The trie elements are not sorted" );
prev.replace( curr );
words.add( prev.copy() );
}
}
root = buildTrie( words, 0 );
}
/** Builds a trie recursively.
*
* The trie will contain the suffixes of words in words
starting at pos
.
*
* @param elements a list of elements.
* @param pos a starting position.
* @return a trie containing the suffixes of words in words
starting at pos
.
*/
protected Node buildTrie( final ObjectList elements, final int pos ) {
// TODO: on-the-fly check for lexicographical order
if ( elements.size() == 0 ) return null;
BitVector first = elements.get( 0 ), curr;
int prefix = first.size(), change = -1, j;
// We rule out the case of a single word (it would work anyway, but it's faster)
if ( elements.size() == 1 ) return new Node( pos < prefix ? LongArrayBitVector.copy( first.subVector( pos, prefix ) ) : null, size++ );
// Find maximum common prefix. change records the point of change (for splitting the word set).
for( ListIterator i = elements.listIterator( 1 ); i.hasNext(); ) {
curr = i.next();
if ( curr.size() < prefix ) prefix = curr.size();
for( j = pos; j < prefix; j++ ) if ( first.get( j ) != curr.get( j ) ) break;
if ( j < prefix ) {
change = i.previousIndex();
prefix = j;
}
}
final Node n;
if ( prefix == first.size() ) {
// Special case: the first word is the common prefix. We must store it in the node,
// and explicitly search for the actual change point, which is the first
// word with prefix-th bit true.
change = 1;
for( ListIterator i = elements.listIterator( 1 ); i.hasNext(); ) {
curr = i.next();
if ( curr.getBoolean( prefix ) ) break;
change++;
}
n = new Node( prefix > pos ? LongArrayBitVector.copy( first.subVector( pos, prefix ) ) : null, size++ );
n.left = buildTrie( elements.subList( 1, change ), prefix + 1 );
n.right = buildTrie( elements.subList( change, elements.size() ), prefix + 1 );
}
else {
n = new Node( prefix > pos ? LongArrayBitVector.copy( first.subVector( pos, prefix ) ) : null ); // There's some common prefix
n.left = buildTrie( elements.subList( 0, change ), prefix + 1 );
n.right = buildTrie( elements.subList( change, elements.size() ), prefix + 1 );
}
return n;
}
/** Returns the number of binary words in this trie.
*
* @return the number of binary words in this trie.
*/
public int size() {
return size;
}
@SuppressWarnings("unchecked")
public long getIndex( final Object element ) {
final BitVector word = transformationStrategy.toBitVector( (T)element );
final int length = word.size();
Node n = root;
int pos = 0; // Current position in word
long[] path;
while( n != null ) {
if ( pos == length ) return n.word;
path = n.path;
if ( path != null ) {
int minLength = Math.min( length - pos, n.pathLength ), i;
for( i = 0; i < minLength; i++ ) if ( word.getBoolean( pos + i ) != QuickBitVector.get( path, i ) ) break;
// Incompatible with current path.
if ( i < minLength ) return -1;
pos += i;
// Completely contained in the current path (note that n.word == -1 if this is not a word).
if ( pos == length ) return n.word;
}
n = word.getBoolean( pos++ ) ? n.right : n.left;
}
return -1;
}
public long getLong( final Object element ) {
final long result = getIndex( element );
return result == -1 ? defRetValue : result;
}
public boolean containsKey( final Object element ) {
return getIndex( element ) != -1;
}
/** Return the index of the word returned by the given iterator, or -1 if the word is not this trie.
*
* @param iterator a boolean iterator that will be used to find a word in this trie.
* @return the index of the specified word, or -1 if the word returned by the iterator is not this trie.
* @see #getLong(Object)
*/
public int get( final BooleanIterator iterator ) {
Node n = root;
int pathLength;
long[] path;
while( n != null ) {
if ( ! iterator.hasNext() ) return n.word;
pathLength = n.pathLength;
if ( pathLength != 0 ) {
int i;
path = n.path;
for( i = 0; i < pathLength && iterator.hasNext(); i++ ) if ( iterator.nextBoolean() != QuickBitVector.get( path, i ) ) break;
// Incompatible with current path.
if ( i < pathLength ) return -1;
// Completely contained in the current path (note that n.word == -1 if this is not a word).
if ( ! iterator.hasNext() ) return n.word;
}
n = iterator.nextBoolean() ? n.right : n.left;
}
return -1;
}
/** Returns an interval given by the smallest and the largest word in the trie starting with the specified word.
*
* @param word a word.
* @return an interval given by the smallest and the largest word in the trie
* that start with word
(thus, the {@linkplain Intervals#EMPTY_INTERVAL empty inteval}
* if no such words exist).
* @see #getInterval(BooleanIterator)
*/
public Interval getInterval( final BitVector word ) {
final int length = word.size();
Node n = root;
long[] path;
int pos = 0; // Current position in word
while( n != null ) {
// We found the current path: we go searching for left and right delimiters.
if ( pos == length ) break;
path = n.path;
if ( path != null ) {
int maxLength = Math.min( length - pos, n.pathLength );
int i;
for( i = 0; i < maxLength; i++ ) if ( word.getBoolean( pos + i ) != QuickBitVector.get( path, i ) ) break;
// Incompatible with current path--we return the empty interval.
if ( i < maxLength ) return Intervals.EMPTY_INTERVAL;
pos += i;
// Completely contained in the current path: we go searching for left and right delimiters.
if ( pos == length ) break;
}
n = word.getBoolean( pos++ ) ? n.right : n.left;
}
// If n == null, we did not found the path. Otherwise, it's the current node,
// and we must search for left and right delimiters.
if ( n == null ) return Intervals.EMPTY_INTERVAL;
Node l = n;
// Searching for the left extreme...
while( l.word < 0 ) l = l.left != null ? l.left : l.right;
// Searching for the right extreme, unless we're on a leaf.
while( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( l.word, n.word );
}
/** Returns an interval given by the smallest and the largest word in the trie starting with
* the word returned by the given iterator.
*
* @param iterator an iterator.
* @return an interval given by the smallest and the largest word in the trie
* that start with the word returned by iterator
(thus, the {@linkplain Intervals#EMPTY_INTERVAL empty inteval}
* if no such words exist).
* @see #getInterval(BitVector)
*/
public Interval getInterval( final BooleanIterator iterator ) {
Node n = root;
boolean mismatch = false;
long[] path;
int pathLength;
while( n != null ) {
// We found the current path: we go searching for left and right delimiters.
if ( ! iterator.hasNext() ) break;
pathLength = n.pathLength;
if ( pathLength != 0 ) {
int i;
path = n.path;
for( i = 0; i < pathLength && iterator.hasNext(); i++ ) if ( ( mismatch = ( iterator.nextBoolean() != QuickBitVector.get( path, i ) ) ) ) break;
// Incompatible with current path--we return the empty interval.
if ( mismatch ) return Intervals.EMPTY_INTERVAL;
// Completely contained in the current path: we go searching for left and right delimiters.
if ( ! iterator.hasNext() ) break;
}
n = iterator.nextBoolean() ? n.right : n.left;
}
// If n == null, we did not found the path. Otherwise, it's the current node,
// and we must search for left and right delimiters.
if ( n == null ) return Intervals.EMPTY_INTERVAL;
Node l = n;
// Searching for the left extreme...
while( l.word < 0 ) l = l.left != null ? l.left : l.right;
// Searching for the right extreme, unless we're on a leaf.
while ( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( l.word, n.word );
}
/** Returns an approximated interval around the specified word.
*
* Given a word w, the corresponding approximated interval is
* defined as follows: if the words in the approximator are thought of as left interval extremes in a
* larger lexicographically ordered set of words, and we number these word intervals using the
* indices of their left extremes, then the first word extending w would be in the
* word interval given by the left extreme of the interval returned by this method, whereas
* the last word extending w would be in the word interval given by the right
* extreme of the interval returned by this method. If no word in the larger set could possibly extend
* w (because w is smaller than the lexicographically smallest word in the approximator)
* the result is just an {@linkplain it.unimi.dsi.util.Intervals#EMPTY_INTERVAL empty interval}.
*
* @param element an element.
* @return an approximated interval around the specified word.
* @see #getApproximatedInterval(BooleanIterator)
*/
public Interval getApproximatedInterval( final T element ) {
final BitVector word = transformationStrategy.toBitVector( element );
final int length = word.size();
Node n = root;
long[] path;
boolean exactMatch = false, mismatch = false, nextBit;
int pos = 0; // Current position in word
while( n != null ) {
// We found the current path: we go searching for left and right delimiters.
path = n.path;
if ( pos == length ) {
if ( n.word >= 0 && path == null ) exactMatch = true;
break;
}
if ( path != null ) {
int maxLength = Math.min( length - pos, n.pathLength );
int i;
for( i = 0; i < maxLength; i++ ) if ( mismatch = ( word.getBoolean( pos + i ) != QuickBitVector.get( path, i ) ) ) break;
if ( mismatch ) {
// System.err.println( "Exit 1" );
// A mismatch. In this case, it is guaranteed that all
// strings starting with the prefix examined so far lie
// in a single block. The block index depends, however
// on the bit that went wrong.
if ( QuickBitVector.get( path, i ) ) {
while( n.word < 0 ) n = n.left != null ? n.left : n.right;
return n.word > 0 ? Interval.valueOf( n.word - 1 ) : Intervals.EMPTY_INTERVAL;
}
else {
while( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( n.word );
}
}
pos += i;
// Completely contained in the current path
if ( pos == length ) {
if ( ASSERTS ) assert n.pathLength == maxLength;
if ( i == n.pathLength && n.word >= 0 ) exactMatch = true;
break;
}
}
if ( n.isLeaf() ) break;
nextBit = word.getBoolean( pos++ );
// We would like to take an impossible turn. This case is similar to
// prefix mismatches, with subtly different off-by-ones.
if ( nextBit && n.right == null ) {
while( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( n.word );
}
else if ( ! nextBit && n.left == null ) {
while( n.word < 0 ) n = n.left != null ? n.left : n.right;
return Interval.valueOf( n.word );
}
n = nextBit ? n.right : n.left;
}
Node l = n;
// Searching for the left extreme...
//System.err.println("Going for exit 2: l:" + l + " n:" + n);
while( l.word < 0 ) l = l.left != null ? l.left : l.right;
// Searching for the right extreme, unless we're on a leaf.
while ( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
// System.err.println("Following exit 2: l:" + l + " n:" + n);
// If did not find an exact match and l.word is 0 we are lexicographically before every word.
if ( pos == length && ! exactMatch ) {
if ( l.word == 0 ) return mismatch ? Intervals.EMPTY_INTERVAL : Interval.valueOf( l.word, n.word );
else return Interval.valueOf( l.word - 1, n.word );
}
// System.err.println( "Exit 2 (exactMatch: " + exactMatch +")" );
return Interval.valueOf( l.word, n.word );
}
/** Returns an approximated prefix interval around the word returned by the specified iterator.
*
* @param iterator an iterator.
* @return an approximated interval around the specified word: if the words in this trie
* are thought of as left interval extremes in a larger lexicographically ordered set of words,
* and we number these word intervals using the indices of their left extremes,
* then the first word extending word
would be in the word interval given by
* the left extreme of the {@link Interval} returned by this method, whereas
* the last word extending word
would be in the word
* interval given by the right extreme of the {@link Interval} returned by this method.
* @see #getApproximatedInterval(Object)
*/
public Interval getApproximatedInterval( final BooleanIterator iterator ) {
Node n = root;
long[] path;
boolean exactMatch = false, mismatch = false, nextBit;
for(;;) {
// We found the current path: we go searching for left and right delimiters.
path = n.path;
if ( ! iterator.hasNext() ) {
if ( n.word >= 0 && path == null ) exactMatch = true;
break;
}
if ( path != null ) {
int i;
final int pathSize = n.pathLength;
for( i = 0; i < pathSize && iterator.hasNext(); i++ ) if ( ( mismatch = ( iterator.nextBoolean() != QuickBitVector.get( path, i ) ) ) ) break;
if ( mismatch ) {
// System.err.println( "Exit 1" );
// A mismatch. In this case, it is guaranteed that all
// strings starting with the prefix examined so far lie
// in a single block. The block index depends, however
// on the bit that went wrong.
if ( QuickBitVector.get( path, i ) ) {
while( n.word < 0 ) n = n.left != null ? n.left : n.right;
return n.word > 0 ? Interval.valueOf( n.word - 1 ) : Intervals.EMPTY_INTERVAL;
}
else {
while( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( n.word );
}
}
// Completely contained in the current path
if ( ! iterator.hasNext() ) {
if ( i == pathSize && n.word >= 0 ) exactMatch = true;
break;
}
}
if ( n.isLeaf() ) break;
nextBit = iterator.nextBoolean();
// We would like to take an impossible turn. This case is similar to
// prefix mismatches, with subtly different off-by-ones.
if ( nextBit && n.right == null ) {
while( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
return Interval.valueOf( n.word );
}
else if ( ! nextBit && n.left == null ) {
while( n.word < 0 ) n = n.left != null ? n.left : n.right;
return Interval.valueOf( n.word );
}
n = nextBit ? n.right : n.left;
}
Node l = n;
// Searching for the left extreme...
//System.err.println("Going for exit 2: l:" + l + " n:" + n);
while( l.word < 0 ) l = l.left != null ? l.left : l.right;
// Searching for the right extreme, unless we're on a leaf.
while ( ! n.isLeaf() ) n = n.right != null ? n.right : n.left;
// If did not find an exact match and l.word is 0 we are lexicographically before every word.
if ( ! iterator.hasNext() && ! exactMatch ) {
if ( l.word == 0 ) return mismatch ? Intervals.EMPTY_INTERVAL : Interval.valueOf( 0 );
else return Interval.valueOf( l.word - 1, n.word );
}
// System.err.println( "Exit 2 (hasNext: " +iterator.hasNext() + " exactMatch: " + exactMatch +")" );
return Interval.valueOf( l.word, n.word );
}
private void recToString( final Node n, final MutableString printPrefix, final MutableString result, final MutableString path, final int level ) {
if ( n == null ) return;
//System.err.println( "Called with prefix " + printPrefix );
result.append( printPrefix ).append( '(' ).append( level ).append( ')' );
if ( n.path != null ) {
path.append( LongArrayBitVector.wrap( n.path, n.pathLength ) );
result.append( " path:" ).append( LongArrayBitVector.wrap( n.path, n.pathLength ) );
}
if ( n.word >= 0 ) result.append( " word: " ).append( n.word ).append( " (" ).append( path ).append( ')' );
result.append( '\n' );
path.append( '0' );
recToString( n.left, printPrefix.append( '\t' ).append( "0 => " ), result, path, level + 1 );
path.charAt( path.length() - 1, '1' );
recToString( n.right, printPrefix.replace( printPrefix.length() - 5, printPrefix.length(), "1 => "), result, path, level + 1 );
path.delete( path.length() - 1, path.length() );
printPrefix.delete( printPrefix.length() - 6, printPrefix.length() );
//System.err.println( "Path now: " + path + " Going to delete from " + ( path.length() - n.pathLength));
path.delete( path.length() - n.pathLength, path.length() );
}
public String toString() {
MutableString s = new MutableString();
recToString( root, new MutableString(), s, new MutableString(), 0 );
return s.toString();
}
}