it.unimi.dsi.bits.TransformationStrategies Maven / Gradle / Ivy
Show all versions of dsi-utils Show documentation
package it.unimi.dsi.bits;
/*
 * DSI utilities
 *
 * Copyright (C) 2007-2009 Sebastiano Vigna
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.lang.MutableString;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
/** A class providing static methods and objects that do useful things with transformation strategies.
 *
 * @see TransformationStrategy
 */
public class TransformationStrategies {
private final static TransformationStrategy IDENTITY = new TransformationStrategy() {
private static final long serialVersionUID = 1L;
public BitVector toBitVector( final BitVector object ) {
return object;
public long numBits() { return 0; }
public TransformationStrategy copy() {
return this;
public Object readResolve() {
return IDENTITY;
/** A trivial transformation for data already in {@link BitVector} form. */
public static TransformationStrategy identity() {
return (TransformationStrategy)IDENTITY;
private static final TransformationStrategy UTF16 = new Utf16TransformationStrategy( false );
/** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation.
* Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
* changes while the bit vector is being accessed, the results will be unpredictable.
public static TransformationStrategy utf16() {
return (TransformationStrategy)UTF16;
private static final TransformationStrategy PREFIX_FREE_UTF16 = new Utf16TransformationStrategy( true );
/** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation and completes
* the representation with an ASCII NUL to guarantee lexicographical ordering and prefix-freeness.
* Note that strings provided to this strategy must not contain ASCII NULs.
* Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
* changes while the bit vector is being accessed, the results will be unpredictable.
public static TransformationStrategy prefixFreeUtf16() {
return (TransformationStrategy)PREFIX_FREE_UTF16;
private static class Utf16TransformationStrategy implements TransformationStrategy, Serializable {
private static final long serialVersionUID = 1L;
/** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */
private final boolean prefixFree;
/** Creates a UTF16 transformation strategy. The strategy will map a string to its natural UTF16 bit sequence.
* @param prefixFree if true, the resulting set of binary words will be made prefix free by adding
protected Utf16TransformationStrategy( boolean prefixFree ) {
this.prefixFree = prefixFree;
private static class Utf16CharSequenceBitVector extends AbstractBitVector implements Serializable {
private static final long serialVersionUID = 1L;
private transient CharSequence s;
private transient long length;
private transient long actualEnd;
public Utf16CharSequenceBitVector( final CharSequence s, final boolean prefixFree ) {
this.s = s;
actualEnd = s.length() * Character.SIZE;
length = actualEnd + ( prefixFree ? Character.SIZE : 0 );
public boolean getBoolean( long index ) {
if ( index > length ) throw new IndexOutOfBoundsException();
if ( index >= actualEnd ) return false;
final int charIndex = (int)( index / Character.SIZE );
return ( s.charAt( charIndex ) & 0x8000 >>> index % Character.SIZE ) != 0;
public long getLong( final long from, final long to ) {
final int startBit = (int)( from % Long.SIZE );
if ( startBit == 0 && to % Character.SIZE == 0 ) {
if ( from == to ) return 0;
long l;
int pos = (int)( from / Character.SIZE );
if ( to == from + Long.SIZE ) l = ( ( to > actualEnd ? 0 : (long)s.charAt( pos + 3 ) ) << 48 | (long)s.charAt( pos + 2 ) << 32 | (long)s.charAt( pos + 1 ) << 16 | s.charAt( pos ) );
else {
l = 0;
final int residual = (int)( Math.min( actualEnd, to ) - from );
for( int i = residual / Character.SIZE; i-- != 0; )
l |= (long)s.charAt( pos + i ) << i * Character.SIZE;
l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L;
l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L;
l = ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL;
return ( l & 0x00ff00ff00ff00ffL ) << 8 | ( l >>> 8 ) & 0x00ff00ff00ff00ffL;
final long l = Long.SIZE - ( to - from );
final long startPos = from - startBit;
if ( l == Long.SIZE ) return 0;
if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l;
return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l;
public long length() {
return length;
private static class Utf16MutableStringBitVector extends AbstractBitVector implements Serializable {
private static final long serialVersionUID = 1L;
private transient char[] a;
private transient long length;
private transient long actualEnd;
public Utf16MutableStringBitVector( final MutableString s, final boolean prefixFree ) {
this.a = s.array();
actualEnd = s.length() * Character.SIZE;
length = actualEnd + ( prefixFree ? Character.SIZE : 0 );
public boolean getBoolean( long index ) {
if ( index > length ) throw new IndexOutOfBoundsException();
if ( index >= actualEnd ) return false;
final int charIndex = (int)( index / Character.SIZE );
return ( a[ charIndex ] & 0x8000 >>> index % Character.SIZE ) != 0;
public long getLong( final long from, final long to ) {
final int startBit = (int)( from % Long.SIZE );
if ( startBit == 0 && to % Character.SIZE == 0 ) {
if ( from == to ) return 0;
long l;
int pos = (int)( from / Character.SIZE );
if ( to == from + Long.SIZE ) l = ( ( to > actualEnd ? 0 : (long)a[ pos + 3 ] ) << 48 | (long)a[ pos + 2 ] << 32 | (long)a[ pos + 1 ] << 16 | a[ pos ] );
else {
l = 0;
final int residual = (int)( Math.min( actualEnd, to ) - from );
for( int i = residual / Character.SIZE; i-- != 0; )
l |= (long)a[ pos + i ] << i * Character.SIZE;
l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L;
l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L;
l = ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL;
return ( l & 0x00ff00ff00ff00ffL ) << 8 | ( l >>> 8 ) & 0x00ff00ff00ff00ffL;
final long l = Long.SIZE - ( to - from );
final long startPos = from - startBit;
if ( l == Long.SIZE ) return 0;
if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l;
return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l;
public long length() {
return length;
public BitVector toBitVector( final CharSequence s ) {
return s instanceof MutableString ? new Utf16MutableStringBitVector( (MutableString)s, prefixFree ) : new Utf16CharSequenceBitVector( s, prefixFree );
public long numBits() { return 0; }
public TransformationStrategy copy() {
return this;
private Object readResolve() {
return prefixFree ? PREFIX_FREE_UTF16 : UTF16;
private static final TransformationStrategy ISO = new ISOTransformationStrategy( false );
/** A trivial transformation from strings to bit vectors that concatenates the lower eight bits of the UTF-16 representation.
* Note that this transformation is sensible only for strings that are known to be contain just characters in the ISO-8859-1 charset.
* Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
* changes while the bit vector is being accessed, the results will be unpredictable.
public static TransformationStrategy iso() {
return (TransformationStrategy)ISO;
private static final TransformationStrategy PREFIX_FREE_ISO = new ISOTransformationStrategy( true );
/** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation and completes
* the representation with an ASCII NUL to guarantee lexicographical ordering and prefix-freeness.
* Note that this transformation is sensible only for strings that are known to be contain just characters in the ISO-8859-1 charset, and
* that strings provided to this strategy must not contain ASCII NULs.
* Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
* changes while the bit vector is being accessed, the results will be unpredictable.
public static TransformationStrategy prefixFreeIso() {
return (TransformationStrategy)PREFIX_FREE_ISO;
private static class ISOTransformationStrategy implements TransformationStrategy, Serializable {
private static final long serialVersionUID = 1L;
/** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */
private final boolean prefixFree;
/** Creates an ISO transformation strategy. The strategy will map a string to the lowest eight bits of its natural UTF16 bit sequence.
* @param prefixFree if true, the resulting set of binary words will be made prefix free by adding
protected ISOTransformationStrategy( boolean prefixFree ) {
this.prefixFree = prefixFree;
private static class ISOCharSequenceBitVector extends AbstractBitVector implements Serializable {
private static final long serialVersionUID = 1L;
private transient CharSequence s;
private transient long length;
private transient long actualEnd;
public ISOCharSequenceBitVector( final CharSequence s, final boolean prefixFree ) {
this.s = s;
actualEnd = s.length() * Byte.SIZE;
length = actualEnd + ( prefixFree ? Byte.SIZE : 0 );
public boolean getBoolean( long index ) {
if ( index > length ) throw new IndexOutOfBoundsException();
if ( index >= actualEnd ) return false;
final int byteIndex = (int)( index / Byte.SIZE );
return ( s.charAt( byteIndex ) & 0x80 >>> index % Byte.SIZE ) != 0;
public long getLong( final long from, final long to ) {
//System.err.println ( from + "->" + to );
final int startBit = (int)( from % Long.SIZE );
if ( startBit == 0 && to % Byte.SIZE == 0 ) {
if ( from == to ) return 0;
long l;
int pos = (int)( from / Byte.SIZE );
if ( to == from + Long.SIZE )
l = ( to > actualEnd ? 0 : ( s.charAt( pos + 7 ) & 0xFFL ) ) << 56 |
( s.charAt( pos + 6 ) & 0xFFL ) << 48 |
( s.charAt( pos + 5 ) & 0xFFL ) << 40 |
( s.charAt( pos + 4 ) & 0xFFL ) << 32 |
( s.charAt( pos + 3 ) & 0xFFL ) << 24 |
( s.charAt( pos + 2 ) & 0xFF ) << 16 |
( s.charAt( pos + 1 ) & 0xFF ) << 8 |
( s.charAt( pos ) & 0xFF );
else {
l = 0;
final int residual = (int)( Math.min( actualEnd, to ) - from );
for( int i = residual / Byte.SIZE; i-- != 0; )
l |= ( s.charAt( pos + i ) & 0xFFL ) << i * Byte.SIZE;
l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L;
l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L;
return ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL;
final long l = Long.SIZE - ( to - from );
final long startPos = from - startBit;
if ( l == Long.SIZE ) return 0;
if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l;
return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l;
public long length() {
return length;
private static class ISOMutableStringBitVector extends AbstractBitVector implements Serializable {
private static final long serialVersionUID = 1L;
private transient char[] a;
private transient long length;
private transient long actualEnd;
public ISOMutableStringBitVector( final MutableString s, final boolean prefixFree ) {
this.a = s.array();
actualEnd = s.length() * Byte.SIZE;
length = actualEnd + ( prefixFree ? Byte.SIZE : 0 );
public boolean getBoolean( long index ) {
if ( index > length ) throw new IndexOutOfBoundsException();
if ( index >= actualEnd ) return false;
final int byteIndex = (int)( index / Byte.SIZE );
return ( a[ byteIndex ] & 0x80 >>> index % Byte.SIZE ) != 0;
public long getLong( final long from, final long to ) {
//System.err.println ( from + "->" + to );
final int startBit = (int)( from % Long.SIZE );
if ( startBit == 0 && to % Byte.SIZE == 0 ) {
if ( from == to ) return 0;
long l;
int pos = (int)( from / Byte.SIZE );
if ( to == from + Long.SIZE )
l = ( to > actualEnd ? 0 : ( a[ pos + 7 ] & 0xFFL ) ) << 56 |
( a[ pos + 6 ] & 0xFFL ) << 48 |
( a[ pos + 5 ] & 0xFFL ) << 40 |
( a[ pos + 4 ] & 0xFFL ) << 32 |
( a[ pos + 3 ] & 0xFFL ) << 24 |
( a[ pos + 2 ] & 0xFF ) << 16 |
( a[ pos + 1 ] & 0xFF ) << 8 |
( a[ pos ] & 0xFF );
else {
l = 0;
final int residual = (int)( Math.min( actualEnd, to ) - from );
for( int i = residual / Byte.SIZE; i-- != 0; )
l |= ( a[ pos + i ] & 0xFFL ) << i * Byte.SIZE;
l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L;
l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L;
return ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL;
final long l = Long.SIZE - ( to - from );
final long startPos = from - startBit;
if ( l == Long.SIZE ) return 0;
if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l;
return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l;
public long length() {
return length;
public BitVector toBitVector( final CharSequence s ) {
return s instanceof MutableString ? new ISOMutableStringBitVector( (MutableString)s, prefixFree ) : new ISOCharSequenceBitVector( s, prefixFree );
public long numBits() { return 0; }
public TransformationStrategy copy() {
return this;
private Object readResolve() {
return prefixFree ? PREFIX_FREE_ISO : ISO;
private final static class IteratorWrapper extends AbstractObjectIterator {
final Iterator iterator;
final TransformationStrategy super T> transformationStrategy;
public IteratorWrapper( final Iterator iterator, final TransformationStrategy super T> transformationStrategy ) {
this.iterator = iterator;
this.transformationStrategy = transformationStrategy;
public boolean hasNext() {
return iterator.hasNext();
public BitVector next() {
return transformationStrategy.toBitVector( );
/** Wraps a given iterator, returning an iterator that emits {@linkplain BitVector bit vectors}.
* @param iterator an iterator.
* @param transformationStrategy a strategy to transform the object returned by iterator
* @return an iterator that emits the content of iterator
passed through transformationStrategy
public static Iterator wrap( final Iterator iterator, final TransformationStrategy super T> transformationStrategy ) {
return (Iterator)( transformationStrategy == IDENTITY ? iterator : new IteratorWrapper( iterator, transformationStrategy ) );
private final static class IterableWrapper implements Iterable {
private final TransformationStrategy super T> transformationStrategy;
private final Iterable collection;
public IterableWrapper( final Iterable collection, final TransformationStrategy super T> transformationStrategy ) {
this.collection = collection;
this.transformationStrategy = transformationStrategy;
public ObjectIterator iterator() {
return new IteratorWrapper( collection.iterator(), transformationStrategy.copy() );
/** Wraps a given iterable, returning an iterable that contains {@linkplain BitVector bit vectors}.
* @param iterable an iterable.
* @param transformationStrategy a strategy to transform the object contained in iterable
* @return an iterable that has the content of iterable
passed through transformationStrategy
public static Iterable wrap( final Iterable iterable, final TransformationStrategy super T> transformationStrategy ) {
return (Iterable)( transformationStrategy == IDENTITY ? iterable : new IterableWrapper( iterable, transformationStrategy ) );
private final static class ListWrapper extends AbstractObjectList {
private final TransformationStrategy super T> transformationStrategy;
private final List list;
public ListWrapper( final List list, final TransformationStrategy super T> transformationStrategy ) {
this.list = list;
this.transformationStrategy = transformationStrategy;
public BitVector get( int index ) {
return transformationStrategy.toBitVector( list.get( index ) );
public int size() {
return list.size();
/** Wraps a given list, returning a list that contains {@linkplain BitVector bit vectors}.
* @param list a list.
* @param transformationStrategy a strategy to transform the object contained in list
* @return a list that has the content of list
passed through transformationStrategy
public static List wrap( final List list, final TransformationStrategy super T> transformationStrategy ) {
return (List)( transformationStrategy == IDENTITY ? list : new ListWrapper( list, transformationStrategy ) );
private static class PrefixFreeTransformationStrategy implements TransformationStrategy, Serializable {
private static final long serialVersionUID = 1L;
private static class PrefixFreeBitVector extends AbstractBitVector implements Serializable {
private static final long serialVersionUID = 1L;
private transient BitVector v;
private transient long length;
public PrefixFreeBitVector( final BitVector v ) {
this.v = v;
length = v.length() * 2 + 1;
public boolean getBoolean( long index ) {
if ( index >= length ) throw new IndexOutOfBoundsException();
if ( index == length - 1 ) return false;
if ( index % 2 == 0 ) return true;
return v.getBoolean( index / 2 );
public long getLong( final long from, final long to ) {
final int startBit = (int)( from % Long.SIZE );
if ( startBit == 0 && to - from == Long.SIZE ) {
long word = v.getLong( from / 2, Math.min( v.length(), from / 2 + Long.SIZE ) );
if ( from % 2 != 0 ) word >>>= Long.SIZE / 2;
else word &= 0xFFFFFFFFL;
word = ( word | word << 16 ) & 0x0000FFFF0000FFFFL;
//System.err.println( Long.toHexString( word ) );
word = ( word | word << 8 ) & 0x00FF00FF00FF00FFL;
//System.err.println( Long.toHexString( word ) );
word = ( word | word << 4 ) & 0x0F0F0F0F0F0F0F0FL;
//System.err.println( Long.toHexString( word ) );
word = ( word | word << 2 ) & 0x3333333333333333L;
//System.err.println( Long.toHexString( word ) );
word = ( word << 1 | word << 2 ) | 0x5555555555555555L;
//System.err.println( Long.toHexString( word ) );
return word;
// TODO: implement in a fast way the case startBit == 0, to == length.
return super.getLong( from, to );
public long length() {
return length;
public BitVector toBitVector( final BitVector v ) {
return new PrefixFreeBitVector( v );
public long numBits() { return 0; }
public TransformationStrategy copy() {
return this;
private Object readResolve() {
/** A transformation from bit vectors to bit vectors that guarantees that its results are prefix free.
* More in detail, we map 0 to 10, 1 to 11, and we add a 0 at the end of all strings.
Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
* changes while the bit vector is being accessed, the results will be unpredictable.
public static TransformationStrategy prefixFree() {
return (TransformationStrategy)PREFIX_FREE;
private static final TransformationStrategy extends BitVector> PREFIX_FREE = new PrefixFreeTransformationStrategy();