All Downloads are FREE. Search and download functionalities are using the official Maven repository.

it.unimi.dsi.bits.TransformationStrategies Maven / Gradle / Ivy

Go to download

Blazegraph modifications to the DSI utils. These are forked from version 1.10.0 under LGPLv2.1.

There is a newer version: 2.1.4
Show newest version
package it.unimi.dsi.bits;

/*		 
 * DSI utilities
 *
 * Copyright (C) 2007-2009 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */


import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.lang.MutableString;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;

/** A class providing static methods and objects that do useful things with transformation strategies.
 * 
 * @see TransformationStrategy
 */


public class TransformationStrategies {

	private final static TransformationStrategy IDENTITY = new TransformationStrategy() {
		private static final long serialVersionUID = 1L;

		public BitVector toBitVector( final BitVector object ) {
			return object;
		}
		
		public long numBits() { return 0; }

		public TransformationStrategy copy() {
			return this;
		}
		
		public Object readResolve() {
			return IDENTITY;
		}
	};
	
	
	/** A trivial transformation for data already in {@link BitVector} form. */
	@SuppressWarnings("unchecked")
	public static  TransformationStrategy identity() {
		return (TransformationStrategy)IDENTITY;
	}
 	
	private static final TransformationStrategy UTF16 = new Utf16TransformationStrategy( false );
	
	/** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation.
	 * 
	 * Warning: bit vectors returned by this strategy are adaptors around the original string. If the string
	 * changes while the bit vector is being accessed, the results will be unpredictable.  
	 */
	@SuppressWarnings("unchecked")
	public static  TransformationStrategy utf16() {
		return (TransformationStrategy)UTF16;
	}

	private static final TransformationStrategy PREFIX_FREE_UTF16 = new Utf16TransformationStrategy( true );
	
	/** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation and completes
	 * the representation with an ASCII NUL to guarantee lexicographical ordering and prefix-freeness.
	 * 
	 * 

Note that strings provided to this strategy must not contain ASCII NULs. * * Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFreeUtf16() { return (TransformationStrategy)PREFIX_FREE_UTF16; } private static class Utf16TransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; /** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */ private final boolean prefixFree; /** Creates a UTF16 transformation strategy. The strategy will map a string to its natural UTF16 bit sequence. * * @param prefixFree if true, the resulting set of binary words will be made prefix free by adding */ protected Utf16TransformationStrategy( boolean prefixFree ) { this.prefixFree = prefixFree; } private static class Utf16CharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private transient CharSequence s; private transient long length; private transient long actualEnd; public Utf16CharSequenceBitVector( final CharSequence s, final boolean prefixFree ) { this.s = s; actualEnd = s.length() * Character.SIZE; length = actualEnd + ( prefixFree ? Character.SIZE : 0 ); } public boolean getBoolean( long index ) { if ( index > length ) throw new IndexOutOfBoundsException(); if ( index >= actualEnd ) return false; final int charIndex = (int)( index / Character.SIZE ); return ( s.charAt( charIndex ) & 0x8000 >>> index % Character.SIZE ) != 0; } public long getLong( final long from, final long to ) { final int startBit = (int)( from % Long.SIZE ); if ( startBit == 0 && to % Character.SIZE == 0 ) { if ( from == to ) return 0; long l; int pos = (int)( from / Character.SIZE ); if ( to == from + Long.SIZE ) l = ( ( to > actualEnd ? 
0 : (long)s.charAt( pos + 3 ) ) << 48 | (long)s.charAt( pos + 2 ) << 32 | (long)s.charAt( pos + 1 ) << 16 | s.charAt( pos ) ); else { l = 0; final int residual = (int)( Math.min( actualEnd, to ) - from ); for( int i = residual / Character.SIZE; i-- != 0; ) l |= (long)s.charAt( pos + i ) << i * Character.SIZE; } l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L; l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L; l = ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL; return ( l & 0x00ff00ff00ff00ffL ) << 8 | ( l >>> 8 ) & 0x00ff00ff00ff00ffL; } final long l = Long.SIZE - ( to - from ); final long startPos = from - startBit; if ( l == Long.SIZE ) return 0; if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l; return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l; } public long length() { return length; } } private static class Utf16MutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private transient char[] a; private transient long length; private transient long actualEnd; public Utf16MutableStringBitVector( final MutableString s, final boolean prefixFree ) { this.a = s.array(); actualEnd = s.length() * Character.SIZE; length = actualEnd + ( prefixFree ? 
Character.SIZE : 0 ); } public boolean getBoolean( long index ) { if ( index > length ) throw new IndexOutOfBoundsException(); if ( index >= actualEnd ) return false; final int charIndex = (int)( index / Character.SIZE ); return ( a[ charIndex ] & 0x8000 >>> index % Character.SIZE ) != 0; } public long getLong( final long from, final long to ) { final int startBit = (int)( from % Long.SIZE ); if ( startBit == 0 && to % Character.SIZE == 0 ) { if ( from == to ) return 0; long l; int pos = (int)( from / Character.SIZE ); if ( to == from + Long.SIZE ) l = ( ( to > actualEnd ? 0 : (long)a[ pos + 3 ] ) << 48 | (long)a[ pos + 2 ] << 32 | (long)a[ pos + 1 ] << 16 | a[ pos ] ); else { l = 0; final int residual = (int)( Math.min( actualEnd, to ) - from ); for( int i = residual / Character.SIZE; i-- != 0; ) l |= (long)a[ pos + i ] << i * Character.SIZE; } l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L; l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L; l = ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL; return ( l & 0x00ff00ff00ff00ffL ) << 8 | ( l >>> 8 ) & 0x00ff00ff00ff00ffL; } final long l = Long.SIZE - ( to - from ); final long startPos = from - startBit; if ( l == Long.SIZE ) return 0; if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l; return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l; } public long length() { return length; } } public BitVector toBitVector( final CharSequence s ) { return s instanceof MutableString ? new Utf16MutableStringBitVector( (MutableString)s, prefixFree ) : new Utf16CharSequenceBitVector( s, prefixFree ); } public long numBits() { return 0; } public TransformationStrategy copy() { return this; } private Object readResolve() { return prefixFree ? 
PREFIX_FREE_UTF16 : UTF16; } } private static final TransformationStrategy ISO = new ISOTransformationStrategy( false ); /** A trivial transformation from strings to bit vectors that concatenates the lower eight bits of the UTF-16 representation. * *

Note that this transformation is sensible only for strings that are known to be contain just characters in the ISO-8859-1 charset. * * Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy iso() { return (TransformationStrategy)ISO; } private static final TransformationStrategy PREFIX_FREE_ISO = new ISOTransformationStrategy( true ); /** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation and completes * the representation with an ASCII NUL to guarantee lexicographical ordering and prefix-freeness. * *

Note that this transformation is sensible only for strings that are known to be contain just characters in the ISO-8859-1 charset, and * that strings provided to this strategy must not contain ASCII NULs. * * Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFreeIso() { return (TransformationStrategy)PREFIX_FREE_ISO; } private static class ISOTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; /** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */ private final boolean prefixFree; /** Creates an ISO transformation strategy. The strategy will map a string to the lowest eight bits of its natural UTF16 bit sequence. * * @param prefixFree if true, the resulting set of binary words will be made prefix free by adding */ protected ISOTransformationStrategy( boolean prefixFree ) { this.prefixFree = prefixFree; } private static class ISOCharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private transient CharSequence s; private transient long length; private transient long actualEnd; public ISOCharSequenceBitVector( final CharSequence s, final boolean prefixFree ) { this.s = s; actualEnd = s.length() * Byte.SIZE; length = actualEnd + ( prefixFree ? 
Byte.SIZE : 0 ); } public boolean getBoolean( long index ) { if ( index > length ) throw new IndexOutOfBoundsException(); if ( index >= actualEnd ) return false; final int byteIndex = (int)( index / Byte.SIZE ); return ( s.charAt( byteIndex ) & 0x80 >>> index % Byte.SIZE ) != 0; } public long getLong( final long from, final long to ) { //System.err.println ( from + "->" + to ); final int startBit = (int)( from % Long.SIZE ); if ( startBit == 0 && to % Byte.SIZE == 0 ) { if ( from == to ) return 0; long l; int pos = (int)( from / Byte.SIZE ); if ( to == from + Long.SIZE ) l = ( to > actualEnd ? 0 : ( s.charAt( pos + 7 ) & 0xFFL ) ) << 56 | ( s.charAt( pos + 6 ) & 0xFFL ) << 48 | ( s.charAt( pos + 5 ) & 0xFFL ) << 40 | ( s.charAt( pos + 4 ) & 0xFFL ) << 32 | ( s.charAt( pos + 3 ) & 0xFFL ) << 24 | ( s.charAt( pos + 2 ) & 0xFF ) << 16 | ( s.charAt( pos + 1 ) & 0xFF ) << 8 | ( s.charAt( pos ) & 0xFF ); else { l = 0; final int residual = (int)( Math.min( actualEnd, to ) - from ); for( int i = residual / Byte.SIZE; i-- != 0; ) l |= ( s.charAt( pos + i ) & 0xFFL ) << i * Byte.SIZE; } l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L; l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L; return ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL; } final long l = Long.SIZE - ( to - from ); final long startPos = from - startBit; if ( l == Long.SIZE ) return 0; if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l; return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l; } public long length() { return length; } } private static class ISOMutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private transient char[] a; private transient long length; private transient long 
actualEnd; public ISOMutableStringBitVector( final MutableString s, final boolean prefixFree ) { this.a = s.array(); actualEnd = s.length() * Byte.SIZE; length = actualEnd + ( prefixFree ? Byte.SIZE : 0 ); } public boolean getBoolean( long index ) { if ( index > length ) throw new IndexOutOfBoundsException(); if ( index >= actualEnd ) return false; final int byteIndex = (int)( index / Byte.SIZE ); return ( a[ byteIndex ] & 0x80 >>> index % Byte.SIZE ) != 0; } public long getLong( final long from, final long to ) { //System.err.println ( from + "->" + to ); final int startBit = (int)( from % Long.SIZE ); if ( startBit == 0 && to % Byte.SIZE == 0 ) { if ( from == to ) return 0; long l; int pos = (int)( from / Byte.SIZE ); if ( to == from + Long.SIZE ) l = ( to > actualEnd ? 0 : ( a[ pos + 7 ] & 0xFFL ) ) << 56 | ( a[ pos + 6 ] & 0xFFL ) << 48 | ( a[ pos + 5 ] & 0xFFL ) << 40 | ( a[ pos + 4 ] & 0xFFL ) << 32 | ( a[ pos + 3 ] & 0xFFL ) << 24 | ( a[ pos + 2 ] & 0xFF ) << 16 | ( a[ pos + 1 ] & 0xFF ) << 8 | ( a[ pos ] & 0xFF ); else { l = 0; final int residual = (int)( Math.min( actualEnd, to ) - from ); for( int i = residual / Byte.SIZE; i-- != 0; ) l |= ( a[ pos + i ] & 0xFFL ) << i * Byte.SIZE; } l = ( l & 0x5555555555555555L ) << 1 | ( l >>> 1 ) & 0x5555555555555555L; l = ( l & 0x3333333333333333L ) << 2 | ( l >>> 2 ) & 0x3333333333333333L; return ( l & 0x0f0f0f0f0f0f0f0fL ) << 4 | ( l >>> 4 ) & 0x0f0f0f0f0f0f0f0fL; } final long l = Long.SIZE - ( to - from ); final long startPos = from - startBit; if ( l == Long.SIZE ) return 0; if ( startBit <= l ) return getLong( startPos, Math.min( length, startPos + Long.SIZE ) ) << l - startBit >>> l; return getLong( startPos, startPos + Long.SIZE ) >>> startBit | getLong( startPos + Long.SIZE, Math.min( length, startPos + 2 * Long.SIZE ) ) << Long.SIZE + l - startBit >>> l; } public long length() { return length; } } public BitVector toBitVector( final CharSequence s ) { return s instanceof MutableString ? 
new ISOMutableStringBitVector( (MutableString)s, prefixFree ) : new ISOCharSequenceBitVector( s, prefixFree ); } public long numBits() { return 0; } public TransformationStrategy copy() { return this; } private Object readResolve() { return prefixFree ? PREFIX_FREE_ISO : ISO; } } private final static class IteratorWrapper extends AbstractObjectIterator { final Iterator iterator; final TransformationStrategy transformationStrategy; public IteratorWrapper( final Iterator iterator, final TransformationStrategy transformationStrategy ) { this.iterator = iterator; this.transformationStrategy = transformationStrategy; } public boolean hasNext() { return iterator.hasNext(); } public BitVector next() { return transformationStrategy.toBitVector( iterator.next() ); } } /** Wraps a given iterator, returning an iterator that emits {@linkplain BitVector bit vectors}. * * @param iterator an iterator. * @param transformationStrategy a strategy to transform the object returned by iterator. * @return an iterator that emits the content of iterator passed through transformationStrategy. */ @SuppressWarnings("unchecked") public static Iterator wrap( final Iterator iterator, final TransformationStrategy transformationStrategy ) { return (Iterator)( transformationStrategy == IDENTITY ? iterator : new IteratorWrapper( iterator, transformationStrategy ) ); } private final static class IterableWrapper implements Iterable { private final TransformationStrategy transformationStrategy; private final Iterable collection; public IterableWrapper( final Iterable collection, final TransformationStrategy transformationStrategy ) { this.collection = collection; this.transformationStrategy = transformationStrategy; } public ObjectIterator iterator() { return new IteratorWrapper( collection.iterator(), transformationStrategy.copy() ); } } /** Wraps a given iterable, returning an iterable that contains {@linkplain BitVector bit vectors}. * * @param iterable an iterable. 
* @param transformationStrategy a strategy to transform the object contained in iterable. * @return an iterable that has the content of iterable passed through transformationStrategy. */ @SuppressWarnings("unchecked") public static Iterable wrap( final Iterable iterable, final TransformationStrategy transformationStrategy ) { return (Iterable)( transformationStrategy == IDENTITY ? iterable : new IterableWrapper( iterable, transformationStrategy ) ); } private final static class ListWrapper extends AbstractObjectList { private final TransformationStrategy transformationStrategy; private final List list; public ListWrapper( final List list, final TransformationStrategy transformationStrategy ) { this.list = list; this.transformationStrategy = transformationStrategy; } public BitVector get( int index ) { return transformationStrategy.toBitVector( list.get( index ) ); } public int size() { return list.size(); } } /** Wraps a given list, returning a list that contains {@linkplain BitVector bit vectors}. * * @param list a list. * @param transformationStrategy a strategy to transform the object contained in list. * @return a list that has the content of list passed through transformationStrategy. */ @SuppressWarnings("unchecked") public static List wrap( final List list, final TransformationStrategy transformationStrategy ) { return (List)( transformationStrategy == IDENTITY ? 
list : new ListWrapper( list, transformationStrategy ) ); } private static class PrefixFreeTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; private static class PrefixFreeBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private transient BitVector v; private transient long length; public PrefixFreeBitVector( final BitVector v ) { this.v = v; length = v.length() * 2 + 1; } public boolean getBoolean( long index ) { if ( index >= length ) throw new IndexOutOfBoundsException(); if ( index == length - 1 ) return false; if ( index % 2 == 0 ) return true; return v.getBoolean( index / 2 ); } public long getLong( final long from, final long to ) { final int startBit = (int)( from % Long.SIZE ); if ( startBit == 0 && to - from == Long.SIZE ) { long word = v.getLong( from / 2, Math.min( v.length(), from / 2 + Long.SIZE ) ); if ( from % 2 != 0 ) word >>>= Long.SIZE / 2; else word &= 0xFFFFFFFFL; word = ( word | word << 16 ) & 0x0000FFFF0000FFFFL; //System.err.println( Long.toHexString( word ) ); word = ( word | word << 8 ) & 0x00FF00FF00FF00FFL; //System.err.println( Long.toHexString( word ) ); word = ( word | word << 4 ) & 0x0F0F0F0F0F0F0F0FL; //System.err.println( Long.toHexString( word ) ); word = ( word | word << 2 ) & 0x3333333333333333L; //System.err.println( Long.toHexString( word ) ); word = ( word << 1 | word << 2 ) | 0x5555555555555555L; //System.err.println( Long.toHexString( word ) ); return word; } // TODO: implement in a fast way the case startBit == 0, to == length. 
return super.getLong( from, to ); } public long length() { return length; } } public BitVector toBitVector( final BitVector v ) { return new PrefixFreeBitVector( v ); } public long numBits() { return 0; } public TransformationStrategy copy() { return this; } private Object readResolve() { return PREFIX_FREE; } } /** A transformation from bit vectors to bit vectors that guarantees that its results are prefix free. * *

More in detail, we map 0 to 10, 1 to 11, and we add a 0 at the end of all strings. * *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFree() { return (TransformationStrategy)PREFIX_FREE; } private static final TransformationStrategy PREFIX_FREE = new PrefixFreeTransformationStrategy(); }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy