
// ArrayFrontCodedList.drv: fastutil driver (template) file.
/*
* Copyright (C) 2002-2016 Sebastiano Vigna
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package PACKAGE;
import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.fastutil.longs.LongArrays;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Collection;
import java.util.NoSuchElementException;
import java.util.RandomAccess;
/** Compact storage of lists of arrays using front coding.
*
* This class stores immutably a list of arrays in a single large array
* using front coding (of course, the compression will be reasonable only if
* the list is sorted lexicographically—see below). It implements an
* immutable type-specific list that returns the i-th array when
* calling {@link #get(int) get(i)}. The returned array may be
* freely modified.
*
*
* <p>Front coding is based on the idea that if the <var>i</var>-th and the
* (<var>i</var>+1)-th array have a common prefix, we might store the length
* of the common prefix, and then the rest of the second array.
*
*
* <p>This approach, of course, requires that once in a while an array is
* stored entirely. The <em>ratio</em> of a front-coded list defines how
* often this happens (once every {@link #ratio()} arrays). A higher ratio
* means more compression, but means also a longer access time, as more arrays
* have to be probed to build the result. Note that we must build an array
* every time {@link #get(int)} is called, but this class provides also methods
* that extract one of the stored arrays in a given array, reducing garbage
* collection. See the documentation of the family of <code>get()</code>
* methods.
*
*
* <p>By setting the ratio to 1 we actually disable front coding: however, we
* still have a data structure storing large lists of arrays with a reduced
* overhead (just one integer per array, plus the space required for lengths).
*
*
* <p>Note that the typical usage of front-coded lists is in the form of
* serialized objects; usually, the data that has to be compacted is processed
* offline, and the resulting structure is stored permanently. Since the
* pointer array is not stored, the serialized format is very small.
*
*
* <h2>Implementation Details</h2>
*
* <p>All arrays are stored in a {@linkplain it.unimi.dsi.fastutil.BigArrays big array}. A separate array of pointers
* indexes arrays whose position is a multiple of the ratio: thus, a higher ratio
* means also fewer pointers.
*
*
* <p>More in detail, an array whose position is a multiple of the ratio is
* stored as the array length, followed by the elements of the array. The array
* length is coded by a simple variable-length list of <var>k</var>-1 bit
* blocks, where <var>k</var> is the number of bits of the underlying primitive
* type. All other arrays are stored as follows: let <code>common</code> be the
* length of the maximum common prefix between the array and its predecessor.
* Then we store the array length decremented by <code>common</code>, followed
* by <code>common</code>, followed by the array elements whose index is
* greater than or equal to <code>common</code>. For instance, if we store
* <code>foo</code>, <code>foobar</code>, <code>football</code> and
* <code>fool</code> in a front-coded character-array list with ratio 3, the
* character array will contain
*
* <pre>
* 3 f o o 3 3 b a r 5 3 t b a l l 4 f o o l
* </pre>
*/
public class ARRAY_FRONT_CODED_LIST extends AbstractObjectList implements Serializable, Cloneable, RandomAccess {
/** The serialization version identifier of this class. */
private static final long serialVersionUID = 1L;
/** The number of arrays in the list. */
protected final int n;
/** The ratio of this front-coded list: arrays whose position is a multiple of the ratio are stored entirely. */
protected final int ratio;
/** The big array containing the compressed arrays. */
protected final KEY_TYPE[][] array;
/** The pointers to entire arrays in the list. Being transient, this array is not serialized: it is recomputed by {@link #rebuildPointerArray()} when an instance is read back. */
protected transient long[] p;
/** Creates a new front-coded list containing the arrays returned by the given iterator.
 *
 * <p>Arrays whose position is a multiple of <code>ratio</code> are stored entirely
 * (length, then elements) and their starting position is recorded in the pointer array.
 * Every other array is stored as the number of elements that differ from its predecessor,
 * then the length of the prefix shared with its predecessor, then the differing elements.
 *
 * @param arrays an iterator returning arrays.
 * @param ratio the desired ratio.
 * @throws IllegalArgumentException if <code>ratio</code> is smaller than one.
 */
public ARRAY_FRONT_CODED_LIST( final Iterator<KEY_TYPE[]> arrays, final int ratio ) {
    if ( ratio < 1 ) throw new IllegalArgumentException( "Illegal ratio (" + ratio + ")" );
    KEY_TYPE[][] array = BIG_ARRAYS.EMPTY_BIG_ARRAY;
    long[] p = LongArrays.EMPTY_ARRAY;
    // A two-slot window: a[ b ] is the array being coded, a[ 1 - b ] is its predecessor.
    final KEY_TYPE[][] a = new KEY_TYPE[ 2 ][];
    long curSize = 0;
    int n = 0, b = 0, common, length, minLength;
    while( arrays.hasNext() ) {
        a[ b ] = arrays.next();
        length = a[ b ].length;
        if ( n % ratio == 0 ) {
            // Start of a block: record a pointer and store the array entirely.
            p = LongArrays.grow( p, n / ratio + 1 );
            p[ n / ratio ] = curSize;
            array = BIG_ARRAYS.grow( array, curSize + count( length ) + length, curSize );
            curSize += writeInt( array, length, curSize );
            BIG_ARRAYS.copyToBig( a[ b ], 0, array, curSize, length );
            curSize += length;
        }
        else {
            // Inside a block: compute the longest prefix shared with the predecessor.
            minLength = a[ 1 - b ].length;
            if ( length < minLength ) minLength = length;
            for( common = 0; common < minLength; common++ ) if ( a[ 0 ][ common ] != a[ 1 ][ common ] ) break;
            length -= common;
            array = BIG_ARRAYS.grow( array, curSize + count( length ) + count( common ) + length, curSize );
            curSize += writeInt( array, length, curSize );
            curSize += writeInt( array, common, curSize );
            BIG_ARRAYS.copyToBig( a[ b ], common, array, curSize, length );
            curSize += length;
        }
        b = 1 - b;
        n++;
    }
    this.n = n;
    this.ratio = ratio;
    this.array = BIG_ARRAYS.trim( array, curSize );
    // The number of pointers is the ceiling of n / ratio; it is computed in long
    // arithmetic because n + ratio - 1 overflows an int when the ratio is very large.
    this.p = LongArrays.trim( p, (int)( ( n + (long)ratio - 1 ) / ratio ) );
}
/** Creates a new front-coded list containing the arrays in the given collection.
 *
 * <p>The arrays are stored in the order in which the iterator of the collection returns them.
 *
 * @param c a collection containing arrays.
 * @param ratio the desired ratio.
 */
public ARRAY_FRONT_CODED_LIST( final Collection<KEY_TYPE[]> c, final int ratio ) {
    this( c.iterator(), ratio );
}
/* The following (rather messy) methods implement the encoding of arbitrary integers inside a big array.
* Unfortunately, we have to specify different codes for almost every type. */
/** Reads a coded length.
 *
 * <p>This method is the inverse of <code>writeInt()</code>: integers and longs hold the
 * length in a single element, characters and shorts use one or two elements,
 * and bytes use from one to five elements.
 *
 * @param a the data big array.
 * @param pos the starting position.
 * @return the length coded at <code>pos</code>.
 */
private static int readInt( final KEY_TYPE a[][], long pos ) {
#if #keyclass(Integer)
    return IntBigArrays.get( a, pos );
#elif #keyclass(Long)
    return (int)LongBigArrays.get( a, pos );
#elif #keyclass(Character)
    final char c0 = CharBigArrays.get( a, pos );
    // The top bit of the first character flags a two-element code.
    return c0 < 0x8000 ? c0 : ( c0 & 0x7FFF ) << 16 | CharBigArrays.get( a, pos + 1 );
#elif #keyclass(Short)
    final short s0 = ShortBigArrays.get( a, pos );
    // A negative first element flags a two-element code whose upper bits x were
    // written as - x - 1: the same map must be applied to get them back.
    return s0 >= 0 ? s0 : ( - s0 - 1 ) << 16 | ( ShortBigArrays.get( a, pos + 1 ) & 0xFFFF );
#else
    // Every byte but the last one is negative and carries seven bits x coded as - x - 1.
    final byte b0 = ByteBigArrays.get( a, pos );
    if ( b0 >= 0 ) return b0;
    final byte b1 = ByteBigArrays.get( a, pos + 1 );
    if ( b1 >= 0 ) return ( - b0 - 1 ) << 7 | b1;
    final byte b2 = ByteBigArrays.get( a, pos + 2 );
    if ( b2 >= 0 ) return ( - b0 - 1 ) << 14 | ( - b1 - 1 ) << 7 | b2;
    final byte b3 = ByteBigArrays.get( a, pos + 3 );
    if ( b3 >= 0 ) return ( - b0 - 1 ) << 21 | ( - b1 - 1 ) << 14 | ( - b2 - 1 ) << 7 | b3;
    return ( - b0 - 1 ) << 28 | ( - b1 - 1 ) << 21 | ( - b2 - 1 ) << 14 | ( - b3 - 1 ) << 7 | ByteBigArrays.get( a, pos + 4 );
#endif
}
/** Computes the number of elements coding a given length.
 *
 * <p>Integers and longs always need a single element; characters and shorts
 * need a second element from 2<sup>15</sup> on; bytes need one more element
 * for each further group of seven bits, up to five elements.
 *
 * @param length the length to be coded.
 * @return the number of elements coding <code>length</code>.
 */
private static int count( final int length ) {
#if #keyclass(Integer) || #keyclass(Long)
    return 1;
#elif #keyclass(Character) || #keyclass(Short)
    return length >= ( 1 << 15 ) ? 2 : 1;
#else
    int elements = 1;
    // One additional element for each threshold (7, 14, 21 and 28 bits) that the length reaches.
    for( int bits = 7; bits <= 28 && length >= ( 1 << bits ); bits += 7 ) elements++;
    return elements;
#endif
}
/** Writes a length.
 *
 * <p>Longs and integers store the length in one element. Characters and shorts
 * store lengths smaller than 2<sup>15</sup> in one element, and otherwise use two elements,
 * the first one carrying the upper bits together with a marker (the top bit for characters,
 * a negative value for shorts). Bytes store seven bits per element, most significant
 * group first, and every element but the last one is made negative.
 *
 * @param a the data array.
 * @param length the length to be written.
 * @param pos the starting position.
 * @return the number of elements coding <code>length</code>.
 */
private static int writeInt( final KEY_TYPE a[][], int length, long pos ) {
#if #keyclass(Long)
LongBigArrays.set( a, pos, length );
return 1;
#elif #keyclass(Integer)
IntBigArrays.set( a, pos, length );
return 1;
#elif #keyclass(Character)
if ( length < ( 1 << 15 ) ) {
CharBigArrays.set( a, pos, (char)length );
return 1;
}
// Two-element code: upper bits with the top bit set as a marker, then the lower 16 bits.
CharBigArrays.set( a, pos++, (char)( length >>> 16 | 0x8000 ) );
CharBigArrays.set( a, pos, (char)( length & 0xFFFF ) );
return 2;
#elif #keyclass(Short)
if ( length < ( 1 << 15 ) ) {
ShortBigArrays.set( a, pos, (short)length );
return 1;
}
// Two-element code: upper bits x stored as the negative value - x - 1, then the lower 16 bits.
ShortBigArrays.set( a, pos++, (short)( - ( length >>> 16 ) - 1 ) );
ShortBigArrays.set( a, pos, (short)( length & 0xFFFF ) );
return 2;
#else
final int count = count( length );
// The last element holds the seven least significant bits as a nonnegative value.
ByteBigArrays.set( a, pos + count - 1, (byte)( length & 0x7F ) );
if ( count != 1 ) {
int i = count - 1;
// The remaining groups are written backwards, each one stored as - x - 1 so that it is negative.
while( i-- != 0 ) {
length >>>= 7;
ByteBigArrays.set( a, pos + i, (byte)( - ( length & 0x7F ) - 1 ) );
}
}
return count;
#endif
}
/** Returns the ratio of this list, as specified at construction time.
 *
 * @return the ratio of this list.
 */
public int ratio() {
    return this.ratio;
}
/** Computes the length of the array at the given index.
 *
 * <p>This private version of {@link #arrayLength(int)} does not check its argument.
 * No element is copied: only the length headers of the block containing the
 * requested array are read.
 *
 * @param index an index.
 * @return the length of the <code>index</code>-th array.
 */
private int length( final int index ) {
    final KEY_TYPE[][] data = this.array;
    // Start from the array stored entirely at the beginning of the block.
    long cursor = p[ index / ratio ];
    int stored = readInt( data, cursor );
    int remaining = index % ratio; // How many front-coded arrays must be visited.
    if ( remaining == 0 ) return stored;
    // Skip the header and the elements of the entire array.
    cursor += count( stored ) + stored;
    for(;;) {
        stored = readInt( data, cursor );
        final int shared = readInt( data, cursor + count( stored ) );
        // The full length is the stored suffix plus the prefix shared with the predecessor.
        if ( --remaining == 0 ) return stored + shared;
        cursor += count( stored ) + count( shared ) + stored;
    }
}
/** Computes the length of the array at the given index.
 *
 * <p>The index is checked with <code>ensureRestrictedIndex()</code> before the length is computed.
 *
 * @param index an index.
 * @return the length of the <code>index</code>-th array.
 */
public int arrayLength( final int index ) {
    ensureRestrictedIndex( index );
    final int result = length( index );
    return result;
}
/** Extracts the array at the given index.
 *
 * <p>If the array starts a block it is copied directly. Otherwise the block is walked
 * from its first array: at each step the part of the shared prefix that is not yet in
 * <code>a</code> is copied from the elements stored for the previous array, and finally
 * the elements stored for the requested array are appended. No more than
 * <code>length</code> elements are ever written.
 *
 * @param index an index.
 * @param a the array that will store the result (we assume that it can hold the result).
 * @param offset an offset into <code>a</code> where elements will be stored.
 * @param length a maximum number of elements to store in <code>a</code>.
 * @return the length of the extracted array.
 */
private int extract( final int index, final KEY_TYPE a[], final int offset, final int length ) {
final int delta = index % ratio; // The delta inside the block.
final long startPos = p[ index / ratio ]; // The position into the array of the first entire array before the index-th.
long pos, prevArrayPos;
// currLen is the number of elements of the result already in place in a.
int arrayLength = readInt( array, pos = startPos ), currLen = 0, actualCommon;
if ( delta == 0 ) {
// The array is stored entirely: skip the length header and copy.
pos = p[ index / ratio ] + count( arrayLength );
BIG_ARRAYS.copyFromBig( array, pos, a, offset, Math.min( length, arrayLength ) );
return arrayLength;
}
int common = 0;
for( int i = 0; i < delta; i++ ) {
// Position of the first element stored for the previous array (the entire array has no common-prefix header).
prevArrayPos = pos + count( arrayLength ) + ( i != 0 ? count( common ) : 0 );
pos = prevArrayPos + arrayLength;
arrayLength = readInt( array, pos );
common = readInt( array, pos + count( arrayLength ) );
actualCommon = Math.min( common, length );
// A shorter shared prefix just invalidates the tail of what was copied so far;
// a longer one is completed with elements stored for the previous array.
if ( actualCommon <= currLen ) currLen = actualCommon;
else {
BIG_ARRAYS.copyFromBig( array, prevArrayPos, a, currLen + offset, actualCommon - currLen );
currLen = actualCommon;
}
}
// Append the elements stored for the requested array, if there is still room.
if ( currLen < length ) BIG_ARRAYS.copyFromBig( array, pos + count( arrayLength ) + count( common ), a, currLen + offset, Math.min( arrayLength, length - currLen ) );
return arrayLength + common;
}
/** Returns the array at the given index, delegating to {@link #getArray(int)}.
 *
 * @param index an index.
 * @return a newly allocated array with the content of the <code>index</code>-th array.
 */
@Override
public KEY_TYPE[] get( final int index ) {
    final KEY_TYPE[] result = getArray( index );
    return result;
}
/** Returns a newly allocated array with the content of the array at the given index.
 *
 * @param index an index.
 * @return the <code>index</code>-th array.
 * @see #get(int)
 */
public KEY_TYPE[] getArray( final int index ) {
    ensureRestrictedIndex( index );
    // Size the result exactly, then let extract() fill it.
    final KEY_TYPE[] result = new KEY_TYPE[ length( index ) ];
    extract( index, result, 0, result.length );
    return result;
}
/** Stores in the given array elements from an array stored in this front-coded list.
 *
 * @param index an index.
 * @param a the array that will store the result.
 * @param offset an offset into <code>a</code> where elements will be stored.
 * @param length a maximum number of elements to store in <code>a</code>.
 * @return if <code>a</code> can hold the extracted elements, the number of extracted elements;
 * otherwise, the number of remaining elements with the sign changed.
 */
public int get( final int index, final KEY_TYPE[] a, final int offset, final int length ) {
    ensureRestrictedIndex( index );
    ARRAYS.ensureOffsetLength( a, offset, length );
    final int stored = extract( index, a, offset, length );
    // A negative result tells the caller how many elements did not fit.
    return stored <= length ? stored : length - stored;
}
/** Stores in the given array an array stored in this front-coded list.
 *
 * <p>The whole of <code>a</code>, starting from position zero, is available for the result.
 *
 * @param index an index.
 * @param a the array that will store the content of the result (we assume that it can hold the result).
 * @return if <code>a</code> can hold the extracted elements, the number of extracted elements;
 * otherwise, the number of remaining elements with the sign changed.
 */
public int get( final int index, final KEY_TYPE[] a ) {
    final int capacity = a.length;
    return get( index, a, 0, capacity );
}
/** Returns the number of arrays in this list.
 *
 * @return the number of arrays in this list.
 */
@Override
public int size() {
    return this.n;
}
/** Returns a list iterator over the arrays of this list, starting at the given index.
 *
 * <p>Each call to <code>next()</code> or <code>previous()</code> returns a newly allocated array.
 * Forward iteration decodes incrementally from the previously returned array as long as
 * the iterator is in sync; after a call to <code>previous()</code> arrays are rebuilt from
 * scratch until the beginning of the next block.
 *
 * @param start the index of the first array returned by <code>next()</code>.
 * @return a list iterator starting at <code>start</code>.
 */
public ObjectListIterator listIterator( final int start ) {
ensureIndex( start );
return new AbstractObjectListIterator() {
// Buffer holding the last array decoded by next(); it is grown as needed.
KEY_TYPE s[] = ARRAYS.EMPTY_ARRAY;
// Index of the array that next() will return.
int i = 0;
// Position in the big array of the next array to decode (meaningful only when in sync).
long pos = 0;
boolean inSync; // Whether the current value in s is the array just before the next to be produced.
{
if ( start != 0 ) {
if ( start == n ) i = start; // If we start at the end, we do nothing.
else {
// Move to the beginning of the block containing start, then skip forward.
pos = p[ start / ratio ];
int j = start % ratio;
i = start - j;
while( j-- != 0 ) next();
}
}
}
public boolean hasNext() {
return i < n;
}
public boolean hasPrevious() {
return i > 0;
}
public int previousIndex() {
return i - 1;
}
public int nextIndex() {
return i;
}
public KEY_TYPE[] next() {
int length, common;
if ( ! hasNext() ) throw new NoSuchElementException();
if ( i % ratio == 0 ) {
// Beginning of a block: the array is stored entirely, and we get back in sync.
pos = p[ i / ratio ];
length = readInt( array, pos );
s = ARRAYS.ensureCapacity( s, length, 0 );
BIG_ARRAYS.copyFromBig( array, pos + count( length ), s, 0, length );
pos += length + count( length );
inSync = true;
}
else {
if ( inSync ) {
// s holds the previous array: keep its first common elements and append the stored ones.
length = readInt( array, pos );
common = readInt( array, pos + count( length ) );
s = ARRAYS.ensureCapacity( s, length + common, common );
BIG_ARRAYS.copyFromBig( array, pos + count( length ) + count ( common ), s, common, length );
pos += count( length ) + count( common ) + length;
length += common;
}
else {
// Out of sync: rebuild the array from the beginning of its block.
s = ARRAYS.ensureCapacity( s, length = length( i ), 0 );
extract( i, s, 0, length );
}
}
i++;
// The buffer is reused, so the caller gets a copy.
return ARRAYS.copy( s, 0, length );
}
public KEY_TYPE[] previous() {
if ( ! hasPrevious() ) throw new NoSuchElementException();
// Moving backwards invalidates the incremental state used by next().
inSync = false;
return getArray( --i );
}
};
}
/** Returns this list.
 *
 * <p>No copy is made: the method returns the very same instance.
 * NOTE(review): presumably this is intentional, as the class is documented as
 * storing its content immutably; confirm before relying on distinct instances.
 *
 * @return this list.
 */
public ARRAY_FRONT_CODED_LIST clone() {
return this;
}
/** Returns a string representation of this list.
 *
 * <p>The representation is made of the string representations of the stored arrays
 * (each one wrapped in a type-specific array list), separated by commas and
 * enclosed in square brackets.
 *
 * @return a string representation of this list.
 */
public String toString() {
    // The builder never leaves this method, so the synchronization of StringBuffer is not needed.
    final StringBuilder s = new StringBuilder();
    s.append( "[ " );
    for( int i = 0; i < n; i++ ) {
        if ( i != 0 ) s.append( ", " );
        s.append( ARRAY_LIST.wrap( getArray( i ) ).toString() );
    }
    s.append( " ]" );
    return s.toString();
}
/** Computes the pointer array using the currently set ratio, number of elements and underlying array.
 *
 * <p>The underlying array is scanned once from the beginning, reading only
 * the headers of each stored array, and the starting position of every array
 * stored entirely is recorded.
 *
 * @return the computed pointer array.
 */
protected long[] rebuildPointerArray() {
    // Ceiling of n / ratio, computed in long arithmetic: n + ratio - 1 overflows an int for very large ratios.
    final long[] p = new long[ (int)( ( n + (long)ratio - 1 ) / ratio ) ];
    final KEY_TYPE a[][] = array;
    long pos = 0;
    int next = 0;
    for( int i = 0; i < n; i++ ) {
        final int length = readInt( a, pos );
        final int count = count( length );
        if ( i % ratio == 0 ) {
            // An array stored entirely: record it and skip its length and elements.
            p[ next++ ] = pos;
            pos += count + length;
        }
        else {
            // A front-coded array: skip both headers and the stored elements.
            final int common = readInt( a, pos + count );
            pos += count + count( common ) + length;
        }
    }
    return p;
}
/** Reads this list from a stream and rebuilds the pointer array, which is not serialized.
 *
 * @param s the stream to read from.
 * @throws java.io.IOException if the stream cannot be read, or if it contains an illegal ratio or number of arrays.
 * @throws ClassNotFoundException if the class of a serialized object cannot be found.
 */
private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException {
    s.defaultReadObject();
    // Reject inconsistent streams explicitly: otherwise a ratio smaller than one or a negative
    // size would surface as an arithmetic or array-size failure while rebuilding the pointers.
    if ( ratio < 1 ) throw new java.io.InvalidObjectException( "Illegal ratio (" + ratio + ")" );
    if ( n < 0 ) throw new java.io.InvalidObjectException( "Illegal number of arrays (" + n + ")" );
    // Rebuild pointer array
    p = rebuildPointerArray();
}
#ifdef TEST
/** The seed of the random number generator, reported in error messages; it defaults to the current time. */
private static long seed = System.currentTimeMillis();
/** The random number generator used to build test data. */
private static java.util.Random r = new java.util.Random( seed );
/** Generates a random key for the tests.
 *
 * @return a random key whose kind depends on the key class selected at preprocessing time.
 */
private static KEY_TYPE genKey() {
#if #keyclass(Byte) || #keyclass(Short) || #keyclass(Character)
// Small types are obtained by narrowing a random integer.
return (KEY_TYPE)(r.nextInt());
#elif #keys(primitive)
return r.NEXT_KEY();
#elif #keyclass(Object)
return Integer.toBinaryString( r.nextInt() );
#else
return new java.io.Serializable() {};
#endif
}
/** The number format used by {@link #format(double)}. */
private static java.text.NumberFormat format = new java.text.DecimalFormat( "#,###.00" );
/** The field position passed to the number format. */
private static java.text.FieldPosition fp = new java.text.FieldPosition( 0 );
/** Formats a number with grouping separators and two fractional digits.
 *
 * @param d the number to be formatted.
 * @return the formatted number.
 */
private static String format( double d ) {
    return format.format( d, new StringBuffer(), fp ).toString();
}
/** Placeholder for speed tests: it just reports that none are available.
 *
 * @param n the number of elements (unused).
 * @param comp whether a comparison was requested (unused).
 */
private static void speedTest( final int n, final boolean comp ) {
    System.out.println( "There are presently no speed tests for this class." );
}
/** Prints a message on standard output and terminates the virtual machine with exit status 1.
 *
 * @param msg the message to be printed.
 */
private static void fatal( final String msg ) {
    System.out.println( msg );
    System.exit( 1 );
}
/** Checks a condition, terminating the test with the given message if it does not hold.
 *
 * @param cond the condition that must hold.
 * @param msg the message passed to {@link #fatal(String)} when <code>cond</code> is false.
 */
private static void ensure( boolean cond, String msg ) {
    if ( ! cond ) fatal( msg );
}
/** Checks whether two lists of arrays have the same size and pairwise equal arrays.
 *
 * @param x a list of arrays.
 * @param y another list of arrays.
 * @return true if the lists have the same size and the arrays at each position have the same content.
 */
private static boolean contentEquals( java.util.List x, java.util.List y ) {
    final int size = x.size();
    if ( size != y.size() ) return false;
    for( int i = 0; i < size; i++ ) {
        final KEY_TYPE[] left = (KEY_TYPE[])x.get( i );
        final KEY_TYPE[] right = (KEY_TYPE[])y.get( i );
        if ( ! java.util.Arrays.equals( left, right ) ) return false;
    }
    return true;
}
/** The lengths of the test arrays. */
private static int l[];
/** The test arrays. */
private static KEY_TYPE[][] a;
/** Runs a consistency test against an array list containing the same arrays.
 *
 * <p>The test builds random arrays, stores them in a front-coded list with a random
 * ratio between 1 and 4, and then checks content, cloning, iterators (from the
 * beginning and from a random starting point) and serialization.
 *
 * @param n the number of arrays to generate.
 */
private static void test( int n ) {
    l = new int[ n ];
    a = new KEY_TYPE[n][];
    for( int i = 0; i < n; i++ ) l[i] = (int)(Math.abs(r.nextGaussian())*32);
    for( int i = 0; i < n; i++ ) a[i] = new KEY_TYPE[l[i]];
    for( int i = 0; i < n; i++ ) for( int j = 0; j < l[i]; j++ ) a[i][j] = genKey();
    ARRAY_FRONT_CODED_LIST m = new ARRAY_FRONT_CODED_LIST( it.unimi.dsi.fastutil.objects.ObjectIterators.wrap( a ), r.nextInt( 4 ) + 1 );
    it.unimi.dsi.fastutil.objects.ObjectArrayList t = new it.unimi.dsi.fastutil.objects.ObjectArrayList( a );
    /* Now we check that m actually holds that data. */
    ensure( contentEquals( m, t ), "Error (" + seed + "): m does not equal t at creation" );
    /* Now we check cloning. */
    ensure( contentEquals( m, (java.util.List)m.clone() ), "Error (" + seed + "): m does not equal m.clone()" );
    /* Now we play with iterators. */
    {
        ObjectListIterator i;
        java.util.ListIterator j;
        i = m.listIterator();
        j = t.listIterator();
        for( int k = 0; k < 2*n; k++ ) {
            ensure( i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext()" );
            ensure( i.hasPrevious() == j.hasPrevious(), "Error (" + seed + "): divergence in hasPrevious()" );
            if ( r.nextFloat() < .8 && i.hasNext() ) {
                ensure( java.util.Arrays.equals( (KEY_TYPE[])i.next(), (KEY_TYPE[])j.next() ), "Error (" + seed + "): divergence in next()" );
            }
            else if ( r.nextFloat() < .2 && i.hasPrevious() ) {
                ensure( java.util.Arrays.equals( (KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous() ), "Error (" + seed + "): divergence in previous()" );
            }
            ensure( i.nextIndex() == j.nextIndex(), "Error (" + seed + "): divergence in nextIndex()" );
            ensure( i.previousIndex() == j.previousIndex(), "Error (" + seed + "): divergence in previousIndex()" );
        }
    }
    /* The same, but starting from a random position. */
    {
        int from = r.nextInt( m.size() +1 );
        ObjectListIterator i;
        java.util.ListIterator j;
        i = m.listIterator( from );
        j = t.listIterator( from );
        for( int k = 0; k < 2*n; k++ ) {
            ensure( i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext() (iterator with starting point " + from + ")" );
            ensure( i.hasPrevious() == j.hasPrevious() , "Error (" + seed + "): divergence in hasPrevious() (iterator with starting point " + from + ")" );
            if ( r.nextFloat() < .8 && i.hasNext() ) {
                ensure( java.util.Arrays.equals( (KEY_TYPE[])i.next(), (KEY_TYPE[])j.next() ), "Error (" + seed + "): divergence in next() (iterator with starting point " + from + ")" );
            }
            else if ( r.nextFloat() < .2 && i.hasPrevious() ) {
                ensure( java.util.Arrays.equals( (KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous() ), "Error (" + seed + "): divergence in previous() (iterator with starting point " + from + ")" );
            }
        }
    }
    /* Now we save and read back m. */
    try {
        java.io.File ff = new java.io.File("it.unimi.dsi.fastutil.test");
        java.io.OutputStream os = new java.io.FileOutputStream(ff);
        java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(os);
        oos.writeObject(m);
        oos.close();
        java.io.InputStream is = new java.io.FileInputStream(ff);
        java.io.ObjectInputStream ois = new java.io.ObjectInputStream(is);
        m = (ARRAY_FRONT_CODED_LIST)ois.readObject();
        ois.close();
        ff.delete();
    }
    catch(Exception e) {
        e.printStackTrace();
        System.exit( 1 );
    }
    ensure( contentEquals( m, t ), "Error (" + seed + "): m does not equal t after save/read" );
    System.out.println("Test OK");
}
/** Runs the tests from the command line.
 *
 * <p>The first argument selects the operation (<code>test</code>, <code>speedTest</code>
 * or <code>speedComp</code>), the second one is the number of elements, and the
 * optional third one is the seed of the random number generator.
 *
 * @param args the command-line arguments.
 */
public static void main( String args[] ) {
    // Both the operation and the number of elements are mandatory.
    if ( args.length < 2 ) {
        System.err.println( "Usage: (test|speedTest|speedComp) <n> [<seed>]" );
        System.exit( 1 );
    }
    int n = Integer.parseInt(args[1]);
    if ( args.length > 2 ) r = new java.util.Random( seed = Long.parseLong( args[ 2 ] ) );
    try {
        if ("speedTest".equals(args[0]) || "speedComp".equals(args[0])) speedTest( n, "speedComp".equals(args[0]) );
        else if ( "test".equals( args[0] ) ) test(n);
    } catch( Throwable e ) {
        e.printStackTrace( System.err );
        // The seed makes it possible to reproduce the failing run.
        System.err.println( "seed: " + seed );
    }
}
#endif
}