All Downloads are FREE. Search and download functionalities are using the official Maven repository.

drv.ArrayFrontCodedList.drv Maven / Gradle / Ivy

Go to download

fastutil extends the Java Collections Framework by providing type-specific maps, sets, lists, and queues with a small memory footprint and fast access and insertion; it provides also big (64-bit) arrays, sets and lists, sorting algorithms, fast, practical I/O classes for binary and text files, and facilities for memory mapping large files. Note that if you have both this jar and fastutil-core.jar in your dependencies, fastutil-core.jar should be excluded.

There is a newer version: 8.5.15
Show newest version
/*		 
 * Copyright (C) 2002-2016 Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 */


package PACKAGE;

import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.fastutil.longs.LongArrays;

import java.io.Serializable;
import java.util.Iterator;
import java.util.Collection;
import java.util.NoSuchElementException;
import java.util.RandomAccess;

/** Compact storage of lists of arrays using front coding.
 * 
 * 

This class stores immutably a list of arrays in a single large array * using front coding (of course, the compression will be reasonable only if * the list is sorted lexicographically—see below). It implements an * immutable type-specific list that returns the i-th array when * calling {@link #get(int) get(i)}. The returned array may be * freely modified. * *

Front coding is based on the idea that if the i-th and the * (i+1)-th array have a common prefix, we might store the length * of the common prefix, and then the rest of the second array. * *

This approach, of course, requires that once in a while an array is * stored entirely. The ratio of a front-coded list defines how * often this happens (once every {@link #ratio()} arrays). A higher ratio * means more compression, but means also a longer access time, as more arrays * have to be probed to build the result. Note that we must build an array * every time {@link #get(int)} is called, but this class provides also methods * that extract one of the stored arrays in a given array, reducing garbage * collection. See the documentation of the family of get() * methods. * *

By setting the ratio to 1 we actually disable front coding: however, we * still have a data structure storing large list of arrays with a reduced * overhead (just one integer per array, plus the space required for lengths). * *

Note that the typical usage of front-coded lists is under the form of * serialized objects; usually, the data that has to be compacted is processed * offline, and the resulting structure is stored permanently. Since the * pointer array is not stored, the serialized format is very small. * *

Implementation Details

* *

All arrays are stored in a {@linkplain it.unimi.dsi.fastutil.BigArrays big array}. A separate array of pointers * indexes arrays whose position is a multiple of the ratio: thus, a higher ratio * means also less pointers. * *

More in detail, an array whose position is a multiple of the ratio is * stored as the array length, followed by the elements of the array. The array * length is coded by a simple variable-length list of k-1 bit * blocks, where k is the number of bits of the underlying primitive * type. All other arrays are stored as follows: let common the * length of the maximum common prefix between the array and its predecessor. * Then we store the array length decremented by common, followed * by common, followed by the array elements whose index is * greater than or equal to common. For instance, if we store * foo, foobar, football and * fool in a front-coded character-array list with ratio 3, the * character array will contain * *

 * 3 f o o 3 3 b a r 5 3 t b a l l 4 f o o l 
 * 
*/ public class ARRAY_FRONT_CODED_LIST extends AbstractObjectList implements Serializable, Cloneable, RandomAccess { private static final long serialVersionUID = 1L; /** The number of arrays in the list. */ protected final int n; /** The ratio of this front-coded list. */ protected final int ratio; /** The big array containing the compressed arrays. */ protected final KEY_TYPE[][] array; /** The pointers to entire arrays in the list. */ protected transient long[] p; /** Creates a new front-coded list containing the arrays returned by the given iterator. * * @param arrays an iterator returning arrays. * @param ratio the desired ratio. */ public ARRAY_FRONT_CODED_LIST( final Iterator arrays, final int ratio ) { if ( ratio < 1 ) throw new IllegalArgumentException( "Illegal ratio (" + ratio + ")" ); KEY_TYPE[][] array = BIG_ARRAYS.EMPTY_BIG_ARRAY; long[] p = LongArrays.EMPTY_ARRAY; KEY_TYPE[][] a = new KEY_TYPE[ 2 ][]; long curSize = 0; int n = 0, b = 0, common, length, minLength; while( arrays.hasNext() ) { a[ b ] = arrays.next(); length = a[ b ].length; if ( n % ratio == 0 ) { p = LongArrays.grow( p, n / ratio + 1 ); p[ n / ratio ] = curSize; array = BIG_ARRAYS.grow( array, curSize + count( length ) + length, curSize ); curSize += writeInt( array, length, curSize ); BIG_ARRAYS.copyToBig( a[ b ], 0, array, curSize, length ); curSize += length; } else { minLength = a[ 1 - b ].length; if ( length < minLength ) minLength = length; for( common = 0; common < minLength; common++ ) if ( a[ 0 ][ common ] != a[ 1 ][ common ] ) break; length -= common; array = BIG_ARRAYS.grow( array, curSize + count( length ) + count( common ) + length, curSize ); curSize += writeInt( array, length, curSize ); curSize += writeInt( array, common, curSize ); BIG_ARRAYS.copyToBig( a[ b ], common, array, curSize, length ); curSize += length; } b = 1 - b; n++; } this.n = n; this.ratio = ratio; this.array = BIG_ARRAYS.trim( array, curSize ); this.p = LongArrays.trim( p, ( n + ratio - 1 ) / ratio ); } /** Creates a new front-coded list containing the arrays in the given collection. * * @param c a collection containing arrays. * @param ratio the desired ratio. */ public ARRAY_FRONT_CODED_LIST( final Collection c, final int ratio ) { this( c.iterator(), ratio ); } /* The following (rather messy) methods implements the encoding of arbitrary integers inside a big array. * Unfortunately, we have to specify different codes for almost every type. */ /** Reads a coded length. * @param a the data big array. * @param pos the starting position. * @return the length coded at pos. */ private static int readInt( final KEY_TYPE a[][], long pos ) { #if #keyclass(Integer) return IntBigArrays.get( a, pos ); #elif #keyclass(Long) return (int)LongBigArrays.get( a, pos ); #elif #keyclass(Character) final char c0 = CharBigArrays.get( a, pos ); return c0 < 0x8000 ? c0 : ( c0 & 0x7FFF ) << 16 | CharBigArrays.get( a, pos + 1 ); #elif #keyclass(Short) final short s0 = ShortBigArrays.get( a, pos ); return s0 >= 0 ? s0 : s0 << 16 | ( ShortBigArrays.get( a, pos + 1 ) & 0xFFFF ); #else final byte b0 = ByteBigArrays.get( a, pos ); if ( b0 >= 0 ) return b0; final byte b1 = ByteBigArrays.get( a, pos + 1 ); if ( b1 >= 0 ) return ( - b0 - 1 ) << 7 | b1; final byte b2 = ByteBigArrays.get( a, pos + 2 ); if ( b2 >= 0 ) return ( - b0 - 1 ) << 14 | ( - b1 - 1 ) << 7 | b2; final byte b3 = ByteBigArrays.get( a, pos + 3 ); if ( b3 >= 0 ) return ( - b0 - 1 ) << 21 | ( - b1 - 1 ) << 14 | ( - b2 - 1 ) << 7 | b3; return ( - b0 - 1 ) << 28 | ( - b1 - 1 ) << 21 | ( - b2 - 1 ) << 14 | ( - b3 - 1 ) << 7 | ByteBigArrays.get( a, pos + 4 ); #endif } /** Computes the number of elements coding a given length. * @param length the length to be coded. * @return the number of elements coding length. */ private static int count( final int length ) { #if #keyclass(Integer) || #keyclass(Long) return 1; #elif #keyclass(Character) || #keyclass(Short) return length < ( 1 << 15 ) ? 1 : 2; #else if ( length < ( 1 << 7 ) ) return 1; if ( length < ( 1 << 14 ) ) return 2; if ( length < ( 1 << 21 ) ) return 3; if ( length < ( 1 << 28 ) ) return 4; return 5; #endif } /** Writes a length. * @param a the data array. * @param length the length to be written. * @param pos the starting position. * @return the number of elements coding length. */ private static int writeInt( final KEY_TYPE a[][], int length, long pos ) { #if #keyclass(Long) LongBigArrays.set( a, pos, length ); return 1; #elif #keyclass(Integer) IntBigArrays.set( a, pos, length ); return 1; #elif #keyclass(Character) if ( length < ( 1 << 15 ) ) { CharBigArrays.set( a, pos, (char)length ); return 1; } CharBigArrays.set( a, pos++, (char)( length >>> 16 | 0x8000 ) ); CharBigArrays.set( a, pos, (char)( length & 0xFFFF ) ); return 2; #elif #keyclass(Short) if ( length < ( 1 << 15 ) ) { ShortBigArrays.set( a, pos, (short)length ); return 1; } ShortBigArrays.set( a, pos++, (short)( - ( length >>> 16 ) - 1 ) ); ShortBigArrays.set( a, pos, (short)( length & 0xFFFF ) ); return 2; #else final int count = count( length ); ByteBigArrays.set( a, pos + count - 1, (byte)( length & 0x7F ) ); if ( count != 1 ) { int i = count - 1; while( i-- != 0 ) { length >>>= 7; ByteBigArrays.set( a, pos + i, (byte)( - ( length & 0x7F ) - 1 ) ); } } return count; #endif } /** Returns the ratio of this list. * * @return the ratio of this list. */ public int ratio() { return ratio; } /** Computes the length of the array at the given index. * *

This private version of {@link #arrayLength(int)} does not check its argument. * * @param index an index. * @return the length of the index-th array. */ private int length( final int index ) { final KEY_TYPE[][] array = this.array; final int delta = index % ratio; // The index into the p array, and the delta inside the block. long pos = p[ index / ratio ]; // The position into the array of the first entire word before the index-th. int length = readInt( array, pos ); if ( delta == 0 ) return length; // First of all, we recover the array length and the maximum amount of copied elements. int common; pos += count( length ) + length; length = readInt( array, pos ); common = readInt( array, pos + count( length ) ); for( int i = 0; i < delta - 1; i++ ) { pos += count( length ) + count( common ) + length; length = readInt( array, pos ); common = readInt( array, pos + count( length ) ); } return length + common; } /** Computes the length of the array at the given index. * * @param index an index. * @return the length of the index-th array. */ public int arrayLength( final int index ) { ensureRestrictedIndex( index ); return length( index ); } /** Extracts the array at the given index. * * @param index an index. * @param a the array that will store the result (we assume that it can hold the result). * @param offset an offset into a where elements will be store. * @param length a maximum number of elements to store in a. * @return the length of the extracted array. */ private int extract( final int index, final KEY_TYPE a[], final int offset, final int length ) { final int delta = index % ratio; // The delta inside the block. final long startPos = p[ index / ratio ]; // The position into the array of the first entire word before the index-th. long pos, prevArrayPos; int arrayLength = readInt( array, pos = startPos ), currLen = 0, actualCommon; if ( delta == 0 ) { pos = p[ index / ratio ] + count( arrayLength ); BIG_ARRAYS.copyFromBig( array, pos, a, offset, Math.min( length, arrayLength ) ); return arrayLength; } int common = 0; for( int i = 0; i < delta; i++ ) { prevArrayPos = pos + count( arrayLength ) + ( i != 0 ? count( common ) : 0 ); pos = prevArrayPos + arrayLength; arrayLength = readInt( array, pos ); common = readInt( array, pos + count( arrayLength ) ); actualCommon = Math.min( common, length ); if ( actualCommon <= currLen ) currLen = actualCommon; else { BIG_ARRAYS.copyFromBig( array, prevArrayPos, a, currLen + offset, actualCommon - currLen ); currLen = actualCommon; } } if ( currLen < length ) BIG_ARRAYS.copyFromBig( array, pos + count( arrayLength ) + count( common ), a, currLen + offset, Math.min( arrayLength, length - currLen ) ); return arrayLength + common; } public KEY_TYPE[] get( final int index ) { return getArray( index ); } /** * @see #get(int) */ public KEY_TYPE[] getArray( final int index ) { ensureRestrictedIndex( index ); final int length = length( index ); final KEY_TYPE a[] = new KEY_TYPE[ length ]; extract( index, a, 0, length ); return a; } /** Stores in the given array elements from an array stored in this front-coded list. * * @param index an index. * @param a the array that will store the result. * @param offset an offset into a where elements will be store. * @param length a maximum number of elements to store in a. * @return if a can hold the extracted elements, the number of extracted elements; * otherwise, the number of remaining elements with the sign changed. */ public int get( final int index, final KEY_TYPE[] a, final int offset, final int length ) { ensureRestrictedIndex( index ); ARRAYS.ensureOffsetLength( a, offset, length ); final int arrayLength = extract( index, a, offset, length ); if ( length >= arrayLength ) return arrayLength; return length - arrayLength; } /** Stores in the given array an array stored in this front-coded list. * * @param index an index. * @param a the array that will store the content of the result (we assume that it can hold the result). * @return if a can hold the extracted elements, the number of extracted elements; * otherwise, the number of remaining elements with the sign changed. */ public int get( final int index, final KEY_TYPE[] a ) { return get( index, a, 0, a.length ); } public int size() { return n; } public ObjectListIterator listIterator( final int start ) { ensureIndex( start ); return new AbstractObjectListIterator() { KEY_TYPE s[] = ARRAYS.EMPTY_ARRAY; int i = 0; long pos = 0; boolean inSync; // Whether the current value in a is the string just before the next to be produced. { if ( start != 0 ) { if ( start == n ) i = start; // If we start at the end, we do nothing. else { pos = p[ start / ratio ]; int j = start % ratio; i = start - j; while( j-- != 0 ) next(); } } } public boolean hasNext() { return i < n; } public boolean hasPrevious() { return i > 0; } public int previousIndex() { return i - 1; } public int nextIndex() { return i; } public KEY_TYPE[] next() { int length, common; if ( ! hasNext() ) throw new NoSuchElementException(); if ( i % ratio == 0 ) { pos = p[ i / ratio ]; length = readInt( array, pos ); s = ARRAYS.ensureCapacity( s, length, 0 ); BIG_ARRAYS.copyFromBig( array, pos + count( length ), s, 0, length ); pos += length + count( length ); inSync = true; } else { if ( inSync ) { length = readInt( array, pos ); common = readInt( array, pos + count( length ) ); s = ARRAYS.ensureCapacity( s, length + common, common ); BIG_ARRAYS.copyFromBig( array, pos + count( length ) + count ( common ), s, common, length ); pos += count( length ) + count( common ) + length; length += common; } else { s = ARRAYS.ensureCapacity( s, length = length( i ), 0 ); extract( i, s, 0, length ); } } i++; return ARRAYS.copy( s, 0, length ); } public KEY_TYPE[] previous() { if ( ! hasPrevious() ) throw new NoSuchElementException(); inSync = false; return getArray( --i ); } }; } /** Returns a copy of this list. * * @return a copy of this list. */ public ARRAY_FRONT_CODED_LIST clone() { return this; } public String toString() { final StringBuffer s = new StringBuffer(); s.append( "[ " ); for( int i = 0; i < n; i++ ) { if ( i != 0 ) s.append( ", " ); s.append( ARRAY_LIST.wrap( getArray( i ) ).toString() ); } s.append( " ]" ); return s.toString(); } /** Computes the pointer array using the currently set ratio, number of elements and underlying array. * * @return the computed pointer array. */ protected long[] rebuildPointerArray() { final long[] p = new long[ ( n + ratio - 1 ) / ratio ]; final KEY_TYPE a[][] = array; int length, count; long pos = 0; for( int i = 0, j = 0, skip = ratio - 1; i < n; i++ ) { length = readInt( a, pos ); count = count( length ); if ( ++skip == ratio ) { skip = 0; p[ j++ ] = pos; pos += count + length; } else pos += count + count( readInt( a, pos + count ) ) + length; } return p; } private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { s.defaultReadObject(); // Rebuild pointer array p = rebuildPointerArray(); } #ifdef TEST private static long seed = System.currentTimeMillis(); private static java.util.Random r = new java.util.Random( seed ); private static KEY_TYPE genKey() { #if #keyclass(Byte) || #keyclass(Short) || #keyclass(Character) return (KEY_TYPE)(r.nextInt()); #elif #keys(primitive) return r.NEXT_KEY(); #elif #keyclass(Object) return Integer.toBinaryString( r.nextInt() ); #else return new java.io.Serializable() {}; #endif } private static java.text.NumberFormat format = new java.text.DecimalFormat( "#,###.00" ); private static java.text.FieldPosition fp = new java.text.FieldPosition( 0 ); private static String format( double d ) { StringBuffer s = new StringBuffer(); return format.format( d, s, fp ).toString(); } private static void speedTest( int n, boolean comp ) { System.out.println( "There are presently no speed tests for this class." ); } private static void fatal( String msg ) { System.out.println( msg ); System.exit( 1 ); } private static void ensure( boolean cond, String msg ) { if ( cond ) return; fatal( msg ); } private static boolean contentEquals( java.util.List x, java.util.List y ) { if ( x.size() != y.size() ) return false; for( int i = 0; i < x.size(); i++ ) if ( ! java.util.Arrays.equals( (KEY_TYPE[])x.get( i ), (KEY_TYPE[])y.get( i ) ) ) return false; return true; } private static int l[]; private static KEY_TYPE[][] a; private static void test( int n ) { int c; l = new int[ n ]; a = new KEY_TYPE[n][]; for( int i = 0; i < n; i++ ) l[i] = (int)(Math.abs(r.nextGaussian())*32); for( int i = 0; i < n; i++ ) a[i] = new KEY_TYPE[l[i]]; for( int i = 0; i < n; i++ ) for( int j = 0; j < l[i]; j++ ) a[i][j] = genKey(); ARRAY_FRONT_CODED_LIST m = new ARRAY_FRONT_CODED_LIST( it.unimi.dsi.fastutil.objects.ObjectIterators.wrap( a ), r.nextInt( 4 ) + 1 ); it.unimi.dsi.fastutil.objects.ObjectArrayList t = new it.unimi.dsi.fastutil.objects.ObjectArrayList( a ); //System.out.println(m); //for( i = 0; i < t.size(); i++ ) System.out.println(ARRAY_LIST.wrap((KEY_TYPE[])t.get(i))); /* Now we check that m actually holds that data. */ ensure( contentEquals( m, t ), "Error (" + seed + "): m does not equal t at creation" ); /* Now we check cloning. */ ensure( contentEquals( m, (java.util.List)m.clone() ), "Error (" + seed + "): m does not equal m.clone()" ); /* Now we play with iterators. */ { ObjectListIterator i; java.util.ListIterator j; Object J; i = m.listIterator(); j = t.listIterator(); for( int k = 0; k < 2*n; k++ ) { ensure( i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext()" ); ensure( i.hasPrevious() == j.hasPrevious(), "Error (" + seed + "): divergence in hasPrevious()" ); if ( r.nextFloat() < .8 && i.hasNext() ) { ensure( java.util.Arrays.equals( (KEY_TYPE[])i.next(), (KEY_TYPE[])j.next() ), "Error (" + seed + "): divergence in next()" ); } else if ( r.nextFloat() < .2 && i.hasPrevious() ) { ensure( java.util.Arrays.equals( (KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous() ), "Error (" + seed + "): divergence in previous()" ); } ensure( i.nextIndex() == j.nextIndex(), "Error (" + seed + "): divergence in nextIndex()" ); ensure( i.previousIndex() == j.previousIndex(), "Error (" + seed + "): divergence in previousIndex()" ); } } { Object previous = null; Object I, J; int from = r.nextInt( m.size() +1 ); ObjectListIterator i; java.util.ListIterator j; i = m.listIterator( from ); j = t.listIterator( from ); for( int k = 0; k < 2*n; k++ ) { ensure( i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext() (iterator with starting point " + from + ")" ); ensure( i.hasPrevious() == j.hasPrevious() , "Error (" + seed + "): divergence in hasPrevious() (iterator with starting point " + from + ")" ); if ( r.nextFloat() < .8 && i.hasNext() ) { ensure( java.util.Arrays.equals( (KEY_TYPE[])i.next(), (KEY_TYPE[])j.next() ), "Error (" + seed + "): divergence in next() (iterator with starting point " + from + ")" ); //System.err.println("Done next " + I + " " + J + " " + badPrevious); } else if ( r.nextFloat() < .2 && i.hasPrevious() ) { ensure( java.util.Arrays.equals( (KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous() ), "Error (" + seed + "): divergence in previous() (iterator with starting point " + from + ")" ); } } } try { java.io.File ff = new java.io.File("it.unimi.dsi.fastutil.test"); java.io.OutputStream os = new java.io.FileOutputStream(ff); java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(os); oos.writeObject(m); oos.close(); java.io.InputStream is = new java.io.FileInputStream(ff); java.io.ObjectInputStream ois = new java.io.ObjectInputStream(is); m = (ARRAY_FRONT_CODED_LIST)ois.readObject(); ois.close(); ff.delete(); } catch(Exception e) { e.printStackTrace(); System.exit( 1 ); } ensure( contentEquals( m, t ), "Error (" + seed + "): m does not equal t after save/read" ); System.out.println("Test OK"); return; } public static void main( String args[] ) { int n = Integer.parseInt(args[1]); if ( args.length > 2 ) r = new java.util.Random( seed = Long.parseLong( args[ 2 ] ) ); try { if ("speedTest".equals(args[0]) || "speedComp".equals(args[0])) speedTest( n, "speedComp".equals(args[0]) ); else if ( "test".equals( args[0] ) ) test(n); } catch( Throwable e ) { e.printStackTrace( System.err ); System.err.println( "seed: " + seed ); } } #endif }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy