All Downloads are FREE. Search and download functionalities are using the official Maven repository.

drv.ArrayFrontCodedList.drv Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2002-2017 Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package PACKAGE;

import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.fastutil.longs.LongArrays;

import java.io.Serializable;
import java.util.Iterator;
import java.util.Collection;
import java.util.NoSuchElementException;
import java.util.RandomAccess;

/** Compact storage of lists of arrays using front coding.
 *
 * 

This class stores immutably a list of arrays in a single large array * using front coding (of course, the compression will be reasonable only if * the list is sorted lexicographically—see below). It implements an * immutable type-specific list that returns the i-th array when * calling {@link #get(int) get(i)}. The returned array may be * freely modified. * *

Front coding is based on the idea that if the i-th and the * (i+1)-th array have a common prefix, we might store the length * of the common prefix, and then the rest of the second array. * *

This approach, of course, requires that once in a while an array is * stored entirely. The ratio of a front-coded list defines how * often this happens (once every {@link #ratio()} arrays). A higher ratio * means more compression, but means also a longer access time, as more arrays * have to be probed to build the result. Note that we must build an array * every time {@link #get(int)} is called, but this class provides also methods * that extract one of the stored arrays in a given array, reducing garbage * collection. See the documentation of the family of get() * methods. * *

By setting the ratio to 1 we actually disable front coding: however, we * still have a data structure storing large list of arrays with a reduced * overhead (just one integer per array, plus the space required for lengths). * *

Note that the typical usage of front-coded lists is under the form of * serialized objects; usually, the data that has to be compacted is processed * offline, and the resulting structure is stored permanently. Since the * pointer array is not stored, the serialized format is very small. * *

Implementation Details

* *

All arrays are stored in a {@linkplain it.unimi.dsi.fastutil.BigArrays big array}. A separate array of pointers * indexes arrays whose position is a multiple of the ratio: thus, a higher ratio * means also less pointers. * *

More in detail, an array whose position is a multiple of the ratio is * stored as the array length, followed by the elements of the array. The array * length is coded by a simple variable-length list of k-1 bit * blocks, where k is the number of bits of the underlying primitive * type. All other arrays are stored as follows: let common the * length of the maximum common prefix between the array and its predecessor. * Then we store the array length decremented by common, followed * by common, followed by the array elements whose index is * greater than or equal to common. For instance, if we store * foo, foobar, football and * fool in a front-coded character-array list with ratio 3, the * character array will contain * *

 * 3 f o o 3 3 b a r 5 3 t b a l l 4 f o o l
 * 
*/ public class ARRAY_FRONT_CODED_LIST extends AbstractObjectList implements Serializable, Cloneable, RandomAccess { private static final long serialVersionUID = 1L; /** The number of arrays in the list. */ protected final int n; /** The ratio of this front-coded list. */ protected final int ratio; /** The big array containing the compressed arrays. */ protected final KEY_TYPE[][] array; /** The pointers to entire arrays in the list. */ protected transient long[] p; /** Creates a new front-coded list containing the arrays returned by the given iterator. * * @param arrays an iterator returning arrays. * @param ratio the desired ratio. */ public ARRAY_FRONT_CODED_LIST(final Iterator arrays, final int ratio) { if (ratio < 1) throw new IllegalArgumentException("Illegal ratio (" + ratio + ")"); KEY_TYPE[][] array = BIG_ARRAYS.EMPTY_BIG_ARRAY; long[] p = LongArrays.EMPTY_ARRAY; KEY_TYPE[][] a = new KEY_TYPE[2][]; long curSize = 0; int n = 0, b = 0, common, length, minLength; while(arrays.hasNext()) { a[b] = arrays.next(); length = a[b].length; if (n % ratio == 0) { p = LongArrays.grow(p, n / ratio + 1); p[n / ratio] = curSize; array = BIG_ARRAYS.grow(array, curSize + count(length) + length, curSize); curSize += writeInt(array, length, curSize); BIG_ARRAYS.copyToBig(a[b], 0, array, curSize, length); curSize += length; } else { minLength = a[1 - b].length; if (length < minLength) minLength = length; for(common = 0; common < minLength; common++) if (a[0][common] != a[1][common]) break; length -= common; array = BIG_ARRAYS.grow(array, curSize + count(length) + count(common) + length, curSize); curSize += writeInt(array, length, curSize); curSize += writeInt(array, common, curSize); BIG_ARRAYS.copyToBig(a[b], common, array, curSize, length); curSize += length; } b = 1 - b; n++; } this.n = n; this.ratio = ratio; this.array = BIG_ARRAYS.trim(array, curSize); this.p = LongArrays.trim(p, (n + ratio - 1) / ratio); } /** Creates a new front-coded list containing the arrays in the given collection. * * @param c a collection containing arrays. * @param ratio the desired ratio. */ public ARRAY_FRONT_CODED_LIST(final Collection c, final int ratio) { this(c.iterator(), ratio); } /* The following (rather messy) methods implements the encoding of arbitrary integers inside a big array. * Unfortunately, we have to specify different codes for almost every type. */ /** Reads a coded length. * @param a the data big array. * @param pos the starting position. * @return the length coded at pos. */ private static int readInt(final KEY_TYPE a[][], long pos) { #if KEY_CLASS_Integer return IntBigArrays.get(a, pos); #elif KEY_CLASS_Long return (int)LongBigArrays.get(a, pos); #elif KEY_CLASS_Character final char c0 = CharBigArrays.get(a, pos); return c0 < 0x8000 ? c0 : (c0 & 0x7FFF) << 16 | CharBigArrays.get(a, pos + 1); #elif KEY_CLASS_Short final short s0 = ShortBigArrays.get(a, pos); return s0 >= 0 ? s0 : s0 << 16 | (ShortBigArrays.get(a, pos + 1) & 0xFFFF); #else final byte b0 = ByteBigArrays.get(a, pos); if (b0 >= 0) return b0; final byte b1 = ByteBigArrays.get(a, pos + 1); if (b1 >= 0) return (- b0 - 1) << 7 | b1; final byte b2 = ByteBigArrays.get(a, pos + 2); if (b2 >= 0) return (- b0 - 1) << 14 | (- b1 - 1) << 7 | b2; final byte b3 = ByteBigArrays.get(a, pos + 3); if (b3 >= 0) return (- b0 - 1) << 21 | (- b1 - 1) << 14 | (- b2 - 1) << 7 | b3; return (- b0 - 1) << 28 | (- b1 - 1) << 21 | (- b2 - 1) << 14 | (- b3 - 1) << 7 | ByteBigArrays.get(a, pos + 4); #endif } /** Computes the number of elements coding a given length. * @param length the length to be coded. * @return the number of elements coding length. */ private static int count(final int length) { #if KEY_CLASS_Integer || KEY_CLASS_Long return 1; #elif KEY_CLASS_Character || KEY_CLASS_Short return length < (1 << 15) ? 1 : 2; #else if (length < (1 << 7)) return 1; if (length < (1 << 14)) return 2; if (length < (1 << 21)) return 3; if (length < (1 << 28)) return 4; return 5; #endif } /** Writes a length. * @param a the data array. * @param length the length to be written. * @param pos the starting position. * @return the number of elements coding length. */ private static int writeInt(final KEY_TYPE a[][], int length, long pos) { #if KEY_CLASS_Long LongBigArrays.set(a, pos, length); return 1; #elif KEY_CLASS_Integer IntBigArrays.set(a, pos, length); return 1; #elif KEY_CLASS_Character if (length < (1 << 15)) { CharBigArrays.set(a, pos, (char)length); return 1; } CharBigArrays.set(a, pos++, (char)(length >>> 16 | 0x8000)); CharBigArrays.set(a, pos, (char)(length & 0xFFFF)); return 2; #elif KEY_CLASS_Short if (length < (1 << 15)) { ShortBigArrays.set(a, pos, (short)length); return 1; } ShortBigArrays.set(a, pos++, (short)(- (length >>> 16) - 1)); ShortBigArrays.set(a, pos, (short)(length & 0xFFFF)); return 2; #else final int count = count(length); ByteBigArrays.set(a, pos + count - 1, (byte)(length & 0x7F)); if (count != 1) { int i = count - 1; while(i-- != 0) { length >>>= 7; ByteBigArrays.set(a, pos + i, (byte)(- (length & 0x7F) - 1)); } } return count; #endif } /** Returns the ratio of this list. * * @return the ratio of this list. */ public int ratio() { return ratio; } /** Computes the length of the array at the given index. * *

This private version of {@link #arrayLength(int)} does not check its argument. * * @param index an index. * @return the length of the index-th array. */ private int length(final int index) { final KEY_TYPE[][] array = this.array; final int delta = index % ratio; // The index into the p array, and the delta inside the block. long pos = p[index / ratio]; // The position into the array of the first entire word before the index-th. int length = readInt(array, pos); if (delta == 0) return length; // First of all, we recover the array length and the maximum amount of copied elements. int common; pos += count(length) + length; length = readInt(array, pos); common = readInt(array, pos + count(length)); for(int i = 0; i < delta - 1; i++) { pos += count(length) + count(common) + length; length = readInt(array, pos); common = readInt(array, pos + count(length)); } return length + common; } /** Computes the length of the array at the given index. * * @param index an index. * @return the length of the index-th array. */ public int arrayLength(final int index) { ensureRestrictedIndex(index); return length(index); } /** Extracts the array at the given index. * * @param index an index. * @param a the array that will store the result (we assume that it can hold the result). * @param offset an offset into a where elements will be store. * @param length a maximum number of elements to store in a. * @return the length of the extracted array. */ private int extract(final int index, final KEY_TYPE a[], final int offset, final int length) { final int delta = index % ratio; // The delta inside the block. final long startPos = p[index / ratio]; // The position into the array of the first entire word before the index-th. long pos, prevArrayPos; int arrayLength = readInt(array, pos = startPos), currLen = 0, actualCommon; if (delta == 0) { pos = p[index / ratio] + count(arrayLength); BIG_ARRAYS.copyFromBig(array, pos, a, offset, Math.min(length, arrayLength)); return arrayLength; } int common = 0; for(int i = 0; i < delta; i++) { prevArrayPos = pos + count(arrayLength) + (i != 0 ? count(common) : 0); pos = prevArrayPos + arrayLength; arrayLength = readInt(array, pos); common = readInt(array, pos + count(arrayLength)); actualCommon = Math.min(common, length); if (actualCommon <= currLen) currLen = actualCommon; else { BIG_ARRAYS.copyFromBig(array, prevArrayPos, a, currLen + offset, actualCommon - currLen); currLen = actualCommon; } } if (currLen < length) BIG_ARRAYS.copyFromBig(array, pos + count(arrayLength) + count(common), a, currLen + offset, Math.min(arrayLength, length - currLen)); return arrayLength + common; } /** {@inheritDoc} *

This implementation delegates to {@link #getArray(int)}. */ @Override public KEY_TYPE[] get(final int index) { return getArray(index); } /** Returns an array stored in this front-coded list. * * @param index an index. * @return the corresponding array stored in this front-coded list. */ public KEY_TYPE[] getArray(final int index) { ensureRestrictedIndex(index); final int length = length(index); final KEY_TYPE a[] = new KEY_TYPE[length]; extract(index, a, 0, length); return a; } /** Stores in the given array elements from an array stored in this front-coded list. * * @param index an index. * @param a the array that will store the result. * @param offset an offset into a where elements will be store. * @param length a maximum number of elements to store in a. * @return if a can hold the extracted elements, the number of extracted elements; * otherwise, the number of remaining elements with the sign changed. */ public int get(final int index, final KEY_TYPE[] a, final int offset, final int length) { ensureRestrictedIndex(index); ARRAYS.ensureOffsetLength(a, offset, length); final int arrayLength = extract(index, a, offset, length); if (length >= arrayLength) return arrayLength; return length - arrayLength; } /** Stores in the given array an array stored in this front-coded list. * * @param index an index. * @param a the array that will store the content of the result (we assume that it can hold the result). * @return if a can hold the extracted elements, the number of extracted elements; * otherwise, the number of remaining elements with the sign changed. */ public int get(final int index, final KEY_TYPE[] a) { return get(index, a, 0, a.length); } @Override public int size() { return n; } @Override public ObjectListIterator listIterator(final int start) { ensureIndex(start); return new AbstractObjectListIterator() { KEY_TYPE s[] = ARRAYS.EMPTY_ARRAY; int i = 0; long pos = 0; boolean inSync; // Whether the current value in a is the string just before the next to be produced. { if (start != 0) { if (start == n) i = start; // If we start at the end, we do nothing. else { pos = p[start / ratio]; int j = start % ratio; i = start - j; while(j-- != 0) next(); } } } @Override public boolean hasNext() { return i < n; } @Override public boolean hasPrevious() { return i > 0; } @Override public int previousIndex() { return i - 1; } @Override public int nextIndex() { return i; } @Override public KEY_TYPE[] next() { int length, common; if (! hasNext()) throw new NoSuchElementException(); if (i % ratio == 0) { pos = p[i / ratio]; length = readInt(array, pos); s = ARRAYS.ensureCapacity(s, length, 0); BIG_ARRAYS.copyFromBig(array, pos + count(length), s, 0, length); pos += length + count(length); inSync = true; } else { if (inSync) { length = readInt(array, pos); common = readInt(array, pos + count(length)); s = ARRAYS.ensureCapacity(s, length + common, common); BIG_ARRAYS.copyFromBig(array, pos + count(length) + count (common), s, common, length); pos += count(length) + count(common) + length; length += common; } else { s = ARRAYS.ensureCapacity(s, length = length(i), 0); extract(i, s, 0, length); } } i++; return ARRAYS.copy(s, 0, length); } @Override public KEY_TYPE[] previous() { if (! hasPrevious()) throw new NoSuchElementException(); inSync = false; return getArray(--i); } }; } /** Returns a copy of this list. * * @return a copy of this list. */ @Override public ARRAY_FRONT_CODED_LIST clone() { return this; } @Override public String toString() { final StringBuffer s = new StringBuffer(); s.append("["); for(int i = 0; i < n; i++) { if (i != 0) s.append(", "); s.append(ARRAY_LIST.wrap(getArray(i)).toString()); } s.append("]"); return s.toString(); } /** Computes the pointer array using the currently set ratio, number of elements and underlying array. * * @return the computed pointer array. */ protected long[] rebuildPointerArray() { final long[] p = new long[(n + ratio - 1) / ratio]; final KEY_TYPE a[][] = array; int length, count; long pos = 0; for(int i = 0, j = 0, skip = ratio - 1; i < n; i++) { length = readInt(a, pos); count = count(length); if (++skip == ratio) { skip = 0; p[j++] = pos; pos += count + length; } else pos += count + count(readInt(a, pos + count)) + length; } return p; } private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { s.defaultReadObject(); // Rebuild pointer array p = rebuildPointerArray(); } #ifdef TEST private static long seed = System.currentTimeMillis(); private static java.util.Random r = new java.util.Random(seed); private static KEY_TYPE genKey() { #if KEY_CLASS_Byte || KEY_CLASS_Short || KEY_CLASS_Character return (KEY_TYPE)(r.nextInt()); #elif KEYS_PRIMITIVE return r.NEXT_KEY(); #elif KEY_CLASS_Object return Integer.toBinaryString(r.nextInt()); #else return new java.io.Serializable() {}; #endif } private static java.text.NumberFormat format = new java.text.DecimalFormat("#,###.00"); private static java.text.FieldPosition fp = new java.text.FieldPosition(0); private static String format(double d) { StringBuffer s = new StringBuffer(); return format.format(d, s, fp).toString(); } private static void speedTest(int n, boolean comp) { System.out.println("There are presently no speed tests for this class."); } private static void fatal(String msg) { System.out.println(msg); System.exit(1); } private static void ensure(boolean cond, String msg) { if (cond) return; fatal(msg); } private static boolean contentEquals(java.util.List x, java.util.List y) { if (x.size() != y.size()) return false; for(int i = 0; i < x.size(); i++) if (! java.util.Arrays.equals((KEY_TYPE[])x.get(i), (KEY_TYPE[])y.get(i))) return false; return true; } private static int l[]; private static KEY_TYPE[][] a; private static void test(int n) { int c; l = new int[n]; a = new KEY_TYPE[n][]; for(int i = 0; i < n; i++) l[i] = (int)(Math.abs(r.nextGaussian())*32); for(int i = 0; i < n; i++) a[i] = new KEY_TYPE[l[i]]; for(int i = 0; i < n; i++) for(int j = 0; j < l[i]; j++) a[i][j] = genKey(); ARRAY_FRONT_CODED_LIST m = new ARRAY_FRONT_CODED_LIST(it.unimi.dsi.fastutil.objects.ObjectIterators.wrap(a), r.nextInt(4) + 1); it.unimi.dsi.fastutil.objects.ObjectArrayList t = new it.unimi.dsi.fastutil.objects.ObjectArrayList(a); //System.out.println(m); //for(i = 0; i < t.size(); i++) System.out.println(ARRAY_LIST.wrap((KEY_TYPE[])t.get(i))); /* Now we check that m actually holds that data. */ ensure(contentEquals(m, t), "Error (" + seed + "): m does not equal t at creation"); /* Now we check cloning. */ ensure(contentEquals(m, (java.util.List)m.clone()), "Error (" + seed + "): m does not equal m.clone()"); /* Now we play with iterators. */ { ObjectListIterator i; java.util.ListIterator j; Object J; i = m.listIterator(); j = t.listIterator(); for(int k = 0; k < 2*n; k++) { ensure(i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext()"); ensure(i.hasPrevious() == j.hasPrevious(), "Error (" + seed + "): divergence in hasPrevious()"); if (r.nextFloat() < .8 && i.hasNext()) { ensure(java.util.Arrays.equals((KEY_TYPE[])i.next(), (KEY_TYPE[])j.next()), "Error (" + seed + "): divergence in next()"); } else if (r.nextFloat() < .2 && i.hasPrevious()) { ensure(java.util.Arrays.equals((KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous()), "Error (" + seed + "): divergence in previous()"); } ensure(i.nextIndex() == j.nextIndex(), "Error (" + seed + "): divergence in nextIndex()"); ensure(i.previousIndex() == j.previousIndex(), "Error (" + seed + "): divergence in previousIndex()"); } } { Object previous = null; Object I, J; int from = r.nextInt(m.size() +1); ObjectListIterator i; java.util.ListIterator j; i = m.listIterator(from); j = t.listIterator(from); for(int k = 0; k < 2*n; k++) { ensure(i.hasNext() == j.hasNext(), "Error (" + seed + "): divergence in hasNext() (iterator with starting point " + from + ")"); ensure(i.hasPrevious() == j.hasPrevious() , "Error (" + seed + "): divergence in hasPrevious() (iterator with starting point " + from + ")"); if (r.nextFloat() < .8 && i.hasNext()) { ensure(java.util.Arrays.equals((KEY_TYPE[])i.next(), (KEY_TYPE[])j.next()), "Error (" + seed + "): divergence in next() (iterator with starting point " + from + ")"); //System.err.println("Done next " + I + " " + J + " " + badPrevious); } else if (r.nextFloat() < .2 && i.hasPrevious()) { ensure(java.util.Arrays.equals((KEY_TYPE[])i.previous(), (KEY_TYPE[])j.previous()), "Error (" + seed + "): divergence in previous() (iterator with starting point " + from + ")"); } } } try { java.io.File ff = new java.io.File("it.unimi.dsi.fastutil.test"); java.io.OutputStream os = new java.io.FileOutputStream(ff); java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(os); oos.writeObject(m); oos.close(); java.io.InputStream is = new java.io.FileInputStream(ff); java.io.ObjectInputStream ois = new java.io.ObjectInputStream(is); m = (ARRAY_FRONT_CODED_LIST)ois.readObject(); ois.close(); ff.delete(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } ensure(contentEquals(m, t), "Error (" + seed + "): m does not equal t after save/read"); System.out.println("Test OK"); return; } public static void main(String args[]) { int n = Integer.parseInt(args[1]); if (args.length > 2) r = new java.util.Random(seed = Long.parseLong(args[2])); try { if ("speedTest".equals(args[0]) || "speedComp".equals(args[0])) speedTest(n, "speedComp".equals(args[0])); else if ("test".equals(args[0])) test(n); } catch(Throwable e) { e.printStackTrace(System.err); System.err.println("seed: " + seed); } } #endif }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy