package it.unimi.dsi.sux4j.io;
/*
* Sux4J: Succinct data structures for Java
*
* Copyright (C) 2008-2019 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.commons.collections.Predicate;
import org.apache.commons.math3.random.RandomGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.longs.AbstractLongBigList;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongBigLists;
import it.unimi.dsi.fastutil.longs.LongIterable;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.io.SafelyCloseable;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.GOV3Function;
import it.unimi.dsi.sux4j.mph.GOV3Function.Builder;
import it.unimi.dsi.sux4j.mph.Hashes;
import it.unimi.dsi.util.XoRoShiRo128PlusRandomGenerator;
/** A temporary store of hash triples virtually divided into chunks.
*
 * <p>A chunked hash store accumulates elements (objects of type {@code T})
* by turning them into bit vectors (using a provided {@link TransformationStrategy})
* and then hashing such vectors into a triple of longs (i.e., overall we get a hash of 192 bits).
* Elements can be added {@linkplain #add(Object, long) one by one}
* or {@linkplain #addAll(Iterator, LongIterator) in batches}.
* Elements must be distinct, or, more precisely, they must be transformed into distinct bit vectors.
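 *
 * <p>A minimal usage sketch (assuming string keys and the {@code TransformationStrategies.utf16()}
 * strategy from the DSI Utilities; duplicate and error handling omitted):
 *
 * <pre>{@code
 * final ChunkedHashStore<String> store = new ChunkedHashStore<>(TransformationStrategies.utf16());
 * store.add("foo");
 * store.add("bar");
 * store.log2Chunks(4); // 16 chunks; keep the returned shift if you need it later
 * for (final ChunkedHashStore.Chunk chunk : store)
 *     for (final long[] quadruple : chunk) {
 *         // quadruple[0..2] is the hash triple, quadruple[3] the data (here, the rank)
 *     }
 * store.close();
 * }</pre>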
 *
 * <p>Besides the hashes, we store some data associated with each element:
 * if {@linkplain #add(Object) no data is specified}, we store the rank of each element added (the first element added has rank 0,
 * the second one has rank 1, and so on), unless you specified at {@linkplain #ChunkedHashStore(TransformationStrategy, File, int, ProgressLogger) construction time}
 * a nonzero hash width: in that case, the value stored by {@link #add(Object)} will be given by the lowest bits of the first hash of the triple
 * associated with the object (the hash width is the number of bits stored). This feature makes it possible, for example, to implement a static
 * {@linkplain Builder#dictionary(int) dictionary} using a {@link GOV3Function}.
 *
 * <p>Once all elements have been added, they can be gathered into chunks whose
 * tentative size can be set by calling {@link #log2Chunks(int)}. More precisely,
 * if the latter method is called with argument {@code k}, then each chunk
 * will be formed by grouping triples by the {@code k} most significant bits of their first hash.
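 * For example, after {@code log2Chunks(10)} triples are grouped by the 10 most significant
 * bits of their first hash into 2<sup>10</sup> = 1024 chunks, and the returned chunk shift is 64 - 10 = 54.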
 *
 * <p>To obtain triples, one calls {@link #iterator()}, which returns chunks one at a time (in their
 * natural order); triples within each chunk are returned by increasing hash. Actually, the iterator
 * provided by a chunk returns a quadruple whose last element is the data associated with the element
 * that generated the triple.
 *
 * <p>It is possible (albeit very unlikely) that different elements generate the same hash. This event is detected
 * during chunk iteration (not while accumulating hashes), and it will cause a {@link ChunkedHashStore.DuplicateException} to be thrown.
 * At that point, the caller must handle the exception by {@linkplain #reset(long) resetting the store} and trying again
 * from scratch. Note that after a few (say, three) exceptions you can safely assume that there are duplicate elements. If you
 * need to force a check on the whole store you can call {@link #check()}. If all your elements come from an {@link Iterable},
 * {@link #checkAndRetry(Iterable, LongIterable)} will try three times to build a checked chunked hash store.
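 *
 * <p>For example, a retry loop might look like the following sketch ({@code store}, {@code elements}
 * and a {@link RandomGenerator} {@code random} are assumed to be already set up):
 *
 * <pre>{@code
 * for (;;) {
 *     try {
 *         for (final ChunkedHashStore.Chunk chunk : store) {
 *             // ... build your structure from the chunk ...
 *         }
 *         break;
 *     } catch (final ChunkedHashStore.DuplicateException e) {
 *         store.reset(random.nextLong());
 *         store.addAll(elements.iterator());
 *     }
 * }
 * }</pre>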
 *
 * <p>Every {@link #reset(long)} changes the seed used by the store to generate triples. So, if this seed has to be
 * stored, this must happen after the last call to {@link #reset(long)}. To help track this fact, a call to
 * {@link #seed()} will lock the store; any further call to {@link #reset(long)} will throw an {@link IllegalStateException}.
 * In case the store needs to be reused, you can call {@link #clear()}, which will bring the store back to its after-creation state.
 *
 * <p>When you have finished using a chunked hash store, you should {@link #close()} it. This class implements
 * {@link SafelyCloseable}, and thus provides a safety-net finalizer.
 *
 * <h2>Filtering</h2>
 *
 * <p>You can at any time {@linkplain #filter(Predicate) set a predicate} that will filter the triples returned by the store.
*
 *
 * <h2>Computing frequencies</h2>
 *
 * <p>If you specify so {@linkplain #ChunkedHashStore(TransformationStrategy, File, int, ProgressLogger) at construction time},
 * a chunked hash store will compute for you a {@linkplain #value2FrequencyMap() map from values to their frequency}.
*
 * <h2>Implementation details</h2>
 *
 * <p>Internally, a chunked hash store has a notion of disk chunk: triples are stored on disk using a fixed number of bits.
 * Once the user chooses a chunk size, the store exhibits the data on disk by grouping disk chunks or splitting them
 * in a suitable way. This process is transparent to the user.
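 *
 * <p>For example, with {@link #LOG2_DISK_CHUNKS} equal to 8 (256 disk chunks), a call to
 * {@link #log2Chunks(int) log2Chunks(10)} splits each disk chunk into four chunks, whereas
 * {@link #log2Chunks(int) log2Chunks(6)} merges four disk chunks into each chunk.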
 *
 * <p>An instance of this class will save triples into {@link #DISK_CHUNKS} disk chunks. Triples have to
 * be loaded into memory only chunk by chunk, so that they can be sorted and tested for uniqueness. As long as
 * {@link #DISK_CHUNKS} is larger than eight, the store will need less than one bit per element of main
 * memory. {@link #DISK_CHUNKS} can be increased arbitrarily at compile time, but each store
 * will open {@link #DISK_CHUNKS} files at the same time. (For the same reason, it is
 * strongly suggested that you close your stores as soon as you do not need them.)
 *
 * <h2>Intended usage</h2>
 *
 * <p>Chunked hash stores should be built by classes that need to manipulate elements in chunks of approximately given
 * size without needing access to the elements themselves, but just to their triples, a typical
 * example being {@link GOV3Function}, which uses the triples to compute a 3-hyperedge. Once a chunked hash
 * store is built, it can be passed on to further substructures, greatly reducing the computation time (as the original
 * collection need not be scanned again).
 *
 * <p>To compute the chunk corresponding to a given element, use
 *
 * <pre>
 * final long[] h = new long[3];
 * Hashes.spooky4(transform.toBitVector(key), seed, h);
 * final int chunk = chunkShift == Long.SIZE ? 0 : (int)(h[0] >>> chunkShift);
 * </pre>
 *
 * where {@code seed} is the store seed, and {@code chunkShift}
 * is the return value of {@link #log2Chunks(int)} and should be stored by the caller. Note
 * that you do not need the chunked hash store to compute these data, but just its seed and the chunk shift.
*
* @author Sebastiano Vigna
* @since 1.0.4
* @deprecated Please use a {@link BucketedHashStore}, which provides accurate bucket sizing.
*/
@Deprecated
public class ChunkedHashStore<T> implements Serializable, SafelyCloseable, Iterable<ChunkedHashStore.Chunk> {
public static final long serialVersionUID = 1L;
private static final Logger LOGGER = LoggerFactory.getLogger(ChunkedHashStore.class);
private static final boolean DEBUG = false;
/** Denotes that the chunked hash store contains a duplicate hash triple. */
public static class DuplicateException extends RuntimeException {
private static final long serialVersionUID = 1L;
}
/** The size of the output buffers. */
public final static int BUFFER_SIZE = 16 * 1024;
/** The logarithm of the number of physical disk chunks. */
public final static int LOG2_DISK_CHUNKS = 8;
/** The number of physical disk chunks. */
public final static int DISK_CHUNKS = 1 << LOG2_DISK_CHUNKS;
/** The shift for physical disk chunks. */
public final static int DISK_CHUNKS_SHIFT = Long.SIZE - LOG2_DISK_CHUNKS;
/** The number of elements ever {@linkplain #add(Object) added}. */
protected long size;
/** The number of elements that pass the current filter, or -1 if it must be recomputed. */
protected long filteredSize;
/** The seed used to generate the hash triples. */
protected long seed;
/** The number of triples in each disk chunk. */
private int[] count;
/** The number of chunks. */
private long chunks;
/** The files containing disk chunks. */
private File file[];
/** The number of disk chunks making up a chunk, or 1 if a chunk is smaller than or equal to a disk chunk. */
private int diskChunkStep;
/** The shift to be applied to the first hash to obtain the chunk index, set by {@link #log2Chunks(int)} (watch out: it can be {@link Long#SIZE}). */
private int chunkShift;
/** If true, this store has been checked for duplicates. */
private boolean checkedForDuplicates;
/** The transformation strategy provided at construction time. */
private final TransformationStrategy<? super T> transform;
/** A progress logger. */
private final ProgressLogger pl;
/** If nonzero, no associated data is saved in the store: {@link Chunk#data(long)} will return the first of the three hashes associated with the key, masked by this value. */
private final long hashMask;
/** The temporary directory for this chunked hash store, or {@code null}. */
private final File tempDir;
/** The file channels for the disk chunks. */
private WritableByteChannel[] writableByteChannel;
/** The byte buffers for the disk chunks. */
private ByteBuffer[] byteBuffer;
/** The number of disk chunks divided by {@link #diskChunkStep}. */
private int virtualDiskChunks;
/** If not {@code null}, a filter that will be used to select triples. */
private Predicate filter;
/** Whether this store is locked. Any attempt to {@link #reset(long)} the store will cause an {@link IllegalStateException} if this variable is true.*/
private boolean locked;
/** Whether this store has already been closed. */
private boolean closed;
/** The wall time spent during quicksort. */
private long quickSortWallTime;
/** The optional map from values to count. */
private Long2LongOpenHashMap value2FrequencyMap;
/** Creates a chunked hash store with given transformation strategy.
*
* @param transform a transformation strategy for the elements.
* @throws IOException
*/
public ChunkedHashStore(final TransformationStrategy<? super T> transform) throws IOException {
this(transform, null, null);
}
/** Creates a chunked hash store with given transformation strategy and temporary file directory.
*
* @param transform a transformation strategy for the elements.
* @param tempDir a temporary directory for the store files, or {@code null} for the current directory.
*/
public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir) throws IOException {
this(transform, tempDir, null);
}
/** Creates a chunked hash store with given transformation strategy.
*
* @param transform a transformation strategy for the elements.
* @param pl a progress logger, or {@code null}.
*/
public ChunkedHashStore(final TransformationStrategy<? super T> transform, final ProgressLogger pl) throws IOException {
this(transform, null, pl);
}
/** Creates a chunked hash store with given transformation strategy and progress logger.
*
* @param transform a transformation strategy for the elements.
* @param tempDir a temporary directory for the store files, or {@code null} for the current directory.
* @param pl a progress logger, or {@code null}.
*/
public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir, final ProgressLogger pl) throws IOException {
this(transform, tempDir, 0, pl);
}
/** Creates a chunked hash store with given transformation strategy, hash width and progress logger.
*
* @param transform a transformation strategy for the elements.
* @param tempDir a temporary directory for the store files, or {@code null} for the current directory.
 * @param hashWidthOrCountValues if positive, no associated data is saved in the store: {@link Chunk#data(long)} will return this many lower bits
 * of the first of the three hashes associated with the key; if zero, values are stored; if negative, values are stored and a map from values
 * to their frequency is computed.
 * @param pl a progress logger, or {@code null}.
 */
public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir, final int hashWidthOrCountValues, final ProgressLogger pl) throws IOException {
this.transform = transform;
this.pl = pl;
this.tempDir = tempDir;
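// A nonzero hash mask selects the lowest hashWidthOrCountValues bits of the first hash as the data returned by Chunk.data(long).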
this.hashMask = hashWidthOrCountValues <= 0 ? 0 : -1L >>> Long.SIZE - hashWidthOrCountValues;
if (hashWidthOrCountValues < 0) value2FrequencyMap = new Long2LongOpenHashMap();
file = new File[DISK_CHUNKS];
writableByteChannel = new WritableByteChannel[DISK_CHUNKS];
byteBuffer = new ByteBuffer[DISK_CHUNKS];
// Create disk chunks
for(int i = 0; i < DISK_CHUNKS; i++) {
byteBuffer[i] = ByteBuffer.allocateDirect(BUFFER_SIZE).order(ByteOrder.nativeOrder());
writableByteChannel[i] = new FileOutputStream(file[i] = File.createTempFile(ChunkedHashStore.class.getSimpleName(), String.valueOf(i), tempDir)).getChannel();
file[i].deleteOnExit();
}
count = new int[DISK_CHUNKS];
}
/** Returns the current seed of this chunked hash store. After calling this method, no {@link #reset(long)} will be allowed (unless the store
* is {@linkplain #clear() cleared}).
*
* @return the current seed of this chunked hash store.
*/
public long seed() {
locked = true;
return seed;
}
/** Returns the temporary directory of this chunked hash store, or {@code null}.
*
* @return the temporary directory of this chunked hash store, or {@code null}.
*/
public File tempDir() {
return tempDir;
}
/** Returns the transformation strategy provided at construction time.
 * @return the transformation strategy provided at construction time. */
public TransformationStrategy<? super T> transform() {
return transform;
}
/** Adds an element to this store, associating it with a specified value.
*
* @param o the element to be added.
* @param value the associated value.
*/
public void add(final T o, final long value) throws IOException {
final long[] triple = new long[3];
Hashes.spooky4(transform.toBitVector(o), seed, triple);
add(triple, value);
}
/** Adds an element to this store, associating it with its ordinal position.
*
* @param o the element to be added.
*/
public void add(final T o) throws IOException {
add(o, filteredSize);
}
/** Adds a triple to this store.
*
* @param triple the triple to be added.
* @param value the associated value.
*/
private void add(final long[] triple, final long value) throws IOException {
final int chunk = (int)(triple[0] >>> DISK_CHUNKS_SHIFT);
count[chunk]++;
checkedForDuplicates = false;
if (DEBUG) System.err.println("Adding " + Arrays.toString(triple));
writeLong(triple[0], byteBuffer[chunk], writableByteChannel[chunk]);
writeLong(triple[1], byteBuffer[chunk], writableByteChannel[chunk]);
writeLong(triple[2], byteBuffer[chunk], writableByteChannel[chunk]);
if (hashMask == 0) writeLong(value, byteBuffer[chunk], writableByteChannel[chunk]);
if (filteredSize != -1 && (filter == null || filter.evaluate(triple))) filteredSize++;
if (value2FrequencyMap != null) value2FrequencyMap.addTo(value, 1);
size++;
}
/** Adds the elements returned by an iterator to this store, associating them with specified values,
* possibly building the associated value frequency map.
*
* @param elements an iterator returning elements.
* @param values an iterator on values parallel to {@code elements}.
* @param requiresValue2CountMap whether to build the value frequency map (associating with each value its frequency).
*/
public void addAll(final Iterator<? extends T> elements, final LongIterator values, final boolean requiresValue2CountMap) throws IOException {
if (pl != null) {
pl.expectedUpdates = -1;
pl.start("Adding elements...");
}
final long[] triple = new long[3];
while(elements.hasNext()) {
Hashes.spooky4(transform.toBitVector(elements.next()), seed, triple);
add(triple, values != null ? values.nextLong() : filteredSize);
if (pl != null) pl.lightUpdate();
}
if (values != null && values.hasNext()) throw new IllegalStateException("The iterator on values contains more entries than the iterator on keys");
if (pl != null) pl.done();
}
/** Adds the elements returned by an iterator to this store, associating them with specified values.
*
* @param elements an iterator returning elements.
* @param values an iterator on values parallel to {@code elements}.
*/
public void addAll(final Iterator<? extends T> elements, final LongIterator values) throws IOException {
addAll(elements, values, false);
}
/** Adds the elements returned by an iterator to this store, associating them with their ordinal position.
*
* @param elements an iterator returning elements.
*/
public void addAll(final Iterator<? extends T> elements) throws IOException {
addAll(elements, null);
}
private void flushAll() throws IOException {
for(int i = 0; i < DISK_CHUNKS; i++) flush(byteBuffer[i], writableByteChannel[i]);
}
/** Returns the size of this store. Note that if you set up
 * a {@linkplain #filter(Predicate) filter}, the first call to
 * this method will require a scan of the whole store.
*
* @return the number of (possibly filtered) triples of this store.
*/
public long size() throws IOException {
if (filter == null) return size;
if (filteredSize == - 1) {
long c = 0;
final long[] triple = new long[3];
final ByteBuffer iteratorByteBuffer = ByteBuffer.allocateDirect(BUFFER_SIZE).order(ByteOrder.nativeOrder());
for(int i = 0; i < DISK_CHUNKS; i++) {
if (filter == null) c += count[i];
else {
flushAll();
@SuppressWarnings("resource")
final ReadableByteChannel channel = new FileInputStream(file[i]).getChannel();
iteratorByteBuffer.clear().flip();
for(int j = 0; j < count[i]; j++) {
triple[0] = readLong(iteratorByteBuffer, channel);
triple[1] = readLong(iteratorByteBuffer, channel);
triple[2] = readLong(iteratorByteBuffer, channel);
if (hashMask == 0) readLong(iteratorByteBuffer, channel);
if (filter.evaluate(triple)) c++;
}
channel.close();
}
}
filteredSize = c;
}
return filteredSize;
}
/** Clears this store. After a call to this method, the store can be reused. */
public void clear() throws IOException {
locked = false;
if (value2FrequencyMap != null) value2FrequencyMap = new Long2LongOpenHashMap();
reset(0);
}
/** Returns the current value frequency map.
*
* @return the current value frequency map.
* @throws IllegalStateException if this chunked hash store does not contain a value frequency map.
*/
public Long2LongOpenHashMap value2FrequencyMap() {
if (value2FrequencyMap == null) throw new IllegalStateException("This chunked hash store does not contain a value frequency map");
return value2FrequencyMap;
}
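/** Writes a long to the given buffer, first flushing the buffer to the given channel if it is full. */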
private static void writeLong(final long value, final ByteBuffer byteBuffer, final WritableByteChannel channel) throws IOException {
if (!byteBuffer.hasRemaining()) flush(byteBuffer, channel);
byteBuffer.putLong(value);
}
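/** Flips the given buffer, writes its content to the given channel, and clears it for reuse. */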
private static void flush(final ByteBuffer buffer, final WritableByteChannel channel) throws IOException {
buffer.flip();
channel.write(buffer);
buffer.clear();
}
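/** Reads a long from the given buffer, refilling the buffer from the given channel when it is empty.
 *
 * @throws EOFException if the channel is exhausted.
 */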
private static long readLong(final ByteBuffer byteBuffer, final ReadableByteChannel channel) throws IOException {
if (! byteBuffer.hasRemaining()) {
byteBuffer.clear();
final int result = channel.read(byteBuffer);
assert result != 0;
if (result == -1) throw new EOFException();
byteBuffer.flip();
}
return byteBuffer.getLong();
}
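/** Returns a read-only channel exposing the concatenation of the given channels, in order.
 * Closing the returned channel closes all the given channels. */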
private static ReadableByteChannel concatenate(final ReadableByteChannel[] channel) {
return new ReadableByteChannel() {
private int curr = 0;
private boolean closed;
@Override
public boolean isOpen() {
return ! closed;
}
@Override
public void close() throws IOException {
if (closed) return;
closed = true;
for(final ReadableByteChannel c: channel) c.close();
}
@Override
public int read(final ByteBuffer dst) throws IOException {
if (!dst.hasRemaining()) return 0;
for(;;) {
if (curr == channel.length) return -1;
final int result = channel[curr].read(dst);
if (result >= 0) return result;
if (result == -1) curr++;
}
}
};
}
@Override
protected void finalize() throws Throwable {
try {
if (! closed) {
LOGGER.warn("This " + this.getClass().getName() + " [" + toString() + "] should have been closed.");
close();
}
}
finally {
super.finalize();
}
}
/** Closes this store, disposing all associated resources. */
@Override
public void close() throws IOException {
if (! closed) {
LOGGER.debug("Wall clock for quicksort: " + Util.format(quickSortWallTime / 1E9) + "s");
closed = true;
for(final WritableByteChannel channel: writableByteChannel) channel.close();
for(final File f: file) f.delete();
}
}
/** Resets this store using a new seed. All accumulated data are cleared, and a new seed is reinstated.
*
* @param seed the new seed.
* @throws IllegalStateException if this store was locked by a call to {@link #seed()}, and never {@linkplain #clear() cleared} thereafter.
*/
public void reset(final long seed) throws IOException {
if (locked) throw new IllegalStateException();
if (DEBUG) System.err.println("RESET(" + seed + ")");
filteredSize = 0;
this.seed = seed;
checkedForDuplicates = false;
Arrays.fill(count, 0);
for (int i = 0; i < DISK_CHUNKS; i++) {
writableByteChannel[i].close();
byteBuffer[i].clear();
writableByteChannel[i] = new FileOutputStream(file[i]).getChannel();
}
}
/** Checks that this store has no duplicate triples, throwing an exception otherwise.
*
* @throws DuplicateException if this store contains duplicate triples.
*/
public void check() throws DuplicateException {
for(final ChunkedHashStore.Chunk b: this) b.iterator();
}
/** Checks that this store has no duplicate triples, and tries to rebuild the store if it does.
 *
 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
 * @param values the values that will be associated with the elements returned by {@code iterable}.
 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
 */
public void checkAndRetry(final Iterable<? extends T> iterable, final LongIterable values) throws IOException {
final RandomGenerator random = new XoRoShiRo128PlusRandomGenerator();
int duplicates = 0;
for(;;)
try {
check();
break;
}
catch (final DuplicateException e) {
if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates");
LOGGER.warn("Found duplicate. Recomputing triples...");
reset(random.nextLong());
addAll(iterable.iterator(), values.iterator());
}
checkedForDuplicates = true;
}
/** Checks that this store has no duplicate triples, and tries to rebuild the store if it does.
 *
 * <p><strong>Warning</strong>: the actions are executed exactly in the specified order: first
 * check, then retry. If you invoke this method on an empty store you'll get a checked empty store.
 *
 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
 */
public void checkAndRetry(final Iterable<? extends T> iterable) throws IOException {
checkAndRetry(iterable, null);
}
/** Generates a list of signatures using the lowest bits of the first hash in this store.
 *
 * <p>For this method to work, this store must contain ranks.
 *
 * @param signatureWidth the width in bits of the signatures.
 * @param pl a progress logger.
 * @return a big list of signatures indexed by rank.
 */
public LongBigList signatures(final int signatureWidth, final ProgressLogger pl) throws IOException {
final LongBigList signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth);
final long signatureMask = -1L >>> Long.SIZE - signatureWidth;
signatures.size(size());
pl.expectedUpdates = size();
pl.itemsName = "signatures";
pl.start("Signing...");
for (final ChunkedHashStore.Chunk chunk : this) {
final Iterator<long[]> chunkIterator = chunk.iterator();
for(int i = chunk.size(); i-- != 0;) {
final long[] quadruple = chunkIterator.next();
signatures.set(quadruple[3], signatureMask & quadruple[0]);
pl.lightUpdate();
}
}
pl.done();
return signatures;
}
/** Sets the number of chunks.
 *
 * <p>Once the store is filled, you must call this method to set the number of chunks. The store will take
 * care of merging or fragmenting disk chunks to get exactly the desired number of chunks.
 *
 * @param log2chunks the base-2 logarithm of the number of chunks.
 * @return the shift to be applied to the first hash of a triple to get the chunk number (see the {@linkplain ChunkedHashStore introduction}).
 */
public int log2Chunks(final int log2chunks) {
this.chunks = 1 << log2chunks;
diskChunkStep = (int)Math.max(DISK_CHUNKS / chunks, 1);
virtualDiskChunks = DISK_CHUNKS / diskChunkStep;
if (DEBUG) {
System.err.print("Chunk sizes: ");
final double avg = filteredSize / (double)DISK_CHUNKS;
double var = 0;
for(int i = 0; i < DISK_CHUNKS; i++) {
System.err.print(i + ":" + count[i] + " ");
var += (count[i] - avg) * (count[i] - avg);
}
System.err.println();
System.err.println("Average: " + avg);
System.err.println("Variance: " + var / filteredSize);
}
chunkShift = Long.SIZE - log2chunks;
LOGGER.debug("Number of chunks: " + chunks);
LOGGER.debug("Number of disk chunks: " + DISK_CHUNKS);
LOGGER.debug("Number of virtual disk chunks: " + virtualDiskChunks);
return chunkShift;
}
/** A chunk returned by a {@link ChunkedHashStore}. */
public final static class Chunk implements Iterable<long[]> {
/** The index of this chunk (the ordinal position in the chunk enumeration). */
private final int index;
/** The start position of this chunk in the parallel arrays {@link #buffer0}, {@link #buffer1}, {@link #buffer2}, and {@link #data}. */
private final int start;
/** The final position (excluded) of this chunk in the parallel arrays {@link #buffer0}, {@link #buffer1}, {@link #buffer2}, and {@link #data}. */
private final int end;
private final long[] buffer0;
private final long[] buffer1;
private final long[] buffer2;
private final long[] data;
private final long hashMask;
private Chunk(final int index, final long[] buffer0, final long[] buffer1, final long[] buffer2, final long[] data, final long hashMask, final int start, final int end) {
this.index = index;
this.start = start;
this.end = end;
this.data = data;
this.hashMask = hashMask;
this.buffer0 = buffer0;
this.buffer1 = buffer1;
this.buffer2 = buffer2;
}
/** Copy constructor for multi-threaded chunk analysis.
*
* @param chunk a chunk to be copied.
*/
public Chunk(final Chunk chunk) {
index = chunk.index;
hashMask = chunk.hashMask;
start = 0;
end = chunk.end - chunk.start;
buffer0 = Arrays.copyOfRange(chunk.buffer0, chunk.start, chunk.end);
buffer1 = Arrays.copyOfRange(chunk.buffer1, chunk.start, chunk.end);
buffer2 = Arrays.copyOfRange(chunk.buffer2, chunk.start, chunk.end);
data = chunk.data == null ? null : Arrays.copyOfRange(chunk.data, chunk.start, chunk.end);
}
/** Creates a chunk with all fields set to zero or {@code null}. Mainly useful to create marker objects. */
public Chunk() {
this.index = 0;
this.start = 0;
this.end = 0;
this.data = null;
this.hashMask = 0;
this.buffer0 = null;
this.buffer1 = null;
this.buffer2 = null;
}
/** The number of triples in this chunk.
*
* @return the number of triples in this chunk.
*/
public int size() {
return end - start;
}
/** The index of this chunk.
*
* @return the index of this chunk.
*/
public int index() {
return index;
}
/** Returns the data of the {@code k}-th triple returned by this chunk.
 *
 * <p>This method provides an alternative random access to data (w.r.t. indexing the fourth element of the
 * quadruples returned by {@link #iterator()}).
*
* @param k the index (in iteration order) of a triple.
* @return the corresponding data.
*/
public long data(final long k) {
return data != null ? data[(int)(start + k)] : (buffer0[(int)(start + k)] & hashMask);
}
/** Returns an iterator over the quadruples associated with this chunk; the returned array of longs is reused at each call.
*
* @return an iterator over quadruples formed by a triple (indices 0, 1, 2) and the associated data (index 3).
*/
@Override
public Iterator<long[]> iterator() {
return new ObjectIterator<long[]>() {
private int pos = start;
private final long[] quadruple = new long[4];
@Override
public boolean hasNext() {
return pos < end;
}
@Override
public long[] next() {
if (! hasNext()) throw new NoSuchElementException();
final long[] quadruple = this.quadruple;
quadruple[0] = buffer0[pos];
quadruple[1] = buffer1[pos];
quadruple[2] = buffer2[pos];
quadruple[3] = data != null ? data[pos] : buffer0[pos] & hashMask;
pos++;
return quadruple;
}
};
}
/** Commodity method that exposes transparently either the data contained in the chunk,
* or the data obtained by using the chunk to index a list.
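 *
 * <p>For example (a sketch: {@code chunk} is a {@link Chunk} containing ranks as data, and {@code values}
 * a {@link LongList} of values indexed by those ranks):
 *
 * <pre>{@code
 * final LongBigList list = chunk.valueList(values);
 * // now list.getLong(i) == values.getLong(chunk.data(i))
 * }</pre>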
*
* @param values a list of values. Must be either an instance of {@link LongList}, or an instance of {@link LongBigList}. If it
* is not {@code null}, the data in the chunk is used to index this list and return a value. Otherwise, the data in the chunk
* is returned directly.
* @return a big list of longs representing the values associated with each element in the chunk.
*/
public LongBigList valueList(final LongIterable values) {
return new AbstractLongBigList() {
private final LongBigList valueList = values == null ? null : (values instanceof LongList ? LongBigLists.asBigList((LongList)values) : (LongBigList)values);
@Override
public long size64() {
return Chunk.this.size();
}
@Override
public long getLong(final long index) {
return valueList == null ? Chunk.this.data(index) : valueList.getLong(Chunk.this.data(index));
}
};
}
}
/** Sets a filter for this store.
*
* @param filter a predicate that will be used to filter triples.
*/
public void filter(final Predicate filter) {
this.filter = filter;
filteredSize = -1;
}
/** Returns an iterator over the chunks of this chunked hash store.
 *
 * <p>Note that at each iteration part of the state of this chunked hash store
 * is reused. Thus, after each call to {@code next()} the previously returned
 * {@link Chunk} will no longer be valid. Please use the provided
 * {@linkplain Chunk#Chunk(Chunk) copy constructor} if you need to process
 * several chunks in parallel.
*
* @return an iterator over the chunks of this chunked hash store.
*/
@Override
public Iterator<Chunk> iterator() {
if (closed) throw new IllegalStateException("This " + getClass().getSimpleName() + " has been closed");
try {
flushAll();
}
catch (final IOException e) {
throw new RuntimeException(e);
}
int m = 0;
for(int i = 0; i < virtualDiskChunks; i++) {
int s = 0;
for(int j = 0; j < diskChunkStep; j++) s += count[i * diskChunkStep + j];
if (s > m) m = s;
}
final int maxCount = m;
return new ObjectIterator<Chunk>() {
private int chunk;
private ReadableByteChannel channel;
private final ByteBuffer iteratorByteBuffer = ByteBuffer.allocateDirect(BUFFER_SIZE).order(ByteOrder.nativeOrder());
private int last;
private int chunkSize;
private final long[] buffer0 = new long[maxCount];
private final long[] buffer1 = new long[maxCount];
private final long[] buffer2 = new long[maxCount];
private final long[] data = hashMask != 0 ? null : new long[maxCount];
@Override
public boolean hasNext() {
return chunk < chunks;
}
@Override
public Chunk next() {
if (! hasNext()) throw new NoSuchElementException();
final long[] buffer0 = this.buffer0;
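// Each virtual disk chunk provides chunks / virtualDiskChunks consecutive chunks:
// when we cross a boundary, load and sort the next virtual disk chunk.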
if (chunk % (chunks / virtualDiskChunks) == 0) {
final int diskChunk = (int)(chunk / (chunks / virtualDiskChunks));
final long[] buffer1 = this.buffer1, buffer2 = this.buffer2;
chunkSize = 0;
try {
if (diskChunkStep == 1) {
channel = new FileInputStream(file[diskChunk]).getChannel();
chunkSize = count[diskChunk];
}
else {
final ReadableByteChannel[] subChannel = new ReadableByteChannel[diskChunkStep];
for(int i = 0; i < diskChunkStep; i++) {
subChannel[i] = new FileInputStream(file[diskChunk * diskChunkStep + i]).getChannel();
chunkSize += count[diskChunk * diskChunkStep + i];
}
channel = concatenate(subChannel);
}
iteratorByteBuffer.clear().flip();
final long triple[] = new long[3];
int count = 0;
for(int j = 0; j < chunkSize; j++) {
triple[0] = readLong(iteratorByteBuffer, channel);
triple[1] = readLong(iteratorByteBuffer, channel);
triple[2] = readLong(iteratorByteBuffer, channel);
if (DEBUG) System.err.println("From disk: " + Arrays.toString(triple));
if (filter == null || filter.evaluate(triple)) {
buffer0[count] = triple[0];
buffer1[count] = triple[1];
buffer2[count] = triple[2];
if (hashMask == 0) data[count] = readLong(iteratorByteBuffer, channel);
count++;
}
else if (hashMask == 0) readLong(iteratorByteBuffer, channel); // Discard data
}
chunkSize = count;
channel.close();
}
catch (final IOException e) {
throw new RuntimeException(e);
}
final long start = System.nanoTime();
it.unimi.dsi.fastutil.Arrays.parallelQuickSort(0, chunkSize, (x, y) -> {
int t = Long.compareUnsigned(buffer0[x], buffer0[y]);
if (t != 0) return t;
t = Long.compareUnsigned(buffer1[x], buffer1[y]);
if (t != 0) return t;
return Long.compareUnsigned(buffer2[x], buffer2[y]);
},
(x, y) -> {
final long e0 = buffer0[x], e1 = buffer1[x], e2 = buffer2[x];
buffer0[x] = buffer0[y];
buffer1[x] = buffer1[y];
buffer2[x] = buffer2[y];
buffer0[y] = e0;
buffer1[y] = e1;
buffer2[y] = e2;
if (hashMask == 0) {
final long v = data[x];
data[x] = data[y];
data[y] = v;
}
});
quickSortWallTime += System.nanoTime() - start;
if (DEBUG) {
for(int i = 0; i < chunkSize; i++) System.err.println(buffer0[i] + ", " + buffer1[i] + ", " + buffer2[i]);
}
last = 0;
}
final int start = last;
if (chunkShift < Long.SIZE) {
// Exponential search for the next chunk
int incr;
for(incr = 1; last + incr < chunkSize && buffer0[last + incr] >>> chunkShift == chunk; incr <<= 1);
int to = Math.min(chunkSize, last + incr);
last += incr >>> 1;
while(last < to) {
final int mid = (last + to) >>> 1;
if (buffer0[mid] >>> chunkShift == chunk) last = mid + 1;
else to = mid;
}
}
else last = chunkSize;
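// The triples are sorted, so duplicates, if any, are adjacent.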
if (!checkedForDuplicates && start < last)
for (int i = start + 1; i < last; i++)
if (buffer0[i - 1] == buffer0[i] && buffer1[i - 1] == buffer1[i] && buffer2[i - 1] == buffer2[i])
throw new DuplicateException();
if (chunk == chunks - 1 && last == chunkSize) checkedForDuplicates = true;
return new Chunk(chunk++, buffer0, buffer1, buffer2, data, hashMask, start, last);
}
};
}
}