com.bigdata.rdf.internal.HashCollisionUtility

package com.bigdata.rdf.internal;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TimeZone;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import java.util.zip.Deflater;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.RDFParserFactory;
import org.openrdf.rio.RDFParserRegistry;
import org.openrdf.rio.helpers.RDFHandlerBase;

import com.bigdata.Banner;
import com.bigdata.btree.BTree;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KV;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.SuccessorUtil;
import com.bigdata.btree.raba.codec.CanonicalHuffmanRabaCoder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoder;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.compression.RecordCompressor;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.Journal;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.internal.impl.literal.PartlyInlineTypedLiteralIV;
import com.bigdata.rdf.internal.impl.uri.PartlyInlineURIIV;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.vocab.BaseVocabulary;
import com.bigdata.rwstore.sector.IMemoryManager;
import com.bigdata.rwstore.sector.MemoryManager;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.concurrent.Latch;

/**
 * Utility class to parse some RDF resource(s) and count hash collisions using a
 * variety of hash codes.
 * 
 * TODO Various data sets:
 * 
 * 
 * /nas/data/lubm/U1/data/University0/
 * /nas/data/bsbm/bsbm_2785/dataset.nt.gz
 * /nas/data/bsbm/bsbm_566496/dataset.nt.gz
 * /data/bsbm3_200m_1MSplits
 * 
 * 8B triple bioinformatics data set.
 * 
 * BTC data (some very large literals)
 * 
 * 
 *
 * TODO Order preserving hash codes could be interesting here. Look at 32 and 64
 * bit variants of the math and at generalized order preserving hash codes. With
 * order preserving hash codes, it makes sense to insert all Unicode terms into
 * TERM2ID such that we have a total order there.
 *
 * TODO Benchmark the load time with different hash codes. The cost of the hash
 * computation and the randomness of the distribution will both play a role. The
 * B+Tree will need to be set up with a sufficient [writeRetentionQueue] and we
 * will need to specify [-server -Xmx1G].
 *
 * SHA-256 - no collisions on BSBM 200M. 30G file. time?
 *
 * 32-bit hash codes. #collisions=1544132 Elapsed: 16656445ms Journal size:
 * 23841341440 bytes (23G)
 *
 * Now limiting the size of values in a leaf and also increasing the branching
 * factor to 512 (was 32). [The current run is scanning after the initial
 * insert, which involves a little wasted effort. It was also without the
 * -server -Xmx2g and write retention queue parameters. Finally, it was
 * serializing BigdataValue objects, including their IV, rather than RDF Value
 * objects. The code has since been modified to serialize just the BigdataValue.
 * Also, I've since raised the initial extent from 10M to 200M.]
 * maxCollisions=3, Elapsed: 22579073ms Journal size: 35270950912 bytes
 *
 * Now buffering 100k values at a time: 2x faster.
 *
 * U1:
 * Elapsed: 23379ms
 * NumStatements: 1000313
 * NumDistinctVals: 291259
 * TotalKeyBytes: 1747554
 * TotalValBytes: 60824514
 * MaxCollisions: 1
 * TotalCollisions: 6
 * Journal size: 209715200 bytes
 * name	m	height	nnodes	nleaves	nodeBytes	leafBytes	totalBytes	avgNodeBytes	avgLeafBytes	minNodeBytes	maxNodeBytes	minLeafBytes	maxLeafBytes
 * lex	1024	1	1	474	7913	3662623	3670536	7913	7727	7913	7913	5786	13784
 * 
 *
 * Using only a byte (versus a short) counter in the key. Oddly, this has no
 * impact on the average leaf size. That suggests that the keys in the leaves
 * are very sparse in terms of the hash code space, such that prefix compression
 * is not really doing that much for us. (A key-construction sketch follows the
 * listing below.)
 *
 * Elapsed: 23235ms
 * NumStatements: 1000313
 * NumDistinctVals: 291259
 * TotalKeyBytes: 1456295
 * TotalValBytes: 60824514
 * MaxCollisions: 1
 * TotalCollisions: 6
 * Journal size: 209715200 bytes
 * name	m	height	nnodes	nleaves	nodeBytes	leafBytes	totalBytes	avgNodeBytes	avgLeafBytes	minNodeBytes	maxNodeBytes	minLeafBytes	maxLeafBytes
 * lex	1024	1	1	474	7913	3371370	3379283	7913	7112	7913	7913	5274	12774
 * 
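 * As a concrete reference for the hash-code-plus-counter keys discussed above,
 * a minimal sketch using the IKeyBuilder API that this class already uses (the
 * variable names are illustrative, not fields of this class):
 *
 * <pre>
 * // 32-bit hash code prefix followed by a one byte tie-breaking counter.
 * final IKeyBuilder keyBuilder = KeyBuilder.newInstance();
 * final byte[] key = keyBuilder.reset()
 *         .append(value.hashCode())      // 4 bytes of hash code
 *         .appendSigned((byte) counter)  // 1 byte collision counter
 *         .getKey();
 * </pre>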
 *
 * BSBM 200M: This is the best time and space so far, using a byte counter
 * rather than a short.
 *
 * Elapsed: 16338357ms
 * NumStatements: 198808848
 * NumDistinctVals: 45647082
 * TotalKeyBytes: 228235410
 * TotalValBytes: 11292849582
 * MaxCollisions: 3
 * TotalCollisions: 244042
 * Journal size: 16591683584 bytes
 * 
 *
 * BSBM 200M: Note: I restarted this run after terminating YourKit, so the
 * results should be valid (right?). The main changes are to use stringValue()
 * to test for dateTime and to use the canonical huffman coder for the leaf
 * keys.
 *
 * Elapsed: 20148506ms
 * NumStatements: 198808848
 * NumDistinctVals: 45647082
 * TotalKeyBytes: 228235410
 * TotalValBytes: 11292849582
 * MaxCollisions: 3
 * TotalCollisions: 244042
 * Journal size: 16591683584 bytes
 * 
 *
 * BSBM 200M: raw records are compressed if they are over 64 bytes long.
 *
 * Elapsed: 18757003ms
 * NumStatements: 198808848
 * NumDistinctVals: 45647082
 * TotalKeyBytes: 228235410
 * TotalValBytes: 7910596818
 * MaxCollisions: 3
 * TotalCollisions: 244042
 * Journal size: 12270108672 bytes
 * 
 *
 * BSBM 200M: literals with labels LT 64 bytes are assumed inlined into the
 * statement indices (except datatype URIs).
 *
 * Elapsed: 16193915ms
 * NumStatements: 198808848
 * NumDistinctVals: 43273381
 * NumShortLiterals: 2723662
 * TotalKeyBytes: 216366905
 * TotalValBytes: 7807037644
 * MaxCollisions: 3
 * TotalCollisions: 219542
 * Journal size: 11083186176 bytes
 * 
 *
 * BSBM 200M: URIs with localNames LT 64 bytes are assumed inlined into the
 * statement indices (plus datatype literals LT 64 bytes). (The inlining check
 * is sketched after the listing below.)
 *
 * Elapsed: 5699248ms
 * NumStatements: 198808848
 * NumDistinctVals: 12198222
 * NumShortLiterals: 32779032
 * NumShortURIs: 493520581
 * TotalKeyBytes: 60991110
 * TotalValBytes: 4944223808
 * MaxCollisions: 2
 * TotalCollisions: 17264
 * Journal size: 7320764416 bytes
 * 
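 * The inlining decisions above are made by the LexiconConfiguration. A minimal
 * sketch of the per-Value check performed by the statement handler in this
 * class (the branch comments are illustrative):
 *
 * <pre>
 * final IV iv = conf.createInlineIV(value); // null iff the Value can not be inlined
 * if (iv != null) {
 *     // Short enough: the IV is cached on the BigdataValue and the Value is
 *     // represented inline in the statement indices (never hits the TERMS index).
 * } else {
 *     // Too large to inline: buffer the Value for batch insertion into the
 *     // TERMS index keyed by hashCode + collision counter.
 * }
 * </pre>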
 *
 * BSBM 200M: one parser thread and one indexer thread.
 *
 * Elapsed: 3724415ms
 * NumStatements: 198808848
 * NumDistinctVals: 12198222
 * NumShortLiterals: 32779032
 * NumShortBNodes: 0
 * NumShortURIs: 493520581
 * TotalKeyBytes: 60991110
 * TotalValBytes: 4944223808
 * MaxCollisions: 2
 * TotalCollisions: 17264
 * Journal size: 7320764416 bytes
 * 
 *
 * GC overhead problem trying to run multiple parsers against BSBM 200M when
 * split into 200 files.
 *
 * valBufSize := 10000
 *     valQueueCapacity = 100
 *     maxDrain := 50
 *     nparserThreads := 2
 *     parserWorkQueue := 1000
 * 
 *
 * BSBM 200M - this is 3x longer. This run did not have the GC overhead problem,
 * but GC had frequent 10% spikes, which is a lot in comparison to our best run.
 *
 *     valBufSize := 1000
 *     valQueueCapacity = 10
 *     maxDrain := 5
 *     nparserThreads := 4
 *     parserWorkQueue := 100
 * 
 * Elapsed: 9589520ms
 * NumStatements: 198805837
 * NumDistinctVals: 12202052
 * NumShortLiterals: 32776100
 * NumShortBNodes: 0
 * NumShortURIs: 493514954
 * TotalKeyBytes: 61010260
 * TotalValBytes: 4945278396
 * MaxCollisions: 2
 * TotalCollisions: 17260
 * Journal size: 7320764416 bytes
 * 
 *
 * BSBM 200M: split into 200 files. 69m versus the best time so far of 62m.
 * There is only one thread in the pool, but the caller-runs policy means that
 * we are actually running two parsers. So this is not really the same as the
 * best run, which was one parser running in the main thread with the indexer
 * running in another thread. (The pool arrangement is sketched after the
 * results listing below.)
 *
 * 	   valBufSize := 10000
 *     valQueueCapacity = 10
 *     maxDrain := 5
 *     nparserThreads := 1
 *     parserWorkQueue := 100
 * 
 * Elapsed: 4119775ms
 * NumStatements: 198805837
 * NumDistinctVals: 12202052
 * NumShortLiterals: 32776100
 * NumShortBNodes: 0
 * NumShortURIs: 493514954
 * TotalKeyBytes: 61010260
 * TotalValBytes: 4945278396
 * MaxCollisions: 2
 * TotalCollisions: 17260
 * Journal size: 7320764416 bytes
 * 
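 * A minimal sketch of the parser pool arrangement referenced above, mirroring
 * the constructor of this class: the caller-runs policy makes the submitting
 * thread act as one additional parser whenever the bounded work queue is full.
 *
 * <pre>
 * final BlockingQueue workQueue = new LinkedBlockingQueue(parserWorkQueueCapacity);
 * final ExecutorService parserService = new ThreadPoolExecutor(
 *         nparserThreads - 1, nparserThreads - 1, // core and maximum pool size
 *         60, TimeUnit.SECONDS,                   // keep-alive for idle threads
 *         workQueue,
 *         new ThreadPoolExecutor.CallerRunsPolicy());
 * </pre>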
 *
 * BSBM 200M with 1M statement splits, using the memory manager to buffer the
 * data on the native heap. This is the best score so far (compare with
 * 3724415ms with one parser and one indexer thread). For some reason, the #of
 * distinct values and literals is slightly different for these two runs. One
 * other change in this run is that we always gzip the record, since we can not
 * deserialize the record unless we know in advance whether or not it is
 * compressed. Previous runs had conditionally compressed based on the original
 * byte[] value length and stored the compressed record iff it was shorter.
 * However, we can only conditionally compress if we use a header or bit flag to
 * indicate that the record is compressed (see the sketch after the listing
 * below). Peak memory manager use was 262M.
 *
 * Elapsed: 2863898ms
 * NumStatements: 198805837
 * NumDistinctVals: 12202052
 * NumShortLiterals: 61100900
 * NumShortBNodes: 0
 * NumShortURIs: 493514954
 * TotalKeyBytes: 61010260
 * TotalValBytes: 4945376779
 * MaxCollisions: 2
 * TotalCollisions: 17260
 * Journal size: 7320764416 bytes
 * 
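 * A sketch of the conditional compression that the paragraph above rules out
 * without a header: prefix each record with a one byte flag so that a reader
 * can tell whether the payload is deflated. The helper below is hypothetical
 * and is not part of this class:
 *
 * <pre>
 * // 0 = stored as-is, 1 = deflated; keep the compressed form only if smaller.
 * static byte[] encodeRecord(final byte[] raw, final RecordCompressor compressor,
 *         final DataOutputBuffer out) {
 *     out.reset();
 *     compressor.compress(raw, out);
 *     final byte[] deflated = out.toByteArray();
 *     final byte[] payload = (deflated.length < raw.length) ? deflated : raw;
 *     final byte[] record = new byte[payload.length + 1];
 *     record[0] = (byte) ((payload == deflated) ? 1 : 0);
 *     System.arraycopy(payload, 0, record, 1, payload.length);
 *     return record;
 * }
 * </pre>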
 *
 * BSBM 200M using memory manager (high tide of 351M) and 5 parser threads (plus
 * the main thread). Heap usage is pretty controlled.
 *
 * Elapsed: 2803451ms
 * NumStatements: 198805837
 * NumDistinctVals: 12202052
 * NumShortLiterals: 61100900
 * NumShortBNodes: 0
 * NumShortURIs: 493514954
 * TotalKeyBytes: 61010260
 * TotalValBytes: 4945376779
 * MaxCollisions: 2
 * TotalCollisions: 17260
 * Journal size: 7320764416 bytes
 * 
 *
 * BSBM 200M. Using memory manager and only one parser thread. This does run
 * significantly slower (55m versus 47m with two parser threads). It might not
 * be slower if we also ran against the single source file (this ran against the
 * split files) since each chunk placed onto the queue would then be full, but I
 * doubt that this will make that much difference.
 *
 * Elapsed: 3300871ms
 * NumStatements: 198805837
 * NumDistinctVals: 12049125
 * NumShortLiterals: 61100900
 * NumShortBNodes: 0
 * NumShortURIs: 493514954
 * TotalKeyBytes: 60245625
 * TotalValBytes: 4877760110
 * MaxCollisions: 2
 * TotalCollisions: 16840
 * Journal size: 7320764416 bytes
 * 
 *
 * BSBM 200M. Using memory manager, one parser thread (the caller), and a single
 * source file. The question is whether we do better with a statement handler
 * that is only flushed incrementally (when full) compared to using 2 parsers
 * and flushing each time we reach the end of a 1M statement source file. Nope.
 * This was 77 minutes. (This was a fair comparison since the source files for
 * the split sources are compressed. So we really do better with two parsers and
 * split files.)
 *
 * /allocationCount=0
 * /bufferCapacity=1000
 * /bufferCount=232
 * /extent=243269632
 * /slotBytes=0
 * /userBytes=0
 * Elapsed: 4605950ms
 * NumStatements: 198808848
 * NumDistinctVals: 12198222
 * NumShortLiterals: 61103832
 * NumShortBNodes: 0
 * NumShortURIs: 493520581
 * TotalKeyBytes: 60991110
 * TotalValBytes: 4944322031
 * MaxCollisions: 2
 * TotalCollisions: 17264
 * Journal size: 7320764416 bytes
 * 
 *
 * TODO Try with only N bytes worth of the SHA hash code, leaving some bits left
 * over for partitioning URIs, Literals, and BNodes (for told bnode mode) and
 * for a counter to break ties when there is a hash collision. We should wind up
 * with an 8-12 byte termId which is collision proof and very well distributed.
 *
 * TODO Add bit flags at the front for {BLOB, URI, Literal, BNode} (BLOB being
 * the odd one out). If we move BLOBs out of the key range of other plain
 * literals, or literals of a given language code or datatype, then we can not
 * do an ordered scan of the literals anymore which is inclusive of the blobs.
 * There is a similar consequence of moving small literals into the statement
 * index.
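 *
 * A hypothetical key layout for the two TODO items above (one type-flag byte,
 * a truncated SHA-256 digest, and a one byte collision counter); none of this
 * is implemented by the class below:
 *
 * <pre>
 * final MessageDigest d = MessageDigest.getInstance("SHA-256");
 * final byte[] digest = d.digest(serializedValue);  // 32 bytes
 * final byte[] key = keyBuilder.reset()
 *         .appendSigned(typeFlags)                  // URI / Literal / BNode / BLOB bits
 *         .append(Arrays.copyOf(digest, 8))         // 8 of the 32 digest bytes
 *         .appendSigned((byte) counter)             // tie-breaker on collision
 *         .getKey();
 * </pre>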

 * If we inline small Unicode values (LT 32 bytes) and reserve the TERM2ID index
 * for large(r) values, then we can approach a situation in which it serves
 * solely for blobs, but with a tradeoff in size (of the statement indices)
 * versus indirection.

* Large value promotion does not really let us handle large blobs * (multi-megabytes) in s/o as a 50 4M blobs would fill up a shard. There, I * think that we need to give the control over to the application and require it * to write on a shared resource (shared file system, S3, etc). The value * inserted into the index would then be just the pathname in the shared file * system or the URL of the S3 resource. This breaks the ACID decision boundary * though as the application has no means available to atomically decide that * the resource does not exist and hence create it. Even using a conditional * E-Tag on S3 would not work since it would have to have an index over the S3 * entities to detect a write-write conflict for the same data under different * URLs. * * @author thompsonbry */ public class HashCollisionUtility { private final static Logger log = Logger .getLogger(HashCollisionUtility.class); /** * An index mapping hashCode(Value)+counter : Value. This * provides a dictionary for RDF {@link Value}s encountered when loading * {@link Statement}s into the database. The counter provides a simple * mechanism for reconciling hash collisions. */ private final BTree termsIndex; private final LexiconConfiguration conf; private final BigdataValueFactory vf; /** * Counters for things that we track. */ private static class Counters { /** * #of statements visited. */ private final AtomicLong nstmts = new AtomicLong(); /** * The #of {@link URI}s whose localName was short enough * that we decided to inline them into the statement indices instead. */ private final AtomicLong nshortURIs = new AtomicLong(); /** * The #of {@link BNode}s whose ID was short enough that we * decided to inline them into the statement indices instead (this also * counts blank nodes which are inlined because they have integer or * UUID IDs). */ private final AtomicLong nshortBNodes = new AtomicLong(); /** * The #of {@link Literal}s which were short enough that we decided to * inline them into the statement indices instead. */ private final AtomicLong nshortLiterals = new AtomicLong(); // private final ConcurrentWeakValueCacheWithBatchedUpdates valueCache; /** * The size of the hash collision set for the RDF Value with the most * hash collisions observed to date. */ private final AtomicLong maxCollisions = new AtomicLong(); /** * The total #of hash collisions. */ private final AtomicLong totalCollisions = new AtomicLong(); // /** // * The #of RDF {@link Value}s which were found in the {@link // #valueCache}, // * thereby avoiding a lookup against the index. // */ // private final AtomicLong ncached = new AtomicLong(); /** * The #of distinct RDF {@link Value}s inserted into the index. */ private final AtomicLong ninserted = new AtomicLong(); /** The total #of bytes in the generated B+Tree keys (leaves only). */ private final AtomicLong totalKeyBytes = new AtomicLong(); /** The total #of bytes in the serialized RDF Values. 
*/ private final AtomicLong totalValBytes = new AtomicLong(); } // class Counters //// private interface IHashCode { //// void hashCode(IKeyBuilder keyBuilder,Object o); //// } // // private static class Int32HashCode { //implements IHashCode { // // public void hashCode(IKeyBuilder keyBuilder, Object o) { // // keyBuilder.append(o.hashCode()); // // } // // } // // private static class MessageDigestHashCode { //implements IHashCode { // // final MessageDigest d; // // MessageDigestHashCode() throws NoSuchAlgorithmException { // // d = MessageDigest.getInstance("SHA-256"); // 256 bits (32 bytes) // // } // // public void hashCode(IKeyBuilder keyBuilder, final byte[] b) { // // d.reset(); // d.digest(b); // keyBuilder.append(d.digest()); // // } // // } /** * Lock used to coordinate {@link #shutdown()} and the {@link #valueQueue}. */ private final ReentrantLock lock = new ReentrantLock(); /** * Latch which is incremented as we accept files to parse and decremented * once a parser begins to parse that file. */ private final Latch parserQueueLatch = new Latch(lock); /** * Latch which is incremented once we begin to parse a file and decremented * as the parser task completes. */ private final Latch parserRunLatch = new Latch(lock); /** * Thread pool used to run the parser. */ private final ExecutorService parserService; /** * Thread pool used to run the parser and indexer. */ private final ExecutorService indexerService; /** * Class hooks the runnable to provide reporting on the outcome of the * {@link FutureTask}. */ private class ReportingFutureTask extends FutureTask { public final File file; public ReportingFutureTask(final File file, Callable callable) { super(callable); this.file = file; parserQueueLatch.inc(); } public void run() { try { parserRunLatch.inc(); parserQueueLatch.dec(); super.run(); parserRunLatch.dec(); } finally { report(this); } } /** * Callback is invoked when a {@link ParseFileTask} completes. * * @param task * The future for that task. */ protected void report(final ReportingFutureTask task) { try { task.get(); if (log.isDebugEnabled()) log.debug("Finished parsing: " + task.file + ", queueLatch=" + parserQueueLatch + ", runLatch=" + parserRunLatch); } catch (ExecutionException ex) { log.error(ex, ex); } catch (InterruptedException e) { // propagate the interrupt. Thread.currentThread().interrupt(); } } } /** * A {@link Bucket} has an unsigned byte[] key and an unordered * list of long addrs for byte[] values. * {@link Bucket} implements {@link Comparable} can can be used to place an * array of {@link Bucket}s into ascending key order. * * TODO This is space efficient for large {@link Value}s, but it would not * be efficient for storing binding sets which hash to the same key. In the * case of binding sets, the binding sets are normally small. An extensible * hash table would conserve space by dynamically determining the #of hash * bits in the address, and hence mapping the records onto a smaller #of * pages. */ static private class Bucket implements Comparable { /** * The unsigned byte[] key. */ public final byte[] key; /** * The list of addresses for this bucket. * * TODO Collisions in a bucket are very rare given an int32 hash code, * so this should be optimized for the common case with a single * address. 
*/ public final List addrs = new LinkedList(); public Bucket(final byte[] key) { if(key == null) throw new IllegalArgumentException(); this.key = key; } public Bucket(final byte[] key,final long addr) { this(key); addrs.add(addr); } /** * Add an address to this bucket. * * @param addr * The address. */ public void add(final long addr) { addrs.add(addr); } /** * Order {@link Bucket}s into ascending unsigned byte[] key * order. */ public int compareTo(final Bucket o) { return BytesUtil.compareBytes(key, o.key); } } /** * A chunk of RDF {@link Value}s from the parser which are ready to be * inserted into the TERMS index. */ static private class ValueBuffer { /** * The allocation contexts which can be released once these data have * been processed. */ private final Set contexts = new LinkedHashSet(); /** * The #of distinct records in the addrMap (this is more than the map * size if there are hash collisions since some buckets will have more * than one entry). */ private final int nvalues; /** * A map from the unsigned byte[] keys to the collision * bucket containing the address of each record for a given * unsigned byte[] key. */ private final Map addrMap; /** * * @param contexts * The allocation contexts for the records in the addrMap. * @param nvalues * The #of distinct records in the addrMap (this is more than * the map size if there are hash collisions since some * buckets will have more than one entry). * @param addrMap * A map from the unsigned byte[] keys to the * collision bucket containing the address of each record for * a given unsigned byte[] key. */ public ValueBuffer(final List contexts, final int nvalues, final Map addrMap) { if (contexts == null) throw new IllegalArgumentException(); if (addrMap == null) throw new IllegalArgumentException(); this.contexts.addAll(contexts); this.nvalues = nvalues; this.addrMap = addrMap; } /** * Clear the address map and the {@link IMemoryManager} allocation * contexts against which the data were stored. */ public void clear() { addrMap.clear(); for(IMemoryManager context : contexts) { context.clear(); } } public long getUserBytes() { long nbytes = 0L; for(IMemoryManager context : contexts) { nbytes += context.getUserBytes(); } return nbytes; } } // class ValueBuffer /** * Queue used to hand off {@link ValueBuffer}s from the parser to the * indexer. */ private BlockingQueue valueQueue; /** * Counters for things that we track. */ private final Counters c = new Counters(); /** * The upper bound on the size of a {@link ValueBuffer} chunk (currently in * slotBytes for the allocations against the {@link MemoryManager}). *

* The size of the chunks, the capacity of the queue, and the number of * chunks that may be combined into a single chunk for the indexer may all * be used to adjust the parallelism and efficiency of the parsing and * indexing. You have to be careful not to let too much data onto the Java * heap, but the indexer will do better when it is given a bigger chunk * since it can order the data and be more efficient in the index updates. */ final int valBufSize = Bytes.megabyte32 * 10;// 100000; /** Capacity of the {@link #valueQueue}. */ final int valQueueCapacity = 10; /** * Maximum #of chunks to drain from the {@link #valueQueue} in one go. This * bounds the largest chunk that we will index at one go. You can remove the * limit by specifying {@link Integer#MAX_VALUE}. */ final int maxDrain = 5; /** * The size of the read buffer when reading a file. */ final int fileBufSize = 1024 * 8;// default 8k /** * How many parser threads to use. There can be only one parser per file, * but you can parse more than one file at a time. */ final int nparserThreads = 1; /** * The size of the work queue for the {@link #parserService}. *

* Note: This should be large enough that we will not wait around forever if * the caller is forced to parse a file rather than scan the file system for * the next file to be parsed. This hack is introduced by the need to handle * a {@link RejectedExecutionException} from the {@link #parserService}. We * do that by forcing the parse task to run in the caller's thread. Another * choice would be for the caller to catch the * {@link RejectedExecutionException}, wait a bit, and then retry. */ final int parserWorkQueueCapacity = 100; /** * A direct memory heap used to buffer RDF {@link Value}s which will be * inserted into the TERMS index. A distinct child {@link IMemoryManager} * context is created by the {@link StatementHandler} each time it needs to * buffer data. The {@link StatementHandler} monitors the size of the * allocation context to decide when it is "big enough" to be transferred * onto the {@link #valueQueue}. The indexer eventually obtains the context * from the {@link #valueQueue}. Once the indexer is done with a context, it * {@link IMemoryManager#clear() clears} the context. The total memory * across the allocation contexts is released back to the * {@link DirectBufferPool} in {@link #shutdown()} and * {@link #shutdownNow()} and no later than when the {@link #mmgr} is * finalized. */ final MemoryManager mmgr; /** * The #of buffers to give to the {@link MemoryManager}. */ private final int nbuffers = 1000; private HashCollisionUtility(final Journal jnl) { this.termsIndex = getTermsIndex(jnl); /* * Setup the parser thread pool. If there is an attempt to run more * threads then * * Note: The pool size is one less than the total #of specified threads * since the caller will wind up running tasks rejected by the pool. If * the pool would be empty then it is [null] and the caller will run * the parser in its own thread. * * Note: The work queue is bounded so that we do not read any too far in * the file system. The #of threads is bounded so that we do not run too * many parsers at once. However, running multiple parsers can increase * throughput as the parser itself caps out at ~ 68k tps. */ if (nparserThreads > 1) { // this.parserService = // Executors.newFixedThreadPool(nparserThreads); final int corePoolSize = nparserThreads-1; final int maximumPoolSize = nparserThreads-1; final long keepAliveTime = 60; final TimeUnit unit = TimeUnit.SECONDS; final BlockingQueue workQueue = new LinkedBlockingQueue( parserWorkQueueCapacity); // final BlockingQueue workQueue = new SynchronousQueue(); this.parserService = new ThreadPoolExecutor(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, new ThreadPoolExecutor.CallerRunsPolicy() ); } else { /* * The caller must run the parser in its own thread. */ this.parserService = null; } // But they all feed the same indexer. this.indexerService = Executors.newSingleThreadExecutor(); // *blocking* queue of ValueBuffers to be indexed this.valueQueue = new LinkedBlockingQueue( valQueueCapacity);// lock); vf = BigdataValueFactoryImpl.getInstance("test"); final BaseVocabulary vocab; try { vocab = (BaseVocabulary) Class.forName( AbstractTripleStore.Options.DEFAULT_VOCABULARY_CLASS) .getDeclaredConstructor(String.class) .newInstance(vf.getNamespace()); vocab.init(); } catch (Exception e) { throw new RuntimeException(e); } // factory does not support any extensions. 
final IExtensionFactory xFactory = new IExtensionFactory() { @Override public void init(final IDatatypeURIResolver resolver, final ILexiconConfiguration config) { // NOP } @Override @SuppressWarnings("rawtypes") public Iterator> getExtensions() { return Collections.emptyIterator(); } }; final InlineURIFactory uriFactory = new InlineURIFactory(); uriFactory.init(vocab); /* * Note: This inlines everything *except* xsd:dateTime, which * substantially reduces the data we will put into the index. * * @todo Do a special IExtension implementation to handle xsd:dateTime * since the DateTimeExtension uses the LexiconRelation to do its work. */ conf = new LexiconConfiguration( 256, // blobsThreshold true, // inlineXSDDatatypeLiterals true, // inlineTextLiterals 64, // maxInlineStringLength true, // inlineBNodes false, // inlineDateTimes TimeZone.getDefault(), // inlineDateTimesTimeZone false, // rejectInvalidXSDValues xFactory, // extension factory vocab, // predefined vocabulary vf, uriFactory, false, // GeoSpatial support null // GeoSpatial config string ); // valueCache = new ConcurrentWeakValueCacheWithBatchedUpdates( // 50000 // hard reference queue capacity // ); mmgr = new MemoryManager(DirectBufferPool.INSTANCE, nbuffers); } /** * Start the task which will index data as it is parsed. */ public void start() { lock.lock(); try { if (indexerTask != null) throw new IllegalStateException(); // start indexer. indexerTask = new FutureTask(new IndexerMainTask()); indexerService.submit(indexerTask); // allow parsers to run. parsing.set(true); } finally { lock.unlock(); } } /** * Future for the task which drains the {@link #valueQueue} and indexes * the {@link ValueBuffer}s drained from that queue. */ private FutureTask indexerTask; /** Flag is true while parsers are still running. */ private final AtomicBoolean parsing = new AtomicBoolean(false); /** * Poison pill used to indicate that no more objects will be placed onto the * {@link #valueQueue}. */ private final ValueBuffer poisonPill = new ValueBuffer( new LinkedList(), 0, new LinkedHashMap()); /** * Normal shutdown. Running parsers will complete and their data will be * indexed, but new parsers will not start. This method will block until * all data has been indexed. * * @throws Exception */ public void shutdown() throws Exception { log.debug("shutting down..."); lock.lock(); try { if (log.isDebugEnabled()) log.debug("Waiting on parserQueueLatch: " + parserQueueLatch); parserQueueLatch.await(); if (parserService != null) { // no new parsers may start parserService.shutdown(); } if (log.isDebugEnabled()) log.debug("Waiting on parserRunLatch: " + parserRunLatch); parserRunLatch.await(); // no parsers should be running. parsing.set(false); // drop a poison pill on the queue. log.debug("Inserting poison pill."); valueQueue.put(poisonPill); if (indexerTask != null) { // wait for the indexer to finish. indexerTask.get(); } if (indexerService != null) indexerService.shutdown(); if (mmgr != null) { if (log.isInfoEnabled()) log.info(mmgr.getCounters().toString()); mmgr.clear(); } } finally { lock.unlock(); } log.debug("all done."); } /** * Immediate shutdown. Running tasks will be canceled. 
* * @throws Exception */ public void shutdownNow() throws Exception { log.debug("shutdownNow"); parsing.set(false); if (parserService != null) parserService.shutdownNow(); if (indexerService != null) indexerService.shutdownNow(); if (indexerTask != null) { indexerTask.cancel(true/* mayInterruptIfRunning */); } if (mmgr != null) { mmgr.clear(); } } /** * Task drains the valueQueue and runs an {@link IndexerTask} each time * something is drained from that queue. * * @author thompsonbry */ private class IndexerMainTask implements Callable { public Void call() throws Exception { boolean done = false; while (!done) { try { // Blocking take so we know that there is something ready. final ValueBuffer first = valueQueue.take(); // Drain queue, but keep an eye out for that poison pill. final LinkedList coll = new LinkedList(); // The element we already took from the queue. coll.add(first); // Drain (non-blocking). final int ndrained = valueQueue.drainTo(coll, maxDrain) + 1; if (log.isInfoEnabled()) log.info("Drained " + ndrained + " chunks with " + valueQueue.size() + " remaining in the queue."); // look for and remove poison pill, noting if found. if (coll.remove(poisonPill)) { if (log.isDebugEnabled()) log.debug("Found poison pill."); done = true; // fall through and index what we already have. } if (!coll.isEmpty()) { // combine the buffers into a single chunk. final ValueBuffer b = combineChunks(coll); if (log.isDebugEnabled()) log.debug("Will index " + coll.size() + " chunks having " + b.nvalues + " values in " + b.getUserBytes() + " bytes"); // Now index that chunk. new IndexValueBufferTask(mmgr, b, termsIndex, vf, c) .call(); } } catch (Throwable t) { log.error(t, t); HashCollisionUtility.this.shutdownNow(); throw new RuntimeException(t); } } // while(!done) log.debug("done."); return (Void) null; } /** * Combine chunks from the queue into a single chunk. */ private ValueBuffer combineChunks(final LinkedList coll) { final ValueBuffer b; if (coll.size() == 1) { // There is only one chunk. b = coll.getFirst(); } else { // Combine together into a single chunk. int nvalues = 0; for (ValueBuffer t : coll) nvalues += t.nvalues; final List contexts = new LinkedList(); final LinkedHashMap addrMap = new LinkedHashMap(); // int off = 0; for (ValueBuffer t : coll) { contexts.addAll(t.contexts); nvalues += t.nvalues; for(Bucket bucket : t.addrMap.values()) { final Bucket tmp = addrMap.get(bucket.key); if(tmp == null) { // copy bucket. addrMap.put(bucket.key, bucket); } else { // merge bucket. tmp.addrs.addAll(bucket.addrs); } } // System // .arraycopy(t.keys/* src */, 0/* srcPos */, // keys/* dest */, off/* destPos */, // t.nvalues/* length */); // // System // .arraycopy(t.addrs/* src */, 0/* srcPos */, // addrs/* dest */, off/* destPos */, // t.nvalues/* length */); // // off += t.nvalues; } b = new ValueBuffer(contexts, nvalues, addrMap); } return b; } } // class IndexerMainTask /** * Return the index in which we store RDF {@link Value}s. * * @param jnl * The index manager. * * @return The index. */ /* * TODO CanonicalHuffmanRabaCoder for U1 drops the average leaf size * * @ m=512 from 24k to 16k. Experiment with performance tradeoff * when compared with gzip of the record. * * No apparent impact for U1 on the leaves or nodes for 32 versus 8 * on the front-coded raba. * * Dropping maxRecLen from 256 to 64 reduces the leaves from 16k to * 10k. Dropping it to ZERO (0) reduces the leaves to 5k. This * suggests that we could to much better if we keep all RDF Values * out of the index. 
In standalone, we can give people a TermId * which is the raw record address. However, in scale-out it needs * to be the key (to locate the shard) and we will resolve the RDF * Value using the index on the shard. * * Suffix compression would allow us to generalize the counter and * avoid index space costs when collisions are rare while being able * to tolerate more collisions (short versus byte). U1: m=800, q=8000, ratio=8, maxRecLen=0, Elapsed: 41340ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9813 Average leaf: 6543 U1: m=800, q=8000, ratio=32, maxRecLen=0, Elapsed: 40971ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9821 Average leaf: 6478 U1: m=800, q=8000, ratio=64, maxRecLen=0, Elapsed: 41629ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9822 Average leaf: 6467 U1: m=512, q=8000, ratio=32, maxRecLen=0, Elapsed: 44722ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node/leaf: 3969 4149 U1: m=512, q=8000, ratio=32, maxRecLen=0, Elapsed: 40519ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node/leaf, node(min/max), leaf(min/max): 7583 8326 7583 7583 5755 14660 It would be great if we tracked the node/leaf data live on the RWStore for these counters so it could all be reported periodically (via http) or at the end in a summary. TODO The front compression of the keys is not helping out much since the keys are so sparse in the hash code space. It is a Good Thing that the keys are so sparse, but this suggests that we should try a different coder for the leaf keys. */ private BTree getTermsIndex(final Journal jnl) { final String name = "TERMS"; BTree ndx = jnl.getIndex(name); final int m = 1024; final int q = 8000; final int ratio = 32; final int maxRecLen = 0; if(ndx == null) { final IndexMetadata md = new IndexMetadata(name, UUID.randomUUID()); md.setNodeKeySerializer(new FrontCodedRabaCoder(ratio)); final DefaultTupleSerializer tupleSer = new DefaultTupleSerializer( new DefaultKeyBuilderFactory(new Properties()),// /* * leaf keys */ // DefaultFrontCodedRabaCoder.INSTANCE,// new FrontCodedRabaCoder(ratio),// // CanonicalHuffmanRabaCoder.INSTANCE, /* * leaf values */ CanonicalHuffmanRabaCoder.INSTANCE // new SimpleRabaCoder()// ); md.setTupleSerializer(tupleSer); // enable raw record support. md.setRawRecords(true); // set the maximum length of a byte[] value in a leaf. md.setMaxRecLen(maxRecLen); /* * increase the branching factor since leaf size is smaller w/o * large records. */ md.setBranchingFactor(m); // Note: You need to give sufficient heap for this option! 
md.setWriteRetentionQueueCapacity(q); ndx = jnl.registerIndex(name, md); } return ndx; } private void parseFileOrDirectory(final File fileOrDir, final RDFFormat fallback) throws Exception { if (fileOrDir.isDirectory()) { final File[] files = fileOrDir.listFiles(); for (int i = 0; i < files.length; i++) { final File f = files[i]; parseFileOrDirectory(f, fallback); } return; } final File f = fileOrDir; final String n = f.getName(); RDFFormat fmt = RDFFormat.forFileName(n, fallback); if (fmt == null && n.endsWith(".zip")) { fmt = RDFFormat.forFileName(n.substring(0, n.length() - 4), fallback); } if (fmt == null && n.endsWith(".gz")) { fmt = RDFFormat.forFileName(n.substring(0, n.length() - 3), fallback); } if (fmt == null) { log.warn("Ignoring: " + f); return; } final StatementHandler stmtHandler = new StatementHandler(valBufSize, c, conf, vf, mmgr, valueQueue, parsing); final FutureTask ft = new ReportingFutureTask( f, new ParseFileTask(f, fallback, fileBufSize, vf, stmtHandler) ); if (parserService != null) { // run on the thread pool. parserService.submit(ft); } else { // Run in the caller's thread. ft.run(); // Test the Future. ft.get(); } } /** * Task parses a single file. * * @author thompsonbry */ private static class ParseFileTask implements Callable { private final File file; private final RDFFormat fallback; private final int fileBufSize; private final BigdataValueFactory vf; private final StatementHandler stmtHandler; public ParseFileTask(final File file, final RDFFormat fallback, final int fileBufSize, final BigdataValueFactory vf, final StatementHandler stmtHandler) { if (file == null) throw new IllegalArgumentException(); if (stmtHandler == null) throw new IllegalArgumentException(); this.file = file; this.fallback = fallback; this.fileBufSize = fileBufSize; this.vf = vf; this.stmtHandler = stmtHandler; } public Void call() throws Exception { parseFile(file); return (Void) null; } private void parseFile(final File file) throws IOException, RDFParseException, RDFHandlerException, NoSuchAlgorithmException, InterruptedException { if (!file.exists()) throw new RuntimeException("Not found: " + file); final RDFFormat format = RDFFormat.forFileName(file.getName(),fallback); if (format == null) throw new RuntimeException("Unknown format: " + file); if (log.isTraceEnabled()) log.trace("RDFFormat=" + format); final RDFParserFactory rdfParserFactory = RDFParserRegistry .getInstance().get(format); if (rdfParserFactory == null) throw new RuntimeException("No parser for format: " + format); final RDFParser rdfParser = rdfParserFactory.getParser(); rdfParser.setValueFactory(vf); rdfParser.setVerifyData(false); rdfParser.setStopAtFirstError(false); rdfParser.setDatatypeHandling(RDFParser.DatatypeHandling.IGNORE); rdfParser.setRDFHandler(stmtHandler); /* * Run the parser, which will cause statements to be inserted. */ if (log.isDebugEnabled()) log.debug("Parsing: " + file); InputStream is = new FileInputStream(file); try { is = new BufferedInputStream(is, fileBufSize); final boolean gzip = file.getName().endsWith(".gz"); if (gzip) is = new GZIPInputStream(is); final String baseURI = file.toURI().toString(); // parse the file rdfParser.parse(is, baseURI); } finally { is.close(); } } } /** * Helper class adds statements to the sail as they are visited by a parser. */ static private class StatementHandler extends RDFHandlerBase { // private static final transient Logger log = HashCollisionUtility.log; /** * Various counters that we track. */ private final Counters c; /** The lexicon configuration. 
*/ private final LexiconConfiguration conf; /** * Blocking queue to which we add {@link ValueBuffer} instances as they * are generated by the parser. */ final BlockingQueue valueQueue; /** * true iff the parser is permitted to run and * false if the parser should terminate. */ final AtomicBoolean parsing; /** * Used to build the keys (just a hash code). */ private final IKeyBuilder keyBuilder = KeyBuilder.newInstance(); /** Used to serialize RDF Values as byte[]s. */ private final DataOutputBuffer out = new DataOutputBuffer(); /** Used to serialize RDF Values as byte[]s. */ private final ByteArrayBuffer tbuf = new ByteArrayBuffer(); /** Used to serialize RDF Values as byte[]s. */ private final BigdataValueSerializer valSer; /** * Used to (de-)compress the raw values. *

* Note: This is not thread-safe, even for decompression. You need a * pool or thread-local instance to support concurrent reads against the * TERMS index. */ private final RecordCompressor compressor = new RecordCompressor( Deflater.BEST_SPEED); // /** Buffer for (serialized) RDF Values. */ // private KV[] values; /** #of buffered values. */ private int nvalues = 0; /** The memory manager. */ private final IMemoryManager memoryManager; /** The current allocation context. */ private IMemoryManager context = null; /** * Map of distinct values in the buffer. * * TODO In addition to enforcing DISTINCT over the Values in the * ValueBuffer, an LRU/LIRS cache would be nice here so we can reuse the * frequently resolved (BigdataValue => IV) mappings across buffer * instances. * * FIXME We need to provide a canonicalizing mapping for blank nodes. * * TODO The key should also include the URI,Literal,BNode, etc. prefix * bits (or is this necessary any more?). */ private Map addrMap; /** The size of the {@link #values} buffer when it is allocated. */ private final int valueBufSize; public StatementHandler(// final int valueBufSize, final Counters c, final LexiconConfiguration conf, final BigdataValueFactory vf, final IMemoryManager memoryManager, final BlockingQueue valueQueue, final AtomicBoolean parsing) { this.valueBufSize = valueBufSize; this.c = c; this.conf = conf; this.memoryManager = memoryManager; this.valueQueue = valueQueue; this.parsing = parsing; this.valSer = vf.getValueSerializer(); } public void endRDF() { if(log.isTraceEnabled()) log.trace("End of source."); try { flush(); } catch (InterruptedException e) { throw new RuntimeException(e); } } public void handleStatement(final Statement stmt) throws RDFHandlerException { if (!parsing.get()) { // Either shutdown or never started. throw new IllegalStateException(); } try { bufferValue((BigdataValue) stmt.getSubject()); bufferValue((BigdataValue) stmt.getPredicate()); bufferValue((BigdataValue) stmt.getObject()); if (stmt.getContext() != null) { bufferValue((BigdataValue) stmt.getContext()); } } catch (InterruptedException ex) { // Interrupted while blocked on the valueQueue throw new RDFHandlerException(ex); } c.nstmts.incrementAndGet(); } /** * If the RDF {@link Value} can not be represented inline within the * statement indices, then buffer the value for batch resolution against * the TERMS index. * * @param value * The RDF {@link Value}. * * @return A {@link Value}. If the caller's {@link Value} could be * represented as an inline {@link IV}, then the returned value * will be a {@link BigdataValue} and the inline {@link IV} will * be available from {@link BigdataValue#getIV()}. Otherwise the * caller's {@link Value} is returned and the {@link Value} must * be resolved against the TERMS index in order to obtain its * {@link IV}. * * @throws InterruptedException * * FIXME Handle {@link BlobIV}, {@link PartlyInlineURIIV}, and * {@link PartlyInlineTypedLiteralIV}. These are three kinds of * "non-inline" values. They will have to be queued for * insertion into the TERMS index and Statement which depend * on those non-inline values will have to be deferred until * we have resolved those non-inline values. This is * basically the same logic that we already have for * StatementBuffer, except that an asynchronous queue is * being used (by this class) to do the resolution of the IV * for large values. *

* Other kinds of {@link IV}s which could be handled here * would be references to large values stored in the file * system, in S3, etc. */ private void bufferValue(final BigdataValue value) throws InterruptedException { // Not expecting the IV to already be cached. assert value.getIV() == null; // Attempt to inline this value. final IV iv = conf.createInlineIV(value); if (iv != null) { // This is being inlined. switch (iv.getVTE()) { case URI: c.nshortURIs.incrementAndGet(); break; case BNODE: c.nshortBNodes.incrementAndGet(); break; case LITERAL: c.nshortLiterals.incrementAndGet(); break; default: throw new AssertionError(); } // Verify IV is cached on that Value. assert value.getIV() == iv; return; } if (context != null && context.getSlotBytes() >= valueBufSize) { // Incremental flush of large values to the TERMS index. flush(); } if (context == null) { // Lazy allocation of the buffer. context = memoryManager.createAllocationContext(); addrMap = new LinkedHashMap(); } /* * Generate a key (hash code) and value (serialized and compressed) * from the BigdataValue. */ final KV t = makeKV(value); /* * Lookup the list of addresses for RDF Values which hash to the * same key. */ Bucket bucket = addrMap.get(t.key); if (bucket == null) { /* * No match on that hash code key. */ // lay the record down on the memory manager. final long addr = context.allocate(ByteBuffer.wrap(t.val)); // add new bucket to the map. addrMap.put(t.key, bucket = new Bucket(t.key, addr)); nvalues++; } else { /* * Either a hash collision or the value is already stored at * a known address. */ { for (Long addr : bucket.addrs) { if (context.allocationSize(addr) != t.val.length) { // Non-match based on the allocated record size. continue; } /* * TODO It would be more efficient to compare the data * using the zero-copy get(addr) method. */ final byte[] tmp = context.read(addr); if (BytesUtil.bytesEqual(t.val, tmp)) { // We've already seen this Value. if (log.isDebugEnabled()) log.debug("Duplicate value in chunk: " + Arrays.toString(t.val)); /* * FIXME This pattern does not really work out for * building statements since we lack a reference to * the Value which is being inserted into the TERMS * index. The StatementBuffer handles this. It keeps * the Values in a map and inserts all values into * the database. [It should only keep the distinct * non-inline values but it currently keeps all * distinct values without regard to inlining.] */ return; } } // Fall through - there is no such record on the store. } // lay the record down on the memory manager. bucket.add(context.allocate(ByteBuffer.wrap(t.val))); nvalues++; } return; } // bufferValue() /** * Transfer a non-empty buffer to the {@link #valueQueue}. * * @throws InterruptedException */ void flush() throws InterruptedException { if (nvalues == 0) return; if (!parsing.get()) { // Either shutdown or never started. throw new IllegalStateException(); } if (log.isInfoEnabled()) log.info("Adding chunk with " + nvalues + " values and " + context.getUserBytes() + " bytes to queue."); /* * Create an object which encapsulates the allocation context (to be * cleared when the data have been consumed) and the address map. */ final List contexts = new LinkedList(); contexts.add(context); // put the buffer on the queue (blocking operation). valueQueue.put(new ValueBuffer(contexts, nvalues, addrMap)); // clear reference since we just handed off the data. context = null; addrMap = null; nvalues = 0; // clear distinct value set so it does not build for ever. 
// distinctValues.clear(); // addrMap.clear(); } private KV makeKV(final BigdataValue r) { byte[] val = valSer.serialize(r, out.reset(), tbuf); /* * FIXME In order support conditional compression we will have to * mark the record with a header to indicate whether or not * it is compressed. Without that header we can not * deserialize a record resolved via its TermId since we * will not know whether or not it is compressed (actually, * that could be part of the termId....) */ if (compressor != null) {//&& val.length > 64) { // compress, reusing [out]. out.reset(); compressor.compress(val, out); } // if (out.pos() < val.length) // TODO Use compressed version iff smaller. { val = out.toByteArray(); } /* * Note: This is an exclusive lower bound (it does not include the * counter). * * TODO We could format the counter in here as a ZERO (0) since it * is a fixed length value and then patch it up later. That would * involve less copying. */ final byte[] key = buildKey(r, val).getKey(); return new KV(key, val); } // makeKV() private IKeyBuilder buildKey(final Value r, final byte[] val) { // if (true) { /* * Simple 32-bit hash code based on the byte[] representation of * the RDF Value. */ final int hashCode = r.hashCode(); return keyBuilder.reset().append(hashCode); // } else { // // /* // * Message digest of the serialized representation of the RDF // * Value. // * // * TODO There are methods to copy out the digest (hash code) // * without memory allocations. getDigestLength() and // * getDigest(out,start,len). // */ // private final MessageDigest d; // // try { // // d = MessageDigest.getInstance("SHA-256"); // 256 bits (32 bytes) // // } catch (NoSuchAlgorithmException e) { // // throw new RuntimeException(e); // // } // // // final byte[] hashCode = d.digest(val); // // return keyBuilder.reset().append(hashCode); // // } } // buildKey } // class StatementHandler /** * Index a {@link ValueBuffer}. */ private static class IndexValueBufferTask implements Callable { /** * The {@link MemoryManager} against which the allocations were made. */ private final MemoryManager mmgr; /** * The data to be indexed. */ private final ValueBuffer vbuf; /** * The index to write on. */ private final BTree termsIndex; /** Counters for things that we track. */ private final Counters c; /** * Used to build the keys. */ private final IKeyBuilder keyBuilder = KeyBuilder.newInstance(); // /** Used to serialize RDF Values as byte[]s. */ // private final DataOutputBuffer out = new DataOutputBuffer(); /** Used to de-serialize RDF Values (debugging only). */ private final BigdataValueSerializer valSer; /** * Used to de-compress the raw values (debugging only). *

* Note: This is not thread-safe, even for decompression. You need a * pool or thread-local instance to support concurrent reads against the * TERMS index. */ private final RecordCompressor compressor; public IndexValueBufferTask(final MemoryManager mmgr, final ValueBuffer vbuf, final BTree termsIndex, final BigdataValueFactory vf, final Counters c) { if(mmgr == null) throw new IllegalArgumentException(); if(vbuf == null) throw new IllegalArgumentException(); if(termsIndex== null) throw new IllegalArgumentException(); if(vf == null) throw new IllegalArgumentException(); if(c == null) throw new IllegalArgumentException(); this.mmgr = mmgr; this.vbuf = vbuf; this.termsIndex = termsIndex; this.c = c; /* * Note: debugging only. */ this.valSer = vf.getValueSerializer(); this.compressor = new RecordCompressor(Deflater.BEST_SPEED); } public Void call() throws Exception { final long begin = System.currentTimeMillis(); if (log.isInfoEnabled()) log.info("Indexing " + vbuf.nvalues + " values occupying " + vbuf.getUserBytes() + " bytes"); /* * Place into sorted order by the keys. * * The Bucket implements Comparable. We extract the buckets, sort * them, and then process them. */ final Bucket[] a = vbuf.addrMap.values().toArray(new Bucket[0]); Arrays.sort(a); // Index the values. for (int i = 0; i = Byte.MAX_VALUE) { /* * Impose a hard limit on the #of hash collisions we will accept * in this utility. * * @todo We do not need to have a hard limit if we use * BigInteger for the counter, but the performance will go * through the floor if we have to scan 32k entries on a hash * collision! */ throw new RuntimeException("Too many hash collisions: ncoll=" + rangeCount); } // force range count into (signed) byte final byte counter = (byte) rangeCount; if (rangeCount == 0) { /* * This is the first time we have observed a Value which * generates this hash code, so append a [short] ZERO (0) to * generate the actual key and then insert the Value into the * index. Since there is nothing in the index for this hash * code, no collision is possible and we do not need to test the * index for the value before inserting the value into the * index. */ final byte[] key = keyBuilder.reset().append(fromKey).appendSigned( counter).getKey(); if (termsIndex.insert(key, val) != null) { throw new AssertionError(); } c.ninserted.incrementAndGet(); c.totalKeyBytes.addAndGet(key.length); c.totalValBytes.addAndGet(val.length); return; } /* * iterator over that key range * * TODO Filter for the value of interest so we can optimize the scan * by comparing with the value without causing it to be * materialized, especially we should be able to efficiently reject * tuples where the byte[] value length is known to differ from the * a given length, including when the value is stored as a raw * record at which point we are doing a fast rejection based on * comparing the byteCount(addr) for the raw record with the target * byte count for value that we are seeking in the index. * * We can visit something iff the desired tuple already exists (same * length, and possibly the same data). If we visit nothing then we * know that we have to insert a tuple and we know the counter value * from the collision count. 
*/ final ITupleIterator itr = termsIndex.rangeIterator(fromKey, toKey, 0/* capacity */, IRangeQuery.VALS, null/* filter */); boolean found = false; while(itr.hasNext()) { final ITuple tuple = itr.next(); // raw bytes final byte[] tmp = tuple.getValue(); if (false) System.out.println(getValue(tmp)); // Note: Compares the compressed values ;-) if(BytesUtil.bytesEqual(val, tmp)) { found = true; break; } } if(found) { // Already in the index. return; } /* * Hash collision. */ if (rangeCount > c.maxCollisions.get()) { // Raise the maximum collision count. c.maxCollisions.set(rangeCount); log.warn("MAX COLLISIONS NOW: " + c.maxCollisions.get()); } final byte[] key = keyBuilder.reset().append(fromKey).appendSigned( counter).getKey(); // Insert into the index. if (termsIndex.insert(key, val) != null) { throw new AssertionError(); } c.ninserted.incrementAndGet(); c.totalKeyBytes.addAndGet(key.length); c.totalValBytes.addAndGet(val.length); c.totalCollisions.incrementAndGet(); if (rangeCount > 128) { // arbitrary limit to log @ WARN. log.warn("Collision: hashCode=" + BytesUtil.toString(key) + ", nstmts="+c.nstmts + ", nshortLiterals=" + c.nshortLiterals + ", nshortURIs=" + c.nshortURIs + ", ninserted=" + c.ninserted + ", totalCollisions=" + c.totalCollisions + ", maxCollisions=" + c.maxCollisions + ", ncollThisTerm=" + rangeCount + ", resource=" + getValue(val)); } else if (log.isDebugEnabled()) log.debug("Collision: hashCode=" + BytesUtil.toString(key) + ", nstmts="+c.nstmts + ", nshortLiterals=" + c.nshortLiterals + ", nshortURIs=" + c.nshortURIs + ", ninserted=" + c.ninserted + ", totalCollisions=" + c.totalCollisions + ", maxCollisions=" + c.maxCollisions + ", ncollThisTerm=" + rangeCount + ", resource=" + getValue(val)); } /** * Decompress and deserialize a {@link Value}. * * @param tmp * The serialized and compressed value. * * @return The {@link Value}. */ private Value getValue(final byte[] tmp) { // decompress final ByteBuffer b = compressor.decompress(tmp); final byte[] c = new byte[b.limit()]; b.get(c); // deserialize. return valSer.deserialize(c); } } // class IndexValueBufferTask /** * Parse files, inserting {@link Value}s into indices and counting hash * collisions. * * @param args * filename(s) * * @throws IOException * @throws RDFHandlerException * @throws RDFParseException * @throws NoSuchAlgorithmException */ public static void main(final String[] args) throws Exception { Banner.banner(); // check args. { for (String filename : args) { final File file = new File(filename); if (!file.exists()) throw new RuntimeException("Not found: " + file); } } final long begin = System.currentTimeMillis(); final Properties properties = new Properties(); properties.setProperty(Journal.Options.BUFFER_MODE, BufferMode.DiskRW .toString()); properties.setProperty(Journal.Options.INITIAL_EXTENT, "" + (Bytes.megabyte * 200)); // properties.setProperty(Journal.Options.COLLECT_PLATFORM_STATISTICS,"true"); // properties.setProperty(Journal.Options.COLLECT_QUEUE_STATISTICS,"true"); properties.setProperty(Journal.Options.HTTPD_PORT,"8081"); // The caller MUST specify the filename using -D on the command line. 
final String journalFile = System.getProperty(Journal.Options.FILE); if (journalFile == null) { System.err.println("Journal file must be specified: -D" + Journal.Options.FILE); System.exit(1); } properties.setProperty(Journal.Options.FILE, journalFile); if (new File(journalFile).exists()) { System.err.println("Removing old journal: " + journalFile); new File(journalFile).delete(); } final Journal jnl = new Journal(properties); final RDFFormat fallback = RDFFormat.N3; HashCollisionUtility u = null; try { u = new HashCollisionUtility(jnl); u.start(); for (String filename : args) { u.parseFileOrDirectory(new File(filename), fallback); } // // flush anything left in the buffer. // u.stmtHandler.flush(); // shutdown and block until all data is indexed. u.shutdown(); jnl.commit(); } catch (Throwable t) { u.shutdownNow(); throw new RuntimeException(t); } finally { jnl.close(); final long elapsed = System.currentTimeMillis() - begin; System.out.println("Elapsed: " + elapsed + "ms"); if (u != null) { System.out.println("NumStatements: " + u.c.nstmts); System.out.println("NumDistinctVals: " + u.c.ninserted); System.out.println("NumShortLiterals: " + u.c.nshortLiterals); System.out.println("NumShortBNodes: " + u.c.nshortBNodes); System.out.println("NumShortURIs: " + u.c.nshortURIs); // System.out.println("NumCacheHit: " + u.ncached); System.out.println("TotalKeyBytes: " + u.c.totalKeyBytes); System.out.println("TotalValBytes: " + u.c.totalValBytes); System.out.println("MaxCollisions: " + u.c.maxCollisions); System.out.println("TotalCollisions: " + u.c.totalCollisions); } if (new File(journalFile).exists()) { System.out.println("Journal size: " + new File(journalFile).length() + " bytes"); } } } }




