org.apache.jena.tdb.index.ext.ExtHash
TDB is a storage subsystem for Jena and ARQ; it is a native triple store providing persistent storage of triples/quads.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This file contains code under the Apache 2 license - see hashFNV */
package org.apache.jena.tdb.index.ext;
import static java.lang.String.format;
import java.nio.IntBuffer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.jena.atlas.io.IndentedLineBuffer ;
import org.apache.jena.atlas.io.IndentedWriter ;
import org.apache.jena.atlas.lib.BitsLong ;
import org.apache.jena.atlas.lib.Bytes ;
import org.apache.jena.atlas.lib.Lib ;
import org.apache.jena.tdb.base.StorageException ;
import org.apache.jena.tdb.base.block.BlockMgr ;
import org.apache.jena.tdb.base.block.BlockMgrFactory ;
import org.apache.jena.tdb.base.buffer.RecordBuffer ;
import org.apache.jena.tdb.base.file.PlainFile ;
import org.apache.jena.tdb.base.file.PlainFileMem ;
import org.apache.jena.tdb.base.record.Record ;
import org.apache.jena.tdb.base.record.RecordFactory ;
import org.apache.jena.tdb.index.Index ;
import org.apache.jena.tdb.sys.SystemTDB ;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** Extensible hashing
* http://en.wikipedia.org/wiki/Extendible_hashing
*/
public final class ExtHash implements Index
{
/* Hashing.
* Extendible hashing is based on taking more of the bits of the hash
* value to address an expanding dictionary. This is a bit-trie, stored
* as an array. One bucket can be used for several hash slots.
*
* We need the bits to be used in decreasing significance because we
* keep items in buckets in full-hash sorted order.
*
* Side effect: the whole structure is sorted by full hash, using
* dictionary and buckets.
*
* But.
* Java .hashCode() does not make a suitable hash directly: either
* it is Object.hashCode (not too bad, but it tends not to use the high
* bits) or something like Integer.hashCode, which is the integer value
* itself. The latter is very bad because the hash does not use the
* high bits (most integers are small - especially sequentially
* allocated numbers).
*
* Solution: use the hashCode, 31 bits (array indexes are signed),
* but bit-reversed so the low bits of the original value are the most
* significant (when shorter than the current hash, it's the low bits
* that are used).
*
* Example: using hex chars, not bits.
*
* Record: key: 0xABCD
* Length = 1 ==> trie is D
* if the length changes from 1 to 2,
* Length = 2 ==> trie is DC, that is, the D is most significant.
* and all buckets Dx point to what was in slot for D.
*
* All hash handling is encapsulated in the internal routines.
*/
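/* Worked example of the bit-reversal (a sketch; the key value is illustrative):
*
*   int h = 6 ;                              // 0b...0000_0110
*   int full = Integer.reverse(h) >>> 1 ;    // 0x30000000 : low bits now most significant
*   int slot = full >>> (31 - 2) ;           // bitLen = 2 ==> slot 0b01 = 1
*
* After the dictionary doubles (bitLen 2 -> 3), full >>> (31 - 3) = 0b011 = 3,
* one of the two slots (0b010 and 0b011) that extend the old slot 0b01.
*/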
static private Logger log = LoggerFactory.getLogger(ExtHash.class) ;
// Production: make these final and false.
public static boolean Debugging = false ;
public static boolean Checking = false ; // Perform internal checking
public static boolean Logging = false ; // Allow any logging code on critical paths
IntBuffer dictionary ; // mask(hash) -> Bucket id
private int bitLen = 0 ; // Current length of trie bits used. Invariant: dictionary.limit() == 1<<bitLen
private PlainFile dictionaryFile ;
private HashBucketMgr hashBucketMgr ;
private RecordFactory recordFactory ;
/** Create an in-memory extensible hash table (mainly for testing) */
public static ExtHash createMem(RecordFactory factory, int bucketSizeBytes)
{
return new ExtHash(new PlainFileMem(), factory,
BlockMgrFactory.createMem("ExtHash", bucketSizeBytes)) ;
}
public ExtHash(PlainFile dictionaryBackingFile, RecordFactory factory, BlockMgr hashBucketBlockMgr)
{
dictionaryFile = dictionaryBackingFile ;
recordFactory = factory ;
hashBucketMgr = new HashBucketMgr(factory, hashBucketBlockMgr) ;
// Start with a one-slot dictionary (bitLen 0) pointing to a single, empty bucket.
dictionary = dictionaryFile.ensure(filesize(1)).asIntBuffer() ;
bitLen = 0 ;
HashBucket bucket = hashBucketMgr.create(0, 0) ;
dictionary.put(0, bucket.getId()) ;
}
/** The number of dictionary slots currently in use */
private int dictionarySize() { return 1<<bitLen ; }
/** Full hash of a key: the key's hash code, bit-reversed and truncated
* to 31 bits, so the low bits of the original value are the most significant. */
private int trieKey(Record key)
{
int x = key.hashCode() ;
return Integer.reverse(x)>>>1 ;
}
/** Calculate the array index for a key given the dictionary bit length */
private int trieKey(Record key, int bitLen) { return trieKey(trieKey(key), bitLen) ; }
/** Convert from full hash to array index for a dictionary bit length */
private int trieKey(int fullTrie, int bitLen) { return fullTrie >>> (31-bitLen) ; }
/** Calculate the bucket id for a key given the dictionary bit length */
private int bucketId(Record key, int bitLen)
{
int x = trieKey(trieKey(key), bitLen) ;
int id = dictionary.get(x) ;
return id ;
}
/** Size of the file, in bytes */
private static long filesize(int dictionarySize) { return 4L*dictionarySize ; }
// =====================
private void resizeDictionary()
{
int oldSize = 1<<bitLen ;
int newBitLen = bitLen+1 ;
int newSize = 1<<newBitLen ;
if ( logging() )
{
log(">>>>Resize") ;
log("resize: %d ==> %d", oldSize, newSize) ;
}
IntBuffer newDictionary = dictionaryFile.ensure(newSize*SystemTDB.SizeOfInt).asIntBuffer() ;
if ( dictionary != null )
{
// Fill new dictionary
// NB Fills from high to low so that it works "in place"
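// e.g. a dictionary [A,B] (bitLen 1) becomes [A,A,B,B] (bitLen 2):
// old slot i is copied to new slots 2i and 2i+1, so both extensions
// of each old trie prefix point to the same bucket as before.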
for ( int i = oldSize-1 ; i>=0 ; i-- )
{
int b = newDictionary.get(i) ;
//if ( logging() ) log("Resize: put: (%d, %d)", 2*i, b) ;
newDictionary.put(2*i, b) ;
newDictionary.put(2*i+1, b) ;
}
}
dictionary = newDictionary ;
bitLen = newBitLen ;
if ( logging() )
{
if ( false ) dump() ;
if ( false ) log(this) ;
log("<<<<Resize") ;
}
}
// =====================
// Lookup
@Override
public boolean contains(Record key)
{ return find(key) != null ; }
@Override
public Record find(Record key)
{
if ( logging() ) log(">> get(%s)", key) ;
int blockId = bucketId(key, bitLen) ;
HashBucket bucket = hashBucketMgr.get(blockId) ;
Record value = bucket.find(key) ;
if ( logging() ) log("<< get(%s) -> %s", key.getKey(), value) ;
return value ;
}
@Override
public boolean add(Record record)
{
if ( logging() ) log(">> add(%s)", record) ;
int h = trieKey(record) ;
boolean b = put(record, h) ;
if ( logging() )
{
log("<< add(%s)", record) ;
//dump() ;
}
internalCheck() ;
return b ;
}
@Override
public boolean delete(Record record)
{
if ( logging() ) log(">> remove(%s)", record) ;
int blockId = bucketId(record, bitLen) ;
HashBucket bucket = hashBucketMgr.get(blockId) ;
boolean b = bucket.removeByKey(record) ;
hashBucketMgr.put(bucket) ;
internalCheck() ;
if ( logging() ) log("<< remove(%s)", record) ;
return b ;
}
@Override
public RecordFactory getRecordFactory()
{ return recordFactory ; }
@Override
public Iterator<Record> iterator()
{
return new ExtHashIterator(this) ;
}
@Override
public boolean isEmpty()
{
if ( dictionary.limit() == 1 )
{
HashBucket b = hashBucketMgr.get(dictionary.get(0)) ;
return b.isEmpty() ;
}
// No idea.
return false ;
}
@Override
public void clear()
{ throw new UnsupportedOperationException("Index("+Lib.classShortName(this.getClass())+").clear") ; }
@Override
public long size()
{ return count() ; }
/** Explicitly count the items in the hash table */
public long count()
{
Set<Integer> seen = new HashSet<>() ;
long count = 0 ;
for ( int i = 0 ; i < dictionary.capacity() ; i++ )
{
int id = dictionary.get(i) ;
if ( seen.contains(id) )
continue ;
seen.add(id) ;
HashBucket bucket = hashBucketMgr.get(id) ;
count += bucket.getCount() ;
}
return count ;
}
@Override
public void sync()
{
hashBucketMgr.getBlockMgr().sync() ;
dictionaryFile.sync() ;
}
@Override
public void close()
{
hashBucketMgr.getBlockMgr().close() ;
dictionaryFile.close() ;
}
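// =====================
// Usage sketch - not part of the original class. A minimal in-memory round
// trip, assuming the TDB record API: RecordFactory(keyLength, valueLength)
// and RecordFactory.create(byte[] key); the sizes are illustrative.
private static void exampleUsage()
{
RecordFactory rf = new RecordFactory(4, 0) ;    // 4-byte keys, no value part
ExtHash eh = ExtHash.createMem(rf, 256) ;       // 256-byte hash buckets
Record r = rf.create(new byte[] {1, 2, 3, 4}) ;
eh.add(r) ;                                     // insert
Record found = eh.find(rf.create(new byte[] {1, 2, 3, 4})) ;  // lookup by key
eh.delete(found) ;                              // remove
eh.sync() ;
eh.close() ;
}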
// =====================
// Insert
// Recursive part of "put"
private boolean put(Record record, int hash)
{
if ( logging() ) log("put(%s,0x%08X)", record, hash) ;
int dictIdx = trieKey(hash, bitLen) ; // Dictionary index
int blockId = dictionary.get(dictIdx) ;
HashBucket bucket = hashBucketMgr.get(blockId) ;
if ( ! bucket.isFull() )
{
if ( Debugging ) log("Insert [(0x%04X) %s]: %d", hash, record, bucket.getId()) ;
boolean b = bucket.put(record) ;
hashBucketMgr.put(bucket) ;
return b ;
}
// Bucket full: can it be split, or must the dictionary be resized first?
if ( bitLen == bucket.getTrieBitLen() )
{
// // Log it anyway
// if ( ! logging() ) log("put(%s,0x%08X)", record, hash) ;
boolean oldLogging = Logging ;
boolean oldDebugging = Debugging ;
try {
// Logging = true ;
// Debugging = true ;
if ( Debugging )
{
log("Bucket full: %d", bucket.getId()) ;
log("Bucket can't be split - dictionary resize needed") ;
//log(bucket) ;
this.dump() ;
}
// Bucket not splittable.
// TODO Overflow buckets.
// Expand the dictionary.
int x = dictionarySize() ;
resizeDictionary() ;
if ( Debugging ) log("Resize: %d -> %d", x, dictionarySize()) ;
// Try again
return put(record, hash) ;
} finally { Logging = oldLogging ; Debugging = oldDebugging ;}
}
if ( Debugging ) log("Split bucket: %d", bucket.getId()) ;
// bitLen > bucket.getTrieBitLen() : bucket can be split
splitAndReorganise(bucket, dictIdx, blockId, hash) ;
// Reorg done - try again.
return put(record, hash) ;
}
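/* Decision trace (a sketch): suppose bitLen == 1 and the target bucket is
* full with trie bit length 1. The first put() resizes the dictionary
* (bitLen -> 2) and retries; the retry finds the bucket's trie length (1)
* is now shorter than bitLen, splits it, and retries again; the final call
* inserts into whichever half the record's trie key selects.
*/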
// Bucket bitlength is less than that of the dictionary.
private void splitAndReorganise(HashBucket bucket, int dictionaryIdx, int bucketId, int hash)
{
if ( logging() )
{
log("splitAndReorganise: idx=%d, id=%d, bitLen=%d, bucket.hashLength=%d",
dictionaryIdx, bucketId, bitLen, bucket.getTrieBitLen()) ;
if ( false ) dump() ;
}
if ( Checking )
{
if ( bucket.getTrieBitLen() >= bitLen )
error("splitAndReorganise: idx=0x%X : hash=0x%X[0x%X,0x%X] : Hash not shorter : %s",
dictionaryIdx, hash, trieKey(hash, bucket.getTrieBitLen()), bucket.getTrieValue(), bucket) ;
if ( trieKey(hash, bucket.getTrieBitLen()) != bucket.getTrieValue() )
error("splitAndReorganise: idx=0x%X : hash=0x%X[0x%X,0x%X] : Inconsistency : %s",
dictionaryIdx, hash, trieKey(hash, bucket.getTrieBitLen()), bucket.getTrieValue(), bucket) ;
}
// Bucket did not have a full length hash so split it.
// Find the companion slots.
// Remember before messing with split.
int bucketHash = bucket.getTrieValue() ;
int bucketHashLength = bucket.getTrieBitLen() ;
// Split the bucket in two. bucket2 is the upper bucket.
HashBucket bucket2 = split(bucketId, bucket) ;
// Determine the slots affected:
// all the dictionary entries that, in the extension of the bit trie, have a 1
// in the newly exposed bit. These will point to bucket2.
// All the slots with a 0 there continue to point to the existing (reorganised) bucket.
// The hash is reversed (the low bits of the hash value are most significant),
// so extending a hash is a shift up, then OR in a 0 or 1.
// Zeros fill the difference between the bucket bit length and the dictionary bit length.
// Upper section of the bucket hash, extended by 1, then the gap in lengths zero-filled:
int trieUpperRoot = ((bucketHash<<1)|0x1) << (bitLen-bucketHashLength-1) ;
// Upper bound (exclusive) of values affected between dictionary and current bucket.
// NB relationship to the second shift on trieUpperRoot
int trieUpperRange = (1<<(bitLen-bucketHashLength-1)) ;
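// Worked example (illustrative values): bitLen = 4, bucket trie value 0b10
// (length 2). The split buckets get trie values 0b100 and 0b101 (length 3).
//   trieUpperRoot  = ((0b10<<1)|1) << (4-2-1) = 0b101 << 1 = 0b1010
//   trieUpperRange = 1 << (4-2-1) = 2
// so slots 0b1010 and 0b1011 are re-pointed at bucket2, while slots
// 0b1000 and 0b1001 keep pointing at the lower (reorganised) bucket.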
for ( int j = 0 ; j < trieUpperRange ; j++ )
{
// j runs over the values of the unused bits of the trie start for the upper bucket positions.
int k = trieUpperRoot | j ;
if ( logging() )
log("Point to split bucket: 0x%04X", k) ;
if ( Checking )
{
if ( (trieUpperRoot&j) != 0 )
error("put: idx=%d : trieRoot=0x%X, sub=%d: Broken trie pattern ", dictionaryIdx, trieUpperRoot, j) ;
if ( ! BitsLong.isSet(k, (bitLen-(bucketHashLength+1)) ) )
error("put: Broken trie pattern (0x%X,%d)", trieUpperRoot, j) ;
// We should be looking at the original bucket.
int id = dictionary.get(k) ;
HashBucket hb = hashBucketMgr.get(id) ;
if ( hb.getId() != bucket.getId() )
error("put: Wrong bucket at trie 0x%X %d: (%d,%d)", trieUpperRoot, j, hb.getId(), bucket.getId()) ;
}
dictionary.put(k, bucket2.getId()) ;
}
if ( logging() )
{
log("Reorg complete") ;
if ( false ) dump() ;
}
}
private HashBucket split(int bucketId, HashBucket bucket)
{
// bucketId is the block id of the bucket being split; it becomes the lower of the pair.
if ( logging() )
{
log("split: Bucket %d : size: %d; Bucket bitlength %d", bucketId, bucket.getCount(), bucket.getTrieBitLen()) ;
log("split: %s", bucket) ;
}
// Create new bucket, which will be the upper bucket.
// The low bucket keeps the old hash value;
// lengthen the hash - the new length is one more bit.
bucket.incTrieBitLen() ;
// Bucket hash value is kept in index-order (i.e. high bits are most significant).
int hash1 = bucket.getTrieValue() << 1 ;
int hash2 = (bucket.getTrieValue() << 1) | 0x1 ;
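// e.g. trie value 0b10 (length 2) becomes 0b100 (lower) and 0b101 (upper), length 3.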
// Reset, now it's longer
bucket.setTrieValue(hash1) ;
if ( logging() )
log("split: bucket hashes 0x%04X 0x%04X", hash1, hash2) ;
// New bucket
HashBucket bucket2 = hashBucketMgr.create(hash2, bucket.getTrieBitLen()) ;
if ( logging() ) log("New bucket: %s", bucket2) ;
//bucket2.setTrieValue(hash2) ;
RecordBuffer rBuff1 = bucket.getRecordBuffer() ;
RecordBuffer rBuff2 = bucket2.getRecordBuffer() ;
int idx1 = 0 ; // Destination indexes into the above
int idx2 = 0 ;
for ( int i = 0 ; i < rBuff1.size() ; i++ )
{
Record r = rBuff1.get(i) ;
int x = trieKey(r, bucket.getTrieBitLen()) ; // Incremented bit length
if ( x == hash1 )
{
if ( logging() )
log("Allocate index %d to bucket1", i) ;
// idx1 <= i (we are writing fewer records back),
// so this does not interfere with the loop.
// We're shifting down records that stay in this bucket.
if ( idx1 != i )
rBuff1.set(idx1, r) ;
idx1++ ;
}
else if ( x == hash2 )
{
if ( logging() )
log("Allocate index %d to bucket2", i) ;
rBuff2.add(r) ;
idx2 ++ ;
}
else
error("Bad trie for allocation to split buckets") ;
}
// Clear the tail slots vacated by records that moved to bucket2.
rBuff1.clear(idx1, bucket.getCount()-idx1) ;
rBuff1.setSize(idx1) ;
// rBuff2 was fresh so still clean.
if ( logging() )
{
log("split: Lower bucket: %s", bucket) ;
log("split: Upper bucket: %s", bucket2) ;
}
// Check with splitAndReorganise()
hashBucketMgr.put(bucket) ;
hashBucketMgr.put(bucket2) ;
return bucket2 ;
}
// =====================
@Override
public String toString()
{
IndentedLineBuffer buff = new IndentedLineBuffer() ;
dump(buff) ;
return buff.asString() ;
}
public void dump()
{
dump(IndentedWriter.stdout) ;
IndentedWriter.stdout.ensureStartOfLine() ;
IndentedWriter.stdout.flush() ;
}
private void dump(IndentedWriter out)
{
out.printf("Bitlen = %d\n", bitLen) ;
out.printf("Dictionary = %d\n", 1<<bitLen) ;
for ( int i = 0 ; i < (1<<bitLen) ; i++ )
{
int id = dictionary.get(i) ;
HashBucket bucket = hashBucketMgr.get(id) ;
out.printf("%02d [%d] %s\n", i, id, bucket) ;
}
}
// =====================
// Internal consistency checking.
private final void internalCheck()
{
if ( Checking )
check() ;
}
private void check()
{
int d = 1<<bitLen ;
Set<Integer> seen = new HashSet<>() ;
for ( int i = 0 ; i < d ; i++ )
{
int id = dictionary.get(i) ;
if ( seen.contains(id) )
continue ;
seen.add(id) ;
HashBucket bucket = hashBucketMgr.get(id) ;
performCheck(i, bucket) ;
}
}
private void performCheck(int idx, HashBucket bucket)
{
if ( bucket.getTrieBitLen() > bitLen )
error("[%d] Bucket %d has bit length longer than the dictionary's (%d, %d)", idx, bucket.getId(), bucket.getTrieBitLen(), bitLen) ;
// Check the bucket hash against the slot it's in.
// Convert dictionary index to bucket hash.
int tmp = (idx >>> (bitLen-bucket.getTrieBitLen())) ;
if ( tmp != bucket.getTrieValue())
error("[%d] Bucket %d : hash prefix 0x%X, expected 0x%X : %s", idx, bucket.getId(), bucket.getTrieValue(), tmp, bucket) ;
// Check the contents.
Record prevKey = Record.NO_REC ;
for ( int i = 0 ; i < bucket.getCount() ; i++ )
{
Record rec = bucket.get(i) ;
if ( prevKey != Record.NO_REC && Record.keyLT(rec,prevKey) )
error("[%d] Bucket %d: Not sorted (slot %d) : %s", idx, bucket.getId(), i, bucket) ;
prevKey = rec ;
int x = trieKey(rec, bucket.getTrieBitLen()) ;
// Check the key is bucket-compatible.
if ( x != bucket.getTrieValue() )
error("[%d] Bucket %d: Key (0x%04X) does not match the hash (0x%04X) : %s",
idx, bucket.getId(), x, bucket.getTrieValue(), bucket) ;
}
if ( SystemTDB.NullOut )
{
for ( int i = bucket.getCount() ; i < bucket.getMaxSize() ; i++ )
{
if ( ! bucket.getRecordBuffer().isClear(i) )
error("[%d] Bucket %d : overspill at [%d]: %s", idx, bucket.getId(), i, bucket) ;
}
}
}
private void error(String msg, Object... args)
{
msg = format(msg, args) ;
log.error(msg) ;
throw new StorageException(msg) ;
}
private final boolean logging() { return Logging /* && log.isDebugEnabled()*/ ; }
private final void log(String format, Object... args)
{
//if ( ! logging() ) return ;
log.debug(format(format, args)) ;
}
private final void log(Object obj)
{
//if ( ! logging() ) return ;
log.debug(obj.toString()) ;
}
}