org.apache.jena.tdb.index.bplustree.BPlusTree Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jena-tdb Show documentation
Show all versions of jena-tdb Show documentation
TDB is a storage subsystem for Jena and ARQ, it is a native triple store providing persistent storage of triples/quads.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.tdb.index.bplustree;
import static org.apache.jena.tdb.index.bplustree.BPlusTreeParams.CheckingNode ;
import static org.apache.jena.tdb.index.bplustree.BPlusTreeParams.CheckingTree ;
import java.util.Iterator ;
import org.apache.jena.atlas.io.IndentedWriter ;
import org.apache.jena.atlas.iterator.Iter ;
import org.apache.jena.tdb.base.block.BlockMgr ;
import org.apache.jena.tdb.base.block.BlockMgrFactory ;
import org.apache.jena.tdb.base.block.BlockMgrTracker ;
import org.apache.jena.tdb.base.record.Record ;
import org.apache.jena.tdb.base.record.RecordFactory ;
import org.apache.jena.tdb.base.recordbuffer.RecordBufferPage ;
import org.apache.jena.tdb.base.recordbuffer.RecordBufferPageMgr ;
import org.apache.jena.tdb.base.recordbuffer.RecordRangeIterator ;
import org.apache.jena.tdb.index.RangeIndex ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
/** B-Tree converted to B+Tree
*
* B-Tree taken from:
* Introduction to Algorithms, Second Edition
* Chapter 18: B-Trees
* by Thomas H. Cormen, Charles E. Leiserson,
* Ronald L. Rivest and Clifford Stein
*
* Includes implementation of removal.
*
* Notes:
* Stores "records", which are a key and value (the value may be null).
*
* In this B+Tree implementation, the (key,value) pairs are held in
* RecordBuffer, which wrap a ByteBuffer that only has records in it.
* BPTreeRecords provides the B+Tree view of a RecordBuffer. All records
* are in RecordBuffer - the "tree" part is an index for finding the right
* page. The tree only holds keys, copies from the (key, value) pairs in
* the RecordBuffers.
*
* Notes:
*
* The version above splits nodes on the way down when full,
* not when needed where a split can bubble up from below.
* It means it only ever walks down the tree on insert.
* Similarly, the delete code ensures a node is suitable
* before descending.
*
* Variations:
* In this impl, splitRoot leaves the root node in place.
* The root is always the same block.
*/
public class BPlusTree implements Iterable, RangeIndex
{
/*
* Insertion:
* There are two styles for handling node splitting.
*
* Classically, when a leaf is split, the separating key is inserted into
* the parent, which may itself be full and so that is split, etc propagting
* up to the root (splitting the root is the only time the depth of the
* BTree increases). This involves walking back up the tree.
*
* It is more convenient to have a spare slot in a tree node, so that the
* new key can be inserted, then the keys and child pointers split.
*
* Modification: during insertion, splitting is applied to any full node
* traversed on the way down, resulting in any node passed through having
* some space for a new key. When splitting starts at a leaf, only the
* immediate parent is changed because it must have space for the new key.
* There is no cascade back to the top of the tree (it would have happened on
* the way down); in other words, splitting is done early. This is insertion
* in a single downward pass.
*
* When compared to the classic approach including the extra slot for
* convenient inserting, the space useage is approximately the same.
*
* Deletion:
* Deletion always occurs at a leaf; if it's an internal node, swap the key
* with the right-most left key (predecessor) or left-most right key (successor),
* and delete in the leaf.
*
* The classic way is to propagate node merging back up from the leaf. The
* book outlines a way that checks that a nod eis delte-suitable (min+1 in size)
* on the way down. This is implemented here; this is one-pass(ish).
*
* Variants:
* http://en.wikipedia.org/wiki/Btree
*
* B+Tree: Tree contains keys, and only the leaves have the values. Used for
* secondary indexes (external pointers) but also for general on-disk usage
* because more keys are packed into a level. Can chain the leaves for a
* sorted-order traversal.
*
* B*Tree: Nodes are always 2/3 full. When a node is full, keys are shared adjacent
* nodes and if all they are all full do 2 nodes get split into 3 nodes.
* Implementation wise, it is more complicated; can cause more I/O.
*
* B#Tree: A B+Tree where the operations try to swap nodes between immediate
* sibling nodes instead of immediately splitting (like delete, only on insert).
*/
private static Logger log = LoggerFactory.getLogger(BPlusTree.class) ;
private int rootIdx = BPlusTreeParams.RootId ;
///*package*/ BPTreeNode root ;
private BPTreeNodeMgr nodeManager ;
private BPTreeRecordsMgr recordsMgr;
private BPlusTreeParams bpTreeParams ;
/** Create the in-memory structures to correspond to
* the supplied block managers for the persistent storage.
* Initialize the persistent storage to the empty B+Tree if it does not exist.
* This is the normal way to create a B+Tree.
*/
public static BPlusTree create(BPlusTreeParams params, BlockMgr blkMgrNodes, BlockMgr blkMgrLeaves)
{
BPlusTree bpt = attach(params, blkMgrNodes, blkMgrLeaves) ;
bpt.createIfAbsent() ;
return bpt ;
}
/** Create the in-memory structures to correspond to
* the supplied block managers for the persistent storage.
* Does not initialize the B+Tree - it assumes the block managers
* correspond to an existing B+Tree.
*/
public static BPlusTree attach(BPlusTreeParams params, BlockMgr blkMgrNodes, BlockMgr blkMgrRecords)
{
return new BPlusTree(params, blkMgrNodes, blkMgrRecords) ;
}
/** (Testing mainly) Make an in-memory B+Tree, with copy-in, copy-out block managers */
public static BPlusTree makeMem(int order, int minRecords, int keyLength, int valueLength)
{ return makeMem(null, order, minRecords, keyLength, valueLength) ; }
/** (Testing mainly) Make an in-memory B+Tree, with copy-in, copy-out block managers */
public static BPlusTree makeMem(String name, int order, int minRecords, int keyLength, int valueLength)
{
BPlusTreeParams params = new BPlusTreeParams(order, keyLength, valueLength) ;
int blkSize ;
if ( minRecords > 0 )
{
int maxRecords = 2*minRecords ;
//int rSize = RecordBufferPage.HEADER+(maxRecords*params.getRecordLength()) ;
blkSize = RecordBufferPage.calcBlockSize(params.getRecordFactory(), maxRecords) ;
}
else
blkSize = params.getCalcBlockSize() ;
BlockMgr mgr1 = BlockMgrFactory.createMem(name+"(nodes)", params.getCalcBlockSize()) ;
BlockMgr mgr2 = BlockMgrFactory.createMem(name+"(records)", blkSize) ;
BPlusTree bpTree = BPlusTree.create(params, mgr1, mgr2) ;
return bpTree ;
}
/** Debugging */
public static BPlusTree addTracking(BPlusTree bpTree)
{
BlockMgr mgr1 = bpTree.getNodeManager().getBlockMgr() ;
BlockMgr mgr2 = bpTree.getRecordsMgr().getBlockMgr() ;
// mgr1 = BlockMgrTracker.track("BPT/Nodes", mgr1) ;
// mgr2 = BlockMgrTracker.track("BPT/Records", mgr2) ;
mgr1 = BlockMgrTracker.track(mgr1) ;
mgr2 = BlockMgrTracker.track(mgr2) ;
return BPlusTree.attach(bpTree.getParams(), mgr1, mgr2) ;
}
private BPlusTree(BPlusTreeParams params, BlockMgr blkMgrNodes, BlockMgr blkMgrRecords)
{
// Consistency checks.
this.bpTreeParams = params ;
this.nodeManager = new BPTreeNodeMgr(this, blkMgrNodes) ;
RecordBufferPageMgr recordPageMgr = new RecordBufferPageMgr(params.getRecordFactory(), blkMgrRecords) ;
recordsMgr = new BPTreeRecordsMgr(this, recordPageMgr) ;
}
/** Create if does not exist */
private void createIfAbsent()
{
// This fixes the root to being block 0
if ( ! nodeManager.valid(BPlusTreeParams.RootId) )
//if ( ! nodeManager.getBlockMgr().isEmpty() )
{
// Create as does not exist.
// [TxTDB:PATCH-UP]
// ** Better: seperate "does it exist? - create statics used in factory"
startUpdateBlkMgr() ;
// Fresh BPlusTree
rootIdx = nodeManager.createEmptyBPT() ;
if ( rootIdx != 0 )
throw new InternalError() ;
if ( CheckingNode )
{
BPTreeNode root = nodeManager.getRead(rootIdx, BPlusTreeParams.RootParent) ;
root.checkNodeDeep() ;
root.release() ;
}
// Sync created blocks to disk - any caches are now clean.
nodeManager.getBlockMgr().sync() ;
recordsMgr.getBlockMgr().sync() ;
// Cache : not currently done - root is null
//setRoot(root) ;
finishUpdateBlkMgr() ;
}
}
private BPTreeNode getRoot()
{
// No caching here.
BPTreeNode root = nodeManager.getRoot(rootIdx) ;
//this.root = root ;
return root ;
}
private void releaseRoot(BPTreeNode rootNode)
{
// // [TxTDB:PATCH-UP]
// if ( root != null )
// {
// root.release() ;
// //nodeManager.release(rootNode) ;
// }
// if ( root != null && rootNode != root )
// log.warn("Root is not root!") ;
rootNode.release() ;
}
private void setRoot(BPTreeNode node)
{
//root = node ;
}
/** Get the parameters describing this B+Tree */
public BPlusTreeParams getParams() { return bpTreeParams ; }
/** Only use for careful manipulation of structures */
public BPTreeNodeMgr getNodeManager() { return nodeManager ; }
/** Only use for careful manipulation of structures */
public BPTreeRecordsMgr getRecordsMgr() { return recordsMgr ; }
@Override
public RecordFactory getRecordFactory()
{
return bpTreeParams.recordFactory ;
}
@Override
public Record find(Record record)
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
Record v = BPTreeNode.search(root, record) ;
releaseRoot(root) ;
finishReadBlkMgr() ;
return v ;
}
@Override
public boolean contains(Record record)
{
Record r = find(record) ;
return r != null ;
}
@Override
public Record minKey()
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
Record r = root.minRecord();
releaseRoot(root) ;
finishReadBlkMgr() ;
return r ;
}
@Override
public Record maxKey()
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
Record r = root.maxRecord() ;
releaseRoot(root) ;
finishReadBlkMgr() ;
return r ;
}
@Override
public boolean add(Record record)
{
return addAndReturnOld(record) == null ;
}
/** Add a record into the B+Tree */
public Record addAndReturnOld(Record record)
{
startUpdateBlkMgr() ;
BPTreeNode root = getRoot() ;
Record r = BPTreeNode.insert(root, record) ;
if ( CheckingTree ) root.checkNodeDeep() ;
releaseRoot(root) ;
finishUpdateBlkMgr() ;
return r ;
}
@Override
public boolean delete(Record record)
{ return deleteAndReturnOld(record) != null ; }
public Record deleteAndReturnOld(Record record)
{
startUpdateBlkMgr() ;
BPTreeNode root = getRoot() ;
Record r = BPTreeNode.delete(root, record) ;
if ( CheckingTree ) root.checkNodeDeep() ;
releaseRoot(root) ;
finishUpdateBlkMgr() ;
return r ;
}
@Override
public Iterator iterator()
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
Iterator iter = iterator(root) ;
releaseRoot(root) ;
finishReadBlkMgr() ;
return iter ;
}
@Override
public Iterator iterator(Record fromRec, Record toRec)
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
Iterator iter = iterator(root, fromRec, toRec) ;
releaseRoot(root) ;
finishReadBlkMgr() ;
// Note that this end the read-part (find the start), not the iteration.
// Iterator read blocks still get handled.
return iter ;
}
/** Iterate over a range of fromRec (inclusive) to toRec (exclusive) */
private static Iterator iterator(BPTreeNode node, Record fromRec, Record toRec)
{
// Look for starting RecordsBufferPage id.
int id = BPTreeNode.recordsPageId(node, fromRec) ;
if ( id < 0 )
return Iter.nullIter() ;
RecordBufferPageMgr pageMgr = node.getBPlusTree().getRecordsMgr().getRecordBufferPageMgr() ;
// No pages are active at this point.
return RecordRangeIterator.iterator(id, fromRec, toRec, pageMgr) ;
}
private static Iterator iterator(BPTreeNode node)
{
return iterator(node, null, null) ;
}
// Internal calls.
private void startReadBlkMgr()
{
nodeManager.startRead() ;
recordsMgr.startRead() ;
}
private void finishReadBlkMgr()
{
nodeManager.finishRead() ;
recordsMgr.finishRead() ;
}
private void startUpdateBlkMgr()
{
nodeManager.startUpdate() ;
recordsMgr.startUpdate() ;
}
private void finishUpdateBlkMgr()
{
nodeManager.finishUpdate() ;
recordsMgr.finishUpdate() ;
}
@Override
public boolean isEmpty()
{
startReadBlkMgr() ;
BPTreeNode root = getRoot() ;
boolean b = ! root.hasAnyKeys() ;
releaseRoot(root) ;
finishReadBlkMgr() ;
return b ;
}
private static int SLICE = 10000 ;
@Override
public void clear() {
Record[] records = new Record[SLICE] ;
while(true) {
Iterator iter = iterator() ;
int i = 0 ;
for ( i = 0 ; i < SLICE ; i++ ) {
if ( ! iter.hasNext() )
break ;
Record r = iter.next() ;
records[i] = r ;
}
if ( i == 0 )
break ;
for ( int j = 0 ; j < i ; j++ ) {
delete(records[j]) ;
records[j] = null ;
}
}
}
@Override
public void sync()
{
if ( nodeManager.getBlockMgr() != null )
nodeManager.getBlockMgr().sync() ;
if ( recordsMgr.getBlockMgr() != null )
recordsMgr.getBlockMgr().sync() ;
}
@Override
public void close()
{
if ( nodeManager.getBlockMgr() != null )
nodeManager.getBlockMgr().close() ;
if ( recordsMgr.getBlockMgr() != null )
recordsMgr.getBlockMgr().close() ;
}
// public void closeIterator(Iterator iter)
// {
// }
@Override
public long size()
{
Iterator iter = iterator() ;
return Iter.count(iter) ;
}
@Override
public void check()
{
getRoot().checkNodeDeep() ;
}
public void dump()
{
getRoot().dump() ;
}
public void dump(IndentedWriter out)
{
getRoot().dump(out) ;
}
}