// com.bigdata.btree.IndexSegmentMultiBlockIterator
package com.bigdata.btree;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.NoSuchElementException;
import org.apache.log4j.Logger;
import com.bigdata.btree.IndexSegment.IndexSegmentTupleCursor;
import com.bigdata.btree.IndexSegment.ImmutableNodeFactory.ImmutableLeaf;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.IBufferAccess;
import com.bigdata.util.BytesUtil;
/**
* A fast iterator based on multi-block IO for the {@link IndexSegment}. This
* iterator is designed for operations which will fully visit either all leaves
* in the {@link IndexSegment} or a key-range corresponding to a substantial
* proportion of those leaves. A direct {@link ByteBuffer} is allocated from the
* caller's {@link DirectBufferPool} and an IO request is issued against the
* {@link IndexSegment} to fill the {@link ByteBuffer} with as many leaves
* spanned by the key-range as will fit into the buffer. The leaves laid out
* contiguously in total key order in the {@link IndexSegment}. The addresses of
* the leaves spanned by a key-range are easily identified by two key probes
* into the nodes, and the nodes region is generally fully buffered. The #of
* leaves spanned by a key range may be estimated as
* (rangeCount/branchingFactor).
*
* During traversal, each leaf is copied into a Java byte[]
in
* order to provide fast decode of the data in the leaf. When the buffered
* leaves have been exhausted, another chunk of leaves will be read using
* another multi-block IO.
*
* You should choose this iterator if: (a) the iterator uses forward traversal
* only; (b) the key-range includes the entire {@link IndexSegment} -or- a probe
* reveals that more than a few leaves would be read; (c) the largest record in
* the {@link IndexSegment} will fit within a buffer acquired from the selected
* {@link DirectBufferPool}; and (d) it is reasonable to expect that the
* iterator will be fully consumed by the caller.
*
* The #of leaves which would be read can be estimated by dividing the range
* count by the branching factor. If there are more than 2 full leaves worth of
* data to be read this iterator will be faster than the linked leaf traversal
* provided by {@link IndexSegmentTupleCursor} since this class will do one IO
* rather than one per leaf.
*
* @author Bryan Thompson
* @version $Id$
* @param
*
* @todo This is just fast forward traversal. We could support cursors based on
* this same model.
*
* FIXME Support compressed leaves (we have to decompress the record
* ourselves since the {@link IndexSegmentStore} is not being used to
* access the individual records).
*/
public class IndexSegmentMultiBlockIterator implements ITupleIterator {
protected static final transient Logger log = Logger
.getLogger(IndexSegmentMultiBlockIterator.class);
/**
* The {@link IndexSegment}.
*/
private final IndexSegment seg;
private final IndexSegmentStore store;
/**
* The pool from which we acquire the buffer and to which we will release
* the buffer.
*/
private final DirectBufferPool pool;
/**
* The buffer.
*/
private volatile IBufferAccess buffer;
/**
* The inclusive lower bound -or- null
if there is no lower
* bound.
*/
private final byte[] fromKey;
/**
* The exclusive upper bound -or- null
if there is no upper
* bound.
*/
private final byte[] toKey;
/*
* Tuple stuff.
*/
/**
* true
iff the iterator is exhausted (the last tuple has been
* read from the last leaf).
*/
private boolean exhausted = false;
/**
* The current {@link Tuple} for the {@link #tupleItr}.
*/
private final Tuple tuple;
/**
* Iterator used to scan each leaf in turn. It is null
if there
* is no {@link #currentLeaf} or if the {@link #currentLeaf} is exhausted.
*/
private LeafTupleIterator tupleItr = null;
/*
* Leaf stuff.
*/
/**
* The address of the first leaf to be read.
*/
private final long firstLeafAddr;
/**
* The address of the last leaf to be read.
*
* Note that the last byte to be read is obtained from
* {@link IndexSegmentStore#getByteCount(long)}
*/
private final long lastLeafAddr;
/**
* The current leaf -or- null
if no leaves have been read. The
* address of the current leaf is available from {@link Leaf#getIdentity()}.
* The address of the next leaf is available from {@link Leaf#getNextAddr()}.
*/
private ImmutableLeaf currentLeaf = null;
/*
* Block stuff.
*/
/**
* The byte offset of the current block in the {@link IndexSegment}.
* Together with the {@link #blockLength}, this is used to determine which
* leaves may be addressed within the block, when we need to read another
* block in order to address a leaf, etc.
*/
private long blockOffset = 0L;
/**
* The byte length of the current block.
*/
private int blockLength = 0;
/*
* Counters
*/
/** The #of leaves read so far. */
private long leafReadCount = 0;
/** The #of blocks read so far. */
private long blockReadCount = 0;
/**
*
* @param seg
* The {@link IndexSegment}.
* @param pool
* The pool from which a direct {@link ByteBuffer} will be
* acquired and into which blocks will be read from the backing
* file.
* @param fromKey
* The inclusive lower bound -or- null
if there is
* no lower bound.
* @param toKey
* The exclusive upper bound -or- null
if there is
* no upper bound.
* @param flags
*/
public IndexSegmentMultiBlockIterator(//
final IndexSegment seg,//
final DirectBufferPool pool,//
final byte[] fromKey,//
final byte[] toKey,//
final int flags) {
if (seg == null)
throw new IllegalArgumentException();
if (pool == null)
throw new IllegalArgumentException();
this.seg = seg;
this.store = seg.getStore();
this.pool = pool;
this.fromKey = fromKey;
this.toKey = toKey;
/*
* Check flags for unsupported options.
*/
if ((flags & IRangeQuery.REVERSE) != 0)
throw new IllegalArgumentException();
if ((flags & IRangeQuery.REMOVEALL) != 0)
throw new IllegalArgumentException();
if ((flags & IRangeQuery.CURSOR) != 0)
throw new IllegalArgumentException();
this.tuple = new Tuple(seg, flags);
this.firstLeafAddr = (fromKey == null ? store.getCheckpoint().addrFirstLeaf
: seg.findLeafAddr(fromKey));
this.lastLeafAddr = (toKey == null ? store.getCheckpoint().addrLastLeaf
: seg.findLeafAddr(toKey));
if (pool.getBufferCapacity() < store.getCheckpoint().maxNodeOrLeafLength) {
/*
* If the buffers in the pool are too small to hold the largest
* record in the index segment then you can not use this iterator.
*
* Note: We presume that the largest record is therefore a leaf. In
* practice this will nearly always be true as nodes have relatively
* little metadata per tuple while leaves store the value associated
* with the tuple.
*
* Note: AbstractBTree checks for this condition before choosing
* this iterator.
*/
throw new UnsupportedOperationException(
"Record is larger than buffer: maxNodeOrLeafLength="
+ store.getCheckpoint().maxNodeOrLeafLength
+ ", bufferCapacity=" + pool.getBufferCapacity());
}
if (firstLeafAddr == 0L) {
// Empty index segment.
exhausted = true;
}
// these are zero since no block has been read yet.
this.blockOffset = 0L;
this.blockLength = 0;
}
/**
* {@inheritDoc}
*
* This is extended to ensure that the buffer is released back to the
* {@link DirectBufferPool}.
*/
protected void finalize() throws Throwable {
releaseBuffer();
super.finalize();
}
private ByteBuffer acquireBuffer() {
if (buffer == null) {
try {
buffer = pool.acquire();
} catch (InterruptedException e) {
// We can not continue if the buffer is not acquired.
throw new RuntimeException(e);
}
}
return buffer.buffer();
}
private void releaseBuffer() {
if (buffer != null) {
try {
buffer.release();
} catch (InterruptedException e) {
// Propagate interrupt.
Thread.currentThread().interrupt();
return;
}
this.buffer = null;
}
}
/**
* Return the current leaf.
*
* @return The current leaf -or- null
iff no leaves have been
* read from the {@link IndexSegment}.
*/
protected ImmutableLeaf getLeaf() {
return currentLeaf;
}
public boolean hasNext() {
return _hasNext();
}
public ITuple next() {
if (exhausted)
throw new NoSuchElementException();
return tupleItr.next();
}
public void remove() {
throw new UnsupportedOperationException();
}
/**
* Return true
iff another tuple is available.
*
* @return
*/
private boolean _hasNext() {
while (!exhausted) {
if (tupleItr != null) {
if (tupleItr.hasNext()) {
// More tuples are available from the current leaf.
return true;
}
// The current leaf is exhausted.
tupleItr = null;
if(log.isTraceEnabled())
log.trace("Current leaf is exhausted.");
}
if ((currentLeaf = nextLeaf()) != null) {
// setup the tuple iterator for the next leaf.
tupleItr = new LeafTupleIterator(currentLeaf, tuple,
fromKey, toKey);
} else {
// done.
exhausted = true;
}
}
if(log.isTraceEnabled())
log.trace("Exhausted.");
// release the buffer back to the pool.
releaseBuffer();
// nothing left.
return false;
}
/**
* Return the next leaf from the {@link #buffer}. If the next leaf is not in
* the buffer, read the next block of leaves from the backing file.
*
* @return The next leaf -or- null
iff there are no more leaves
* to be visited.
*/
private ImmutableLeaf nextLeaf() {
if (exhausted)
throw new IllegalStateException();
if (currentLeaf == null) {
if (log.isTraceEnabled())
log.trace("Reading initial leaf");
// acquire the buffer from the pool.
acquireBuffer();
// Read the first block.
nextBlock(firstLeafAddr, buffer.buffer());
// Extract the first leaf.
final ImmutableLeaf leaf = getLeaf(firstLeafAddr);
// Return the first leaf.
return leaf;
}
if (currentLeaf.identity == lastLeafAddr) {
// No more leaves.
if (log.isTraceEnabled())
log.trace("No more leaves (end of key range)");
return null;
}
/*
* We need to return the next leaf. We get the address of the next leaf
* from the nextAddr field of the current leaf.
*/
final long nextLeafAddr = currentLeaf.getNextAddr();
if (nextLeafAddr == 0L) {
// No more leaves.
if (log.isTraceEnabled())
log.trace("No more leaves (end of segment)");
return null;
}
/*
* Figure out if the leaf is in the current buffer/block.
*/
{
final long offset = store.getOffset(nextLeafAddr);
final int nbytes = store.getByteCount(nextLeafAddr);
if (offset < blockOffset) {
// going backwards in the file.
throw new AssertionError();
}
if (offset + nbytes > blockOffset + blockLength) {
// read the next block.
nextBlock(nextLeafAddr, buffer.buffer());
}
}
// extract the next leaf.
final ImmutableLeaf leaf = getLeaf(nextLeafAddr);
// return the current leaf.
return leaf;
}
/**
* Read a leaf from the {@link #buffer}.
*
* @param addr
* The address of the leaf.
*
* @return The leaf and never null
.
*
* @throws IllegalArgumentException
* if the leaf does not lie entirely within the buffer.
*/
private ImmutableLeaf getLeaf(final long addr) {
final long offset = store.getOffset(addr);
final int nbytes = store.getByteCount(addr);
if (offset < blockOffset)
throw new IllegalArgumentException();
if (offset + nbytes > blockOffset + blockLength)
throw new IllegalArgumentException();
// offset into the buffer.
final int offsetWithinBuffer = (int)(offset - blockOffset);
// read only view of the leaf in the buffer.
final ByteBuffer tmp = buffer.buffer().asReadOnlyBuffer();
tmp.limit(offsetWithinBuffer + nbytes);
tmp.position(offsetWithinBuffer);
// decode byte[] as ILeafData.
final ILeafData data = (ILeafData) seg.nodeSer.decode(tmp);
leafReadCount++;
if (log.isTraceEnabled())
log
.trace("read leaf: leafReadCount=" + leafReadCount
+ ", addr=" + addr + "(" + store.toString(addr)
+ "), blockOffset=" + blockOffset
+ " offsetWithinBuffer=" + offsetWithinBuffer);
// return as Leaf.
return new ImmutableLeaf(seg, addr, data);
}
/**
* Read as many leaves from the backing from into the buffer as will fit
* using a multi-block IO.
*
* Note: This implementation ensures that at least one full leaf will be
* read into the block buffer. However, it does not guard against a partial
* read of the last leaf within the buffer. In order to guarantee that we
* only read complete leaves we would have to traverse the nodes of the
* {@link IndexSegment} and locate the largest leaf address which could be
* fully read. Rather than doing that, this allows a partial read of the
* last leaf but the logic in {@link #nextLeaf()} checks to see whether a
* leaf lies fully in the buffer and, if not, then demands the next block of
* leaves starting with the address of the next leaf to be read. The
* additional IO cost of a partial leaf read is trivial since this is
* multi-block IO and we are operating at the disk transfer rate.
*
* @param leafAddr
* The address of the leaf at which the block will start.
* @param b
* The buffer into which the data will be read.
*/
private void nextBlock(final long leafAddr, final ByteBuffer b) {
final int minSize = store.getByteCount(leafAddr);
if (minSize > b.capacity()) {
/*
* Note: This condition is checked by the constructor so you should
* not see this error thrown from here.
*/
throw new UnsupportedOperationException(
"Leaf is larger than buffer: leafSize=" + minSize
+ ", bufferCapacity=" + b.capacity());
}
// the offset of the first byte we will read.
final long startOffset = store.getOffset(leafAddr);
// the offset of the last byte we are allowed to read.
final long lastOffset = store.getOffset(lastLeafAddr)
+ store.getByteCount(lastLeafAddr);
// the #of bytes that we will actually read.
final int nbytes = (int) Math.min(lastOffset - startOffset, b
.capacity());
if(log.isTraceEnabled())
log.trace("leafAddr=" + store.toString(leafAddr) + ", startOffset="
+ startOffset + ", lastOffset=" + lastOffset + ", nbytes="
+ nbytes);
if (nbytes == 0) {
throw new AssertionError("nbytes=0 : leafAddr"
+ store.toString(leafAddr) + " : " + this);
}
// set the position to zero.
b.position(0);
// set the limit to the #of bytes to be read.
b.limit(nbytes);
// read the data from the file.
try {
store.readFromFile(startOffset, b);
} catch (IOException ex) {
throw new RuntimeException(ex);
}
// update the offset/length in the store for the in memory block
blockOffset = startOffset;
blockLength = nbytes;
blockReadCount++;
if (log.isTraceEnabled())
log.trace("read block: blockReadCount=" + blockReadCount
+ ", leafAddr=" + store.toString(leafAddr)
+ ", blockOffset=" + blockOffset + ", blockLength="
+ blockLength);
}
public String toString() {
return super.toString() + //
"{file=" + store.getFile() + //
",checkpoint="+store.getCheckpoint()+//
",fromKey="+BytesUtil.toString(fromKey)+//
",toKey="+BytesUtil.toString(toKey)+//
",firstLeafAddr=" + store.toString(firstLeafAddr) + //
",lastLeafAddr=" + store.toString(lastLeafAddr) + //
",currentLeaf=" + (currentLeaf!=null?store.toString(currentLeaf.identity):"N/A") + //
",blockOffset="+blockOffset+//
",blockLength="+blockLength+//
",bufferCapacity="+pool.getBufferCapacity()+//
",leafReadCount="+leafReadCount+//
",blockReadCount="+blockReadCount+//
"}";
}
}