// com.bigdata.btree.IndexSegmentBuilder (Maven / Gradle / Ivy)
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Dec 5, 2006
*/
package com.bigdata.btree;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.text.NumberFormat;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import com.bigdata.btree.data.IAbstractNodeData;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.data.INodeData;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.MutableKeyBuffer;
import com.bigdata.btree.raba.MutableValueBuffer;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.ChecksumUtility;
import com.bigdata.io.DataInputBuffer;
import com.bigdata.io.FileChannelUtility;
import com.bigdata.io.NOPReopener;
import com.bigdata.io.SerializerUtil;
import com.bigdata.io.writecache.WriteCache;
import com.bigdata.journal.Journal;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.TemporaryRawStore;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.IAddressManager;
import com.bigdata.rawstore.IBlock;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rawstore.WormAddressManager;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
* Builds an {@link IndexSegment} given a source btree and a target branching
* factor. There are two main use cases:
*
*
* - Evicting a key range of an index into an optimized on-disk index. In this
* case, the input is a {@link BTree} that is ideally backed by a fully buffered
* {@link IRawStore} so that no random reads are required.
*
* - Merging index segments. In this case, the input is typically records
* emerging from a merge-sort. There are two distinct cases here. In one, we
* simply have raw records that are being merged into an index. This might occur
* when merging two key ranges or when external data are being loaded. In the
* other case we are processing two time-stamped versions of an overlapping key
* range. In this case, the more recent version may have "delete" markers
* indicating that a key present in an older version has been deleted in the
* newer version. Also, key-value entries in the newer version replace (rather
* than are merged with) key-value entries in the older version. If an entry
* history policy is defined, then it must be applied here to cause key-value
* entries whose retention is no longer required by that policy to be dropped.
*
*
*
* One pass vs. Two Pass Design Alternatives
*
* There are at least three design alternatives for index segment builds: (A) do
* an exact range count instead and generate a perfect plan; (B) fully buffer
* the source iterator into byte[][] keys, byte[][] vals, boolean[]
* deleteMarkers, and long[] versionTimestamps and generate an exact plan,
* consuming the buffered byte[]s directly from RAM; and (C) use the fast range
* count to generate a plan based on an overestimate of the tuple count and then
* apply a variety of hacks when the source iterator is exhausted to make the
* output B+Tree usable, but not well formed.
*
* The disadvantage of (A) is that it requires two passes over the source view,
* which substantially increases the run time of the algorithm. In addition, the
* passes can drive evictions in the global LRU and could defeat caching for a
* view approaching the nominal size for a split. However, with (A) we can do
* builds for very large source B+Trees. Therefore, (A) is implemented for such
* use cases.
*
* The disadvantage of (B) is that it requires more memory. However, it is much
* faster than (A). To compensate for the increased memory demand, we can single
* thread builds, merges, and splits and fall back to (A) if memory is very
* tight or the source view is very large.
*
* The disadvantage of (C) is that the "hacks" break encapsulation and leak into
* the API where operations such as retrieving the right sibling of a node could
* return an empty leaf (since we ran out of tuples for the plan). Since these
* "hacks" would break encapsulation, it would be difficult to have confidence
* that the B+Tree API was fully insulated against the effects of ill-formed
* {@link IndexSegment}s. Therefore, I have discarded this approach and backed
* out changes designed to support it from the code base.
*
*
* Design alternatives for totally ordered nodes and leaves
*
* In order for the nodes to be written in a contiguous block we either have to
* buffer them in memory or have to write them onto a temporary file and then
* copy them into place after the last leaf has been processed. This concern was
* not present in West's algorithm because it did not attempt to place the nodes
* and/or leaves contiguously onto the generated B+Tree file.
*
* For the two pass design described above as option (A), the code buffers the
* nodes and leaves onto {@link TemporaryRawStore}s. This approach is scalable,
* which is the concern of (A), but requires at least twice the IO when compared
* to directly writing the nodes and leaves onto the output file.
*
* When sufficient memory is available, as in cases where (B) would apply, we can
* write the leaves directly on the backing file (using double-buffering to
* update the prior/next addrs). Since there are far fewer nodes than leaves, we
* can buffer the nodes in memory, writing them once the leaves are finished.
*
* @author Bryan Thompson
* @version $Id: IndexSegmentBuilder.java 2265 2009-10-26 12:51:06Z thompsonbry
* $
*
* @see "Post-order B-Tree Construction" by Lawrence West, ACM 1992. Note that
* West's algorithm is for a b-tree (values are stored on internal nodes as
* well as leaves), not a b+-tree (values are stored only on the leaves).
* Our implementation is therefore an adaptation.
*
* @see "Batch-Construction of B+-Trees" by Kim and Won, ACM 2001. The approach
* outlined by Kim and Won is designed for B+-Trees, but it appears to be
* less efficient on first glance.
*
* @see IndexSegment
* @see IndexSegmentFile
* @see IndexSegmentCheckpoint
*
* @todo Put profiler on (B) for build stress tests. The source view should be
* pre-generated such that we only measure the build behavior in the
* profiler. See {@link TestIndexSegmentBuilderWithLargeTrees}. [We need
* to first remove the synchronization for disk reads on the journal so we
* have the maximum possible IO rate, and possibly materialize the leaves
* of the source view in parallel using pre-fetch.]
*
* Make sure that {@link #elapsed} reports the total build time, including
* the range count or pre-materialization costs as those are a significant
* part of the total cost.
*
* Much of the cost of the build is the range iterator, including decoding
* the nodes and leaves and materializing tuples. Most of the remaining
* cost is the coding of the new nodes and leaves and their IO. Some of
* that cost can be trimmed by faster coders, but we can also trim the IO
* using parallel materialization of leaves (for {@link IndexSegment}) and
* pre-fetch of nodes and leaves (for {@link BTree}s).
*
* The B+Tree iterator on the fused view of the journal and index segments
* is effectively an incremental merge of the source iterators. However,
* the iterator has a [capacity] parameter which is a hint for
* materialization. We could do this as a parallel merge sort of the
* leaves spanned of the key range with a restriction for the capacity.
* That could also be done on a GPU. The same iterator is used as the
* source for a compacting merge. If we tunnel the representation to
* something like the byte[][] keys, byte[][] vals, boolean[]
* deleteMarkers, long[] revisionTimestamps, then we could do a merge sort
* of the leaves from the (ordered) view and then do a parallel build
* segment from the backing representation. We could do that build step in
* java or on a GPU.
*
* @todo GPU parallel builds. Once we have the data in a bunch of byte[][]s we
* could conceivably get this stuff organized to the point where each
* tuple and child in the output B+Tree could be assigned to a thread and
* each leaf and node to a core on a GPU. At that point, we run the
* "build" in parallel. For this approach, the uncoded leaves will all
* wind up in memory using the same byte[]s for keys and values as the
* source view.
*
* This approach could be carried further to code the nodes and leaves in
* parallel, perhaps as a 2nd GPU program. If the source view was fully
* buffered, then it should be released once the nodes and leaves have
* been coded as no more use will be made of those data.
*
* While it may not improve the IO substantially, it is possible to use
* gathered writes if the leaves and nodes are fully buffered in memory.
* For the leaves, we write them in the index order. For the nodes, we
* write them in their pre-order traversal. The order in which we would
* write out the nodes and leaves should be part of the state during a GPU
* build regardless of whether the IO is done sequentially or as a
* gathered write.
*
* @todo Make sure it is possible to grab the {@link IndexMetadata} and the
* bloom filter from the generated file in a single IO. This could be
* useful when the index segment files are stored on a parallel file
* system. [It is possible to do this since these data are contiguous and
* in the same region of the generated file (they both use addresses
* relative to the BASE of the file).]
*/
public class IndexSegmentBuilder implements Callable {
/**
* Logger.
*/
private static final Logger log = Logger
.getLogger(IndexSegmentBuilder.class);
/**
 * Error message when the #of tuples in the {@link IndexSegment} would
 * exceed {@link Integer#MAX_VALUE}.
 *
 * Note: This is not an inherent limit in the {@link IndexSegment} but
 * rather a limit in the {@link IndexSegmentPlan} (and perhaps the
 * {@link IndexSegmentBuilder}) which presumes that the entry count is an
 * {@code int} rather than a {@code long}.
 */
protected static final String ERR_TOO_MANY_TUPLES = "Too many tuples";
/**
* Message when the index segment will be empty.
*/
protected static final String ERR_NO_TUPLES = "No tuples";
/**
* The file mode used to open the file on which the {@link IndexSegment} is
* written.
*/
final String mode = "rw"; // also rws or rwd
/**
* The file specified by the caller on which the {@link IndexSegment} is
* written.
*/
public final File outFile;
/**
* The value specified to the ctor.
*/
final public long entryCount;
/**
* The iterator specified to the ctor. This is the source for the keys and
* values that will be written onto the generated {@link IndexSegment}.
*/
final private ITupleIterator> entryIterator;
/**
* The commit time associated with the view from which the
* {@link IndexSegment} is being generated (from the ctor). This value is
* written into {@link IndexSegmentCheckpoint#commitTime}.
*/
final public long commitTime;
/**
 * {@code true} iff the generated {@link IndexSegment} will incorporate all
 * state for the source index (partition) as of the specified commitTime.
 *
 * Note: This flag is written into the {@link IndexSegmentCheckpoint} but it
 * has no other effect on the build process.
 */
final public boolean compactingMerge;
/**
 * The name of the index or index partition for which the build is being
 * performed.
 */
final String name;
/**
 * A copy of the metadata object provided to the ctor. This object is
 * further modified before being written on the
 * {@link IndexSegmentStore}.
 */
final public IndexMetadata metadata;
/**
 * {@code true} iff the source index is isolatable (supports both
 * deletion markers and version timestamps).
 */
final boolean isolatable;
/**
 * {@code true} iff the source index has delete markers enabled.
 *
 * Note: delete markers are ONLY copied for an incremental build (when
 * {@link #compactingMerge} is {@code false}).
 */
final boolean deleteMarkers;
/**
 * {@code true} iff the source index has tuple revision timestamps
 * enabled.
 */
final boolean versionTimestamps;
/**
 * {@code true} iff the source index has raw records enabled.
 *
 * Note: raw records will be copied into the BLOBS region of the index
 * segment and the address of the raw record in the output tuple will be
 * updated to reflect the relative address of the record within the index
 * segment.
 */
final boolean rawRecords;
/**
 * A buffer used to encode a raw record address for a mutable {@link BTree}
 * and otherwise {@code null}.
 */
private final ByteArrayBuffer recordAddrBuf;
/**
* The unique identifier for the generated {@link IndexSegment} resource.
*/
final public UUID segmentUUID;
/**
 * The cache for the generated {@link IndexSegmentStore}. When
 * non-{@code null} the generated {@link INodeData} objects will be placed
 * into the cache, which is backed by a shared LRU. This helps to reduce
 * latency when an index partition build or merge operation finishes and the
 * index partition view is updated since the data will already be present in
 * the cache. Generating the index segment will drive evictions from the
 * shared LRU, but those will be the least recently used records and the new
 * {@link IndexSegmentStore} is often hot as soon as it is generated.
 *
 * Note: If the build fails, then the cache will be cleared.
 *
 * @todo The {@link IndexMetadata} and the {@link BloomFilter} should be in
 *       the {@link #storeCache} as well. Make sure that we do this for read
 *       and write for both the {@link BTree} and the {@link IndexSegment}.
 */
/*
 * The storeCache field is marked as "Deprecated" but it should stick around
 * for a while since we might wind up reusing this feature on an index local
 * basis at some point.
 *
 * @see BLZG-1501 (remove LRUNexus)
 */
@Deprecated
final private ConcurrentMap storeCache;
/**
* Used to serialize the nodes and leaves of the output tree.
*/
final private NodeSerializer nodeSer;
/**
* Note: The offset bits on the {@link IndexSegmentFileStore} does NOT
* have to agree with the offset bits on the source store. However, it
* must be large enough to handle the large branching factors typically
* associated with an {@link IndexSegment} vs a {@link BTree}. Further,
* if blobs are to be copied into the index segment then it generally
* must be large enough for those blobs (up to 64M per record).
*
* Note: The same #of offset bits MUST be used by the temporary stores
* that we use to buffer nodes, leaves, and blobs as are used by the
* generated index segment!
*/
final int offsetBits = WormAddressManager.SCALE_OUT_OFFSET_BITS;
/**
* The {@link IAddressManager} used to form addresses for the generated
* file. Addresses are formed from a byteCount and an encoded
* offset comprised of a relative offset into a known region and the region
* identifier.
*
* @see IndexSegmentRegion
* @see IndexSegmentAddressManager
*/
final private WormAddressManager addressManager;
/**
* The bloom filter iff we build one (errorRate != 0.0).
*/
final IBloomFilter bloomFilter;
/**
 * When {@code true} record level checksums will be used in the
 * generated file. This is currently hard-wired to {@code false}.
 *
 * FIXME This can not be enabled until we factor out the direct use of the
 * {@link WriteCache} since special handling is otherwise required to ensure
 * that the checksum makes it into the output record when we write directly
 * on the disk.
 *
 * FIXME When enabling this, make sure that the bloom filter,
 * {@link IndexMetadata}, and the blobs are all checksummed and make sure
 * that the {@link IndexSegmentStore} verifies the checksums when it reads
 * through to the disk and only returns the raw record w/o the trailing
 * checksum.
 *
 * FIXME The right time to reconcile these things may be when this branch
 * (HAJournal) is merged with the dynamic shard refactor branch.
 */
final private boolean useChecksums = false;
/**
 * Used to compute record level checksums when {@link #useChecksums} is
 * {@code true}.
 */
final private ChecksumUtility checker = new ChecksumUtility();
/**
* The file on which the {@link IndexSegment} is written. The file is closed
* regardless of the outcome of the operation.
*/
protected RandomAccessFile out = null;
/**
* The {@link IndexSegmentCheckpoint} record written on the
* {@link IndexSegmentStore}.
*/
private IndexSegmentCheckpoint checkpoint;
/**
 * Returns the {@link IndexSegmentCheckpoint} record written on the
 * {@link IndexSegmentStore}.
 *
 * @return The current value of the {@link #checkpoint} field.
 */
public IndexSegmentCheckpoint getCheckpoint() {
    return this.checkpoint;
}
// /**
// * The buffer used to hold leaves so that they can be evicted en mass onto a
// * region of the {@link #outFile}.
// *
// * @deprecated This forces us to do IO twice for the leaves. They should be
// * explicitly double-buffered in memory (the last leaf and the
// * current leaf) and evicted directly onto {@link #out}. This
// * will remove the requirement for the {@link IUpdateStore} API
// * on the {@link TemporaryRawStore} and on the
// * {@link DiskOnlyStrategy}. A r/w store version of the
// * {@link TemporaryRawStore} could be deployed which supports
// * update if that becomes important.
// */
// private TemporaryRawStore leafBuffer;
/**
* This is used to buffer the leaves written onto the output file for
* greater efficiency.
*
* FIXME Use a WriteCacheService which will hide this complexity and give
* better throughput.
*/
private WriteCache.FileChannelWriteCache leafWriteCache;
/**
 * Immutable pairing of the address at which a node is written onto the
 * output file (relative to the start of the nodes region) with the coded
 * data record for that node.
 *
 * @author Bryan Thompson
 * @version $Id$
 */
private static class NodeMetadata {

    /**
     * The address at which the node is written onto the output file,
     * relative to the start of the nodes region.
     */
    public final long addr;

    /**
     * The coded data record for the node.
     */
    public final INodeData data;

    /**
     * @param addr
     *            The address of the node (relative to the start of the
     *            nodes region).
     * @param data
     *            The coded data record for the node.
     */
    public NodeMetadata(final long addr, final INodeData data) {
        this.data = data;
        this.addr = addr;
    }

}
/**
 * The buffer used to hold nodes so that they can be evicted en masse onto a
 * region of the {@link #outFile}. This is conditionally enabled depending
 * on whether {@link #bufferNodes} is true (per the documentation of that
 * field, the nodes are buffered on this store when it is {@code false}).
 */
private TemporaryRawStore nodeBuffer;
/**
 * When the nodes are to be fully buffered they are added into this list in
 * the order in which they are generated.
 */
private List nodeList;
/**
 * When {@code true} the generated nodes will be fully buffered in RAM.
 * Otherwise they will be buffered on the {@link #nodeBuffer} and then
 * transferred to the output file en masse.
 */
final protected boolean bufferNodes;
/**
 * The optional buffer used to hold records referenced by index entries.
 * This is opened if the index uses raw records -or- if the index specifies
 * an {@link IOverflowHandler}.
 */
private TemporaryRawStore blobBuffer;
// NOTE(review): presumably the IOverflowHandler specified by the index (see
// blobBuffer above), or null when there is none -- confirm against the ctor.
private final IOverflowHandler overflowHandler;
/**
* The encoded address of the first leaf written on the
* {@link IndexSegmentStore} (there is always at least one, even if it is
* the root leaf).
*
* Note: A copy of this value is preserved by
* {@link IndexSegmentCheckpoint#addrFirstLeaf}.
*/
private long addrFirstLeaf = 0L;
/**
* The encoded address of the last leaf written on the
* {@link IndexSegmentStore} (there is always at least one, even if it is
* the root leaf).
*
* Note: A copy of this value is preserved by
* {@link IndexSegmentCheckpoint#addrLastLeaf}.
*/
private long addrLastLeaf = 0L;
// /**
// * The offset in the output file of the last leaf written onto that file.
// * Together with {@link #lastLeafSize} this is used to compute the
// * address of the prior leaf.
// */
// long lastLeafOffset = -1L;
//
// /**
// * The size in bytes of the last leaf written onto the output file (the size
// * of the compressed record that is actually written onto the output file
// * NOT the size of the serialized leaf before it is compressed). Together
// * with {@link #lastLeafOffset} this is used to compute the address of the
// * prior leaf.
// */
// int lastLeafSize = -1;
/**
* Tracks the maximum length of any serialized node or leaf. This is used
* to fill in one of the {@link IndexSegmentCheckpoint} fields.
*/
int maxNodeOrLeafLength = 0;
/**
* The #of tuples written for the output tree.
*/
long ntuplesWritten;
/**
* The #of nodes written for the output tree. This will be zero if all
* entries fit into a root leaf.
*/
int nnodesWritten = 0;
/**
* The #of leaves written for the output tree.
*/
int nleavesWritten = 0;
/**
* The #of nodes or leaves that have been written out in each level of the
* tree.
*
* @see IndexSegmentPlan#numInLevel
*/
final int writtenInLevel[];
/**
* The stack of nodes that are currently being populated. The first N-1
* elements in this array are always nodes while the last element is always
* a leaf ({@link #leaf} is the same reference as the last element in this
* array). The nodes and the leaf in this array are reused rather than being
* reallocated.
*/
final AbstractSimpleNodeData[] stack;
/**
* The current leaf that is being populated from the source btree. This leaf
* is reused for each output leaf rather than being reallocated. In the
* degenerate case when the output btree is a single root leaf then this
* will be that leaf. This reference is always the same as the last
* reference in {@link #stack}.
*/
final SimpleLeafData leaf;
/**
* The plan for building the B+-Tree.
*/
final public IndexSegmentPlan plan;
/**
* The timestamp in milliseconds when {@link #call()} was invoked.
*/
private long begin_build;
/**
 * Reports when the build was started.
 *
 * @return The timestamp in milliseconds when {@link #call()} was invoked
 *         -or- ZERO (0L) if {@link #call()} has not been invoked.
 */
public long getStartTime() {
    return this.begin_build;
}
/**
* The time to setup the index build, including the generation of the index
* plan and the initialization of some helper objects.
*/
public final long elapsed_setup;
/**
* The time to write the nodes and leaves into their respective buffers, not
* including the time to transfer those buffered onto the output file.
*/
public long elapsed_build;
/**
* The time to write the nodes and leaves from their respective buffers
* onto the output file and synch and close that output file.
*/
public long elapsed_write;
/**
* The process runtime in milliseconds.
*/
public long elapsed;
/**
* The data throughput rate in megabytes per second.
*/
public float mbPerSec;
/**
 * Builder factory which sets up the build of an {@link IndexSegment} from an
 * index (partition). Delete markers are propagated to the
 * {@link IndexSegment} unless {@code compactingMerge} is {@code true}.
 * <p>
 * Note: The fully buffered (one pass) build is currently disabled here, so
 * the request is always delegated to the two pass build.
 *
 * @param src
 *            A view of the index partition as of the {@code createTime}.
 *            When {@code compactingMerge} is {@code false} then this MUST
 *            be a single {@link BTree} since incremental builds are only
 *            supported for a {@link BTree} source while compacting merges
 *            are defined for any {@link IIndex}.
 * @param outFile
 *            The file on which the {@link IndexSegment} will be written.
 *            The file MAY exist, but if it exists then it MUST be empty.
 * @param tmpDir
 *            A directory which is handed through to the selected build
 *            variant (presumably where temporary stores are created --
 *            confirm against the builder ctor). MUST NOT be {@code null}.
 * @param compactingMerge
 *            When {@code true} the caller asserts that {@code src} is a
 *            {@link FusedView} and deleted index entries WILL NOT be
 *            included in the generated {@link IndexSegment}. Otherwise, it
 *            is assumed that only select component(s) of the index
 *            partition view are being exported onto an {@link IndexSegment}
 *            and deleted index entries will therefore be propagated to the
 *            new {@link IndexSegment} (aka an incremental build).
 * @param createTime
 *            The commit time associated with the view from which the
 *            {@link IndexSegment} is being generated. This value is written
 *            into {@link IndexSegmentCheckpoint#commitTime}. MUST be
 *            positive.
 * @param fromKey
 *            The lowest key that will be included (inclusive). When
 *            {@code null} there is no lower bound.
 * @param toKey
 *            The first key that will NOT be included (exclusive upper
 *            bound). When {@code null} there is no upper bound.
 *
 * @return An object which can be used to construct the {@link IndexSegment}.
 *
 * @throws IllegalArgumentException
 *             if {@code src}, {@code outFile} or {@code tmpDir} is
 *             {@code null} or if {@code createTime} is not positive.
 * @throws IOException
 */
public static IndexSegmentBuilder newInstance(
        final ILocalBTreeView src, final File outFile, final File tmpDir,
        final boolean compactingMerge, final long createTime,
        final byte[] fromKey, final byte[] toKey) throws IOException {

    // Reject missing arguments and non-positive commit times up front.
    if (src == null || outFile == null || tmpDir == null
            || createTime <= 0L) {
        throw new IllegalArgumentException();
    }

    // The branching factor for the generated index segment.
    final int branchingFactor = src.getIndexMetadata()
            .getIndexSegmentBranchingFactor();

    // Fast range count; this can overestimate the #of tuples in the view.
    final long estimatedTupleCount = src.rangeCount(fromKey, toKey);

    // A fast summary of the view.
    final ViewStatistics viewStats = new ViewStatistics(src);

    // 2x the nominal size of an index shard (200M).
    final long maxSizeOnDisk = Bytes.megabyte * 200 * 2;

    // ~2x the nominal size of a 200M index shard in tuples at 50 bytes/tuple.
    final long maxTuplesInView = Bytes.megabyte * 8;

    /*
     * FIXME The fully buffered build is temporarily disabled (note the
     * leading [false]) as it appeared to be slower to fully buffer the data
     * on the test cluster. The problem is likely to be Java heap pressure.
     * The one pass approach might have to use the MemoryManager in order to
     * realize the efficiency obtained from a single IO pass, especially
     * since the two pass approach is already benefiting from the file
     * system cache.
     */
    final boolean fullyBuffer = false
            && viewStats.sumSegBytes < maxSizeOnDisk
            && estimatedTupleCount < maxTuplesInView;

    if (!fullyBuffer) {

        /*
         * Either there is so much data that we can not materialize it into
         * RAM or the one pass build is disabled (see above).
         */
        return newInstanceTwoPass(src, outFile, tmpDir, branchingFactor,
                compactingMerge, createTime, fromKey, toKey,
                false/* bufferNodes */);

    }

    /*
     * Fully buffer the tuples in RAM, computing the exact range count as we
     * go. This avoids a 2nd pass over the source view to read the tuples.
     */
    return newInstanceFullyBuffered(src, outFile, tmpDir, branchingFactor,
            compactingMerge, createTime, fromKey, toKey,
            true/* bufferNodes */);

}
/**
* A two pass build algorithm. The first pass is used to obtain an exact
* entry count for the view. Based on that exact range count we can compute
* a plan for a balanced B+Tree. A second pass over the view is required to
* populate the output B+Tree. This flavor also buffers the leaves and nodes
* on temporary stores, which means that it does more IO. However, this
* version is capable of processing very large source views.
*/
protected static IndexSegmentBuilder newInstanceTwoPass(
final ILocalBTreeView src, final File outFile, final File tmpDir,
final int m, final boolean compactingMerge, final long createTime,
final byte[] fromKey, final byte[] toKey, final boolean bufferNodes)
throws IOException {
if (src == null)
throw new IllegalArgumentException();
if (outFile == null)
throw new IllegalArgumentException();
if (tmpDir == null)
throw new IllegalArgumentException();
if (createTime <= 0L)
throw new IllegalArgumentException();
// the exact range count.
final int nentries;
// the flags that will be used to obtain the desired tuples.
final int flags;
if (compactingMerge) {
/*
* For a compacting merge the delete markers are ignored so they
* will NOT be transferred to the new index segment.
*/
flags = IRangeQuery.DEFAULT;
final long n = src.rangeCountExact(fromKey, toKey);
if (n > Integer.MAX_VALUE) {
throw new UnsupportedOperationException(ERR_TOO_MANY_TUPLES);
}
nentries = (int) n;
} else {
/*
* For an incremental build the deleted tuples are propagated to
* the new index segment. This is required in order for the fact
* that those tuples were deleted as of the commitTime to be
* retained by the generated index segment.
*/
flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED;
final long n = src.rangeCountExactWithDeleted(fromKey, toKey);
if (n > Integer.MAX_VALUE) {
throw new UnsupportedOperationException(ERR_TOO_MANY_TUPLES);
}
nentries = (int) n;
}
/*
* Iterator reading the source tuples to be copied to the index
* segment.
*
* Note: The DELETED flag was set above unless this is a compacting
* merge. That is necessary to ensure that deleted tuples are
* preserved when the index segment does not reflect the total
* history of a view.
*/
// source iterator.
final ITupleIterator> itr = src.rangeIterator(fromKey, toKey,
0/* capacity */, flags, null/* filter */);
// metadata for that index / index partition.
final IndexMetadata indexMetadata = src.getIndexMetadata();
// Setup the index segment build operation.
return IndexSegmentBuilder.newInstance(//
outFile, //
tmpDir, //
nentries, // exact range count
itr, // source iterator
m, // the output branching factor.
indexMetadata,//
createTime,//
compactingMerge,//
bufferNodes//
);
}
/**
* A one pass algorithm which materializes the tuples in RAM, computing the
* exact tuple count as it goes. This is faster than the two-pass algorithm
* and is a better choice when the source view and the output index segment
* are within the normal ranges for an index partition, e.g., an output
* index segment file of ~200M on the disk.
*
* FIXME The unit tests need to run against both builds based on the
* materialized tuples and builds based on two passes in order to obtain the
* exact range count. They already do for
* {@link TestIndexSegmentBuilderWithLargeTrees} but not yet for the other
* test suite variants.
*/
protected static IndexSegmentBuilder newInstanceFullyBuffered(
final ILocalBTreeView src, final File outFile, final File tmpDir,
final int m, final boolean compactingMerge, final long createTime,
final byte[] fromKey, final byte[] toKey, final boolean bufferNodes)
throws IOException {
if (src == null)
throw new IllegalArgumentException();
if (outFile == null)
throw new IllegalArgumentException();
if (tmpDir == null)
throw new IllegalArgumentException();
if (createTime <= 0L)
throw new IllegalArgumentException();
// metadata for that index / index partition.
final IndexMetadata indexMetadata = src.getIndexMetadata();
final long fastRangeCount = src.rangeCount(fromKey, toKey);
/*
* If the fast range count and the size on the disk of the segments in
* the view are reasonable, then eagerly materialize the tuples into an
* IRaba[] dimensioned to the fast range count and then wrap the data
* with an iterator and run the normal build.
*/
final boolean hasVersionTimestamps = indexMetadata
.getVersionTimestamps();
final boolean hasDeleteMarkers = indexMetadata.getDeleteMarkers();
final boolean hasRawRecords = indexMetadata.getRawRecords();
// A temporary leaf used to buffer the data in RAM.
final MutableLeafData tleaf = new MutableLeafData((int) fastRangeCount,
hasVersionTimestamps, hasDeleteMarkers, hasRawRecords);
final int flags;
if (compactingMerge) {
/*
* For a compacting merge the delete markers are ignored so they
* will NOT be transferred to the new index segment.
*/
flags = IRangeQuery.DEFAULT;
} else {
/*
* For an incremental build the deleted tuples are propagated to the
* new index segment. This is required in order for the fact that
* those tuples were deleted as of the commitTime to be retained by
* the generated index segment.
*/
flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED;
}
/*
* Iterator reading the source tuples to be copied to the index segment.
*
* Note: The DELETED flag was set above unless this is a compacting
* merge. That is necessary to ensure that deleted tuples are preserved
* when the index segment does not reflect the total history of a view.
*
* The tuples are materialized and buffered in a single, and potentially
* very large, leaf. That is Ok since the MutableLeaf is using very
* simple data structures.
*
* @todo The fastRangeCount is a hint that we want to eagerly
* materialize all of the data. This hint should be turned into
* pre-fetch and into a single IO for the index segment leaves if they
* are not in memory. [In fact, the hint is completely ignored at this
* point. If hints get more weight, then review code for their use.]
*/
final ITupleIterator> titr = src.rangeIterator(fromKey, toKey,
(int) fastRangeCount/* capacity */, flags, null/* filter */);
int i = 0;
// init per API specification.
long minimumVersionTimestamp = Long.MAX_VALUE;
long maximumVersionTimestamp = Long.MIN_VALUE;
while (titr.hasNext()) {
final ITuple> tuple = titr.next();
tleaf.keys.keys[i] = tuple.getKey();
if (hasVersionTimestamps) {
final long t = tuple.getVersionTimestamp();
tleaf.versionTimestamps[i] = t;
if (t < minimumVersionTimestamp) {
minimumVersionTimestamp = t;
}
if (t > maximumVersionTimestamp) {
maximumVersionTimestamp = t;
}
}
if (hasDeleteMarkers && tuple.isDeletedVersion()) {
/*
* Note: When delete markers are used, the array will be
* pre-populated with [false] so we only have to set the flag on
* the tuples that are actually deleted.
*/
tleaf.deleteMarkers[i] = true;
} else {
/*
* Note: If the source has raw records for some values, then
* this will cause those records to be materialized within the
* single massive root leaf. From there, the data will be
* written onto the index segment file.
*/
tleaf.vals.values[i] = tuple.getValue();
}
i++;
}
tleaf.keys.nkeys = i; // note final #of tuples.
tleaf.vals.nvalues = i; // note final #of tuples.
tleaf.maximumVersionTimestamp = maximumVersionTimestamp;
tleaf.minimumVersionTimestamp = minimumVersionTimestamp;
// The exact range count.
final int nentries = i;
// The source iterator (reading on the fully buffered tuples).
final ITupleIterator> itr = new MyTupleIterator(tleaf, flags);
// Setup the index segment build operation.
return IndexSegmentBuilder.newInstance(//
outFile, //
tmpDir, //
nentries, // exact range count
itr, // source iterator
m, // the output branching factor.
indexMetadata,//
createTime,//
compactingMerge,//
bufferNodes//
);
}
/**
* Variant using an array of objects in the desired order. A single root
* leaf is generated from those objects. The root leaf is then fed into the
* algorithm to efficient construct the corresponding read-only
* {@link IndexSegment}.
*
* @param a
* The array of objects to be written onto the index. The index
* must know how to generate tuples from these objects. The
* objects must already be in the natural order of the keys that
* will be generated for those tuples.
* @param alen
* The #of elements in that array.
* @param indexMetadata
* The {@link IndexMetadata} that will serve as the template for
* the generated {@link IndexSegment}.
* @param outFile
* The file on which the {@link IndexSegment} will be written.
* The file MAY exist, but if it exists then it MUST be empty.
* @param tmpDir
* The temporary directory in data are buffered during the build
* (optional - the default temporary directory is used if this is
* null
).
* @param m
* The branching factor for the generated {@link IndexSegment}.
* @param compactingMerge
* When true
the caller asserts that src is a
* {@link FusedView} and deleted index entries WILL NOT be
* included in the generated {@link IndexSegment}. Otherwise, it
* is assumed that the only select component(s) of the index
* partition view are being exported onto an {@link IndexSegment}
* and deleted index entries will therefore be propagated to the
* new {@link IndexSegment} (aka an incremental build).
* @param createTime
* The commit time associated with the view from which the
* {@link IndexSegment} is being generated. This value is written
* into {@link IndexSegmentCheckpoint#commitTime}.
* @param bufferNodes
* When true
the generated nodes will be fully
* buffered in RAM (faster, but imposes a memory constraint).
* Otherwise they will be written onto a temporary file and then
* transferred to the output file en mass.
* @return
* @throws IOException
*
* TODO We could pass a flag indicating whether the leaf needs
* to be sorted after it is generated, but the caller would
* still be responsible for ensuring that there are no
* duplicates in the array.
*/
// * @param fromKey
// * The lowest key that will be included (inclusive). When
// * null
there is no lower bound.
// * @param toKey
// * The first key that will be included (exclusive). When
// * null
there is no upper bound.
@SuppressWarnings("unchecked")
public static IndexSegmentBuilder newInstance(
final Object[] a, final int alen,
final IndexMetadata indexMetadata, final File outFile,
final File tmpDir, final int m, final boolean compactingMerge,
final long createTime,
//final byte[] fromKey, final byte[] toKey,
final boolean bufferNodes)
throws IOException {
if (a == null)
throw new IllegalArgumentException();
if (alen < 0)
throw new IllegalArgumentException();
if (alen > a.length)
throw new IllegalArgumentException();
if (indexMetadata == null)
throw new IllegalArgumentException();
if (outFile == null)
throw new IllegalArgumentException();
if (tmpDir == null)
throw new IllegalArgumentException();
if (createTime <= 0L)
throw new IllegalArgumentException();
final boolean hasVersionTimestamps = indexMetadata
.getVersionTimestamps();
if (hasVersionTimestamps)
throw new IllegalArgumentException(
"versionTimestamps not available in source [].");
final boolean hasDeleteMarkers = indexMetadata.getDeleteMarkers();
if (hasDeleteMarkers && !compactingMerge)
throw new IllegalArgumentException(
"deleteMarkers not available in source [].");
final boolean hasRawRecords = indexMetadata.getRawRecords();
// A temporary leaf used to buffer the data in RAM.
final MutableLeafData tleaf = new MutableLeafData(alen,
hasVersionTimestamps, hasDeleteMarkers, hasRawRecords);
final int flags;
if (compactingMerge) {
/*
* For a compacting merge the delete markers are ignored so they
* will NOT be transferred to the new index segment.
*/
flags = IRangeQuery.DEFAULT;
} else {
/*
* For an incremental build the deleted tuples are propagated to the
* new index segment. This is required in order for the fact that
* those tuples were deleted as of the commitTime to be retained by
* the generated index segment.
*/
flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED;
}
/*
* Iterator reading the source tuples to be copied to the index segment.
*
* Note: The DELETED flag was set above unless this is a compacting
* merge. That is necessary to ensure that deleted tuples are preserved
* when the index segment does not reflect the total history of a view.
*
* The tuples are materialized and buffered in a single, and potentially
* very large, leaf. That is Ok since the MutableLeaf is using very
* simple data structures.
*
* @todo The fastRangeCount is a hint that we want to eagerly
* materialize all of the data. This hint should be turned into
* pre-fetch and into a single IO for the index segment leaves if they
* are not in memory. [In fact, the hint is completely ignored at this
* point. If hints get more weight, then review code for their use.]
*/
// final ITupleIterator> titr = src.rangeIterator(fromKey, toKey,
// (int) fastRangeCount/* capacity */, flags, null/* filter */);
// init per API specification.
long minimumVersionTimestamp = Long.MAX_VALUE;
long maximumVersionTimestamp = Long.MIN_VALUE;
@SuppressWarnings("rawtypes")
final ITupleSerializer tupSer = indexMetadata.getTupleSerializer();
for (int i = 0; i < alen; i++) {
// final ITuple> tuple = titr.next();
tleaf.keys.keys[i] = tupSer.serializeKey(a[i]);
// Note: Version timestamps are not available from a[].
// if (hasVersionTimestamps) {
//
// final long t = tuple.getVersionTimestamp();
//
// tleaf.versionTimestamps[i] = t;
//
// if (t < minimumVersionTimestamp) {
//
// minimumVersionTimestamp = t;
//
// }
//
// if (t > maximumVersionTimestamp) {
//
// maximumVersionTimestamp = t;
//
// }
//
// }
// Note: delete markers are not available from a[].
// if (hasDeleteMarkers && tuple.isDeletedVersion()) {
//
// /*
// * Note: When delete markers are used, the array will be
// * pre-populated with [false] so we only have to set the flag on
// * the tuples that are actually deleted.
// */
// tleaf.deleteMarkers[i] = true;
//
// } else {
/*
* Note: If the source has raw records for some values, then
* this will cause those records to be materialized within the
* single massive root leaf. From there, the data will be
* written onto the index segment file.
*/
tleaf.vals.values[i] = tupSer.serializeVal(a[i]);
// }
// i++;
}
tleaf.keys.nkeys = alen; // note final #of tuples.
tleaf.vals.nvalues = alen; // note final #of tuples.
tleaf.maximumVersionTimestamp = maximumVersionTimestamp;
tleaf.minimumVersionTimestamp = minimumVersionTimestamp;
// The exact range count.
final int nentries = alen;
// The source iterator (reading on the fully buffered tuples).
@SuppressWarnings("rawtypes")
final ITupleIterator> itr = new MyTupleIterator(tleaf, flags);
// Setup the index segment build operation.
return IndexSegmentBuilder.newInstance(//
outFile, //
tmpDir, //
nentries, // exact range count
itr, // source iterator
m, // the output branching factor.
indexMetadata,//
createTime,//
compactingMerge,//
bufferNodes//
);
}
/**
*
* A more flexible factory for an {@link IndexSegment} build which permits
* override of the index segment branching factor, replacement of the
* {@link IndexMetadata}, and the use of the caller's iterator.
*
*
* Note: The caller must determine whether or not deleted index entries are
* present in the view. The entryCount MUST be the exact #of index
* entries that are visited by the given iterator. In general, this is not
* difficult. However, if a compacting merge is desired (that is, if you are
* trying to generate a view containing only the non-deleted entries) then
* you MUST explicitly count the #of entries that will be visited by the
* iterator, e.g., it will require two passes over the iterator to setup the
* index build operation.
*
*
* Note: With a branching factor of 4096 a tree of height 2 (three levels)
* could address 68,719,476,736 entries - well beyond what we want in a
* given index segment! Well before that the index segment should be split
* into multiple files. The split point should be determined by the size of
* the serialized leaves and nodes, e.g., the amount of data on disk
* required by the index segment and the amount of memory required to fully
* buffer the index nodes. While the size of a serialized node can be
* estimated easily, the size of a serialized leaf depends on the kinds of
* values stored in that index. The actual sizes are recorded in the
* {@link IndexSegmentCheckpoint} record in the header of the
* {@link IndexSegment}.
*
*
* @param outFile
* The file on which the index segment is written. The file MAY
* exist but MUST have zero length if it does exist (this permits
* you to use the temporary file facility to create the output
* file).
* @param tmpDir
* The temporary directory in data are buffered during the build
* (optional - the default temporary directory is used if this is
* null
).
* @param entryCount
* The #of entries that will be visited by the iterator. This
* MUST be an exact range count.
* @param entryIterator
* Visits the index entries in key order that will be written
* onto the {@link IndexSegment}.
* @param m
* The branching factor for the generated tree. This can be
* chosen with an eye to minimizing the height of the generated
* tree. (Small branching factors are permitted for testing, but
* generally you want something relatively large.)
* @param metadata
* The metadata record for the source index. A copy will be made
* of this object. The branching factor in the generated tree
* will be overridden to m.
* @param commitTime
* The commit time associated with the view from which the
* {@link IndexSegment} is being generated. This value is written
* into {@link IndexSegmentCheckpoint#commitTime}.
* @param compactingMerge
* true
iff the generated {@link IndexSegment} will
* incorporate all state for the source index (partition) as of
* the specified commitTime. This flag is written into the
* {@link IndexSegmentCheckpoint} but does not otherwise effect
* the build process.
* @param bufferNodes
* When true
the generated nodes will be fully
* buffered in RAM (faster, but imposes a memory constraint).
* Otherwise they will be written onto a temporary file and then
* transferred to the output file en mass.
*
* @throws IOException
*/
public static IndexSegmentBuilder newInstance(//
final File outFile,//
final File tmpDir,//
final long entryCount,//
final ITupleIterator> entryIterator, //
final int m,//
final IndexMetadata metadata,//
final long commitTime,//
final boolean compactingMerge,//
final boolean bufferNodes//
)
throws IOException {
return new IndexSegmentBuilder(outFile, tmpDir, entryCount,
entryIterator, m, metadata, commitTime, compactingMerge,
bufferNodes);
}
/**
 * <p>
 * Designated constructor sets up a build of an {@link IndexSegment} for
 * some caller defined read-only view.
 * </p>
 * <p>
 * Note: The caller must determine whether or not deleted index entries are
 * present in the view. The <i>entryCount</i> MUST be the exact #of index
 * entries that are visited by the given iterator. In general, this is not
 * difficult. However, if a compacting merge is desired (that is, if you are
 * trying to generate a view containing only the non-deleted entries) then
 * you MUST explicitly count the #of entries that will be visited by the
 * iterator, e.g., it will require two passes over the iterator to setup the
 * index build operation.
 * </p>
 * <p>
 * Note: With a branching factor of 4096 a tree of height 2 (three levels)
 * could address 68,719,476,736 entries - well beyond what we want in a
 * given index segment! Well before that the index segment should be split
 * into multiple files. The split point should be determined by the size of
 * the serialized leaves and nodes, e.g., the amount of data on disk
 * required by the index segment and the amount of memory required to fully
 * buffer the index nodes. While the size of a serialized node can be
 * estimated easily, the size of a serialized leaf depends on the kinds of
 * values stored in that index. The actual sizes are recorded in the
 * {@link IndexSegmentCheckpoint} record in the header of the
 * {@link IndexSegment}.
 * </p>
 *
 * @param outFile
 *            The file on which the index segment is written. The file MAY
 *            exist but MUST have zero length if it does exist (this permits
 *            you to use the temporary file facility to create the output
 *            file).
 * @param tmpDir
 *            The temporary directory in which data are buffered during the
 *            build. This argument is required: a <code>null</code>
 *            reference is rejected.
 * @param entryCount
 *            The #of entries that will be visited by the iterator. This
 *            MUST be an exact range count. Zero is permitted (an info
 *            message is logged).
 * @param entryIterator
 *            Visits the index entries in key order that will be written
 *            onto the {@link IndexSegment}.
 * @param m
 *            The branching factor for the generated tree. This can be
 *            chosen with an eye to minimizing the height of the generated
 *            tree. (Small branching factors are permitted for testing, but
 *            generally you want something relatively large.)
 * @param metadata
 *            The metadata record for the source index. A copy will be made
 *            of this object. The branching factor in the generated tree
 *            will be overridden to <i>m</i>.
 * @param commitTime
 *            The commit time associated with the view from which the
 *            {@link IndexSegment} is being generated (must be positive).
 *            This value is written into
 *            {@link IndexSegmentCheckpoint#commitTime}.
 * @param compactingMerge
 *            <code>true</code> iff the generated {@link IndexSegment} will
 *            incorporate all state for the source index (partition) as of
 *            the specified <i>commitTime</i>. This flag is written into the
 *            {@link IndexSegmentCheckpoint} but does not otherwise affect
 *            the build process.
 * @param bufferNodes
 *            When <code>true</code> the generated nodes will be fully
 *            buffered in RAM (faster, but imposes a memory constraint).
 *            Otherwise they will be written onto a temporary file and then
 *            transferred to the output file en masse.
 *
 * @throws IllegalArgumentException
 *             if <i>outFile</i>, <i>tmpDir</i>, <i>entryIterator</i> or
 *             <i>metadata</i> is <code>null</code>, if <i>entryCount</i>
 *             is negative, or if <i>commitTime</i> is not positive.
 * @throws IOException
 */
protected IndexSegmentBuilder(//
        final File outFile,//
        final File tmpDir,//
        final long entryCount,//
        final ITupleIterator<?> entryIterator, //
        final int m,//
        IndexMetadata metadata,//
        final long commitTime,//
        final boolean compactingMerge,//
        final boolean bufferNodes//
        )
        throws IOException {

    if (outFile == null)
        throw new IllegalArgumentException("outFile is null");

    if (tmpDir == null)
        throw new IllegalArgumentException("tmpDir is null");

    if (entryCount < 0)
        throw new IllegalArgumentException("entryCount is negative: "
                + entryCount);

    /*
     * Note: A zero entry count is allowed. This can arise when all tuples
     * in an index (partition) have been deleted. It is impossible to
     * detect this condition before we explicitly range count the tuples
     * (including any delete markers). Rather than forcing the caller to
     * handle this via a thrown exception it is significantly easier to
     * generate an empty IndexSegment.
     */
    if (entryCount == 0 && log.isInfoEnabled())
        log.info(ERR_NO_TUPLES);

    if (entryIterator == null)
        throw new IllegalArgumentException("entryIterator is null");

    /*
     * Note: The metadata is dereferenced below, so reject a null reference
     * here in the same manner as the other required arguments rather than
     * failing later with a NullPointerException.
     */
    if (metadata == null)
        throw new IllegalArgumentException("metadata is null");

    if (commitTime <= 0L)
        throw new IllegalArgumentException("commitTime must be positive: "
                + commitTime);

    final long begin_setup = System.currentTimeMillis();

    // @todo New files SHOULD use record level checksums.
    // this.useChecksums = false;

    // the UUID assigned to this index segment file.
    this.segmentUUID = UUID.randomUUID();

    this.entryCount = entryCount;

    this.entryIterator = entryIterator;

    // the name of the index or the index partition.
    name = (metadata.getPartitionMetadata() == null)//
            // local index name (if any).
            ? metadata.getName() == null ? "N/A" : metadata.getName()
            // index partition name
            : metadata.getName() + "#"
                    + metadata.getPartitionMetadata().getPartitionId();

    /*
     * Make a copy of the caller's metadata.
     *
     * Note: The caller's reference is replaced by a reference to the clone
     * in order to avoid accidental modifications to the caller's metadata
     * object.
     */
    this.metadata = metadata = metadata.clone();

    {

        final LocalPartitionMetadata pmd = this.metadata.getPartitionMetadata();

        if (pmd != null) {

            /*
             * Copy the local partition metadata, but do not include the
             * resource metadata identifying the resources that comprise the
             * index partition view. That information is only stored on the
             * BTree, not on the IndexSegment.
             */
            this.metadata.setPartitionMetadata(
                    new LocalPartitionMetadata(//
                            pmd.getPartitionId(),//
                            pmd.getSourcePartitionId(),//
                            pmd.getLeftSeparatorKey(),//
                            pmd.getRightSeparatorKey(),//
                            null, // No resource metadata for indexSegment.
                            pmd.getIndexPartitionCause()
                            )
                    );

        }

    }

    // true iff the source index is isolatable.
    this.isolatable = metadata.isIsolatable();

    /*
     * true iff the source index maintains tuple revision timestamps.
     */
    this.versionTimestamps = metadata.getVersionTimestamps();

    /*
     * true iff the source index supports delete markers (but they will be
     * copied IFF this is an incremental build).
     */
    this.deleteMarkers = metadata.getDeleteMarkers();

    /*
     * true iff the source index supports raw records. raw records will be
     * copied into the BLOBS region of the index segment and the address of
     * the raw record in the output tuple will be updated to reflect the
     * relative address of the record within the index segment.
     */
    this.rawRecords = metadata.getRawRecords();

    /*
     * Buffer used to encode addresses into the tuple value for a mutable
     * B+Tree (only allocated when raw records are in use).
     */
    this.recordAddrBuf = rawRecords ? new ByteArrayBuffer(Bytes.SIZEOF_LONG)
            : null;

    this.commitTime = commitTime;

    this.compactingMerge = compactingMerge;

    this.bufferNodes = bufferNodes;

    /*
     * Override the branching factor on the index segment.
     *
     * Note: this override is a bit dangerous since it might propagate back
     * to the mutable btree, which could hurt performance through the use of
     * a too large branching factor on the journal. However, the metadata
     * index stores the template metadata for the scale-out index and if you
     * use either that or the metadata record from an existing BTree then
     * this should never be a problem.
     */
    this.metadata.setBranchingFactor(m);

    /*
     * @todo The override of the BTree class name does not make much sense
     * here. Either we should strongly discourage further subclassing of
     * BTree and IndexSegment or we should allow the subclass to be named
     * for both the mutable btree and the read-only index segment.
     */
    this.metadata.setBTreeClassName(IndexSegment.class.getName());

    this.addressManager = new WormAddressManager(offsetBits);

    /*
     * The INodeData cache for the generated index segment store.
     *
     * TODO BLZG-1501 (remove LRUNexus). The cache is no longer populated
     * by the build, so this is always null. (Historically the cache was
     * obtained from the LRUNexus for the segmentUUID and addressManager
     * when the build was configured to populate the cache.)
     */
    storeCache = null;

    /*
     * Create the index plan and do misc setup.
     */
    {

        // Create a plan for generating the output tree.
        plan = new IndexSegmentPlan(m, entryCount);

        /*
         * Setup a stack of nodes (one per non-leaf level) and one leaf.
         * These are filled in based on the plan and the entries visited in
         * the source btree. Nodes and leaves are written out to their
         * respective channel each time they are complete as defined by the
         * plan given the #of children assigned to a node or the #of keys
         * assigned to a leaf.
         */
        stack = new AbstractSimpleNodeData[plan.height + 1];

        // Note: assumes defaults to all zeros.
        writtenInLevel = new int[plan.height + 1];

        for (int h = 0; h < plan.height; h++) {

            final SimpleNodeData node = new SimpleNodeData(h, plan.m,
                    versionTimestamps);

            node.max = plan.numInNode[h][0];

            stack[h] = node;

        }

        // the output leaf (reused for each leaf we populate).
        leaf = new SimpleLeafData(plan.height, plan.m, metadata);

        // An empty index segment has a single root leaf without any keys.
        leaf.max = entryCount == 0 ? 0 : plan.numInNode[plan.height][0];

        stack[plan.height] = leaf;

        /*
         * Setup optional bloom filter.
         *
         * Note: For read-only {@link IndexSegment} we always know the #of
         * keys exactly at the time that we provision the bloom filter. This
         * makes it easy for us to tune the filter for a desired false
         * positive rate.
         *
         * Note: The bloom filter can not be used with very large indices
         * due to the space requirements of the filter. However, very large
         * in this case is MAX_INT tuples!
         */
        if (metadata.getBloomFilterFactory() != null && plan.nentries > 0
                && plan.nentries < Integer.MAX_VALUE) {

            // the desired error rate for the bloom filter.
            final double p = metadata.getBloomFilterFactory().p;

            // create the bloom filter.
            bloomFilter = new BloomFilter((int) plan.nentries, p);

        } else {

            bloomFilter = null;

        }

        /*
         * Used to serialize the nodes and leaves for the output tree.
         */
        nodeSer = new NodeSerializer(//
                /*
                 * Note: it does not seem like there should be any
                 * interaction between various IAddressSerializer strategies
                 * and the manner in which we encode the region (BASE, NODE,
                 * or BLOB) into the offset of addresses for the index
                 * segment store. The offset is effectively left-shifted by
                 * two bits to encode the region, thereby reducing the
                 * maximum possible byte offset within any region (including
                 * BASE). However, that should not pose problems for any
                 * IAddressSerializer strategy as long as it accepts any
                 * legal [byteCount] and [offset] - it is just that our
                 * offsets are essentially 4x larger than they would be
                 * otherwise.
                 */
                addressManager,//
                NOPNodeFactory.INSTANCE,//
                plan.m,// the output branching factor.
                0, // initialBufferCapacity - will be estimated.
                metadata, //
                false, // NOT read-only (we are using it for writing).
                metadata.getIndexSegmentRecordCompressorFactory()
                );

    }

    this.overflowHandler = metadata.getOverflowHandler();

    this.outFile = outFile;

    elapsed_setup = System.currentTimeMillis() - begin_setup;

    if (log.isInfoEnabled()) {

        log.info("name=" + name + ", nentries=" + entryCount
                + ", compactingMerge=" + compactingMerge);

    }

}
/**
 * Build the {@link IndexSegment} given the parameters specified to the
 * constructor.
 * <p>
 * The output file is opened, the leaf write cache and the optional node
 * and blob buffers are set up, the output B+Tree is generated from the
 * source iterator, and everything is then written onto the output file,
 * which is forced to disk and closed. The elapsed times and the data rate
 * for the build are recorded on this object. If the build fails then the
 * output file is deleted.
 *
 * @return The {@link IndexSegmentCheckpoint} obtained when the index
 *         segment was written onto the output file.
 *
 * @throws IllegalArgumentException
 *             if the output file exists and is not empty (the file is NOT
 *             deleted in this case).
 * @throws Exception
 *             if the build fails. An {@link Exception} is rethrown as
 *             given. Any other {@link Throwable} is wrapped as a
 *             {@link RuntimeException}.
 */
public IndexSegmentCheckpoint call() throws Exception {

    /*
     * Setup for IO.
     */

    begin_build = System.currentTimeMillis();

    if (outFile.exists() && outFile.length() != 0L) {
        throw new IllegalArgumentException("File exists and is not empty: "
                + outFile.getAbsoluteFile());
    }

    final FileChannel outChannel;

    try {

        /*
         * Open the output channel
         *
         * @todo get an exclusive lock (FileLock).
         */
        out = new RandomAccessFile(outFile, mode);

        outChannel = out.getChannel();

        /*
         * Open the leaf write cache. We only do this if there is at least
         * a single root leaf, i.e., if the output tree is not empty.
         */
        leafWriteCache = plan.nleaves == 0 ? null
                : new WriteCache.FileChannelWriteCache(
                        IndexSegmentCheckpoint.SIZE, null/* buf */,
                        useChecksums, false/* isHighlyAvailable */,
                        false/* bufferHasData */, new NOPReopener(out), 0L/* fileExtent */);

        /*
         * Open the node buffer. We only do this if there will be at least
         * one node written, i.e., the output tree will consist of more than
         * just a root leaf.
         */
        if (plan.nnodes == 0) {
            // No nodes, so no buffering.
            nodeBuffer = null;
            nodeList = null;
        } else if (bufferNodes) {
            // Buffer the nodes in memory.
            nodeBuffer = null;
            nodeList = new LinkedList();
        } else {
            // Buffer the nodes on a temporary file.
            nodeBuffer = new TemporaryRawStore(offsetBits);
            nodeList = null;
        }

        /*
         * Open buffer for blobs if an overflow handler was specified -or-
         * if the index is using raw records.
         */
        blobBuffer = (rawRecords || overflowHandler != null) //
                ? new TemporaryRawStore(offsetBits)
                : null;

        /*
         * Generate the output B+Tree.
         */
        buildBTree();

        // Verify that all leaves were written out.
        assert plan.nleaves == nleavesWritten;

        // Verify that all nodes were written out.
        assert plan.nnodes == nnodesWritten;

        elapsed_build = System.currentTimeMillis() - begin_build;

        final long begin_write = System.currentTimeMillis();

        // write everything out on the outFile.
        checkpoint = writeIndexSegment(outChannel, commitTime);

        /*
         * Flush this channel to disk and close the channel. We are done
         * and the index segment is ready for use.
         */
        outChannel.force(true);
        out.close();

        elapsed_write = System.currentTimeMillis() - begin_write;

        /*
         * log run time.
         */

        elapsed = (System.currentTimeMillis() - begin_build) + elapsed_setup;

        /*
         * data rate in MB/sec.
         *
         * Note: The length is converted to megabytes using floating point
         * division. Integer division would truncate the size to whole
         * megabytes and report a rate of zero for any index segment
         * smaller than one megabyte.
         */
        mbPerSec = (elapsed == 0 ? 0 : (checkpoint.length / (float) Bytes.megabyte32)
                / (elapsed / 1000f));

        if (log.isInfoEnabled()) {

            final NumberFormat fpf = NumberFormat.getNumberInstance();

            fpf.setGroupingUsed(false);

            fpf.setMaximumFractionDigits(2);

            log.info("finished"
                    + ": total(ms)="+ elapsed//
                    + "= setup("+ elapsed_setup +")"//
                    + "+ build("+ elapsed_build + ")"//
                    + "+ write("+ elapsed_write +")"//
                    + "; branchingFactor=" + plan.m//
                    + ", nentries=("+ ntuplesWritten+ " actual, "+plan.nentries+ " plan)"//
                    + ", nnodes=("+ nnodesWritten+" actual, "+plan.nnodes+" plan)"//
                    + ", nleaves=("+ nleavesWritten+" actual, "+plan.nleaves+" plan)"//
                    + ", length="+ fpf.format(((double) checkpoint.length / Bytes.megabyte32))+ "MB" //
                    + ", rate=" + fpf.format(mbPerSec) + "MB/sec"//
                    );

        }

        return checkpoint;

    } catch (Exception ex) {

        /*
         * Note: The output file is deleted if the build fails.
         */
        deleteOutputFile();

        // Re-throw exception
        throw ex;

    } catch (Throwable ex) {

        /*
         * Note: The output file is deleted if the build fails.
         */
        deleteOutputFile();

        // Masquerade exception.
        throw new RuntimeException(ex);

    } finally {

        /*
         * make sure that the leaf write cache is closed regardless.
         */
        if (leafWriteCache != null) {
            try {
                leafWriteCache.close();
            } catch (Throwable t) {
                log.warn(t,t);
            }
        }

        /*
         * make sure that the temporary file gets deleted regardless.
         */
        if (nodeBuffer != null && nodeBuffer.isOpen()) {
            try {
                nodeBuffer.close(); // also deletes the file if any.
            } catch (Throwable t) {
                log.warn(t,t);
            }
        }

        /*
         * make sure that the temporary store for the blobs is released
         * regardless. It is opened above just like the node buffer, so it
         * is given the same treatment here. The isOpen() test makes this a
         * NOP if the store was already closed once its data were written.
         */
        if (blobBuffer != null && blobBuffer.isOpen()) {
            try {
                blobBuffer.close();
            } catch (Throwable t) {
                log.warn(t,t);
            }
        }

    }

}
/**
 * Drains the source tuple iterator (which visits the tuples in key order)
 * into output leaves sized according to the plan, flushing each leaf as
 * soon as it has been populated.
 * <p>
 * The plan gives the #of tuples to place into each leaf. For each leaf in
 * the plan, the leaf is reset to the planned size, filled from the source
 * iterator, and then handed to {@link #flushNodeOrLeaf}. For every leaf
 * after the first, a separator key is registered (via
 * {@link #addSeparatorKey}) once the first tuple of that leaf has been
 * copied.
 * <p>
 * Note: When the plan has no entries, a single empty root leaf is flushed
 * and nothing is read from the source iterator.
 * <p>
 * Note: The root may be a leaf as a degenerate case.
 *
 * @todo Verify correct rejection if the source iterator visits too many or
 *       too few tuples. As written, the build simply stops when either the
 *       planned leaves or the source tuples run out.
 */
protected void buildBTree() {

    if (plan.nentries == 0) {

        // Degenerate case: emit a single empty root leaf.
        leaf.reset(plan.numInNode[leaf.level][0]);

        flushNodeOrLeaf(leaf);

        return;

    }

    int leafIndex = 0;

    // Visit the planned leaves for as long as source tuples remain.
    while (leafIndex < plan.nleaves && entryIterator.hasNext()) {

        /*
         * Prepare the (reused) leaf for the #of tuples which the plan
         * assigns to this leaf.
         *
         * Note: If the source runs dry early then the leaf may retain
         * stale key/val data beyond the tuples actually copied. That is
         * harmless as long as the #of keys in the leaf is accurate since
         * only that many entries are serialized.
         */
        leaf.reset(plan.numInNode[leaf.level][leafIndex]);

        // #of tuples that the plan allows into this leaf.
        final int capacity = leaf.max;

        // true for every leaf other than the first one.
        final boolean needsSeparatorKey = leafIndex > 0;

        int slot = 0;

        while (slot < capacity && entryIterator.hasNext()) {

            // Copy the next source tuple into the next slot of the leaf.
            copyTuple(slot, entryIterator.next());

            if (needsSeparatorKey && slot == 0) {

                /*
                 * The first key of each leaf (other than the first leaf)
                 * is recorded as a separatorKey in the appropriate
                 * parent.
                 *
                 * Note: When the parent of the previous leaf is full this
                 * ascends through that parent, since its slot in the stack
                 * has not been reset yet, and the separatorKey is lifted
                 * until an open slot is found. While confusing, the
                 * separatorKey always winds up in the correct node.
                 */
                addSeparatorKey(leaf);

            }

            slot++;

        }

        /*
         * Close out the leaf. This records the address of the leaf on its
         * parent (if any), and closes the parent as well if the parent
         * became full.
         */
        flushNodeOrLeaf(leaf);

        leafIndex++;

    }

}
/**
* Copy a tuple into the current leaf at the given index.
*
* @param j
* The index in the leaf to which the tuple will be copied.
* @param tuple
* The tuple.
*/
private void copyTuple(final int j, final ITuple> tuple) {
if (ntuplesWritten == 0) {
// Verify iterator is reporting necessary data.
assertIteratorOk(tuple);
}
ntuplesWritten++;
final MutableKeyBuffer keys = leaf.keys;
assert keys.nkeys == j;
keys.keys[j] = tuple.getKey();
if (deleteMarkers)
leaf.deleteMarkers[j] = tuple.isDeletedVersion();
if (versionTimestamps) {
final long t = tuple.getVersionTimestamp();
leaf.versionTimestamps[j] = t;
if (t < leaf.minimumVersionTimestamp)
leaf.minimumVersionTimestamp = t;
if (t > leaf.maximumVersionTimestamp)
leaf.maximumVersionTimestamp = t;
}
final byte[] val;
if(deleteMarkers && tuple.isDeletedVersion()) {
val = null;
} else {
if (overflowHandler != null) {
/*
* Provide the handler with the opportunity to copy
* the blob's data onto the buffer and re-write the
* value, which is presumably the blob reference.
*/
val = overflowHandler.handle(tuple, blobBuffer);
} else {
/*
* Note: If the source index uses raw records then this will
* return the materialized value from the raw record.
*/
val = tuple.getValue();
}
}
if (rawRecords) {
final long maxRecLen = metadata.getMaxRecLen();
if (val != null && val.length > maxRecLen) {
// write the value on the backing store.
final long addr1 = blobBuffer.write(ByteBuffer.wrap(val));
// decode the offset and byte length of the record.
final int nbytes = blobBuffer.getByteCount(addr1);
final long offset = blobBuffer.getOffset(addr1);
// recode as a relative address against the BLOBs region.
final long addr = addressManager.toAddr(nbytes,
IndexSegmentRegion.BLOB.encodeOffset(offset));
// save its address in the values raba.
leaf.vals.values[j] = AbstractBTree.encodeRecordAddr(
recordAddrBuf, addr);
// flag as a raw record.
leaf.rawRecords[j] = true;
} else {
leaf.vals.values[j] = val;
leaf.rawRecords[j] = false;
}
} else {
leaf.vals.values[j] = val;
}
if (bloomFilter != null) {
/*
* Note: We record the keys for deleted tuples in the
* bloom filter. This is important since we need a
* search of an ordered set of AbstractBTree sources for
* a FusedView to halt as soon as it finds a delete
* marker for a key. If we do not add the key for
* deleted tuples to the bloom filter then the bloom
* filter will report (incorrectly) that the key is not
* in this IndexSegment. It is - with a delete marker.
*/
bloomFilter.add(keys.keys[j]);
}
keys.nkeys++;
leaf.vals.nvalues++;
}
/**
* This is invoked for the first tuple visited to make sure that the
* iterator is reporting the data we need.
*/
private void assertIteratorOk(final ITuple> tuple) {
if (!tuple.getKeysRequested())
throw new RuntimeException("keys not reported by itr.");
if (!tuple.getValuesRequested())
throw new RuntimeException("vals not reported by itr.");
if (!compactingMerge && deleteMarkers
&& ((tuple.flags() & IRangeQuery.DELETED) == 0)) {
/*
* This is an incremental build and the source index supports delete
* markers but the iterator is not visiting deleted tuples.
*/
throw new RuntimeException("delete markers not reported by itr.");
}
/*
* @todo I am not sure about this test. iterators should always report
* the revision timestamp metadata. The real question is whether or not
* they are reporting deleted tuples and that is tested above. [The
* other question is whether we always need to report deleted tuples for
* an isolatable index and that is what I am not sure about.]
*/
assert !isolatable
|| (isolatable && ((tuple.flags() & IRangeQuery.DELETED) == 0))
: "version metadata not reported by itr for isolatable index";
}
/**
 * Best-effort cleanup used when the build did not complete successfully:
 * closes the output file if it is still open, deletes it, and clears the
 * store cache (if any).
 */
private void deleteOutputFile() {
    final boolean stillOpen = out != null && out.getChannel().isOpen();
    if (stillOpen) {
        try {
            out.close();
        } catch (Throwable t) {
            // Closing is best effort; we still want to delete the file.
            log.error("Ignoring: " + t, t);
        }
    }
    final boolean deleted = outFile.delete();
    if (!deleted) {
        log.warn("Could not delete: file=" + outFile.getAbsolutePath());
    }
    if (storeCache == null) {
        return;
    }
    /*
     * The index segment store was not generated successfully, so the cached
     * records will never be read.
     */
    storeCache.clear();
}
/**
 * Flush a node or leaf that has been closed (no more data will be added).
 * <p>
 * The node or leaf is written out to obtain its address. That address is
 * then recorded on its direct parent (if any) using
 * {@link #addChild(SimpleNodeData, long, AbstractSimpleNodeData)}, which
 * also updates the per-child counters of the #of entries spanned by the
 * parent and closes the parent in turn once it becomes full. Finally the
 * #of nodes or leaves written at this level is incremented.
 *
 * @param node
 *            The node or leaf to be flushed.
 */
protected void flushNodeOrLeaf(final AbstractSimpleNodeData node) {
    final int level = node.level;
    // The index into the level for this node or leaf.
    final int indexInLevel = writtenInLevel[level];
    assert indexInLevel < plan.numInLevel[level];
    if (log.isDebugEnabled()) {
        final String what = node.isLeaf() ? "leaf" : "node";
        log.debug("closing " + what + "; h=" + level + ", col="
                + indexInLevel + ", max=" + node.max + ", nkeys="
                + node.keys.size());
    }
    /*
     * Write the record first since the parent needs the resulting address.
     *
     * Note: The node is not reset in the stack here, so it remains
     * available to getParent() below.
     */
    final long writtenAddr = writeNodeOrLeaf(node);
    // Lookup the parent of this leaf/node in the stack (null for the root).
    final SimpleNodeData parentNode = getParent(node);
    if (parentNode != null) {
        // May recursively flush the parent if this child fills it.
        addChild(parentNode, writtenAddr, node);
    }
    writtenInLevel[level]++;
}
/**
 * Record the persistent address of a child on its parent together with the
 * #of entries spanned by that child. If the parent was already full when
 * this method is entered, it is first reset so that it can be reused for
 * the next node at the same level. If the parent becomes full once the
 * child has been added, the parent is flushed.
 *
 * @param parent
 *            The parent.
 * @param childAddr
 *            The address of the child (node or leaf).
 * @param child
 *            The child reference.
 */
protected void addChild(final SimpleNodeData parent, final long childAddr,
        final AbstractSimpleNodeData child) {
    // #of entries spanned by the child.
    final long spanned;
    if (child.isLeaf()) {
        spanned = child.getKeyCount();
    } else {
        spanned = ((INodeData) child).getSpannedTupleCount();
    }
    if (parent.nchildren == parent.max) {
        /*
         * The parent in the stack is full (and was already closed), so
         * prepare it to receive the children of the next node at this
         * level.
         */
        resetNode(parent);
    }
    if (log.isDebugEnabled()) {
        final String what = child.isLeaf() ? "leaf" : "node";
        log.debug("setting " + what + " as child(" + parent.nchildren + ")"
                + " at h=" + parent.level + ", col="
                + writtenInLevel[parent.level] + ", addr="
                + addressManager.toString(childAddr));
    }
    // The next free child slot on the parent.
    final int slot = parent.nchildren;
    parent.childAddr[slot] = childAddr;
    parent.childEntryCount[slot] = spanned;
    parent.nentries += spanned;
    if (versionTimestamps) {
        // Widen the parent's [min,max] range to cover the child.
        if (child.minimumVersionTimestamp < parent.minimumVersionTimestamp) {
            parent.minimumVersionTimestamp = child.minimumVersionTimestamp;
        }
        if (child.maximumVersionTimestamp > parent.maximumVersionTimestamp) {
            parent.maximumVersionTimestamp = child.maximumVersionTimestamp;
        }
    }
    parent.nchildren++;
    if (parent.nchildren == parent.max) {
        // The parent is now full, so close it as well.
        flushNodeOrLeaf(parent);
    }
}
/**
 * The {@link #stack} contains nodes which are reused for each node or leaf
 * at a given level in the generated B+Tree. This method prepares a full
 * (already closed) node in the stack for reuse as the next node at the
 * same level.
 *
 * @param parent
 *            The node in the stack to be reset.
 *
 * @throws AssertionError
 *             if the plan does not allow another node at that level.
 */
protected void resetNode(final SimpleNodeData parent) {
    final int level = parent.level;
    /*
     * The index into the level of the node that was just closed. We
     * subtract one since that node was already counted as written.
     */
    final int closedCol = writtenInLevel[level] - 1;
    final int nextCol = closedCol + 1;
    if (!(nextCol < plan.numInLevel[level])) {
        /*
         * The data is driving us to populate more nodes in this level than
         * the plan allows for the output tree. This is either an error in
         * the control logic or an error in the plan.
         */
        throw new AssertionError();
    }
    // Reuse the node in the stack, sized per the plan for the next node.
    parent.reset(plan.numInNode[level][nextCol]/* max */);
}
/**
 * Copies the first key of a new leaf as a separatorKey for the appropriate
 * parent (if any) of that leaf. This must be invoked when the first key is
 * set on that leaf. However, it must not be invoked on the first leaf.
 * <p>
 * Nothing is done when the leaf has no parent (it is the root leaf).
 *
 * @param leaf
 *            The current leaf. The first key on that leaf must be defined.
 *
 * @todo Use the shortest separator key (this provides space savings on the
 *       nodes, but prefix compression of the keys has much the same
 *       effect).
 */
protected void addSeparatorKey(final SimpleLeafData leaf) {
    final SimpleNodeData parentNode = getParent(leaf);
    if (parentNode != null) {
        final byte[] firstKey = leaf.keys.get(0);
        if (firstKey == null) {
            // The caller must have set the first key on the leaf.
            throw new AssertionError();
        }
        addSeparatorKey(parentNode, firstKey);
    }
}
/**
 * Copies the separatorKey into the appropriate parent. Starting with the
 * given node, this ascends the {@link #stack} until it finds the first node
 * which still has room for another key and appends the separatorKey to
 * that node.
 *
 * @param parent
 *            A node which is a parent of the current leaf or an ancestor of
 *            the node which is the parent of the current leaf (non-null).
 * @param separatorKey
 *            The separator key to be assigned (non-null).
 *
 * @throws AssertionError
 *             if either argument is <code>null</code> or if no ancestor has
 *             room for the separator key.
 */
private void addSeparatorKey(final SimpleNodeData parent,
        final byte[] separatorKey) {
    if (parent == null)
        throw new AssertionError();
    if (separatorKey == null)
        throw new AssertionError();
    SimpleNodeData target = parent;
    /*
     * The maximum #of keys for a node is one less than the maximum #of
     * children for that node. Skip over each node that is already at that
     * limit.
     */
    while (target.keys.nkeys >= target.max - 1) {
        target = getParent(target);
        if (target == null)
            throw new AssertionError();
    }
    if (log.isDebugEnabled())
        log.debug("h=" + target.level + ", col="
                + writtenInLevel[target.level] + ", separatorKey="
                + BytesUtil.toString(separatorKey));
    // Append the key at the next free position, bumping the key count.
    final MutableKeyBuffer targetKeys = target.keys;
    targetKeys.keys[targetKeys.nkeys++] = separatorKey;
}
/**
 * Return the parent of a node or leaf in the {@link #stack}.
 *
 * @param node
 *            The node or leaf.
 *
 * @return The entry in the stack one level above the given node or leaf, or
 *         <code>null</code> iff <i>node</i> is at level zero (the root
 *         node or leaf).
 */
protected SimpleNodeData getParent(final AbstractSimpleNodeData node) {
    return node.level == 0 ? null : (SimpleNodeData) stack[node.level - 1];
}
/**
 * Write the node or leaf onto the appropriate output channel by delegating
 * to {@link #writeLeaf(SimpleLeafData)} or
 * {@link #writeNode(SimpleNodeData)}.
 *
 * @param node
 *            The node or leaf.
 *
 * @return The address that may be used to read the node or leaf from the
 *         file. Note that the address of a node is relative to the start of
 *         the node channel and therefore must be adjusted before reading
 *         the node from the final index segment file.
 */
protected long writeNodeOrLeaf(final AbstractSimpleNodeData node) {
    if (node.isLeaf()) {
        return writeLeaf((SimpleLeafData) node);
    }
    return writeNode((SimpleNodeData) node);
}
/**
 * Code the leaf, obtaining its address, update the prior/next addr of the
 * previous leaf, and write that previous leaf onto the output file.
 * <p>
 * Note: For leaf addresses we know the absolute offset into the
 * {@link IndexSegmentStore} where the leaf will wind up so we encode the
 * address of the leaf using the {@link IndexSegmentRegion#BASE} region.
 * <p>
 * Note: In order to write out the leaves using a double-linked list with
 * prior-/next-leaf addresses we have to use a "write behind" strategy.
 * Instead of writing out the leaf as soon as it is coded, we save its
 * un-encoded address ({@link #bufLastLeafAddr}) and its coded data record
 * ({@link #lastLeafData}) on private member fields. When we code the next
 * leaf (or if we learn that we have no more leaves to code because
 * {@link IndexSegmentPlan#nleaves} EQ {@link #nleavesWritten}) then we
 * patch the coded representation of the prior leaf and write it on the
 * store at the previously obtained address, thereby linking the leaves
 * together in both directions.
 *
 * @param leaf
 *            The leaf to be coded.
 *
 * @return The address that may be used to read the leaf from the file
 *         backing the {@link IndexSegmentStore}.
 */
protected long writeLeaf(final SimpleLeafData leaf) {
/*
 * The encoded address of the leaf that we allocated here. The encoded
 * address will be relative to the BASE region.
 */
final long addr;
{
// code the leaf, obtaining a view onto an internal (shared) buffer.
// final ByteBuffer buf = nodeSer.encode(leaf).asByteBuffer();
// code the leaf.
final ILeafData thisLeafData = nodeSer.encodeLive(leaf);
// Obtain address to be assigned to this leaf.
// // Allocate a record for the leaf on the temporary store.
// final long addr1 = leafBuffer.allocate(buf.remaining());
// final long addr1 = leafBuffer.allocate(thisLeafData.data().len());
final long addr1 = allocateLeafAddr(thisLeafData.data().len());
// encode the address assigned to the serialized leaf.
addr = encodeLeafAddr(addr1);
if (log.isDebugEnabled())
log.debug("allocated storage for leaf data record"//
+ ": addr=" + addressManager.toString(addr));
if (nleavesWritten > 0) {
/*
 * Update the previous leaf, but only for the 2nd+ leaf.
 *
 * Note: The else clause below binds to the debug test, so the
 * progress dots are only printed when INFO is enabled and DEBUG is
 * not.
 */
if (log.isDebugEnabled())
log.debug("updating previous leaf"//
+ ": addr="+addressManager.toString(encodeLeafAddr(bufLastLeafAddr))//
+ ", priorAddr="+ addressManager.toString(addrPriorLeaf)//
+ ", nextAddr=" + addressManager.toString(addr)//
// + ", exhausted=" + exhausted
);
else if (log.isInfoEnabled()) {
System.err.print("."); // wrote a leaf.
if (nleavesWritten % 80 == 0) {
// break lines.
System.err.print("\n");
}
}
// view onto the coded record for the prior leaf.
final ByteBuffer bufLastLeaf = lastLeafData.data().asByteBuffer();
/*
 * Patch representation of the previous leaf.
 *
 * Note: This patches the coded record using the ByteBuffer view
 * of that record. However, the change is made to the backing
 * byte[] so the change is visible on the coded record as well.
 */
nodeSer.updateLeaf(bufLastLeaf, addrPriorLeaf, addr/*addrNextLeaf*/);
assert lastLeafData.getPriorAddr() == addrPriorLeaf;
assert lastLeafData.getNextAddr() == addr;
// write the previous leaf onto the store.
// leafBuffer.update(bufLastLeafAddr, 0/*offset*/, bufLastLeaf);
writeLeafForReal(bufLastLeafAddr, bufLastLeaf);
// the encoded address of the leaf that we just wrote out.
addrPriorLeaf = encodeLeafAddr(bufLastLeafAddr);
if (storeCache != null) {
/*
 * Insert the coded, patched record for the prior leaf into
 * cache.
 */
storeCache.putIfAbsent(addrPriorLeaf, lastLeafData);
}
}
// update reference to the leaf we just coded (written behind, later).
lastLeafData = thisLeafData;
// the un-encoded address allocated for the leaf we just coded.
bufLastLeafAddr = addr1;
}
if (nleavesWritten == 0) {
/*
 * Encoded addr of the 1st leaf - update only for the first leaf
 * that we allocate.
 */
addrFirstLeaf = addr;
}
// encoded addr of the last leaf - update for each leaf that we allocate.
addrLastLeaf = addr;
// the #of leaves written so far.
nleavesWritten++;
if (plan.nleaves == nleavesWritten) {//||*/ exhausted) {
/*
 * Update the last leaf.
 *
 * Note: The last leaf is the one for which we allocated storage
 * immediately above. There is no next leaf which would trigger the
 * write behind, so it is patched and written out here.
 *
 * Note: We only invoke flush() if a leaf has data so we should
 * never be in a position of writing out an empty leaf (with the
 * exception of a B+Tree which has no tuples).
 */
assert plan.nentries == 0 || lastLeafData.getKeyCount() > 0 : "Last leaf is empty?";
if (log.isDebugEnabled())
log.debug("updating last leaf"//
+ ": addr="+addressManager.toString(encodeLeafAddr(bufLastLeafAddr))//
+ ", priorAddr="+ addressManager.toString(addrPriorLeaf)//
+ ", nextAddr=0L"//
// + ", exhausted="+exhausted
);
// log.debug("Writing leaf: priorLeaf=" + addrPriorLeaf
// + ", nextLeaf=" + 0L + ", exhausted=" + exhausted);
else if (log.isInfoEnabled())
System.err.print("."); // wrote a leaf.
// View onto the coded record for the last leaf.
final ByteBuffer bufLastLeaf = lastLeafData.data().asByteBuffer();
/*
 * Patch representation of the last leaf. Its nextAddr is 0L since
 * there is no successor.
 *
 * Note: This patches the coded record using the ByteBuffer view
 * of that record. However, the change is made to the backing
 * byte[] so the change is visible on the coded record as well.
 */
nodeSer.updateLeaf(bufLastLeaf, addrPriorLeaf, 0L/*addrNextLeaf*/);
assert lastLeafData.getPriorAddr() == addrPriorLeaf;
assert lastLeafData.getNextAddr() == 0L;
// write the last leaf onto the store.
// leafBuffer.update(bufLastLeafAddr, 0/*offset*/, bufLastLeaf);
writeLeafForReal(bufLastLeafAddr, bufLastLeaf);
if (storeCache != null) {
/*
 * Insert the coded, patched record for the last leaf into
 * cache.
 */
storeCache.putIfAbsent(addrLastLeaf, lastLeafData);
}
}
return addr;
}
/**
 * "Allocates" the address of the next leaf by claiming the next
 * <i>nbytes</i> from the {@link #leafAddrFactory}.
 *
 * @param nbytes
 *            The #of bytes in the coded leaf record.
 *
 * @return The address of the leaf as formed by the {@link #addressManager}
 *         from <i>nbytes</i> and the claimed offset. Per
 *         {@link #leafAddrFactory}, the offset is relative to the start of
 *         the leaf region, not the start of the output file.
 */
private long allocateLeafAddr(final int nbytes) {
    // Claim [offset, offset + nbytes) in a single atomic step.
    final long offset = leafAddrFactory.getAndAdd(nbytes);
    return addressManager.toAddr(nbytes, offset);
}
/**
 * The address factory for the leaves. Note that addresses are relative to
 * the start of the leaf region, not the start of the output file.
 */
private final AtomicLong leafAddrFactory = new AtomicLong(0L);
/**
 * Write a coded leaf record at the offset encoded in its address. The
 * record is written onto the {@link #leafWriteCache}. If it does not fit,
 * the cache is flushed and reset and the write is retried. If the record
 * still does not fit, it is written directly onto the file using the
 * cache's opener.
 *
 * @param addr
 *            The address of the leaf, whose offset component is decoded
 *            using the {@link #addressManager}.
 * @param data
 *            The coded leaf record.
 *
 * @throws RuntimeException
 *             wrapping anything thrown while writing the record. If the
 *             cause is an {@link InterruptedException} then the interrupt
 *             status of the current thread is restored before the
 *             exception is wrapped.
 */
private void writeLeafForReal(final long addr, final ByteBuffer data) {
    final long offset = addressManager.getOffset(addr);
    try {
        final int chk = useChecksums ? checker.checksum(data) : 0;
        // write leaf on the cache.
        if (!leafWriteCache.write(offset, data, chk)) {
            // leaf does not fit in the cache, so evict cache to the file.
            leafWriteCache.flush(false/* force */);
            // reset the cache!
            leafWriteCache.reset();
            // write leaf on the cache.
            if (!leafWriteCache.write(offset, data, chk)) {
                /*
                 * The leaf is larger than the write cache, so we will write
                 * it directly onto the output file.
                 *
                 * @todo This is tested by the larger random builds, but we
                 * really should have an explicit test for this case.
                 */
                FileChannelUtility.writeAll(leafWriteCache.opener, data,
                        offset);
            }
        }
    } catch (Throwable e) {
        if (e instanceof InterruptedException) {
            // Do not lose the interrupt when wrapping the exception.
            Thread.currentThread().interrupt();
        }
        throw new RuntimeException(e);
    }
}
/**
 * "Allocates" a node address when we will buffer the nodes in RAM by
 * claiming the next <i>nbytes</i> from the {@link #nodeAddrFactory}.
 *
 * @param nbytes
 *            The #of bytes in the coded node record.
 *
 * @return The address of the node as formed by the {@link #addressManager}
 *         from <i>nbytes</i> and the claimed offset. Per
 *         {@link #nodeAddrFactory}, the offset is relative to the start of
 *         the node region, not the start of the output file.
 *
 * @throws UnsupportedOperationException
 *             if we are not buffering nodes in RAM.
 */
private long allocateNodeAddr(final int nbytes) {
    if (!bufferNodes)
        throw new UnsupportedOperationException();
    // Claim [offset, offset + nbytes) in a single atomic step.
    final long offset = nodeAddrFactory.getAndAdd(nbytes);
    return addressManager.toAddr(nbytes, offset);
}
/**
 * The address factory for the nodes used when we will buffer the nodes in
 * RAM. Note that addresses are relative to the start of the node region,
 * not the start of the output file.
 */
private final AtomicLong nodeAddrFactory = new AtomicLong(0L);
/**
 * Encode the address of a leaf.
 * <p>
 * Note: This updates {@link #maxNodeOrLeafLength} as a side-effect.
 *
 * @param addr1
 *            The address of a leaf as allocated by
 *            {@link #allocateLeafAddr(int)}
 *
 * @return The encoded address of the leaf relative to the
 *         {@link IndexSegmentRegion#BASE} region where it will appear once
 *         the leaves have been copied onto the output file.
 */
private long encodeLeafAddr(final long addr1) {
    final int recordLength = addressManager.getByteCount(addr1);
    // track the largest node or leaf written.
    if (maxNodeOrLeafLength < recordLength) {
        maxNodeOrLeafLength = recordLength;
    }
    /*
     * Shift the offset by the size of the checkpoint record so that it is
     * correct for the generated file rather than for the buffer into which
     * the leaves are being written.
     */
    final long fileOffset = IndexSegmentCheckpoint.SIZE
            + addressManager.getOffset(addr1);
    return addressManager.toAddr(recordLength,
            IndexSegmentRegion.BASE.encodeOffset(fileOffset));
}
/*
 * Data used to chain the leaves together in a prior/next double-linked
 * list. These fields carry the "write behind" state of
 * writeLeaf(SimpleLeafData) from one invocation to the next.
 */
/**
 * The address of the previous leaf, but encoded for the generated
 * {@link IndexSegmentStore}. This is 0L until a leaf has been written out
 * by {@link #writeLeaf(SimpleLeafData)}, after which it is the encoded
 * address of the leaf most recently written out.
 */
private long addrPriorLeaf = 0L;
/**
 * The address of the last leaf allocated (but not yet written out).
 * <p>
 * Note: This address is NOT encoded for the {@link IndexSegmentStore}.
 * Instead, it is encoded for the output file using the
 * {@link #addressManager} and is relative to the start of leaves region in
 * the output file.
 *
 * @see #writeLeaf(SimpleLeafData)
 */
private long bufLastLeafAddr = 0L;
/**
 * The coded representation of the last leaf. This reference is set by
 * {@link #writeLeaf(SimpleLeafData)}. The coded record is used to write out
 * the previous leaf in key order after it has been patched to reflect the
 * prior and next leaf addresses. The coded {@link ILeafData} record is
 * modified before the previous leaf is written out to reflect the address
 * assigned to the next leaf in key order.
 */
private ILeafData lastLeafData;
/**
 * Code the node and either write it onto the {@link #nodeBuffer} or, when
 * there is no node buffer, allocate an address for it and add it to the
 * {@link #nodeList} so it can be written out later.
 *
 * @param node
 *            The node.
 *
 * @return A <em>relative</em> address that must be correctly decoded before
 *         you can read the compressed node from the file. This value is
 *         also set on {@link SimpleNodeData#addr}.
 *
 * @see SimpleNodeData
 * @see IndexSegmentRegion
 * @see IndexSegmentAddressManager
 */
protected long writeNode(final SimpleNodeData node) {
    // code the node.
    final INodeData coded = nodeSer.encodeLive(node);
    final long tempAddr;
    if (nodeBuffer == null) {
        // allocate address relative to the start of the nodes region.
        tempAddr = allocateNodeAddr(coded.data().len());
        // buffer the node (it will be written out later).
        nodeList.add(new NodeMetadata(tempAddr, coded));
    } else {
        // write the node on the buffer (a temporary store).
        tempAddr = nodeBuffer.write(coded.data().asByteBuffer());
    }
    final long offset = addressManager.getOffset(tempAddr);
    final int nbytes = addressManager.getByteCount(tempAddr);
    // track the largest node or leaf written.
    if (maxNodeOrLeafLength < nbytes) {
        maxNodeOrLeafLength = nbytes;
    }
    // the #of nodes written so far.
    nnodesWritten++;
    if (log.isInfoEnabled()) {
        System.err.print("x"); // wrote a node.
    }
    /*
     * Encode the node address. Since we do not know the offset of the NODE
     * region in advance this address gets encoded as relative to the start
     * of the NODE region in the file.
     */
    final long encoded = addressManager.toAddr(nbytes,
            IndexSegmentRegion.NODE.encodeOffset(offset));
    node.addr = encoded;
    if (storeCache != null) {
        // Insert the coded record into the cache under its encoded address.
        storeCache.putIfAbsent(encoded, coded);
    }
    return encoded;
}
/**
*
* Writes the complete file format for the index segment. The file is
* divided up as follows:
*
*
* - fixed length {@link IndexSegmentCheckpoint} record (required)
* - leaves (required)
* - nodes (may be empty)
* - the bloom filter (optional)
* - the {@link IndexMetadata} record (required, but extensible)
*
*
*
* The index segment metadata is divided into a base
* {@link IndexSegmentCheckpoint} record with a fixed format containing only
* essential data and additional metadata records written at the end of the
* file including the optional bloom filter and the required
* {@link IndexMetadata} record. The latter is where we write variable
* length metadata including the _name_ of the index, or additional metadata
* defined by a specific class of index.
*
*
* Once all nodes and leaves have been buffered we are ready to start
* writing the data. We skip over a fixed size metadata record since
* otherwise we are unable to pre-compute the offset to the leaves and hence
* the addresses of the leaves. The node addresses are written in an
* encoding that requires active translation by the receiver who must be
* aware of the offset to the start of the node region. We can not write the
* metadata record until we know the size and length of each of these
* regions (leaves, nodes, and the bloom filter, or other metadata records)
* since that information is required in order to be able to form their
* addresses for insertion in the metadata record.
*
*
* @param outChannel
*
* @param commitTime
*
* @throws IOException
* @throws InterruptedException
*
* FIXME There is no sense of an atomic commit when building a
* new index segment. We should write ZEROs into the checkpoint
* record initially and then seek back to the head of the file
* once we are done and write out the correct checkpoint record.
*
* Note: There are similar issues involved when we replicate
* index segment or journal files to verify that they are good.
*/
protected IndexSegmentCheckpoint writeIndexSegment(
final FileChannel outChannel, final long commitTime)
throws IOException, InterruptedException {
/*
* All nodes and leaves have been written. If we wrote any nodes
* onto the temporary channel then we also have to bulk copy them
* into the output channel.
*/
final long offsetLeaves;
final long extentLeaves;
final long offsetNodes;
final long extentNodes;
final long offsetBlobs;
final long extentBlobs;
final long addrRoot;
/*
* Skip over the checkpoint record at the start of the file.
*
* Note: We fill this area with zeros. When the index segment is empty
* (has no entries) then this causes the file length to be extended
* beyond the checkpoint record and the index metadata record gets
* written onto the file at that point. If we merely position the file
* to beyond the checkpoint record then nothing has been written on the
* file and the index metadata record gets written at offset 0L!
*/
outChannel.write(ByteBuffer.allocate(IndexSegmentCheckpoint.SIZE));
/*
* Direct copy the leaves from their buffer into the output file. If the
* buffer was backed by a file then that file will be deleted as a
* post-condition on the index build operation.
*/
if (plan.nleaves == 0) {
/*
* The tree is empty (no root leaf).
*/
// No leaves.
offsetLeaves = 0L;
extentLeaves = 0L;
// No nodes.
offsetNodes = 0L;
extentNodes = 0L;
// No root.
addrRoot = 0L;
} else {
offsetLeaves = IndexSegmentCheckpoint.SIZE;
// output the leaf buffer.
{
// /*
// * Transfer the leaf buffer en mass onto the output channel.
// *
// * Note: If a planned leaf is not emitted then this can cause an
// * exception to be thrown indicating that the IO transfer is not
// * progressing. This occurs when the record for that leaf was
// * allocated on the leafBuffer but never written onto the
// * leafBuffer. This allocate-then-write policy allows us to
// * double-link the leaves during the build. The build SHOULD
// * automatically correct for cases when there are not enough
// * tuples to fill out the leaves in the plan. However, if it
// * does not correct the problem, and hence does not write the
// * last allocated leaf data record, then you might see this
// * exception.
// */
// extentLeaves = leafBuffer.getBufferStrategy().transferTo(out);
// if (nodeBuffer != null) {
//
// // The offset to the start of the node region.
// offsetNodes = IndexSegmentCheckpoint.SIZE + extentLeaves;
//
// assert outChannel.position() == offsetNodes;
//
// } else {
//
// // zero iff there are no nodes.
// offsetNodes = 0L;
//
// }
// The extent of the leaves region on the file.
extentLeaves = leafAddrFactory.get();
if (plan.nnodes != 0) {
// The offset to the start of the node region.
offsetNodes = IndexSegmentCheckpoint.SIZE + extentLeaves;
} else {
// zero iff there are no nodes.
offsetNodes = 0L;
}
// Close the buffer.
// leafBuffer.close();
try {
// flush the last writes.
leafWriteCache.flush(false/* force */);
// close cache (discards buffer).
leafWriteCache.close();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
/*
* Direct copy the node index from the buffer into the output file.
* If the buffer was backed by a file then that file will be deleted
* as a post-condition on the index build operation.
*/
if (nodeBuffer != null) {
/*
* Seek to the start of the nodes region (the write cache does
* not change the file position when it writes onto the file so
* we need to explicitly seek to the desired location).
*/
outChannel.position(offsetNodes);
// Verify we are at the start of the nodes region.
assert outChannel.position() == offsetNodes : "position="
+ outChannel.position() + ", but offsetNodes="
+ offsetNodes;
// transfer the nodes en mass onto the output channel.
extentNodes = nodeBuffer.getBufferStrategy().transferTo(out);
// Close the buffer.
nodeBuffer.close();
// Note: already encoded relative to NODE region.
addrRoot = (((SimpleNodeData) stack[0]).addr);
} else if (nodeList != null) {
/*
* Write the nodes onto the output file.
*
* Note: The addresses are relative to the start of the nodes
* region, so we adjust the write cache using the offset to the
* nodes region.
*
* FIXME Use a WriteCacheService which will hide this complexity
* and give better throughput.
*/
// Setup a write cache.
final WriteCache.FileChannelWriteCache writeCache = new WriteCache.FileChannelWriteCache(
offsetNodes, null/* buf */, useChecksums,
false/* isHighlyAvailable */, false/* bufferHasData */,
new NOPReopener(out), 0L/* fileExtent */);
try {
// Count the #of bytes in the nodes.
int nbytes = 0;
// For each node.
for (NodeMetadata md : nodeList) {
final long addr = md.addr;
// the offset relative to the start of the nodes region.
final long offset = addressManager.getOffset(addr);
final AbstractFixedByteArrayBuffer slice = md.data
.data();
// track #of bytes across all nodes.
nbytes += slice.len();
final ByteBuffer data = slice.asByteBuffer();
final int chk = useChecksums?checker.checksum(data):0;
// write onto cache.
if (!writeCache.write(offset, data,chk)) {
// cache is full, evict to file.
writeCache.flush(false/* force */);
// reset the cache!
writeCache.reset();
// and write on the cache again.
if (!writeCache.write(offset, data,chk)) {
// directly write onto the output file.
FileChannelUtility.writeAll(writeCache.opener,
data, offset);
}
}
}
// force the last writes to the output file.
writeCache.flush(false/* force */);
// reset the cache!
writeCache.reset();
// #of bytes across all nodes.
extentNodes = nbytes;
} finally {
// releases the buffer.
writeCache.close();
}
// Note: already encoded relative to NODE region.
addrRoot = (((SimpleNodeData) stack[0]).addr);
} else {
/*
* The tree consists of just a root leaf.
*/
// This MUST be 0L if there are no nodes.
extentNodes = 0L;
// Address of the root leaf.
addrRoot = addrLastLeaf;
}
}
if (log.isInfoEnabled())
log.info("addrRoot: " + addrRoot + ", "
+ addressManager.toString(addrRoot));
/*
* Direct copy the optional blobBuffer onto the output file.
*
* Note: The backing BufferStrategy for the blobBuffer is allocated
* eagerly if there is an indication that blobs *might* be in use by the
* index. However, if nothing ever gets written onto the temporary store
* then the backing file is never created and there will be nothing to
* transfer. We look for this case and opt out of the transfer when
* nothing has been written onto the blobBuffer.
*/
if (blobBuffer == null || blobBuffer.getBufferStrategy().size() == 0L) {
// No blobs region.
offsetBlobs = extentBlobs = 0L;
} else {
// #of bytes written so far on the output file.
offsetBlobs = out.length();
// seek to the end of the file.
out.seek(offsetBlobs);
// transfer the blobs en masse onto the output channel.
extentBlobs = blobBuffer.getBufferStrategy().transferTo(out);
// Close the buffer.
blobBuffer.close();
}
/*
* If the bloom filter was constructed then serialize it on the end
* of the file.
*/
final long addrBloom;
if( bloomFilter == null ) {
addrBloom = 0L;
} else {
// serialize the bloom filter.
final byte[] bloomBytes = SerializerUtil.serialize(bloomFilter);
// #of bytes written so far on the output file.
final long offset = out.length();
// seek to the end of the file.
out.seek(offset);
// write the serialized bloom filter.
out.write(bloomBytes, 0, bloomBytes.length);
// Address of the region containing the bloom filter (one record).
addrBloom = addressManager.toAddr(bloomBytes.length,
IndexSegmentRegion.BASE.encodeOffset(offset));
if (storeCache != null) {
/*
* Insert the record into the cache.
*/
storeCache.putIfAbsent(addrBloom, bloomFilter);
}
}
/*
* Write out the metadata record.
*/
final long addrMetadata;
{
/*
* Serialize the metadata record.
*/
final byte[] metadataBytes = SerializerUtil.serialize(metadata);
// #of bytes written so far on the output file.
final long offset = out.length();
// seek to the end of the file.
out.seek(offset);
// write the serialized extension metadata.
out.write(metadataBytes, 0, metadataBytes.length);
// Address of the region containing the metadata record (one record)
addrMetadata = addressManager.toAddr(metadataBytes.length,
IndexSegmentRegion.BASE.encodeOffset(offset));
if (storeCache != null) {
/*
* Insert the record into the cache.
*/
storeCache.putIfAbsent(addrMetadata, metadata);
}
}
/*
* Write out the ICUVersionRecord at the end of the file.
*
* FIXME Enable when merging in the ICUVersionRecord change set.
*/
// {
//
// /*
// * Serialize the record.
// */
// final byte[] icuVersionBytes = SerializerUtil.serialize(ICUVersionRecord.newInstance());
//
// // #of bytes written so far on the output file.
// final long offset = out.length();
//
// // seek to the end of the file.
// out.seek(offset);
//
// // write the serialized extension metadata.
// out.write(icuVersionBytes, 0, icuVersionBytes.length);
//
// // Address of the region containing the metadata record (one record)
// long addrICUVersion = addressManager.toAddr(icuVersionBytes.length,
// IndexSegmentRegion.BASE.encodeOffset(offset));
//
// if (storeCache != null) {
//
// /*
// * Insert the record into the cache.
// */
//
// storeCache.putIfAbsent(addrMetadata, metadata);
//
// }
//
// }
/*
* Seek to the start of the file and write out the checkpoint record.
*/
{
// // timestamp for the index segment.
// final long now = System.currentTimeMillis();
outChannel.position(0);
/*
* Note: The build plan is restricted to MAX_INT leaves and there
* are always more leaves than nodes in a B+Tree, so both nnodes and
* nleaves are int32 values.
*/
if(nnodesWritten>Integer.MAX_VALUE)
throw new AssertionError();
if(nleavesWritten>Integer.MAX_VALUE)
throw new AssertionError();
final IndexSegmentCheckpoint md = new IndexSegmentCheckpoint(
addressManager.getOffsetBits(), //
plan.height, // will always be correct.
(int)nleavesWritten, // actual #of leaves written.
(int)nnodesWritten, // actual #of nodes written.
ntuplesWritten, // actual #of tuples written.
maxNodeOrLeafLength,//
offsetLeaves, extentLeaves, offsetNodes, extentNodes,
offsetBlobs, extentBlobs, addrRoot, addrMetadata,
addrBloom, addrFirstLeaf, addrLastLeaf, out.length(),
compactingMerge, useChecksums, segmentUUID, commitTime);
md.write(out);
if(log.isInfoEnabled())
log.info(md.toString());
// save the index segment resource description for the caller.
this.segmentMetadata = new SegmentMetadata(outFile, //out.length(),
segmentUUID, commitTime);
return md;
}
}
/**
 * Reports the resource description of the {@link IndexSegment} generated by
 * this builder.
 *
 * @return The description of the generated index segment resource.
 *
 * @throws IllegalStateException
 *             if no description has been assigned yet, i.e. the build
 *             operation is not complete.
 */
public IResourceMetadata getSegmentMetadata() {

    final IResourceMetadata description = segmentMetadata;

    if (description != null) {

        return description;

    }

    throw new IllegalStateException();

}

/**
 * The description of the generated resource, or <code>null</code> until it
 * has been assigned.
 */
private SegmentMetadata segmentMetadata = null;
/**
 * Abstract base class for classes used to construct and serialize nodes and
 * leaves written onto the index segment.
 *
 * @author Bryan Thompson
 */
abstract protected static class AbstractSimpleNodeData implements
        IAbstractNodeData {

    /**
     * The level in the output tree for this node or leaf (origin zero). The
     * root is always at level zero (0).
     */
    final int level;

    /**
     * The capacity used to dimension {@link #keys} (presumably the
     * branching factor of the output tree).
     */
    final int m;

    /**
     * Mutable keys (directly managed by the {@link IndexSegmentBuilder}).
     */
    final MutableKeyBuffer keys;

    /**
     * The min/max version timestamp for the node/leaf. These data are only
     * used when the B+Tree is maintaining per tuple revision timestamps.
     */
    long minimumVersionTimestamp;

    long maximumVersionTimestamp;

    /**
     * We precompute the #of children to be assigned to each node and the
     * #of values to be assigned to each leaf and store that value in this
     * field. While the field name is "max", this is the exact number that
     * must be assigned to the node or leaf.
     */
    int max = -1;

    /**
     * @param level
     *            The level of the node or leaf in the output tree (origin
     *            zero).
     * @param m
     *            The capacity of the key buffer.
     */
    protected AbstractSimpleNodeData(final int level, final int m) {

        this.level = level;

        this.m = m;

        /*
         * @todo This should probably be dimensioned to m-1 for a node and m
         * for a leaf. The mutable B+Tree would have dimensions to m for a
         * node and m+1 for a leaf to allow for overflow during split/join,
         * but we only need the exact number of slots.
         */
        this.keys = new MutableKeyBuffer(m);

        // Start with an "empty" range: any observed timestamp narrows it.
        this.minimumVersionTimestamp = Long.MAX_VALUE;

        this.maximumVersionTimestamp = Long.MIN_VALUE;

    }

    /**
     * Prepares this instance for reuse: records the new limit, marks the
     * key buffer as empty and restores the min/max version timestamps to
     * their initial values.
     *
     * @param max
     *            The #of children to be assigned to this node -or- the #of
     *            tuples to be assigned to a leaf.
     */
    protected void reset(final int max) {

        this.max = max;

        this.keys.nkeys = 0;

        this.minimumVersionTimestamp = Long.MAX_VALUE;

        this.maximumVersionTimestamp = Long.MIN_VALUE;

    }

    final public int getKeyCount() {

        return keys.size();

    }

    final public IRaba getKeys() {

        return keys;

    }

    /**
     * Yes (however, note that the {@link IndexSegmentBuilder} directly
     * accesses and modifies the internal data structures).
     */
    final public boolean isReadOnly() {

        return true;

    }

    /**
     * No.
     */
    final public boolean isCoded() {

        return false;

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    final public AbstractFixedByteArrayBuffer data() {

        throw new UnsupportedOperationException();

    }

    /**
     * The largest version timestamp recorded for this node or leaf.
     *
     * @throws UnsupportedOperationException
     *             unless version timestamps are being maintained.
     */
    final public long getMaximumVersionTimestamp() {

        if (!hasVersionTimestamps())
            throw new UnsupportedOperationException();

        // Note: previously returned the minimum by mistake.
        return maximumVersionTimestamp;

    }

    /**
     * The smallest version timestamp recorded for this node or leaf.
     *
     * @throws UnsupportedOperationException
     *             unless version timestamps are being maintained.
     */
    final public long getMinimumVersionTimestamp() {

        if (!hasVersionTimestamps())
            throw new UnsupportedOperationException();

        // Note: previously returned the maximum by mistake.
        return minimumVersionTimestamp;

    }

}
/**
 * Uncoded, mutable holder for the data of a single leaf. It carries the
 * keys, values and optional per-tuple metadata but none of the logic for
 * operations on a leaf.
 *
 * @author Bryan Thompson
 */
protected static class SimpleLeafData extends AbstractSimpleNodeData
        implements ILeafData {

    /**
     * The values stored in the leaf (directly accessed by the
     * {@link IndexSegmentBuilder}).
     */
    final MutableValueBuffer vals;

    /**
     * Per-tuple delete markers; <code>null</code> unless delete markers
     * are maintained.
     */
    final boolean[] deleteMarkers;

    /**
     * Per-tuple version timestamps; <code>null</code> unless version
     * timestamps are maintained.
     */
    final long[] versionTimestamps;

    /**
     * Per-tuple raw record markers; <code>null</code> unless raw record
     * markers are maintained.
     */
    final boolean[] rawRecords;

    /**
     * @param level
     *            The level of the leaf in the output tree.
     * @param m
     *            The capacity used to dimension the per-tuple arrays.
     * @param metadata
     *            Consulted to decide which optional per-tuple arrays
     *            (delete markers, version timestamps, raw record markers)
     *            are allocated.
     */
    public SimpleLeafData(final int level, final int m,
            final IndexMetadata metadata) {

        super(level, m);

        this.vals = new MutableValueBuffer(m);

        if (metadata.getDeleteMarkers()) {
            this.deleteMarkers = new boolean[m];
        } else {
            this.deleteMarkers = null;
        }

        if (metadata.getVersionTimestamps()) {
            this.versionTimestamps = new long[m];
        } else {
            this.versionTimestamps = null;
        }

        if (metadata.getRawRecords()) {
            this.rawRecords = new boolean[m];
        } else {
            this.rawRecords = null;
        }

    }

    /**
     * Resets the state inherited from the base class and marks the value
     * buffer as empty.
     *
     * @param max
     *            The #of tuples to be assigned to the leaf.
     */
    protected void reset(final int max) {

        super.reset(max);

        vals.nvalues = 0;

    }

    /** Always <code>true</code>. */
    final public boolean isLeaf() {

        return true;

    }

    /** The #of values, which is reported as the #of keys. */
    final public int getValueCount() {

        return getKeyCount();

    }

    final public IRaba getValues() {

        return vals;

    }

    final public boolean hasDeleteMarkers() {

        return deleteMarkers != null;

    }

    final public boolean hasVersionTimestamps() {

        return versionTimestamps != null;

    }

    final public boolean hasRawRecords() {

        return rawRecords != null;

    }

    /**
     * @throws UnsupportedOperationException
     *             unless delete markers are maintained.
     */
    final public boolean getDeleteMarker(final int index) {

        final boolean[] markers = deleteMarkers;

        if (markers == null) {
            throw new UnsupportedOperationException();
        }

        return markers[index];

    }

    /**
     * @throws UnsupportedOperationException
     *             unless version timestamps are maintained.
     */
    final public long getVersionTimestamp(final int index) {

        final long[] timestamps = versionTimestamps;

        if (timestamps == null) {
            throw new UnsupportedOperationException();
        }

        return timestamps[index];

    }

    /**
     * The address decoded from the value at the given index when that
     * tuple is flagged as a raw record and {@link IRawStore#NULL}
     * otherwise.
     *
     * @throws UnsupportedOperationException
     *             unless raw record markers are maintained.
     */
    final public long getRawRecord(final int index) {

        if (rawRecords == null) {
            throw new UnsupportedOperationException();
        }

        if (rawRecords[index]) {
            return AbstractBTree.decodeRecordAddr(vals.get(index));
        }

        return IRawStore.NULL;

    }

    /**
     * Yes - the caller maintains the necessary information and then updates
     * the coded {@link ReadOnlyLeafData} record once we have the address of
     * the next record.
     */
    final public boolean isDoubleLinked() {

        return true;

    }

    /**
     * @throws UnsupportedOperationException
     *             since the data are maintained externally and patched on
     *             the coded records by the {@link IndexSegmentBuilder}.
     */
    final public long getPriorAddr() {

        throw new UnsupportedOperationException();

    }

    /**
     * @throws UnsupportedOperationException
     *             since the data are maintained externally and patched on
     *             the coded records by the {@link IndexSegmentBuilder}.
     */
    final public long getNextAddr() {

        throw new UnsupportedOperationException();

    }

}
/**
 * Uncoded, mutable holder for the data of a single node, without any of the
 * logic for operations on the node.
 * <p>
 * Note: All node addresses that are internal to a node and reference a
 * child node (vs a leaf) are correct relative to the start of the
 * {@link IndexSegmentRegion#NODE} region. This is an unavoidable
 * consequence of serializing the nodes before we have the total offset to
 * the start of the {@link IndexSegmentRegion#NODE} region.
 *
 * @see IndexSegmentRegion
 * @see IndexSegmentAddressManager
 *
 * @author Bryan Thompson
 */
protected static class SimpleNodeData extends AbstractSimpleNodeData
        implements INodeData {

    /**
     * The address at which the child nodes were written. This is a negative
     * integer iff the child is a node and a positive integer iff the child
     * is a leaf. When it is a negative integer, you must flip the sign to
     * obtain a relative offset to the start of the index node block and the
     * correct size for the compressed node. The actual offset of the index
     * node block must be added to the relative offset before you can use
     * this to address into the output file.
     */
    final long[] childAddr;

    /**
     * The #of entries spanned by each child of this node.
     */
    final long[] childEntryCount;

    /**
     * <code>true</code> iff the node is tracking the min/max tuple revision
     * timestamps.
     */
    final boolean hasVersionTimestamps;

    // mutable.

    /**
     * The relative address at which this node was written on the temporary
     * channel. This is a negative integer. If you flip the sign then it
     * encodes a relative offset to the start of the index node block and
     * the correct size for the compressed node.
     */
    long addr = 0L;

    /**
     * This tracks the #of defined values in {@link #childAddr} separately
     * from the #of defined keys. The reason that we do this is that the
     * logic for handling a leaf eviction and recording the address of the
     * child and the separator key for the _next_ child requires an
     * awareness of the intermediate state - when we have filled in the
     * childAddr for the last leaf but not yet filled in the separatorKey
     * for the next leaf.
     */
    int nchildren = 0;

    /**
     * The #of entries spanned by this node.
     */
    long nentries;

    /**
     * @param level
     *            The level of the node in the output tree.
     * @param m
     *            The capacity used to dimension the per-child arrays.
     * @param hasVersionTimestamps
     *            <code>true</code> iff the node tracks the min/max tuple
     *            revision timestamps.
     */
    public SimpleNodeData(final int level, final int m,
            final boolean hasVersionTimestamps) {

        super(level, m);

        this.hasVersionTimestamps = hasVersionTimestamps;

        this.childEntryCount = new long[m];

        this.childAddr = new long[m];

    }

    /**
     * Reset counters and flags so that the node may be reused.
     * <p>
     * Note: {@link #childAddr} and {@link #childEntryCount} are NOT
     * cleared here.
     *
     * @param max
     *            The new limit on the #of children to fill on this node.
     */
    protected void reset(final int max) {

        super.reset(max);

        nentries = 0;

        nchildren = 0;

        addr = 0;

    }

    /**
     * Validates a child index for the per-child accessors.
     * <p>
     * NOTE(review): the accepted upper bound is <code>keys.size() + 1</code>,
     * which is one larger than the last index implied by
     * {@link #getChildCount()} - confirm whether this is intentional.
     *
     * @throws IllegalArgumentException
     *             if the index is negative or greater than
     *             <code>keys.size() + 1</code>.
     */
    private void checkChildIndex(final int index) {

        if (index < 0) {
            throw new IllegalArgumentException();
        }

        final int upperBound = keys.size() + 1;

        if (index > upperBound) {
            throw new IllegalArgumentException();
        }

    }

    final public long getChildAddr(final int index) {

        checkChildIndex(index);

        return childAddr[index];

    }

    final public long getChildEntryCount(final int index) {

        checkChildIndex(index);

        return childEntryCount[index];

    }

    /** The #of children, reported as one more than the #of keys. */
    final public int getChildCount() {

        return 1 + keys.size();

    }

    final public long getSpannedTupleCount() {

        return nentries;

    }

    /** Always <code>false</code>. */
    final public boolean isLeaf() {

        return false;

    }

    final public boolean hasVersionTimestamps() {

        return hasVersionTimestamps;

    }

}
/**
 * An {@link INodeFactory} whose allocation methods always throw
 * {@link UnsupportedOperationException}; neither nodes nor leaves can be
 * created through it.
 */
protected static class NOPNodeFactory implements INodeFactory {

    /** The only instance (the constructor is private). */
    public static final INodeFactory INSTANCE = new NOPNodeFactory();

    private NOPNodeFactory() {
        // use INSTANCE.
    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    public Node allocNode(final AbstractBTree btree, final long addr,
            final INodeData data) {

        throw new UnsupportedOperationException();

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    public Leaf allocLeaf(final AbstractBTree btree, final long addr,
            final ILeafData data) {

        throw new UnsupportedOperationException();

    }

}
/**
 * A tuple iterator backed by a mutable leaf. This implementation is used
 * when we materialize the view in RAM in a single leaf and then do the
 * build over that. The implementation always returns the pre-allocated
 * byte[] for the key or value in order to avoid redundant allocations. This
 * is safe since the data in that {@link MutableLeafData} instance were
 * allocated when the view was materialized and their references can be
 * safely reused when we build the output B+Tree.
 * <p>
 * Note: a single {@link ITuple} instance is reused for every visited tuple;
 * {@link #next()} only repositions it.
 *
 * @author Bryan Thompson
 * @param <E>
 *            The generic type of the objects which would be materialized
 *            from the tuples.
 *
 * @todo in fact, we could clear the references from the
 *       {@link MutableLeafData} as we go.
 */
static private class MyTupleIterator<E> implements ITupleIterator<E> {

    private final boolean hasVersionTimestamp, hasDeleteMarkers, visitDeleted;

    /**
     * Directly exposes the data from the {@link MutableLeafData}.
     *
     * @author Bryan Thompson
     */
    private class MyTuple implements ITuple<E> {

        /** The index in the leaf of the state revealed by this tuple. */
        private int leafIndex;

        private final int flags;

        private final boolean needsKeys, needsVals;

        /**
         * @param flags
         *            The {@link IRangeQuery} flags.
         *
         * @throws UnsupportedOperationException
         *             unless both {@link IRangeQuery#KEYS} and
         *             {@link IRangeQuery#VALS} are specified.
         */
        public MyTuple(final int flags) {

            this.flags = flags;

            this.needsKeys = (flags & IRangeQuery.KEYS) != 0;

            this.needsVals = (flags & IRangeQuery.VALS) != 0;

            if (!needsKeys)
                throw new UnsupportedOperationException();

            if (!needsVals)
                throw new UnsupportedOperationException();

        }

        public int flags() {

            return flags;

        }

        public boolean getKeysRequested() {

            return needsKeys;

        }

        public boolean getValuesRequested() {

            return needsVals;

        }

        /** Reports the index of the current tuple within the leaf. */
        public long getVisitCount() {

            return leafIndex;

        }

        /** The key reference held by the leaf (not a copy). */
        public byte[] getKey() {

            return leaf.keys.keys[leafIndex];

        }

        /** The value reference held by the leaf (not a copy). */
        public byte[] getValue() {

            return leaf.vals.values[leafIndex];

        }

        /** Always <code>false</code> when there are no delete markers. */
        public boolean isDeletedVersion() {

            if (!hasDeleteMarkers)
                return false;

            return leaf.deleteMarkers[leafIndex];

        }

        public boolean isNull() {

            return leaf.vals.values[leafIndex] == null;

        }

        /** Always <code>0L</code> when there are no version timestamps. */
        public long getVersionTimestamp() {

            if (!hasVersionTimestamp)
                return 0L;

            return leaf.versionTimestamps[leafIndex];

        }

        /*
         * We do not actually use these methods in the IndexSegmentBuilder
         * and they would not be as efficient if we did since we are relying
         * on direct access to the MutableLeafData's internal data
         * structures.
         */

        public ByteArrayBuffer getKeyBuffer() {
            throw new UnsupportedOperationException();
        }

        public DataInputBuffer getKeyStream() {
            throw new UnsupportedOperationException();
        }

        public ByteArrayBuffer getValueBuffer() {
            throw new UnsupportedOperationException();
        }

        public DataInputBuffer getValueStream() {
            throw new UnsupportedOperationException();
        }

        public E getObject() {
            throw new UnsupportedOperationException();
        }

        public int getSourceIndex() {
            throw new UnsupportedOperationException();
        }

        public ITupleSerializer getTupleSerializer() {
            throw new UnsupportedOperationException();
        }

        public IBlock readBlock(long addr) {
            throw new UnsupportedOperationException();
        }

    }

    /** The source data. */
    private final MutableLeafData leaf;

    /** A view onto the current tuple in that leaf. */
    private final MyTuple tuple;

    /** The index of the next tuple to be considered in the leaf. */
    private int i;

    /** The first index to visit. */
    private final int fromIndex;

    /** The first index to NOT visit. */
    private final int toIndex;

    /**
     * Creates an iterator that considers every tuple in the leaf, from
     * index zero up to (but excluding) the leaf's key count.
     *
     * @param leaf
     *            The leaf whose entries will be traversed (required).
     * @param flags
     *            The {@link IRangeQuery} flags. Both keys and values must
     *            be requested. Deleted tuples are visited only when
     *            {@link IRangeQuery#DELETED} is specified.
     *
     * @throws UnsupportedOperationException
     *             unless both keys and values are requested.
     */
    public MyTupleIterator(final MutableLeafData leaf, final int flags) {

        this.leaf = leaf;

        this.tuple = new MyTuple(flags);

        this.hasVersionTimestamp = leaf.hasVersionTimestamps();

        this.hasDeleteMarkers = leaf.hasDeleteMarkers();

        this.visitDeleted = (flags & IRangeQuery.DELETED) != 0;

        // Note: [i] starts at its default value of zero, i.e. at fromIndex.
        fromIndex = 0;

        toIndex = leaf.getKeyCount();

    }

    /**
     * Examines the entry at {@link #i}. If it passes the criteria for an
     * entry to visit then return true. Otherwise increment {@link #i}
     * until either all entries in this leaf have been exhausted -or- an
     * entry is identified that passes the various criteria.
     */
    public boolean hasNext() {

        for (; i >= fromIndex && i < toIndex; i++) {

            /*
             * Skip deleted entries unless specifically requested.
             */
            if (hasDeleteMarkers && !visitDeleted
                    && leaf.getDeleteMarker(i)) {

                // skipping a deleted version.
                continue;

            }

            // entry @ index is next to visit.
            return true;

        }

        // nothing left to visit in this leaf.
        return false;

    }

    /**
     * Repositions the shared tuple onto the next entry to be visited and
     * returns it.
     *
     * @throws NoSuchElementException
     *             if there are no more entries to visit.
     */
    public ITuple<E> next() {

        if (!hasNext()) {

            throw new NoSuchElementException();

        }

        // next tuple.
        tuple.leafIndex = i++;

        return tuple;

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    public void remove() {

        throw new UnsupportedOperationException();

    }

}
/**
 * The algorithms which may be used to build an {@link IndexSegment}.
 *
 * @author Bryan Thompson
 *
 * @todo add other methods here as they are defined.
 */
enum BuildEnum {

    /**
     * Two passes over the source iterator: the first computes the exact
     * range count and the second builds the {@link IndexSegment}. This
     * uses less memory and can be highly efficient when using
     * {@link IndexSegmentMultiBlockIterator} since it will read the data
     * from any source {@link IndexSegment}(s) at the disk transfer rate.
     */
    TwoPass,

    /**
     * One pass over the source index in which the tuples are fully
     * buffered in memory. This approach does less IO, but requires more
     * memory in the Java heap.
     */
    FullyBuffered

}
/**
 * Prints an optional message followed by the usage summary on
 * {@link System#err} and then terminates the JVM. This method does not
 * return.
 * <p>
 * The option list mirrors what {@link #main(String[])} actually parses:
 * <code>-merge</code> and <code>-build</code> are flags which do not take
 * an argument.
 *
 * @param args
 *            The command line args (not used).
 * @param msg
 *            An optional message which is printed before the usage
 *            summary (may be <code>null</code>).
 * @param exitCode
 *            The exit code passed to {@link System#exit(int)}.
 */
protected static void usage(final String[] args, final String msg, final int exitCode) {

    if (msg != null)
        System.err.println(msg);

    System.err.println("usage: [opts] journal [name]*");
    System.err.println(" journal is the name of the journal file.");
    System.err.println(" [name]* is the name of one or more indices (defaults to all).");
    System.err.println(" [opts] is any of:");
    System.err.println(" -m #\tThe branching factor for the output index segments.");
    System.err.println(" -alg (FullyBuffered|TwoPass)\tThe algorithm to use.");
    System.err.println(" -merge\tPerforms a compacting merge, deleted tuples are purged (default).");
    System.err.println(" -build\tPerforms an incremental build, deleted tuples are preserved.");
    System.err.println(" -O outDir\tThe output directory.");
    System.err.println(" -bufferNodes (true|false)\tWhen true, the nodes are fully buffered in memory (default true).");
    System.err.println(" -verify\tVerifies each generated index segment against its source index.");
    System.err.println(" -help\tPrints this message and exits.");

    System.exit(exitCode);

}
/**
* Driver for index segment build against one or more named indices on a
* local journal.
* <p>
* The journal is opened read-only. For each selected index, an index
* segment is built onto a file in the output directory whose name is the
* index name plus the <code>Journal.Options.SEG</code> suffix. When
* verification is requested, the generated index segment is re-opened and
* its tuples are compared against those of the source index.
*
* @param args
* <code>[opts] journal [name]*</code>, where <i>journal</i> is
* the name of the journal file (which must exist), where
* <i>name</i> is the name of a B+Tree registered on that journal,
* and where <i>opts</i> are any of:
* <dl>
* <dt>-m #</dt>
* <dd>Override the default branching factor for the index
* segment.</dd>
* <dt>-alg algorithm</dt>
* <dd>Specify which build algorithm to use. See
* {@link BuildEnum}. The default is {@link BuildEnum#TwoPass}.</dd>
* <dt>-merge or -build</dt>
* <dd>Specifies whether to do a compacting merge (deleted tuples
* are purged from the generated index segment) or an incremental
* build (deleted tuples are preserved). The default is
* merge. Neither flag takes an argument.</dd>
* <dt>-O outDir</dt>
* <dd>Specify the name of the directory on which the generated
* index segment file(s) will be written. This defaults to the
* current working directory.</dd>
* <dt>-bufferNodes (true|false)</dt>
* <dd>When true (the default), the generated nodes are fully
* buffered in memory rather than being written onto a temporary
* file.</dd>
* <dt>-verify</dt>
* <dd>Verify the generated index segment against the source
* B+Tree.</dd>
* <dt>-help or --?</dt>
* <dd>Print the usage message and exit.</dd>
* </dl>
* If no names are specified, then an index segment will be
* generated for each named B+Tree registered on the source
* journal.
*
* @throws Exception
*/
public static void main(final String[] args) throws Exception {
// The output branching factor (optional override).
Integer branchingFactorOverride = null;
// When true, performs a correctness check against the source BTree.
boolean verify = false;
// The journal file (must already exist).
File journalFile = null;
// The name(s) of the indices to be processed.
final List names = new LinkedList();
// The directory into which the generated index segments will be
// written. Each index segment will be named based on the source index
// name. The default is the current directory.
File outDir = new File(".");
/*
* When true, a compacting merge will be performed (deleted tuples will
* be purged). Otherwise this will be an incremental build (deleted
* tuples will be preserved in the generated index segment).
*/
boolean compactingMerge = true;
/*
* When true, the generated nodes will be fully buffered in memory rather
* than being written onto a temporary file.
*/
boolean bufferNodes = true;
// Which build algorithm to use.
BuildEnum buildEnum = BuildEnum.TwoPass;//FullyBuffered;
// The temporary directory handed to the builder (java.io.tmpdir).
final File tmpDir = new File(System.getProperty("java.io.tmpdir"));
if (!tmpDir.exists() && !tmpDir.mkdir()) {
throw new IOException(
"Temporary directory does not exist / can not be created: "
+ tmpDir);
}
/*
* Parse the command line, overriding various properties.
*/
{
int i = 0;
// Consume leading arguments which begin with '-' as options.
for (; i < args.length && args[i].startsWith("-"); i++) {
final String arg = args[i];
// Note: options taking a value consume the next argument via ++i.
if (arg.equals("-m")) {
branchingFactorOverride = Integer.valueOf(args[++i]);
} else if (arg.equals("-O")) {
outDir = new File(args[++i]);
} else if (arg.equals("-verify")) {
verify = true;
} else if (arg.equals("-merge")) {
compactingMerge = true;
} else if (arg.equals("-build")) {
compactingMerge = false;
} else if (arg.equals("-bufferNodes")) {
bufferNodes = Boolean.valueOf(args[++i]);
} else if (arg.equals("-alg")) {
buildEnum = BuildEnum.valueOf(args[++i]);
} else if (arg.equals("-help")||arg.equals("--?")) {
usage(args, null/* msg */, 1/* exitCode */);
} else {
throw new UnsupportedOperationException("Unknown option: "
+ arg);
}
} // next arg.
// The next argument is the journal file name, which is required.
// Note: usage() terminates the JVM, so control does not continue
// past this point when the journal name is missing.
if (i == args.length) {
usage(args, "journal name is required.", 1/* exitCode */);
}
journalFile = new File(args[i++]);
if (!journalFile.exists()) {
throw new FileNotFoundException(journalFile.toString());
}
// The remaining argument(s) are the source B+Tree names.
while (i < args.length) {
names.add(args[i++]);
}
// NOTE(review): journalFile was assigned a new File just above, so
// this check can not fire.
if (journalFile == null) {
throw new RuntimeException(
"The journal file was not specified.");
}
// NOTE(review): names is a final, non-null list, so this check can not
// fire either. An empty list means "all indices" (see below).
if (names == null) {
throw new RuntimeException("The index name was not specified.");
}
if (!outDir.exists() && !outDir.mkdirs()) {
throw new IOException(
"Output directory does not exist and could not be created: "
+ outDir);
}
} // parse command line.
// Open the journal: must already exist.
final Journal journal;
{
final Properties properties = new Properties();
properties
.setProperty(Journal.Options.FILE, journalFile.toString());
// The journal is opened read-only.
properties.setProperty(Journal.Options.READ_ONLY, Boolean.TRUE
.toString());
journal = new Journal(properties);
}
try {
// @todo allow caller to specify the commitTime of interest.
if (names.isEmpty()) {
// No names were given: collect the name of each entry visited by
// the journal's Name2Addr range iterator.
final ITupleIterator itr = journal
.getName2Addr().rangeIterator();
while (itr.hasNext()) {
names.add(itr.next().getObject().name);
}
} else {
// Some validation up front.
// Note: this validation is applied only to names which were given
// explicitly on the command line.
for (String name : names) {
// Verify named indices exist.
if (journal.getIndex(name) == null) {
// Index not found.
throw new RuntimeException("Index not found: " + name);
}
// Verify output file does not exist or is empty.
final File outFile = new File(outDir, name
+ Journal.Options.SEG);
if (outFile.exists() && outFile.length() != 0) {
throw new RuntimeException(
"Output file exists and is non-empty: "
+ outFile);
}
}
}
System.out.println("Will process " + names.size() + " indices.");
final long beginAll = System.currentTimeMillis();
// For each named index.
for (String name : names) {
// Do the build for this B+Tree.
final BTree btree = journal.getIndex(name);
// The output file is named for the index within the output directory.
final File outFile = new File(outDir, name
+ Journal.Options.SEG);
// The output branching factor: the override if given, else the value
// configured in the index metadata.
final int m = branchingFactorOverride == null ? btree
.getIndexMetadata().getIndexSegmentBranchingFactor()
: branchingFactorOverride.intValue();
final long begin = System.currentTimeMillis();
final long commitTime = btree.getLastCommitTime();
System.out.println("Building index segment: in(m="
+ btree.getBranchingFactor() + ", rangeCount="
+ btree.rangeCount() + "), out(m=" + m + "), alg="+buildEnum);
// Create the builder for the selected algorithm (no key-range
// constraint is imposed: fromKey and toKey are both null).
final IndexSegmentBuilder builder;
switch (buildEnum) {
case TwoPass:
builder = IndexSegmentBuilder.newInstanceTwoPass(btree,
outFile, tmpDir, m, compactingMerge, commitTime,
null/* fromKey */, null/* toKey */,
bufferNodes);
break;
case FullyBuffered:
builder = IndexSegmentBuilder.newInstanceFullyBuffered(
btree, outFile, tmpDir, m, compactingMerge,
commitTime, null/* fromKey */, null/* toKey */,
bufferNodes);
break;
default:
throw new AssertionError(buildEnum.toString());
}
// Do the build.
final IndexSegmentCheckpoint checkpoint = builder.call();
// The total elapsed build time, including range count or
// pre-materialization of tuples.
final long elapsed = System.currentTimeMillis() - begin;
final String results = "name=" + name + " : elapsed=" + elapsed
+ "ms, setup=" + builder.elapsed_setup + "ms, write="
+ builder.elapsed_write + "ms, m=" + builder.plan.m
+ ", size="
+ (builder.outFile.length() / Bytes.megabyte)
+ "mb, mb/sec=" + builder.mbPerSec;
System.out.println(results);
if(verify) {
/*
* Verify the generated index segment against the source
* B+Tree.
*/
// @see BLZG-1501 (remove LRUNexus)
// if (LRUNexus.INSTANCE != null) {
//
// /*
// * Clear the records for the index segment from the
// * cache so we will read directly from the file. This is
// * necessary to ensure that the data on the file is good
// * rather than just the data in the cache.
// */
//
// System.out.println("Flushing index segment cache: "
// + builder.outFile);
//
// LRUNexus.INSTANCE.deleteCache(checkpoint.segmentUUID);
//
// }
// Re-open the generated file. Both the store and the index segment
// are closed once the comparison is done (or fails).
final IndexSegmentStore segStore = new IndexSegmentStore(
outFile);
try {
final IndexSegment seg = segStore.loadIndexSegment();
try {
System.out.println("Verifying index segment: "
+ builder.outFile);
assertSameEntryIterator(name, btree.rangeIterator(),
seg.rangeIterator());
} finally {
seg.close();
}
} finally {
segStore.close();
}
}
} // next source B+Tree.
final long elapsedAll = System.currentTimeMillis() - beginAll;
System.out.println("Processed " + names.size() + " indices in "
+ elapsedAll + "ms");
} finally {
// Always release the journal.
journal.close();
}
}
/**
* Verifies that the iterators visit tuples having the same data in the same
* order.
*
* @param expectedItr
* @param actualItr
*/
private static void assertSameEntryIterator(
final String name,
final ITupleIterator> expectedItr,
final ITupleIterator> actualItr) {
long nvisited = 0L;
while (expectedItr.hasNext()) {
if (!actualItr.hasNext())
throw new RuntimeException(name
+ ":: Expecting another index entry: nvisited="
+ nvisited);
final ITuple> expectedTuple = expectedItr.next();
final ITuple> actualTuple = actualItr.next();
nvisited++;
if (!BytesUtil.bytesEqual(expectedTuple.getKey(), actualTuple
.getKey())) {
throw new RuntimeException(name + ":: Wrong key: nvisited="
+ nvisited + ", expected=" + expectedTuple
+ ", actual=" + actualTuple);
}
if (!BytesUtil.bytesEqual(expectedTuple.getValue(), actualTuple
.getValue())) {
throw new RuntimeException(name + ":: Wrong value: nvisited="
+ nvisited + ", expected=" + expectedTuple
+ ", actual=" + actualTuple);
}
}
if (actualItr.hasNext())
throw new RuntimeException(name + ":: Not expecting more tuples");
}
}