
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.btree;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.data.INodeData;
import com.bigdata.btree.isolation.IConflictResolver;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.raba.codec.CanonicalHuffmanRabaCoder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoder.DefaultFrontCodedRabaCoder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoderDupKeys;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.btree.view.FusedView;
import com.bigdata.config.Configuration;
import com.bigdata.config.IValidator;
import com.bigdata.config.IntegerRangeValidator;
import com.bigdata.config.IntegerValidator;
import com.bigdata.htree.HTree;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.LongPacker;
import com.bigdata.io.SerializerUtil;
import com.bigdata.io.compression.IRecordCompressorFactory;
import com.bigdata.journal.IIndexManager;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.MetadataIndex;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.resources.OverflowManager;
import com.bigdata.resources.StaleLocatorException;
import com.bigdata.service.AbstractFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.service.ndx.pipeline.AbstractSubtask;
import com.bigdata.sparse.SparseRowStore;
/**
*
* The persistent and mostly immutable metadata for an {@link AbstractBTree}.
* This class allows you to configure several very important aspects of the
* B+Tree (and other persistence capable data structures) behavior. Read on.
*
*
* An instance of this class is required in order to create a {@link BTree} or
* {@link IndexSegment}. Further, when registering a scale-out index you will
* first create an instance of this class that will serve as the metadata
* template for all index resources which are part of that scale-out
* index.
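*
* For example, a minimal sketch (assuming store is an already
* open {@link IRawStore}, such as a journal; the names shown are
* illustrative):
*
* <pre>
* final IndexMetadata md = new IndexMetadata("testIndex", UUID.randomUUID());
* md.setBranchingFactor(64);
* final BTree btree = BTree.create(store, md);
* </pre>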
*
* Delete markers, version timestamps, and isolatable indices
*
* By default a {@link BTree} does not maintain delete markers and a request to
* delete an entry under a key will cause that entry to be removed from the live
* version of the index. However, such indices do not support "overflow" (they
* can not be evicted onto read-only {@link IndexSegment}s) and as such they do
* not support scale-out).
*
*
* The {@link SparseRowStore} handles a "delete" of a property value by writing
* a null value under a key and does NOT require the use of index
* entry delete markers, even in scale-out deployments. A compacting merge of a
* {@link SparseRowStore} applies a history policy based on a consideration of
* the timestamped property values, including values bound to a null.
*
*
* Delete markers combined with an ordered set of index resources is sufficient
* to support all features of range-partitioned indices, including compacting
* merge. Given three index resources {A,B,C} for a single index partition, the
* order over the resources gives us the guarantee that any index entry in A
* will be more recent than any index entry in B or C. So when reading a fused
* view we always stop once we have an index entry for a key, even if that entry
* has the deleted flag set.
*
*
* Delete markers occupy very little space in the leaf data structure (one bit
* each), however when they are used a deleted index entry is NOT removed from
* the index. Instead, the key remains in the leaf paired to a delete bit and a
* null value (or simply elided). These "deleted" entries can
* only be removed from the index by a compacting merge. When transactional
* isolation is used, the criteria for removing deleted entries are stronger -
* they must no longer be visible to any active or allowable transaction as
* determined by the transaction manager, see below for more on this.
*
*
* Transaction isolation requires delete markers plus version
* timestamps. The version timestamps in the unisolated index (the live index)
* give the timestamp of the commit during which the index entry was last
* updated. The timestamp in the write set of a transaction is copied from the index
* view corresponding to the ground state of the transaction the first time that
* index entry is overwritten within that transaction (there is a special case
* when the index entry was not pre-existing - we assign the start time of the
* transaction in that case so when we validate we are only concerned that the
* entry is either not found (never written) or that the entry exists with the
* same timestamp - other conditions are write-write conflicts). On commit we
* write the commit time on the updated index entries in the unisolated index.
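*
* For example, a sketch enabling transactional isolation on a metadata
* template md (using the convenience setter defined on this class):
*
* <pre>
* md.setIsolatable(true); // enables delete markers and version timestamps
* assert md.isIsolatable();
* </pre>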
*
* History policies and timestamps
*
* There are in fact two kinds of timestamps in use - isolatable indices place a
* timestamp on the index entry itself while the {@link SparseRowStore} places a
* timestamp in the key. One consequence of this is that it is very
* efficient to read historical data from the {@link SparseRowStore} since the
* data are right there in key order. On the other hand, reading historical data
* from an isolatable index requires reading from each historical commit state
* of that index that is of interest (this is NOT efficient). This is why the
* {@link SparseRowStore} design places timestamps in the key - so that the
* application can efficiently read both the current and historical property
* values within a logical row.
*
*
* Regardless of whether the timestamp is on the index entry (as it always is
* for isolatable indices) or in the key ({@link SparseRowStore}), the
* existence of timestamps makes it possible for an application to specify a
* history policy governing when property values will be deleted.
*
*
* When an index participates in transactions the transaction manager manages
* the life cycle of overwritten and deleted index entries (and the resources on
* which the indices exist). This is done by preserving such data until no
* transaction exists that can read from those resources. Unless an immortal
* store is desired, the "purge time" is set at a time no more recent than the
* earliest fully isolated transaction (either a read-only tx as of the start
* time of the tx or a read-write tx as of its start time). The role of a
* "history policy" with transactions is therefore how much history to buffer
* between the earliest running tx and the chosen "purge time". When the
* transaction manager updates the "purge time" it notifies the journal/data
* services. Resources having no data later than the purge time may be deleted
* and SHOULD NOT be carried forward when building new index segments.
*
*
* History policies for non-transactional indices are somewhat different. A
* scale-out index without timestamps will buffer historical data only until the
* next compacting merge of a given index partition. The compacting merge uses
* the fused view of the resources comprising the index partition and only
* writes out the undeleted index entries.
*
*
* If an application instead chooses to use timestamps in a non-transactional
* index then (a) timestamps must be assigned by either the client or the data
* service; and (b) applications can specify a history policy where data older
* than a threshold time (but not #of versions) will be eradicated. This
* approach is possible, but not well-supported in the higher level APIs.
*
*
* The {@link SparseRowStore} design is more flexible since it allows (a) fast
* access to historical property values for the same "row"; and (b) history
* policies that may be specified in terms of the #of versions, the age of a
* datum, and that can keep at least N versions of a datum. This last point is
* quite important as it allows you to retain the entirety of the most current
* revision of the logical row, even when some datums are much older than
* others.
*
*
* Serialization
*
*
* Note: Derived classes SHOULD implement the {@link Externalizable} interface and
* explicitly manage serialization versions so that their metadata may evolve in
* a backward compatible manner.
*
*
* @todo Make sure that metadata for index partition "zones" propagates with the
* partition metadata so that appropriate policies are enforceable locally
* (access control, latency requirements, replication, purging of
* historical deleted versions, etc).
*
* @todo add optional property containing IndexMetadata to be used as of the
* next overflow so that people can swap out key and value serializers and
* the like during overflow operations. The concept of operations is that you
* map the new metadata over the index partitions, overflow is triggered
* either lazily or eagerly, and the resulting {@link BTree} and
* {@link IndexSegment} objects will begin to use the new key / value
* serializers, etc., while the existing objects will still have their old
* key/val serializers and therefore can still be read.
*
* @author Bryan Thompson
* @version $Id$
*/
public class IndexMetadata implements Serializable, Externalizable, Cloneable,
IKeyBuilderFactory {
private static final long serialVersionUID = 4370669592664382720L;
private static final transient Logger log = Logger
.getLogger(IndexMetadata.class);
/**
* Options and their defaults for the {@link com.bigdata.btree} package and
* the {@link BTree} and {@link IndexSegment} classes. Options that apply
* equally to views and {@link AbstractBTree}s are in the package namespace,
* such as whether or not a bloom filter is enabled. Options that apply to
* all {@link AbstractBTree}s are specified within that namespace while
* those that are specific to {@link BTree} or {@link IndexSegment} are
* located within their respective class namespaces. Some properties, such
* as the branchingFactor, are defined for both the {@link BTree} and the
* {@link IndexSegment} because their defaults tend to be different when an
* {@link IndexSegment} is generated from an {@link BTree}.
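*
* For example, a sketch of supplying these options through a
* {@link Properties} object (the properties are consumed when the index
* metadata is initialized; the values shown are illustrative):
*
* <pre>
* final Properties properties = new Properties();
* properties.setProperty(Options.BTREE_BRANCHING_FACTOR, "64");
* properties.setProperty(Options.BLOOM_FILTER, "true");
* </pre>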
*
* @todo It should be possible to specify the key, value, and node/leaf
* coders via this interface. This is easy enough if there is a
* standard factory interface, since we can specify the class name,
* and more difficult if we need to create an instance.
*
* Note: The basic pattern here is using the class name, having a
* default instance of the class (or a factory for that instance), and
* then being able to override properties for that instance. Beans
* stuff really, just simpler.
*
* @todo it should be possible to specify the overflow handler and its
* properties via options (as you can with beans or jini
* configurations).
*
* @todo it should be possible to specify a different split handler and its
* properties via options (as you can with beans or jini
* configurations).
*
* @author Bryan Thompson
* @version $Id$
*/
public static interface Options {
/*
* Constants.
*/
/**
* The minimum allowed branching factor (3). The branching factor may be
* odd or even.
*/
int MIN_BRANCHING_FACTOR = 3;
/**
* A reasonable maximum branching factor for a {@link BTree}.
*/
int MAX_BTREE_BRANCHING_FACTOR = 4196;
/**
* A reasonable maximum branching factor for an {@link IndexSegment}.
*/
int MAX_INDEX_SEGMENT_BRANCHING_FACTOR = 10240;
/**
* The minimum write retention queue capacity is two (2) in order to
* avoid cache evictions of the leaves participating in a split.
*/
int MIN_WRITE_RETENTION_QUEUE_CAPACITY = 2;
/**
* A large maximum write retention queue capacity. A reasonable value
* with a large heap is generally in the 4000 to 8000 range, depending on the
* branching factor. The impact on the JVM heap is a function of both
* the write retention queue capacity and the B+Tree branching factor.
* Larger values are of benefit if you are doing sustained writes on the
* index and have a large java heap (and even then, GC will probably
* prevent values larger than 10000 from being useful).
*/
int MAX_WRITE_RETENTION_QUEUE_CAPACITY = 50000;
/*
* Options that apply to FusedViews as well as to AbstractBTrees.
*
* Note: These options are in the package namespace.
*/
/**
* Optional property controls whether or not a bloom filter is
* maintained (default {@value #DEFAULT_BLOOM_FILTER}). When enabled,
* the bloom filter is effective up to ~ 2M entries per index
* (partition). For scale-up, the bloom filter is automatically disabled
* once its error rate would be too large given the #of index entries.
* For scale-out, as the index grows we keep splitting it into more and
* more index partitions, and those index partitions are comprised of
* views of one or more {@link AbstractBTree}s. While the mutable
* {@link BTree}s might occasionally grow too large to support a bloom
* filter, data is periodically migrated onto immutable
* {@link IndexSegment}s which have perfect fit bloom filters. This
* means that the bloom filter scales-out, but not up.
*
* @see BloomFilterFactory#DEFAULT
*
* @see #DEFAULT_BLOOM_FILTER
*/
String BLOOM_FILTER = (com.bigdata.btree.BTree.class.getPackage()
.getName()
+ ".bloomFilter").intern();
String DEFAULT_BLOOM_FILTER = "false";
/**
* When raw record support is enabled for the index, this is the maximum
* length of an index value which will be stored within a leaf before it
* is automatically promoted to a raw record reference on the backing
* store (default {@value #DEFAULT_MAX_REC_LEN}).
*
* @see IndexMetadata#getRawRecords()
* @see IndexMetadata#getMaxRecLen()
*/
String MAX_REC_LEN = (com.bigdata.btree.BTree.class.getPackage()
.getName() + ".maxRecLen").intern();
String DEFAULT_MAX_REC_LEN = "256";
/**
* The name of an optional property whose value identifies the data
* service on which the initial index partition of a scale-out index
* will be created. The value may be the {@link UUID} of that data
* service (this is unambiguous) or the name associated with the data
* service (it is up to the administrator to not assign the same name to
* different data service instances and an arbitrary instance having the
* desired name will be used if more than one instance is assigned the
* same name). The default behavior is to select a data service using
* the load balancer, which is done automatically by
* {@link IBigdataFederation#registerIndex(IndexMetadata, UUID)} if
* {@link IndexMetadata#getInitialDataServiceUUID()} returns null.
*/
// note: property applies to views so namespace is the package.
String INITIAL_DATA_SERVICE = com.bigdata.btree.BTree.class
.getPackage().getName()
+ ".initialDataService";
/**
* The capacity of the hard reference queue used to retain recently
* touched nodes (nodes or leaves) and to defer the eviction of dirty
* nodes (nodes or leaves).
*
* The purpose of this queue is to retain recently touched nodes and
* leaves and to defer eviction of dirty nodes and leaves in case they
* will be modified again soon. Once a node falls off the write
* retention queue it is checked to see if it is dirty. If it is dirty,
* then it is serialized and persisted on the backing store. If the
* write retention queue capacity is set to a large value (say, GTE
* 1000), then that will increase the commit latency and have a
* negative effect on the overall performance. Too small a value will
* mean that nodes that are undergoing mutation will be serialized and
* persisted prematurely leading to excessive writes on the backing
* store. For append-only stores, this directly contributes to what are
* effectively redundant and thereafter unreachable copies of the
* intermediate state of nodes as only nodes that can be reached by
* navigation from a {@link Checkpoint} will ever be read again. The
* value 500 appears to be a good default. While it is
* possible that some workloads could benefit from a larger value, this
* leads to higher commit latency and can therefore have a broad impact
* on performance.
*
* Note: The write retention queue is used for both {@link BTree} and
* {@link IndexSegment}. Any touched node or leaf is placed onto this
* queue. As nodes and leaves are evicted from this queue, they are then
* placed onto the optional read-retention queue.
*
* The default value is a function of the JVM heap size. For small
* heaps, it is {@value #DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY}. For
* larger heaps the value may be 8000 (1G), or 20000 (10G). These larger
* defaults are heuristics. Values larger than 8000 benefit the on-disk
* size of the journal, while values up to 8000 can also improve
* throughput dramatically. Larger values are ONLY useful if the
* application is performing sustained writes on the index (hundreds of
* thousands to millions of records).
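*
* A sketch (the setter is defined on {@link IndexMetadata}; the value
* shown assumes a large heap and sustained index writes):
*
* <pre>
* md.setWriteRetentionQueueCapacity(8000);
* </pre>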
*/
String WRITE_RETENTION_QUEUE_CAPACITY = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ ".writeRetentionQueue.capacity").intern();
/**
* The #of entries on the write retention queue that will be scanned for
* a match before a new reference is appended to the queue. This trades
* off the cost of scanning entries on the queue, which is handled by
* the queue itself, against the cost of queue churn. Note that queue
* eviction drives IOs required to write the leaves on the store, but
* incremental writes occur iff the {@link AbstractNode#referenceCount}
* is zero and the node or leaf is dirty.
*/
String WRITE_RETENTION_QUEUE_SCAN = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ ".writeRetentionQueue.scan").intern();
String DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY = "500";// was 500
String DEFAULT_WRITE_RETENTION_QUEUE_SCAN = "20";
/**
* Override the {@link IKeyBuilderFactory} used by the
* {@link DefaultTupleSerializer} (the default is a
* {@link DefaultKeyBuilderFactory} initialized with an empty
* {@link Properties} object).
*
* FIXME {@link KeyBuilder} configuration support is not finished.
*/
String KEY_BUILDER_FACTORY = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ "keyBuilderFactory").intern();
/**
* Override the {@link IRabaCoder} used for the keys in the nodes of a
* B+Tree (the default is a {@link FrontCodedRabaCoder} instance).
*/
String NODE_KEYS_CODER = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ "nodeKeysCoder").intern();
/**
* Override the {@link IRabaCoder} used for the keys of leaves in
* B+Trees (the default is a {@link FrontCodedRabaCoder} instance).
*
* @see DefaultTupleSerializer#setLeafKeysCoder(IRabaCoder)
*/
String LEAF_KEYS_CODER = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ ".leafKeysCoder").intern();
/**
* Override the {@link IRabaCoder} used for the values of leaves in
* B+Trees (default is a {@link CanonicalHuffmanRabaCoder}).
*
* @see DefaultTupleSerializer#setLeafValuesCoder(IRabaCoder)
*/
String LEAF_VALUES_CODER = (com.bigdata.btree.AbstractBTree.class
.getPackage().getName()
+ ".leafValuesCoder").intern();
// /**
// * Option determines whether or not per-child locks are used by
// * {@link Node} for a read-only {@link AbstractBTree} (default
// * {@value #DEFAULT_CHILD_LOCKS}). This option effects synchronization
// * in {@link Node#getChild(int)}. Synchronization is not required for
// * mutable {@link BTree}s as they already impose the constraint that the
// * caller is single threaded. Synchronization is required in this method
// * to ensure that the data structure remains coherent when concurrent
// * threads demand access to the same child of a given {@link Node}.
// * Per-child locks have higher potential concurrency since locking is
// * done on a distinct {@link Object} for each child rather than on a
// * shared {@link Object} for all children of a given {@link Node}.
// * However, per-child locks require more {@link Object} allocation (for
// * the locks) and thus contribute to heap demand.
// *
// * Note: While this can improve read concurrency, this option imposes
// * additional RAM demands since there is on {@link Object} allocated for
// * each {@link Node} in the {@link BTree}. This is why it is turned off
// * by default.
// */
// String CHILD_LOCKS = com.bigdata.btree.AbstractBTree.class.getPackage()
// .getName()
// + ".childLocks";
//
// String DEFAULT_CHILD_LOCKS = "false";
/*
* Options that are valid for any AbstractBTree but which are not
* defined for a FusedView.
*
* Note: These options are in the AbstractBTree namespace.
*/
/*
* Options that are specific to BTree.
*
* Note: These options are in the BTree namespace.
*/
/**
* The name of a class derived from {@link BTree} that will be used to
* re-load the index. Note that index partitions are in general views
* (of one or more resources). Therefore only unpartitioned indices can
* be meaningfully specialized solely in terms of the {@link BTree} base
* class.
*
* @todo in order to provide a similar specialization mechanism for
* scale-out indices you would need to specify the class name for
* the {@link IndexSegment} and the {@link FusedView}. You might
* also need to override the {@link Checkpoint} class - for
* example the {@link MetadataIndex} does this.
*/
String BTREE_CLASS_NAME = (BTree.class.getName()+".className").intern();
/**
* The name of an optional property whose value specifies the branching
* factor for a mutable {@link BTree}.
*
* @see #DEFAULT_BTREE_BRANCHING_FACTOR
* @see #INDEX_SEGMENT_BRANCHING_FACTOR
*/
String BTREE_BRANCHING_FACTOR = (BTree.class.getName()+".branchingFactor").intern();
/**
* The default branching factor for a mutable {@link BTree}.
*
* Note: on 9/11/2009 I changed the default B+Tree branching factor and
* write retention queue capacity to 64 (was 32) and 8000 (was 500)
* respectively. This change in the B+Tree branching factor reduces the
* height of B+Trees on the Journal, increases the size of the
* individual records on the disk, and aids performance substantially.
* The larger write retention queue capacity helps to prevent B+Tree
* nodes and leaves from being coded and flushed to disk too soon, which
* decreases disk IO and keeps things in their mutable form in memory
* longer, which improves search performance and keeps down the costs of
* mutation operations. [Dropped back to 32/500 on 9/15/09 since
* this does not do so well at scale on machines with less RAM.]
*/
String DEFAULT_BTREE_BRANCHING_FACTOR = "32"; //"256"
// /**
// * The capacity of the hard reference queue used to retain recently used
// * nodes (or leaves) (default
// * {@value #DEFAULT_BTREE_READ_RETENTION_QUEUE_CAPACITY}). When zero
// * (0), this queue is disabled.
// *
// * The read retention queue complements the write retention queue. The
// * latter has a strong buffering effect, but we can not increase the
// * size of the write retention queue without bound as that will increase
// * the commit latency. However, the read retention queue can be quite
// * large and will "simply" buffer "recently" used nodes and leaves in
// * memory. This can have a huge effect, especially when a complex
// * high-level query would otherwise thrash the disk as nodes that are
// * required for query processing fall off of the write retention queue
// * and get garbage collected. The pragmatic upper bound for this
// * probably depends on the index workload. At some point, you will stop
// * seeing an increase in performance as a function of the read retention
// * queue for a given workload. The larger the read retention queue, the
// * more burden the index can impose on the heap. However, copy-on-write
// * explicitly clears all references in a node so the JVM can collect the
// * data for nodes that are no longer part of the index before they fall
// * off of the queue even if it can not collect the node reference
// * itself.
// *
// * A large value works well for scale-up but you might need to
// * reduce the read retention queue capacity if you expect to have
// * a large #of smaller indices open, e.g., for scale-out scenarios. Zero
// * will disable the read-retention queue. This queue ONLY applies to
// * {@link BTree}s (vs {@link IndexSegment}s).
// *
// * @todo The size of the read retention queue should be set dynamically
// * as a function of the depth of the BTree (or the #of nodes and
// * leaves), the branching factor, and the RAM available to the
// * HOST (w/o swapping) and to the JVM. For a mutable {@link BTree}
// * the depth changes only slowly, but the other factors are always
// * changing. Regardless, changing the read-retention queue size is
// * never a problem as cleared references will never cause a
// * strongly reachable node to be released.
// *
// * To avoid needless effort, there should be a minimum queue
// * capacity that is used up to depth=2/3. If the queue capacity is
// * set to n=~5-10% of the maximum possible #of nodes in a btree of
// * a given depth, then we can compute the capacity dynamically
// * based on that parameter. And of course it can be easily
// * provisioned when the BTree is {@link #reopen()}ed.
// */
// String BTREE_READ_RETENTION_QUEUE_CAPACITY = com.bigdata.btree.BTree.class
// .getPackage().getName()
// + ".readRetentionQueue.capacity";
//
// String DEFAULT_BTREE_READ_RETENTION_QUEUE_CAPACITY = "10000";
//
// /**
// * The #of entries on the hard reference queue that will be scanned for
// * a match before a new reference is appended to the queue. This trades
// * off the cost of scanning entries on the queue, which is handled by
// * the queue itself, against the cost of queue churn.
// */
// String BTREE_READ_RETENTION_QUEUE_SCAN = com.bigdata.btree.BTree.class
// .getPackage().getName()
// + ".readRetentionQueue.scan";
//
// String DEFAULT_BTREE_READ_RETENTION_QUEUE_SCAN = "20";
/**
* An optional factory providing record-level compression for the nodes
* and leaves of a {@link BTree} (default
* {@value #DEFAULT_BTREE_RECORD_COMPRESSOR_FACTORY}).
*
* @see #INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY
*
* FIXME Record level compression support is not finished.
*/
String BTREE_RECORD_COMPRESSOR_FACTORY = (BTree.class.getName()
+ ".recordCompressorFactory").intern();
/**
*
* @see #BTREE_RECORD_COMPRESSOR_FACTORY
*/
String DEFAULT_BTREE_RECORD_COMPRESSOR_FACTORY = null;
/*
* Options that are specific to IndexSegment.
*
* Note: These options are in the IndexSegment namespace.
*/
/**
* The name of the property whose value specifies the branching factor
* for an immutable {@link IndexSegment}.
*/
String INDEX_SEGMENT_BRANCHING_FACTOR = (IndexSegment.class
.getName()
+ ".branchingFactor").intern();
/**
* The default branching factor for an {@link IndexSegment}.
*/
String DEFAULT_INDEX_SEGMENT_BRANCHING_FACTOR = "512";
/**
* When true an attempt will be made to fully buffer the
* nodes (but not the leaves) of the {@link IndexSegment} (default
* {@value #DEFAULT_INDEX_SEGMENT_BUFFER_NODES}). The nodes in the
* {@link IndexSegment} are serialized in a contiguous region by the
* {@link IndexSegmentBuilder}. That region may be fully buffered when
* the {@link IndexSegment} is opened, in which case queries against the
* {@link IndexSegment} will incur NO disk hits for the nodes and only
* one disk hit per visited leaf.
*
* Note: The nodes are read into a buffer allocated from the
* {@link DirectBufferPool}. If the size of the nodes region in the
* {@link IndexSegmentStore} file exceeds the capacity of the buffers
* managed by the {@link DirectBufferPool}, then the nodes WILL NOT be
* buffered. The {@link DirectBufferPool} is used both for efficiency
* and because a bug dealing with temporary direct buffers would
* otherwise cause the C heap to be exhausted!
*
* @see #DEFAULT_INDEX_SEGMENT_BUFFER_NODES
*
* @todo should be on by default? (but verify that the unit tests do
* not run out of memory when it is enabled by default).
*/
String INDEX_SEGMENT_BUFFER_NODES = (IndexSegment.class.getName()
+ ".bufferNodes").intern();
/**
* @see #INDEX_SEGMENT_BUFFER_NODES
*/
String DEFAULT_INDEX_SEGMENT_BUFFER_NODES = "false";
// /**
// * The size of the LRU cache backing the weak reference cache for leaves
// * (default {@value #DEFAULT_INDEX_SEGMENT_LEAF_CACHE_CAPACITY}).
// *
// * While the {@link AbstractBTree} already provides caching for nodes
// * and leaves based on navigation down the hierarchy from the root node,
// * the {@link IndexSegment} uses an additional leaf cache to optimize
// * access to leaves based on the double-linked list connecting the
// * leaves.
// *
// * A larger value will tend to retain leaves longer at the expense of
// * consuming more RAM when many parts of the {@link IndexSegment} are
// * hot.
// */
// String INDEX_SEGMENT_LEAF_CACHE_CAPACITY = IndexSegment.class.getName()
// + ".leafCacheCapacity";
//
// /**
// *
// * @see #INDEX_SEGMENT_LEAF_CACHE_CAPACITY
// */
// String DEFAULT_INDEX_SEGMENT_LEAF_CACHE_CAPACITY = "100";
//
// /**
// * The timeout in nanoseconds for the LRU cache backing the weak
// * reference cache for {@link IndexSegment} leaves (default
// * {@value #DEFAULT_INDEX_SEGMENT_LEAF_CACHE_TIMEOUT}).
// *
// * While the {@link AbstractBTree} already provides caching for nodes
// * and leaves based on navigation down the hierarchy from the root node,
// * the {@link IndexSegment} uses an additional leaf cache to optimize
// * access to leaves based on the double-linked list connecting the
// * leaves.
// *
// * A larger value will tend to retain leaves longer at the expense of
// * consuming more RAM when many parts of the {@link IndexSegment} are
// * hot.
// */
// String INDEX_SEGMENT_LEAF_CACHE_TIMEOUT = IndexSegment.class.getName()
// + ".leafCacheTimeout";
//
// /**
// *
// * @see #INDEX_SEGMENT_LEAF_CACHE_TIMEOUT
// */
// String DEFAULT_INDEX_SEGMENT_LEAF_CACHE_TIMEOUT = ""
// + TimeUnit.SECONDS.toNanos(30);
/**
* An optional factory providing record-level compression for the nodes
* and leaves of an {@link IndexSegment} (default
* {@value #DEFAULT_INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY}).
*
* @see #BTREE_RECORD_COMPRESSOR_FACTORY
*
* FIXME Record level compression support is not finished.
*/
String INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY = (IndexSegment.class.getName()
+ ".recordCompressorFactory").intern();
/**
*
* @see #INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY
*/
String DEFAULT_INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY = null;
/*
* Split handler properties.
*/
// * @see DefaultSplitHandler
// *
// * Note: Use these settings to trigger splits sooner and thus enter the
// * more interesting regions of the phase space more quickly BUT DO NOT
// * use these settings for deployment!
// *
// * final int minimumEntryCount = 1 * Bytes.kilobyte32; (or 10k)
// *
// * final int entryCountPerSplit = 5 * Bytes.megabyte32; (or 50k)
// /**
// * An index partition which has no more than this many tuples should be
// * joined with its rightSibling (if any).
// */
// String SPLIT_HANDLER_MIN_ENTRY_COUNT = DefaultSplitHandler.class
// .getName()
// + ".minimumEntryCount";
//
// /**
// * The target #of tuples for an index partition.
// */
// String SPLIT_HANDLER_ENTRY_COUNT_PER_SPLIT = DefaultSplitHandler.class
// .getName()
// + ".entryCountPerSplit";
//
// /**
// * The index partition will be split when its actual entry count is GTE
// * to overCapacityMultiplier * entryCountPerSplit
// */
// String SPLIT_HANDLER_OVER_CAPACITY_MULTIPLIER = DefaultSplitHandler.class
// .getName()
// + ".overCapacityMultiplier";
//
// /**
// * When an index partition will be split, the #of new index partitions
// * will be chosen such that each index partition is approximately
// * underCapacityMultiplier full.
// */
// String SPLIT_HANDLER_UNDER_CAPACITY_MULTIPLIER = DefaultSplitHandler.class
// .getName()
// + ".underCapacityMultiplier";
//
// /**
// * The #of samples to take per estimated split (non-negative, and
// * generally on the order of 10s of samples). The purpose of the samples
// * is to accommodate the actual distribution of the keys in the index.
// */
// String SPLIT_HANDLER_SAMPLE_RATE = DefaultSplitHandler.class.getName()
// + ".sampleRate";
//
// String DEFAULT_SPLIT_HANDLER_MIN_ENTRY_COUNT = ""
// + (500 * Bytes.kilobyte32);
//
// String DEFAULT_SPLIT_HANDLER_ENTRY_COUNT_PER_SPLIT = ""
// + (1 * Bytes.megabyte32);
//
// String DEFAULT_SPLIT_HANDLER_OVER_CAPACITY_MULTIPLIER = "1.5";
//
// String DEFAULT_SPLIT_HANDLER_UNDER_CAPACITY_MULTIPLIER = ".75";
//
// String DEFAULT_SPLIT_HANDLER_SAMPLE_RATE = "20";
/*
* Asynchronous index write API.
*/
/**
* The capacity of the queue on which the application writes. Chunks are
* drained from this queue by the {@link AbstractTaskMaster}, broken
* into splits, and each split is written onto the
* {@link AbstractSubtask} sink handling writes for the associated index
* partition.
*/
String MASTER_QUEUE_CAPACITY = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".masterQueueCapacity").intern();
String DEFAULT_MASTER_QUEUE_CAPACITY = "5000";
/**
* The desired size of the chunks that the master will draw from its
* queue.
*/
String MASTER_CHUNK_SIZE = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".masterChunkSize").intern();
String DEFAULT_MASTER_CHUNK_SIZE = "10000";
/**
* The time in nanoseconds that the master will combine smaller chunks
* so that it can satisfy the desired masterChunkSize.
*/
String MASTER_CHUNK_TIMEOUT_NANOS = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".masterChunkTimeoutNanos").intern();
String DEFAULT_MASTER_CHUNK_TIMEOUT_NANOS = ""
+ TimeUnit.MILLISECONDS.toNanos(50);
/**
* The time in nanoseconds that the {@link AbstractSubtask sink} will
* wait inside of the {@link IAsynchronousIterator} when it polls the
* iterator for a chunk. This value should be relatively small so that
* the sink remains responsive rather than blocking inside of the
* {@link IAsynchronousIterator} for long periods of time.
*/
String SINK_POLL_TIMEOUT_NANOS = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".sinkPollTimeoutNanos").intern();
String DEFAULT_SINK_POLL_TIMEOUT_NANOS = ""
+ TimeUnit.MILLISECONDS.toNanos(50);
/**
* The capacity of the internal queue for the per-sink output buffer.
*/
String SINK_QUEUE_CAPACITY = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".sinkQueueCapacity").intern();
String DEFAULT_SINK_QUEUE_CAPACITY = "5000";
/**
* The desired size of the chunks that will be written by the
* {@link AbstractSubtask sink}.
*/
String SINK_CHUNK_SIZE = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".sinkChunkSize").intern();
String DEFAULT_SINK_CHUNK_SIZE = "10000";
/**
* The maximum amount of time in nanoseconds that a sink will combine
* smaller chunks so that it can satisfy the desired sinkChunkSize
* (default {@value #DEFAULT_SINK_CHUNK_TIMEOUT_NANOS}). The default is
* an infinite timeout. This means that the sink will simply wait until
* {@link #SINK_CHUNK_SIZE} elements have accumulated before writing on
* the index partition. This makes it much easier to adjust the
* performance since you simply adjust the {@link #SINK_CHUNK_SIZE}.
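*
* For example, a sketch pairing a finite timeout with a larger chunk
* size (the values shown are illustrative):
*
* <pre>
* properties.setProperty(Options.SINK_CHUNK_SIZE, "20000");
* properties.setProperty(Options.SINK_CHUNK_TIMEOUT_NANOS,
*         Long.toString(TimeUnit.SECONDS.toNanos(5)));
* </pre>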
*/
String SINK_CHUNK_TIMEOUT_NANOS = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".sinkChunkTimeoutNanos").intern();
String DEFAULT_SINK_CHUNK_TIMEOUT_NANOS = "" + Long.MAX_VALUE;
/**
* The time in nanoseconds after which an idle sink will be closed
* (default {@value #DEFAULT_SINK_IDLE_TIMEOUT_NANOS}). Any buffered
* writes are flushed when the sink is closed. The idle timeout is reset
* (a) if a chunk is available to be drained by the sink; or (b) if a
* chunk is drained from the sink. If no chunks become available then the
* sink will eventually decide that it is idle, will flush any buffered
* writes, and will close itself.
*
* If the idle timeout is LT the {@link #SINK_CHUNK_TIMEOUT_NANOS} then
* a sink will remain open as long as new chunks appear and are combined
* within idle timeout, otherwise the sink will decide that it is idle
* and will flush its last chunk and close itself. If this is
* {@link Long#MAX_VALUE} then the sink will identify itself as idle and
* will only be closed if the master is closed or the sink has received
* a {@link StaleLocatorException} for the index partition on which the
* sink is writing.
*/
// GTE chunkTimeout
String SINK_IDLE_TIMEOUT_NANOS = (AsynchronousIndexWriteConfiguration.class
.getName()
+ ".sinkIdleTimeoutNanos").intern();
String DEFAULT_SINK_IDLE_TIMEOUT_NANOS = "" + Long.MAX_VALUE;
/*
* Scatter split configuration.
*/
/**
* Boolean option indicates whether or not scatter splits are performed
* (default {@value #DEFAULT_SCATTER_SPLIT_ENABLED}). Scatter splits only apply
* for scale-out indices where they "scatter" the initial index
* partition across the {@link IDataService}s in the federation. This
* is normally very useful.
*
* Sometimes a scatter split is not the "right" thing for an index. An
* example would be an index where you have to do a LOT of synchronous
* RPC rather than using asynchronous index writes. In this case, the
* synchronous RPC can be a bottleneck unless the "chunk" size of the
* writes is large. This is especially true when writes on other indices
* must wait for the outcome of the synchronous RPC. E.g., foreign keys.
*
* @see OverflowManager.Options#SCATTER_SPLIT_ENABLED
*/
String SCATTER_SPLIT_ENABLED = (ScatterSplitConfiguration.class
.getName()
+ ".enabled").intern();
String DEFAULT_SCATTER_SPLIT_ENABLED = "true";
/**
* The percentage of the nominal index partition size at which a scatter
* split is triggered when there is only a single index partition for a
* given scale-out index (default
* {@link #DEFAULT_SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD}). The
* scatter split will break the index into multiple partitions and
* distribute those index partitions across the federation in order to
* allow more resources to be brought to bear on the scale-out index.
* The value must be LT the nominal index partition split point or normal
* index splits will take precedence and a scatter split will never be
* performed. The allowable range is therefore constrained to (0.1 : 1.0).
*/
String SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD = (ScatterSplitConfiguration.class
.getName()
+ ".percentOfSplitThreshold").intern();
String DEFAULT_SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD = ".25";
/**
* The #of data services on which the index will be scattered or ZERO(0)
* to use all discovered data services (default
* {@value #DEFAULT_SCATTER_SPLIT_DATA_SERVICE_COUNT}).
*/
String SCATTER_SPLIT_DATA_SERVICE_COUNT = (ScatterSplitConfiguration.class
.getName()
+ ".dataServiceCount").intern();
String DEFAULT_SCATTER_SPLIT_DATA_SERVICE_COUNT = "0";
/**
* The #of index partitions to generate when an index is scatter split.
* The index partitions will be evenly distributed across up to
* {@link #SCATTER_SPLIT_DATA_SERVICE_COUNT} discovered data services.
* When ZERO(0), the scatter split will generate (NDATA_SERVICES x 2)
* index partitions, where
* NDATA_SERVICES is either {@link #SCATTER_SPLIT_DATA_SERVICE_COUNT} or
* the #of discovered data services when that option is ZERO (0).
*
* The "ideal" number of index partitions is generally between (NCORES x
* NDATA_SERVICES / NINDICES) and (NCORES x NDATA_SERVICES). When there
* are NCORES x NDATA_SERVICES index partitions, each core is capable of
* servicing a distinct index partition assuming that the application
* and the "schema" are capable of driving the data service writes with
* that concurrency. However, if you have NINDICES, and the application
* drives writes on all index partitions of all indices at the same
* rate, then a 1:1 allocation of index partitions to cores would be
* "ideal".
*
* The "right" answer also depends on the data scale. If you have far
* less data than can fill that many index partitions to 200M each, then
* you should adjust the scatter split to use fewer index partitions or
* fewer data services.
*
* Finally, the higher the scatter the more you will need to use
* asynchronous index writes in order to obtain high throughput with
* sustained index writes.
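*
* For example, a sketch scattering the initial index partition across
* four data services with two index partitions each (the values shown
* are illustrative):
*
* <pre>
* properties.setProperty(Options.SCATTER_SPLIT_DATA_SERVICE_COUNT, "4");
* properties.setProperty(Options.SCATTER_SPLIT_INDEX_PARTITION_COUNT, "8");
* </pre>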
*/
String SCATTER_SPLIT_INDEX_PARTITION_COUNT = (ScatterSplitConfiguration.class
.getName()
+ ".indexPartitionCount").intern();
String DEFAULT_SCATTER_SPLIT_INDEX_PARTITION_COUNT = "0";
}
/**
* Address that can be used to read this metadata record from the store.
*
* Note: This is not persisted since we do not have the address until after
* we have written out the state of this record. However the value is
* written into each {@link Checkpoint} record.
*/
private transient /*final*/ long addrMetadata;
/**
* Address that can be used to read this metadata record from the store.
*
* Note: This is not a persistent property. However the value is set when
* the metadata record is read from, or written on, the store. It is zero
* when you {@link #clone()} a metadata record until it's been written onto
* the store.
*/
final public long getMetadataAddr() {
return addrMetadata;
}
/**
* The {@link UUID} of the {@link DataService} on which the first partition
* of the scale-out index should be created. This is a purely transient
* property and will be null unless either explicitly set or
* set using {@value Options#INITIAL_DATA_SERVICE}. This property is only
* set by the ctor(s) that are used to create a new {@link IndexMetadata}
* instance, so no additional lookups are performed during de-serialization.
*
* @see Options#INITIAL_DATA_SERVICE
* @see AbstractFederation#registerIndex(IndexMetadata, UUID)
*/
public UUID getInitialDataServiceUUID() {
return initialDataServiceUUID;
}
public void setInitialDataServiceUUID(UUID uuid) {
initialDataServiceUUID = uuid;
}
private transient UUID initialDataServiceUUID;
/*
* @todo consider allowing distinct values for the branching factor (already
* done), the class name, and possibly some other properties (record
* compression, checksum) for the index segments vs the mutable btrees.
*/
private UUID indexUUID;
private String name;
/**
* The type of the index.
*
* @see #VERSION4
*/
private IndexTypeEnum indexType;
private int branchingFactor;
private int writeRetentionQueueCapacity;
private int writeRetentionQueueScan;
// private int btreeReadRetentionQueueCapacity;
// private int btreeReadRetentionQueueScan;
private LocalPartitionMetadata pmd;
private String btreeClassName;
private String checkpointClassName;
private IRabaCoder nodeKeysCoder;
private ITupleSerializer tupleSer;
private IRecordCompressorFactory btreeRecordCompressorFactory;
private IRecordCompressorFactory indexSegmentRecordCompressorFactory;
private IConflictResolver conflictResolver;
private boolean deleteMarkers;
private boolean versionTimestamps;
private boolean versionTimestampFilters;
private boolean rawRecords;
private short maxRecLen;
private BloomFilterFactory bloomFilterFactory;
private IOverflowHandler overflowHandler;
private ISimpleSplitHandler splitHandler2;
private AsynchronousIndexWriteConfiguration asynchronousIndexWriteConfiguration;
private ScatterSplitConfiguration scatterSplitConfiguration;
/*
* IndexSegment fields.
*/
/**
* @see Options#INDEX_SEGMENT_BRANCHING_FACTOR
*/
private int indexSegmentBranchingFactor;
/**
* @see Options#INDEX_SEGMENT_BUFFER_NODES
*/
private boolean indexSegmentBufferNodes;
/**
* The unique identifier for the (scale-out) index whose data is stored in
* this B+Tree data structure.
*
* Note: When using a scale-out index the same indexUUID MUST be
* assigned to each mutable and immutable B+Tree having data for any
* partition of that scale-out index. This makes it possible to work
* backwards from the B+Tree data structures and identify the index to which
* they belong.
*/
public final UUID getIndexUUID() {return indexUUID;}
/**
* The type of the associated persistence capable data structure.
*/
public final IndexTypeEnum getIndexType() {
return indexType;
}
/**
* The name associated with the index -or- null iff the index
* is not named (internal indices are generally not named while application
* indices are always named).
*
* Note: When the index is a scale-out index, this is the name of the
* scale-out index NOT the name under which an index partition is
* registered.
*
* Note: When the index is a metadata index, then this is the name of the
* metadata index itself NOT the name of the managed scale-out index.
*/
public final String getName() {return name;}
/**
* The branching factor for a mutable {@link BTree}. The branching factor
* is the #of children in a node or values in a leaf and must be an integer
* greater than or equal to three (3). Larger branching factors result in
* trees with fewer levels. However there is a point of diminishing returns
* at which the amount of copying performed to move the data around in the
* nodes and leaves exceeds the performance gain from having fewer levels.
* The branching factor for the read-only {@link IndexSegment}s is
* generally much larger in order to reduce the number of disk seeks.
*/
public final int getBranchingFactor() {
return branchingFactor;
}
/**
* The branching factor used when building an {@link IndexSegment} (default
* is 4096). Index segments are read-only B+Tree resources. They are built
* using a bulk index build procedure and typically have a much higher
* branching factor than the corresponding mutable index on the journal.
* There are two reasons why it makes sense to use a larger branching factor
* for an index segment. First, the WORM Journal is used to buffer writes in
* scale-out and IO on an index on the WORM Journal is driven by node and
* leaf revisions so the index often uses a smaller branching factor on the
* WORM. Second, the index segment is laid out in total key order in the
* file and each node and leaf is a contiguous sequence of bytes on the
* disk (like the WORM, but unlike the RWStore). Since most of the latency
* of the disk is the seek, reading larger leaves from an index segment is
* efficient.
*
* Note: the value of this property will determine the branching factor of
* the {@link IndexSegment}. When the {@link IndexSegment} is built, it will
* be given a {@link #clone()} of this {@link IndexMetadata} and the actual
* branching factor for the {@link IndexSegment} will be set on the
* {@link #getBranchingFactor()} at that time.
*
* Note: a branching factor of 256 for an index segment and split limits of
* (1M,5M) imply an average B+Tree height of 1.5 to 1.8. With a 10ms seek
* time and NO CACHE that is between 15 and 18ms average seek time.
*
* Note: a branching factor of 512 for an index segment and split limits of
* (1M,5M) imply an average B+Tree height of 1.2 to 1.5. With a 10ms seek
* time and NO CACHE that is between 12 and 15ms average seek time.
*
* Note: the actual size of the index segment of course depends heavily on
* (a) whether or not block references are being stored since the referenced
* blocks are also stored in the index segment; (b) the size of the keys and
* values stored in the index; and (c) the key, value, and record
* compression options in use.
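*
* A sketch (the setter is defined on this class; the value shown is
* illustrative):
*
* <pre>
* md.setIndexSegmentBranchingFactor(512);
* </pre>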
*/
public final int getIndexSegmentBranchingFactor() {
return indexSegmentBranchingFactor;
}
/**
* Return true iff the nodes region for the
* {@link IndexSegment} should be fully buffered by the
* {@link IndexSegmentStore}.
*
* @see Options#INDEX_SEGMENT_BUFFER_NODES
*/
public final boolean getIndexSegmentBufferNodes() {
return indexSegmentBufferNodes;
}
public final void setIndexSegmentBufferNodes(boolean newValue) {
this.indexSegmentBufferNodes = newValue;
}
/**
* Return the record-level compression provider for a {@link BTree} (may be
* null, which implies no compression).
*/
public IRecordCompressorFactory getBtreeRecordCompressorFactory() {
return btreeRecordCompressorFactory;
}
public void setBtreeRecordCompressorFactory(
final IRecordCompressorFactory btreeRecordCompressorFactory) {
this.btreeRecordCompressorFactory = btreeRecordCompressorFactory;
}
/**
* Return the record-level compression provider for an {@link IndexSegment}
* (may be null, which implies no compression).
*/
public IRecordCompressorFactory getIndexSegmentRecordCompressorFactory() {
return indexSegmentRecordCompressorFactory;
}
public void setIndexSegmentRecordCompressorFactory(
final IRecordCompressorFactory segmentRecordCompressorFactory) {
this.indexSegmentRecordCompressorFactory = segmentRecordCompressorFactory;
}
/**
* @see Options#WRITE_RETENTION_QUEUE_CAPACITY
*/
public final int getWriteRetentionQueueCapacity() {
return writeRetentionQueueCapacity;
}
public final void setWriteRetentionQueueCapacity(int v) {
this.writeRetentionQueueCapacity = v;
}
/**
* @see Options#WRITE_RETENTION_QUEUE_SCAN
*/
public final int getWriteRetentionQueueScan() {
return writeRetentionQueueScan;
}
public final void setWriteRetentionQueueScan(int v) {
this.writeRetentionQueueScan = v;
}
// /**
// * @see Options#BTREE_READ_RETENTION_QUEUE_CAPACITY
// */
// public final int getBTreeReadRetentionQueueCapacity() {
//
// return btreeReadRetentionQueueCapacity;
//
// }
//
// public final void setBTreeReadRetentionQueueCapacity(int v) {
//
// this.btreeReadRetentionQueueCapacity = v;
//
// }
//
// /**
// * @see Options#BTREE_READ_RETENTION_QUEUE_SCAN
// */
// public final int getBTreeReadRetentionQueueScan() {
//
// return btreeReadRetentionQueueScan;
//
// }
//
// public final void setBTreeReadRetentionQueueScan(int v) {
//
// this.btreeReadRetentionQueueScan = v;
//
// }
/**
* When non-null, this is the description of the view of
* this index partition. This will be null iff the
* {@link BTree} is not part of a scale-out index. This is updated when the
* view composition for the index partition is changed.
*/
public final LocalPartitionMetadata getPartitionMetadata() {
return pmd;
}
/**
* The name of a class derived from {@link BTree} that will be used to
* re-load the index. Note that index partitions are in general views (of
* one or more resources). Therefore only unpartitioned indices can be
* meaningfully specialized solely in terms of the {@link BTree} base class.
*
* @see Options#BTREE_CLASS_NAME
*/
public final String getBTreeClassName() {
return btreeClassName;
}
/**
* The name of the {@link Checkpoint} class used by the index. This may be
* overridden to store additional state with each {@link Checkpoint} record.
*/
public final String getCheckpointClassName() {
return checkpointClassName;
}
public final void setCheckpointClassName(final String className) {
if (className == null)
throw new IllegalArgumentException();
this.checkpointClassName = className;
}
/**
* Object used to code (compress) the keys in a node.
*
* Note: The keys for nodes are separator keys for the leaves. Since they
* are chosen to be the minimum length separator keys dynamically when a
* leaf is split or joined, the keys in the node typically DO NOT conform to
* application expectations and MAY be assigned a different
* {@link IRabaCoder} for that reason.
*
* @see #getTupleSerializer()
*/
public final IRabaCoder getNodeKeySerializer() {return nodeKeysCoder;}
/**
* The object used to form unsigned byte[] keys from Java objects, to
* (de-)serialize Java object stored in the index, and to (de-)compress the
* keys and values when stored in a leaf or {@link ResultSet}.
*
* Note: If you change this value in a manner that is not backward
* compatible once entries have been written on the index then you may be
* unable to read any data already written.
*/
public final ITupleSerializer getTupleSerializer() {return tupleSer;}
/**
* The optional object for handling write-write conflicts.
*
* The concurrency control strategy detects write-write conflicts
* during backward validation. If a write-write conflict is detected and a
* conflict resolver is defined, then the conflict resolver is expected to
* make a best attempt using data type specific rules to reconcile the state
* for two versions of the same persistent identifier. If the conflict can
* not be resolved, then validation will fail. State-based conflict
* resolution when combined with validation (aka optimistic locking) is
* capable of validating the greatest number of interleavings of
* transactions (aka serialization orders).
*
* @return The conflict resolver to be applied during validation or
*         null iff no conflict resolution will be performed.
*/
public final IConflictResolver getConflictResolver() {return conflictResolver;}
/**
* When true the index will write a delete marker when an
* attempt is made to delete the entry under a key. Delete markers will be
* retained until a compacting merge of an index partition. When
* false the index entry will be removed from the index
* immediately.
*
* Delete markers MUST be enabled to use scale-out indices. Index partition
* views depend on an ordered array of {@link AbstractBTree}s. The presence
* of a delete marker provides an indication of a deleted index entry and is
* used to prevent reading of index entries for the same key which might
* exist in an earlier {@link AbstractBTree} which is part of the same index
* partition view.
*
* Delete markers MUST be enabled for transaction support where they play a
* similar role recording within the write set of the transaction the fact
* that an index entry has been deleted.
*/
public final boolean getDeleteMarkers() {return deleteMarkers;}
public final void setDeleteMarkers(final boolean deleteMarkers) {
this.deleteMarkers = deleteMarkers;
}
/**
* When true the index will maintain a per-index entry revision
* timestamp. The primary use of this is in support of transactional
* isolation. Delete markers MUST be enabled when using revision timestamps.
*
* @see #getVersionTimestampFilters()
*/
public final boolean getVersionTimestamps() {
return versionTimestamps;
}
/**
* When true the index will maintain the min/max of the per
* tuple-revision timestamp on each {@link Node} of the B+Tree. This
* information can be used to perform efficient filtering of iterators such
* that they visit only nodes and leaves having data for a specified tuple
* revision timestamp range. This filtering is efficient because it skips
* any node (and all spanned nodes or leaves) which does not have data for
* the desired revision timestamp range. In order to find all updates after
* a given timestamp revision, you specify (fromRevision,Long.MAX_VALUE). In
* order to visit the delta between two revisions, you specify
* (fromRevision, toRevision+1).
*
* Tuple revision filtering can be very efficient for some purposes. For
* example, it can be used to synchronize disconnected clients or compute
* the write set of a committed transaction. However, it requires more space
* in the {@link INodeData} records since we must store the minimum and
* maximum timestamp revision for each child of a given node.
*
* Per-tuple timestamp revisions MAY be used without support for per-tuple
* revision filtering.
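*
* A sketch (both setters are defined on this class; version timestamps
* are a prerequisite for filtering):
*
* <pre>
* md.setVersionTimestamps(true);
* md.setVersionTimestampFilters(true);
* </pre>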
*
* @see #getVersionTimestamps()
*/
public final boolean getVersionTimestampFilters() {
return versionTimestampFilters;
}
/**
* Sets {@link #versionTimestampFilters}. You MUST also set
* {@link #setVersionTimestamps(boolean)} to true for version
* timestamp filtering to be supported.
*
* @param versionTimestampFilters
*            true iff version timestamp filtering should be supported.
*/
public final void setVersionTimestampFilters(
final boolean versionTimestampFilters) {
this.versionTimestampFilters = versionTimestampFilters;
}
public final void setVersionTimestamps(final boolean versionTimestamps) {
this.versionTimestamps = versionTimestamps;
}
/**
* True iff the index supports transactional isolation (both delete markers
* and version timestamps are required).
*/
public final boolean isIsolatable() {
return deleteMarkers && versionTimestamps;
}
/**
* Convenience method sets both {@link #setDeleteMarkers(boolean)} and
* {@link #setVersionTimestamps(boolean)} at the same time.
*
* @param isolatable
*            true if delete markers and version timestamps
*            will be enabled -or- false if they will be
*            disabled.
*/
public void setIsolatable(final boolean isolatable) {
setDeleteMarkers(isolatable);
setVersionTimestamps(isolatable);
}
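/*
 * Editor's example (a sketch, continuing the one above): setIsolatable(true)
 * is shorthand for the two individual setters, after which isIsolatable()
 * reports true.
 *
 *   md.setIsolatable(true);
 *   assert md.getDeleteMarkers() && md.getVersionTimestamps();
 *   assert md.isIsolatable();
 */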
/**
* When true, the index will transparently promote large
* byte[] values associated with a key to raw records on the
* backing store. This feature is disabled by default. Indices which do use
* large records should enable this option in order to reduce their IO churn
* and disk footprint.
*
* @see #getMaxRecLen()
*/
public final boolean getRawRecords() {return rawRecords;}
/**
* Enable or disable automatic promotion of index byte[] values
* larger than a configured byte length out of the index leaf and into raw
* records on the backing persistence store. This option can significantly
* reduce the IO churn for indices which do make use of large values.
* However, the leaves will occupy slightly more space (~ 1 bit per tuple)
* if this option is enabled and none of the values stored in the index
* exceed the configured maximum value length. {@link IRabaCoder}s which
* rely on a uniform value length generally already use small values and
* should typically turn this feature off in order to make the leaf as
* compact as possible.
*
* @param rawRecords
*            true if the feature is to be enabled.
*
* @see #setMaxRecLen(int)
*/
public final void setRawRecords(final boolean rawRecords) {
this.rawRecords = rawRecords;
}
/**
* When {@link #getRawRecords()} returns true, this method
* returns the maximum byte length of a byte[] value that will be
* stored in a B+Tree leaf (default {@link Options#MAX_REC_LEN}), while
* values larger than this will be automatically converted into raw record
* references. Note that this method returns the configured value regardless
* of the value of {@link #getRawRecords()} - the caller must check
* {@link #getRawRecords()} in order to correctly interpret the value
* returned by this method.
*
* @see Options#MAX_REC_LEN
* @see Options#DEFAULT_MAX_REC_LEN
*/
public final int getMaxRecLen() {return maxRecLen;}
/**
* Set the maximum length of a byte[] value in a leaf of the
* index.
*
* @param maxRecLen
*            The maximum length of a byte[] value in a leaf of
* the index. A value of ZERO (0) may be used to force all values
* into raw records.
*
* @throws IllegalArgumentException
* if the argument is negative or greater than
* {@link Short#MAX_VALUE}
*
* @see #setRawRecords(boolean)
*/
public final void setMaxRecLen(final int maxRecLen) {
if (maxRecLen < 0 || maxRecLen > Short.MAX_VALUE)
throw new IllegalArgumentException();
this.maxRecLen = (short) maxRecLen;
}
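/*
 * Editor's example (a sketch, continuing the ones above): opting in to raw
 * record promotion so that values longer than 256 bytes are moved out of
 * the leaves and stored as raw records. The threshold of 256 is
 * illustrative; the default is Options.DEFAULT_MAX_REC_LEN.
 *
 *   md.setRawRecords(true); // opt in to raw record promotion
 *   md.setMaxRecLen(256);   // byte[] values > 256 bytes become raw records
 */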
public void setPartitionMetadata(final LocalPartitionMetadata pmd) {
this.pmd = pmd;
}
public void setNodeKeySerializer(final IRabaCoder nodeKeysCoder) {
if (nodeKeysCoder == null)
throw new IllegalArgumentException();
this.nodeKeysCoder = nodeKeysCoder;
}
public void setTupleSerializer(final ITupleSerializer tupleSer) {
if (tupleSer == null)
throw new IllegalArgumentException();
this.tupleSer = tupleSer;
}
/**
* The branching factor MAY NOT be changed once an {@link AbstractBTree}
* object has been created.
*
* @param branchingFactor
*/
public void setBranchingFactor(final int branchingFactor) {
if(branchingFactor < Options.MIN_BRANCHING_FACTOR) {
throw new IllegalArgumentException();
}
this.branchingFactor = branchingFactor;
}
public void setIndexSegmentBranchingFactor(final int branchingFactor) {
if(branchingFactor < Options.MIN_BRANCHING_FACTOR) {
throw new IllegalArgumentException();
}
this.indexSegmentBranchingFactor = branchingFactor;
}
public void setBTreeClassName(final String className) {
if (className == null)
throw new IllegalArgumentException();
this.btreeClassName = className;
}
public void setConflictResolver(final IConflictResolver conflictResolver) {
this.conflictResolver = conflictResolver;
}
/**
* Return the bloom filter factory.
*
* Bloom filters provide fast rejection for point tests in a space efficient
* manner with a configurable probability of a false positive. Since the
* bloom filter does not give positive results with 100% certainty, the
* index is tested iff the bloom filter states that the key exists.
*
* Note: Bloom filters are NOT enabled by default since point tests are not
* a bottleneck (or even used) for some indices. Also, when multiple indices
* represent different access paths for the same information, you only need
* a bloom filter on one of those indices.
*
* @return The object that will be used to configure an optional
*         bloom filter for a {@link BTree} or {@link IndexSegment}. When
*         null, the index WILL NOT use a bloom filter.
*
* @see BloomFilterFactory
* @see BloomFilterFactory#DEFAULT
*/
public BloomFilterFactory getBloomFilterFactory() {
return bloomFilterFactory;
}
/**
* Set the bloom filter factory.
*
* Bloom filters provide fast rejection for point tests in a space efficient
* manner with a configurable probability of a false positive. Since the
* bloom filter does not give positive results with 100% certainty, the
* index is tested iff the bloom filter states that the key exists.
*
* @param bloomFilterFactory
* The new value (may be null).
*
* @see BloomFilterFactory#DEFAULT
*/
public void setBloomFilterFactory(final BloomFilterFactory bloomFilterFactory) {
this.bloomFilterFactory = bloomFilterFactory;
}
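/*
 * Editor's example (a sketch): opting in to a bloom filter using the
 * default factory. Point tests will consult the filter first and only
 * touch the index when the filter reports that the key may exist.
 *
 *   md.setBloomFilterFactory(BloomFilterFactory.DEFAULT);
 *   md.setBloomFilterFactory(null); // a null factory disables the filter
 */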
/**
* An optional object that may be used to inspect, and possibly operate on,
* each index entry as it is copied into an {@link IndexSegment}.
*/
public IOverflowHandler getOverflowHandler() {
return overflowHandler;
}
public void setOverflowHandler(final IOverflowHandler overflowHandler) {
this.overflowHandler = overflowHandler;
}
/**
* Object which decides whether and where to split an index partition into 2
* or more index partitions. The default is a null reference.
* The default behavior when no split handler is specified will work for
* nearly all use cases and will result in index partitions whose size on
* the disk is bounded by the parameter specified using
* {@link OverflowManager.Options#NOMINAL_SHARD_SIZE}. Indices which require
* certain guarantees for atomicity, such as the {@link SparseRowStore},
* must override this default.
*
* @return The {@link ISimpleSplitHandler} -or- null if the
* application has not imposed any additional constraints on the
* separator keys when splitting index partitions.
*/
public ISimpleSplitHandler getSplitHandler() {
return splitHandler2;
}
public void setSplitHandler(final ISimpleSplitHandler splitHandler) {
this.splitHandler2 = splitHandler;
}
/**
* The asynchronous index write API configuration for this index.
*/
public AsynchronousIndexWriteConfiguration getAsynchronousIndexWriteConfiguration() {
return asynchronousIndexWriteConfiguration;
}
/**
* Set the asynchronous index write API configuration for this index.
*/
public void setAsynchronousIndexWriteConfiguration(
final AsynchronousIndexWriteConfiguration newVal) {
if (newVal == null)
throw new IllegalArgumentException();
this.asynchronousIndexWriteConfiguration = newVal;
}
/**
* The scatter split configuration for a scale-out index.
*/
public ScatterSplitConfiguration getScatterSplitConfiguration() {
return scatterSplitConfiguration;
}
public void setScatterSplitConfiguration(
final ScatterSplitConfiguration newVal) {
if (newVal == null)
throw new IllegalArgumentException();
this.scatterSplitConfiguration = newVal;
}
/**
* Create an instance of a class known to implement the specified interface
* from a class name.
*
* @param className
* The class name.
*
* @return An instance of that class -or- null iff the class
*         name is null.
*
* @throws RuntimeException
* if the class does not implement that interface or for any
* other reason.
*/
@SuppressWarnings("unchecked")
static private <T> T newInstance(final String className,
final Class<T> iface) {
if (iface == null)
throw new IllegalArgumentException();
if (className == null) {
return null;
}
try {
final Class<?> cls = Class.forName(className);
if (!iface.isAssignableFrom(cls)) {
throw new IllegalArgumentException("Does not implement " + cls
+ " : " + className);
}
return (T) cls.getConstructor(new Class[] {}).newInstance(
new Object[] {});
} catch (Exception e) {
throw new RuntimeException(e);
}
}
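/*
 * Editor's example (a sketch): newInstance() resolves a configured class
 * name, verifies the interface, and instantiates via the public zero-arg
 * constructor. The default front-coded key coder could be materialized as:
 *
 *   final IRabaCoder coder = newInstance(
 *           DefaultFrontCodedRabaCoder.class.getName(), IRabaCoder.class);
 */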
/**
* De-serialization constructor only - DO NOT use this ctor
* for creating a new instance! It will result in a thrown exception,
* typically from {@link #firstCheckpoint()}.
*/
public IndexMetadata() {
}
/**
* Constructor used to configure a new unnamed B+Tree. The index
* UUID is set to the given value and all other fields are defaulted as
* explained at {@link #IndexMetadata(Properties, String, UUID)}. Those
* defaults may be overridden using the various setter methods, but some
* values can not be safely overridden after the index is in use.
*
* @param indexUUID
* The indexUUID.
*
* @throws IllegalArgumentException
*             if the indexUUID is null.
*/
public IndexMetadata(final UUID indexUUID) {
this(null/* name */, indexUUID);
}
/**
* Constructor used to configure a new named {@link BTree}. The
* index UUID is set to the given value and all other fields are defaulted
* as explained at {@link #IndexMetadata(Properties, String, UUID)}. Those
* defaults may be overridden using the various setter methods, but some
* values can not be safely overridden after the index is in use.
*
* @param name
* The index name. When this is a scale-out index, the same
* name is specified for each index resource. However they
* will be registered on the journal under different names
* depending on the index partition to which they belong.
*
* @param indexUUID
* The indexUUID. The same index UUID MUST be used for all
* component indices in a scale-out index.
*
* @throws IllegalArgumentException
*             if the indexUUID is null.
*/
public IndexMetadata(final String name, final UUID indexUUID) {
this(null/* indexManager */, System.getProperties(), name, indexUUID,
IndexTypeEnum.BTree);
}
/**
* Constructor used to configure a new named B+Tree. The index UUID
* is set to the given value and all other fields are defaulted as explained
* at {@link #getProperty(Properties, String, String, String)}. Those
* defaults may be overridden using the various setter methods.
*
* @param indexManager
* Optional. When given and when the {@link IIndexManager} is a
* scale-out {@link IBigdataFederation}, this object will be used
* to interpret the {@link Options#INITIAL_DATA_SERVICE}
* property.
* @param properties
* Properties object used to override the default values for
* this {@link IndexMetadata} instance.
* @param namespace
* The index name. When this is a scale-out index, the same
* name is specified for each index resource. However they
* will be registered on the journal under different names
* depending on the index partition to which they belong.
* @param indexUUID
* The indexUUID. The same index UUID MUST be used for all
* component indices in a scale-out index.
* @param indexType
* Type-safe enumeration specifying the type of the persistence
* class data structure (historically, this was always a B+Tree).
*
* @throws IllegalArgumentException
*             if properties is null.
* @throws IllegalArgumentException
*             if indexUUID is null.
*/
public IndexMetadata(final IIndexManager indexManager,
final Properties properties, final String namespace,
final UUID indexUUID, final IndexTypeEnum indexType) {
if (indexUUID == null)
throw new IllegalArgumentException();
if (indexType == null)
throw new IllegalArgumentException();
this.name = namespace;
this.indexType = indexType;
this.indexUUID = indexUUID;
{
final String val = getProperty(indexManager, properties, namespace,
Options.INITIAL_DATA_SERVICE, null/* default */);
if (val != null) {
/*
* Attempt to interpret the value as either a UUID or the name of
* a data service joined with the federation.
*/
UUID uuid = null;
try {
uuid = UUID.fromString(val);
} catch (Throwable t) {
// Not a UUID.
if (log.isInfoEnabled())
log.info("Not a UUID: " + val);
// Ignore & fall through.
}
if (uuid == null && indexManager != null
&& indexManager instanceof IBigdataFederation) {
final IBigdataFederation<?> fed = (IBigdataFederation<?>) indexManager;
final IDataService dataService = fed
.getDataServiceByName(val);
if (dataService != null) {
try {
uuid = dataService.getServiceUUID();
} catch (IOException ex) {
log.warn("Could not get serviceUUID", ex);
// ignore and fall through.
}
}
}
this.initialDataServiceUUID = uuid;
}
}
this.branchingFactor = getProperty(indexManager, properties, namespace,
Options.BTREE_BRANCHING_FACTOR,
Options.DEFAULT_BTREE_BRANCHING_FACTOR,
new IntegerRangeValidator(Options.MIN_BRANCHING_FACTOR,
Options.MAX_BTREE_BRANCHING_FACTOR));
{
/*
* The default capacity is set dynamically based on the maximum java
* heap as specified by -Xmx on the command line. This provides
* better ergonomics, but the larger write retention queue capacity
* will only benefit applications which perform sustained writes on
* the index.
*
* Note: For now I am turning off the write retention queue capacity
* "ergonomics". I am exploring whether or not this is too
* aggressive. The advantage of the ergonomics is that it
* automatically tunes the indices for a store used for a single
* purpose, such as a KB. However, if you have a lot of open
* indices, then this is not a good idea as each open index will
* allocate a ring buffer of that capacity.
*/
final String defaultCapacity;
// final long maxMemory = Runtime.getRuntime().maxMemory();
// if (maxMemory >= Bytes.gigabyte * 10) {
// defaultCapacity = "20000";
// } else if (maxMemory >= Bytes.gigabyte * 1) {
// defaultCapacity = "8000";
// } else {
defaultCapacity = Options.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY;
// }
this.writeRetentionQueueCapacity = getProperty(indexManager,
properties, namespace,
Options.WRITE_RETENTION_QUEUE_CAPACITY,
defaultCapacity,
// Options.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY,
new IntegerRangeValidator(
Options.MIN_WRITE_RETENTION_QUEUE_CAPACITY,
Options.MAX_WRITE_RETENTION_QUEUE_CAPACITY));
}
this.writeRetentionQueueScan = getProperty(indexManager,
properties, namespace, Options.WRITE_RETENTION_QUEUE_SCAN,
Options.DEFAULT_WRITE_RETENTION_QUEUE_SCAN,
IntegerValidator.GTE_ZERO);
// this.btreeReadRetentionQueueCapacity = getProperty(indexManager,
// properties, namespace, Options.BTREE_READ_RETENTION_QUEUE_CAPACITY,
// Options.DEFAULT_BTREE_READ_RETENTION_QUEUE_CAPACITY,
// IntegerValidator.GTE_ZERO);
//
// this.btreeReadRetentionQueueScan = getProperty(indexManager,
// properties, namespace, Options.BTREE_READ_RETENTION_QUEUE_SCAN,
// Options.DEFAULT_BTREE_READ_RETENTION_QUEUE_SCAN,
// IntegerValidator.GTE_ZERO);
this.btreeRecordCompressorFactory = newInstance(getProperty(
indexManager, properties, namespace,
Options.BTREE_RECORD_COMPRESSOR_FACTORY,
Options.DEFAULT_BTREE_RECORD_COMPRESSOR_FACTORY/* default */),
IRecordCompressorFactory.class);
this.indexSegmentBranchingFactor = getProperty(indexManager,
properties, namespace, Options.INDEX_SEGMENT_BRANCHING_FACTOR,
Options.DEFAULT_INDEX_SEGMENT_BRANCHING_FACTOR,
new IntegerRangeValidator(Options.MIN_BRANCHING_FACTOR,
Options.MAX_INDEX_SEGMENT_BRANCHING_FACTOR));
this.indexSegmentBufferNodes = Boolean.parseBoolean(getProperty(
indexManager, properties, namespace,
Options.INDEX_SEGMENT_BUFFER_NODES,
Options.DEFAULT_INDEX_SEGMENT_BUFFER_NODES));
// this.indexSegmentLeafCacheCapacity = getProperty(indexManager,
// properties, namespace,
// Options.INDEX_SEGMENT_LEAF_CACHE_CAPACITY,
// Options.DEFAULT_INDEX_SEGMENT_LEAF_CACHE_CAPACITY,
// IntegerValidator.GT_ZERO);
//
// this.indexSegmentLeafCacheTimeout = getProperty(indexManager,
// properties, namespace,
// Options.INDEX_SEGMENT_LEAF_CACHE_TIMEOUT,
// Options.DEFAULT_INDEX_SEGMENT_LEAF_CACHE_TIMEOUT,
// LongValidator.GT_ZERO);
this.indexSegmentRecordCompressorFactory = newInstance(
getProperty(indexManager, properties, namespace,
Options.INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY,
Options.DEFAULT_INDEX_SEGMENT_RECORD_COMPRESSOR_FACTORY/* default */),
IRecordCompressorFactory.class);
// Note: default assumes NOT an index partition.
this.pmd = null;
/* Intern'd to reduce duplication on the heap. Will be com.bigdata.btree.BTree or
* com.bigdata.btree.IndexSegment and occasionally a class derived from BTree.
*/
this.btreeClassName = getProperty(indexManager, properties, namespace,
Options.BTREE_CLASS_NAME, BTree.class.getName()).intern();
// Intern'd to reduce duplication on the heap.
this.checkpointClassName = Checkpoint.class.getName().intern();
// this.addrSer = AddressSerializer.INSTANCE;
// this.nodeKeySer = PrefixSerializer.INSTANCE;
final Class<? extends IRabaCoder> keyRabaCoder;
if (this instanceof HTreeIndexMetadata) {
keyRabaCoder = FrontCodedRabaCoderDupKeys.class;
} else {
keyRabaCoder = DefaultFrontCodedRabaCoder.class;
}
this.nodeKeysCoder = newInstance(getProperty(indexManager, properties,
namespace, Options.NODE_KEYS_CODER,
keyRabaCoder.getName()), IRabaCoder.class);
// this.tupleSer = DefaultTupleSerializer.newInstance();
{
/*
* FIXME allow override of the keyBuilderFactory.
*
* FIXME there are a bunch of subclasses of DefaultTupleSerializer.
* In order to be able to override the specific key/value
* serialization providers we need to make the tupleSer instance
* itself a configuration parameter.
*/
final IKeyBuilderFactory keyBuilderFactory = DefaultTupleSerializer
.getDefaultKeyBuilderFactory();
final IRabaCoder leafKeysCoder = newInstance(getProperty(
indexManager, properties, namespace,
Options.LEAF_KEYS_CODER, keyRabaCoder
.getName()), IRabaCoder.class);
final IRabaCoder valuesCoder = newInstance(getProperty(
indexManager, properties, namespace,
Options.LEAF_VALUES_CODER, CanonicalHuffmanRabaCoder.class
.getName()), IRabaCoder.class);
this.tupleSer = new DefaultTupleSerializer(keyBuilderFactory,
leafKeysCoder, valuesCoder);
}
this.conflictResolver = null;
// this.childLocks = Boolean.parseBoolean(getProperty(
// indexManager, properties, namespace, Options.CHILD_LOCKS,
// Options.DEFAULT_CHILD_LOCKS));
this.deleteMarkers = false;
this.versionTimestamps = false;
this.versionTimestampFilters = false;
/*
* Default to false for new indices. This follows the same principle of
* requiring people to opt in for special features. Many indices tend to
* always use small records and this option represents overhead for such
* indices. Indices which do use large records should enable this option
* in order to reduce their IO churn and disk footprint.
*/
this.rawRecords = false;
this.maxRecLen = Short.parseShort(getProperty(indexManager,
properties, namespace, Options.MAX_REC_LEN,
Options.DEFAULT_MAX_REC_LEN));
// Note: May be used to force testing with raw records.
// this.rawRecords = true;
// this.maxRecLen = 1;
// optional bloom filter setup.
final boolean bloomFilter = Boolean.parseBoolean(getProperty(
indexManager, properties, namespace, Options.BLOOM_FILTER,
Options.DEFAULT_BLOOM_FILTER));
this.bloomFilterFactory = bloomFilter ? BloomFilterFactory.DEFAULT
: null;
// Note: by default there is no overflow handler.
this.overflowHandler = null;
// split handler setup (used iff scale-out).
{
// final int minimumEntryCount = Integer.parseInt(getProperty(
// indexManager, properties, namespace,
// Options.SPLIT_HANDLER_MIN_ENTRY_COUNT,
// Options.DEFAULT_SPLIT_HANDLER_MIN_ENTRY_COUNT));
//
// final int entryCountPerSplit = Integer.parseInt(getProperty(
// indexManager, properties, namespace,
// Options.SPLIT_HANDLER_ENTRY_COUNT_PER_SPLIT,
// Options.DEFAULT_SPLIT_HANDLER_ENTRY_COUNT_PER_SPLIT));
//
// final double overCapacityMultiplier = Double.parseDouble(getProperty(
// indexManager, properties, namespace,
// Options.SPLIT_HANDLER_OVER_CAPACITY_MULTIPLIER,
// Options.DEFAULT_SPLIT_HANDLER_OVER_CAPACITY_MULTIPLIER));
//
// final double underCapacityMultiplier = Double.parseDouble(getProperty(
// indexManager, properties, namespace,
// Options.SPLIT_HANDLER_UNDER_CAPACITY_MULTIPLIER,
// Options.DEFAULT_SPLIT_HANDLER_UNDER_CAPACITY_MULTIPLIER));
//
// final int sampleRate = Integer.parseInt(getProperty(
// indexManager, properties, namespace,
// Options.SPLIT_HANDLER_SAMPLE_RATE,
// Options.DEFAULT_SPLIT_HANDLER_SAMPLE_RATE));
//
// this.splitHandler = new DefaultSplitHandler(//
// minimumEntryCount, //
// entryCountPerSplit, //
// overCapacityMultiplier, //
// underCapacityMultiplier, //
// sampleRate //
// );
/*
* Note: The default behavior when no split handler is specified
* will work for nearly all use cases and will result in index
* partitions whose size on the disk is bounded by the parameter
* specified to the OverflowManager class. Indices which require
* certain guarantees for atomicity, such as the SparseRowStore or
* the SPO index, must override this default.
*/
this.splitHandler2 = null;
}
/*
* Asynchronous index write API configuration.
*/
{
final int masterQueueCapacity = Integer.parseInt(getProperty(
indexManager, properties, namespace,
Options.MASTER_QUEUE_CAPACITY,
Options.DEFAULT_MASTER_QUEUE_CAPACITY));
final int masterChunkSize = Integer.parseInt(getProperty(
indexManager, properties, namespace,
Options.MASTER_CHUNK_SIZE,
Options.DEFAULT_MASTER_CHUNK_SIZE));
final long masterChunkTimeoutNanos = Long.parseLong(getProperty(
indexManager, properties, namespace,
Options.MASTER_CHUNK_TIMEOUT_NANOS,
Options.DEFAULT_MASTER_CHUNK_TIMEOUT_NANOS));
final long sinkIdleTimeoutNanos = Long.parseLong(getProperty(
indexManager, properties, namespace,
Options.SINK_IDLE_TIMEOUT_NANOS,
Options.DEFAULT_SINK_IDLE_TIMEOUT_NANOS));
final long sinkPollTimeoutNanos = Long.parseLong(getProperty(
indexManager, properties, namespace,
Options.SINK_POLL_TIMEOUT_NANOS,
Options.DEFAULT_SINK_POLL_TIMEOUT_NANOS));
final int sinkQueueCapacity = Integer.parseInt(getProperty(
indexManager, properties, namespace,
Options.SINK_QUEUE_CAPACITY,
Options.DEFAULT_SINK_QUEUE_CAPACITY));
final int sinkChunkSize = Integer.parseInt(getProperty(
indexManager, properties, namespace,
Options.SINK_CHUNK_SIZE,
Options.DEFAULT_SINK_CHUNK_SIZE));
final long sinkChunkTimeoutNanos = Long.parseLong(getProperty(
indexManager, properties, namespace,
Options.SINK_CHUNK_TIMEOUT_NANOS,
Options.DEFAULT_SINK_CHUNK_TIMEOUT_NANOS));
this.asynchronousIndexWriteConfiguration = new AsynchronousIndexWriteConfiguration(
masterQueueCapacity,//
masterChunkSize,//
masterChunkTimeoutNanos,//
sinkIdleTimeoutNanos,//
sinkPollTimeoutNanos,//
sinkQueueCapacity,//
sinkChunkSize,//
sinkChunkTimeoutNanos//
);
}
// Scatter-split configuration
{
final boolean scatterSplitEnabled = Boolean
.parseBoolean(getProperty(indexManager, properties,
namespace, Options.SCATTER_SPLIT_ENABLED,
Options.DEFAULT_SCATTER_SPLIT_ENABLED));
final double scatterSplitPercentOfSplitThreshold = Double
.parseDouble(getProperty(
indexManager,
properties,
namespace,
Options.SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD,
Options.DEFAULT_SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD));
final int scatterSplitDataServicesCount = Integer
.parseInt(getProperty(indexManager, properties, namespace,
Options.SCATTER_SPLIT_DATA_SERVICE_COUNT,
Options.DEFAULT_SCATTER_SPLIT_DATA_SERVICE_COUNT));
final int scatterSplitIndexPartitionsCount = Integer
.parseInt(getProperty(indexManager, properties, namespace,
Options.SCATTER_SPLIT_INDEX_PARTITION_COUNT,
Options.DEFAULT_SCATTER_SPLIT_INDEX_PARTITION_COUNT));
this.scatterSplitConfiguration = new ScatterSplitConfiguration(
scatterSplitEnabled, scatterSplitPercentOfSplitThreshold,
scatterSplitDataServicesCount,
scatterSplitIndexPartitionsCount);
}
if (log.isInfoEnabled())
log.info(toString());
}
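/*
 * Editor's example (a sketch): overriding defaults through a Properties
 * object rather than setter calls. The values shown are illustrative.
 *
 *   final Properties properties = new Properties();
 *   properties.setProperty(Options.BTREE_BRANCHING_FACTOR, "64");
 *   properties.setProperty(Options.BLOOM_FILTER, "true");
 *   final IndexMetadata md = new IndexMetadata(null, // no index manager
 *           properties, "example", UUID.randomUUID(), IndexTypeEnum.BTree);
 */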
/**
* Write out the metadata record for the btree on the store and return the
* address.
*
* @param store
* The store on which the metadata record is being written.
*
* The address of the metadata record is set on this object as a
* side effect.
*
* @throws IllegalStateException
* if the record has already been written on the store.
* @throws IllegalStateException
*             if the {@link #indexUUID} field is null - this
* generally indicates that you used the de-serialization
* constructor rather than one of the constructor variants that
* accept the required UUID parameter.
*/
public void write(final IRawStore store) {
if (addrMetadata != 0L) {
throw new IllegalStateException("Already written.");
}
if (indexUUID == null) {
throw new IllegalStateException("No indexUUID : wrong constructor?");
}
// write on the store, setting address as side-effect.
this.addrMetadata = store.write(ByteBuffer.wrap(SerializerUtil
.serialize(this)));
}
/**
* Read the metadata record from the store.
*
* @param store
* the store.
* @param addr
* the address of the metadata record.
*
* @return the metadata record. The address from which it was loaded is set
* on the metadata record as a side-effect.
*/
public static IndexMetadata read(final IRawStore store, final long addr) {
final IndexMetadata metadata = (IndexMetadata) SerializerUtil
.deserialize(store.read(addr));
// save the address from which the metadata record was loaded.
metadata.addrMetadata = addr;
return metadata;
}
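/*
 * Editor's example (a sketch): round-tripping the metadata record through
 * an IRawStore. write() records the address in the private addrMetadata
 * field as a side effect; how that address is exposed to callers is not
 * shown in this excerpt, so "addr" below is assumed to be available.
 *
 *   md.write(store);
 *   final IndexMetadata md2 = IndexMetadata.read(store, addr);
 */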
/**
* A human readable representation of the metadata record.
*/
public String toString() {
final StringBuilder sb = new StringBuilder();
// transient
sb.append("addrMetadata=" + addrMetadata);
// persistent
sb.append(", name=" + (name == null ? "N/A" : name));
sb.append(", indexType=" + indexType);
sb.append(", indexUUID=" + indexUUID);
if (initialDataServiceUUID != null) {
sb.append(", initialDataServiceUUID=" + initialDataServiceUUID);
}
sb.append(", branchingFactor=" + branchingFactor);
sb.append(", pmd=" + pmd);
sb.append(", btreeClassName=" + btreeClassName);
sb.append(", checkpointClass=" + checkpointClassName);
// sb.append(", childAddrSerializer=" + addrSer.getClass().getName());
sb.append(", nodeKeysCoder=" + nodeKeysCoder);//.getClass().getName());
sb.append(", btreeRecordCompressorFactory="
+ (btreeRecordCompressorFactory == null ? "N/A"
: btreeRecordCompressorFactory));
sb.append(", tupleSerializer=" + tupleSer);//.getClass().getName());
sb.append(", conflictResolver="
+ (conflictResolver == null ? "N/A" : conflictResolver
.getClass().getName()));
sb.append(", deleteMarkers=" + deleteMarkers);
sb.append(", versionTimestamps=" + versionTimestamps);
sb.append(", versionTimestampFilters=" + versionTimestampFilters);
sb.append(", isolatable=" + isIsolatable());
sb.append(", rawRecords=" + rawRecords);
sb.append(", maxRecLen=" + maxRecLen);
sb.append(", bloomFilterFactory=" + (bloomFilterFactory == null ? "N/A"
: bloomFilterFactory.toString()));
sb.append(", overflowHandler="
+ (overflowHandler == null ? "N/A" : overflowHandler.getClass()
.getName()));
sb.append(", splitHandler="
+ (splitHandler2 == null ? "N/A" : splitHandler2.toString()));
sb.append(", indexSegmentBranchingFactor=" + indexSegmentBranchingFactor);
sb.append(", indexSegmentBufferNodes=" + indexSegmentBufferNodes);
// sb.append(", indexSegmentLeafCacheCapacity=" + indexSegmentLeafCacheCapacity);
// sb.append(", indexSegmentLeafCacheTimeout=" + indexSegmentLeafCacheTimeout);
sb.append(", indexSegmentRecordCompressorFactory="
+ (indexSegmentRecordCompressorFactory == null ? "N/A"
: indexSegmentRecordCompressorFactory));
sb.append(", asynchronousIndexWriteConfiguration=" + asynchronousIndexWriteConfiguration);
sb.append(", scatterSplitConfiguration=" + scatterSplitConfiguration);
toString(sb); // extension hook
return sb.toString();
}
/**
* Extension hook for {@link #toString()}.
*
* @param sb
* Where to write additional metadata.
*/
protected void toString(final StringBuilder sb) {
// NOP
}
/**
* The initial version.
*/
private static transient final int VERSION0 = 0x0;
/**
* This version adds support for {@link ILeafData#getRawRecord(int)} and
* {@link IndexMetadata#getRawRecords()} will report false for
* earlier versions and {@link IndexMetadata#getMaxRecLen()} will report
* {@link Options#DEFAULT_MAX_REC_LEN}.
*/
private static transient final int VERSION1 = 0x1;
/**
* This version adds support for {@link HTree}. This includes
* {@link #addressBits} and {@link #htreeClassName}.
*/
private static transient final int VERSION2 = 0x2;
/**
* This version adds support for a fixed length key option for the
* {@link HTree} using {@link #keyLen}.
*/
private static transient final int VERSION3 = 0x3;
/**
* This version moves the {@link HTree} specific metadata into a derived
* class. Prior to this version, the {@link HTree} was not used in a durable
* context. Thus, there is no need to recover HTree specific index metadata
* records before {@link #VERSION4}. This version also introduces the
* {@link #indexType} field. This field defaults to
* {@link IndexTypeEnum#BTree} for all prior versions.
*/
private static transient final int VERSION4 = 0x4;
/**
* The version that will be serialized by this class.
*/
private static transient final int CURRENT_VERSION = VERSION4;
/**
* The actual version as set by {@link #readExternal(ObjectInput)} and
* {@link #writeExternal(ObjectOutput)}.
*/
private transient int version;
/**
* @todo review generated record for compactness.
*/
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
final int version = this.version = (int) LongPacker.unpackLong(in);
switch (version) {
case VERSION0:
case VERSION1:
case VERSION2:
case VERSION3:
case VERSION4:
// case VERSION5:
// case VERSION6:
// case VERSION7:
// case VERSION8:
// case VERSION9:
// case VERSION10:
break;
default:
throw new IOException("Unknown version: version=" + version);
}
final boolean hasName = in.readBoolean();
if (hasName) {
name = in.readUTF();
}
if (version >= VERSION4) {
indexType = IndexTypeEnum.valueOf(in.readShort());
} else {
indexType = IndexTypeEnum.BTree;
}
indexUUID = new UUID(in.readLong()/* MSB */, in.readLong()/* LSB */);
branchingFactor = (int) LongPacker.unpackLong(in);
writeRetentionQueueCapacity = (int) LongPacker.unpackLong(in);
writeRetentionQueueScan = (int) LongPacker.unpackLong(in);
// if (version < VERSION7) {
//
// /* btreeReadRetentionQueueCapacity = (int) */LongPacker
// .unpackLong(in);
//
// /* btreeReadRetentionQueueScan = (int) */LongPacker.unpackLong(in);
//
// }
pmd = (LocalPartitionMetadata) in.readObject();
btreeClassName = in.readUTF();
checkpointClassName = in.readUTF();
nodeKeysCoder = (IRabaCoder) in.readObject();
tupleSer = (ITupleSerializer) in.readObject();
btreeRecordCompressorFactory = (IRecordCompressorFactory) in
.readObject();
conflictResolver = (IConflictResolver) in.readObject();
deleteMarkers = in.readBoolean();
if (version >= VERSION1) {
rawRecords = in.readBoolean();
maxRecLen = in.readShort();
} else {
rawRecords = false;
maxRecLen = Short.parseShort(Options.DEFAULT_MAX_REC_LEN);
}
versionTimestamps = in.readBoolean();
versionTimestampFilters = in.readBoolean();
bloomFilterFactory = (BloomFilterFactory) in.readObject();
overflowHandler = (IOverflowHandler) in.readObject();
splitHandler2 = (ISimpleSplitHandler) in.readObject();
/*
* IndexSegment.
*/
indexSegmentBranchingFactor = (int) LongPacker.unpackLong(in);
indexSegmentBufferNodes = in.readBoolean();
indexSegmentRecordCompressorFactory = (IRecordCompressorFactory) in
.readObject();
asynchronousIndexWriteConfiguration = (AsynchronousIndexWriteConfiguration) in
.readObject();
scatterSplitConfiguration = (ScatterSplitConfiguration) in.readObject();
if (version >= VERSION2 && version < VERSION4) {
/*
* These data were moved into the HTreeIndexMetadata subclass
* in VERSION4. The HTree was only used against the memory
* manager before VERSION4. Therefore, we never have durable
* data for an HTree before VERSION4.
*/
if (version >= VERSION3) {
// keyLen
LongPacker.unpackInt(in);
}
// addressBits
LongPacker.unpackInt(in);
// htreeClassName
in.readUTF();
}
}
public void writeExternal(final ObjectOutput out) throws IOException {
final int version = CURRENT_VERSION;
LongPacker.packLong(out, version);
// hasName?
out.writeBoolean(name != null);
// the name
if (name != null) {
out.writeUTF(name);
}
if (version >= VERSION4) {
out.writeShort(indexType.getCode());
}
out.writeLong(indexUUID.getMostSignificantBits());
out.writeLong(indexUUID.getLeastSignificantBits());
LongPacker.packLong(out, branchingFactor);
LongPacker.packLong(out, writeRetentionQueueCapacity);
LongPacker.packLong(out, writeRetentionQueueScan);
// Note: gone with version7.
// LongPacker.packLong(out, btreeReadRetentionQueueCapacity);
// LongPacker.packLong(out, btreeReadRetentionQueueScan);
out.writeObject(pmd);
out.writeUTF(btreeClassName);
out.writeUTF(checkpointClassName);
out.writeObject(nodeKeysCoder);
out.writeObject(tupleSer);
out.writeObject(btreeRecordCompressorFactory);
out.writeObject(conflictResolver);
out.writeBoolean(deleteMarkers);
if (version >= VERSION1) {
out.writeBoolean(rawRecords);
out.writeShort(maxRecLen);
}
out.writeBoolean(versionTimestamps);
out.writeBoolean(versionTimestampFilters);
out.writeObject(bloomFilterFactory);
out.writeObject(overflowHandler);
out.writeObject(splitHandler2);
/*
* IndexSegment.
*/
LongPacker.packLong(out, indexSegmentBranchingFactor);
out.writeBoolean(indexSegmentBufferNodes);
// Note: must match readExternal(), which reads the IndexSegment factory here.
out.writeObject(indexSegmentRecordCompressorFactory);
// introduced in VERSION1
out.writeObject(asynchronousIndexWriteConfiguration);
// introduced in VERSION2
out.writeObject(scatterSplitConfiguration);
// if (version >= VERSION2) {
//
// if (version >= VERSION3) {
//
// LongPacker.packLong(out, keyLen);
//
// }
//
// LongPacker.packLong(out, addressBits);
//
// out.writeUTF(htreeClassName);
//
// }
}
/**
* Makes a copy of the persistent data, clearing the address of the
* {@link IndexMetadata} record on the cloned copy.
*
* @return The cloned copy.
*/
public IndexMetadata clone() {
try {
final IndexMetadata copy = (IndexMetadata) super.clone();
copy.addrMetadata = 0L;
return copy;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
/**
* Create an initial {@link Checkpoint} for a new persistence capable data
* structure described by this metadata record.
*
* The caller is responsible for writing the {@link Checkpoint} record onto
* the store.
*
* The class identified by {@link #getCheckpointClassName()} MUST declare a
* public constructor with the following method signature:
*
* <pre>
* ...( IndexMetadata metadata )
* </pre>
*
* @return The {@link Checkpoint}.
*/
@SuppressWarnings("unchecked")
final public Checkpoint firstCheckpoint() {
final String checkpointClassName = getCheckpointClassName();
if (checkpointClassName == null) {
/*
* This exception can be thrown if you originally created the
* IndexMetadata object using the zero argument constructor. That
* form of the constructor is only for deserialization and as such
* it does not set any properties.
*/
throw new RuntimeException(
"checkpointClassName not set: did you use the deserialization constructor by mistake?");
}
try {
final Class<?> cl = Class.forName(checkpointClassName);
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor.
*/
final Constructor<?> ctor = cl.getConstructor(new Class[] {//
IndexMetadata.class//
});
final Checkpoint checkpoint = (Checkpoint) ctor
.newInstance(new Object[] { this });
return checkpoint;
} catch(Exception ex) {
throw new RuntimeException(ex);
}
}
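/*
 * Editor's example (a sketch): bootstrapping a new index, assuming "store"
 * is an open IRawStore. Per the javadoc above, the caller must write the
 * checkpoint record onto the store itself.
 *
 *   md.write(store);                             // persist the metadata
 *   final Checkpoint checkpoint = md.firstCheckpoint();
 */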
/**
* Variant used when an index overflows onto a new backing store.
*
* The caller is responsible for writing the {@link Checkpoint} record onto
* the store.
*
* The class identified by {@link #getCheckpointClassName()} MUST declare a
* public constructor with the following method signature:
*
* <pre>
* ...( IndexMetadata metadata, Checkpoint oldCheckpoint )
* </pre>
*
* @param oldCheckpoint
* The last checkpoint for the index of the old backing store.
*
* @return The first {@link Checkpoint} for the index on the new backing
* store.
*
* @throws IllegalArgumentException
*             if the oldCheckpoint is null.
*/
@SuppressWarnings("unchecked")
final public Checkpoint overflowCheckpoint(final Checkpoint oldCheckpoint) {
if (oldCheckpoint == null) {
throw new IllegalArgumentException();
}
try {
final Class<?> cl = Class.forName(getCheckpointClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor.
*/
final Constructor<?> ctor = cl.getConstructor(new Class[] {
IndexMetadata.class, //
Checkpoint.class//
});
final Checkpoint checkpoint = (Checkpoint) ctor
.newInstance(new Object[] { //
this, //
oldCheckpoint //
});
// sanity check makes sure the counter is propagated to the new store.
assert checkpoint.getCounter() == oldCheckpoint.getCounter();
return checkpoint;
} catch(Exception ex) {
throw new RuntimeException(ex);
}
}
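/*
 * Editor's example (a sketch): a custom checkpoint class must declare both
 * public constructors that are invoked reflectively above. The super(...)
 * calls assume Checkpoint exposes matching constructors to subclasses.
 *
 *   public class MyCheckpoint extends Checkpoint {
 *       public MyCheckpoint(IndexMetadata metadata) { super(metadata); }
 *       public MyCheckpoint(IndexMetadata metadata, Checkpoint oldCheckpoint) {
 *           super(metadata, oldCheckpoint);
 *       }
 *   }
 */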
/**
*
* Factory for thread-safe {@link IKeyBuilder} objects for use by
* {@link ITupleSerializer#serializeKey(Object)} and possibly others.
*
*
* Note: A mutable B+Tree is always single-threaded. However, read-only
* B+Trees allow concurrent readers. Therefore, the thread-safety requirement
* covers either a single writer -or- concurrent readers.
*
*
* Note: If you change this value in a manner that is not backward
* compatible once entries have been written on the index, then you may be
* unable to read any data already written.
*
*
* Note: This method delegates to {@link ITupleSerializer#getKeyBuilder()}.
* This {@link IKeyBuilder} SHOULD be used to form all keys for this
* index. This is critical for indices that have Unicode data in their
* application keys as the formation of Unicode sort keys from Unicode data
* depends on the {@link IKeyBuilderFactory}. If you use a locally
* configured {@link IKeyBuilder} then your Unicode keys will be encoded
* based on the {@link Locale} configured for the JVM NOT the factory
* specified for this index.
*
*/
@Override
public IKeyBuilder getKeyBuilder() {
return getTupleSerializer().getKeyBuilder();
}
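/*
 * Editor's example (a sketch): always form keys through the index's own
 * IKeyBuilder so Unicode collation matches the configured factory. The
 * reset()/append()/getKey() chain reflects common IKeyBuilder usage and is
 * an assumption, not verified against this excerpt.
 *
 *   final IKeyBuilder keyBuilder = md.getKeyBuilder();
 *   final byte[] key = keyBuilder.reset().append("somekey").getKey();
 */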
@Override
public IKeyBuilder getPrimaryKeyBuilder() {
return getTupleSerializer().getPrimaryKeyBuilder();
}
/**
* @see Configuration#getProperty(IIndexManager, Properties, String, String,
* String)
*/
protected String getProperty(final IIndexManager indexManager,
final Properties properties, final String namespace,
final String globalName, final String defaultValue) {
return Configuration.getProperty(indexManager, properties, namespace,
globalName, defaultValue);
}
/**
* @see Configuration#getProperty(IIndexManager, Properties, String, String,
* String, IValidator)
*/
protected <E> E getProperty(final IIndexManager indexManager,
final Properties properties, final String namespace,
final String globalName, final String defaultValue,
final IValidator<E> validator) {
return Configuration.getProperty(indexManager, properties, namespace,
globalName, defaultValue, validator);
}
}