// com.bigdata.btree.IndexSegmentPlan (Maven / Gradle / Ivy)
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.btree;
import org.apache.log4j.Logger;
import com.bigdata.btree.IndexMetadata.Options;
/**
* A plan for building a B+-Tree based on an input branching factor and #of
* entries.
*
* @author Bryan Thompson
* @version $Id$
*/
public class IndexSegmentPlan {
/**
 * Logger for this class. The constructor uses it to report the computed
 * plan (or the empty-tree special case) when INFO logging is enabled.
 */
protected static final transient Logger log = Logger
.getLogger(IndexSegmentPlan.class);
/**
 * A summary representation of the index build plan. The branching factor
 * and the #of entries are the inputs. The outputs include the height of
 * the B+Tree that should be generated and the #of nodes and leaves that
 * will exist in that B+Tree.
 *
 * @return The summary, formatted as
 *         <code>class{branchingFactor=..., nentries=..., height=...,
 *         nnodes=..., nleaves=...}</code>.
 */
@Override
public String toString() {
    return getClass() + "{branchingFactor=" + m + ", nentries=" + nentries
            + ", height=" + height + ", nnodes=" + nnodes + ", nleaves="
            + nleaves + "}";
}
/**
 * The branching factor of the output tree (input).
 */
public final int m;

/**
 * The minimum capacity: the minimum #of values that may be placed into a
 * non-root leaf and also the minimum #of children that may be placed into
 * a non-root node. The constructor computes this as <code>(m+1)/2</code>.
 */
public final int m2;

/**
 * The #of entries in the btree (input).
 */
public final long nentries;

/**
 * The #of leaves that will exist in the output tree. When nleaves == 1
 * the output tree will consist of a root leaf. In this case we do not
 * open a temporary file for the nodes since there will not be any.
 */
public final long nleaves;

/**
 * The #of non-leaf nodes in the output tree (zero when the output tree is
 * just a root leaf).
 */
public final long nnodes;

/**
 * The height of the output tree. A height of zero means that the tree
 * consists of a root leaf only; the leaves are always found at level
 * [height].
 */
public final int height;

/**
 * The #of entries to place into each leaf. The array is dimensioned to
 * {@link #nleaves}. For a non-empty tree this is the same array as the
 * last element of {@link #numInNode}.
 */
public final int[] numInLeaf;

/**
 * The #of nodes at each level of the tree, including the level containing
 * the leaves. The array is indexed by level, from zero (the root) through
 * [height] (the leaves).
 *
 * @see #nleaves nleaves, which is the #of leaves in the output tree.
 */
public final long[] numInLevel;

/**
 * The #of children / values to place into each node in each level of the
 * output tree. The first index is the level in the tree, starting from
 * level zero which is the root and increasing through level [height],
 * which is the level containing the leaves of the output tree.
 *
 * @see #numInLeaf numInLeaf, which holds the per-leaf counts found in the
 *      last element of this array.
 */
public final int[][] numInNode;
/**
 * Create a plan for building a B+-Tree. The plan has only these two inputs.
 * Everything else about the plan is deterministic based on those values.
 * <p>
 * The #of leaves and the #of nodes per level are computed using exact
 * integer arithmetic so that very large entry counts are neither rounded
 * (as they would be by floating point division) nor silently clamped to
 * {@link Integer#MAX_VALUE}.
 *
 * @param m
 *            The branching factor of the output tree (#of keys/values for a
 *            leaf or the #of children for a node).
 * @param nentries
 *            The #of entries in the tree.
 *
 * @throws IllegalArgumentException
 *             if the branching factor is less than
 *             {@value Options#MIN_BRANCHING_FACTOR}.
 * @throws IllegalArgumentException
 *             if the #of index entries is negative (zero is allowed as a
 *             special case).
 * @throws IllegalArgumentException
 *             if {@link #distributeKeys(int, int, long, long)} rejects the
 *             derived arguments (for example, when the plan would require
 *             more than {@link Integer#MAX_VALUE} leaves).
 * @throws UnsupportedOperationException
 *             if {@link #getMinimumHeight(int, long)} can not find a height
 *             which addresses the required #of leaves.
 */
public IndexSegmentPlan(final int m, final long nentries) {

    if (m < Options.MIN_BRANCHING_FACTOR)
        throw new IllegalArgumentException("branchingFactor=" + m
                + " is less than the minimum of "
                + Options.MIN_BRANCHING_FACTOR);

    if (nentries < 0)
        throw new IllegalArgumentException("nentries=" + nentries
                + " is negative");

    // The branching factor of the output tree.
    this.m = m;

    // The #of entries in the btree.
    this.nentries = nentries;

    // The minimum capacity of a leaf (or a node).
    m2 = (m + 1) / 2;

    if (nentries == 0) {

        /*
         * Special case for an empty tree: a single empty root leaf and no
         * nodes.
         */

        if (log.isInfoEnabled())
            log.info("Empty tree.");

        nleaves = 1;
        height = 0;
        numInLeaf = new int[]{0};
        numInNode = new int[][]{new int[]{0}};
        numInLevel = new long[]{1};
        nnodes = 0;

        return;

    }

    /*
     * The #of leaves in the output tree: ceiling(nentries / m).
     *
     * Note: This is computed with long arithmetic. Going through a double
     * and an (int) cast would lose precision for large values and would
     * clamp the result to Integer.MAX_VALUE, which hides the overflow from
     * the range check performed by distributeKeys().
     */
    nleaves = nentries / m + (nentries % m == 0 ? 0 : 1);

    // The height of the output tree.
    height = getMinimumHeight(m, nleaves);

    if (log.isInfoEnabled())
        log.info("branchingFactor=" + m + ", nentries=" + nentries
                + ", nleaves=" + nleaves + ", height=" + height);

    // #of entries in each leaf.
    numInLeaf = distributeKeys(m, m2, nleaves, nentries);

    /*
     * Figure out how many nodes are in each level of the output tree. We
     * start from the leaves and compute the #of nodes required to hold that
     * many child references.
     */

    numInNode = new int[height + 1][];

    numInLevel = new long[height + 1];

    /*
     * The first time through this loop the #of children is initialized to
     * the #of leaves. Thereafter it is [numThisLevel] for the previous
     * level.
     */
    long nchildren = nleaves;

    // Running total of the non-leaf nodes (kept as a long, like the field).
    long nodeCount = 0L;

    for (int h = height - 1; h >= 0; h--) {

        /*
         * Compute the minimum #of nodes required to hold the references for
         * the children of the level in the tree beneath this one:
         * ceiling(nchildren / m).
         */
        final long numThisLevel = nchildren / m
                + (nchildren % m == 0 ? 0 : 1);

        numInLevel[h] = numThisLevel;

        /*
         * Distribute the children among the nodes allocated for this level.
         */
        numInNode[h] = distributeChildren(m, m2, numThisLevel, nchildren);

        nchildren = numThisLevel;

        nodeCount += numThisLevel;

    }

    // The last level holds the leaves.
    numInNode[height] = numInLeaf;

    numInLevel[height] = nleaves;

    this.nnodes = nodeCount;

}
/**
 * Chooses the minimum height for a tree having a specified branching factor
 * and a specified #of leaves. A tree of height <code>h</code> can address
 * at most <code>m^h</code> leaves, so this returns the smallest
 * <code>h</code> in <code>[0:maxHeight]</code> such that
 * <code>m^h &gt;= nleaves</code>.
 * <p>
 * The comparison is performed with exact integer arithmetic. Comparing a
 * <code>double</code> power against the <code>long</code> leaf count would
 * round both sides once they exceed 2^53 and could report a height whose
 * capacity is actually smaller than <i>nleaves</i>.
 *
 * @param m
 *            The branching factor.
 * @param nleaves
 *            The #of leaves that must be addressable by the tree.
 *
 * @return The minimum height (zero means that a root leaf suffices).
 *
 * @throws UnsupportedOperationException
 *             if it is not possible to build a B+Tree with that branching
 *             factor and that many leaves without exceeding maxHeight
 *             (statically configured to 10).
 */
public static int getMinimumHeight(final int m, final long nleaves) {

    final int maxHeight = 10;

    final java.math.BigInteger branchingFactor = java.math.BigInteger
            .valueOf(m);

    final java.math.BigInteger required = java.math.BigInteger
            .valueOf(nleaves);

    /*
     * The maximum #of leaves addressable by a tree of height h and the
     * given branching factor (m^h). This starts at m^0 == 1 and is
     * multiplied by m each time we consider the next height.
     */
    java.math.BigInteger capacity = java.math.BigInteger.ONE;

    for (int h = 0; h <= maxHeight; h++) {

        if (capacity.compareTo(required) >= 0) {

            /*
             * h is the smallest height tree of the specified branching
             * factor m capable of addressing the specified #of leaves.
             */
            return h;

        }

        capacity = capacity.multiply(branchingFactor);

    }

    throw new UnsupportedOperationException(
            "Can not build tree with height less than " + maxHeight
                    + " given branchingFactor=" + m + ", nleaves="
                    + nleaves);

}
/**
 * Distributes the keys among the leaves.
 * <p>
 * We want to fill up every leaf, but we have to make sure that the last
 * leaf is not under capacity. To that end, we calculate the #of entries
 * that would remain if we filled up n-1 leaves completely. If the #of
 * remaining entries is less than the minimum capacity of a leaf, then we
 * have to adjust the allocation of entries such that the last leaf is at
 * its minimum capacity. This is done by computing the shortage and then
 * distributing that shortage among the preceding leaves, one entry at a
 * time, working backwards from the next-to-last leaf and wrapping around
 * as often as necessary.
 *
 * @param m
 *            The branching factor in the output tree.
 * @param m2
 *            The minimum capacity for a leaf in the output tree, which is
 *            computed as (m+1)/2.
 * @param nleaves
 *            The #of leaves in the output tree.
 * @param nentries
 *            The #of entries to be inserted into the output tree.
 *
 * @return An array indicating how many entries should be inserted into each
 *         leaf of the output tree. The array index is the leaf order
 *         (origin zero). The value is the capacity to which that leaf
 *         should be filled.
 *
 * @throws IllegalArgumentException
 *             if there is a problem with the arguments: <i>m</i> is outside
 *             of the permitted branching factor range, <i>m2</i> is not in
 *             <code>[(m+1)/2:m]</code>, <i>nleaves</i> is not in
 *             <code>[1:Integer.MAX_VALUE]</code>, <i>nentries</i> is not
 *             positive, or there is a single (root) leaf and <i>nentries</i>
 *             exceeds <i>m</i>.
 *
 * @see TestIndexSegmentPlan
 * @see TestIndexSegmentBuilderWithSmallTree#test_problem3_buildOrder3()
 */
public static int[] distributeKeys(final int m, final int m2,
        final long nleaves, final long nentries) {

    if (m < Options.MIN_BRANCHING_FACTOR)
        throw new IllegalArgumentException("m=" + m
                + " is less than the minimum branching factor of "
                + Options.MIN_BRANCHING_FACTOR);

    if (m > Options.MAX_INDEX_SEGMENT_BRANCHING_FACTOR)
        throw new IllegalArgumentException("m=" + m
                + " is greater than the maximum branching factor of "
                + Options.MAX_INDEX_SEGMENT_BRANCHING_FACTOR);

    if (m2 < (m + 1) / 2)
        throw new IllegalArgumentException("m2=" + m2
                + " is less than (m+1)/2 for m=" + m);

    if (m2 > m)
        throw new IllegalArgumentException("m2=" + m2
                + " is greater than m=" + m);

    if (nleaves <= 0)
        throw new IllegalArgumentException("nleaves=" + nleaves
                + " must be positive");

    if (nleaves > Integer.MAX_VALUE) {
        /*
         * Note: We can not build a plan with more than MAX_INT leaves since
         * that would require an array with an int64 index.
         */
        throw new IllegalArgumentException("nleaves=" + nleaves
                + " exceeds " + Integer.MAX_VALUE);
    }

    if (nentries <= 0)
        throw new IllegalArgumentException("nentries=" + nentries
                + " must be positive");

    if (nleaves == 1) {

        /*
         * If there is just a root leaf then any number (up to the leafs
         * capacity) will fit into that root leaf.
         */

        if (nentries > m)
            throw new IllegalArgumentException("nentries=" + nentries
                    + " does not fit into a root leaf with m=" + m);

        return new int[] { (int) nentries }; // Note: nentries<=m<=MAX_INT.

    }

    final int leafCount = (int) nleaves; // Note: nleaves<=MAX_INT.

    final int lastLeaf = leafCount - 1;

    final int[] n = new int[leafCount];

    /*
     * Default each leaf to m entries (the last leaf is overwritten below).
     */
    java.util.Arrays.fill(n, m);

    /*
     * The #of entries that would be allocated to the last leaf if we filled
     * each preceding leaf to its capacity of [m] tuples.
     */
    final long remaining = nentries - ((nleaves - 1) * m);

    /*
     * If the #of entries remaining would put the last leaf under capacity
     * then that leaf is assigned its minimum capacity and we compute the
     * shortage, which is the #of entries that must be deferred from the
     * preceding leaves in order to make that possible. Otherwise the
     * remainder goes into the last leaf as is and there is no shortage.
     */
    int shortage;

    if (remaining < m2) {

        // The last leaf will be at minimum capacity.
        n[lastLeaf] = m2;

        shortage = (int) (m2 - remaining);

    } else {

        // The remainder will go into the last leaf without underflow.
        n[lastLeaf] = (int) remaining;

        shortage = 0;

    }

    /*
     * If the shortage is greater than the #of previous leaves, then we need
     * to short some leaves by more than one entry. This scenario can be
     * observed when building a tree with m := 9 and 10 entries. In that
     * case there are only two leaves and we wind up shorting the previous
     * leaf by 4 bringing both leaves down to their minimum capacity of 5.
     */
    while (shortage > 0) {

        for (int i = leafCount - 2; i >= 0 && shortage > 0; i--) {

            n[i]--;

            shortage--;

        }

    }

    return n;

}
/**
 * Distributes the children of one level of the output tree among the nodes
 * of the level above it.
 * <p>
 * Note: This simply delegates to
 * {@link #distributeKeys(int, int, long, long)}. The arithmetic is the
 * same; the only difference is how the caller must read the result, which
 * is the #of children assigned to each node rather than the #of keys
 * assigned to each leaf (for leaves the #of values and the #of keys is
 * always the same).
 *
 * @param m
 *            The branching factor in the output tree.
 * @param m2
 *            The minimum capacity, which should be computed as (m+1)/2.
 * @param nnodes
 *            The #of nodes in the output tree for some given level of the
 *            output tree.
 * @param nchildren
 *            The #of children to be distributed among those nodes.
 *
 * @return An array indicating how many children should be inserted into
 *         each node of the output tree at the given level. The array index
 *         is the node order (origin zero). The value is the #of children
 *         which must be assigned to that node.
 *
 * @see TestIndexSegmentPlan
 * @see TestIndexSegmentBuilderWithSmallTree#test_problem3_buildOrder3()
 */
public static int[] distributeChildren(final int m, final int m2,
        final long nnodes, final long nchildren) {

    final int[] childrenPerNode = distributeKeys(m, m2, nnodes, nchildren);

    return childrenPerNode;

}
}