package hex.tree.gbm;
import hex.genmodel.utils.DistributionFamily;
import hex.Distribution;
import hex.ModelCategory;
import hex.quantile.Quantile;
import hex.quantile.QuantileModel;
import hex.tree.*;
import hex.tree.DTree.DecidedNode;
import hex.tree.DTree.LeafNode;
import hex.tree.DTree.UndecidedNode;
import water.*;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.*;
import water.util.*;
import java.util.Arrays;
import java.util.Random;
/** Gradient Boosted Trees
*
* Based on "Elements of Statistical Learning, Second Edition, page 387"
*/
public class GBM extends SharedTree<GBMModel, GBMModel.GBMParameters, GBMModel.GBMOutput> {
@Override public ModelCategory[] can_build() {
return new ModelCategory[]{
ModelCategory.Regression,
ModelCategory.Binomial,
ModelCategory.Multinomial,
};
}
// Called from an http request
public GBM( GBMModel.GBMParameters parms ) { super(parms ); init(false); }
public GBM( GBMModel.GBMParameters parms, Key<GBMModel> key) { super(parms, key); init(false); }
public GBM(boolean startup_once) { super(new GBMModel.GBMParameters(),startup_once); }
@Override protected int nModelsInParallel() {
if (!_parms._parallelize_cross_validation || _parms._max_runtime_secs != 0) return 1; //user demands serial building (or we need to honor the time constraints for all CV models equally)
if (_train.byteSize() < 1e6) return _parms._nfolds; //for small data, parallelize over CV models
return 2; //GBM always has some serial work, so it's fine to build two models at once
}
/** Start the GBM training Job on an F/J thread. */
@Override protected GBMDriver trainModelImpl() {
return new GBMDriver();
}
/** Initialize the ModelBuilder, validating all arguments and preparing the
* training frame. This call is expected to be overridden in the subclasses
* and each subclass will start with "super.init();". This call is made
* by the front-end whenever the GUI is clicked, and needs to be fast;
* heavy-weight prep needs to wait for the trainModel() call.
*
* Validate the learning rate and distribution family. */
@Override public void init(boolean expensive) {
super.init(expensive);
// Initialize response based on given distribution family.
// Regression: initially predict the response mean
// Binomial: just class 0 (class 1 in the exact inverse prediction)
// Multinomial: Class distribution which is not a single value.
// However there is this weird tension on the initial value for
// classification: If you guess 0's (no class is favored over another),
// then with your first GBM tree you'll typically move towards the correct
// answer a little bit (assuming you have decent predictors) - and
// immediately the Confusion Matrix shows good results which gradually
// improve... BUT the Mean Squared Error will suck for unbalanced sets,
// even as the CM is good. That's because we want the predictions for the
// common class to be large and positive, and the rare class to be negative
// and instead they start around 0. Guessing initial zeros means the MSE
// is so bad that the R^2 metric is typically negative (usually it's
// between 0 and 1).
// If instead you guess the mean (reversed through the loss function), then
// the zero-tree GBM model reports an MSE equal to the response variance -
// and an initial R^2 of zero. More trees gradually improves the R^2 as
// expected. However, all the minority classes have large guesses in the
// wrong direction, and it takes a long time (lotsa trees) to correct that
// - so your CM sucks for a long time.
if (expensive) {
if (error_count() > 0)
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(GBM.this);
if (_parms._distribution == DistributionFamily.AUTO) {
if (_nclass == 1) _parms._distribution = DistributionFamily.gaussian;
if (_nclass == 2) _parms._distribution = DistributionFamily.bernoulli;
if (_nclass >= 3) _parms._distribution = DistributionFamily.multinomial;
}
checkDistributions();
if (hasOffsetCol() && isClassifier() && _parms._distribution == DistributionFamily.multinomial) {
error("_offset_column", "Offset is not supported for multinomial distribution.");
}
}
switch( _parms._distribution) {
case bernoulli:
if( _nclass != 2 /*&& !couldBeBool(_response)*/)
error("_distribution", H2O.technote(2, "Binomial requires the response to be a 2-class categorical"));
break;
case quasibinomial:
if ( !_response.isNumeric() )
error("_distribution", H2O.technote(2, "Quasibinomial requires the response to be numeric."));
if ( _nclass != 2)
error("_distribution", H2O.technote(2, "Quasibinomial requires the response to be binary."));
break;
case modified_huber:
if( _nclass != 2 /*&& !couldBeBool(_response)*/)
error("_distribution", H2O.technote(2, "Modified Huber requires the response to be a 2-class categorical."));
break;
case multinomial:
if (!isClassifier()) error("_distribution", H2O.technote(2, "Multinomial requires a categorical response."));
break;
case huber:
if (isClassifier()) error("_distribution", H2O.technote(2, "Huber requires the response to be numeric."));
break;
case poisson:
if (isClassifier()) error("_distribution", H2O.technote(2, "Poisson requires the response to be numeric."));
break;
case gamma:
if (isClassifier()) error("_distribution", H2O.technote(2, "Gamma requires the response to be numeric."));
break;
case tweedie:
if (isClassifier()) error("_distribution", H2O.technote(2, "Tweedie requires the response to be numeric."));
break;
case gaussian:
if (isClassifier()) error("_distribution", H2O.technote(2, "Gaussian requires the response to be numeric."));
break;
case laplace:
if (isClassifier()) error("_distribution", H2O.technote(2, "Laplace requires the response to be numeric."));
break;
case quantile:
if (isClassifier()) error("_distribution", H2O.technote(2, "Quantile requires the response to be numeric."));
break;
case AUTO:
break;
default:
error("_distribution","Invalid distribution: " + _parms._distribution);
}
if( !(0. < _parms._learn_rate && _parms._learn_rate <= 1.0) )
error("_learn_rate", "learn_rate must be between 0 and 1");
if( !(0. < _parms._learn_rate_annealing && _parms._learn_rate_annealing <= 1.0) )
error("_learn_rate_annealing", "learn_rate_annealing must be between 0 and 1");
if( !(0. < _parms._col_sample_rate && _parms._col_sample_rate <= 1.0) )
error("_col_sample_rate", "col_sample_rate must be between 0 and 1");
if (_parms._max_abs_leafnode_pred <= 0)
error("_max_abs_leafnode_pred", "max_abs_leafnode_pred must be larger than 0.");
if (_parms._pred_noise_bandwidth < 0)
error("_pred_noise_bandwidth", "pred_noise_bandwidth must be >= 0.");
}
// ----------------------
private class GBMDriver extends Driver {
private transient FrameMap frameMap;
@Override
protected Frame makeValidWorkspace() {
// FIXME: this is not efficient, we need sparse volatile chunks
Vec[] tmp = _valid.anyVec().makeVolatileDoubles(numClassTrees());
String[] tmpNames = new String[tmp.length];
for (int i = 0; i < tmpNames.length; i++)
tmpNames[i] = "__P_" + i;
return new Frame(tmpNames, tmp);
}
@Override protected boolean doOOBScoring() { return false; }
@Override protected void initializeModelSpecifics() {
frameMap = new FrameMap(GBM.this);
_mtry_per_tree = Math.max(1, (int)(_parms._col_sample_rate_per_tree * _ncols)); //per-tree
if (!(1 <= _mtry_per_tree && _mtry_per_tree <= _ncols)) throw new IllegalArgumentException("Computed mtry_per_tree should be in interval <1,"+_ncols+"> but it is " + _mtry_per_tree);
_mtry = Math.max(1, (int)(_parms._col_sample_rate * _parms._col_sample_rate_per_tree * _ncols)); //per-split
if (!(1 <= _mtry && _mtry <= _ncols)) throw new IllegalArgumentException("Computed mtry should be in interval <1,"+_ncols+"> but it is " + _mtry);
// for Bernoulli, we compute the initial value with Newton-Raphson iteration, otherwise it might be NaN here
DistributionFamily distr = _parms._distribution;
_initialPrediction = _nclass > 2 || distr == DistributionFamily.laplace || distr == DistributionFamily.huber || distr == DistributionFamily.quantile ? 0 : getInitialValue();
if (distr == DistributionFamily.bernoulli || distr == DistributionFamily.quasibinomial) {
if (hasOffsetCol())
_initialPrediction = getInitialValueBernoulliOffset(_train);
} else if (distr == DistributionFamily.laplace || distr == DistributionFamily.huber) {
_initialPrediction = getInitialValueQuantile(0.5);
} else if (distr == DistributionFamily.quantile) {
_initialPrediction = getInitialValueQuantile(_parms._quantile_alpha);
}
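// Recap of the above: multiclass starts from 0; laplace/huber use the weighted median of (response - offset);
// quantile uses the weighted alpha-quantile; bernoulli/quasibinomial with an offset refine the value via Newton-Raphson;
// everything else takes the distribution-specific value from getInitialValue().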
_model._output._init_f = _initialPrediction; //always write the initial value here (not just for Bernoulli)
// Set the initial prediction into the tree column 0
if (_initialPrediction != 0.0) {
new FillVecWithConstant(_initialPrediction)
.doAll(vec_tree(_train, 0), _parms._build_tree_one_node); // Only setting tree-column 0
}
}
/**
* Helper to compute the initial value for Laplace/Huber/Quantile (incl. optional offset and obs weights)
* @return weighted median of response - offset
*/
private double getInitialValueQuantile(double quantile) {
// obtain y - o
Vec y = hasOffsetCol()
? new ResponseLessOffsetTask(frameMap).doAll(1, Vec.T_NUM, _train).outputFrame().anyVec()
: response();
// Now compute (weighted) quantile of y - o
double res;
QuantileModel qm = null;
Frame tempFrame = null;
try {
tempFrame = new Frame(Key.make(H2O.SELF), new String[]{"y"}, new Vec[]{y});
if (hasWeightCol()) tempFrame.add("w", _weights);
DKV.put(tempFrame);
QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
parms._train = tempFrame._key;
parms._probs = new double[]{quantile};
parms._weights_column = hasWeightCol() ? "w" : null;
Job<QuantileModel> job1 = new Quantile(parms).trainModel();
qm = job1.get();
res = qm._output._quantiles[0][0];
} finally {
if (qm!=null) qm.remove();
if (tempFrame!=null) DKV.remove(tempFrame._key);
}
return res;
}
/**
* Helper to compute the initial value for Bernoulli for offset != 0
*/
private double getInitialValueBernoulliOffset(Frame train) {
Log.info("Running Newton-Raphson iteration to find the initial value since offsets are specified.");
double delta;
int count = 0;
double tol = 1e-4;
//From R GBM vignette:
//For speed, gbm() does only one step of the Newton-Raphson algorithm
//rather than iterating to convergence. No appreciable loss of accuracy
//since the next boosting iteration will simply correct for the prior iteration's
//inadequacy.
int N = 1; //one step is enough - same as R
double init = 0; //start with initial value of 0 for convergence
do {
double newInit = new NewtonRaphson(frameMap, new Distribution(_parms), init).doAll(train).value();
delta = Math.abs(init - newInit);
init = newInit;
Log.info("Iteration " + (++count) + ": initial value: " + init);
} while (count < N && delta >= tol);
if (delta > tol) Log.warn("Not fully converged.");
Log.info("Newton-Raphson iteration ran for " + count + " iteration(s). Final residual: " + delta);
return init;
}
private static final double MIN_LOG_TRUNC = -19;
private static final double MAX_LOG_TRUNC = 19;
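// Note: these bounds apply to the log-link families handled below (poisson/gamma/tweedie);
// exp(-19) ~ 5.6e-9 and exp(19) ~ 1.8e8, so clamping the (log-scale) leaf predictions to this range
// keeps the exponentiated values numerically well-behaved.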
private void truncatePreds(final DTree tree, int firstLeafIndex, DistributionFamily dist) {
if (firstLeafIndex==tree._len) return;
ComputeMinMax minMax = new ComputeMinMax(frameMap, firstLeafIndex, tree._len).doAll(_train);
if (DEV_DEBUG) {
Log.info("Number of leaf nodes: " + minMax._mins.length);
Log.info("Min: " + java.util.Arrays.toString(minMax._mins));
Log.info("Max: " + java.util.Arrays.toString(minMax._maxs));
}
//loop over leaf nodes only: starting at leaf index
for (int i = 0; i < tree._len - firstLeafIndex; i++) {
final LeafNode node = ((LeafNode) tree.node(firstLeafIndex + i));
int nidx = node.nid();
float nodeMin = minMax._mins[nidx- firstLeafIndex];
float nodeMax = minMax._maxs[nidx- firstLeafIndex];
// Log.info("Node: " + nidx + " min/max: " + nodeMin + "/" + nodeMax);
// https://github.com/cran/gbm/blob/master/src/poisson.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/poisson.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/poisson.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/gamma.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/gamma.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/tweedie.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/tweedie.cpp
double val = node._pred;
if (dist == DistributionFamily.gamma || dist == DistributionFamily.tweedie) //only for gamma/tweedie
val += nodeMax;
if (val > MAX_LOG_TRUNC) {
// Log.warn("Truncating large positive leaf prediction (log): " + node._pred + " to " + (MAX_LOG_TRUNC - nodeMax));
node._pred = (float) (MAX_LOG_TRUNC - nodeMax);
}
val = node._pred;
if (dist == DistributionFamily.gamma || dist == DistributionFamily.tweedie) //only for gamma/tweedie
val += nodeMin;
if (val < MIN_LOG_TRUNC) {
// Log.warn("Truncating large negative leaf prediction (log): " + node._pred + " to " + (MIN_LOG_TRUNC - nodeMin));
node._pred = (float) (MIN_LOG_TRUNC - nodeMin);
}
if (node._pred < MIN_LOG_TRUNC || node._pred > MAX_LOG_TRUNC) {
Log.warn("Terminal node prediction outside of allowed interval in log-space: "
+ node._pred + " (should be in " + MIN_LOG_TRUNC + "..." + MAX_LOG_TRUNC + ").");
}
}
}
// --------------------------------------------------------------------------
// Build the next k-trees, which is trying to correct the residual error from
// the prior trees.
@Override protected boolean buildNextKTrees() {
// We're going to build K (nclass) trees - each focused on correcting
// errors for a single class.
final DTree[] ktrees = new DTree[_nclass];
// Define a "working set" of leaf splits, from here to tree._len
int[] leaves = new int[_nclass];
// Compute predictions and resulting residuals
// ESL2, page 387, Steps 2a, 2b
// fills "Work" columns for all rows (incl. OOB) with the residuals
double huberDelta = Double.NaN;
if (_parms._distribution == DistributionFamily.huber) {
// Jerome Friedman 1999: Greedy Function Approximation: A Gradient Boosting Machine
// https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
// compute absolute diff |y-(f+o)| for all rows
Vec diff = new ComputeAbsDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
Distribution dist = new Distribution(_parms);
// compute weighted alpha-quantile of the absolute residual -> this is the delta for the huber loss
huberDelta = MathUtils.computeWeightedQuantile(_weights, diff, _parms._huber_alpha);
dist.setHuberDelta(huberDelta);
// now compute residuals using the gradient of the huber loss (with a globally adjusted delta)
new StoreResiduals(frameMap, dist).doAll(_train, _parms._build_tree_one_node);
} else {
// compute predictions and residuals in one shot
new ComputePredAndRes(frameMap, _nclass, _model._output._distribution, new Distribution(_parms))
.doAll(_train, _parms._build_tree_one_node);
}
for (int k = 0; k < _nclass; k++) {
if (DEV_DEBUG && ktrees[k]!=null) {
System.out.println("Updated predictions in WORK col for class " + k + ":\n" + new Frame(new String[]{"WORK"},new Vec[]{vec_work(_train, k)}).toTwoDimTable());
}
}
// ----
// ESL2, page 387. Step 2b ii.
// One Big Loop till the ktrees are of proper depth.
// Adds a layer to the trees each pass.
growTrees(ktrees, leaves, _rand);
for (int k = 0; k < _nclass; k++) {
if (DEV_DEBUG && ktrees[k]!=null) {
System.out.println("Grew trees. Updated NIDs for class " + k + ":\n" + new Frame(new String[]{"NIDS"},new Vec[]{vec_nids(_train, k)}).toTwoDimTable());
}
}
// ----
// ESL2, page 387. Step 2b iii. Compute the gammas (leaf node predictions === fit best constant), and store them back
// into the tree leaves. Includes learn_rate.
GammaPass gp = new GammaPass(frameMap, ktrees, leaves, new Distribution(_parms), _nclass);
gp.doAll(_train);
if (_parms._distribution == DistributionFamily.laplace) {
fitBestConstantsQuantile(ktrees, leaves[0], 0.5); //special case for Laplace: compute the median for each leaf node and store that as prediction
} else if (_parms._distribution == DistributionFamily.quantile) {
fitBestConstantsQuantile(ktrees, leaves[0], _parms._quantile_alpha); //compute the alpha-quantile for each leaf node and store that as prediction
} else if (_parms._distribution == DistributionFamily.huber) {
fitBestConstantsHuber(ktrees, leaves[0], huberDelta); //compute the Huber leaf value (median + gamma correction) for each leaf node and store that as prediction
} else {
fitBestConstants(ktrees, leaves, gp);
}
// Apply a correction for strong mispredictions (otherwise deviance can explode)
if (_parms._distribution == DistributionFamily.gamma ||
_parms._distribution == DistributionFamily.poisson ||
_parms._distribution == DistributionFamily.tweedie) {
assert(_nclass == 1);
truncatePreds(ktrees[0], leaves[0], _parms._distribution);
}
// ----
// ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the
// new tree, in the 'tree' columns. Also, zap the NIDs for next pass.
// Tree <== f(Tree)
// Nids <== 0
new AddTreeContributions(
frameMap, ktrees, _parms._pred_noise_bandwidth, _parms._seed, _parms._ntrees, _model._output._ntrees
).doAll(_train);
// sanity check
for (int k = 0; k < _nclass; k++) {
if (ktrees[k]!=null) assert(vec_nids(_train,k).mean()==0);
}
// Grow the model by K-trees
_model._output.addKTrees(ktrees);
boolean converged = effective_learning_rate() < 1e-6;
if (converged) {
Log.warn("Effective learning rate dropped below 1e-6 (" + _parms._learn_rate + " * " + _parms._learn_rate_annealing + "^" + (_model._output._ntrees-1) + ") - stopping the model now.");
}
return converged;
}
/**
* How many trees are actually built, given the number of classes the model uses.
* @return number of trees
*/
private int numClassTrees() {
return _nclass == 2 ? 1 : _nclass; // Boolean Optimization (only one tree needed for 2-class problems)
}
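// (For the 2-class case the second tree would mirror the first, since its residuals are the exact negation,
// so only one tree is built and score1static() reconstructs the missing class from exp(-f).)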
/**
* Grow k regression trees (k=1 for regression and binomial, k=N for classification with N classes)
* @param ktrees k trees to grow (must be properly initialized)
* @param leaves workspace to store the leaf node starting index (k-dimensional - one per tree)
* @param rand PRNG for reproducibility
*/
private void growTrees(DTree[] ktrees, int[] leaves, Random rand) {
// Initial set of histograms. All trees; one leaf per tree (the root
// leaf); all columns
DHistogram hcs[][][] = new DHistogram[_nclass][1/*just root leaf*/][_ncols];
// Adjust real bins for the top-levels
int adj_nbins = Math.max(_parms._nbins_top_level,_parms._nbins);
long rseed = rand.nextLong();
// initialize trees
for (int k = 0; k < numClassTrees(); k++) {
// Initially setup as-if an empty-split had just happened
if (_model._output._distribution[k] != 0) {
ktrees[k] = new DTree(_train, _ncols, (char)_nclass, _mtry, _mtry_per_tree, rseed, _parms);
DHistogram[] hist = DHistogram.initialHist(_train, _ncols, adj_nbins, hcs[k][0], rseed, _parms, getGlobalQuantilesKeys());
new UndecidedNode(ktrees[k], DTree.NO_PARENT, hist); // The "root" node
}
}
// Sample - mark the lines by putting 'OUT_OF_BAG' into nid() vector
if (_parms._sample_rate < 1 || _parms._sample_rate_per_class != null) {
Sample ss[] = new Sample[_nclass];
for (int k = 0; k < _nclass; k++)
if (ktrees[k] != null)
ss[k] = new Sample(ktrees[k], _parms._sample_rate, _parms._sample_rate_per_class).dfork(null, new Frame(vec_nids(_train, k), _response), _parms._build_tree_one_node);
for (int k = 0; k < _nclass; k++) {
if (ss[k] != null) {
ss[k].getResult();
if (DEV_DEBUG && ktrees[k]!=null) {
System.out.println("Sampled OOB rows. NIDS:\n" + new Frame(vec_nids(_train, k)).toTwoDimTable());
}
}
}
}
// ----
// ESL2, page 387. Step 2b ii.
// One Big Loop till the ktrees are of proper depth.
// Adds a layer to the trees each pass.
int depth = 0;
for (; depth < _parms._max_depth; depth++) {
hcs = buildLayer(_train, _parms._nbins, _parms._nbins_cats, ktrees, leaves, hcs, _parms._build_tree_one_node);
// If we did not make any new splits, then the tree is split-to-death
if (hcs == null) break;
}
// Each tree bottomed-out in a DecidedNode; go 1 more level and insert
// LeafNodes to hold predictions.
for (int k = 0; k < _nclass; k++) {
DTree tree = ktrees[k];
if (tree == null) continue;
int leaf = tree.len();
leaves[k] = leaf; //record the size of the tree before splitting the bottom nodes as the starting index for the leaf node indices
for (int nid = 0; nid < leaf; nid++) {
if (tree.node(nid) instanceof DecidedNode) {
DecidedNode dn = tree.decided(nid);
if (dn._split == null) { // No decision here, no row should have this NID now
if (nid == 0) // Handle the trivial non-splitting tree
new LeafNode(tree, DTree.NO_PARENT, 0);
continue;
}
for (int i = 0; i < dn._nids.length; i++) { //L/R children
int cnid = dn._nids[i];
if (cnid == ScoreBuildHistogram.UNDECIDED_CHILD_NODE_ID || // Bottomed out (predictors or responses known constant)
tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
(tree.node(cnid) instanceof DecidedNode && // Or not possible to split
((DecidedNode) tree.node(cnid))._split == null)) {
dn._nids[i] = new LeafNode(tree, nid).nid(); // Mark a leaf here
}
}
}
}
} // -- k-trees are done
}
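// Background note: for the quantile (pinball) and Laplace losses, the in-leaf loss is minimized by the (weighted)
// alpha-quantile of the residuals y - f in that leaf (alpha = 0.5, i.e. the median, for Laplace). The method below
// computes exactly that per NID via a stratified quantile task and scales it by the effective learning rate.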
private void fitBestConstantsQuantile(DTree[] ktrees, int firstLeafIndex, double quantile) {
if (firstLeafIndex == ktrees[0]._len) return; // no splits happened - nothing to do
assert(_nclass==1);
Vec diff = new ComputeDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
Vec weights = hasWeightCol() ? _train.vecs()[idx_weight()] : null;
Vec strata = vec_nids(_train,0);
// compute quantile for all leaf nodes
Quantile.StratifiedQuantilesTask sqt = new Quantile.StratifiedQuantilesTask(null, quantile, diff, weights, strata, QuantileModel.CombineMethod.INTERPOLATE);
H2O.submitTask(sqt);
sqt.join();
final DTree tree = ktrees[0];
for (int i = 0; i < sqt._quantiles.length; i++) {
if (Double.isNaN(sqt._quantiles[i])) continue; //no active rows for this NID
double val = effective_learning_rate() * sqt._quantiles[i];
assert !Double.isNaN(val) && !Double.isInfinite(val);
if (val > _parms._max_abs_leafnode_pred) val = _parms._max_abs_leafnode_pred;
if (val < -_parms._max_abs_leafnode_pred) val = -_parms._max_abs_leafnode_pred;
((LeafNode) tree.node(sqt._nids[i]))._pred = (float) val;
if (DEV_DEBUG) { Log.info("Leaf " + sqt._nids[i] + " has quantile: " + sqt._quantiles[i]); }
}
}
// Jerome Friedman 1999: Greedy Function Approximation: A Gradient Boosting Machine
// https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
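// The Huber leaf value computed below follows Friedman's M-TreeBoost update:
//   gamma_leaf = median(r) + weighted_mean_leaf( sign(r - median(r)) * min(|r - median(r)|, delta) )
// with r = y - f the residuals in the leaf and delta = huberDelta, the global alpha-quantile of |r|.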
private void fitBestConstantsHuber(DTree[] ktrees, int firstLeafIndex, double huberDelta) {
if (firstLeafIndex == ktrees[0]._len) return; // no splits happened - nothing to do
assert(_nclass==1);
// get diff y-(f+o) and weights and strata (node idx)
Vec diff = new ComputeDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
Vec weights = hasWeightCol() ? _train.vecs()[idx_weight()] : null;
Vec strata = vec_nids(_train,0);
// compute median diff for each leaf node
Quantile.StratifiedQuantilesTask sqt = new Quantile.StratifiedQuantilesTask(null, 0.5 /*median of weighted residuals*/, diff, weights, strata, QuantileModel.CombineMethod.INTERPOLATE);
H2O.submitTask(sqt);
sqt.join();
// subtract median(diff) from residuals for all observations of each leaf
DiffMinusMedianDiff hp = new DiffMinusMedianDiff(strata, sqt._quantiles /*median residuals per leaf*/);
Frame tmpFrame1 = new Frame(new String[]{"strata","diff"}, new Vec[]{strata,diff});
hp.doAll(tmpFrame1);
Vec diffMinusMedianDiff = diff;
// for each leaf, compute the mean of Math.signum(resMinusMedianRes) * Math.min(Math.abs(resMinusMedianRes), huberDelta),
// where huberDelta is the alpha-percentile of the residual across all observations
Frame tmpFrame2 = new Frame(_train.vecs());
tmpFrame2.add("resMinusMedianRes", diffMinusMedianDiff);
double[] huberGamma = new HuberLeafMath(frameMap, huberDelta,strata).doAll(tmpFrame2)._huberGamma;
// now assign the median per leaf + the above huberGamma[i] to each leaf
final DTree tree = ktrees[0];
for (int i = 0; i < sqt._quantiles.length; i++) {
double huber = (sqt._quantiles[i] /*median*/ + huberGamma[i]);
if (Double.isNaN(sqt._quantiles[i])) continue; //no active rows for this NID
double val = effective_learning_rate() * huber;
assert !Double.isNaN(val) && !Double.isInfinite(val);
if (val > _parms._max_abs_leafnode_pred) val = _parms._max_abs_leafnode_pred;
if (val < -_parms._max_abs_leafnode_pred) val = -_parms._max_abs_leafnode_pred;
((LeafNode) tree.node(sqt._nids[i]))._pred = (float) val;
if (DEV_DEBUG) { Log.info("Leaf " + sqt._nids[i] + " has huber value: " + huber); }
}
diffMinusMedianDiff.remove();
}
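/**
 * Effective learning rate after annealing: learn_rate * learn_rate_annealing^(ntrees-1).
 * For example, with learn_rate = 0.1 and learn_rate_annealing = 0.99, the 100th tree is added
 * with a rate of roughly 0.1 * 0.99^99, i.e. about 0.037.
 */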
private double effective_learning_rate() {
return _parms._learn_rate * Math.pow(_parms._learn_rate_annealing, (_model._output._ntrees-1));
}
private void fitBestConstants(DTree[] ktrees, int[] leafs, GammaPass gp) {
double m1class = _nclass > 1 && _parms._distribution == DistributionFamily.multinomial ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K for multinomial
for (int k = 0; k < _nclass; k++) {
final DTree tree = ktrees[k];
if (tree == null) continue;
if (DEV_DEBUG) for (int i=0; i<tree._len-leafs[k]; ++i) System.out.println(tree.node(leafs[k]+i).toString());
for (int i = 0; i < tree._len - leafs[k]; i++) {
double gf = effective_learning_rate() * m1class * gp.gamma(k, i);
// In the multinomial case, check for very large values (which will get exponentiated later)
if (_parms._distribution == DistributionFamily.multinomial) {
if (gf > 1e4) gf = 1e4f; // Cap prediction, will already overflow during Math.exp(gf)
else if (gf < -1e4) gf = -1e4f;
}
if (Double.isNaN(gf)) gf=0;
else if (Double.isInfinite(gf)) gf=Math.signum(gf)*1e4f;
if (gf > _parms._max_abs_leafnode_pred) gf = _parms._max_abs_leafnode_pred;
if (gf < -_parms._max_abs_leafnode_pred) gf = -_parms._max_abs_leafnode_pred;
((LeafNode) tree.node(leafs[k] + i))._pred = (float) gf;
}
}
}
@Override protected GBMModel makeModel(Key<GBMModel> modelKey, GBMModel.GBMParameters parms) {
return new GBMModel(modelKey, parms, new GBMModel.GBMOutput(GBM.this));
}
}
//--------------------------------------------------------------------------------------------------------------------
private static class FillVecWithConstant extends MRTask<FillVecWithConstant> {
private double init;
public FillVecWithConstant(double d) {
init = d;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk tree) {
if (tree instanceof C8DVolatileChunk) {
Arrays.fill(((C8DVolatileChunk) tree).getValues(), init);
} else {
for (int i = 0; i < tree._len; i++)
tree.set(i, init);
}
}
}
private static class ResponseLessOffsetTask extends MRTask<ResponseLessOffsetTask> {
private FrameMap fm;
public ResponseLessOffsetTask(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk resp = chks[fm.responseIndex];
final Chunk offset = chks[fm.offsetIndex];
for (int i = 0; i < chks[0]._len; ++i)
nc[0].addNum(resp.atd(i) - offset.atd(i)); // y - o
}
}
/**
* Newton-Raphson fixpoint iteration to find a self-consistent initial value
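* For the Bernoulli-with-offset case a single step solves
*   init_new = init + sum(w * (y - p)) / sum(w * p * (1 - p)),   p = linkInv(offset + init),
* which is exactly what value() returns from the accumulated numerator and denominator.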
*/
private static class NewtonRaphson extends MRTask<NewtonRaphson> {
private FrameMap fm;
private Distribution dist;
private double _init;
private double _numerator;
private double _denominator;
public NewtonRaphson(FrameMap frameMap, Distribution distribution, double initialValue) {
assert frameMap != null && distribution != null;
fm = frameMap;
dist = distribution;
_init = initialValue;
_numerator = 0;
_denominator = 0;
}
public double value() {
return _init + _numerator / _denominator;
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = chks[fm.offsetIndex];
Chunk weight = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < ys._len; row++) {
double w = weight.atd(row);
if (w == 0) continue;
if (ys.isNA(row)) continue;
double y = ys.atd(row);
double o = offset.atd(row);
double p = dist.linkInv(o + _init);
_numerator += w * (y - p);
_denominator += w * p * (1. - p);
}
}
@Override
public void reduce(NewtonRaphson mrt) {
_numerator += mrt._numerator;
_denominator += mrt._denominator;
}
}
/**
* Compute Residuals
* Do this for all rows, whether OOB or not
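* The stored "residual" is the negative half-gradient of the deviance at the current prediction f,
* e.g. y - f for gaussian and y - 1/(1 + exp(-f)) for bernoulli; for multinomial it is the per-class
* difference between the one-hot response and the softmax probability.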
*/
private static class ComputePredAndRes extends MRTask<ComputePredAndRes> {
private FrameMap fm;
private int nclass;
private boolean[] out;
private Distribution dist;
public ComputePredAndRes(FrameMap frameMap, int nClasses, double[] outputDistribution, Distribution distribution) {
fm = frameMap;
nclass = nClasses;
dist = distribution;
out = new boolean[outputDistribution.length];
for (int i = 0; i < out.length; i++) out[i] = (outputDistribution[i] != 0);
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
C8DVolatileChunk wk = (C8DVolatileChunk) chks[fm.work0Index]; // Place to store residuals
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
double[] fs = nclass > 1 ? new double[nclass + 1] : null;
for (int row = 0; row < wk._len; row++) {
double weight = weights.atd(row);
if (weight == 0) continue;
if (ys.isNA(row)) continue;
double f = preds.atd(row) + offset.atd(row);
double y = ys.atd(row);
// Log.info(f + " vs " + y); //expect that the model predicts very negative values for 0 and very positive values for 1
if (dist.distribution == DistributionFamily.multinomial && fs != null) {
double sum = score1static(chks, fm.tree0Index, 0.0 /*not used for multiclass*/, fs, row, dist, nclass);
if (Double.isInfinite(sum)) { // Overflow (happens for constant responses)
for (int k = 0; k < nclass; k++) {
wk = (C8DVolatileChunk) chks[fm.work0Index + k];
wk.getValues()[row] = ((int) y == k ? 1f : 0f) - (Double.isInfinite(fs[k + 1]) ? 1.0f : 0.0f);
}
} else {
for (int k = 0; k < nclass; k++) { // Save as a probability distribution
if (out[k]) {
wk = (C8DVolatileChunk) chks[fm.work0Index + k];
wk.getValues()[row] = (((int) y == k ? 1f : 0f) - (float) (fs[k + 1] / sum));
}
}
}
} else {
wk.getValues()[row] = ((float) dist.negHalfGradient(y, f));
}
}
}
}
private static class ComputeMinMax extends MRTask<ComputeMinMax> {
private FrameMap fm;
int firstLeafIdx;
int _totalNumNodes;
float[] _mins;
float[] _maxs;
public ComputeMinMax(FrameMap frameMap, int firstLeafIndex, int totalNumNodes) {
fm = frameMap;
firstLeafIdx = firstLeafIndex;
_totalNumNodes = totalNumNodes;
}
@Override
public void map(Chunk[] chks) {
int len = _totalNumNodes - firstLeafIdx; // number of leaves
_mins = new float[len];
_maxs = new float[len];
Arrays.fill(_mins, Float.MAX_VALUE);
Arrays.fill(_maxs, -Float.MAX_VALUE);
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
Chunk nids = chks[fm.nids0Index];
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < preds._len; row++) {
if (ys.isNA(row)) continue;
if (weights.atd(row) == 0) continue;
int nid = (int) nids.at8(row);
assert (nid != ScoreBuildHistogram.UNDECIDED_CHILD_NODE_ID);
if (nid < 0) continue; //skip OOB and otherwise skipped rows
float f = (float) (preds.atd(row) + offset.atd(row));
int idx = nid - firstLeafIdx;
_mins[idx] = Math.min(_mins[idx], f);
_maxs[idx] = Math.max(_maxs[idx], f);
}
}
@Override
public void reduce(ComputeMinMax mrt) {
ArrayUtils.reduceMin(_mins, mrt._mins);
ArrayUtils.reduceMax(_maxs, mrt._maxs);
}
}
private static class ComputeDiff extends MRTask<ComputeDiff> {
private FrameMap fm;
public ComputeDiff(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk y = chks[fm.responseIndex];
final Chunk o = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk f = chks[fm.tree0Index];
for (int i = 0; i < chks[0].len(); ++i)
nc[0].addNum(y.atd(i) - (f.atd(i) + o.atd(i)));
}
}
private static class ComputeAbsDiff extends MRTask<ComputeAbsDiff> {
private FrameMap fm;
public ComputeAbsDiff(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk y = chks[fm.responseIndex];
final Chunk o = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk f = chks[fm.tree0Index];
for (int i = 0; i < chks[0].len(); ++i)
nc[0].addNum(Math.abs(y.atd(i) - (f.atd(i) + o.atd(i))));
}
}
public static class DiffMinusMedianDiff extends MRTask<DiffMinusMedianDiff> {
private final int _strataMin;
private double[] _terminalMedians;
public DiffMinusMedianDiff(Vec strata, double[] terminalMedians) {
_strataMin = (int) strata.min();
_terminalMedians = terminalMedians;
}
@Override
public void map(Chunk[] chks) {
final Chunk strata = chks[0];
final Chunk diff = chks[1];
for (int i = 0; i < chks[0].len(); ++i) {
int nid = (int) strata.atd(i);
diff.set(i, diff.atd(i) - _terminalMedians[nid - _strataMin]);
}
}
}
private static final class HuberLeafMath extends MRTask<HuberLeafMath> {
// INPUT
final FrameMap fm;
final double _huberDelta;
final Vec _strata;
final int _strataMin;
final int _strataMax;
// OUTPUT
double[/*leaves*/] _huberGamma, _wcounts;
public HuberLeafMath(FrameMap frameMap, double huberDelta, Vec strata) {
fm = frameMap;
_huberDelta = huberDelta;
_strata = strata;
_strataMin = (int) _strata.min();
_strataMax = (int) _strata.max();
}
@Override
public void map(Chunk[] cs) {
if (_strataMin < 0 || _strataMax < 0) {
Log.warn("No Huber math can be done since there's no strata.");
return;
}
final int nstrata = _strataMax - _strataMin + 1;
Log.info("Computing Huber math for (up to) " + nstrata + " different strata.");
_huberGamma = new double[nstrata];
_wcounts = new double[nstrata];
Chunk weights = fm.weightIndex >= 0 ? cs[fm.weightIndex] : new C0DChunk(1, cs[0]._len);
Chunk stratum = cs[fm.nids0Index];
Chunk diffMinusMedianDiff = cs[cs.length - 1];
for (int row = 0; row < cs[0]._len; ++row) {
int nidx = (int) stratum.at8(row) - _strataMin; //get terminal node for this row
_huberGamma[nidx] += weights.atd(row) * Math.signum(diffMinusMedianDiff.atd(row)) * Math.min(Math.abs(diffMinusMedianDiff.atd(row)), _huberDelta);
_wcounts[nidx] += weights.atd(row);
}
}
@Override
public void reduce(HuberLeafMath mrt) {
ArrayUtils.add(_huberGamma, mrt._huberGamma);
ArrayUtils.add(_wcounts, mrt._wcounts);
}
@Override
protected void postGlobal() {
for (int i = 0; i < _huberGamma.length; ++i)
_huberGamma[i] /= _wcounts[i];
}
}
private static class StoreResiduals extends MRTask<StoreResiduals> {
private FrameMap fm;
private Distribution dist;
public StoreResiduals(FrameMap frameMap, Distribution distribution) {
fm = frameMap;
dist = distribution;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
C8DVolatileChunk wk = (C8DVolatileChunk) chks[fm.work0Index]; // Place to store residuals
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < wk._len; row++) {
double weight = weights.atd(row);
if (weight == 0) continue;
if (ys.isNA(row)) continue;
double f = preds.atd(row) + offset.atd(row);
double y = ys.atd(row);
wk.getValues()[row] = ((float) dist.negHalfGradient(y, f));
}
}
}
/**
* Set terminal node estimates (gamma)
* ESL2, page 387. Step 2b iii.
* Nids <== f(Nids)
* For classification (bernoulli):
* {@code gamma_i = sum (w_i * res_i) / sum (w_i*p_i*(1 - p_i)) where p_i = y_i - res_i}
* For classification (multinomial):
* {@code gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))}
* For regression (gaussian):
* {@code gamma_i = sum res_i / count(res_i)}
*/
private static class GammaPass extends MRTask<GammaPass> {
private final FrameMap fm;
private final DTree[] _trees; // Read-only, shared (except at the histograms in the Nodes)
private final int[] _leafs; // Starting index of leaves (per class-tree)
private final Distribution _dist;
private final int _nclass;
private double[/*tree/klass*/][/*tree-relative node-id*/] _num;
private double[/*tree/klass*/][/*tree-relative node-id*/] _denom;
public GammaPass(FrameMap frameMap, DTree[] trees, int[] leafs, Distribution distribution, int nClasses) {
fm = frameMap;
_leafs = leafs;
_trees = trees;
_dist = distribution;
_nclass = nClasses;
}
double gamma(int tree, int nid) {
if (_denom[tree][nid] == 0) return 0;
double g = _num[tree][nid] / _denom[tree][nid];
assert (!Double.isInfinite(g) && !Double.isNaN(g));
if (_dist.distribution == DistributionFamily.poisson ||
_dist.distribution == DistributionFamily.gamma ||
_dist.distribution == DistributionFamily.tweedie) {
return _dist.link(g);
} else {
return g;
}
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
_denom = new double[_nclass][];
_num = new double[_nclass][];
final Chunk resp = chks[fm.responseIndex]; // Response for this frame
// For all tree/klasses
for (int k = 0; k < _nclass; k++) {
final DTree tree = _trees[k];
final int leaf = _leafs[k];
if (tree == null) continue; // Empty class is ignored
assert (tree._len - leaf >= 0);
// A leaf-biased array of all active Tree leaves.
final double[] denom = _denom[k] = new double[tree._len - leaf];
final double[] num = _num[k] = new double[tree._len - leaf];
final C4VolatileChunk nids = (C4VolatileChunk) chks[fm.nids0Index + k]; // Node-ids for this tree/class
int[] nids_vals = nids.getValues();
final Chunk ress = chks[fm.work0Index + k]; // Residuals for this tree/class
final Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk preds = chks[fm.tree0Index + k];
final Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
// If we have all constant responses, then we do not split even the
// root and the residuals should be zero.
if (tree.root() instanceof LeafNode)
continue;
for (int row = 0; row < nids._len; row++) { // For all rows
double w = weights.atd(row);
if (w == 0) continue;
double y = resp.atd(row); //response
if (Double.isNaN(y)) continue;
// Compute numerator and denominator of terminal node estimate (gamma)
int nid = (int) nids.at8(row); // Get Node to decide from
final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow(nid); //same for all k
if (wasOOBRow) nid = ScoreBuildHistogram.oob2Nid(nid);
if (nid < 0) continue;
DecidedNode dn = tree.decided(nid); // Must have a decision point
if (dn._split == null) // Unable to decide?
dn = tree.decided(dn.pid()); // Then take parent's decision
int leafnid = dn.getChildNodeID(chks, row); // Decide down to a leafnode
assert leaf <= leafnid && leafnid < tree._len :
"leaf: " + leaf + " leafnid: " + leafnid + " tree._len: " + tree._len + "\ndn: " + dn;
assert tree.node(leafnid) instanceof LeafNode;
// Note: I can tell which leaf/region I end up in, but I do not care for
// the prediction presented by the tree. For GBM, we compute the
// sum-of-residuals (and sum/abs/mult residuals) for all rows in the
// leaf, and get our prediction from that.
nids_vals[row] = leafnid;
assert !ress.isNA(row);
// OOB rows get placed properly (above), but they don't affect the computed Gamma (below)
// For Laplace/Quantile distribution, we need to compute the median of (y-offset-preds == y-f), will be done outside of here
if (wasOOBRow
|| _dist.distribution == DistributionFamily.laplace
|| _dist.distribution == DistributionFamily.huber
|| _dist.distribution == DistributionFamily.quantile) continue;
double z = ress.atd(row); // residual
double f = preds.atd(row) + offset.atd(row);
int idx = leafnid - leaf;
num[idx] += _dist.gammaNum(w, y, z, f);
denom[idx] += _dist.gammaDenom(w, y, z, f);
}
}
}
@Override
public void reduce(GammaPass gp) {
ArrayUtils.add(_denom, gp._denom);
ArrayUtils.add(_num, gp._num);
}
}
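// The task below adds each new tree's leaf prediction into the running per-class 'tree' (prediction sum) columns
// and resets the NID columns to FRESH for the next iteration. When pred_noise_bandwidth > 0, the leaf prediction is
// scaled by a factor 1 + bandwidth * N(0,1), drawn deterministically per (seed, class, node id) so that every row
// landing in the same node receives the same jitter.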
private static class AddTreeContributions extends MRTask<AddTreeContributions> {
private FrameMap fm;
private DTree[] _ktrees;
private int _nclass;
private double _pred_noise_bandwidth;
private long _seed;
private int _ntrees1;
private int _ntrees2;
public AddTreeContributions(
FrameMap frameMap, DTree[] ktrees, double predictionNoiseBandwidth, long seed, int nTreesInp, int nTreesOut
) {
fm = frameMap;
_ktrees = ktrees;
_nclass = ktrees.length;
_pred_noise_bandwidth = predictionNoiseBandwidth;
_seed = seed;
_ntrees1 = nTreesInp;
_ntrees2 = nTreesOut;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
Random rand = RandomUtils.getRNG(_seed);
// For all tree/klasses
for (int k = 0; k < _nclass; k++) {
final DTree tree = _ktrees[k];
if (tree == null) continue;
final C4VolatileChunk nids = (C4VolatileChunk) chks[fm.nids0Index + k];
final int[] nids_vals = nids.getValues();
final C8DVolatileChunk ct = (C8DVolatileChunk) chks[fm.tree0Index + k];
double[] ct_vals = ct.getValues();
final Chunk y = chks[fm.responseIndex];
final Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
long baseseed = (0xDECAF + _seed) * (0xFAAAAAAB + k * _ntrees1 + _ntrees2);
for (int row = 0; row < nids._len; row++) {
int nid = nids_vals[row];
nids_vals[row] = ScoreBuildHistogram.FRESH;
if (nid < 0) continue;
if (y.isNA(row)) continue;
if (weights.atd(row) == 0) continue;
double factor = 1;
if (_pred_noise_bandwidth != 0) {
rand.setSeed(baseseed + nid); //bandwidth is a function of tree number, class and node id (but same for all rows in that node)
factor += rand.nextGaussian() * _pred_noise_bandwidth;
}
// Prediction stored in Leaf is cut to float to be deterministic in reconstructing
// fields from tree prediction
ct_vals[row] = ((float) (ct.atd(row) + factor * ((LeafNode) tree.node(nid))._pred));
}
}
}
}
//--------------------------------------------------------------------------------------------------------------------
// Read the 'tree' columns, do model-specific math and put the results in the
// fs[] array, and return the sum. Dividing any fs[] element by the sum
// turns the results into a probability distribution.
@Override protected double score1(Chunk[] chks, double weight, double offset, double[/*nclass*/] fs, int row) {
return score1static(chks, idx_tree(0), offset, fs, row, new Distribution(_parms), _nclass);
}
// Read the 'tree' columns, do model-specific math and put the results in the
// fs[] array, and return the sum. Dividing any fs[] element by the sum
// turns the results into a probability distribution.
private static double score1static(Chunk[] chks, int treeIdx, double offset, double[] fs, int row, Distribution dist, int nClasses) {
double f = chks[treeIdx].atd(row) + offset;
double p = dist.linkInv(f);
if (dist.distribution == DistributionFamily.modified_huber || dist.distribution == DistributionFamily.bernoulli || dist.distribution == DistributionFamily.quasibinomial) {
fs[2] = p;
fs[1] = 1.0 - p;
return 1; // f2 = 1.0 - f1; so f1+f2 = 1.0
} else if (dist.distribution == DistributionFamily.multinomial) {
if (nClasses == 2) {
// This optimization assumes the 2nd tree of a 2-class system is the
// inverse of the first. Fill in the missing tree
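// Assuming the multinomial link is log (linkInv = exp), fs[1] = p = exp(f) and fs[2] = 1/p = exp(-f);
// dividing each by their sum yields the two-class softmax of (f, -f).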
fs[1] = p;
fs[2] = 1 / p;
return fs[1] + fs[2];
}
// Multinomial loss function; sum(exp(data)). Load tree data
assert (offset == 0);
fs[1] = f;
for (int k = 1; k < nClasses; k++)
fs[k + 1] = chks[treeIdx + k].atd(row);
// Rescale to avoid Infinities; return sum(exp(data))
return hex.genmodel.GenModel.log_rescale(fs);
} else {
return fs[0] = p;
}
}
}