package hex.tree.gbm;
import hex.*;
import hex.genmodel.MojoModel;
import hex.genmodel.algos.gbm.GbmMojoModel;
import hex.genmodel.utils.DistributionFamily;
import hex.quantile.Quantile;
import hex.quantile.QuantileModel;
import hex.tree.*;
import hex.tree.DTree.DecidedNode;
import hex.tree.DTree.LeafNode;
import hex.tree.DTree.UndecidedNode;
import org.apache.log4j.Logger;
import water.*;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.*;
import water.util.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import static hex.tree.ScoreBuildHistogram.DECIDED_ROW;
import static hex.util.LinearAlgebraUtils.toEigenArray;
/** Gradient Boosted Trees
*
* Based on "Elements of Statistical Learning, Second Edition, page 387"
*/
public class GBM extends SharedTree<GBMModel, GBMModel.GBMParameters, GBMModel.GBMOutput> {
private static final Logger LOG = Logger.getLogger(GBM.class);
@Override public ModelCategory[] can_build() {
return new ModelCategory[]{
ModelCategory.Regression,
ModelCategory.Binomial,
ModelCategory.Multinomial,
};
}
// Called from an http request
public GBM( GBMModel.GBMParameters parms ) { super(parms ); init(false); }
public GBM( GBMModel.GBMParameters parms, Key<GBMModel> key) { super(parms, key); init(false); }
public GBM(boolean startup_once) { super(new GBMModel.GBMParameters(),startup_once); }
@Override protected int nModelsInParallel(int folds) {
int defaultParallelization = nclasses() <= 2 ?
2 // for binomial and regression we can squeeze bit more performance by running 2 models in parallel
:
1; // for multinomial we need to build a tree per class - this is already massively parallel;
// adding another level of parallelism on top of that increases memory requirements and doesn't improve performance
return nModelsInParallel(folds, defaultParallelization);
}
/** Start the GBM training Job on an F/J thread. */
@Override protected GBMDriver trainModelImpl() {
return new GBMDriver();
}
/** Initialize the ModelBuilder, validating all arguments and preparing the
* training frame. This call is expected to be overridden in the subclasses
* and each subclass will start with "super.init();". This call is made
* by the front-end whenever the GUI is clicked, and needs to be fast;
* heavy-weight prep needs to wait for the trainModel() call.
*
* Validate the learning rate and distribution family. */
@Override public void init(boolean expensive) {
super.init(expensive);
// Initialize response based on given distribution family.
// Regression: initially predict the response mean
// Binomial: just class 0 (class 1 in the exact inverse prediction)
// Multinomial: Class distribution which is not a single value.
// However there is this weird tension on the initial value for
// classification: If you guess 0's (no class is favored over another),
// then with your first GBM tree you'll typically move towards the correct
// answer a little bit (assuming you have decent predictors) - and
// immediately the Confusion Matrix shows good results which gradually
// improve... BUT the Mean Squared Error will suck for unbalanced sets,
// even as the CM is good. That's because we want the predictions for the
// common class to be large and positive, and the rare class to be negative
// and instead they start around 0. Guessing initial zero's means the MSE
// is so bad, that the R^2 metric is typically negative (usually it's
// between 0 and 1).
// If instead you guess the mean (reversed through the loss function), then
// the zero-tree GBM model reports an MSE equal to the response variance -
// and an initial R^2 of zero. More trees gradually improves the R^2 as
// expected. However, all the minority classes have large guesses in the
// wrong direction, and it takes a long time (lotsa trees) to correct that
// - so your CM sucks for a long time.
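// Illustration (not part of the algorithm): for Bernoulli with 90% positive responses the
// mean pushed back through the link gives f0 = log(0.9/0.1) ~= 2.197; the zero-tree model then
// has MSE equal to the response variance and an R^2 of exactly 0, which subsequent trees improve,
// instead of the deeply negative R^2 an all-zero initialization produces on unbalanced data.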
if (expensive) {
if (error_count() > 0)
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(GBM.this);
if (_parms._distribution == DistributionFamily.AUTO) {
if (_nclass == 1) _parms._distribution = DistributionFamily.gaussian;
if (_nclass == 2) _parms._distribution = DistributionFamily.bernoulli;
if (_nclass >= 3) _parms._distribution = DistributionFamily.multinomial;
}
checkDistributions();
if (hasOffsetCol() && isClassifier() && (_parms._distribution == DistributionFamily.multinomial || _parms._distribution == DistributionFamily.custom)) {
error("_offset_column", "Offset is not supported for "+_parms._distribution+" distribution.");
}
if (_parms._monotone_constraints != null && _parms._monotone_constraints.length > 0 && !supportMonotoneConstraints(_parms._distribution)) {
error("_monotone_constraints", "Monotone constraints are only supported for Gaussian, Bernoulli, Tweedie and Quantile distributions, your distribution: " + _parms._distribution + ".");
}
if (_origTrain != null && _origTrain != _train) {
List<Double> projections = new ArrayList<>();
for (int i = 0; i < _origTrain.numCols(); i++) {
Vec currentCol = _origTrain.vec(i);
if (currentCol.isCategorical()) {
double[] actProjection = toEigenArray(currentCol);
for (double v : actProjection) {
projections.add(v);
}
}
}
double[] primitive_projections = new double[projections.size()];
for (int i = 0; i < projections.size(); i++) {
primitive_projections[i] = projections.get(i);
}
_orig_projection_array = primitive_projections;
}
}
switch( _parms._distribution) {
case bernoulli:
if( _nclass != 2 /*&& !couldBeBool(_response)*/)
error("_distribution", H2O.technote(2, "Binomial requires the response to be a 2-class categorical"));
break;
case quasibinomial:
if ( !_response.isNumeric() )
error("_distribution", H2O.technote(2, "Quasibinomial requires the response to be numeric."));
if ( _nclass != 2)
error("_distribution", H2O.technote(2, "Quasibinomial requires the response to be binary."));
break;
case modified_huber:
if( _nclass != 2 /*&& !couldBeBool(_response)*/)
error("_distribution", H2O.technote(2, "Modified Huber requires the response to be a 2-class categorical."));
break;
case multinomial:
if (!isClassifier()) error("_distribution", H2O.technote(2, "Multinomial requires a categorical response."));
break;
case huber:
if (isClassifier()) error("_distribution", H2O.technote(2, "Huber requires the response to be numeric."));
break;
case poisson:
if (isClassifier()) error("_distribution", H2O.technote(2, "Poisson requires the response to be numeric."));
break;
case gamma:
if (isClassifier()) error("_distribution", H2O.technote(2, "Gamma requires the response to be numeric."));
break;
case tweedie:
if (isClassifier()) error("_distribution", H2O.technote(2, "Tweedie requires the response to be numeric."));
break;
case gaussian:
if (isClassifier()) error("_distribution", H2O.technote(2, "Gaussian requires the response to be numeric."));
break;
case laplace:
if (isClassifier()) error("_distribution", H2O.technote(2, "Laplace requires the response to be numeric."));
break;
case quantile:
if (isClassifier()) error("_distribution", H2O.technote(2, "Quantile requires the response to be numeric."));
break;
case custom:
if(_parms._custom_distribution_func == null) error("_distribution", H2O.technote(2, "Custom requires custom function loaded."));
break;
case AUTO:
break;
default:
error("_distribution","Invalid distribution: " + _parms._distribution);
}
if( !(0. < _parms._learn_rate && _parms._learn_rate <= 1.0) )
error("_learn_rate", "learn_rate must be between 0 and 1");
if( !(0. < _parms._learn_rate_annealing && _parms._learn_rate_annealing <= 1.0) )
error("_learn_rate_annealing", "learn_rate_annealing must be between 0 and 1");
if( !(0. < _parms._col_sample_rate && _parms._col_sample_rate <= 1.0) )
error("_col_sample_rate", "col_sample_rate must be between 0 and 1");
if (_parms._max_abs_leafnode_pred <= 0)
error("_max_abs_leafnode_pred", "max_abs_leafnode_pred must be larger than 0.");
if (_parms._pred_noise_bandwidth < 0)
error("_pred_noise_bandwidth", "pred_noise_bandwidth must be >= 0.");
if ((_train != null) && (_parms._monotone_constraints != null)) {
TreeUtils.checkMonotoneConstraints(this, _train, _parms._monotone_constraints);
}
if ((_train != null) && (_parms._interaction_constraints != null)) {
if(_parms._categorical_encoding != Model.Parameters.CategoricalEncodingScheme.AUTO && _parms._categorical_encoding != Model.Parameters.CategoricalEncodingScheme.OneHotInternal){
error("_categorical_encoding", "Interaction constraints can be used when the categorical encoding is set to ``AUTO`` (``one_hot_internal`` or ``OneHotInternal``) only.");
}
TreeUtils.checkInteractionConstraints(this, _train, _parms._interaction_constraints);
if (error_count() == 0) {
_ics = _parms.interactionConstraints(_train);
}
}
}
private boolean supportMonotoneConstraints(DistributionFamily distributionFamily){
switch (distributionFamily) {
case gaussian:
case bernoulli:
case tweedie:
case quantile:
return true;
default:
return false;
}
}
// ----------------------
private class GBMDriver extends Driver {
private transient FrameMap frameMap;
private transient long _skippedCnt; // #observations that will be skipped because they have 0 weight or NA label in the training frame
@Override
protected Frame makeValidWorkspace() {
final int nClasses = numClassTrees();
final Vec[] tmp;
if (validWorkspaceCanReuseTrainWorkspace()) {
tmp = new Vec[nClasses];
for (int i = 0; i < nClasses; i++) {
tmp[i] = _train.vec(idx_tree(i));
assert tmp[i].isVolatile();
}
} else {
tmp = _valid.anyVec().makeVolatileDoubles(nClasses);
}
String[] tmpNames = new String[tmp.length];
for (int i = 0; i < tmpNames.length; i++)
tmpNames[i] = "__P_" + i;
return new Frame(tmpNames, tmp);
}
private boolean validWorkspaceCanReuseTrainWorkspace() {
// 1. only possible for CV models
if (!_parms._is_cv_model)
return false;
// 2. and only if training frame and validation frame are identical (except for weights)
// training frame can e.g. be sub/over-sampled for imbalanced problems
// shortcut: if responses are identical => frames must be identical as well (for CV models only!!!)
return _vresponse._key.equals(_response._key);
}
@Override protected boolean doOOBScoring() { return false; }
@Override protected void initializeModelSpecifics() {
frameMap = new FrameMap(GBM.this);
_mtry_per_tree = Math.max(1, (int)(_parms._col_sample_rate_per_tree * _ncols)); //per-tree
assert _parms.useColSampling() || _mtry_per_tree == _ncols;
if (!(1 <= _mtry_per_tree && _mtry_per_tree <= _ncols)) throw new IllegalArgumentException("Computed mtry_per_tree should be in interval <1,"+_ncols+"> but it is " + _mtry_per_tree);
_mtry = Math.max(1, (int)(_parms._col_sample_rate * _parms._col_sample_rate_per_tree * _ncols)); //per-split
assert _parms.useColSampling() || _mtry == _ncols;
if (!(1 <= _mtry && _mtry <= _ncols)) throw new IllegalArgumentException("Computed mtry should be in interval <1,"+_ncols+"> but it is " + _mtry);
// for Bernoulli, we compute the initial value with Newton-Raphson iteration, otherwise it might be NaN here
DistributionFamily distr = _parms._distribution;
_initialPrediction = _nclass > 2 || distr == DistributionFamily.laplace || distr == DistributionFamily.huber || distr == DistributionFamily.quantile ? 0 : getInitialValue();
if(distr == DistributionFamily.quasibinomial){
_model._output._quasibinomialDomains = new VecUtils.CollectDoubleDomain(null,2).doAll(_response).stringDomain(_response.isInt());
}
if (distr == DistributionFamily.bernoulli || distr == DistributionFamily.quasibinomial) {
if (hasOffsetCol())
_initialPrediction = getInitialValueBernoulliOffset(_train);
} else if (distr == DistributionFamily.laplace || distr == DistributionFamily.huber) {
_initialPrediction = getInitialValueQuantile(0.5);
} else if (distr == DistributionFamily.quantile) {
_initialPrediction = getInitialValueQuantile(_parms._quantile_alpha);
}
_model._output._init_f = _initialPrediction; //always write the initial value here (not just for Bernoulli)
if (_model.evalAutoParamsEnabled) {
_model.initActualParamValuesAfterOutputSetup(_nclass, isClassifier());
}
// Set the initial prediction into the tree column 0
if (_initialPrediction != 0.0) {
new FillVecWithConstant(_initialPrediction)
.doAll(vec_tree(_train, 0), _parms._build_tree_one_node); // Only setting tree-column 0
}
// Mark all rows that either have zero weights or the label is NA as decided
// This way we can avoid processing them when building trees
final long zeroWeights = hasWeightCol() ? _weights.length() - _weights.nzCnt() : 0;
if (zeroWeights > 0 || // has some zero weights or
response().naCnt() > 0) { // there are NAs in the response
Vec[] vecs = new Vec[]{_response};
if (hasWeightCol())
vecs = ArrayUtils.append(vecs, _weights);
for (int k = 0; k < _nclass; k++) {
Vec nidsVecK = vec_nids(_train, k);
if (nidsVecK.min() == DECIDED_ROW) {
// classes not present in the training frame are skipped
assert _model._output._distribution[k] == 0;
assert nidsVecK.isConst();
continue;
}
vecs = ArrayUtils.append(vecs, nidsVecK);
}
_skippedCnt = new MarkDecidedRows().doAll(vecs).markedCnt;
assert _skippedCnt >= zeroWeights;
assert _skippedCnt >= response().naCnt();
assert _skippedCnt <= response().naCnt() + zeroWeights;
} else
_skippedCnt = 0;
}
class MarkDecidedRows extends MRTask<MarkDecidedRows> {
int markedCnt;
@Override
public void map(Chunk[] cs) {
final int colStart;
final Chunk y;
final Chunk weights;
if (hasWeightCol()) {
y = cs[0];
weights = cs[1];
colStart = 2;
} else {
y = cs[0];
weights = new C0DChunk(1, cs[0]._len);
colStart = 1;
}
for (int row = 0; row < y._len; row++) {
if (weights.atd(row) == 0 || y.isNA(row)) {
markedCnt++;
for (int c = colStart; c < cs.length; c++) {
cs[c].set(row, DECIDED_ROW);
}
}
}
}
@Override
public void reduce(MarkDecidedRows mrt) {
markedCnt += mrt.markedCnt;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
}
/**
* Helper to compute the initial value for Laplace/Huber/Quantile (incl. optional offset and obs weights)
* @return weighted median of response - offset
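* <p>Illustration: for the Laplace distribution the optimal constant is the weighted median of
* {@code y - o}, e.g. unit-weight responses {1, 2, 10} (no offset) yield an initial prediction
* of 2 (the median) rather than the mean ~4.33; for the quantile distribution the same task is
* run with {@code quantile = _quantile_alpha} instead of 0.5.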
*/
private double getInitialValueQuantile(double quantile) {
// obtain y - o
Vec y = hasOffsetCol()
? new ResponseLessOffsetTask(frameMap).doAll(1, Vec.T_NUM, _train).outputFrame().anyVec()
: response();
// Now compute (weighted) quantile of y - o
double res;
QuantileModel qm = null;
Frame tempFrame = null;
try {
tempFrame = new Frame(Key.<Frame>make(H2O.SELF), new String[]{"y"}, new Vec[]{y});
if (hasWeightCol()) tempFrame.add("w", _weights);
DKV.put(tempFrame);
QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
parms._train = tempFrame._key;
parms._probs = new double[]{quantile};
parms._weights_column = hasWeightCol() ? "w" : null;
Job<QuantileModel> job1 = new Quantile(parms).trainModel();
qm = job1.get();
res = qm._output._quantiles[0][0];
} finally {
if (qm!=null) qm.remove();
if (tempFrame!=null) DKV.remove(tempFrame._key);
}
return res;
}
/**
* Helper to compute the initial value for Bernoulli for offset != 0
*/
private double getInitialValueBernoulliOffset(Frame train) {
LOG.info("Running Newton-Raphson iteration to find the initial value since offsets are specified.");
double delta;
int count = 0;
double tol = 1e-4;
//From R GBM vignette:
//For speed, gbm() does only one step of the Newton-Raphson algorithm
//rather than iterating to convergence. No appreciable loss of accuracy
//since the next boosting iteration will simply correct for the prior iteration's
//inadequacy.
int N = 1; //one step is enough - same as R
double init = 0; //start with initial value of 0 for convergence
do {
double newInit = new NewtonRaphson(frameMap, DistributionFactory.getDistribution(_parms), init).doAll(train).value();
delta = Math.abs(init - newInit);
init = newInit;
LOG.info("Iteration " + (++count) + ": initial value: " + init);
} while (count < N && delta >= tol);
if (delta > tol) LOG.warn("Not fully converged.");
LOG.info("Newton-Raphson iteration ran for " + count + " iteration(s). Final residual: " + delta);
return init;
}
private static final double MIN_LOG_TRUNC = -19;
private static final double MAX_LOG_TRUNC = 19;
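// Illustration of the clamping below (not part of the algorithm): Poisson trees carry the raw
// log-scale prediction, so a leaf is clamped directly into [MIN_LOG_TRUNC, MAX_LOG_TRUNC];
// for gamma/tweedie the running score f = (tree sum + offset) of the leaf's rows is added first,
// so the clamp becomes pred <= MAX_LOG_TRUNC - max(f) and pred >= MIN_LOG_TRUNC - min(f),
// keeping the later Math.exp() at scoring time from overflowing.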
private void truncatePreds(final DTree tree, int firstLeafIndex, DistributionFamily dist) {
if (firstLeafIndex==tree._len) return;
ComputeMinMax minMax = new ComputeMinMax(frameMap, firstLeafIndex, tree._len).doAll(_train);
if (LOG.isTraceEnabled()) {
LOG.trace("Number of leaf nodes: " + minMax._mins.length);
LOG.trace("Min: " + Arrays.toString(minMax._mins));
LOG.trace("Max: " + Arrays.toString(minMax._maxs));
}
//loop over leaf nodes only: starting at leaf index
for (int i = 0; i < tree._len - firstLeafIndex; i++) {
final LeafNode node = ((LeafNode) tree.node(firstLeafIndex + i));
int nidx = node.nid();
float nodeMin = minMax._mins[nidx- firstLeafIndex];
float nodeMax = minMax._maxs[nidx- firstLeafIndex];
if (LOG.isTraceEnabled()) LOG.trace("Node: " + nidx + " min/max: " + nodeMin + "/" + nodeMax);
// https://github.com/cran/gbm/blob/master/src/poisson.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/poisson.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/poisson.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/gamma.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/gamma.cpp
// https://github.com/harrysouthworth/gbm/blob/master/src/tweedie.cpp
// https://github.com/gbm-developers/gbm/blob/master/src/tweedie.cpp
double val = node._pred;
if (dist == DistributionFamily.gamma || dist == DistributionFamily.tweedie) //only for gamma/tweedie
val += nodeMax;
if (val > MAX_LOG_TRUNC) {
if (LOG.isDebugEnabled()) LOG.debug("Truncating large positive leaf prediction (log): " + node._pred + " to " + (MAX_LOG_TRUNC - nodeMax));
node._pred = (float) (MAX_LOG_TRUNC - nodeMax);
}
val = node._pred;
if (dist == DistributionFamily.gamma || dist == DistributionFamily.tweedie) //only for gamma/tweedie
val += nodeMin;
if (val < MIN_LOG_TRUNC) {
if (LOG.isDebugEnabled()) LOG.debug("Truncating large negative leaf prediction (log): " + node._pred + " to " + (MIN_LOG_TRUNC - nodeMin));
node._pred = (float) (MIN_LOG_TRUNC - nodeMin);
}
if (node._pred < MIN_LOG_TRUNC || node._pred > MAX_LOG_TRUNC) {
LOG.warn("Terminal node prediction outside of allowed interval in log-space: "
+ node._pred + " (should be in " + MIN_LOG_TRUNC + "..." + MAX_LOG_TRUNC + ").");
}
}
}
// --------------------------------------------------------------------------
// Build the next k-trees, which try to correct the residual error from
// the prior trees.
@Override protected boolean buildNextKTrees() {
// We're going to build K (nclass) trees - each focused on correcting
// errors for a single class.
final DTree[] ktrees = new DTree[_nclass];
// Define a "working set" of leaf splits, from here to tree._len
int[] leaves = new int[_nclass];
// Get distribution
Distribution distributionImpl = DistributionFactory.getDistribution(_parms);
// Compute predictions and resulting residuals
// ESL2, page 387, Steps 2a, 2b
// fills "Work" columns for all rows (incl. OOB) with the residuals
double huberDelta = Double.NaN;
if (_parms._distribution == DistributionFamily.huber) {
// Jerome Friedman 1999: Greedy Function Approximation: A Gradient Boosting Machine
// https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
// compute absolute diff |y-(f+o)| for all rows
Vec diff = new ComputeAbsDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
// compute weighted alpha-quantile of the absolute residual -> this is the delta for the huber loss
huberDelta = MathUtils.computeWeightedQuantile(_weights, diff, _parms._huber_alpha);
distributionImpl.setHuberDelta(huberDelta);
// now compute residuals using the gradient of the huber loss (with a globally adjusted delta)
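// Sketch of the Huber pseudo-residual written by StoreResiduals (up to H2O's deviance scaling):
// r_i = y_i - f_i if |y_i - f_i| <= huberDelta, otherwise r_i = huberDelta * sign(y_i - f_i)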
new StoreResiduals(frameMap, distributionImpl).doAll(_train, _parms._build_tree_one_node);
} else {
// compute predictions and residuals in one shot
new ComputePredAndRes(frameMap, _nclass, _model._output._distribution, distributionImpl)
.doAll(_train, _parms._build_tree_one_node);
}
for (int k = 0; k < _nclass; k++) {
if (LOG.isTraceEnabled() && ktrees[k]!=null) {
LOG.trace("Updated predictions in WORK col for class " + k + ":\n" + new Frame(new String[]{"WORK"},new Vec[]{vec_work(_train, k)}).toTwoDimTable());
}
}
// ----
// ESL2, page 387. Step 2b ii.
// One Big Loop till the ktrees are of proper depth.
// Adds a layer to the trees each pass.
Constraints cs = _parms.constraints(_train);
// Initialize branch interaction constraints
BranchInteractionConstraints bics = null;
if(_parms._interaction_constraints != null) {
bics = _parms.initialInteractionConstraints(_ics);
}
growTrees(ktrees, leaves, _rand, cs, bics);
for (int k = 0; k < _nclass; k++) {
if (LOG.isTraceEnabled() && ktrees[k]!=null) {
LOG.trace("Grew trees. Updated NIDs for class " + k + ":\n" + new Frame(new String[]{"NIDS"},new Vec[]{vec_nids(_train, k)}).toTwoDimTable());
}
}
// ----
// ESL2, page 387. Step 2b iii. Compute the gammas (leaf node predictions === fit best constant), and store them back
// into the tree leaves. Includes learn_rate.
GammaPass gp = new GammaPass(frameMap, ktrees, leaves, distributionImpl, _nclass);
gp.doAll(_train);
if (_parms._distribution == DistributionFamily.laplace) {
fitBestConstantsQuantile(ktrees, leaves[0], 0.5); //special case for Laplace: compute the median for each leaf node and store that as prediction
} else if (_parms._distribution == DistributionFamily.quantile) {
if(cs == null) {
fitBestConstantsQuantile(ktrees, leaves[0], _parms._quantile_alpha); //compute the alpha-quantile for each leaf node and store that as prediction
} else {
fitBestConstants(ktrees, leaves, gp, cs); // compute quantile monotone constraints using precomputed parent prediction
resetQuantileConstants(ktrees, _parms._quantile_alpha, cs);
}
} else if (_parms._distribution == DistributionFamily.huber) {
fitBestConstantsHuber(ktrees, leaves[0], huberDelta); //compute the alpha-quantile for each leaf node and store that as prediction
} else {
fitBestConstants(ktrees, leaves, gp, cs);
}
// Apply a correction for strong mispredictions (otherwise deviance can explode)
if (_parms._distribution == DistributionFamily.gamma ||
_parms._distribution == DistributionFamily.poisson ||
_parms._distribution == DistributionFamily.tweedie) {
assert(_nclass == 1);
truncatePreds(ktrees[0], leaves[0], _parms._distribution);
}
if ((cs != null) && constraintCheckEnabled()) {
_job.update(0, "Checking monotonicity constraints on the final model");
checkConstraints(ktrees, leaves, cs);
}
// ----
// ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the
// new tree, in the 'tree' columns. Also, zap the NIDs for next pass.
// Tree <== f(Tree)
// Nids <== 0
new AddTreeContributions(
frameMap, ktrees, _parms._pred_noise_bandwidth, _parms._seed, _parms._ntrees, _model._output._ntrees
).doAll(_train);
// sanity check
for (int k = 0; k < _nclass; k++) {
assert ktrees[k] == null || vec_nids(_train, k).nzCnt() == _skippedCnt;
}
// Grow the model by K-trees
_model._output.addKTrees(ktrees);
// If there is no row/col-sampling and trees are just roots with 0 prediction (==no change) we can stop building
if (!_parms.isStochastic()) {
boolean converged = true;
for (DTree tree : ktrees) {
if (tree == null)
continue;
DTree.Node root = tree.root();
converged = root instanceof LeafNode && ((LeafNode) root)._pred == 0.0f;
if (! converged) {
break;
}
}
if (converged) {
LOG.warn("Model cannot be further improved by building more trees, " +
"stopping with ntrees=" + _model._output._ntrees + ".");
return true;
}
}
boolean converged = effective_learning_rate() < 1e-6;
if (converged) {
LOG.warn("Effective learning rate dropped below 1e-6 (" + _parms._learn_rate + " * " + _parms._learn_rate_annealing + "^" + (_model._output._ntrees-1) + ") - stopping the model now.");
}
return converged;
}
/**
* How many trees are actually built for the number of classes the model uses.
* @return number of trees
*/
private int numClassTrees() {
return _nclass == 2 ? 1 : _nclass; // Boolean Optimization (only one tree needed for 2-class problems)
}
/**
* Grow k regression trees (k=1 for regression and binomial, k=N for classification with N classes)
* @param ktrees k trees to grow (must be properly initialized)
* @param leaves workspace to store the leaf node starting index (k-dimensional - one per tree)
* @param rand PRNG for reproducibility
*/
private void growTrees(DTree[] ktrees, int[] leaves, Random rand, Constraints cs, BranchInteractionConstraints bics) {
// Initial set of histograms. All trees; one leaf per tree (the root
// leaf); all columns
DHistogram hcs[][][] = new DHistogram[_nclass][1/*just root leaf*/][_ncols];
// Adjust real bins for the top-levels
int adj_nbins = Math.max(_parms._nbins_top_level,_parms._nbins);
long rseed = rand.nextLong();
// initialize trees
for (int k = 0; k < numClassTrees(); k++) {
// Initially setup as-if an empty-split had just happened
if (_model._output._distribution[k] != 0) {
ktrees[k] = new DTree(_train, _ncols, _mtry, _mtry_per_tree, rseed, _parms);
DHistogram[] hist = DHistogram.initialHist(_train, _ncols, adj_nbins, hcs[k][0], rseed, _parms, getGlobalSplitPointsKeys(), cs, false, _ics);
new UndecidedNode(ktrees[k], DTree.NO_PARENT, hist, cs, bics); // The "root" node
}
}
// Sample - mark the lines by putting 'OUT_OF_BAG' into nid() vector
if (_parms.useRowSampling()) {
Sample ss[] = new Sample[_nclass];
for (int k = 0; k < _nclass; k++)
if (ktrees[k] != null)
ss[k] = new Sample(ktrees[k], _parms._sample_rate, _parms._sample_rate_per_class).dfork(null, new Frame(vec_nids(_train, k), _response), _parms._build_tree_one_node);
for (int k = 0; k < _nclass; k++) {
if (ss[k] != null) {
ss[k].getResult();
if (LOG.isTraceEnabled() && ktrees[k]!=null) {
LOG.trace("Sampled OOB rows. NIDS:\n" + new Frame(vec_nids(_train, k)).toTwoDimTable());
}
}
}
}
// ----
// ESL2, page 387. Step 2b ii.
// One Big Loop till the ktrees are of proper depth.
// Adds a layer to the trees each pass.
int depth = 0;
for (; depth < _parms._max_depth; depth++) {
hcs = buildLayer(_train, _parms._nbins, ktrees, leaves, hcs, _parms._build_tree_one_node);
// If we did not make any new splits, then the tree is split-to-death
if (hcs == null) break;
}
// Each tree bottomed-out in a DecidedNode; go 1 more level and insert
// LeafNodes to hold predictions.
for (int k = 0; k < _nclass; k++) {
DTree tree = ktrees[k];
if (tree == null) continue;
int leaf = tree.len();
leaves[k] = leaf; //record the size of the tree before splitting the bottom nodes as the starting index for the leaf node indices
for (int nid = 0; nid < leaf; nid++) {
if (tree.node(nid) instanceof DecidedNode) {
DecidedNode dn = tree.decided(nid);
if (dn._split == null) { // No decision here, no row should have this NID now
if (nid == 0) // Handle the trivial non-splitting tree
new LeafNode(tree, DTree.NO_PARENT, 0);
continue;
}
for (int i = 0; i < dn._nids.length; i++) { //L/R children
int cnid = dn._nids[i];
if (cnid == ScoreBuildHistogram.UNDECIDED_CHILD_NODE_ID || // Bottomed out (predictors or responses known constant)
tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
(tree.node(cnid) instanceof DecidedNode && // Or not possible to split
((DecidedNode) tree.node(cnid))._split == null)) {
dn._nids[i] = new LeafNode(tree, nid).nid(); // Mark a leaf here
}
}
}
}
} // -- k-trees are done
}
// Jerome Friedman 1999: Greedy Function Approximation: A Gradient Boosting Machine
// https://statweb.stanford.edu/~jhf/ftp/trebst.pdf
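// The leaf update below follows Friedman's one-step approximation for the Huber line search:
// gamma_j = median_j(r) + avg_{i in leaf j}[ sign(r_i - median_j) * min(|r_i - median_j|, huberDelta) ]
// i.e. the per-leaf weighted median of the residuals (StratifiedQuantilesTask) plus a
// delta-capped mean correction (HuberLeafMath), scaled by the effective learning rate and
// clamped to +/- max_abs_leafnode_pred.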
private void fitBestConstantsHuber(DTree[] ktrees, int firstLeafIndex, double huberDelta) {
if (firstLeafIndex == ktrees[0]._len) return; // no splits happened - nothing to do
assert(_nclass==1);
// get diff y-(f+o) and weights and strata (node idx)
Vec diff = new ComputeDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
Vec weights = hasWeightCol() ? _train.vecs()[idx_weight()] : null;
Vec strata = vec_nids(_train,0);
// compute median diff for each leaf node
Quantile.StratifiedQuantilesTask sqt = new Quantile.StratifiedQuantilesTask(null, 0.5 /*median of weighted residuals*/, diff, weights, strata, QuantileModel.CombineMethod.INTERPOLATE);
H2O.submitTask(sqt);
sqt.join();
// subtract median(diff) from residuals for all observations of each leaf
DiffMinusMedianDiff hp = new DiffMinusMedianDiff(strata, sqt._quantiles /*median residuals per leaf*/);
Frame tmpFrame1 = new Frame(new String[]{"strata","diff"}, new Vec[]{strata,diff});
hp.doAll(tmpFrame1);
Vec diffMinusMedianDiff = diff;
// for each leaf, compute the mean of Math.signum(resMinusMedianRes) * Math.min(Math.abs(resMinusMedianRes), huberDelta),
// where huberDelta is the alpha-percentile of the residual across all observations
Frame tmpFrame2 = new Frame(_train.vecs());
tmpFrame2.add("resMinusMedianRes", diffMinusMedianDiff);
double[] huberGamma = new HuberLeafMath(frameMap, huberDelta, strata).doAll(tmpFrame2)._huberGamma;
// now assign the median per leaf + the above huberGamma[i] correction to each leaf
final DTree tree = ktrees[0];
for (int i = 0; i < sqt._quantiles.length; i++) {
double huber = (sqt._quantiles[i] /*median*/ + huberGamma[i]);
if (Double.isNaN(sqt._quantiles[i])) continue; //no active rows for this NID
double val = effective_learning_rate() * huber;
assert !Double.isNaN(val) && !Double.isInfinite(val);
if (val > _parms._max_abs_leafnode_pred) val = _parms._max_abs_leafnode_pred;
if (val < -_parms._max_abs_leafnode_pred) val = -_parms._max_abs_leafnode_pred;
((LeafNode) tree.node(sqt._nids[i]))._pred = (float) val;
if (LOG.isTraceEnabled()) { LOG.trace("Leaf " + sqt._nids[i] + " has huber value: " + huber); }
}
diffMinusMedianDiff.remove();
}
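// Worked example: with learn_rate = 0.1 and learn_rate_annealing = 0.99, the 100th tree is added
// with an effective rate of 0.1 * 0.99^99 ~= 0.037; once this product drops below 1e-6,
// buildNextKTrees() stops adding trees (see the convergence check there).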
private double effective_learning_rate() {
return _parms._learn_rate * Math.pow(_parms._learn_rate_annealing, (_model._output._ntrees-1));
}
private void fitBestConstantsQuantile(DTree[] ktrees, int firstLeafIndex, double quantile) {
if (firstLeafIndex == ktrees[0]._len) return; // no splits happened - nothing to do
assert(_nclass==1);
Vec diff = new ComputeDiff(frameMap).doAll(1, (byte)3 /*numeric*/, _train).outputFrame().anyVec();
Vec weights = hasWeightCol() ? _train.vecs()[idx_weight()] : null;
Vec strata = vec_nids(_train,0);
// compute quantile for all leaf nodes
QuantileModel.CombineMethod combine_method = QuantileModel.CombineMethod.INTERPOLATE;
Quantile.StratifiedQuantilesTask sqt = new Quantile.StratifiedQuantilesTask(null, quantile, diff, weights, strata, combine_method);
H2O.submitTask(sqt);
sqt.join();
final DTree tree = ktrees[0];
for (int i = 0; i < sqt._quantiles.length; i++) {
double leafQuantile = sqt._quantiles[i];
if (Double.isNaN(leafQuantile)) continue; //no active rows for this NID
double val = effective_learning_rate() * leafQuantile;
if (val > _parms._max_abs_leafnode_pred) val = _parms._max_abs_leafnode_pred;
if (val < -_parms._max_abs_leafnode_pred) val = -_parms._max_abs_leafnode_pred;
((LeafNode) tree.node(sqt._nids[i]))._pred = (float) val;
}
}
/**
* This method recomputes the leaf node predictions to get more precise values while respecting
* the column constraints in subtrees
* @param ktrees array of all trained trees
* @param quantile quantile alpha parameter
* @param cs constraint object to get information about constraints for each column
*/
private void resetQuantileConstants(DTree[] ktrees, double quantile, Constraints cs) {
Vec diff = new ComputeDiff(frameMap).doAll(1, (byte)3, _train).outputFrame().anyVec();
Vec weights = hasWeightCol() ? _train.vecs()[idx_weight()] : null;
Vec strata = vec_nids(_train,0);
// compute quantile for all leaf nodes
QuantileModel.CombineMethod combine_method = QuantileModel.CombineMethod.INTERPOLATE;
Quantile.StratifiedQuantilesTask sqt = new Quantile.StratifiedQuantilesTask(null, quantile, diff, weights, strata, combine_method);
H2O.submitTask(sqt);
sqt.join();
for (int k = 0; k < _nclass; k++) {
final DTree tree = ktrees[k];
if (tree == null || tree.len() < 2) continue;
float[] mins = new float[tree._len];
int[] min_ids = new int[tree._len];
float[] maxs = new float[tree._len];
int[] max_ids = new int[tree._len];
int dnSize = tree._len - tree._leaves; // index where the leaf nodes start
rollupMinMaxPreds(tree, tree.root(), mins, min_ids, maxs, max_ids);
for (int i = dnSize; i < tree.len(); i++) {
LeafNode node = (LeafNode)tree.node(i);
int quantileId = i - dnSize;
double leafQuantile = sqt._quantiles[quantileId];
if (Double.isNaN(leafQuantile)) continue; // quantile can be NaN if CV or weights column is enabled
boolean canBeReplaced = true;
DTree.Node tmpNode = tree.node(i);
while(tmpNode.pid() != DTree.NO_PARENT) {
DecidedNode parent = (DecidedNode) tree.node(tmpNode._pid);
int constraint = cs.getColumnConstraint(parent._split._col);
if (parent._split.naSplitDir() == DHistogram.NASplitDir.NAvsREST || constraint == 0) {
tmpNode = parent;
continue;
}
if (constraint > 0) {
if (leafQuantile > mins[parent._nids[0]] || leafQuantile < maxs[parent._nids[1]]) {
canBeReplaced = false;
break;
}
} else if (leafQuantile < maxs[parent._nids[0]] || leafQuantile > mins[parent._nids[1]]) {
canBeReplaced = false;
break;
}
tmpNode = parent;
}
if(canBeReplaced){
node._pred = (float) leafQuantile;
}
}
}
}
private void fitBestConstants(DTree[] ktrees, int[] leafs, GammaPass gp, Constraints cs) {
final boolean useSplitPredictions = cs != null && cs.useBounds();
double m1class = (_nclass > 1 && _parms._distribution == DistributionFamily.multinomial) ||
(_nclass > 2 && _parms._distribution == DistributionFamily.custom) ? (double) (_nclass - 1) / _nclass : 1.0; // K-1/K for multinomial
for (int k = 0; k < _nclass; k++) {
final DTree tree = ktrees[k];
if (tree == null) continue;
if (LOG.isTraceEnabled()) {
for (int i = 0; i < tree._len - leafs[k]; i++)
LOG.trace(tree.node(leafs[k] + i).toString());
}
for (int i = 0; i < tree._len - leafs[k]; i++) {
final LeafNode leafNode = (LeafNode) tree.node(leafs[k] + i);
final double gamma = useSplitPredictions ? leafNode.getSplitPrediction() : gp.gamma(k, i);
double gf = effective_learning_rate() * m1class * gamma;
// In the multinomial case, check for very large values (which will get exponentiated later)
if (_parms._distribution == DistributionFamily.multinomial || (_parms._distribution == DistributionFamily.custom && _nclass > 2)) {
if (gf > 1e4) gf = 1e4f; // Cap prediction, will already overflow during Math.exp(gf)
else if (gf < -1e4) gf = -1e4f;
}
if (Double.isNaN(gf)) gf=0;
else if (Double.isInfinite(gf)) gf=Math.signum(gf)*1e4f;
if (gf > _parms._max_abs_leafnode_pred) gf = _parms._max_abs_leafnode_pred;
if (gf < -_parms._max_abs_leafnode_pred) gf = -_parms._max_abs_leafnode_pred;
leafNode._pred = (float) gf;
}
}
}
private boolean constraintCheckEnabled() {
return Boolean.parseBoolean(getSysProperty("gbm.monotonicity.checkEnabled", "true"));
}
private void checkConstraints(DTree[] ktrees, int[] leafs, Constraints cs) {
for (int k = 0; k < _nclass; k++) {
final DTree tree = ktrees[k];
if (tree == null) continue;
float[] mins = new float[tree._len];
int[] min_ids = new int[tree._len];
float[] maxs = new float[tree._len];
int[] max_ids = new int[tree._len];
rollupMinMaxPreds(tree, tree.root(), mins, min_ids, maxs, max_ids);
for (int i = 0; i < tree._len - leafs.length; i++) {
DTree.Node node = tree.node(i);
if (! (node instanceof DecidedNode))
continue;
DecidedNode dn = ((DecidedNode) node);
if (dn._split == null)
continue;
int constraint = cs.getColumnConstraint(dn._split._col);
if (dn._split.naSplitDir() == DHistogram.NASplitDir.NAvsREST) {
// NAs are not subject to constraints, we don't have to check the monotonicity on "NA vs REST" type of splits
continue;
}
if (constraint > 0) {
if (maxs[dn._nids[0]] > mins[dn._nids[1]]) {
String message = "Monotonicity constraint " + constraint + " violated on column '" + _train.name(dn._split._col) + "' (max(left) > min(right)): " +
maxs[dn._nids[0]] + " > " + mins[dn._nids[1]] +
"\nNode: " + node +
"\nLeft Node (max): " + tree.node(max_ids[dn._nids[0]]) +
"\nRight Node (min): " + tree.node(min_ids[dn._nids[1]]);
throw new IllegalStateException(message);
}
} else if (constraint < 0) {
if (mins[dn._nids[0]] < maxs[dn._nids[1]]) {
String message = "Monotonicity constraint " + constraint + " violated on column '" + _train.name(dn._split._col) + "' (min(left) < max(right)): " +
mins[dn._nids[0]] + " < " + maxs[dn._nids[1]] +
"\nNode: " + node +
"\nLeft Node (min): " + tree.node(min_ids[dn._nids[0]]) +
"\nRight Node (max): " + tree.node(max_ids[dn._nids[1]]);
throw new IllegalStateException(message);
}
}
}
}
}
private void rollupMinMaxPreds(DTree tree, DTree.Node node, float[] mins, int min_ids[], float[] maxs, int[] max_ids) {
if (node instanceof LeafNode) {
mins[node.nid()] = ((LeafNode) node)._pred;
min_ids[node.nid()] = node.nid();
maxs[node.nid()] = ((LeafNode) node)._pred;
max_ids[node.nid()] = node.nid();
return;
}
DecidedNode dn = (DecidedNode) node;
rollupMinMaxPreds(tree, tree.node(dn._nids[0]), mins, min_ids, maxs, max_ids);
rollupMinMaxPreds(tree, tree.node(dn._nids[1]), mins, min_ids, maxs, max_ids);
final int min_id = mins[dn._nids[0]] < mins[dn._nids[1]] ? dn._nids[0] : dn._nids[1];
mins[node.nid()] = mins[min_id];
min_ids[node.nid()] = min_ids[min_id];
final int max_id = maxs[dn._nids[0]] > maxs[dn._nids[1]] ? dn._nids[0] : dn._nids[1];
maxs[node.nid()] = maxs[max_id];
max_ids[node.nid()] = max_ids[max_id];
}
@Override protected GBMModel makeModel(Key modelKey, GBMModel.GBMParameters parms) {
return new GBMModel(modelKey, parms, new GBMModel.GBMOutput(GBM.this));
}
@Override
protected void doInTrainingCheckpoint() {
try {
String modelFile = _parms._in_training_checkpoints_dir + "/" + _model._key.toString() + ".ntrees_" + _model._output._ntrees;
GBMModel modelClone = _model.clone();
modelClone.setInputParms(_parms);
modelClone._key = Key.make(_model._key + "." + _model._output._ntrees);
modelClone._output = (GBMModel.GBMOutput) _model._output.clone();
modelClone._output.changeModelMetricsKey(modelClone._key);
modelClone.exportBinaryModel(modelFile, true);
} catch (IOException e) {
throw new RuntimeException("Failed to write GBM checkpoint" + _model._key.toString(), e);
}
}
}
//--------------------------------------------------------------------------------------------------------------------
private static class FillVecWithConstant extends MRTask<FillVecWithConstant> {
private double init;
public FillVecWithConstant(double d) {
init = d;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk tree) {
if (tree instanceof C8DVolatileChunk) {
Arrays.fill(((C8DVolatileChunk) tree).getValues(), init);
} else {
for (int i = 0; i < tree._len; i++)
tree.set(i, init);
}
}
}
private static class ResponseLessOffsetTask extends MRTask<ResponseLessOffsetTask> {
private FrameMap fm;
public ResponseLessOffsetTask(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk resp = chks[fm.responseIndex];
final Chunk offset = chks[fm.offsetIndex];
for (int i = 0; i < chks[0]._len; ++i)
nc[0].addNum(resp.atd(i) - offset.atd(i)); // y - o
}
}
/**
* Newton-Raphson fixpoint iteration to find a self-consistent initial value
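* <p>Illustration of the single step implemented by map()/reduce()/value() for Bernoulli:
* {@code f_new = f_old + sum_i w_i*(y_i - p_i) / sum_i w_i*p_i*(1 - p_i)} with
* {@code p_i = linkInv(offset_i + f_old)}.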
*/
private static class NewtonRaphson extends MRTask<NewtonRaphson> {
private FrameMap fm;
private Distribution dist;
private double _init;
private double _numerator;
private double _denominator;
public NewtonRaphson(FrameMap frameMap, Distribution distribution, double initialValue) {
assert frameMap != null && distribution != null;
fm = frameMap;
dist = distribution;
_init = initialValue;
_numerator = 0;
_denominator = 0;
}
public double value() {
return _init + _numerator / _denominator;
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = chks[fm.offsetIndex];
Chunk weight = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < ys._len; row++) {
double w = weight.atd(row);
if (w == 0) continue;
if (ys.isNA(row)) continue;
double y = ys.atd(row);
double o = offset.atd(row);
double p = dist.linkInv(o + _init);
_numerator += w * (y - p);
_denominator += w * p * (1. - p);
}
}
@Override
public void reduce(NewtonRaphson mrt) {
_numerator += mrt._numerator;
_denominator += mrt._denominator;
}
}
/**
* Compute Residuals
* Do this for all rows, whether OOB or not
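* <p>The "work" column receives the negative half-gradient of the deviance at the current score f
* (illustration only; the exact scaling is delegated to {@code Distribution.negHalfGradient}):
* Gaussian: {@code y - f}; Bernoulli: {@code y - 1/(1 + exp(-f))}; multinomial: per-class
* {@code y_k - softmax_k(f)}.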
*/
private static class ComputePredAndRes extends MRTask<ComputePredAndRes> {
private FrameMap fm;
private int nclass;
private boolean[] out;
private Distribution dist;
public ComputePredAndRes(FrameMap frameMap, int nClasses, double[] outputDistribution, Distribution distribution) {
fm = frameMap;
nclass = nClasses;
dist = distribution;
out = new boolean[outputDistribution.length];
for (int i = 0; i < out.length; i++) out[i] = (outputDistribution[i] != 0);
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
C8DVolatileChunk wk = (C8DVolatileChunk) chks[fm.work0Index]; // Place to store residuals
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
double[] fs = nclass > 1 ? new double[nclass + 1] : null;
for (int row = 0; row < wk._len; row++) {
double weight = weights.atd(row);
if (weight == 0) continue;
if (ys.isNA(row)) continue;
double f = preds.atd(row) + offset.atd(row);
double y = ys.atd(row);
if (LOG.isTraceEnabled()) LOG.trace(f + " vs " + y); //expect that the model predicts very negative values for 0 and very positive values for 1
if ((dist._family == DistributionFamily.multinomial && fs != null) || (dist._family == DistributionFamily.custom && nclass > 2)) {
double sum = score1static(chks, fm.tree0Index, 0.0 /*not used for multiclass*/, fs, row, dist, nclass);
if (Double.isInfinite(sum)) { // Overflow (happens for constant responses)
for (int k = 0; k < nclass; k++) {
wk = (C8DVolatileChunk) chks[fm.work0Index + k];
wk.getValues()[row] = ((float) dist.negHalfGradient(y, (Double.isInfinite(fs[k + 1]) ? 1.0f : 0.0f), k));
}
} else {
for (int k = 0; k < nclass; k++) { // Save as a probability distribution
if (out[k]) {
wk = (C8DVolatileChunk) chks[fm.work0Index + k];
wk.getValues()[row] = ((float) dist.negHalfGradient(y, (float) (fs[k + 1] / sum), k));
}
}
}
} else {
wk.getValues()[row] = ((float) dist.negHalfGradient(y, f));
}
}
}
}
private static class ComputeMinMax extends MRTask<ComputeMinMax> {
private FrameMap fm;
int firstLeafIdx;
int _totalNumNodes;
float[] _mins;
float[] _maxs;
public ComputeMinMax(FrameMap frameMap, int firstLeafIndex, int totalNumNodes) {
fm = frameMap;
firstLeafIdx = firstLeafIndex;
_totalNumNodes = totalNumNodes;
}
@Override
public void map(Chunk[] chks) {
int len = _totalNumNodes - firstLeafIdx; // number of leaves
_mins = new float[len];
_maxs = new float[len];
Arrays.fill(_mins, Float.MAX_VALUE);
Arrays.fill(_maxs, -Float.MAX_VALUE);
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
Chunk nids = chks[fm.nids0Index];
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < preds._len; row++) {
if (ys.isNA(row)) continue;
if (weights.atd(row) == 0) continue;
int nid = (int) nids.at8(row);
assert (nid != ScoreBuildHistogram.UNDECIDED_CHILD_NODE_ID);
if (nid < 0) continue; //skip OOB and otherwise skipped rows
float f = (float) (preds.atd(row) + offset.atd(row));
int idx = nid - firstLeafIdx;
_mins[idx] = Math.min(_mins[idx], f);
_maxs[idx] = Math.max(_maxs[idx], f);
}
}
@Override
public void reduce(ComputeMinMax mrt) {
ArrayUtils.reduceMin(_mins, mrt._mins);
ArrayUtils.reduceMax(_maxs, mrt._maxs);
}
}
private static class ComputeDiff extends MRTask<ComputeDiff> {
private FrameMap fm;
public ComputeDiff(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk y = chks[fm.responseIndex];
final Chunk o = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk f = chks[fm.tree0Index];
for (int i = 0; i < chks[0].len(); ++i)
nc[0].addNum(y.atd(i) - (f.atd(i) + o.atd(i)));
}
}
private static class ComputeAbsDiff extends MRTask<ComputeAbsDiff> {
private FrameMap fm;
public ComputeAbsDiff(FrameMap frameMap) {
fm = frameMap;
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
final Chunk y = chks[fm.responseIndex];
final Chunk o = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk f = chks[fm.tree0Index];
for (int i = 0; i < chks[0].len(); ++i)
nc[0].addNum(Math.abs(y.atd(i) - (f.atd(i) + o.atd(i))));
}
}
public static class DiffMinusMedianDiff extends MRTask<DiffMinusMedianDiff> {
private final int _strataMin;
private double[] _terminalMedians;
public DiffMinusMedianDiff(Vec strata, double[] terminalMedians) {
_strataMin = (int) strata.min();
_terminalMedians = terminalMedians;
}
@Override
public void map(Chunk[] chks) {
final Chunk strata = chks[0];
final Chunk diff = chks[1];
for (int i = 0; i < chks[0].len(); ++i) {
int nid = (int) strata.atd(i);
diff.set(i, diff.atd(i) - _terminalMedians[nid - _strataMin]);
}
}
}
private static final class HuberLeafMath extends MRTask<HuberLeafMath> {
// INPUT
final FrameMap fm;
final double _huberDelta;
final Vec _strata;
final int _strataMin;
final int _strataMax;
// OUTPUT
double[/*leaves*/] _huberGamma, _wcounts;
public HuberLeafMath(FrameMap frameMap, double huberDelta, Vec strata) {
fm = frameMap;
_huberDelta = huberDelta;
_strata = strata;
_strataMin = (int) _strata.min();
_strataMax = (int) _strata.max();
}
@Override
public void map(Chunk[] cs) {
if (_strata.length() == 0) {
LOG.warn("No Huber math can be done since there's no strata.");
_huberGamma = new double[0];
return;
}
final int nstrata = _strataMax - _strataMin + 1;
LOG.info("Computing Huber math for (up to) " + nstrata + " different strata.");
_huberGamma = new double[nstrata];
_wcounts = new double[nstrata];
Chunk weights = fm.weightIndex >= 0 ? cs[fm.weightIndex] : new C0DChunk(1, cs[0]._len);
Chunk stratum = cs[fm.nids0Index];
Chunk diffMinusMedianDiff = cs[cs.length - 1];
for (int row = 0; row < cs[0]._len; ++row) {
int nidx = (int) stratum.at8(row) - _strataMin; //get terminal node for this row
_huberGamma[nidx] += weights.atd(row) * Math.signum(diffMinusMedianDiff.atd(row)) * Math.min(Math.abs(diffMinusMedianDiff.atd(row)), _huberDelta);
_wcounts[nidx] += weights.atd(row);
}
}
@Override
public void reduce(HuberLeafMath mrt) {
ArrayUtils.add(_huberGamma, mrt._huberGamma);
ArrayUtils.add(_wcounts, mrt._wcounts);
}
@Override
protected void postGlobal() {
for (int i = 0; i < _huberGamma.length; ++i)
_huberGamma[i] /= _wcounts[i];
}
}
private static class StoreResiduals extends MRTask<StoreResiduals> {
private FrameMap fm;
private Distribution dist;
public StoreResiduals(FrameMap frameMap, Distribution distribution) {
fm = frameMap;
dist = distribution;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
Chunk ys = chks[fm.responseIndex];
Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
Chunk preds = chks[fm.tree0Index]; // Prior tree sums
C8DVolatileChunk wk = (C8DVolatileChunk) chks[fm.work0Index]; // Place to store residuals
Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
for (int row = 0; row < wk._len; row++) {
double weight = weights.atd(row);
if (weight == 0) continue;
if (ys.isNA(row)) continue;
double f = preds.atd(row) + offset.atd(row);
double y = ys.atd(row);
wk.getValues()[row] = ((float) dist.negHalfGradient(y, f));
}
}
}
/**
* Set terminal node estimates (gamma)
* ESL2, page 387. Step 2b iii.
* Nids <== f(Nids)
* For classification (bernoulli):
* {@code gamma_i = sum (w_i * res_i) / sum (w_i*p_i*(1 - p_i)) where p_i = y_i - res_i}
* For classification (multinomial):
* {@code gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))}
* For regression (gaussian):
* {@code gamma_i = sum res_i / count(res_i)}
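* <p>Example (gaussian, unit weights): a leaf whose rows carry residuals {0.2, -0.1, 0.5} gets
* {@code gamma = (0.2 - 0.1 + 0.5) / 3 = 0.2}, i.e. the leaf predicts the mean residual of the
* rows routed to it; fitBestConstants() then scales this by the effective learning rate.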
*/
private static class GammaPass extends MRTask<GammaPass> {
private final FrameMap fm;
private final DTree[] _trees; // Read-only, shared (except at the histograms in the Nodes)
private final int[] _leafs; // Starting index of leaves (per class-tree)
private final Distribution _dist;
private final int _nclass;
private double[/*tree/klass*/][/*tree-relative node-id*/] _num;
private double[/*tree/klass*/][/*tree-relative node-id*/] _denom;
public GammaPass(FrameMap frameMap, DTree[] trees, int[] leafs, Distribution distribution, int nClasses) {
fm = frameMap;
_leafs = leafs;
_trees = trees;
_dist = distribution;
_nclass = nClasses;
}
double gamma(int tree, int nid) {
return gamma(tree, nid, _num[tree][nid]);
}
double gamma(int tree, int nid, double num) {
if (_denom[tree][nid] == 0)
return 0;
double g = num / _denom[tree][nid];
assert !Double.isInfinite(g) && !Double.isNaN(g);
return gamma(g);
}
double gamma(double g) {
if (_dist._family == DistributionFamily.poisson ||
_dist._family == DistributionFamily.gamma ||
_dist._family == DistributionFamily.tweedie) {
return _dist.link(g);
} else {
return g;
}
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
_denom = new double[_nclass][];
_num = new double[_nclass][];
final Chunk resp = chks[fm.responseIndex]; // Response for this frame
// For all tree/klasses
for (int k = 0; k < _nclass; k++) {
final DTree tree = _trees[k];
final int leaf = _leafs[k];
if (tree == null) continue; // Empty class is ignored
assert (tree._len - leaf >= 0);
// A leaf-biased array of all active Tree leaves.
final double[] denom = _denom[k] = new double[tree._len - leaf];
final double[] num = _num[k] = new double[tree._len - leaf];
final C4VolatileChunk nids = (C4VolatileChunk) chks[fm.nids0Index + k]; // Node-ids for this tree/class
int[] nids_vals = nids.getValues();
final Chunk ress = chks[fm.work0Index + k]; // Residuals for this tree/class
final Chunk offset = fm.offsetIndex >= 0 ? chks[fm.offsetIndex] : new C0DChunk(0, chks[0]._len);
final Chunk preds = chks[fm.tree0Index + k];
final Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
// If we have all constant responses, then we do not split even the
// root and the residuals should be zero.
if (tree.root() instanceof LeafNode)
continue;
for (int row = 0; row < nids._len; row++) { // For all rows
double w = weights.atd(row);
if (w == 0)
continue;
double y = resp.atd(row); //response
if (Double.isNaN(y))
continue;
// Compute numerator and denominator of terminal node estimate (gamma)
int nid = (int) nids.at8(row); // Get Node to decide from
final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow(nid); //same for all k
if (wasOOBRow) nid = ScoreBuildHistogram.oob2Nid(nid);
if (nid < 0)
continue;
DecidedNode dn = tree.decided(nid); // Must have a decision point
if (dn._split == null) // Unable to decide?
dn = tree.decided(dn.pid()); // Then take parent's decision
int leafnid = dn.getChildNodeID(chks, row); // Decide down to a leafnode
assert leaf <= leafnid && leafnid < tree._len :
"leaf: " + leaf + " leafnid: " + leafnid + " tree._len: " + tree._len + "\ndn: " + dn;
assert tree.node(leafnid) instanceof LeafNode;
// Note: I can tell which leaf/region I end up in, but I do not care for
// the prediction presented by the tree. For GBM, we compute the
// sum-of-residuals (and sum/abs/mult residuals) for all rows in the
// leaf, and get our prediction from that.
nids_vals[row] = leafnid;
assert !ress.isNA(row);
// OOB rows get placed properly (above), but they don't affect the computed Gamma (below)
// For Laplace/Quantile distribution, we need to compute the median of (y-offset-preds == y-f), will be done outside of here
if (wasOOBRow
|| _dist._family == DistributionFamily.laplace
|| _dist._family == DistributionFamily.huber
|| _dist._family == DistributionFamily.quantile) continue;
double z = ress.atd(row); // residual
double f = preds.atd(row) + offset.atd(row);
int idx = leafnid - leaf;
num[idx] += _dist.gammaNum(w, y, z, f);
denom[idx] += _dist.gammaDenom(w, y, z, f);
}
}
}
@Override
public void reduce(GammaPass gp) {
ArrayUtils.add(_denom, gp._denom);
ArrayUtils.add(_num, gp._num);
}
}
private static class AddTreeContributions extends MRTask<AddTreeContributions> {
private FrameMap fm;
private DTree[] _ktrees;
private int _nclass;
private double _pred_noise_bandwidth;
private long _seed;
private int _ntrees1;
private int _ntrees2;
public AddTreeContributions(
FrameMap frameMap, DTree[] ktrees, double predictionNoiseBandwidth, long seed, int nTreesInp, int nTreesOut
) {
fm = frameMap;
_ktrees = ktrees;
_nclass = ktrees.length;
_pred_noise_bandwidth = predictionNoiseBandwidth;
_seed = seed;
_ntrees1 = nTreesInp;
_ntrees2 = nTreesOut;
}
@Override
protected boolean modifiesVolatileVecs() {
return true;
}
@Override
public void map(Chunk[] chks) {
Random rand = RandomUtils.getRNG(_seed);
// For all tree/klasses
for (int k = 0; k < _nclass; k++) {
final DTree tree = _ktrees[k];
if (tree == null) continue;
final C4VolatileChunk nids = (C4VolatileChunk) chks[fm.nids0Index + k];
final int[] nids_vals = nids.getValues();
final C8DVolatileChunk ct = (C8DVolatileChunk) chks[fm.tree0Index + k];
double[] ct_vals = ct.getValues();
final Chunk y = chks[fm.responseIndex];
final Chunk weights = fm.weightIndex >= 0 ? chks[fm.weightIndex] : new C0DChunk(1, chks[0]._len);
long baseseed = (0xDECAF + _seed) * (0xFAAAAAAB + k * _ntrees1 + _ntrees2);
for (int row = 0; row < nids._len; row++) {
int nid = nids_vals[row];
nids_vals[row] = ScoreBuildHistogram.FRESH;
if (nid < 0) {
if (weights.atd(row) == 0 || y.isNA(row))
nids_vals[row] = ScoreBuildHistogram.DECIDED_ROW;
continue;
}
double factor = 1;
if (_pred_noise_bandwidth != 0) {
rand.setSeed(baseseed + nid); //bandwidth is a function of tree number, class and node id (but same for all rows in that node)
factor += rand.nextGaussian() * _pred_noise_bandwidth;
}
// Prediction stored in Leaf is cut to float to be deterministic in reconstructing
// fields from tree prediction
ct_vals[row] = ((float) (ct.atd(row) + factor * ((LeafNode) tree.node(nid))._pred));
}
}
}
}
//--------------------------------------------------------------------------------------------------------------------
// Read the 'tree' columns, do model-specific math and put the results in the
// fs[] array, and return the sum. Dividing any fs[] element by the sum
// turns the results into a probability distribution.
@Override protected double score1(Chunk[] chks, double weight, double offset, double[/*nclass*/] fs, int row) {
return score1static(chks, idx_tree(0), offset, fs, row, DistributionFactory.getDistribution(_parms), _nclass);
}
// Read the 'tree' columns, do model-specific math and put the results in the
// fs[] array, and return the sum. Dividing any fs[] element by the sum
// turns the results into a probability distribution.
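// Example for the multinomial branch (a sketch, assuming GenModel.log_rescale exponentiates the
// max-shifted scores in place and returns their sum): raw per-class scores {1.0, 3.0, 2.0}
// become exp(f_k - 3.0) = {0.135, 1.000, 0.368}, sum ~= 1.503, and dividing by the sum yields
// the softmax probabilities {0.090, 0.665, 0.245}.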
private static double score1static(Chunk[] chks, int treeIdx, double offset, double[] fs, int row, Distribution dist, int nClasses) {
double f = chks[treeIdx].atd(row) + offset;
double p = dist.linkInv(f);
if (dist._family == DistributionFamily.modified_huber || dist._family == DistributionFamily.bernoulli || dist._family == DistributionFamily.quasibinomial ||
(dist._family == DistributionFamily.custom && nClasses == 2)) {
fs[2] = p;
fs[1] = 1.0 - p;
return 1; // f2 = 1.0 - f1; so f1+f2 = 1.0
} else if (dist._family == DistributionFamily.multinomial ||
(dist._family == DistributionFamily.custom && nClasses > 2)) {
if (nClasses == 2) {
// This optimization assumes the 2nd tree of a 2-class system is the
// inverse of the first. Fill in the missing tree
fs[1] = p;
fs[2] = 1 / p;
return fs[1] + fs[2];
}
// Multinomial loss function; sum(exp(data)). Load tree data
assert (offset == 0);
fs[1] = f;
for (int k = 1; k < nClasses; k++)
fs[k + 1] = chks[treeIdx + k].atd(row);
// Rescale to avoid Infinities; return sum(exp(data))
return hex.genmodel.GenModel.log_rescale(fs);
} else {
return fs[0] = p;
}
}
@Override
public PojoWriter makePojoWriter(Model<?, ?, ?> genericModel, MojoModel mojoModel) {
GbmMojoModel gbmMojoModel = (GbmMojoModel) mojoModel;
CompressedTree[][] trees = MojoUtils.extractCompressedTrees(gbmMojoModel);
boolean binomialOpt = MojoUtils.isUsingBinomialOpt(gbmMojoModel, trees);
return new GbmPojoWriter(genericModel, gbmMojoModel.getCategoricalEncoding(), binomialOpt, trees,
gbmMojoModel._init_f, gbmMojoModel._balanceClasses, gbmMojoModel._family,
LinkFunctionFactory.getLinkFunction(gbmMojoModel._link_function));
}
}