package hex.tree;
import hex.*;
import hex.genmodel.GenModel;
import hex.tree.gbm.GBMModel;
import jsr166y.CountedCompleter;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.*;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
public abstract class SharedTree<M extends SharedTreeModel<M,P,O>, P extends SharedTreeModel.SharedTreeParameters, O extends SharedTreeModel.SharedTreeOutput> extends ModelBuilder<M,P,O> {
protected int _mtry; // Number of columns to sample per split
protected int _mtry_per_tree; // Number of columns to sample per tree
public static final int MAX_NTREES = 100000;
public SharedTree(P parms ) { super(parms ); /*only call init in leaf classes*/ }
public SharedTree(P parms, Key<M> key) { super(parms,key); /*only call init in leaf classes*/ }
public SharedTree(P parms, Job job ) { super(parms,job); /*only call init in leaf classes*/ }
public SharedTree(P parms, boolean startup_once) { super(parms,startup_once); /*only call init in leaf classes*/ }
// Number of trees requested, including prior trees from a checkpoint
protected int _ntrees;
// The in-progress model being built
protected M _model;
// Number of columns in training set, not counting the response column
protected int _ncols;
// Initially predicted value (for zero trees)
protected double _initialPrediction;
// Sum of variable empirical improvement in squared-error. The value is not scaled.
private transient float[/*nfeatures*/] _improvPerVar;
protected Random _rand;
public boolean isSupervised(){return true;}
@Override protected boolean computePriorClassDistribution(){ return true;}
/** Initialize the ModelBuilder, validating all arguments and preparing the
* training frame. This call is expected to be overridden in the subclasses
* and each subclass will start with "super.init();". This call is made
* by the front-end whenever the GUI is clicked, and needs to be fast;
* heavy-weight prep needs to wait for the trainModel() call.
*
* Validate the requested ntrees; precompute actual ntrees. Validate
* the number of classes to predict on; validate a checkpoint. */
@Override public void init(boolean expensive) {
super.init(expensive);
if( expensive && _parms._seed==-1 ) _parms._seed = RandomUtils.getRNG(System.nanoTime()).nextLong();
if (H2O.ARGS.client && _parms._build_tree_one_node)
error("_build_tree_one_node", "Cannot run on a single node in client mode.");
if( _parms._min_rows < 0 )
error("_min_rows", "Requested min_rows must be greater than 0");
if( _parms._ntrees < 0 || _parms._ntrees > MAX_NTREES)
error("_ntrees", "Requested ntrees must be between 1 and " + MAX_NTREES);
_ntrees = _parms._ntrees; // Total trees in final model
if( _parms.hasCheckpoint() ) { // Asking to continue from checkpoint?
Value cv = DKV.get(_parms._checkpoint);
if( cv != null ) { // Look for prior model
M checkpointModel = cv.get();
try {
_parms.validateWithCheckpoint(checkpointModel._parms);
if( isClassifier() != checkpointModel._output.isClassifier() )
throw new IllegalArgumentException("Response type must be the same as for the checkpointed model.");
if (!Arrays.equals(_train.names(), checkpointModel._output._names)) {
throw new IllegalArgumentException("The columns of the training data must be the same as for the checkpointed model");
}
if (!Arrays.deepEquals(_train.domains(), checkpointModel._output._domains)) {
throw new IllegalArgumentException("Categorical factor levels of the training data must be the same as for the checkpointed model");
}
} catch (H2OIllegalArgumentException e) {
error(e.values.get("argument").toString(), e.values.get("value").toString());
}
if( _parms._ntrees < checkpointModel._output._ntrees+1 )
error("_ntrees", "If checkpoint is specified then requested ntrees must be higher than " + (checkpointModel._output._ntrees+1));
// Compute number of trees to build for this checkpoint
_ntrees = _parms._ntrees - checkpointModel._output._ntrees; // Needed trees
}
}
if (_parms._nbins <= 1) error ("_nbins", "nbins must be > 1.");
if (_parms._nbins >= 1<<16) error ("_nbins", "nbins must be < " + (1<<16));
if (_parms._nbins_cats <= 1) error ("_nbins_cats", "nbins_cats must be > 1.");
if (_parms._nbins_cats >= 1<<16) error ("_nbins_cats", "nbins_cats must be < " + (1<<16));
if (_parms._nbins_top_level < _parms._nbins) error ("_nbins_top_level", "nbins_top_level must be >= nbins (" + _parms._nbins + ").");
if (_parms._nbins_top_level >= 1<<16) error ("_nbins_top_level", "nbins_top_level must be < " + (1<<16));
if (_parms._max_depth <= 0) error ("_max_depth", "_max_depth must be > 0.");
if (_parms._min_rows <=0) error ("_min_rows", "_min_rows must be > 0.");
if (_parms._score_tree_interval < 0 || _parms._score_tree_interval > _parms._ntrees) error ("_score_tree_interval", "_score_tree_interval must be >= 0 and <= _ntrees.");
if (_parms._sample_rate_per_class != null) {
warn("_sample_rate", "_sample_rate is ignored if _sample_rate_per_class is specified.");
if (_parms._sample_rate_per_class.length != nclasses()) error("_sample_rate_per_class", "_sample_rate_per_class must have " + nclasses() + " values (one per class).");
for (int i=0;i<_parms._sample_rate_per_class.length;++i) {
if (!(0.0 < _parms._sample_rate_per_class[i] && _parms._sample_rate_per_class[i] <= 1.0))
error("_sample_rate_per_class", "sample_rate_per_class for class " + response().domain()[i] + " should be in interval ]0,1] but it is " + _parms._sample_rate_per_class[i] + ".");
}
}
if (!(0.0 < _parms._sample_rate && _parms._sample_rate <= 1.0))
error("_sample_rate", "sample_rate should be in interval ]0,1] but it is " + _parms._sample_rate + ".");
if (!(0.0 < _parms._col_sample_rate_per_tree && _parms._col_sample_rate_per_tree <= 1.0))
error("_col_sample_rate_per_tree", "col_sample_rate_per_tree should be in interval ]0,1] but it is " + _parms._col_sample_rate_per_tree + ".");
if (_train != null) {
double sumWeights = _train.numRows() * (hasWeightCol() ? _train.vec(_parms._weights_column).mean() : 1);
if (sumWeights < 2*_parms._min_rows ) // Need at least 2*min_rows weighted rows to split even once
error("_min_rows", "The dataset size is too small to split for min_rows=" + _parms._min_rows
+ ": must have at least " + 2*_parms._min_rows + " (weighted) rows, but have only " + sumWeights + ".");
}
if( _train != null )
_ncols = _train.numCols()-1-numSpecialCols();
}
// --------------------------------------------------------------------------
// Top-level tree-algo driver
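// The Driver locks the input frames, creates the model (or clones a checkpointed one), appends
// the per-class Tree_/Work_/NIDs_ working columns plus the OUT_BAG_TREES counter, and then
// alternates scoring and tree building via scoreAndBuildTrees() until done.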
abstract protected class Driver extends ModelBuilder.Driver {
@Override public void compute2() {
_model = null; // Resulting model!
try {
Scope.enter(); // Cleanup temp keys
init(true); // Do any expensive tests & conversions now
// Do lock even before checking the errors, since this block is finalized by unlock
// (not the best solution, but the code is more readable)
_parms.read_lock_frames(_job); // Fetch & read-lock input frames
if( error_count() > 0 )
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);
// Create a new model, or continue building from a checkpoint
if (_parms.hasCheckpoint()) {
// Get the model to continue
_model = DKV.get(_parms._checkpoint).<M>get().deepClone(_result);
// Override original parameters by new parameters
_model._parms = _parms;
// Write-lock the cloned model before continuing to train it
_model.delete_and_lock(_job);
} else { // New Model
// Compute the zero-tree error - guessing only the class distribution.
// MSE is stddev squared when guessing for regression.
// For classification, guess the largest class.
_model = makeModel(dest(), _parms);
_model.delete_and_lock(_job); // and clear & write-lock it (smashing any prior)
_model._output._init_f = _initialPrediction;
}
// Compute the response domain; makes for nicer printouts
String[] domain = _response.domain();
assert (_nclass > 1 && domain != null) || (_nclass==1 && domain==null);
if( _nclass==1 ) domain = new String[] {"r"}; // For regression, give a name to class 0
// Compute class distribution, used for initial guesses and to
// upsample minority classes (if asked for).
if( _nclass>1 ) { // Classification?
// Handle imbalanced classes by stratified over/under-sampling.
// initWorkFrame sets the modeled class distribution, and
// model.score() corrects the probabilities back using the
// distribution ratios
if(_model._output.isClassifier() && _parms._balance_classes ) {
float[] trainSamplingFactors = new float[_train.lastVec().domain().length]; //leave initialized to 0 -> will be filled up below
if (_parms._class_sampling_factors != null) {
if (_parms._class_sampling_factors.length != _train.lastVec().domain().length)
throw new IllegalArgumentException("class_sampling_factors must have " + _train.lastVec().domain().length + " elements");
trainSamplingFactors = _parms._class_sampling_factors.clone(); //clone: don't modify the original
}
Frame stratified = water.util.MRUtils.sampleFrameStratified(_train, _train.lastVec(), _train.vec(_model._output.weightsName()), trainSamplingFactors, (long)(_parms._max_after_balance_size*_train.numRows()), _parms._seed, true, false);
if (stratified != _train) {
_train = stratified;
_response = stratified.vec(_parms._response_column);
_weights = stratified.vec(_parms._weights_column);
// Recompute distribution since the input frame was modified
MRUtils.ClassDist cdmt2 = _weights != null ?
new MRUtils.ClassDist(_nclass).doAll(_response, _weights) : new MRUtils.ClassDist(_nclass).doAll(_response);
_model._output._distribution = cdmt2.dist();
_model._output._modelClassDist = cdmt2.rel_dist();
}
}
Log.info("Prior class distribution: " + Arrays.toString(_model._output._priorClassDist));
Log.info("Model class distribution: " + Arrays.toString(_model._output._modelClassDist));
if (_parms._sample_rate_per_class != null) {
Log.info("Sample rates per tree (this affects the distribution of probabilities):");
for (int i = 0; i < nclasses(); ++i)
Log.info(" sample rate for class '" + response().domain()[i] + "' : " + _parms._sample_rate_per_class[i]);
}
}
// Also add to the basic working Frame these sets:
// nclass Vecs of current forest results (sum across all trees)
// nclass Vecs of working/temp data
// nclass Vecs of NIDs, allowing 1 tree per class
// Current forest values: results of summing the prior M trees
for( int i=0; i<_nclass; i++ )
_train.add("Tree_"+domain[i], _response.makeZero());
// Initial work columns. Set-before-use in the algos.
for( int i=0; i<_nclass; i++ )
_train.add("Work_"+domain[i], _response.makeZero());
// One tree per class; each tree needs its own NIDs (node-id) column. For empty
// classes use a -1 NID, signifying an empty regression tree.
for( int i=0; i<_nclass; i++ )
_train.add("NIDs_"+domain[i], _response.makeCon(_model._output._distribution==null ? 0 : (_model._output._distribution[i]==0?-1:0)));
// Append number of trees participating in on-the-fly scoring
_train.add("OUT_BAG_TREES", _response.makeZero());
// Tag out rows missing the response column
new ExcludeNAResponse().doAll(_train);
// Variable importance: squared-error-improvement-per-variable-per-split
_improvPerVar = new float[_ncols];
_rand = RandomUtils.getRNG(_parms._seed);
initializeModelSpecifics();
resumeFromCheckpoint();
scoreAndBuildTrees(doOOBScoring());
} finally {
_parms.read_unlock_frames(_job);
if( _model!=null ) _model.unlock(_job);
Scope.exit( _model==null ? null : new Key[]{_model._key, ModelMetrics.buildKey(_model,_parms.train()), ModelMetrics.buildKey(_model,_parms.valid())});
}
tryComplete();
}
// Abstract classes implemented by the tree builders
abstract protected M makeModel( Key<M> modelKey, P parms);
abstract protected boolean doOOBScoring();
abstract protected void buildNextKTrees();
abstract protected void initializeModelSpecifics();
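// (Concrete tree builders such as GBM and DRF supply these hooks: buildNextKTrees() adds one
// tree per class, and doOOBScoring() says whether scoring should use out-of-bag rows.)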
// Common methods for all tree builders
/**
* Restore the workspace from a previous model (checkpoint)
*/
protected final void resumeFromCheckpoint() {
if( !_parms.hasCheckpoint() ) return;
// Reconstruct the working tree state from the checkpoint
Timer t = new Timer();
int ntreesFromCheckpoint = ((SharedTreeModel.SharedTreeParameters) _parms._checkpoint.get()._parms)._ntrees;
new ReconstructTreeState(_ncols, _nclass, numSpecialCols(), _parms._sample_rate,_model._output._treeKeys, doOOBScoring()).doAll(_train, _parms._build_tree_one_node);
for (int i = 0; i < ntreesFromCheckpoint; i++) _rand.nextLong(); //for determinism
Log.info("Reconstructing OOB stats from checkpoint took " + t);
}
/**
* Build more trees, as specified by the model parameters
* @param oob Whether or not Out-Of-Bag scoring should be performed
*/
protected final void scoreAndBuildTrees(boolean oob) {
for( int tid=0; tid< _ntrees; tid++) {
// During first iteration model contains 0 trees, then 1-tree, ...
// No need to score a checkpoint with no extra trees added
if( tid!=0 || !_parms.hasCheckpoint() ) { // do not make initial scoring if the model already exists
double training_r2 = doScoringAndSaveModel(false, oob, _parms._build_tree_one_node);
if( training_r2 >= _parms._r2_stopping ) {
doScoringAndSaveModel(true, oob, _parms._build_tree_one_node);
_job.update(_ntrees-_model._output._ntrees); //finish
return; // Stop when approaching round-off error
}
if (!Double.isNaN(training_r2) //HACK to detect whether we scored at all
&& ScoreKeeper.stopEarly(_model._output.scoreKeepers(), _parms._stopping_rounds, _nclass > 1, _parms._stopping_metric, _parms._stopping_tolerance, "model's last")) {
doScoringAndSaveModel(true, oob, _parms._build_tree_one_node);
_job.update(_ntrees-_model._output._ntrees); //finish
return;
}
}
Timer kb_timer = new Timer();
buildNextKTrees();
Log.info((tid + 1) + ". tree was built in " + kb_timer.toString());
_job.update(1);
if (_model._output._treeStats._max_depth==0) {
Log.warn("Nothing to split on: Check that response and distribution are meaningful (e.g., you are not using laplace/quantile regression with a binary response).");
}
if (timeout()) break; // If timed out, do the final scoring
if (stop_requested()) throw new Job.JobCancelledException();
}
// Final scoring (skip if job was cancelled)
doScoringAndSaveModel(true, oob, _parms._build_tree_one_node);
}
}
// --------------------------------------------------------------------------
// Build an entire layer of all K trees
protected DHistogram[][][] buildLayer(final Frame fr, final int nbins, int nbins_cats, final DTree ktrees[], final int leafs[], final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) {
// Build K trees, one per class.
// Build up the next-generation tree splits from the current histograms.
// Nearly all leaves will split one more level. This loop nest is
// O( #active_splits * #bins * #ncols )
// but is NOT over all the data.
ScoreBuildOneTree sb1ts[] = new ScoreBuildOneTree[_nclass];
Vec vecs[] = fr.vecs();
for( int k=0; k<_nclass; k++ ) {
final DTree tree = ktrees[k]; // Tree for class K
if( tree == null ) continue;
// Build a frame with just a single tree (& work & nid) columns, so the
// nested MRTask ScoreBuildHistogram in ScoreBuildOneTree does not try
// to close other trees' Vecs when run in parallel.
Frame fr2 = new Frame(Arrays.copyOf(fr._names,_ncols+1), Arrays.copyOf(vecs,_ncols+1)); //predictors and actual response
fr2.add(fr._names[idx_tree(k)],vecs[idx_tree(k)]); //tree predictions
fr2.add(fr._names[idx_work(k)],vecs[idx_work(k)]); //target value to fit (copy of actual response for DRF, residual for GBM)
fr2.add(fr._names[idx_nids(k)],vecs[idx_nids(k)]); //node indices
if (idx_weight() >= 0)
fr2.add(fr._names[idx_weight()],vecs[idx_weight()]); //observation weights
// Start building one of the K trees in parallel
H2O.submitTask(sb1ts[k] = new ScoreBuildOneTree(this,k,nbins, nbins_cats, tree, leafs, hcs, fr2, subset, build_tree_one_node, _improvPerVar, _model._parms._distribution));
}
// Block for all K trees to complete.
boolean did_split=false;
for( int k=0; k<_nclass; k++ ) {
final DTree tree = ktrees[k]; // Tree for class K
if( tree == null ) continue;
sb1ts[k].join();
if( sb1ts[k]._did_split ) did_split=true;
}
// The layer is done.
return did_split ? hcs : null;
}
private static class ScoreBuildOneTree extends H2OCountedCompleter<ScoreBuildOneTree> {
final SharedTree _st;
final int _k; // The tree
final int _nbins; // Numerical columns: Number of histogram bins
final int _nbins_cats; // Categorical columns: Number of histogram bins
final DTree _tree;
final int _leafs[/*nclass*/];
final DHistogram _hcs[/*nclass*/][/*leaf*/][/*ncol*/];
final Frame _fr2;
final boolean _subset; // True if working a subset of cols
final boolean _build_tree_one_node;
float[] _improvPerVar; // Squared Error improvement per variable per split
Distribution.Family _family;
boolean _did_split;
ScoreBuildOneTree(SharedTree st, int k, int nbins, int nbins_cats, DTree tree, int leafs[], DHistogram hcs[][][], Frame fr2, boolean subset, boolean build_tree_one_node, float[] improvPerVar, Distribution.Family family) {
_st = st;
_k = k;
_nbins= nbins;
_nbins_cats= nbins_cats;
_tree = tree;
_leafs= leafs;
_hcs = hcs;
_fr2 = fr2;
_subset = subset;
_build_tree_one_node = build_tree_one_node;
_improvPerVar = improvPerVar;
_family = family;
}
@Override public void compute2() {
// Fuse 2 conceptual passes into one:
// Pass 1: Score a prior DHistogram, and make new Node assignments
// to every row. This involves pulling out the current assigned Node,
// "scoring" the row against that Node's decision criteria, and assigning
// the row to a new child Node (and giving it an improved prediction).
// Pass 2: Build new summary DHistograms on the new child Nodes every row
// got assigned into. Collect counts, mean, variance, min, max per bin,
// per column.
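// The ScoreBuildHistogram task below forks over fr2; onCompletion() (next method) turns the
// filled histograms into split decisions for the new leaves.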
new ScoreBuildHistogram(this,_k, _st._ncols, _nbins, _nbins_cats, _tree, _leafs[_k], _hcs[_k], _family).dfork(null,_fr2,_build_tree_one_node);
}
@Override public void onCompletion(CountedCompleter caller) {
ScoreBuildHistogram sbh = (ScoreBuildHistogram)caller;
//System.out.println(sbh.profString());
final int leafk = _leafs[_k];
int tmax = _tree.len(); // Number of total splits in tree K
for( int leaf=leafk; leaf<tmax; leaf++ ) { // Visit all the new splits (leaves)
DTree.UndecidedNode udn = _tree.undecided(leaf);
// Replace the Undecided node with the Split decision computed from its histograms
DTree.DecidedNode dn = _st.makeDecided(udn, sbh._hcs[leaf-leafk]);
if( dn._split._col == -1 ) udn.do_not_split();
else {
_did_split = true;
DTree.Split s = dn._split; // Accumulate squared error improvements per variable
AtomicUtils.FloatArray.add(_improvPerVar,s.col(),(float)(s.pre_split_se()-s.se()));
}
}
_leafs[_k]=tmax; // Setup leafs for next tree level
int new_leafs = _tree.len()-tmax;
_hcs[_k] = new DHistogram[new_leafs][/*ncol*/];
for( int nl = tmax; nl<_tree.len(); nl ++ )
_hcs[_k][nl-tmax] = _tree.undecided(nl)._hs;
if (_did_split) _tree._depth++;
}
}
// --------------------------------------------------------------------------
// Convenience accessor for a complex chunk layout.
// Wish I could name the array elements nicer...
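// Working-frame column layout: the _ncols predictor columns, plus the response and any special
// columns (weights/offset/fold), followed by the per-class "Tree_", "Work_" and "NIDs_" columns
// and the single OUT_BAG_TREES counter (added in Driver.compute2()).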
protected int idx_weight( ) { return _model._output.weightsIdx(); }
protected int idx_offset( ) { return _model._output.offsetIdx(); }
protected int idx_resp( ) { return _model._output.responseIdx(); }
protected int idx_tree(int c) { return _ncols+1+c+numSpecialCols(); }
protected int idx_work(int c) { return idx_tree(c) + _nclass; }
protected int idx_nids(int c) { return idx_work(c) + _nclass; }
protected int idx_oobt() { return idx_nids(0) + _nclass; }
protected Chunk chk_weight( Chunk chks[] ) { return chks[idx_weight()]; }
protected Chunk chk_offset( Chunk chks[] ) { return chks[idx_offset()]; }
protected Chunk chk_resp( Chunk chks[] ) { return chks[idx_resp()]; }
protected Chunk chk_tree( Chunk chks[], int c ) { return chks[idx_tree(c)]; }
protected Chunk chk_work( Chunk chks[], int c ) { return chks[idx_work(c)]; }
protected Chunk chk_nids( Chunk chks[], int c ) { return chks[idx_nids(c)]; }
// Out-of-bag trees counter - only one column, since it is shared across the k class trees
protected Chunk chk_oobt(Chunk chks[]) { return chks[idx_oobt()]; }
protected final Vec vec_nids( Frame fr, int c) { return fr.vecs()[idx_nids(c)]; }
protected final Vec vec_resp( Frame fr ) { return fr.vecs()[idx_resp() ]; }
protected final Vec vec_tree( Frame fr, int c) { return fr.vecs()[idx_tree(c)]; }
protected double[] data_row( Chunk chks[], int row, double[] data) {
assert data.length == _ncols;
for(int f=0; f<_ncols; f++) data[f] = chks[f].atd(row);
return data;
}
// Builder-specific decision node
protected DTree.DecidedNode makeDecided( DTree.UndecidedNode udn, DHistogram hs[] ) {
return new DTree.DecidedNode(udn, hs);
}
// Read the 'tree' columns, do model-specific math and put the results in the
// fs[] array, and return the sum. Dividing any fs[] element by the sum
// turns the results into a probability distribution.
abstract protected double score1( Chunk chks[], double weight, double offset, double fs[/*nclass*/], int row );
// Call builder specific score code and then correct probabilities
// if it is necessary.
void score2(Chunk chks[], double weight, double offset, double fs[/*nclass*/], int row ) {
double sum = score1(chks, weight, offset, fs, row);
if( isClassifier()) {
if( !Double.isInfinite(sum) && sum>0f && sum!=1f) ArrayUtils.div(fs, sum);
if (_parms._balance_classes)
GenModel.correctProbabilities(fs, _model._output._priorClassDist, _model._output._modelClassDist);
}
}
// --------------------------------------------------------------------------
// Tag out rows missing the response column
class ExcludeNAResponse extends MRTask<ExcludeNAResponse> {
@Override public void map( Chunk chks[] ) {
Chunk ys = chk_resp(chks);
for( int row=0; row<ys._len; row++ )
if( ys.isNA(row) )
for( int t=0; t<_nclass; t++ )
chk_nids(chks,t).set(row, -1);
}
}
// --------------------------------------------------------------------------
// Score the model on the training (and optionally validation) frame, save it, and
// return the training R^2 used by the stopping logic in scoreAndBuildTrees().
transient long _timeLastScoreStart, _timeLastScoreEnd, _firstScore = 0;
protected double doScoringAndSaveModel(boolean finalScoring, boolean oob, boolean build_tree_one_node ) {
double training_r2 = Double.NaN; // Training R^2 value, if computed
long now = System.currentTimeMillis();
if( _firstScore == 0 ) _firstScore = now;
long sinceLastScore = now - _timeLastScoreStart;
boolean updated = false;
// Throttle scoring: score freely at first, then limit to roughly a 10% duty cycle
boolean timeToScore = (now-_firstScore < 4000) || // Score every time for the first 4 secs
(sinceLastScore > _parms._score_interval && // Limit scoring updates to every 4sec
(double)(_timeLastScoreEnd-_timeLastScoreStart)/sinceLastScore < 0.1); //10% duty cycle
boolean manualInterval = _parms._score_tree_interval > 0 && _model._output._ntrees % _parms._score_tree_interval == 0;
// Now model already contains tid-trees in serialized form
if( _parms._score_each_iteration || finalScoring || // always score under these circumstances
(timeToScore && _parms._score_tree_interval == 0) || // use time-based duty-cycle heuristic only if the user didn't specify _score_tree_interval
manualInterval) {
checkMemoryFootPrint();
if (error_count() > 0)
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(SharedTree.this);
// If validation is specified we use a model for scoring, so we need to
// update it! First we save model with trees (i.e., make them available
// for scoring) and then update it with resulting error
_model.update(_job); updated = true;
Log.info("============================================================== ");
SharedTreeModel.SharedTreeOutput out = _model._output;
_timeLastScoreStart = now;
final boolean printout = (_parms._score_each_iteration || finalScoring || sinceLastScore > _parms._score_interval);
// final boolean computeGainsLift = printout; //only compute Gains/Lift during final scoring
final boolean computeGainsLift = true;
// Score on training data
_job.update(0,"Scoring the model.");
Score sc = new Score(this,_model._output._ntrees>0/*score 0-tree model from scratch*/,oob,response()._key,_model._output.getModelCategory(),computeGainsLift).doAll(train(), build_tree_one_node);
ModelMetrics mm = sc.makeModelMetrics(_model, _parms.train());
out._training_metrics = mm;
training_r2 = ((ModelMetricsSupervised)mm).r2();
if (oob) out._training_metrics._description = "Metrics reported on Out-Of-Bag training samples";
out._scored_train[out._ntrees].fillFrom(mm);
// Score again on validation data
if( _parms._valid != null ) {
Score scv = new Score(this,false,false,vresponse()._key,_model._output.getModelCategory(),computeGainsLift).doAll(valid(), build_tree_one_node);
ModelMetrics mmv = scv.makeModelMetrics(_model,_parms.valid());
out._validation_metrics = mmv;
out._scored_valid[out._ntrees].fillFrom(mmv);
}
out._model_summary = createModelSummaryTable(out);
out._scoring_history = createScoringHistoryTable(out);
if( out._ntrees > 0 ) { // Compute variable importances
out._varimp = new hex.VarImp(_improvPerVar, out._names);
out._variable_importances = hex.ModelMetrics.calcVarImp(out._varimp);
}
if (printout) {
Log.info(_model.toString());
}
_timeLastScoreEnd = System.currentTimeMillis();
}
// Double update - after either scoring or variable importance
if( updated ) _model.update(_job);
return training_r2;
}
static int counter = 0;
// helper for debugging
@SuppressWarnings("unused")
static protected void printGenerateTrees(DTree[] trees) {
for( DTree dtree : trees )
if( dtree != null ) {
try {
PrintWriter writer = new PrintWriter("/tmp/h2o-3.tree" + ++counter + ".txt", "UTF-8");
writer.println(dtree.root().toString2(new StringBuilder(), 0));
writer.close();
} catch (FileNotFoundException|UnsupportedEncodingException e) {
e.printStackTrace();
}
System.out.println(dtree.root().toString2(new StringBuilder(), 0));
}
}
private TwoDimTable createScoringHistoryTable(SharedTreeModel.SharedTreeOutput _output) {
List<String> colHeaders = new ArrayList<>();
List<String> colTypes = new ArrayList<>();
List<String> colFormat = new ArrayList<>();
colHeaders.add("Timestamp"); colTypes.add("string"); colFormat.add("%s");
colHeaders.add("Duration"); colTypes.add("string"); colFormat.add("%s");
colHeaders.add("Number of Trees"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Training MSE"); colTypes.add("double"); colFormat.add("%.5f");
if (_output.getModelCategory() == ModelCategory.Regression) {
colHeaders.add("Training Deviance"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.isClassifier()) {
colHeaders.add("Training LogLoss"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.getModelCategory() == ModelCategory.Binomial) {
colHeaders.add("Training AUC"); colTypes.add("double"); colFormat.add("%.5f");
colHeaders.add("Training Lift"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.getModelCategory() == ModelCategory.Binomial || _output.getModelCategory() == ModelCategory.Multinomial) {
colHeaders.add("Training Classification Error"); colTypes.add("double"); colFormat.add("%.5f");
}
if (valid() != null) {
colHeaders.add("Validation MSE"); colTypes.add("double"); colFormat.add("%.5f");
if (_output.getModelCategory() == ModelCategory.Regression) {
colHeaders.add("Validation Deviance"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.isClassifier()) {
colHeaders.add("Validation LogLoss"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.getModelCategory() == ModelCategory.Binomial) {
colHeaders.add("Validation AUC"); colTypes.add("double"); colFormat.add("%.5f");
colHeaders.add("Validation Lift"); colTypes.add("double"); colFormat.add("%.5f");
}
if (_output.isClassifier()) {
colHeaders.add("Validation Classification Error"); colTypes.add("double"); colFormat.add("%.5f");
}
}
int rows = 0;
for( int i = 0; i<_output._scored_train.length; i++ ) {
if (i != 0 && Double.isNaN(_output._scored_train[i]._mse)) continue;
rows++;
}
TwoDimTable table = new TwoDimTable(
"Scoring History", null,
new String[rows],
colHeaders.toArray(new String[0]),
colTypes.toArray(new String[0]),
colFormat.toArray(new String[0]),
"");
int row = 0;
for( int i = 0; i<_output._scored_train.length; i++ ) {
if (i != 0 && Double.isNaN(_output._scored_train[i]._mse)) continue;
int col = 0;
DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss");
table.set(row, col++, fmt.print(_output._training_time_ms[i]));
table.set(row, col++, PrettyPrint.msecs(_output._training_time_ms[i] - _job.start_time(), true));
table.set(row, col++, i);
ScoreKeeper st = _output._scored_train[i];
table.set(row, col++, st._mse);
if (_output.getModelCategory() == ModelCategory.Regression) table.set(row, col++, st._mean_residual_deviance);
if (_output.isClassifier()) table.set(row, col++, st._logloss);
if (_output.getModelCategory() == ModelCategory.Binomial) {
table.set(row, col++, st._AUC);
table.set(row, col++, st._lift);
}
if (_output.isClassifier()) table.set(row, col++, st._classError);
if (_valid != null) {
st = _output._scored_valid[i];
table.set(row, col++, st._mse);
if (_output.getModelCategory() == ModelCategory.Regression) table.set(row, col++, st._mean_residual_deviance);
if (_output.isClassifier()) table.set(row, col++, st._logloss);
if (_output.getModelCategory() == ModelCategory.Binomial) {
table.set(row, col++, st._AUC);
table.set(row, col++, st._lift);
}
if (_output.isClassifier()) table.set(row, col++, st._classError);
}
row++;
}
return table;
}
private TwoDimTable createModelSummaryTable(SharedTreeModel.SharedTreeOutput _output) {
List<String> colHeaders = new ArrayList<>();
List<String> colTypes = new ArrayList<>();
List<String> colFormat = new ArrayList<>();
colHeaders.add("Number of Trees"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Model Size in Bytes"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Min. Depth"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Max. Depth"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Mean Depth"); colTypes.add("double"); colFormat.add("%.5f");
colHeaders.add("Min. Leaves"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Max. Leaves"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Mean Leaves"); colTypes.add("double"); colFormat.add("%.5f");
final int rows = 1;
TwoDimTable table = new TwoDimTable(
"Model Summary", null,
new String[rows],
colHeaders.toArray(new String[0]),
colTypes.toArray(new String[0]),
colFormat.toArray(new String[0]),
"");
int row = 0;
int col = 0;
table.set(row, col++, _output._treeStats._num_trees);
table.set(row, col++, _output._treeStats._byte_size);
table.set(row, col++, _output._treeStats._min_depth);
table.set(row, col++, _output._treeStats._max_depth);
table.set(row, col++, _output._treeStats._mean_depth);
table.set(row, col++, _output._treeStats._min_leaves);
table.set(row, col++, _output._treeStats._max_leaves);
table.set(row, col++, _output._treeStats._mean_leaves);
return table;
}
/**
* Compute the *actual* byte size of a tree model in the KV store
*/
private static class ComputeModelSize extends MRTask<ComputeModelSize> {
long _model_mem_size; //OUTPUT
final int trees_so_far; //INPUT
final public Key[/*_ntrees*/][/*_nclass*/] _treeKeys; //INPUT
public ComputeModelSize(int trees_so_far, Key[][] _treeKeys) {
this.trees_so_far = trees_so_far;
this._treeKeys = _treeKeys;
}
@Override protected void setupLocal() {
_model_mem_size = 0;
for (int i=0; i< trees_so_far; ++i) {
Key[] per_class = _treeKeys[i];
for (int j=0; j<per_class.length; ++j) {
if (per_class[j] == null) continue;
if (!per_class[j].home()) continue; // only count tree keys homed to this node
_model_mem_size += DKV.get(per_class[j])._max;
}
}
}
@Override public void reduce(ComputeModelSize cms){
if (cms != null) _model_mem_size += cms._model_mem_size;
}
}
@Override protected void checkMemoryFootPrint() {
if (_model._output._ntrees == 0) return;
int trees_so_far = _model._output._ntrees; // existing trees
long model_mem_size = new ComputeModelSize(trees_so_far, _model._output._treeKeys).doAllNodes()._model_mem_size;
_model._output._treeStats._byte_size = model_mem_size;
double avg_tree_mem_size = (double)model_mem_size / trees_so_far;
Log.debug("Average tree size (for all classes): " + PrettyPrint.bytes((long)avg_tree_mem_size));
// All the compressed trees are stored on the driver node
long max_mem = H2O.SELF._heartbeat.get_free_mem();
if (_parms._ntrees * avg_tree_mem_size > max_mem) {
String msg = "The tree model will not fit in the driver node's memory ("
+ PrettyPrint.bytes((long)avg_tree_mem_size)
+ " per tree x " + _parms._ntrees + " > "
+ PrettyPrint.bytes(max_mem)
+ ") - try decreasing ntrees and/or max_depth or increasing min_rows!";
error("_ntrees", msg);
}
}
/**
* Compute the initial value for a given distribution
* @return initial value
*/
protected double getInitialValue() {
return new InitialValue(_parms).doAll(
_response,
hasWeightCol() ? _weights : _response.makeCon(1),
hasOffsetCol() ? _offset : _response.makeCon(0)
).initialValue();
}
// Helper MRTask to compute the initial value
private static class InitialValue extends MRTask<InitialValue> {
public InitialValue(Model.Parameters parms) { _dist = new Distribution(parms); }
final private Distribution _dist;
private double _num;
private double _denom;
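// Initial prediction = link(_num/_denom), a link-transformed weighted mean of the
// (offset-adjusted) response; the multinomial case instead returns -0.5 * bernoulli-link
// of _num/_denom (see initialValue() below).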
public double initialValue() {
if (_dist.distribution == Distribution.Family.multinomial)
return -0.5*new Distribution(Distribution.Family.bernoulli).link(_num/_denom);
else return _dist.link(_num / _denom);
}
@Override public void map(Chunk response, Chunk weight, Chunk offset) {
for (int i=0;i<response._len;++i) {
if (response.isNA(i)) continue;
double w = weight.atd(i);
if (w == 0) continue;
double y = response.atd(i);
double o = offset.atd(i);
_num += _dist.initFNum(w,o,y);
_denom += _dist.initFDenom(w,o,y);
}
}
@Override public void reduce(InitialValue mrt) {
_num += mrt._num;
_denom += mrt._denom;
}
}
// Pick the optimal ntrees for the main model from the early-stopped cross-validation models,
// then disable convergence-based early stopping and the runtime limit for the main model.
@Override public void cv_computeAndSetOptimalParameters(ModelBuilder<M,P,O>[] cvModelBuilders) {
if( _parms._stopping_rounds == 0 && _parms._max_runtime_secs == 0) return; // No exciting changes to stopping conditions
// Extract stopping conditions from each CV model, and compute the best stopping answer
_parms._stopping_rounds = 0;
_parms._max_runtime_secs = 0;
int sum = 0;
for( int i=0; i<cvModelBuilders.length; ++i )
sum += ((SharedTreeModel.SharedTreeOutput)DKV.<Model>getGet(cvModelBuilders[i].dest())._output)._ntrees;
_parms._ntrees = (int)((double)sum/cvModelBuilders.length);
warn("_ntrees", "Setting optimal _ntrees to " + _parms._ntrees + " for cross-validation main model based on early stopping of cross-validation models.");
warn("_stopping_rounds", "Disabling convergence-based early stopping for cross-validation main model.");
warn("_max_runtime_secs", "Disabling maximum allowed runtime for cross-validation main model.");
}
}