Please wait. This can take a few minutes ...
Downloading a project requires significant resources. Please understand that we have to cover our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download or modify it as often as you want.
hex.tree.gbm.GBM_oldQ Maven / Gradle / Ivy
package hex.gbm;
import static water.util.ModelUtils.getPrediction;
import static water.util.Utils.div;
import hex.*;
import hex.VarImp.VarImpRI;
import hex.ConfusionMatrix;
import hex.gbm.DTree.DecidedNode;
import hex.gbm.DTree.LeafNode;
import hex.gbm.DTree.Split;
import hex.gbm.DTree.TreeModel.TreeStats;
import hex.gbm.DTree.UndecidedNode;
import java.util.Arrays;
import water.*;
import water.api.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.util.*;
import water.util.Log.Tag.Sys;
// Gradient Boosted Trees
//
// Based on "Elements of Statistical Learning, Second Edition, page 387"
public class GBM extends SharedTreeModelBuilder {
// --- User-visible GBM parameters (exposed via the @API auto-doc weaver) ---
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
@API(help = "Distribution for computing loss function. AUTO selects gaussian for continuous and multinomial for categorical response", filter = Default.class, json=true, importance=ParamImportance.CRITICAL)
public Family family = Family.AUTO;
@API(help = "Learning rate, from 0. to 1.0", filter = Default.class, dmin=0, dmax=1, json=true, importance=ParamImportance.SECONDARY)
public double learn_rate = 0.1;
@API(help = "Grid search parallelism", filter = Default.class, lmax = 4, gridable=false, importance=ParamImportance.SECONDARY)
public int grid_parallelism = 1;
// Not @API-json: internal seed used only for class balancing.
@API(help = "Seed for the random number generator - only for balancing classes (autogenerated)", filter = Default.class)
long seed = -1; // To follow R-semantics, each call of GBM with imbalance should provide different seed. -1 means seed autogeneration
/** Distribution functions */
// Note: AUTO will select gaussian for continuous, and multinomial for categorical response
// TODO: Replace with drop-down that displays different distributions depending on cont/cat response
/** Loss-function family.
 *  AUTO infers the distribution from the response column type;
 *  bernoulli forces two-class logistic loss (validated in initAlgo,
 *  applied via the sigmoid link in GBMModel.score0). */
public enum Family {
AUTO, bernoulli
}
/** Sum of variable empirical improvement in squared-error. The value is not scaled! */
// Allocated in initAlgo only when 'importance' is requested; otherwise stays null.
private transient float[/*nfeatures*/] _improvPerVar;
public static class GBMModel extends DTree.TreeModel {
// --- Persisted GBM model state (exposed via the @API auto-doc weaver) ---
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
@API(help = "Model parameters", json = true)
final private GBM parameters;
// The owning job doubles as the parameter container.
@Override public final GBM get_params() { return parameters; }
@Override public final Request2 job() { return get_params(); }
@API(help = "Learning rate, from 0. to 1.0") final double learn_rate;
@API(help = "Distribution for computing loss function. AUTO selects gaussian for continuous and multinomial for categorical response")
final Family family;
// For regression: mean(y); for bernoulli: the logit offset added in score0.
@API(help = "Initially predicted value (for zero trees)")
double initialPrediction;
/** Build a fresh (zero-tree) GBM model.
 *  Delegates shared tree-model state to the DTree.TreeModel super ctor and
 *  stores a sanitized clone of the launching job as the parameter record. */
public GBMModel(GBM job, Key key, Key dataKey, Key testKey, String names[], String domains[][], String[] cmDomain, int ntrees, int max_depth, int min_rows, int nbins, double learn_rate, Family family, int num_folds, float[] priorClassDist, float[] classDist) {
super(key,dataKey,testKey,names,domains,cmDomain,ntrees,max_depth,min_rows,nbins,num_folds,priorClassDist,classDist);
// Job.hygiene strips non-serializable/transient job state from the clone.
this.parameters = Job.hygiene((GBM) job.clone());
this.learn_rate = learn_rate;
this.family = family;
}
/** Copy ctor: prior model plus newly built trees, an error estimate and a
 *  confusion matrix (one boosting round of progress). */
private GBMModel(GBMModel prior, DTree[] trees, double err, ConfusionMatrix cm, TreeStats tstats) {
super(prior, trees, err, cm, tstats);
this.parameters = prior.parameters;
this.learn_rate = prior.learn_rate;
this.family = prior.family;
this.initialPrediction = prior.initialPrediction;
}
/** Copy ctor: prior model plus newly built trees (no new scoring results). */
private GBMModel(GBMModel prior, DTree[] trees, TreeStats tstats) {
super(prior, trees, tstats);
this.parameters = prior.parameters;
this.learn_rate = prior.learn_rate;
this.family = prior.family;
this.initialPrediction = prior.initialPrediction;
}
/** Copy ctor: prior model plus fresh scoring results (error, CM, variable
 *  importance, validation AUC) without new trees. */
private GBMModel(GBMModel prior, double err, ConfusionMatrix cm, VarImp varimp, AUCData validAUC) {
super(prior, err, cm, varimp, validAUC);
this.parameters = prior.parameters;
this.learn_rate = prior.learn_rate;
this.family = prior.family;
this.initialPrediction = prior.initialPrediction;
}
/** Copy ctor used for cross-validation results: prior model plus per-fold
 *  tree keys, error history and confusion matrices.
 *  Fix: also carry over {@code initialPrediction}, matching the three
 *  sibling copy constructors — previously it was left at its default 0.0,
 *  which would skew bernoulli/regression scoring (score0 adds it to the
 *  raw tree sum) for models produced through this path. */
private GBMModel(GBMModel prior, Key[][] treeKeys, double[] errs, ConfusionMatrix[] cms, TreeStats tstats, VarImp varimp, AUCData validAUC) {
super(prior, treeKeys, errs, cms, tstats, varimp, validAUC);
this.parameters = prior.parameters;
this.learn_rate = prior.learn_rate;
this.family = prior.family;
this.initialPrediction = prior.initialPrediction; // was missing vs. other copy ctors
}
// Identifies this model as GBM-flavored for shared DTree.TreeModel plumbing.
@Override protected TreeModelType getTreeModelType() { return TreeModelType.GBM; }
/** Score one row: convert the per-class raw tree sums f_k(x) returned by
 *  super.score0 into class probabilities and a final predicted label.
 *  bernoulli: p[1] holds f(x); add initialPrediction, apply the logistic
 *  link, and fill p[2]=P(class 1), p[1]=P(class 0), p[0]=predicted label. */
@Override protected float[] score0(double[] data, float[] preds) {
float[] p = super.score0(data, preds); // These are f_k(x) in Algorithm 10.4
if(family == Family.bernoulli) {
double fx = p[1] + initialPrediction;
p[2] = 1.0f/(float)(1f+Math.exp(-fx));
p[1] = 1f-p[2];
p[0] = getPrediction(p, data);
return p;
}
if (nclasses()>1) { // classification
// Because we call Math.exp, we have to be numerically stable or else
// we get Infinities, and then shortly NaN's. Rescale the data so the
// largest value is +/-1 and the other values are smaller.
// See notes here: http://www.hongliangjie.com/2011/01/07/logsum/
float maxval=Float.NEGATIVE_INFINITY;
float dsum=0;
if (nclasses()==2) p[2] = - p[1];
// Find a max
// NOTE(review): the source below is corrupted by HTML extraction — '<...'
// spans were stripped, so the multinomial max/exp/normalize loops and the
// end of this method are missing, and the trailing "%content" /
// rs.replace(...) lines belong to a different HTML-templating helper whose
// declaration was also lost. Recover this region from the upstream h2o
// sources before making any functional change here.
for( int k=1; k%content");
rs.replace("key", k.toString());
rs.replace("content", content);
return rs.toString();
}
/** Job entry point: log parameters, build the model, optionally run n-fold
 *  cross-validation, and — no matter how the build ends — remove the Job
 *  and stamp its terminal state onto the persisted model's parameters. */
@Override protected void execImpl() {
try {
logStart();
buildModel(seed);
if (n_folds > 0) CrossValUtils.crossValidate(this);
} finally {
remove(); // Remove Job
// Re-read our own post-remove job state, then atomically propagate it
// into the stored model so model viewers see the final job state.
// NOTE(review): TAtomic's generic parameter appears stripped by the HTML
// extraction (likely TAtomic<GBMModel>, given atomic()'s signature).
state = UKV.get(self()).state;
new TAtomic() {
@Override
public GBMModel atomic(GBMModel m) {
if (m != null) m.get_params().state = state;
return m;
}
}.invoke(dest());
}
}
/** User-requested grid-search parallelism (bounded above by the @API lmax=4). */
@Override public int gridParallelism() {
return grid_parallelism;
}
/** Redirect the browser to this job's GBM progress page. */
@Override protected Response redirect() {
return GBMProgressPage.redirect(this, self(), dest());
}
/** GBM-specific initialization: allocate the per-feature improvement
 *  accumulator (only when variable importance was requested) and validate
 *  that bernoulli is paired with a 2-class categorical response. */
@Override protected void initAlgo( GBMModel initialModel) {
// Initialize gbm-specific data structures
if (importance) _improvPerVar = new float[initialModel.nfeatures()];
// assert (family != Family.bernoulli) || (_nclass == 2) : "Bernoulli requires the response to be a 2-class categorical";
if(family == Family.bernoulli && _nclass != 2)
throw new IllegalArgumentException("Bernoulli requires the response to be a 2-class categorical");
}
/** Prepare the working frame before boosting: flag rows with a missing
 *  response, set the model's initial prediction, and seed the per-row
 *  "tree" and "work" (residual) columns for the chosen loss function.
 *  NOTE(review): this whole region is corrupted by HTML extraction — every
 *  '<...>' span (generics such as MRTask2<...>, loop bounds like "i < len")
 *  was stripped, fusing several anonymous MRTask2 map() bodies and later
 *  tree-scoring code into one run of text. The surviving fragments (the
 *  bernoulli probability/residual math, the K-1/K multinomial gamma scale,
 *  the leaf-stats collection) are genuine but incomplete; recover the full
 *  text from the upstream h2o sources before making functional changes. */
@Override protected void initWorkFrame(GBMModel initialModel, Frame fr) {
// Tag out rows missing the response column
new ExcludeNAResponse().doAll(fr);
// Initial value is mean(y)
final double mean = (float) fr.vec(initialModel.responseName()).mean();
// Initialize working response based on given loss function
if (_nclass == 1) { /* regression */
initialModel.initialPrediction = mean; // Regression initially predicts the response mean
new MRTask2() {
@Override public void map(Chunk[] chks) {
Chunk tr = chk_tree(chks, 0); // there is only one tree for regression
for (int i=0; i {
@Override public void map( Chunk chks[] ) {
Chunk ys = chk_resp(chks);
for( int row=0; row {
@Override public void map( Chunk chks[] ) {
Chunk ys = chk_resp(chks);
if( family == Family.bernoulli ) {
// Bernoulli: work column holds the class-0 probability sigma(-f(x)).
Chunk tr = chk_tree(chks,0);
Chunk wk = chk_work(chks,0);
for( int row = 0; row < ys._len; row++)
// wk.set0(row, 1.0f/(1f+Math.exp(-tr.at0(row))) ); // Prob_1
wk.set0(row, 1.0f/(1f+Math.exp(tr.at0(row))) ); // Prob_0
} else if( _nclass > 1 ) { // Classification
float fs[] = new float[_nclass+1];
for( int row=0; row {
@Override public void map( Chunk chks[] ) {
Chunk ys = chk_resp(chks);
if(family == Family.bernoulli) {
// Bernoulli residual: y - P(class 1) = y - 1 + P(class 0).
for(int row = 0; row < ys._len; row++) {
if( ys.isNA0(row) ) continue;
int y = (int)ys.at80(row); // zero-based response variable
Chunk wk = chk_work(chks,0);
// wk.set0(row, y-(float)wk.at0(row)); // wk.at0(row) is Prob_1
wk.set0(row, y-1f+(float)wk.at0(row)); // wk.at0(row) is Prob_0
}
} else if( _nclass > 1 ) { // Classification
for( int row=0; rowResiduals
for( int row=0; row 1 && family != Family.bernoulli ? (double)(_nclass-1)/_nclass : 1.0; // K-1/K for multinomial
for( int k=0; k<_nclass; k++ ) {
final DTree tree = ktrees[k];
if( tree == null ) continue;
for( int i=0; i fields from tree prediction
ct.set0(row, (float)(ct.at0(row) + (float) ((LeafNode)tree.node(nid))._pred));
nids.set0(row,0);
}
}
}
}.doAll(fr);
// Collect leaves stats
for (int i=0; i {
// ---- GammaPass state. NOTE(review): the class declaration line itself was
// lost to HTML-extraction garbling just above; this is the MRTask that
// accumulates per-leaf numerator/denominator sums for the leaf gammas. ----
final DTree _trees[]; // Read-only, shared (except at the histograms in the Nodes)
final int _leafs[]; // Number of active leaves (per tree)
// Per leaf: sum(res);
double _rss[/*tree/klass*/][/*tree-relative node-id*/];
// Per leaf: multinomial: sum(|res|*1-|res|), gaussian: sum(1), bernoulli: sum((y-res)*(1-y+res))
double _gss[/*tree/klass*/][/*tree-relative node-id*/];
GammaPass(DTree trees[], int leafs[]) { _leafs=leafs; _trees=trees; }
/** Per-chunk pass: for each class's tree, bucket every row's residual into
 *  its leaf and accumulate the gamma numerator (_rss) and denominator
 *  (_gss) sums.
 *  NOTE(review): the row loop below is truncated by HTML extraction —
 *  "row 1 ? ares*(1-ares) : 1" fuses the loop header with the multinomial
 *  denominator expression; recover from upstream before editing. */
@Override public void map( Chunk[] chks ) {
_gss = new double[_nclass][];
_rss = new double[_nclass][];
final Chunk resp = chk_resp(chks); // Response for this frame
// For all tree/klasses
for( int k=0; k<_nclass; k++ ) {
final DTree tree = _trees[k];
final int leaf = _leafs[k];
if( tree == null ) continue; // Empty class is ignored
// A leaf-biased array of all active Tree leaves.
final double gs[] = _gss[k] = new double[tree._len-leaf];
final double rs[] = _rss[k] = new double[tree._len-leaf];
final Chunk nids = chk_nids(chks,k); // Node-ids for this tree/class
final Chunk ress = chk_work(chks,k); // Residuals for this tree/class
// If we have all constant responses, then we do not split even the
// root and the residuals should be zero.
if( tree.root() instanceof LeafNode ) continue;
for( int row=0; row 1 ? ares*(1-ares) : 1;
rs[leafnid-leaf] += res;
}
}
}
/** Merge per-leaf numerator/denominator sums from another chunk's pass
 *  (element-wise array addition). */
@Override public void reduce( GammaPass gp ) {
Utils.add(_gss,gp._gss);
Utils.add(_rss,gp._rss);
}
}
/** Factory hook: turn a scored UndecidedNode into a GBM decision node. */
@Override protected DecidedNode makeDecided( UndecidedNode udn, DHistogram hs[] ) {
return new GBMDecidedNode(udn,hs);
}
// ---
// GBM DTree decision node: same as the normal DecidedNode, but
// specifies a decision algorithm given complete histograms on all
// columns. GBM algo: find the lowest error amongst *all* columns.
static class GBMDecidedNode extends DecidedNode {
GBMDecidedNode( UndecidedNode n, DHistogram[] hs ) { super(n,hs); }
/** Factory hook: children of a GBM decision node are GBM-flavored
 *  undecided nodes (which score splits on all columns). */
@Override public UndecidedNode makeUndecidedNode(DHistogram[] hs ) {
return new GBMUndecidedNode(_tree,_nid,hs);
}
// Find the column with the best split (lowest score). Unlike RF, GBM
// scores on all columns and selects splits on all columns.
@Override public DTree.Split bestCol( UndecidedNode u, DHistogram[] hs ) {
DTree.Split best = new DTree.Split(-1,-1,null,(byte)0,Double.MAX_VALUE,Double.MAX_VALUE,0L,0L,0,0);
if( hs == null ) return best;
for( int i=0; i© 2015 - 2025 Weber Informatics LLC | Privacy Policy