All downloads are free. Search and download functionality uses the official Maven repository.

hex.tree.gbm.GBM_oldQ Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.gbm;

import static water.util.ModelUtils.getPrediction;
import static water.util.Utils.div;
import hex.*;
import hex.VarImp.VarImpRI;
import hex.ConfusionMatrix;
import hex.gbm.DTree.DecidedNode;
import hex.gbm.DTree.LeafNode;
import hex.gbm.DTree.Split;
import hex.gbm.DTree.TreeModel.TreeStats;
import hex.gbm.DTree.UndecidedNode;

import java.util.Arrays;

import water.*;
import water.api.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.util.*;
import water.util.Log.Tag.Sys;

// Gradient Boosted Trees
//
// Based on "Elements of Statistical Learning, Second Edition, page 387"
public class GBM extends SharedTreeModelBuilder {
  static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  // ---------------------------------------------------------------------------
  // User-facing algorithm parameters, exposed through the @API/REST layer.
  // ---------------------------------------------------------------------------

  @API(help = "Distribution for computing loss function. AUTO selects gaussian for continuous and multinomial for categorical response", filter = Default.class, json=true, importance=ParamImportance.CRITICAL)
  public Family family = Family.AUTO;

  // Shrinkage applied to each tree's contribution; smaller values generally need more trees.
  @API(help = "Learning rate, from 0. to 1.0", filter = Default.class, dmin=0, dmax=1, json=true, importance=ParamImportance.SECONDARY)
  public double learn_rate = 0.1;

  // How many grid-search models may build concurrently (capped at 4 via lmax);
  // returned by gridParallelism() below.
  @API(help = "Grid search parallelism", filter = Default.class, lmax = 4, gridable=false, importance=ParamImportance.SECONDARY)
  public int grid_parallelism = 1;

  @API(help = "Seed for the random number generator - only for balancing classes (autogenerated)", filter = Default.class)
  long seed = -1; // To follow R-semantics, each call of GBM with imbalance should provide different seed. -1 means seed autogeneration

  /**
   * Loss-function distribution for GBM.
   * <p>
   * {@code AUTO} lets the builder choose: gaussian for a continuous response,
   * multinomial for a categorical one.  {@code bernoulli} requires the
   * response to be a 2-class categorical (validated in {@code initAlgo}).
   */
  // TODO: Replace with drop-down that displays different distributions depending on cont/cat response
  public enum Family {
    AUTO,
    bernoulli
  }

  /** Sum of variable empirical improvement in squared-error. The value is not scaled! */
  private transient float[/*nfeatures*/] _improvPerVar;

  /**
   * Model produced by the GBM builder.  Adds to the shared tree model the
   * GBM-specific state needed at scoring time: the learning rate, the loss
   * family, and the initial-prediction offset used by score0().
   */
  public static class GBMModel extends DTree.TreeModel {
    static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
    static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

    // Sanitized copy of the builder job that produced this model.
    @API(help = "Model parameters", json = true)
    final private GBM parameters;
    @Override public final GBM get_params() { return parameters; }
    @Override public final Request2 job() { return get_params(); }

    @API(help = "Learning rate, from 0. to 1.0") final double learn_rate;
    @API(help = "Distribution for computing loss function. AUTO selects gaussian for continuous and multinomial for categorical response")
    final Family family;

    // Offset added to tree predictions: set to mean(y) for regression in
    // initWorkFrame, and used inside the bernoulli logit in score0().
    @API(help = "Initially predicted value (for zero trees)")
    double initialPrediction;

    /**
     * Builds a fresh (zero-tree) GBM model.
     *
     * @param job        the GBM builder; a cleaned clone is stored as this model's parameters
     * @param learn_rate shrinkage applied to each tree's contribution
     * @param family     loss distribution (AUTO or bernoulli)
     * (all other arguments are forwarded unchanged to the TreeModel base constructor)
     */
    public GBMModel(GBM job, Key key, Key dataKey, Key testKey, String names[], String domains[][], String[] cmDomain, int ntrees, int max_depth, int min_rows, int nbins, double learn_rate, Family family, int num_folds, float[] priorClassDist, float[] classDist) {
      super(key,dataKey,testKey,names,domains,cmDomain,ntrees,max_depth,min_rows,nbins,num_folds,priorClassDist,classDist);
      // Clone before storing so later mutation of the live job doesn't alias
      // into this model.  NOTE(review): Job.hygiene presumably scrubs
      // transient/job-state fields from the clone — confirm against Job.
      this.parameters = Job.hygiene((GBM) job.clone());
      this.learn_rate = learn_rate;
      this.family = family;
    }
    /**
     * Copy constructor: carries every GBM-specific setting forward from the
     * prior model while the TreeModel base absorbs the new trees, error and
     * confusion matrix.
     */
    private GBMModel(GBMModel prior, DTree[] trees, double err, ConfusionMatrix cm, TreeStats tstats) {
      super(prior, trees, err, cm, tstats);
      // These are immutable per training run; copy them over verbatim.
      this.initialPrediction = prior.initialPrediction;
      this.family            = prior.family;
      this.learn_rate        = prior.learn_rate;
      this.parameters        = prior.parameters;
    }
    /**
     * Copy constructor variant without new error/CM data; the TreeModel base
     * takes the appended trees and tree stats, GBM-specific fields are
     * propagated unchanged.
     */
    private GBMModel(GBMModel prior, DTree[] trees, TreeStats tstats) {
      super(prior, trees, tstats);
      this.parameters = prior.parameters;
      this.learn_rate = prior.learn_rate;
      this.family = prior.family;
      this.initialPrediction = prior.initialPrediction;
    }
    /**
     * Copy constructor used to attach scoring results (error, confusion
     * matrix, variable importance, validation AUC) to an existing model;
     * GBM-specific fields are propagated unchanged.
     */
    private GBMModel(GBMModel prior, double err, ConfusionMatrix cm, VarImp varimp, AUCData validAUC) {
      super(prior, err, cm, varimp, validAUC);
      this.parameters = prior.parameters;
      this.learn_rate = prior.learn_rate;
      this.family = prior.family;
      this.initialPrediction = prior.initialPrediction;
    }
    /**
     * Copy constructor used to attach aggregated (e.g. cross-validation)
     * results: per-fold tree keys, error track, confusion matrices, tree
     * stats, variable importance and validation AUC.
     */
    private GBMModel(GBMModel prior, Key[][] treeKeys, double[] errs, ConfusionMatrix[] cms, TreeStats tstats, VarImp varimp, AUCData validAUC) {
      super(prior, treeKeys, errs, cms, tstats, varimp, validAUC);
      this.parameters = prior.parameters;
      this.learn_rate = prior.learn_rate;
      this.family = prior.family;
      // FIX: every sibling copy-constructor propagates initialPrediction; omitting
      // it here silently zeroed the offset that score0() adds for the bernoulli
      // family, corrupting predictions of models built through this path.
      this.initialPrediction = prior.initialPrediction;
    }

    /** Tags this model with the GBM tree-model type for the shared tree code. */
    @Override protected TreeModelType getTreeModelType() { return TreeModelType.GBM; }

    @Override protected float[] score0(double[] data, float[] preds) {
      float[] p = super.score0(data, preds);    // These are f_k(x) in Algorithm 10.4
      if(family == Family.bernoulli) {
        double fx = p[1] + initialPrediction;
        p[2] = 1.0f/(float)(1f+Math.exp(-fx));
        p[1] = 1f-p[2];
        p[0] = getPrediction(p, data);
        return p;
      }
      if (nclasses()>1) { // classification
        // Because we call Math.exp, we have to be numerically stable or else
        // we get Infinities, and then shortly NaN's.  Rescale the data so the
        // largest value is +/-1 and the other values are smaller.
        // See notes here:  http://www.hongliangjie.com/2011/01/07/logsum/
        float maxval=Float.NEGATIVE_INFINITY;
        float dsum=0;
        if (nclasses()==2)  p[2] = - p[1];
        // Find a max
        for( int k=1; k%content");
    rs.replace("key", k.toString());
    rs.replace("content", content);
    return rs.toString();
  }

  /**
   * Top-level job driver: builds the model, optionally runs n-fold
   * cross-validation, and — whether or not training succeeded — removes the
   * Job and mirrors the final job state into the persisted model's stored
   * parameters.
   * NOTE(review): generic type arguments (e.g. on UKV.get / TAtomic below)
   * appear to have been stripped by the HTML extraction of this file;
   * confirm against the original source before compiling.
   */
  @Override protected void execImpl() {
    try {
      logStart();
      buildModel(seed);
      if (n_folds > 0) CrossValUtils.crossValidate(this);
    } finally {
      remove();                   // Remove Job
      // Re-read our own job record to pick up the terminal state.
      state = UKV.get(self()).state;
      // Atomically stamp that state onto the stored model's parameters.
      new TAtomic() {
        @Override
        public GBMModel atomic(GBMModel m) {
          if (m != null) m.get_params().state = state;
          return m;
        }
      }.invoke(dest());
    }
  }

  /** @return number of grid-search models to build concurrently (user parameter, max 4). */
  @Override public int gridParallelism() {
    return grid_parallelism;
  }

  /** Redirects the REST request to the GBM progress page for this job's destination key. */
  @Override protected Response redirect() {
    return GBMProgressPage.redirect(this, self(), dest());
  }

  /**
   * GBM-specific initialization: allocates the per-feature improvement
   * accumulator used for variable importance, and validates the
   * family/response combination.
   *
   * @throws IllegalArgumentException if family is bernoulli but the response
   *         is not a 2-class categorical
   */
  @Override protected void initAlgo( GBMModel initialModel) {
    // Initialize gbm-specific data structures
    if (importance) _improvPerVar = new float[initialModel.nfeatures()];
    // assert (family != Family.bernoulli) || (_nclass == 2) : "Bernoulli requires the response to be a 2-class categorical";
    if(family == Family.bernoulli && _nclass != 2)
      throw new IllegalArgumentException("Bernoulli requires the response to be a 2-class categorical");
  }
  @Override protected void initWorkFrame(GBMModel initialModel, Frame fr) {
    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(fr);
    // Initial value is mean(y)
    final double mean = (float) fr.vec(initialModel.responseName()).mean();
    // Initialize working response based on given loss function
    if (_nclass == 1) { /* regression */
      initialModel.initialPrediction = mean; // Regression initially predicts the response mean
      new MRTask2() {
        @Override public void map(Chunk[] chks) {
          Chunk tr = chk_tree(chks, 0); // there is only one tree for regression
          for (int i=0; i {
    @Override public void map( Chunk chks[] ) {
      Chunk ys = chk_resp(chks);
      for( int row=0; row {
    @Override public void map( Chunk chks[] ) {
      Chunk ys = chk_resp(chks);
      if( family == Family.bernoulli ) {
        Chunk tr = chk_tree(chks,0);
        Chunk wk = chk_work(chks,0);
        for( int row = 0; row < ys._len; row++)
          // wk.set0(row, 1.0f/(1f+Math.exp(-tr.at0(row))) ); // Prob_1
          wk.set0(row, 1.0f/(1f+Math.exp(tr.at0(row))) );     // Prob_0
      } else if( _nclass > 1 ) {       // Classification
        float fs[] = new float[_nclass+1];
        for( int row=0; row {
    @Override public void map( Chunk chks[] ) {
      Chunk ys = chk_resp(chks);
      if(family == Family.bernoulli) {
        for(int row = 0; row < ys._len; row++) {
          if( ys.isNA0(row) ) continue;
          int y = (int)ys.at80(row); // zero-based response variable
          Chunk wk = chk_work(chks,0);
          // wk.set0(row, y-(float)wk.at0(row));  // wk.at0(row) is Prob_1
          wk.set0(row, y-1f+(float)wk.at0(row));  // wk.at0(row) is Prob_0
        }
      } else if( _nclass > 1 ) {       // Classification

        for( int row=0; rowResiduals
        for( int row=0; row 1 && family != Family.bernoulli ? (double)(_nclass-1)/_nclass : 1.0; // K-1/K for multinomial
    for( int k=0; k<_nclass; k++ ) {
      final DTree tree = ktrees[k];
      if( tree == null ) continue;
      for( int i=0; i fields from tree prediction
            ct.set0(row, (float)(ct.at0(row) + (float) ((LeafNode)tree.node(nid))._pred));
            nids.set0(row,0);
          }
        }
      }
    }.doAll(fr);

    // Collect leaves stats
    for (int i=0; i {
    final DTree _trees[]; // Read-only, shared (except at the histograms in the Nodes)
    final int   _leafs[]; // Number of active leaves (per tree)
    // Per leaf: sum(res);
    double _rss[/*tree/klass*/][/*tree-relative node-id*/];
    // Per leaf: multinomial: sum(|res|*1-|res|), gaussian: sum(1), bernoulli: sum((y-res)*(1-y+res))
    double _gss[/*tree/klass*/][/*tree-relative node-id*/];
    // _leafs[k] is the first leaf's node id for tree k; it re-bases node ids
    // into the leaf-indexed _rss/_gss arrays built in map().
    GammaPass(DTree trees[], int leafs[]) { _leafs=leafs; _trees=trees; }
    @Override public void map( Chunk[] chks ) {
      _gss = new double[_nclass][];
      _rss = new double[_nclass][];
      final Chunk resp = chk_resp(chks); // Response for this frame

      // For all tree/klasses
      for( int k=0; k<_nclass; k++ ) {
        final DTree tree = _trees[k];
        final int   leaf = _leafs[k];
        if( tree == null ) continue; // Empty class is ignored

        // A leaf-biased array of all active Tree leaves.
        final double gs[] = _gss[k] = new double[tree._len-leaf];
        final double rs[] = _rss[k] = new double[tree._len-leaf];
        final Chunk nids = chk_nids(chks,k); // Node-ids  for this tree/class
        final Chunk ress = chk_work(chks,k); // Residuals for this tree/class

        // If we have all constant responses, then we do not split even the
        // root and the residuals should be zero.
        if( tree.root() instanceof LeafNode ) continue;
        for( int row=0; row 1 ? ares*(1-ares) : 1;
          rs[leafnid-leaf] += res;
        }
      }
    }
    /** Element-wise merge of the per-leaf numerator/denominator sums from another map. */
    @Override public void reduce( GammaPass gp ) {
      Utils.add(_gss,gp._gss);
      Utils.add(_rss,gp._rss);
    }
  }

  /** Factory hook: turn an undecided node plus its histograms into a GBM decision node. */
  @Override protected DecidedNode makeDecided( UndecidedNode udn, DHistogram hs[] ) {
    return new GBMDecidedNode(udn,hs);
  }

  // ---
  // GBM DTree decision node: same as the normal DecidedNode, but
  // specifies a decision algorithm given complete histograms on all
  // columns.  GBM algo: find the lowest error amongst *all* columns.
  static class GBMDecidedNode extends DecidedNode {
    GBMDecidedNode( UndecidedNode n, DHistogram[] hs ) { super(n,hs); }
    /** Factory hook so the children of this node are also GBM-flavored. */
    @Override public UndecidedNode makeUndecidedNode(DHistogram[] hs ) {
      return new GBMUndecidedNode(_tree,_nid,hs);
    }

    // Find the column with the best split (lowest score).  Unlike RF, GBM
    // scores on all columns and selects splits on all columns.
    @Override public DTree.Split bestCol( UndecidedNode u, DHistogram[] hs ) {
      DTree.Split best = new DTree.Split(-1,-1,null,(byte)0,Double.MAX_VALUE,Double.MAX_VALUE,0L,0L,0,0);
      if( hs == null ) return best;
      for( int i=0; i© 2015 - 2025 Weber Informatics LLC | Privacy Policy