package hex.tree;
import water.MRTask;
import water.H2O.H2OCountedCompleter;
import water.fvec.Chunk;
import water.util.AtomicUtils;
/** Score and Build Histogram
 *
 *  Fuse 2 conceptual passes into one:
 *
 *  - Pass 1: Score a prior partially-built tree model, and make new Node
 *    assignments to every row.  This involves pulling out the current
 *    assigned DecidedNode, "scoring" the row against that Node's decision
 *    criteria, and assigning the row to a new child UndecidedNode (and
 *    giving it an improved prediction).
 *
 *  - Pass 2: Build new summary DHistograms on the new child UndecidedNodes
 *    every row got assigned into.  Collect counts, mean, variance, min,
 *    max per bin, per column.
 *
 *  The result is a set of DHistogram arrays; one DHistogram array for each
 *  unique 'leaf' in the tree being histogrammed in parallel.  These have node
 *  ID's (nids) from 'leaf' to 'tree._len'.  Each DHistogram array is for all
 *  the columns in that 'leaf'.
 *
 *  The other result is a prediction "score" for the whole dataset, based on
 *  the previous passes' DHistograms.
 */
public class ScoreBuildHistogram extends MRTask<ScoreBuildHistogram> {
final int _k; // Which tree
final int _ncols;// Active feature columns
final int _nbins;// Numerical columns: Number of bins in each histogram
final int _nbins_cats;// Categorical columns: Number of bins in each histogram
final DTree _tree; // Read-only, shared (except at the histograms in the Nodes)
final int _leaf; // Number of active leaves (per tree)
// Histograms for every tree, split & active column
final DHistogram _hcs[/*tree-relative node-id*/][/*column*/];
final boolean _subset; // True if working a subset of cols
public ScoreBuildHistogram(H2OCountedCompleter cc, int k, int ncols, int nbins, int nbins_cats, DTree tree, int leaf, DHistogram hcs[][], boolean subset) {
super(cc);
_k = k;
_ncols= ncols;
_nbins= nbins;
_nbins_cats= nbins_cats;
_tree = tree;
_leaf = leaf;
_hcs = hcs;
_subset = subset;
_modifiesInputs = true;
}
/** Marker for already decided row. */
static public final int DECIDED_ROW = -1;
/** Marker for sampled out rows */
static public final int OUT_OF_BAG = -2;
static public boolean isOOBRow(int nid) { return nid <= OUT_OF_BAG; }
static public boolean isDecidedRow(int nid) { return nid == DECIDED_ROW; }
static public int oob2Nid(int oobNid) { return -oobNid + OUT_OF_BAG; }
static public int nid2Oob(int nid) { return -nid + OUT_OF_BAG; }
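  // Worked illustration of the encoding round-trip (not shipped logic): an
  // in-bag row sitting at node id 3 that gets sampled out is stored as
  // nid2Oob(3) = -3 + OUT_OF_BAG = -5; isOOBRow(-5) is true since -5 <= -2;
  // and oob2Nid(-5) = -(-5) + OUT_OF_BAG = 3 recovers the node id.  So OOB
  // rows still track their position in the tree without being mistaken for
  // live rows (nid >= 0) or decided rows (nid == DECIDED_ROW == -1).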
// Once-per-node shared init
@Override public void setupLocal( ) {
// Init all the internal tree fields after shipping over the wire
_tree.init_tree();
// Allocate local shared memory histograms
for( int l=_leaf; l<_tree._len; l++ ) {
DTree.UndecidedNode udn = _tree.undecided(l);
DHistogram hs[] = _hcs[l-_leaf];
int sCols[] = udn._scoreCols;
if( sCols != null ) { // Sub-selecting just some columns?
for( int col : sCols ) // For tracked cols
hs[col].init();
} else { // Else all columns
for( int j=0; j<_ncols; j++) // For all columns
if( hs[j] != null ) // Tracking this column?
hs[j].init();
}
}
}
@Override public void map( Chunk[] chks ) {
assert chks.length==_ncols+4;
final Chunk wrks = chks[_ncols+2];
final Chunk nids = chks[_ncols+3];
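    // Of the 4 trailing non-feature chunks, only two are touched here:
    // 'wrks' holds the per-row value being histogrammed (the response or
    // current residual being fit), and 'nids' holds each row's current
    // tree node id, updated as rows move down the tree.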
// Pass 1: Score a prior partially-built tree model, and make new Node
// assignments to every row. This involves pulling out the current
// assigned DecidedNode, "scoring" the row against that Node's decision
// criteria, and assigning the row to a new child UndecidedNode (and
// giving it an improved prediction).
int nnids[] = new int[nids._len];
if( _leaf > 0) // Prior pass exists?
score_decide(chks,nids,nnids);
else // Just flag all the NA rows
      for( int row=0; row<nids._len; row++ )
        if( isDecidedRow((int)nids.atd(row)) )
          nnids[row] = DECIDED_ROW;

    // Pass 2: Build new summary DHistograms on the new child UndecidedNodes
    // every row got assigned into.
    if( _subset ) accum_subset(chks,wrks,nids,nnids);
    else          accum_all   (chks,wrks,     nnids);
  }

  // Pass 1 helper: walk each row one level down the partially-built tree,
  // starting from its currently assigned node, and record the new
  // tree-relative node id in nnids[].
  private void score_decide(Chunk chks[], Chunk nids, int nnids[]) {
    for( int row=0; row<nids._len; row++ ) { // Over all rows
      int nid = (int)nids.at8(row);          // Get Node to decide from
      if( isDecidedRow(nid) ) {              // Row already fully decided?
        nnids[row] = nid-_leaf;
        continue;
      }
      // Score row against current decisions & assign new split
      boolean oob = isOOBRow(nid);
      if( oob ) nid = oob2Nid(nid);   // Sampled away, but position in tree is still tracked
      DTree.DecidedNode dn = _tree.decided(nid);
      nid = dn.ns(chks,row);          // Score against dn's decision criteria; move down 1 level
      if( !isDecidedRow(nid) ) {
        if( oob ) nid = nid2Oob(nid); // Re-apply the OOB encoding
        nids.set(row, nid);
      }
      nnids[row] = nid-_leaf;
    }
  }

  // All rows, some cols, accumulate histograms
  private void accum_subset(Chunk chks[], Chunk wrks, Chunk nids, int nnids[]) {
    for( int row=0; row<nnids.length; row++ ) { // Over all rows
      int nid = nnids[row];                     // Get Node to decide from
      if( nid >= 0 ) { // Skip rows already decided (predict perfectly) or OOB (nid < 0)
assert !Double.isNaN(wrks.atd(row)); // Already marked as sampled-away
DHistogram nhs[] = _hcs[nid];
int sCols[] = _tree.undecided(nid+_leaf)._scoreCols; // Columns to score (null, or a list of selected cols)
for( int col : sCols ) // For tracked cols
//FIXME/TODO: sum into local variables, do atomic increment once at the end, similar to accum_all
nhs[col].incr((float)chks[col].atd(row),wrks.atd(row)); // Histogram row/col
}
}
}
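  // One way the FIXME above could look (a sketch only, assuming the
  // histogram exposes the same _bins/_sums fields that accum_all2 below
  // updates; not the shipped code): gather per-bin tallies into plain local
  // arrays, then publish each touched bin with a single atomic add:
  //
  //   int    cnts[] = new int   [nhs[col]._bins.length];
  //   double sums[] = new double[nhs[col]._bins.length];
  //   ... bump cnts[b] and sums[b] for every row landing in bin b ...
  //   for( int b=0; b<cnts.length; b++ ) {
  //     if( cnts[b] != 0 ) AtomicUtils.IntArray   .add(nhs[col]._bins,b,cnts[b]);
  //     if( sums[b] != 0 ) AtomicUtils.DoubleArray.add(nhs[col]._sums,b,sums[b]);
  //   }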
// All rows, all cols, accumulate histograms. This is the hot hot inner
// loop of GBM, so we do some non-standard optimizations. The rows in this
// chunk are spread out amongst a modest set of NodeIDs/splits. Normally
// we would visit the rows in row-order, but this visits the NIDs in random
// order. The hot-part of this code updates the histograms racily (via
// atomic updates) - once-per-row. This optimized version updates the
// histograms once-per-NID, but requires pre-sorting the rows by NID.
private void accum_all(Chunk chks[], Chunk wrks, int nnids[]) {
// Sort the rows by NID, so we visit all the same NIDs in a row
// Find the count of unique NIDs in this chunk
int nh[] = new int[_hcs.length+1];
for( int i : nnids ) if( i >= 0 ) nh[i+1]++;
// Rollup the histogram of rows-per-NID in this chunk
for( int i=0; i<_hcs.length; i++ ) nh[i+1] += nh[i];
// Splat the rows into NID-groups
int rows[] = new int[nnids.length];
    for( int row=0; row<nnids.length; row++ )
      if( nnids[row] >= 0 )
        rows[nh[nnids[row]]++] = row;
// rows[] has Chunk-local ROW-numbers now, in-order, grouped by NID.
// nh[] lists the start of each new NID, and is indexed by NID+1.
accum_all2(chks,wrks,nh,rows);
}
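  // Worked example of the grouping above (illustration only): with 2 active
  // leaves and nnids = {1,0,1,-1,0}, counting gives nh = {0,2,2}, the rollup
  // gives nh = {0,2,4}, and the splat loop fills rows = {1,4,0,2,_} while
  // advancing nh to {2,4,4}.  Afterwards nid 0 owns rows[0..2) = rows 1,4
  // and nid 1 owns rows[2..4) = rows 0,2; row 3 (nid -1) is dropped.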
// For all columns, for all NIDs, for all ROWS...
private void accum_all2(Chunk chks[], Chunk wrks, int nh[], int[] rows) {
final DHistogram hcs[][] = _hcs;
if( hcs.length==0 ) return; // Unlikely fast cutout
// Local temp arrays, no atomic updates.
int bins[] = new int [Math.max(_nbins, _nbins_cats)];
double sums[] = new double[Math.max(_nbins, _nbins_cats)];
double ssqs[] = new double[Math.max(_nbins, _nbins_cats)];
// For All Columns
for( int c=0; c<_ncols; c++) { // for all columns
Chunk chk = chks[c];
// For All NIDs
      for( int n=0; n<hcs.length; n++ ) {
        final DRealHistogram rh = (DRealHistogram)hcs[n][c];
        if( rh == null ) continue;         // Ignore untracked columns in this split
        final int lo = n==0 ? 0 : nh[n-1]; // Rows for this NID live in rows[lo..hi)
        final int hi = nh[n];
        float min = rh._min2;              // Shared observed min/max so far
        float max = rh._maxIn;
        if( rh._bins.length >= bins.length ) { // Grow bins if needed
bins = new int [rh._bins.length];
sums = new double[rh._bins.length];
ssqs = new double[rh._bins.length];
}
// Gather all the data for this set of rows, for 1 column and 1 split/NID
// Gather min/max, sums and sum-squares.
        for( int xrow=lo; xrow<hi; xrow++ ) {
          int row = rows[xrow];
          float col_data = (float)chk.atd(row);
          if( col_data < min ) min = col_data;
          if( col_data > max ) max = col_data;
int b = rh.bin(col_data); // Compute bin# via linear interpolation
bins[b]++; // Bump count in bin
double resp = wrks.atd(row);
sums[b] += resp;
ssqs[b] += resp*resp;
}
// Add all the data into the Histogram (atomically add)
rh.setMin(min); // Track actual lower/upper bound per-bin
rh.setMax(max);
        for( int b=0; b<rh._bins.length; b++ ) { // Bump counts in bins
          // Fold the local tallies into the shared histogram with one atomic
          // add per touched bin, then zero the temps for reuse on the next NID.
          if( bins[b] != 0 ) { AtomicUtils.IntArray.add(rh._bins,b,bins[b]); bins[b]=0; }
          if( sums[b] != 0 ) { AtomicUtils.DoubleArray.add(rh._sums,b,sums[b]); sums[b]=0; }
          if( ssqs[b] != 0 ) { AtomicUtils.DoubleArray.add(rh._ssqs,b,ssqs[b]); ssqs[b]=0; }
        }
      }
    }
  }
}
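// (Assumed call pattern, for orientation only: the tree builder typically
// launches this task over the working Frame roughly as
//   new ScoreBuildHistogram(cmp,k,ncols,nbins,nbins_cats,tree,leaf,hcs,subset).dfork(fr);
// with the H2OCountedCompleter 'cmp' collecting the asynchronous result.
// The exact driver call lives in the tree builder and varies by version.)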