package hex.tree;
import water.MRTask;
import water.H2O.H2OCountedCompleter;
import water.fvec.Chunk;
import water.util.AtomicUtils;
/** Score and Build Histogram
 *
 *  Fuse 2 conceptual passes into one:
 *
 *  - Pass 1: Score a prior partially-built tree model, and make new Node
 *    assignments to every row.  This involves pulling out the current
 *    assigned DecidedNode, "scoring" the row against that Node's decision
 *    criteria, and assigning the row to a new child UndecidedNode (and
 *    giving it an improved prediction).
 *
 *  - Pass 2: Build new summary DHistograms on the new child UndecidedNodes
 *    every row got assigned into.  Collect counts, mean, variance, min,
 *    max per bin, per column.
 *
 *  The result is a set of DHistogram arrays; one DHistogram array for each
 *  unique 'leaf' in the tree being histogrammed in parallel.  These have node
 *  ID's (nids) from 'leaf' to 'tree._len'.  Each DHistogram array is for all
 *  the columns in that 'leaf'.
 *
 *  The other result is a prediction "score" for the whole dataset, based on
 *  the previous passes' DHistograms.
 */
public class ScoreBuildHistogram extends MRTask<ScoreBuildHistogram> {
final int _k; // Which tree
final int _ncols;// Active feature columns
final int _nbins;// Numerical columns: Number of bins in each histogram
final int _nbins_cats;// Categorical columns: Number of bins in each histogram
final DTree _tree; // Read-only, shared (except at the histograms in the Nodes)
final int _leaf; // Number of active leaves (per tree)
// Histograms for every tree, split & active column
final DHistogram _hcs[/*tree-relative node-id*/][/*column*/];
final boolean _subset; // True if working a subset of cols
public ScoreBuildHistogram(H2OCountedCompleter cc, int k, int ncols, int nbins, int nbins_cats, DTree tree, int leaf, DHistogram hcs[][], boolean subset) {
super(cc);
_k = k;
_ncols= ncols;
_nbins= nbins;
_nbins_cats= nbins_cats;
_tree = tree;
_leaf = leaf;
_hcs = hcs;
_subset = subset;
_modifiesInputs = true;
}
/** Marker for already decided row. */
static public final int DECIDED_ROW = -1;
/** Marker for sampled out rows */
static public final int OUT_OF_BAG = -2;
static public boolean isOOBRow(int nid) { return nid <= OUT_OF_BAG; }
static public boolean isDecidedRow(int nid) { return nid == DECIDED_ROW; }
static public int oob2Nid(int oobNid) { return -oobNid + OUT_OF_BAG; }
static public int nid2Oob(int nid) { return -nid + OUT_OF_BAG; }
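  // Worked illustration of the encoding round-trip (not shipped logic): an
  // in-bag row sitting at node id 3 that gets sampled out is stored as
  // nid2Oob(3) = -3 + OUT_OF_BAG = -5; isOOBRow(-5) is true since -5 <= -2;
  // and oob2Nid(-5) = -(-5) + OUT_OF_BAG = 3 recovers the node id.  So OOB
  // rows still track their position in the tree without being mistaken for
  // live rows (nid >= 0) or decided rows (nid == DECIDED_ROW == -1).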
// Once-per-node shared init
@Override public void setupLocal( ) {
// Init all the internal tree fields after shipping over the wire
_tree.init_tree();
// Allocate local shared memory histograms
for( int l=_leaf; l<_tree._len; l++ ) {
DTree.UndecidedNode udn = _tree.undecided(l);
DHistogram hs[] = _hcs[l-_leaf];
int sCols[] = udn._scoreCols;
if( sCols != null ) { // Sub-selecting just some columns?
for( int col : sCols ) // For tracked cols
hs[col].init();
} else { // Else all columns
for( int j=0; j<_ncols; j++) // For all columns
if( hs[j] != null ) // Tracking this column?
hs[j].init();
}
}
}
@Override public void map( Chunk[] chks ) {
assert chks.length==_ncols+4;
final Chunk wrks = chks[_ncols+2];
final Chunk nids = chks[_ncols+3];
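    // Of the 4 trailing non-feature chunks, only two are touched here:
    // 'wrks' holds the per-row value being histogrammed (the response or
    // current residual being fit), and 'nids' holds each row's current
    // tree node id, updated as rows move down the tree.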
// Pass 1: Score a prior partially-built tree model, and make new Node
// assignments to every row. This involves pulling out the current
// assigned DecidedNode, "scoring" the row against that Node's decision
// criteria, and assigning the row to a new child UndecidedNode (and
// giving it an improved prediction).
int nnids[] = new int[nids._len];
if( _leaf > 0) // Prior pass exists?
score_decide(chks,nids,nnids);
else // Just flag all the NA rows
      for( int row=0; row<nids._len; row++ )
        if( isDecidedRow((int)nids.atd(row)) )
          nnids[row] = DECIDED_ROW;

    // Pass 2: Build new summary DHistograms on the new child UndecidedNodes
    // every row got assigned into.
    if( _subset ) accum_subset(chks,wrks,nids,nnids);
    else          accum_all   (chks,wrks,     nnids);
  }

  // Pass 1 helper: walk each row one level down the partially-built tree,
  // starting from its currently assigned node, and record the new
  // tree-relative node id in nnids[].
  private void score_decide(Chunk chks[], Chunk nids, int nnids[]) {
    for( int row=0; row<nids._len; row++ ) { // Over all rows
      int nid = (int)nids.at8(row);          // Get Node to decide from
      if( isDecidedRow(nid) ) {              // Row already fully decided?
        nnids[row] = nid-_leaf;
        continue;
      }
      // Score row against current decisions & assign new split
      boolean oob = isOOBRow(nid);
      if( oob ) nid = oob2Nid(nid);   // Sampled away, but position in tree is still tracked
      DTree.DecidedNode dn = _tree.decided(nid);
      nid = dn.ns(chks,row);          // Score against dn's decision criteria; move down 1 level
      if( !isDecidedRow(nid) ) {
        if( oob ) nid = nid2Oob(nid); // Re-apply the OOB encoding
        nids.set(row, nid);
      }
      nnids[row] = nid-_leaf;
    }
  }

  // All rows, some cols, accumulate histograms
  private void accum_subset(Chunk chks[], Chunk wrks, Chunk nids, int nnids[]) {
    for( int row=0; row<nnids.length; row++ ) { // Over all rows
      int nid = nnids[row];                     // Get Node to decide from
      if( nid >= 0 ) { // Skip rows already decided (predict perfectly) or OOB (nid < 0)
assert !Double.isNaN(wrks.atd(row)); // Already marked as sampled-away
DHistogram nhs[] = _hcs[nid];
int sCols[] = _tree.undecided(nid+_leaf)._scoreCols; // Columns to score (null, or a list of selected cols)
for( int col : sCols ) // For tracked cols
//FIXME/TODO: sum into local variables, do atomic increment once at the end, similar to accum_all
nhs[col].incr((float)chks[col].atd(row),wrks.atd(row)); // Histogram row/col
}
}
}
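  // One way the FIXME above could look (a sketch only, assuming the
  // histogram exposes the same _bins/_sums fields that accum_all2 below
  // updates; not the shipped code): gather per-bin tallies into plain local
  // arrays, then publish each touched bin with a single atomic add:
  //
  //   int    cnts[] = new int   [nhs[col]._bins.length];
  //   double sums[] = new double[nhs[col]._bins.length];
  //   ... bump cnts[b] and sums[b] for every row landing in bin b ...
  //   for( int b=0; b<cnts.length; b++ ) {
  //     if( cnts[b] != 0 ) AtomicUtils.IntArray   .add(nhs[col]._bins,b,cnts[b]);
  //     if( sums[b] != 0 ) AtomicUtils.DoubleArray.add(nhs[col]._sums,b,sums[b]);
  //   }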
// All rows, all cols, accumulate histograms. This is the hot hot inner
// loop of GBM, so we do some non-standard optimizations. The rows in this
// chunk are spread out amongst a modest set of NodeIDs/splits. Normally
// we would visit the rows in row-order, but this visits the NIDs in random
// order. The hot-part of this code updates the histograms racily (via
// atomic updates) - once-per-row. This optimized version updates the
// histograms once-per-NID, but requires pre-sorting the rows by NID.
private void accum_all(Chunk chks[], Chunk wrks, int nnids[]) {
// Sort the rows by NID, so we visit all the same NIDs in a row
// Find the count of unique NIDs in this chunk
int nh[] = new int[_hcs.length+1];
for( int i : nnids ) if( i >= 0 ) nh[i+1]++;
// Rollup the histogram of rows-per-NID in this chunk
for( int i=0; i<_hcs.length; i++ ) nh[i+1] += nh[i];
// Splat the rows into NID-groups
int rows[] = new int[nnids.length];
    for( int row=0; row<nnids.length; row++ )
      if( nnids[row] >= 0 )
        rows[nh[nnids[row]]++] = row;
// rows[] has Chunk-local ROW-numbers now, in-order, grouped by NID.
// nh[] lists the start of each new NID, and is indexed by NID+1.
accum_all2(chks,wrks,nh,rows);
}
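  // Worked example of the grouping above (illustration only): with 2 active
  // leaves and nnids = {1,0,1,-1,0}, counting gives nh = {0,2,2}, the rollup
  // gives nh = {0,2,4}, and the splat loop fills rows = {1,4,0,2,_} while
  // advancing nh to {2,4,4}.  Afterwards nid 0 owns rows[0..2) = rows 1,4
  // and nid 1 owns rows[2..4) = rows 0,2; row 3 (nid -1) is dropped.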
// For all columns, for all NIDs, for all ROWS...
private void accum_all2(Chunk chks[], Chunk wrks, int nh[], int[] rows) {
final DHistogram hcs[][] = _hcs;
if( hcs.length==0 ) return; // Unlikely fast cutout
// Local temp arrays, no atomic updates.
int bins[] = new int [Math.max(_nbins, _nbins_cats)];
double sums[] = new double[Math.max(_nbins, _nbins_cats)];
double ssqs[] = new double[Math.max(_nbins, _nbins_cats)];
// For All Columns
for( int c=0; c<_ncols; c++) { // for all columns
Chunk chk = chks[c];
// For All NIDs
      for( int n=0; n<hcs.length; n++ ) {
        final DRealHistogram rh = (DRealHistogram)hcs[n][c];
        if( rh == null ) continue;         // Ignore untracked columns in this split
        final int lo = n==0 ? 0 : nh[n-1]; // Rows for this NID live in rows[lo..hi)
        final int hi = nh[n];
        float min = rh._min2;              // Shared observed min/max so far
        float max = rh._maxIn;
        if( rh._bins.length >= bins.length ) { // Grow bins if needed
bins = new int [rh._bins.length];
sums = new double[rh._bins.length];
ssqs = new double[rh._bins.length];
}
// Gather all the data for this set of rows, for 1 column and 1 split/NID
// Gather min/max, sums and sum-squares.
        for( int xrow=lo; xrow<hi; xrow++ ) {
          int row = rows[xrow];
          float col_data = (float)chk.atd(row);
          if( col_data < min ) min = col_data;
          if( col_data > max ) max = col_data;
int b = rh.bin(col_data); // Compute bin# via linear interpolation
bins[b]++; // Bump count in bin
double resp = wrks.atd(row);
sums[b] += resp;
ssqs[b] += resp*resp;
}
// Add all the data into the Histogram (atomically add)
rh.setMin(min); // Track actual lower/upper bound per-bin
rh.setMax(max);
        for( int b=0; b<rh._bins.length; b++ ) { // Bump counts in bins
          // Fold the local tallies into the shared histogram with one atomic
          // add per touched bin, then zero the temps for reuse on the next NID.
          if( bins[b] != 0 ) { AtomicUtils.IntArray.add(rh._bins,b,bins[b]); bins[b]=0; }
          if( sums[b] != 0 ) { AtomicUtils.DoubleArray.add(rh._sums,b,sums[b]); sums[b]=0; }
          if( ssqs[b] != 0 ) { AtomicUtils.DoubleArray.add(rh._ssqs,b,ssqs[b]); ssqs[b]=0; }
        }
      }
    }
  }
}
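// (Assumed call pattern, for orientation only: the tree builder typically
// launches this task over the working Frame roughly as
//   new ScoreBuildHistogram(cmp,k,ncols,nbins,nbins_cats,tree,leaf,hcs,subset).dfork(fr);
// with the H2OCountedCompleter 'cmp' collecting the asynchronous result.
// The exact driver call lives in the tree builder and varies by version.)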