// hex.tree.ScoreBuildHistogram (Maven / Gradle / Ivy)
package hex.tree;
import hex.genmodel.utils.DistributionFamily;
import water.H2O.H2OCountedCompleter;
import water.MRTask;
/** Score and Build Histogram
 *
 * <p>Fuse 2 conceptual passes into one:
 *
 * <dl>
 *
 * <dt>Pass 1:</dt>
 * <dd>Score a prior partially-built tree model, and make new Node assignments to
 * every row. This involves pulling out the current assigned DecidedNode,
 * "scoring" the row against that Node's decision criteria, and assigning the
 * row to a new child UndecidedNode (and giving it an improved prediction).</dd>
 *
 * <dt>Pass 2:</dt>
 * <dd>Build new summary DHistograms on the new child UndecidedNodes
 * every row got assigned into. Collect counts, mean, variance, min,
 * max per bin, per column.</dd>
 *
 * </dl>
 *
 * <p>The result is a set of DHistogram arrays; one DHistogram array for each
 * unique 'leaf' in the tree being histogrammed in parallel. These have node
 * IDs (nids) from 'leaf' to 'tree._len'. Each DHistogram array is for all
 * the columns in that 'leaf'.
 *
 * <p>The other result is a prediction "score" for the whole dataset, based on
 * the previous passes' DHistograms.
 */
public class ScoreBuildHistogram extends MRTask {
// Task configuration. Every field below is assigned exactly once, in the constructor.
final int _k; // Which tree
final int _ncols;// Active feature columns
final int _nbins;// Numerical columns: Number of bins in each histogram
final DTree _tree; // Read-only, shared (except at the histograms in the Nodes)
final int _leaf; // Number of active leaves (per tree)
// Histograms for every tree, split & active column.
// First index is the tree-relative node id, second index is the column.
// reduce() merges another task's per-node arrays into this one; a null
// per-node entry here is replaced by the other task's entry.
DHistogram[/*tree-relative node-id*/][/*column*/] _hcs;
// Distribution family handed in by the caller; how it is used is not visible in this part of the file.
final DistributionFamily _family;
// Caller-supplied indices. NOTE(review): the names suggest column/chunk positions of the
// weights, working values, node-id assignments and treatment flag respectively — confirm
// against the code that consumes them.
final int _weightIdx;
final int _workIdx;
final int _nidIdx;
final int _treatmentIdx;
/**
 * Creates the task and records its configuration; no other work is done here.
 *
 * @param cc           completer passed straight through to the superclass constructor
 * @param k            which tree
 * @param ncols        active feature columns
 * @param nbins        number of bins in each histogram (numerical columns)
 * @param tree         the tree being processed; stored by reference, not copied
 * @param leaf         number of active leaves (per tree)
 * @param hcs          histograms indexed by tree-relative node id, then column; stored by reference
 * @param family       distribution family, stored as-is
 * @param weightIdx    stored in {@code _weightIdx} (presumably the weights index — confirm with caller)
 * @param workIdx      stored in {@code _workIdx} (presumably the working-values index — confirm with caller)
 * @param nidIdx       stored in {@code _nidIdx} (presumably the node-id index — confirm with caller)
 * @param treatmentIdx stored in {@code _treatmentIdx} (presumably the treatment index — confirm with caller)
 */
public ScoreBuildHistogram(
    H2OCountedCompleter cc,
    int k,
    int ncols,
    int nbins,
    DTree tree,
    int leaf,
    DHistogram[][] hcs,
    DistributionFamily family,
    int weightIdx,
    int workIdx,
    int nidIdx,
    int treatmentIdx) {
  super(cc);

  // Tree identity and shape.
  this._k = k;
  this._tree = tree;
  this._leaf = leaf;

  // Histogram layout and shared histogram storage.
  this._ncols = ncols;
  this._nbins = nbins;
  this._hcs = hcs;

  // Distribution and caller-supplied indices.
  this._family = family;
  this._weightIdx = weightIdx;
  this._workIdx = workIdx;
  this._nidIdx = nidIdx;
  this._treatmentIdx = treatmentIdx;
}
/** Marker for already decided row. */
public static final int DECIDED_ROW = -1;

/** Marker for sampled out rows; any nid at or below this value counts as out-of-bag. */
public static final int OUT_OF_BAG = -2;

/**
 * Marker for a fresh tree (comment carried over from the original declaration).
 * NOTE(review): this has the same numeric value as {@link #DECIDED_ROW}, and the
 * "fresh tree" wording may have been intended for {@link #FRESH} — confirm.
 */
public static final int UNDECIDED_CHILD_NODE_ID = -1;

/** Zero-valued marker; see the review note on {@link #UNDECIDED_CHILD_NODE_ID}. */
public static final int FRESH = 0;

/**
 * Tells whether a node id is in the out-of-bag range.
 *
 * @param nid node id to test
 * @return {@code true} when {@code nid} is {@link #OUT_OF_BAG} or smaller
 */
public static boolean isOOBRow(int nid) {
  return OUT_OF_BAG >= nid;
}

/**
 * Tells whether a node id is exactly the decided-row marker.
 *
 * @param nid node id to test
 * @return {@code true} when {@code nid} equals {@link #DECIDED_ROW}
 */
public static boolean isDecidedRow(int nid) {
  return DECIDED_ROW == nid;
}

/**
 * Maps an out-of-bag encoded id to {@code OUT_OF_BAG - oobNid}
 * (so {@code -2} becomes {@code 0}, {@code -3} becomes {@code 1}, ...).
 * Uses the same formula as {@link #nid2Oob(int)}, so the two undo each other.
 *
 * @param oobNid out-of-bag encoded id
 * @return the decoded id
 */
public static int oob2Nid(int oobNid) {
  return OUT_OF_BAG - oobNid;
}

/**
 * Maps a node id to {@code OUT_OF_BAG - nid}
 * (so {@code 0} becomes {@code -2}, {@code 1} becomes {@code -3}, ...).
 * Uses the same formula as {@link #oob2Nid(int)}, so the two undo each other.
 *
 * @param nid node id
 * @return the out-of-bag encoded id
 */
public static int nid2Oob(int nid) {
  return OUT_OF_BAG - nid;
}
@Override public void reduce( ScoreBuildHistogram sbh ) {
// Merge histograms
if( sbh._hcs == _hcs )
return; // Local histograms all shared; free to merge
// Distributed histograms need a little work
for( int i=0; i<_hcs.length; i++ ) {
DHistogram[] hs1 = _hcs[i], hs2 = sbh._hcs[i];
if( hs1 == null ) _hcs[i] = hs2;
else if( hs2 != null )
for( int j=0; j