All Downloads are FREE. Search and download functionalities are using the official Maven repository.

hex.tree.ScoreBuildHistogram Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.tree;

import hex.genmodel.utils.DistributionFamily;
import water.H2O.H2OCountedCompleter;
import water.MRTask;

/**  Score and Build Histogram
 *
 * 

Fuse 2 conceptual passes into one:

* *
Pass 1:
Score a prior partially-built tree model, and make new Node assignments to every row. This involves pulling out the currently assigned DecidedNode, "scoring" the row against that Node's decision criteria, and assigning the row to a new child UndecidedNode (and giving it an improved prediction).
* *
Pass 2:
Build new summary DHistograms on the new child UndecidedNodes every row got assigned into. Collect counts, mean, variance, min, max per bin, per column.
*
* *

The result is a set of DHistogram arrays; one DHistogram array for each unique 'leaf' in the tree being histogrammed in parallel. These have node IDs (nids) from 'leaf' to 'tree._len'. Each DHistogram array is for all the columns in that 'leaf'.

The other result is a prediction "score" for the whole dataset, based on * the previous passes' DHistograms. */ public class ScoreBuildHistogram extends MRTask { final int _k; // Which tree final int _ncols;// Active feature columns final int _nbins;// Numerical columns: Number of bins in each histogram final DTree _tree; // Read-only, shared (except at the histograms in the Nodes) final int _leaf; // Number of active leaves (per tree) // Histograms for every tree, split & active column DHistogram[/*tree-relative node-id*/][/*column*/] _hcs; final DistributionFamily _family; final int _weightIdx; final int _workIdx; final int _nidIdx; final int _treatmentIdx; public ScoreBuildHistogram(H2OCountedCompleter cc, int k, int ncols, int nbins, DTree tree, int leaf, DHistogram[][] hcs, DistributionFamily family, int weightIdx, int workIdx, int nidIdx, int treatmentIdx) { super(cc); _k = k; _ncols= ncols; _nbins= nbins; _tree = tree; _leaf = leaf; _hcs = hcs; _family = family; _weightIdx = weightIdx; _workIdx = workIdx; _nidIdx = nidIdx; _treatmentIdx = treatmentIdx; } /** Marker for already decided row. */ static public final int DECIDED_ROW = -1; /** Marker for sampled out rows */ static public final int OUT_OF_BAG = -2; /** Marker for a fresh tree */ static public final int UNDECIDED_CHILD_NODE_ID = -1; static public final int FRESH = 0; static public boolean isOOBRow(int nid) { return nid <= OUT_OF_BAG; } static public boolean isDecidedRow(int nid) { return nid == DECIDED_ROW; } static public int oob2Nid(int oobNid) { return -oobNid + OUT_OF_BAG; } static public int nid2Oob(int nid) { return -nid + OUT_OF_BAG; } @Override public void reduce( ScoreBuildHistogram sbh ) { // Merge histograms if( sbh._hcs == _hcs ) return; // Local histograms all shared; free to merge // Distributed histograms need a little work for( int i=0; i<_hcs.length; i++ ) { DHistogram[] hs1 = _hcs[i], hs2 = sbh._hcs[i]; if( hs1 == null ) _hcs[i] = hs2; else if( hs2 != null ) for( int j=0; j





© 2015 - 2025 Weber Informatics LLC | Privacy Policy