All downloads are free. Search and download functionalities use the official Maven repository.

hex.tree.ScoreBuildHistogram Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.tree;

import water.MRTask;
import water.H2O.H2OCountedCompleter;
import water.fvec.Chunk;
import water.util.AtomicUtils;

/**  Score and Build Histogram
 *
 *  Fuse 2 conceptual passes into one:
 *
 *  Pass 1: Score a prior partially-built tree model, and make new Node
 *  assignments to every row.  This involves pulling out the current assigned
 *  DecidedNode, "scoring" the row against that Node's decision criteria, and
 *  assigning the row to a new child UndecidedNode (and giving it an improved
 *  prediction).
 *
 *  Pass 2: Build new summary DHistograms on the new child UndecidedNodes
 *  every row got assigned into.  Collect counts, mean, variance, min, max
 *  per bin, per column.
 *
 *  The result is a set of DHistogram arrays; one DHistogram array for each
 *  unique 'leaf' in the tree being histogramed in parallel.  These have node
 *  ID's (nids) from 'leaf' to 'tree._len'.  Each DHistogram array is for all
 *  the columns in that 'leaf'.
 *
The other result is a prediction "score" for the whole dataset, based on * the previous passes' DHistograms. */ public class ScoreBuildHistogram extends MRTask { final int _k; // Which tree final int _ncols;// Active feature columns final int _nbins;// Numerical columns: Number of bins in each histogram final int _nbins_cats;// Categorical columns: Number of bins in each histogram final DTree _tree; // Read-only, shared (except at the histograms in the Nodes) final int _leaf; // Number of active leaves (per tree) // Histograms for every tree, split & active column final DHistogram _hcs[/*tree-relative node-id*/][/*column*/]; final boolean _subset; // True if working a subset of cols public ScoreBuildHistogram(H2OCountedCompleter cc, int k, int ncols, int nbins, int nbins_cats, DTree tree, int leaf, DHistogram hcs[][], boolean subset) { super(cc); _k = k; _ncols= ncols; _nbins= nbins; _nbins_cats= nbins_cats; _tree = tree; _leaf = leaf; _hcs = hcs; _subset = subset; _modifiesInputs = true; } /** Marker for already decided row. */ static public final int DECIDED_ROW = -1; /** Marker for sampled out rows */ static public final int OUT_OF_BAG = -2; static public boolean isOOBRow(int nid) { return nid <= OUT_OF_BAG; } static public boolean isDecidedRow(int nid) { return nid == DECIDED_ROW; } static public int oob2Nid(int oobNid) { return -oobNid + OUT_OF_BAG; } static public int nid2Oob(int nid) { return -nid + OUT_OF_BAG; } // Once-per-node shared init @Override public void setupLocal( ) { // Init all the internal tree fields after shipping over the wire _tree.init_tree(); // Allocate local shared memory histograms for( int l=_leaf; l<_tree._len; l++ ) { DTree.UndecidedNode udn = _tree.undecided(l); DHistogram hs[] = _hcs[l-_leaf]; int sCols[] = udn._scoreCols; if( sCols != null ) { // Sub-selecting just some columns? 
for( int col : sCols ) // For tracked cols hs[col].init(); } else { // Else all columns for( int j=0; j<_ncols; j++) // For all columns if( hs[j] != null ) // Tracking this column? hs[j].init(); } } } @Override public void map( Chunk[] chks ) { assert chks.length==_ncols+4; final Chunk wrks = chks[_ncols+2]; final Chunk nids = chks[_ncols+3]; // Pass 1: Score a prior partially-built tree model, and make new Node // assignments to every row. This involves pulling out the current // assigned DecidedNode, "scoring" the row against that Node's decision // criteria, and assigning the row to a new child UndecidedNode (and // giving it an improved prediction). int nnids[] = new int[nids._len]; if( _leaf > 0) // Prior pass exists? score_decide(chks,nids,nnids); else // Just flag all the NA rows for( int row=0; row= 0 ) { // row already predicts perfectly or OOB assert !Double.isNaN(wrks.atd(row)); // Already marked as sampled-away DHistogram nhs[] = _hcs[nid]; int sCols[] = _tree.undecided(nid+_leaf)._scoreCols; // Columns to score (null, or a list of selected cols) for( int col : sCols ) // For tracked cols //FIXME/TODO: sum into local variables, do atomic increment once at the end, similar to accum_all nhs[col].incr((float)chks[col].atd(row),wrks.atd(row)); // Histogram row/col } } } // All rows, all cols, accumulate histograms. This is the hot hot inner // loop of GBM, so we do some non-standard optimizations. The rows in this // chunk are spread out amongst a modest set of NodeIDs/splits. Normally // we would visit the rows in row-order, but this visits the NIDs in random // order. The hot-part of this code updates the histograms racily (via // atomic updates) - once-per-row. This optimized version updates the // histograms once-per-NID, but requires pre-sorting the rows by NID. 
private void accum_all(Chunk chks[], Chunk wrks, int nnids[]) { // Sort the rows by NID, so we visit all the same NIDs in a row // Find the count of unique NIDs in this chunk int nh[] = new int[_hcs.length+1]; for( int i : nnids ) if( i >= 0 ) nh[i+1]++; // Rollup the histogram of rows-per-NID in this chunk for( int i=0; i<_hcs.length; i++ ) nh[i+1] += nh[i]; // Splat the rows into NID-groups int rows[] = new int[nnids.length]; for( int row=0; row= 0 ) rows[nh[nnids[row]]++] = row; // rows[] has Chunk-local ROW-numbers now, in-order, grouped by NID. // nh[] lists the start of each new NID, and is indexed by NID+1. accum_all2(chks,wrks,nh,rows); } // For all columns, for all NIDs, for all ROWS... private void accum_all2(Chunk chks[], Chunk wrks, int nh[], int[] rows) { final DHistogram hcs[][] = _hcs; if( hcs.length==0 ) return; // Unlikely fast cutout // Local temp arrays, no atomic updates. int bins[] = new int [Math.max(_nbins, _nbins_cats)]; double sums[] = new double[Math.max(_nbins, _nbins_cats)]; double ssqs[] = new double[Math.max(_nbins, _nbins_cats)]; // For All Columns for( int c=0; c<_ncols; c++) { // for all columns Chunk chk = chks[c]; // For All NIDs for( int n=0; n= bins.length ) { // Grow bins if needed bins = new int [rh._bins.length]; sums = new double[rh._bins.length]; ssqs = new double[rh._bins.length]; } // Gather all the data for this set of rows, for 1 column and 1 split/NID // Gather min/max, sums and sum-squares. for( int xrow=lo; xrow max ) max = col_data; int b = rh.bin(col_data); // Compute bin# via linear interpolation bins[b]++; // Bump count in bin double resp = wrks.atd(row); sums[b] += resp; ssqs[b] += resp*resp; } // Add all the data into the Histogram (atomically add) rh.setMin(min); // Track actual lower/upper bound per-bin rh.setMax(max); for( int b=0; b





© 2015 - 2025 Weber Informatics LLC | Privacy Policy