hex.tree.ScoreBuildHistogram2 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of h2o-algos Show documentation
H2O Algorithms
There is a newer version: 3.46.0.6
package hex.tree;

import hex.genmodel.utils.DistributionFamily;
import jsr166y.CountedCompleter;
import water.*;
import water.fvec.*;
import water.util.ArrayUtils;
import water.util.IcedBitSet;
import water.util.Log;
import water.util.VecUtils;
import static hex.tree.SharedTree.ScoreBuildOneTree;

import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;

/**
 * Created by tomas on 10/28/16.
 *
 * Score and Build Histogram.
 *
 * This is an updated version ditching histogram sharing (still optional) to improve perfomance on multi-cpu systems (witnessed speedup of up to 4x).
 *
 * NOTE: unlike standard MRTask, launch via dfork2 instead of doAll/dfork. Has custom 2-phase local mapreduce task.
 *
 * Fuse 2 conceptual passes into one (MRTask):
 *
 * 

 *
 * Pass 1:
Score a prior partially-built tree model, and make new Node assignments to
 * every row.  This involves pulling out the current assigned DecidedNode,
 * "scoring" the row against that Node's decision criteria, and assigning the
 * row to a new child UndecidedNode (and giving it an improved prediction).
 *
 * Pass 2:
Build new summary DHistograms on the new child UndecidedNodes
 * every row got assigned into.  Collect counts, mean, variance, min,
 * max per bin, per column.
 * 
 *
 * The 2 passes are executed (locally) in sequence.
 *
 * The result is a set of DHistogram arrays; one DHistogram array for each
 * unique 'leaf' in the tree being histogramed in parallel.  These have node
 * ID's (nids) from 'leaf' to 'tree._len'.  Each DHistogram array is for all
 * the columns in that 'leaf'.
 *
 * The other result is a prediction "score" for the whole dataset, based on
 * the previous passes' DHistograms.
 *
 *
 * No CAS update:
 *
 * Sharing the histograms proved to be a performance problem on larger multi-cpu machines with many running threads, CAS was the bottleneck.
 *
 * To remove the CAS while minimizing the memory overhead (private copies of histograms), phase 2 is paralellized both over columns (primary) and rows (secondary).
 * Parallelization over different columns precedes paralellization within each column to reduce number of extra histogram copies made.
 *
 * Expected number of per-column tasks running in parallel (and hence histogram copies) is given by
 *
 *    exp(nthreads-pre-column) = max(1,H2O.NUMCPUS - num_cols)
 *
 */
public class ScoreBuildHistogram2 extends ScoreBuildHistogram {
  transient int []   _cids;
  transient Chunk[][] _chks;
  transient double [][] _ys;
  transient double [][] _ws;
  transient int [][] _nhs;
  transient int [][] _rss;
  Frame _fr2;
  final int _numLeafs;
  final IcedBitSet _activeCols;
  final int _respIdx;
  final int _predsIdx;
  final boolean _reproducibleHistos;
  // only for debugging purposes
  final boolean _reduceHistoPrecision; // if enabled allows to test that histograms are 100% reproducible when reproducibleHistos are enabled
  transient Consumer _hcsMonitor;
  final int _treatmentIdx;

  public ScoreBuildHistogram2(ScoreBuildOneTree sb, int treeNum, int k, int ncols, int nbins, DTree tree, int leaf,
                              DHistogram[][] hcs, DistributionFamily family,
                              int respIdx, int weightIdx, int predsIdx, int workIdx, int nidIdxs, int treatmentIdx) {
    super(sb, k, ncols, nbins, tree, leaf, hcs, family, weightIdx, workIdx, nidIdxs, treatmentIdx);
    
    _numLeafs = _hcs.length;
    _respIdx = respIdx;
    _predsIdx = predsIdx;
    _treatmentIdx = treatmentIdx;

    int hcslen = _hcs.length;
    IcedBitSet activeCols = new IcedBitSet(ncols);
    for (int n = 0; n < hcslen; n++) {
      int [] acs = _tree.undecided(n + _leaf)._scoreCols;
      if(acs != null) {
        for (int c : acs) // Columns to score (null, or a list of selected cols)
          activeCols.set(c);
      } else {
        activeCols = null;
        break;
      }
    }
    _activeCols = activeCols;
    _hcs = ArrayUtils.transpose(_hcs);
    // override defaults using debugging parameters where applicable
    SharedTree.SharedTreeDebugParams dp = sb._st.getDebugParams();
    _reproducibleHistos = tree._parms.forceStrictlyReproducibleHistograms() || dp._reproducible_histos;
    _reduceHistoPrecision = !dp._keep_orig_histo_precision;
    if (_reproducibleHistos && treeNum == 0 && k == 0 && leaf == 0) {
      Log.info("Using a deterministic way of building histograms");
    }
    _hcsMonitor = dp.makeDHistogramMonitor(treeNum, k, leaf);
  }

  void dfork2(Frame fr) {
    _fr2 = fr;
    asyncExecOnAllNodes();
  }

  @Override public void map(Chunk [] chks){
    // Even though this is an MRTask over a Frame, map(Chunk [] chks) should not be called for this task.
    //  Instead, we do a custom 2-stage local pass (launched from setupLocal) using LocalMR.
    //
    // There are 2 reasons for that:
    //    a) We have 2 local passes. 1st pass scores the trees and sorts rows, 2nd pass starts after the 1st pass is done and computes the histogram.
    //       Conceptually two tasks but since we do not need global result we want to do the two passes inside of 1 task - no need to insert extra communication overhead here.
    //    b) To reduce the memory overhead in pass 2(in case we're making private DHistogram copies).
    //       There is a private copy made for each task. MRTask forks one task per one line of chunks and we do not want to make too many copies.
    //       By reusing the same DHisto for multiple chunks we save memory and calls to reduce.
    //
    throw H2O.unimpl();
  }


  // Pass 1: Score a prior partially-built tree model, and make new Node
  // assignments to every row.  This involves pulling out the current
  // assigned DecidedNode, "scoring" the row against that Node's decision
  // criteria, and assigning the row to a new child UndecidedNode (and
  // giving it an improved prediction).
  protected int[] score_decide(Chunk chks[], int nnids[]) {
    int [] res = nnids.clone();
    for( int row=0; row largestChunkSz) largestChunkSz = sz;
    }
    final int fLargestChunkSz = largestChunkSz;
    final AtomicInteger cidx = new AtomicInteger(0);
    // First do the phase 1 on all local data
    new LocalMR(new MrFun(){
      // more or less copied from ScoreBuildHistogram
      private void map(int id, Chunk [] chks) {
        final C4VolatileChunk nids = (C4VolatileChunk) chks[_nidIdx];
        // Pass 1: Score a prior partially-built tree model, and make new Node
        // assignments to every row.  This involves pulling out the current
        // assigned DecidedNode, "scoring" the row against that Node's decision
        // criteria, and assigning the row to a new child UndecidedNode (and
        // giving it an improved prediction).
        int [] nnids;
        if( _leaf > 0)            // Prior pass exists?
          nnids = score_decide(chks,nids.getValues());
        else {                     // Just flag all the NA rows
          nnids = new int[nids._len];
          int [] is = nids.getValues();
          for (int row = 0; row < nids._len; row++) {
            if (isDecidedRow(is[row]))
              nnids[row] = DECIDED_ROW;
          }
        }
        // Pass 2: accumulate all rows, cols into histograms
        // Sort the rows by NID, so we visit all the same NIDs in a row
        // Find the count of unique NIDs in this chunk
        int nh[] = (_nhs[id] = new int[_numLeafs + 1]);
        for (int i : nnids)
          if (i >= 0)
            nh[i + 1]++;
        // Rollup the histogram of rows-per-NID in this chunk
        for (int i = 0; i <_numLeafs; i++) nh[i + 1] += nh[i];
        // Splat the rows into NID-groups
        int rows[] = (_rss[id] = new int[nnids.length]);
        for (int row = 0; row < nnids.length; row++)
          if (nnids[row] >= 0)
            rows[nh[nnids[row]]++] = row;

      }
      @Override
      protected void map(int id) {
        Vec[] vecs = _fr2.vecs();
        for(id = cidx.getAndIncrement(); id < _cids.length; id = cidx.getAndIncrement()) {
          int cidx = _cids[id];
          Chunk [] chks = _chks[id];
          for (int i = 0; i < chks.length; ++i)
            chks[i] = vecs[i].chunkForChunkIdx(cidx);
          map(id,chks);
          chks[_nidIdx].close(cidx,_fs);
          Chunk resChk = chks[_workIdx];
          int len = resChk.len();
          final double[] y;
          if(resChk instanceof C8DVolatileChunk){
            y = ((C8DVolatileChunk)resChk).getValues();
          } else 
            y = resChk.getDoubles(MemoryManager.malloc8d(len), 0, len);
          int[] nh = _nhs[id];
          _ys[id] = MemoryManager.malloc8d(len);
          // Important optimization that helps to avoid cache misses when working on larger datasets
          // `y` has original order corresponding to row order
          // In binning we are accessing data semi-randomly - we only touch values/rows that are in the given
          // node. These are not necessarily next to each other in memory. This is done on a per-feature basis.
          // To optimize for sequential access we reorder the target so that values corresponding to the same node
          // are co-located. Observed speed-up is up to 50% for larger datasets.
          // See DHistogram#updateHisto for reference.
          for (int n = 0; n < nh.length; n++) {
            final int lo = (n == 0 ? 0 : nh[n - 1]);
            final int hi = nh[n];
            if (hi == lo)
              continue;
            for (int i = lo; i < hi; i++) {
              _ys[id][i] = y[_rss[id][i]];
            }
          }
          // Only allocate weights if weight columns is actually used. It is faster to handle null case
          // in binning that to represent the weights using a constant array (it still needs to be in memory
          // and is accessed frequently - waste of CPU cache). 
          if (_weightIdx != -1) {
            _ws[id] = chks[_weightIdx].getDoubles(MemoryManager.malloc8d(len), 0, len);
          }
        }
      }
    },new H2O.H2OCountedCompleter(this){
      public void onCompletion(CountedCompleter cc){
        final int ncols = _ncols;
        final int [] active_cols = _activeCols == null?null:new int[Math.max(1,_activeCols.cardinality())];
        final int nactive_cols = active_cols == null?ncols:active_cols.length;
        ScoreBuildHistogram2.this.addToPendingCount(1+nactive_cols);
        if(active_cols != null) {
          int j = 0;
          for (int i = 0; i < ncols; ++i)
            if (_activeCols.contains(i))
              active_cols[j++] = i;
        }
        // MRTask (over columns) launching MrTasks (over number of workers) for each column.
        // We want FJ to start processing all the columns before parallelizing within column to reduce memory overhead.
        // (running single column in n threads means n-copies of the histogram)
        // This is how it works:
        //    1) Outer MRTask walks down it's tree, forking tasks with exponentially decreasing number of columns until reaching its left most leaf for columns 0.
        //       At this point, the local fjq for this thread has a task for processing half of columns at the bottom, followed by task for 1/4 of columns and so on.
        //       Other threads start stealing work from the bottom.
        //    2) forks the leaf task and (because its polling from the top) executes the LocalMr for the column 0.
        // This way we should have columns as equally distributed as possible without resorting to shared priority queue
        final int numWrks = _hcs.length * nactive_cols < 16 * 1024 ? H2O.NUMCPUS : Math.min(H2O.NUMCPUS, Math.max(4 * H2O.NUMCPUS / nactive_cols, 1));
        final int rem = H2O.NUMCPUS - numWrks * ncols;
        new LocalMR(new MrFun() {
          @Override
          protected void map(int c) {
            c = active_cols == null ? c : active_cols[c];
            final int nthreads = numWrks + (c < rem ? 1 : 0);
            WorkAllocator workAllocator = _reproducibleHistos ? new RangeWorkAllocator(_cids.length, nthreads) : new SharedPoolWorkAllocator(_cids.length); 
            ComputeHistoThread computeHistoThread = new ComputeHistoThread(_hcs.length == 0?new DHistogram[0]:_hcs[c],c,fLargestChunkSz,workAllocator);
            LocalMR mr = new LocalMR(computeHistoThread, nthreads, ScoreBuildHistogram2.this);
            if (_reproducibleHistos) {
              mr = mr.withNoPrevTaskReuse();
              assert mr.isReproducible();
            }
            mr.fork();
          }
        },nactive_cols,ScoreBuildHistogram2.this).fork();
      }
    }).fork();
  }

  private static void mergeHistos(DHistogram [] hcs, DHistogram [] hcs2){
    // Distributed histograms need a little work
    for( int i=0; i< hcs.length; i++ ) {
      DHistogram hs1 = hcs[i], hs2 = hcs2[i];
      if( hs1 == null ) hcs[i] = hs2;
      else if( hs2 != null )
        hs1.add(hs2);
    }
  }

  interface WorkAllocator {
    int getMaxId(int subsetId);
    int allocateWork(int subsetId);
  }

  static class SharedPoolWorkAllocator implements WorkAllocator {
    final int _workAmount;
    final AtomicInteger _id;

    SharedPoolWorkAllocator(int workAmount) {
      _workAmount = workAmount;
      _id = new AtomicInteger();
    }

    @Override
    public int getMaxId(int subsetId) {
      return _workAmount;
    }

    @Override
    public int allocateWork(int subsetId) {
      return _id.getAndIncrement();
    }
  }

  static class RangeWorkAllocator implements WorkAllocator {
    final int _workAmount;
    final int[] _rangePositions;
    final int _rangeLength;

    RangeWorkAllocator(int workAmount, int nWorkers) {
      _workAmount = workAmount;
      _rangePositions = new int[nWorkers];
      _rangeLength = (int) Math.ceil(workAmount / (double) nWorkers);
      int p = 0;
      for (int i = 0; i < _rangePositions.length; i++) {
        _rangePositions[i] = p;
        p += _rangeLength;
      }
    }

    @Override
    public int getMaxId(int subsetId) {
      return Math.min((subsetId + 1) * _rangeLength, _workAmount);
    }

    @Override
    public int allocateWork(int subsetId) {
      return _rangePositions[subsetId]++;
    }
  }

  private class ComputeHistoThread extends MrFun {
    final int _maxChunkSz;
    final int _col;
    final DHistogram [] _lh;

    WorkAllocator _allocator;

    ComputeHistoThread(DHistogram [] hcs, int col, int maxChunkSz, WorkAllocator allocator){
      _lh = hcs; _col = col; _maxChunkSz = maxChunkSz;
      _allocator = allocator;
    }

    @Override
    public ComputeHistoThread makeCopy() {
      return new ComputeHistoThread(ArrayUtils.deepClone(_lh),_col,_maxChunkSz,_allocator);
    }

    @Override
    protected void map(int id){
      Object cs = null;
      double[] resp = null;
      double[] preds = null;
      double[] treatment = null;
      final int maxWorkId = _allocator.getMaxId(id);
      for(int i = _allocator.allocateWork(id); i < maxWorkId; i = _allocator.allocateWork(id)) {
        if (cs == null) { // chunk data cache doesn't exist yet
          if (_respIdx >= 0)
            resp = MemoryManager.malloc8d(_maxChunkSz);
          if (_predsIdx >= 0)
            preds = MemoryManager.malloc8d(_maxChunkSz);
          if (_treatmentIdx >= 0)
            treatment = MemoryManager.malloc8d(_maxChunkSz);
        }
        cs = computeChunk(i, cs, _ws[i], resp, preds, treatment);
      }
    }

    private Object computeChunk(int id, Object cs, double[] ws, double[] resp, double[] preds, double[] treatment){
      int [] nh = _nhs[id];
      int [] rs = _rss[id];
      Chunk resChk = _chks[id][_workIdx];
      int len = resChk._len;
      double [] ys = ScoreBuildHistogram2.this._ys[id];
      final int hcslen = _lh.length;
      boolean extracted = false;
      for (int n = 0; n < hcslen; n++) {
        int sCols[] = _tree.undecided(n + _leaf)._scoreCols; // Columns to score (null, or a list of selected cols)
        if (sCols == null || ArrayUtils.find(sCols, _col) >= 0) {
          DHistogram h = _lh[n];
          int hi = nh[n];
          int lo = (n == 0 ? 0 : nh[n - 1]);
          if (hi == lo || h == null) continue; // Ignore untracked columns in this split
          if (h._vals == null) h.init();
          if (! extracted) {
            cs = h.extractData(_chks[id][_col], cs, len, _maxChunkSz);
            if (h._vals_dim >= 6) {
              _chks[id][_respIdx].getDoubles(resp, 0, len);
              if (h._vals_dim == 7) {
                _chks[id][_predsIdx].getDoubles(preds, 0, len);
              }
            }
            if(h._useUplift){
              _chks[id][_respIdx].getDoubles(resp, 0, len);
              _chks[id][_treatmentIdx].getDoubles(treatment, 0, len);
            }
            extracted = true;
          }
          h.updateHisto(ws, resp, cs, ys, preds, rs, hi, lo, treatment);
        }
      }
      return cs;
    }

    @Override
    protected void reduce(ComputeHistoThread cc) {
      assert _lh != cc._lh;
      mergeHistos(_lh, cc._lh);
    }
  }

  @Override public void postGlobal(){
    _hcs = ArrayUtils.transpose(_hcs);
    for(DHistogram [] ary:_hcs)
      for(DHistogram dh:ary) {
        if (dh == null)
          continue;
        if (_reduceHistoPrecision) 
          dh.reducePrecision();
      }
    if (_hcsMonitor != null)
      _hcsMonitor.accept(_hcs);
  }
}