
// hex.tree.DRealHistogram
package hex.tree;
import water.MemoryManager;
import water.util.ArrayUtils;
import water.util.AtomicUtils;
import water.util.IcedBitSet;
import water.util.MathUtils;
/** A Histogram, computed in parallel over a Vec.
*
* Sums and sums-of-squares of floats
*
* @author Cliff Click
*/
public class DRealHistogram extends DHistogram {
private double _sums[], _ssqs[]; // Sums & square-sums, shared, atomically incremented
/** Build a real-valued (regression) histogram over one column.
 *  @param name       column name (for display/debugging)
 *  @param nbins      number of bins for numeric columns
 *  @param nbins_cats number of bins for categorical columns
 *  @param isInt      column integer-ness flag (see DHistogram)
 *  @param min        inclusive lower bound of the observed range
 *  @param maxEx      exclusive upper bound of the observed range
 *  @param nelems     number of elements in the column
 */
public DRealHistogram(String name, final int nbins, int nbins_cats, byte isInt, float min, float maxEx, long nelems) {
  super(name, nbins, nbins_cats, isInt, min, maxEx, nelems);
}
@Override boolean isBinom() { return false; }
/** Mean response of bin {@code b}; 0 for an empty bin. */
@Override public double mean(int b) {
  final int cnt = _bins[b];
  if (cnt <= 0) return 0;       // empty bin: define the mean as 0
  return _sums[b] / cnt;
}
/** Unbiased sample variance of bin {@code b}; 0 when fewer than 2 rows. */
@Override public double var (int b) {
  final int cnt = _bins[b];
  if (cnt <= 1) return 0;       // variance undefined for 0 or 1 rows
  final double sum = _sums[b];
  // Classic one-pass formula: (sum(y^2) - (sum y)^2/n) / (n-1)
  return (_ssqs[b] - sum * sum / cnt) / (cnt - 1);
}
// Big allocation of arrays
// Lazily allocate the big per-bin accumulator arrays (one double per bin).
@Override void init0() {
  _ssqs = MemoryManager.malloc8d(_nbin); // sums of squared responses
  _sums = MemoryManager.malloc8d(_nbin); // sums of responses
}
// Add one row to a bin found via simple linear interpolation.
// Compute response mean & variance.
// Done racily instead of synchronized F/J map calls, so atomic
// Add one row's response to bin 'b', accumulating sum and sum-of-squares.
// Called racily from parallel F/J map calls, hence the atomic adds.
// Delegates to incr1 (which takes a caller-computed square) so the two
// accumulation paths stay consistent.
@Override void incr0( int b, double y ) {
  incr1(b, y, y*y);
}
// Same, except square done by caller
// Like incr0, but the caller supplies the already-squared response 'yy'.
// The two atomic adds are independent, so ordering between them is free.
void incr1( int b, double y, double yy ) {
  AtomicUtils.DoubleArray.add(_ssqs, b, yy); // accumulate sum of squares
  AtomicUtils.DoubleArray.add(_sums, b, y);  // accumulate sum
}
// Merge two equal histograms together.
// Done in a F/J reduce, so no synchronization needed.
// Fold another histogram's accumulators into this one, element-wise.
// Runs inside an F/J reduce, so the arrays are not shared and no
// synchronization is required.
@Override void add0( DRealHistogram dsh ) {
  ArrayUtils.add(_ssqs, dsh._ssqs);
  ArrayUtils.add(_sums, dsh._sums);
}
// Compute a "score" for a column; lower score "wins" (is a better split).
// Score is the sum of the MSEs when the data is split at a single point.
// mses[1] == MSE for splitting between bins 0 and 1.
// mses[n] == MSE for splitting between bins n-1 and n.
@Override public DTree.Split scoreMSE( int col, int min_rows ) {
final int nbins = nbins();
assert nbins > 1;
// Histogram arrays used for splitting, these are either the original bins
// (for an ordered predictor), or sorted by the mean response (for an
// unordered predictor, i.e. categorical predictor).
double[] sums = _sums;
double[] ssqs = _ssqs;
int [] bins = _bins;
int idxs[] = null; // and a reverse index mapping
// For categorical (unordered) predictors, sort the bins by average
// prediction then look for an optimal split. Currently limited to enums
// where we're one-per-bin. No point for 3 or fewer bins as all possible
// combinations (just 3) are tested without needing to sort.
if( _isInt == 2 && _step == 1.0f && nbins >= 4 ) {
// Sort the index by average response
idxs = MemoryManager.malloc4(nbins+1); // Reverse index
for( int i=0; i=0; b-- ) {
double m0 = sums1[b+1], m1 = sums[b];
double s0 = ssqs1[b+1], s1 = ssqs[b];
long k0 = ns1 [b+1], k1 = bins[b];
if( k0==0 && k1==0 ) continue;
sums1[b] = m0+m1;
ssqs1[b] = s0+s1;
ns1 [b] = k0+k1;
assert ns0[b]+ns1[b]==tot;
}
// Now roll the split-point across the bins. There are 2 ways to do this:
// split left/right based on being less than some value, or being equal/
// not-equal to some value. Equal/not-equal makes sense for categoricals
// but both splits could work for any integral datatype. Do the less-than
// splits first.
int best=0; // The no-split
double best_se0=Double.MAX_VALUE; // Best squared error
double best_se1=Double.MAX_VALUE; // Best squared error
byte equal=0; // Ranged check
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] == 0 ) continue; // Ignore empty splits
if( ns0[b] < min_rows ) continue;
if( ns1[b] < min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always
// We're making an unbiased estimator, so that MSE==Var.
// Then Squared Error = MSE*N = Var*N
// = (ssqs/N - mean^2)*N
// = ssqs - N*mean^2
// = ssqs - N*(sum/N)(sum/N)
// = ssqs - sum^2/N
double se0 = ssqs0[b] - sums0[b]*sums0[b]/ns0[b];
double se1 = ssqs1[b] - sums1[b]*sums1[b]/ns1[b];
if( se0 < 0 ) se0 = 0; // Roundoff error; sometimes goes negative
if( se1 < 0 ) se1 = 0; // Roundoff error; sometimes goes negative
if( (se0+se1 < best_se0+best_se1) || // Strictly less error?
// Or tied MSE, then pick split towards middle bins
(se0+se1 == best_se0+best_se1 &&
Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) {
best_se0 = se0; best_se1 = se1;
best = b;
}
}
// If the bin covers a single value, we can also try an equality-based split
if( _isInt > 0 && _step == 1.0f && // For any integral (not float) column
_maxEx-_min > 2 && idxs==null ) { // Also need more than 2 (boolean) choices to actually try a new split pattern
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] < min_rows ) continue; // Ignore too small splits
long N = ns0[b ] + ns1[b+1];
if( N < min_rows ) continue; // Ignore too small splits
double sums2 = sums0[b ]+sums1[b+1];
double ssqs2 = ssqs0[b ]+ssqs1[b+1];
double si = ssqs2 -sums2 *sums2 / N ; // Left+right, excluding 'b'
double sx = ssqs [b] -sums[b]*sums[b]/bins[b]; // Just 'b'
if( si < 0 ) si = 0; // Roundoff error; sometimes goes negative
if( sx < 0 ) sx = 0; // Roundoff error; sometimes goes negative
if( si+sx < best_se0+best_se1 ) { // Strictly less error?
best_se0 = si; best_se1 = sx;
best = b; equal = 1; // Equality check
}
}
}
// For categorical (unordered) predictors, we sorted the bins by average
// prediction then found the optimal split on sorted bins
IcedBitSet bs = null; // In case we need an arbitrary bitset
if( idxs != null ) { // We sorted bins; need to build a bitset
int min=Integer.MAX_VALUE;// Compute lower bound and span for bitset
int max=Integer.MIN_VALUE;
for( int i=best; i