All downloads are free. The search and download functionalities use the official Maven repository.

hex.tree.DBinomHistogram Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.tree;

import water.MemoryManager;
import water.util.ArrayUtils;
import water.util.IcedBitSet;

/**
   A Histogram, computed in parallel over a Vec.
   

Sums (and sums-of-squares) of binomial responses, each 0 or 1; because the values are 0/1, the sums-of-squares equal the sums.

@author Cliff Click */ public class DBinomHistogram extends DHistogram { public int _sums[]; // Sums (& square-sums since only 0 & 1 allowed), shared, atomically incremented public DBinomHistogram(String name, final int nbins, int nbins_cats, byte isInt, float min, float maxEx, long nelems) { super(name,nbins, nbins_cats, isInt, min, maxEx, nelems); } @Override boolean isBinom() { return true; } @Override public double mean(int b) { int n = _bins[b]; return n>0 ? (double)_sums[b]/n : 0; } @Override public double var (int b) { int n = _bins[b]; if( n<=1 ) return 0; return (_sums[b] - (double)_sums[b]*_sums[b]/n)/(n-1); } // Big allocation of arrays @Override void init0() { _sums = MemoryManager.malloc4(_nbin); } // Add one row to a bin found via simple linear interpolation. // Compute bin min/max. // Compute response mean & variance. @Override void incr0( int b, double y ) { water.util.AtomicUtils.IntArray.incr(_sums,b); } // Merge two equal histograms together. Done in a F/J reduce, so no // synchronization needed. @Override void add0( DBinomHistogram dsh ) { water.util.ArrayUtils.add(_sums,dsh._sums); } // Compute a "score" for a column; lower score "wins" (is a better split). // Score is the sum of the MSEs when the data is split at a single point. // mses[1] == MSE for splitting between bins 0 and 1. // mses[n] == MSE for splitting between bins n-1 and n. @Override public DTree.Split scoreMSE( int col, int min_rows ) { final int nbins = nbins(); assert nbins > 1; // Histogram arrays used for splitting, these are either the original bins // (for an ordered predictor), or sorted by the mean response (for an // unordered predictor, i.e. categorical predictor). int[] sums = _sums; int [] bins = _bins; int idxs[] = null; // and a reverse index mapping // For categorical (unordered) predictors, sort the bins by average // prediction then look for an optimal split. Currently limited to enums // where we're one-per-bin. 
No point for 3 or fewer bins as all possible // combinations (just 3) are tested without needing to sort. if( _isInt == 2 && _step == 1.0f && nbins >= 4 ) { // Sort the index by average response idxs = MemoryManager.malloc4(nbins+1); // Reverse index for( int i=0; i=0; b-- ) { double m0 = sums1[b+1], m1 = sums[b]; long k0 = ns1 [b+1], k1 = bins[b]; if( k0==0 && k1==0 ) continue; sums1[b] = m0+m1; ns1 [b] = k0+k1; assert ns0[b]+ns1[b]==tot; } // Now roll the split-point across the bins. There are 2 ways to do this: // split left/right based on being less than some value, or being equal/ // not-equal to some value. Equal/not-equal makes sense for categoricals // but both splits could work for any integral datatype. Do the less-than // splits first. int best=0; // The no-split double best_se0=Double.MAX_VALUE; // Best squared error double best_se1=Double.MAX_VALUE; // Best squared error byte equal=0; // Ranged check for( int b=1; b<=nbins-1; b++ ) { if( bins[b] == 0 ) continue; // Ignore empty splits if( ns0[b] < min_rows ) continue; if( ns1[b] < min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always // We're making an unbiased estimator, so that MSE==Var. // Then Squared Error = MSE*N = Var*N // = (ssqs/N - mean^2)*N // = ssqs - N*mean^2 // = ssqs - N*(sum/N)(sum/N) // = ssqs - sum^2/N double se0 = sums0[b]*(1. - sums0[b]/ns0[b]); double se1 = sums1[b]*(1. - sums1[b]/ns1[b]); if( (se0+se1 < best_se0+best_se1) || // Strictly less error? 
// Or tied MSE, then pick split towards middle bins (se0+se1 == best_se0+best_se1 && Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) { best_se0 = se0; best_se1 = se1; best = b; } } // If the bin covers a single value, we can also try an equality-based split if( _isInt > 0 && _step == 1.0f && // For any integral (not float) column _maxEx-_min > 2 && idxs==null ) { // Also need more than 2 (boolean) choices to actually try a new split pattern for( int b=1; b<=nbins-1; b++ ) { if( bins[b] < min_rows ) continue; // Ignore too small splits long N = ns0[b ] + ns1[b+1]; if( N < min_rows ) continue; // Ignore too small splits double sums2 = sums0[b ]+sums1[b+1]; double si = sums2*(1.- sums2/N) ; // Left+right, excluding 'b' double sx = sums [b] -sums[b]*sums[b]/bins[b]; // Just 'b' if( si+sx < best_se0+best_se1 ) { // Strictly less error? best_se0 = si; best_se1 = sx; best = b; equal = 1; // Equality check } } } // For categorical (unordered) predictors, we sorted the bins by average // prediction then found the optimal split on sorted bins IcedBitSet bs = null; // In case we need an arbitrary bitset if( idxs != null ) { // We sorted bins; need to build a bitset int min=Integer.MAX_VALUE;// Compute lower bound and span for bitset int max=Integer.MIN_VALUE; for( int i=best; i





© 2015 - 2025 Weber Informatics LLC | Privacy Policy