
// hex.tree.DRealHistogram
package hex.tree;
import water.MemoryManager;
import water.util.ArrayUtils;
import water.util.AtomicUtils;
import water.util.IcedBitSet;
import water.util.MathUtils;
/** A Histogram, computed in parallel over a Vec.
*
* Sums and sums-of-squares of floats
*
* @author Cliff Click
*/
public class DRealHistogram extends DHistogram {
private double _sums[], _ssqs[]; // Sums & square-sums, shared, atomically incremented
/** Build a real-valued (regression) histogram over one column.
 *  @param name       column name (for display/debugging)
 *  @param nbins      number of bins for numeric columns
 *  @param nbins_cats number of bins for categorical columns
 *  @param isInt      column integer-ness flag (see DHistogram)
 *  @param min        inclusive lower bound of the observed range
 *  @param maxEx      exclusive upper bound of the observed range
 *  @param nelems     number of elements in the column
 */
public DRealHistogram(String name, final int nbins, int nbins_cats, byte isInt, float min, float maxEx, long nelems) {
  super(name, nbins, nbins_cats, isInt, min, maxEx, nelems);
}
@Override boolean isBinom() { return false; }
/** Mean response of bin {@code b}; 0 for an empty bin. */
@Override public double mean(int b) {
  final int cnt = _bins[b];
  if (cnt <= 0) return 0;       // empty bin: define the mean as 0
  return _sums[b] / cnt;
}
/** Unbiased sample variance of bin {@code b}; 0 when fewer than 2 rows. */
@Override public double var (int b) {
  final int cnt = _bins[b];
  if (cnt <= 1) return 0;       // variance undefined for 0 or 1 rows
  final double sum = _sums[b];
  // Classic one-pass formula: (sum(y^2) - (sum y)^2/n) / (n-1)
  return (_ssqs[b] - sum * sum / cnt) / (cnt - 1);
}
// Big allocation of arrays
// Lazily allocate the big per-bin accumulator arrays (one double per bin).
@Override void init0() {
  _ssqs = MemoryManager.malloc8d(_nbin); // sums of squared responses
  _sums = MemoryManager.malloc8d(_nbin); // sums of responses
}
// Add one row to a bin found via simple linear interpolation.
// Compute response mean & variance.
// Done racily instead of synchronized F/J map calls, so atomic
// Add one row's response to bin 'b', accumulating sum and sum-of-squares.
// Called racily from parallel F/J map calls, hence the atomic adds.
// Delegates to incr1 (which takes a caller-computed square) so the two
// accumulation paths stay consistent.
@Override void incr0( int b, double y ) {
  incr1(b, y, y*y);
}
// Same, except square done by caller
// Like incr0, but the caller supplies the already-squared response 'yy'.
// The two atomic adds are independent, so ordering between them is free.
void incr1( int b, double y, double yy ) {
  AtomicUtils.DoubleArray.add(_ssqs, b, yy); // accumulate sum of squares
  AtomicUtils.DoubleArray.add(_sums, b, y);  // accumulate sum
}
// Merge two equal histograms together.
// Done in a F/J reduce, so no synchronization needed.
// Fold another histogram's accumulators into this one, element-wise.
// Runs inside an F/J reduce, so the arrays are not shared and no
// synchronization is required.
@Override void add0( DRealHistogram dsh ) {
  ArrayUtils.add(_ssqs, dsh._ssqs);
  ArrayUtils.add(_sums, dsh._sums);
}
// Compute a "score" for a column; lower score "wins" (is a better split).
// Score is the sum of the MSEs when the data is split at a single point.
// mses[1] == MSE for splitting between bins 0 and 1.
// mses[n] == MSE for splitting between bins n-1 and n.
@Override public DTree.Split scoreMSE( int col, int min_rows ) {
final int nbins = nbins();
assert nbins > 1;
// Histogram arrays used for splitting, these are either the original bins
// (for an ordered predictor), or sorted by the mean response (for an
// unordered predictor, i.e. categorical predictor).
double[] sums = _sums;
double[] ssqs = _ssqs;
int [] bins = _bins;
int idxs[] = null; // and a reverse index mapping
// For categorical (unordered) predictors, sort the bins by average
// prediction then look for an optimal split. Currently limited to enums
// where we're one-per-bin. No point for 3 or fewer bins as all possible
// combinations (just 3) are tested without needing to sort.
if( _isInt == 2 && _step == 1.0f && nbins >= 4 ) {
// Sort the index by average response
idxs = MemoryManager.malloc4(nbins+1); // Reverse index
for( int i=0; i=0; b-- ) {
double m0 = sums1[b+1], m1 = sums[b];
double s0 = ssqs1[b+1], s1 = ssqs[b];
long k0 = ns1 [b+1], k1 = bins[b];
if( k0==0 && k1==0 ) continue;
sums1[b] = m0+m1;
ssqs1[b] = s0+s1;
ns1 [b] = k0+k1;
assert ns0[b]+ns1[b]==tot;
}
// Now roll the split-point across the bins. There are 2 ways to do this:
// split left/right based on being less than some value, or being equal/
// not-equal to some value. Equal/not-equal makes sense for categoricals
// but both splits could work for any integral datatype. Do the less-than
// splits first.
int best=0; // The no-split
double best_se0=Double.MAX_VALUE; // Best squared error
double best_se1=Double.MAX_VALUE; // Best squared error
byte equal=0; // Ranged check
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] == 0 ) continue; // Ignore empty splits
if( ns0[b] < min_rows ) continue;
if( ns1[b] < min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always
// We're making an unbiased estimator, so that MSE==Var.
// Then Squared Error = MSE*N = Var*N
// = (ssqs/N - mean^2)*N
// = ssqs - N*mean^2
// = ssqs - N*(sum/N)(sum/N)
// = ssqs - sum^2/N
double se0 = ssqs0[b] - sums0[b]*sums0[b]/ns0[b];
double se1 = ssqs1[b] - sums1[b]*sums1[b]/ns1[b];
if( se0 < 0 ) se0 = 0; // Roundoff error; sometimes goes negative
if( se1 < 0 ) se1 = 0; // Roundoff error; sometimes goes negative
if( (se0+se1 < best_se0+best_se1) || // Strictly less error?
// Or tied MSE, then pick split towards middle bins
(se0+se1 == best_se0+best_se1 &&
Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) {
best_se0 = se0; best_se1 = se1;
best = b;
}
}
// If the bin covers a single value, we can also try an equality-based split
if( _isInt > 0 && _step == 1.0f && // For any integral (not float) column
_maxEx-_min > 2 && idxs==null ) { // Also need more than 2 (boolean) choices to actually try a new split pattern
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] < min_rows ) continue; // Ignore too small splits
long N = ns0[b ] + ns1[b+1];
if( N < min_rows ) continue; // Ignore too small splits
double sums2 = sums0[b ]+sums1[b+1];
double ssqs2 = ssqs0[b ]+ssqs1[b+1];
double si = ssqs2 -sums2 *sums2 / N ; // Left+right, excluding 'b'
double sx = ssqs [b] -sums[b]*sums[b]/bins[b]; // Just 'b'
if( si < 0 ) si = 0; // Roundoff error; sometimes goes negative
if( sx < 0 ) sx = 0; // Roundoff error; sometimes goes negative
if( si+sx < best_se0+best_se1 ) { // Strictly less error?
best_se0 = si; best_se1 = sx;
best = b; equal = 1; // Equality check
}
}
}
// For categorical (unordered) predictors, we sorted the bins by average
// prediction then found the optimal split on sorted bins
IcedBitSet bs = null; // In case we need an arbitrary bitset
if( idxs != null ) { // We sorted bins; need to build a bitset
int min=Integer.MAX_VALUE;// Compute lower bound and span for bitset
int max=Integer.MIN_VALUE;
for( int i=best; i