
package hex.tree;
import sun.misc.Unsafe;
import water.*;
import water.fvec.Frame;
import water.fvec.Vec;
import water.nbhm.UtilUnsafe;
import water.util.*;
/** A Histogram, computed in parallel over a Vec.
*
* A {@code DHistogram} bins every value added to it, and computes the Vec
* min and max (for use in the next split), and the response mean and variance
* for each bin. {@code DHistogram}s are initialized with a min, max and
* number-of-elements to be added (all of which are generally available from
* a Vec). Bins run from min to max in uniform sizes. If the {@code
* DHistogram} can determine that fewer bins are needed (e.g. boolean columns
* run from 0 to 1, but only ever take on 2 values, so only 2 bins are
* needed), then fewer bins are used.
*
*
* <p>{@code DHistogram}s are shared per-node, and atomically updated. There's
* an {@code add} call to help cross-node reductions. The data is stored in
* primitive arrays, so it can be sent over the wire.
*
*
* <p>If we are successively splitting rows (e.g. in a decision tree), then a
* fresh {@code DHistogram} for each split will dynamically re-bin the data.
* Each successive split will logarithmically divide the data. At the first
* split, outliers will end up in their own bins - but perhaps some central
* bins may be very full. At the next split(s), the full bins will get split,
* and again until (with a log number of splits) each bin holds roughly the
* same amount of data. This dynamic binning resolves a lot of problems with
* picking the proper bin count or limits - generally a few more tree levels
* will equal any fancy but fixed-size binning strategy.
*
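* <p>A minimal usage sketch (hypothetical column stats; {@code init}, {@code
* incr} and {@code bin} are package-private helpers, shown here only for
* illustration):
* <pre>{@code
* DHistogram h = DHistogram.make("age", 20, 1024, (byte)1, 0f, 90f);
* h.init(); // Allocate _bins/_sums/_ssqs
* h.incr(42f, 1.0, 1.0); // Add value 42 with response 1.0 and weight 1.0
* int b = h.bin(42f); // Bin# via linear interpolation (here: 9)
* }</pre>
*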
* @author Cliff Click
*/
public final class DHistogram extends Iced {
public final transient String _name; // Column name (for debugging)
public final byte _isInt; // 0: float col, 1: int col, 2: categorical & int col
public final char _nbin; // Bin count
public final float _step; // Linear interpolation step per bin
public final float _min, _maxEx; // Conservative Min/Max over whole collection. _maxEx is Exclusive.
public double _bins[]; // Bins, shared, atomically incremented
private double _sums[], _ssqs[]; // Sums & square-sums, shared, atomically incremented
// Atomically updated float min/max
protected float _min2, _maxIn; // Min/Max, shared, atomically updated. _maxIn is Inclusive.
private static final Unsafe _unsafe = UtilUnsafe.getUnsafe();
static private final long _min2Offset;
static private final long _max2Offset;
static {
try {
_min2Offset = _unsafe.objectFieldOffset(DHistogram.class.getDeclaredField("_min2"));
_max2Offset = _unsafe.objectFieldOffset(DHistogram.class.getDeclaredField("_maxIn"));
} catch( Exception e ) {
throw H2O.fail();
}
}
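// Find Inclusive _min2: lock-free update via a CAS retry loop (mirrors setMax below)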
void setMin( float min ) {
int imin = Float.floatToRawIntBits(min);
float old = _min2;
while( min < old && !_unsafe.compareAndSwapInt(this, _min2Offset, Float.floatToRawIntBits(old), imin ) )
old = _min2;
}
// Find Inclusive _max2
void setMax( float max ) {
int imax = Float.floatToRawIntBits(max);
float old = _maxIn;
while( max > old && !_unsafe.compareAndSwapInt(this, _max2Offset, Float.floatToRawIntBits(old), imax ) )
old = _maxIn;
}
public DHistogram(String name, final int nbins, int nbins_cats, byte isInt, float min, float maxEx) {
assert nbins > 1;
assert nbins_cats > 1;
assert maxEx > min : "Caller ensures "+maxEx+">"+min+", since if max==min the column "+name+" is all constants";
_isInt = isInt;
_name = name;
_min=min;
_maxEx=maxEx; // Set Exclusive max
_min2 = Float.MAX_VALUE; // Set min/max to outer bounds
_maxIn= -Float.MAX_VALUE;
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.
int xbins = isInt == 2 ? nbins_cats : nbins;
if( isInt>0 && maxEx-min <= xbins ) {
assert ((long)min)==min; // No overflow
xbins = (char)((long)maxEx-(long)min); // Shrink bins
_step = 1.0f; // Fixed stepsize
} else {
_step = xbins/(maxEx-min); // Step size for linear interpolation, using mul instead of div
assert _step > 0 && !Float.isInfinite(_step);
}
_nbin = (char)xbins;
// Do not allocate the big arrays here; wait for scoreCols to pick which cols will be used.
}
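// Worked example (hypothetical numbers): a float column with min=0, maxEx=90
// and nbins=20 gets _step = 20/90 ~= 0.222, so value 42 lands in bin
// (int)(42*0.222) = 9. An int column with min=0, maxEx=10 and nbins=20
// shrinks to 10 unit-width bins with _step=1, one bin per integer value.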
// Interpolate col_data to find bin#
int bin( float col_data ) {
if( Float.isNaN(col_data) ) return 0; // Always NAs to bin 0
if (Float.isInfinite(col_data)) // Send infinities to the leftmost/rightmost bin
if (col_data<0) return 0;
else return _bins.length-1;
// When the model is exposed to new test data, we could have data that is
// out of range of any bin - however this binning call only happens during
// model-building.
assert _min <= col_data && col_data < _maxEx : "Coldata "+col_data+" out of range "+this;
int idx1 = (int)((col_data-_min)*_step);
assert 0 <= idx1 && idx1 <= _bins.length : idx1 + " " + _bins.length;
if( idx1 == _bins.length) idx1--; // Roundoff error allows idx1 to hit upper bound, so truncate
return idx1;
}
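// binAt(b) returns the inclusive left edge of bin b; bin b covers the
// half-open range [_min + b/_step, _min + (b+1)/_step).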
float binAt( int b ) { return _min+b/_step; }
public int nbins() { return _nbin; }
public double bins(int b) { return _bins[b]; }
// Big allocation of arrays
void init() {
assert _bins == null;
_bins = MemoryManager.malloc8d(_nbin);
init0();
}
// Add one row to a bin found via simple linear interpolation.
// Compute bin min/max.
// Compute response mean & variance.
void incr( float col_data, double y, double w ) {
assert Float.isNaN(col_data) || Float.isInfinite(col_data) || (_min <= col_data && col_data < _maxEx) : "col_data "+col_data+" out of range "+this;
int b = bin(col_data); // Compute bin# via linear interpolation
water.util.AtomicUtils.DoubleArray.add(_bins,b,w); // Bump count in bin
// Track actual lower/upper bound per-bin
if (!Float.isInfinite(col_data)) {
setMin(col_data);
setMax(col_data);
}
if( y != 0 && w != 0) incr0(b,y,w);
}
// Merge two equal histograms together. Done in a F/J reduce, so no
// synchronization needed.
void add( DHistogram dsh ) {
assert _isInt == dsh._isInt && _nbin == dsh._nbin && _step == dsh._step &&
_min == dsh._min && _maxEx == dsh._maxEx;
assert (_bins == null && dsh._bins == null) || (_bins != null && dsh._bins != null);
if( _bins == null ) return;
ArrayUtils.add(_bins,dsh._bins);
if( _min2 > dsh._min2 ) _min2 = dsh._min2 ;
if( _maxIn < dsh._maxIn ) _maxIn = dsh._maxIn;
add0(dsh);
}
// Inclusive min & max
float find_min () { return _min2 ; }
float find_maxIn() { return _maxIn; }
// Exclusive max
float find_maxEx() { return find_maxEx(_maxIn,_isInt); }
static private float find_maxEx(float maxIn, int isInt ) {
float ulp = Math.ulp(maxIn);
if( isInt > 0 && 1 > ulp ) ulp = 1;
float res = maxIn+ulp;
return Float.isInfinite(res) ? maxIn : res;
}
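// E.g. an int column with maxIn=5 yields maxEx=6 (the ulp is clamped up to 1),
// while a float column with maxIn=1.5f yields the next representable float above 1.5f.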
// The initial histogram bins are setup from the Vec rollups.
static public DHistogram[] initialHist(Frame fr, int ncols, int nbins, int nbins_cats, DHistogram hs[]) {
Vec vecs[] = fr.vecs();
for( int c=0; c<ncols; c++ ) {
Vec v = vecs[c];
final float minIn = (float)Math.max(v.min(),-Float.MAX_VALUE); // Inclusive vector min
final float maxIn = (float)Math.min(v.max(), Float.MAX_VALUE); // Inclusive vector max
final float maxEx = find_maxEx(maxIn,v.isInt()?1:0); // Smallest exclusive max
final long vlen = v.length();
hs[c] = v.naCnt()==vlen || v.min()==v.max() ? null : // Skip all-NA and constant columns
make(fr._names[c],nbins,nbins_cats,(byte)(v.isCategorical() ? 2 : (v.isInt()?1:0)),minIn,maxEx);
assert (hs[c] == null || vlen > 0);
}
return hs;
}
static public DHistogram make(String name, final int nbins, int nbins_cats, byte isInt, float min, float maxEx) {
return new DHistogram(name,nbins, nbins_cats, isInt, min, maxEx);
}
// Check for a constant response variable
private boolean isConstantResponse() {
double m = Double.NaN;
for( int b=0; b<_bins.length; b++ ) {
if( _bins[b] == 0 ) continue;
if( var(b) > 1e-6) {
Log.warn("Response should be constant, but variance of bin " + b + " (out of " + _bins.length + ") is " + var(b));
return false;
}
double mean = mean(b);
if( mean != m )
if( Double.isNaN(m) ) m=mean; // Capture mean of first non-empty bin
else if( !MathUtils.compare(m,mean,1e-5,1e-5) ) {
Log.warn("Response should be constant, but mean of first non-empty bin is " + m + ", but another bin (" + b + ") has mean(b) = " + mean);
return false;
}
}
return true;
}
// Pretty-print a histogram
@Override public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(_name).append(":").append(_min).append("-").append(_maxEx).append(" step=" + (1 / _step) + " nbins=" + nbins() + " isInt=" + _isInt);
if( _bins != null ) {
for( int b=0; b<_bins.length; b++ ) {
sb.append(String.format("\ncnt=%d, [%f - %f], mean/var=", _bins[b],_min+b/_step,_min+(b+1)/_step));
sb.append(String.format("%6.2f/%6.2f,", mean(b), var(b)));
}
sb.append('\n');
}
return sb.toString();
}
double mean(int b) {
double n = _bins[b];
return n>0 ? _sums[b]/n : 0;
}
/**
* compute the sample variance within a given bin
* @param b bin id
* @return sample variance (>= 0)
*/
double var (int b) {
double n = _bins[b];
if( n<=1 ) return 0;
return Math.max(0, (_ssqs[b] - _sums[b]*_sums[b]/n)/(n-1)); //not strictly consistent with what is done elsewhere (use n instead of n-1 to get there)
}
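// The identity used above: sum((x-mean)^2) == ssqs - sums^2/n. E.g. a bin
// holding unit-weight values {1,2,3} has sums=6, ssqs=14, so
// var = (14 - 36/3)/(3-1) = 1, the sample variance of {1,2,3}.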
// Big allocation of arrays
void init0() {
_sums = MemoryManager.malloc8d(_nbin);
_ssqs = MemoryManager.malloc8d(_nbin);
}
// Add one row to a bin found via simple linear interpolation.
// Compute response mean & variance.
// Updated racily across F/J map calls, so use atomic adds
void incr0( int b, double y, double w ) {
AtomicUtils.DoubleArray.add(_sums,b,(float)(w*y)); //See 'HistogramTest' JUnit for float-casting rationalization
AtomicUtils.DoubleArray.add(_ssqs,b,(float)(w*y*y));
}
// Same, except square done by caller
void incr1( int b, double y, double yy) {
AtomicUtils.DoubleArray.add(_sums,b,(float)y); //See 'HistogramTest' JUnit for float-casting rationalization
AtomicUtils.DoubleArray.add(_ssqs,b,(float)yy);
}
// Merge two equal histograms together.
// Done in a F/J reduce, so no synchronization needed.
void add0( DHistogram dsh ) {
ArrayUtils.add(_sums, dsh._sums);
ArrayUtils.add(_ssqs,dsh._ssqs);
}
// Compute a "score" for a column; lower score "wins" (is a better split).
// Score is the sum of the MSEs when the data is split at a single point.
// mses[1] == MSE for splitting between bins 0 and 1.
// mses[n] == MSE for splitting between bins n-1 and n.
public DTree.Split scoreMSE( int col, double min_rows ) {
final int nbins = nbins();
assert nbins > 1;
// Histogram arrays used for splitting, these are either the original bins
// (for an ordered predictor), or sorted by the mean response (for an
// unordered predictor, i.e. categorical predictor).
double[] sums = _sums;
double[] ssqs = _ssqs;
double[] bins = _bins;
int idxs[] = null; // and a reverse index mapping
// For categorical (unordered) predictors, sort the bins by average
// prediction then look for an optimal split. Currently limited to categoricals
// where we have one category per bin. No point for 3 or fewer bins as all possible
// combinations (just 3) are tested without needing to sort.
if( _isInt == 2 && _step == 1.0f && nbins >= 4 ) {
// Sort the index by average response
idxs = MemoryManager.malloc4(nbins+1); // Reverse index
for( int i=0; i<nbins+1; i++ ) idxs[i] = i;
final double[] avgs = MemoryManager.malloc8d(nbins+1);
for( int i=0; i<nbins; i++ ) avgs[i] = _bins[i]==0 ? 0 : _sums[i]/_bins[i]; // Average response per bin
avgs[nbins] = Double.MAX_VALUE;
ArrayUtils.sort(idxs, avgs);
// Fill with sorted data. Makes a copy, so the original data remains in its original order.
sums = MemoryManager.malloc8d(nbins);
ssqs = MemoryManager.malloc8d(nbins);
bins = MemoryManager.malloc8d(nbins);
for( int i=0; i<nbins; i++ ) {
sums[i] = _sums[idxs[i]];
ssqs[i] = _ssqs[idxs[i]];
bins[i] = _bins[idxs[i]];
}
}
// Compute mean/var for cumulative bins from 0 to nbins inclusive.
double sums0[] = MemoryManager.malloc8d(nbins+1);
double ssqs0[] = MemoryManager.malloc8d(nbins+1);
double ns0 [] = MemoryManager.malloc8d(nbins+1);
for( int b=1; b<=nbins; b++ ) {
double m0 = sums0[b-1], m1 = sums[b-1];
double s0 = ssqs0[b-1], s1 = ssqs[b-1];
double k0 = ns0 [b-1], k1 = bins[b-1];
if( k0==0 && k1==0 )
continue;
sums0[b] = m0+m1;
ssqs0[b] = s0+s1;
ns0 [b] = k0+k1;
}
double tot = ns0[nbins];
// Zero variance means a constant response in this column. Normally that is
// cut out before we even try to split, but we might have NAs in THIS column...
if( ssqs0[nbins]*tot - sums0[nbins]*sums0[nbins] == 0 ) {
assert isConstantResponse();
return null;
}
// Compute mean/var for cumulative bins from nbins to 0 inclusive.
double sums1[] = MemoryManager.malloc8d(nbins+1);
double ssqs1[] = MemoryManager.malloc8d(nbins+1);
double ns1 [] = MemoryManager.malloc8d(nbins+1);
for( int b=nbins-1; b>=0; b-- ) {
double m0 = sums1[b+1], m1 = sums[b];
double s0 = ssqs1[b+1], s1 = ssqs[b];
double k0 = ns1 [b+1], k1 = bins[b];
if( k0==0 && k1==0 )
continue;
sums1[b] = m0+m1;
ssqs1[b] = s0+s1;
ns1 [b] = k0+k1;
assert MathUtils.compare(ns0[b]+ns1[b],tot,1e-5,1e-5);
}
// Now roll the split-point across the bins. There are 2 ways to do this:
// split left/right based on being less than some value, or being equal/
// not-equal to some value. Equal/not-equal makes sense for categoricals
// but both splits could work for any integral datatype. Do the less-than
// splits first.
int best=0; // The no-split
double best_se0=Double.MAX_VALUE; // Best squared error
double best_se1=Double.MAX_VALUE; // Best squared error
byte equal=0; // Ranged check
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] == 0 ) continue; // Ignore empty splits
if( ns0[b] < min_rows ) continue;
if( ns1[b] < min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always
// We're making an unbiased estimator, so that MSE==Var.
// Then Squared Error = MSE*N = Var*N
// = (ssqs/N - mean^2)*N
// = ssqs - N*mean^2
// = ssqs - N*(sum/N)(sum/N)
// = ssqs - sum^2/N
double se0 = ssqs0[b] - sums0[b]*sums0[b]/ns0[b];
double se1 = ssqs1[b] - sums1[b]*sums1[b]/ns1[b];
if( se0 < 0 ) se0 = 0; // Roundoff error; sometimes goes negative
if( se1 < 0 ) se1 = 0; // Roundoff error; sometimes goes negative
if( (se0+se1 < best_se0+best_se1) || // Strictly less error?
// Or tied MSE, then pick split towards middle bins
(se0+se1 == best_se0+best_se1 &&
Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) {
best_se0 = se0; best_se1 = se1;
best = b;
}
}
// If the bin covers a single value, we can also try an equality-based split
if( _isInt > 0 && _step == 1.0f && // For any integral (not float) column
_maxEx-_min > 2 && idxs==null ) { // Also need more than 2 (boolean) choices to actually try a new split pattern
for( int b=1; b<=nbins-1; b++ ) {
if( bins[b] < min_rows ) continue; // Ignore too small splits
double N = ns0[b] + ns1[b+1];
if( N < min_rows )
continue; // Ignore too small splits
double sums2 = sums0[b ]+sums1[b+1];
double ssqs2 = ssqs0[b ]+ssqs1[b+1];
double si = ssqs2 -sums2 *sums2 / N ; // Left+right, excluding 'b'
double sx = ssqs [b] -sums[b]*sums[b]/bins[b]; // Just 'b'
if( si < 0 ) si = 0; // Roundoff error; sometimes goes negative
if( sx < 0 ) sx = 0; // Roundoff error; sometimes goes negative
if( si+sx < best_se0+best_se1 ) { // Strictly less error?
best_se0 = si; best_se1 = sx;
best = b; equal = 1; // Equality check
}
}
}
// For categorical (unordered) predictors, we sorted the bins by average
// prediction then found the optimal split on sorted bins
IcedBitSet bs = null; // In case we need an arbitrary bitset
if( idxs != null ) { // We sorted bins; need to build a bitset
int min=Integer.MAX_VALUE;// Compute lower bound and span for bitset
int max=Integer.MIN_VALUE;
for( int i=best; i<nbins; i++ ) {
min = Math.min(min,idxs[i]);
max = Math.max(max,idxs[i]);
}
bs = new IcedBitSet(max-min+1,min); // Just enough span to cover the interesting bits
for( int i=best; i<nbins; i++ ) bs.set(idxs[i]); // Mark the (sorted) bins that split to the right
}