
package water.fvec;
import water.Futures;
import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import water.*;
import water.H2O.H2OCallback;
import water.H2O.H2OCountedCompleter;
import water.parser.Categorical;
import water.parser.ValueString;
import water.util.ArrayUtils;
import java.util.Arrays;
/** A class to compute the rollup stats. These are computed lazily, thrown
* away if the Vec is written into, and then recomputed lazily. Error to ask
* for them if the Vec is actively being written into. It is common for all
* cores to ask for the same Vec rollup at once, so it is crucial that it be
* computed once across the cluster.
*
* Rollups are kept in the K/V store, which also controls who manages the
* rollup work and final results. Winner of a DKV CAS/PutIfMatch race gets to
* manage the M/R job computing the rollups. Losers block for the same
* rollup. Remote requests *always* forward to the Rollup Key's master.
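*
* <p>Illustrative caller sketch (hypothetical usage, not part of this class):
* <pre>{@code
* RollupStats rs = RollupStats.get(vec);       // blocks until basic rollups are ready
* double mean = rs._mean, sd = rs._sigma;      // first-pass moments
* RollupStats rh = RollupStats.get(vec, true); // also forces the histogram/percentile pass
* long[] bins = rh._bins; double[] pct = rh._pctiles;
* }</pre>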
*/
class RollupStats extends Iced {
/** The count of missing elements.... or -2 if we have active writers and no
* rollup info can be computed (because the vector is being rapidly
* modified!), or -1 if rollups have not been computed since the last
* modification. */
volatile transient ForkJoinTask _tsk;
volatile long _naCnt;
// Computed in 1st pass
double _mean, _sigma;
long _checksum;
long _rows, _nzCnt, _size, _pinfs, _ninfs;
boolean _isInt=true;
double[] _mins, _maxs;
// Expensive histogram & percentiles
// Computed in a 2nd pass, on-demand, by calling computeHisto
private static final int MAX_SIZE = 1000; // Standard bin count; enums can have more bins
// the choice of MAX_SIZE being a power of 10 (rather than 1024) just aligns-to-the-grid of the common input of fixed decimal
// precision numbers. It is still an estimate and makes no difference mathematically. It just gives tidier output in some
// simple cases without penalty.
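// Example (illustrative): a column of prices spanning 0.00 to 10.00 gets stride
// 10.0/1000 = 0.01, so bin edges land exactly on cents; 1024 bins would instead
// put edges at multiples of 0.009765625.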
volatile long[] _bins;
// Approximate data value closest to the Xth percentile
double[] _pctiles;
public boolean hasHisto(){return _bins != null;}
// Check for: Vector is mutating and rollups cannot be asked for
boolean isMutating() { return _naCnt==-2; }
// Check for: Rollups currently being computed
private boolean isComputing() { return _naCnt==-1; }
// Check for: Rollups available
private boolean isReady() { return _naCnt>=0; }
private RollupStats(int mode) {
_mins = new double[5]; Arrays.fill(_mins, Double.NaN);
_maxs = new double[5]; Arrays.fill(_maxs, Double.NaN);
_pctiles = new double[Vec.PERCENTILES.length]; Arrays.fill(_pctiles, Double.NaN);
_mean = _sigma = Double.NaN;
_size = 0;
_naCnt = mode;
}
private static RollupStats makeComputing() { return new RollupStats(-1); }
static RollupStats makeMutating () { return new RollupStats(-2); }
private RollupStats map( Chunk c ) {
_size = c.byteSize();
Arrays.fill(_mins, Double.MAX_VALUE);
Arrays.fill(_maxs,-Double.MAX_VALUE);
boolean isUUID = c._vec.isUUID();
boolean isString = c._vec.isString();
ValueString vs = new ValueString();
if (isString) _isInt = false;
// Checksum support
long checksum = 0;
long start = c._start;
long l = 81985529216486895L;
// Check for popular easy cases: All Constant
double min=c.min(), max=c.max();
if( min==max ) { // All constant, and not NaN
double d = min; // It's the min, it's the max, it's the alpha and omega
_checksum = (c.hasFloat()?Double.doubleToRawLongBits(d):(long)d)*c._len;
Arrays.fill(_mins,d);
Arrays.fill(_maxs,d);
if( d == Double.POSITIVE_INFINITY) _pinfs++;
else if( d == Double.NEGATIVE_INFINITY) _ninfs++;
else {
if( d != 0 ) _nzCnt=c._len;
_mean = d;
_rows=c._len;
}
_isInt = ((long)d) == d;
_sigma = 0; // No variance for constants
return this;
}
// Check for popular easy cases: Boolean, possibly sparse, possibly NaN
if( min==0 && max==1 ) {
int zs = c._len-c.sparseLen(); // Easy zeros
int nans = 0;
// Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) )
if( c.isNA(i) ) nans++;
else if( c.at8(i)==0 ) zs++;
int os = c._len-zs-nans; // Ones
_nzCnt = os;
_naCnt = nans;
for( int i=0; i<Math.min(_mins.length,zs); i++ ) { min(0); max(0); }
for( int i=0; i<Math.min(_mins.length,os); i++ ) { min(1); max(1); }
_rows = zs+os;
_mean = (double)os/_rows;
_sigma = zs*(0.0-_mean)*(0.0-_mean) + os*(1.0-_mean)*(1.0-_mean);
return this;
}
// Walk the non-zeros
if( isUUID ) { // UUID columns do not compute min/max/mean/sigma
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
if( c.isNA(i) ) _naCnt++;
else {
long lo = c.at16l(i), hi = c.at16h(i);
if( lo != 0 || hi != 0 ) _nzCnt++;
l = lo ^ 37*hi;
}
if( l != 0 ) // ignore 0s in checksum to be consistent with sparse chunks
checksum ^= (17 * (start+i)) ^ 23*l;
}
} else if( isString ) { // String columns do not compute min/max/mean/sigma
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
if( c.isNA(i) ) _naCnt++;
else {
_nzCnt++;
l = c.atStr(vs,i).hashCode();
}
if( l != 0 ) // ignore 0s in checksum to be consistent with sparse chunks
checksum ^= (17 * (start+i)) ^ 23*l;
}
} else { // Numeric columns
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
double d = c.atd(i);
if( Double.isNaN(d) ) _naCnt++;
else { // All other columns have useful rollups
l = c.hasFloat() ? Double.doubleToRawLongBits(d) : c.at8(i);
if( d == Double.POSITIVE_INFINITY ) _pinfs++;
else if( d == Double.NEGATIVE_INFINITY ) _ninfs++;
else {
if( d != 0 ) _nzCnt++;
min(d); max(d);
_mean += d;
_rows++;
if( _isInt && ((long)d) != d ) _isInt = false;
}
}
if( l != 0 ) // ignore 0s in checksum to be consistent with sparse chunks
checksum ^= (17 * (start+i)) ^ 23*l;
}
}
_checksum = checksum;
// Sparse? We skipped all the zeros; do them now
if( c.isSparse() ) {
int zeros = c._len - c.sparseLen();
for( int i=0; i<Math.min(_mins.length,zeros); i++ ) { min(0); max(0); }
_rows += zeros;
}
// UUID and String columns do not compute min/max/mean/sigma
if( isUUID || isString ) {
Arrays.fill(_mins,Double.NaN);
Arrays.fill(_maxs,Double.NaN);
_mean = _sigma = Double.NaN;
} else if( !Double.isNaN(_mean) && _rows > 0 ) {
final double mean = _mean = _mean / _rows;
// Handle all zero rows
int zeros = c._len - c.sparseLen();
double sigma = mean*mean*zeros;
// Handle all non-zero rows
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
double d = c.atd(i);
if( !Double.isNaN(d) ) {
d -= mean;
sigma += d*d;
}
}
_sigma = sigma;
}
return this;
}
private void reduce( RollupStats rs ) {
for( double d : rs._mins ) min(d);
for( double d : rs._maxs ) max(d);
_naCnt += rs._naCnt;
_nzCnt += rs._nzCnt;
_pinfs += rs._pinfs;
_ninfs += rs._ninfs;
double delta = _mean - rs._mean;
if (_rows == 0) { _mean = rs._mean; _sigma = rs._sigma; }
else if(rs._rows != 0){
_mean = (_mean * _rows + rs._mean * rs._rows) / (_rows + rs._rows);
_sigma = _sigma + rs._sigma + delta*delta * _rows*rs._rows / (_rows+rs._rows);
}
_rows += rs._rows;
_size += rs._size;
_isInt &= rs._isInt;
_checksum ^= rs._checksum;
}
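// Worked check of the merge above (illustrative numbers, not from the source).
// This is the standard pairwise (Chan et al.) update, where _sigma holds the running
// sum of squared deviations (M2) until postGlobal() divides by (_rows-1) and sqrts:
// left {0,2}: n=2, mean=1, M2=2; right {2,4}: n=2, mean=3, M2=2
// delta = 1-3 = -2
// merged mean = (1*2 + 3*2)/4 = 2
// merged M2 = 2 + 2 + delta*delta*2*2/(2+2) = 8 == sum((x-2)^2) over {0,2,2,4}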
private void min( double d ) {
if( d >= _mins[_mins.length-1] ) return;
for( int i=0; i<_mins.length; i++ )
if( d < _mins[i] )
{ double tmp = _mins[i]; _mins[i] = d; d = tmp; }
}
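// Example of the insertion sort above (illustrative): with _mins={1,3,5,7,9},
// min(4) shifts the tail down to give {1,3,4,5,7}; the old 5th-smallest falls off.
// max() below is the mirror image, keeping the 5 largest values.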
private void max( double d ) {
if( d <= _maxs[_maxs.length-1] ) return;
for( int i=0; i<_maxs.length; i++ )
if( d > _maxs[i] )
{ double tmp = _maxs[i]; _maxs[i] = d; d = tmp; }
}
private static class Roll extends MRTask<Roll> {
final Key _rskey;
RollupStats _rs;
Roll( H2OCountedCompleter cmp, Key rskey ) { super(cmp); _rskey=rskey; }
@Override public void map( Chunk c ) { _rs = new RollupStats(0).map(c); }
@Override public void reduce( Roll roll ) { _rs.reduce(roll._rs); }
@Override public void postGlobal() {
if( _rs == null )
_rs = new RollupStats(0);
else {
_rs._sigma = Math.sqrt(_rs._sigma/(_rs._rows-1));
if (_rs._rows < 5) for (int i=0; i<5-_rs._rows; i++) { // Fix PUBDEV-150 for files under 5 rows
_rs._maxs[4-i] = Double.NaN;
_rs._mins[4-i] = Double.NaN;
}
}
}
// Just toooo common to report always. Drowning in multi-megabyte log file writes.
@Override public boolean logVerbose() { return false; }
}
static void start(final Vec vec, Futures fs, boolean computeHisto) {
final Key rskey = vec.rollupStatsKey();
RollupStats rs = getOrNull(vec);
if(rs == null || (computeHisto && !rs.hasHisto()))
fs.add(new RPC(rskey.home_node(),new ComputeRollupsTask(vec,computeHisto)).addCompleter(new H2OCallback() {
@Override
public void callback(H2OCountedCompleter h2OCountedCompleter) {
DKV.get(rskey); // fetch new results via DKV to enable caching of the results.
}
}).call());
}
static RollupStats get(Vec vec, boolean computeHisto) {
if( DKV.get(vec._key)== null ) throw new RuntimeException("Rollups not possible, because Vec was deleted: "+vec._key);
final Key rskey = vec.rollupStatsKey();
RollupStats rs = DKV.getGet(rskey);
while(rs == null || (!rs.isReady() || (computeHisto && !rs.hasHisto()))){
if(rs != null && rs.isMutating())
throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (1)");
// 1. compute
try {
RPC.call(rskey.home_node(),new ComputeRollupsTask(vec, computeHisto)).get();
} catch( Throwable t ) {
System.err.println("Remote rollups failed with an exception, wrapping and rethrowing: "+t);
throw new RuntimeException(t);
}
// 2. fetch - done in two steps to go through standard DKV.get and enable local caching
rs = DKV.getGet(rskey);
}
return rs;
}
// Allow a bunch of rollups to run in parallel. If a Futures is passed in (see
// start above), the rollup runs in the background and the call does not block.
static RollupStats get(Vec vec) { return get(vec,false);}
// Fetch if present, but do not compute
static RollupStats getOrNull(Vec vec) {
final Key rskey = vec.rollupStatsKey();
Value val = DKV.get(rskey);
if( val == null ) // No rollup stats present?
return vec.length() > 0 ? /*not computed*/null : /*empty vec*/new RollupStats(0);
RollupStats rs = val.get(RollupStats.class);
return rs.isReady() ? rs : null;
}
// Histogram base & stride
double h_base() { return _mins[0]; }
double h_stride() { return h_stride(_bins.length); }
private double h_stride(int nbins) { return (_maxs[0]-_mins[0]+(_isInt?1:0))/nbins; }
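// Example (illustrative): an integer column spanning 0..9 binned with nbins=10 gets
// stride (9-0+1)/10 = 1.0, so every int owns exactly one bin and the max value 9
// lands in the last bin.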
// Compute expensive histogram
private static class Histo extends MRTask<Histo> {
final double _base, _stride; // Inputs
final int _nbins; // Inputs
long[] _bins; // Outputs
Histo( H2OCountedCompleter cmp, RollupStats rs, int nbins ) { super(cmp);_base = rs.h_base(); _stride = rs.h_stride(nbins); _nbins = nbins; }
@Override public void map( Chunk c ) {
_bins = new long[_nbins];
for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
double d = c.atd(i);
if( !Double.isNaN(d) ) _bins[idx(d)]++;
}
// Sparse? We skipped all the zeros; do them now
if( c.isSparse() )
_bins[idx(0.0)] += (c._len - c.sparseLen());
}
private int idx( double d ) { int idx = (int)((d-_base)/_stride); return Math.min(idx,_bins.length-1); }
@Override public void reduce( Histo h ) { ArrayUtils.add(_bins,h._bins); }
// Just toooo common to report always. Drowning in multi-megabyte log file writes.
@Override public boolean logVerbose() { return false; }
}
// Task to compute rollups on the rollup key's home node if needed.
// Only computes the rollups, does not fetch them; the caller should fetch them via the DKV store (to preserve caching).
// Only computes the rollups if needed (i.e. they are null, or lack a histogram when one is required).
// If rollup computation is already in progress, it will wait for that to finish.
// Throws IAE if the Vec is being modified (or removed) while this task is in progress.
static final class ComputeRollupsTask extends DTask<ComputeRollupsTask> {
final byte _priority;
final Key _vecKey;
final Key _rsKey;
final boolean _computeHisto;
public ComputeRollupsTask(Vec v, boolean computeHisto){
_priority = nextThrPriority();
_vecKey = v._key;
_rsKey = v.rollupStatsKey();
_computeHisto = computeHisto;
}
@Override public byte priority(){return _priority; }
private Value makeComputing(){
RollupStats newRs = RollupStats.makeComputing();
CountedCompleter cc = getCompleter(); // should be null or RPCCall
if(cc != null) assert cc.getCompleter() == null;
newRs._tsk = cc == null?this:cc;
return new Value(_rsKey,newRs);
}
private void installResponse(Value nnn, RollupStats rs) {
Futures fs = new Futures();
Value old = DKV.DputIfMatch(_rsKey,new Value(_rsKey,rs),nnn,fs);
assert rs.isReady();
if(old != nnn)
throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (2)");
fs.blockForPending();
}
@Override
protected void compute2() {
assert _rsKey.home();
final Vec vec = DKV.getGet(_vecKey);
while(true) {
Value v = DKV.get(_rsKey);
RollupStats rs = (v == null) ? null : v.get();
// Fetched current rs from the DKV, rs can be:
// a) computed
// a.1) has histo or histo not required => do nothing
// a.2) no histo and histo is required => only compute histo
// b) computing => wait for the task computing it to finish and check again
// c) mutating => throw IAE
// d) null => compute new rollups
if (rs != null) {
if (rs.isReady()) {
if (_computeHisto && !rs.hasHisto()) { // a.2 => compute rollups
CountedCompleter cc = getCompleter(); // should be null or RPCCall
if(cc != null) assert cc.getCompleter() == null;
// note: if cc == null, tasks waiting on this may be woken up before onExceptionalCompletion runs if an exception is thrown.
Value nnn = makeComputing();
Futures fs = new Futures();
Value oldv = DKV.DputIfMatch(_rsKey,nnn,v,fs);
fs.blockForPending();
if(oldv == v){ // got the lock
computeHisto(rs, vec, nnn);
break;
} // else someone else is modifying the rollups => try again
} else
break; // a.1 => do nothing
} else if (rs.isComputing()) { // b) => wait for current computation to finish
rs._tsk.join();
} else if(rs.isMutating()) // c) => throw IAE
throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (3)");
} else { // d) => compute the rollups
final Value nnn = makeComputing();
Futures fs = new Futures();
Value oldv = DKV.DputIfMatch(_rsKey,nnn,v,fs);
fs.blockForPending();
if(oldv == v){ // got the lock, compute the rollups
addToPendingCount(1);
new Roll(new H2OCallback<Roll>(this) {
@Override
public void callback(Roll rs) {
// computed the stats, now compute histo if needed and install the response and quit
rs._rs._checksum ^= vec.length();
if(_computeHisto)
computeHisto(rs._rs, vec, nnn);
else installResponse(nnn,rs._rs);
}
},_rsKey).dfork(vec);
break;
} // else someone else is modifying the rollups => try again
}
}
tryComplete();
}
final void computeHisto(final RollupStats rs, Vec vec, final Value nnn){
// All NAs or non-math; histogram has zero bins
if( rs._naCnt == vec.length() || vec.isUUID() ) {
rs._bins = new long[0];
installResponse(nnn,rs);
return;
}
// Constant: use a single bin
double span = rs._maxs[0]-rs._mins[0];
final long rows = vec.length()-rs._naCnt;
assert rows > 0:"rows = " + rows + ", vec.len() = " + vec.length() + ", naCnt = " + rs._naCnt;
if( span==0 ) {
rs._bins = new long[]{rows};
installResponse(nnn,rs);
return;
}
// Number of bins: MAX_SIZE by default. For integers, bins for each unique int
// - unless the count gets too high; allow a very high count for enums.
int nbins=MAX_SIZE;
if( rs._isInt && span < Integer.MAX_VALUE ) {
nbins = (int)span+1; // 1 bin per int
int lim = vec.isEnum() ? Categorical.MAX_CATEGORICAL_COUNT : MAX_SIZE;
nbins = Math.min(lim,nbins); // Cap nbins at sane levels
}
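// Example (illustrative): ints spanning 0..41 get 42 bins, one per value; ints
// spanning 0..99999 are capped back to MAX_SIZE=1000 bins; an enum with 10,000
// levels is allowed all 10,000 bins, up to Categorical.MAX_CATEGORICAL_COUNT.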
addToPendingCount(1);
new Histo(new H2OCallback<Histo>(this) {
@Override public void callback(Histo histo) {
assert ArrayUtils.sum(histo._bins) == rows;
rs._bins = histo._bins;
// Compute percentiles from histogram
rs._pctiles = new double[Vec.PERCENTILES.length];
int j = 0; // Histogram bin number
int k = 0; // The next non-zero bin after j
long hsum = 0; // Rolling histogram sum
double base = rs.h_base();
double stride = rs.h_stride();
double lastP = -1.0; // any negative value to pass assert below first time
for (int i = 0; i < Vec.PERCENTILES.length; i++) {
final double P = Vec.PERCENTILES[i];
assert P>=0 && P<=1 && P>=lastP; // rely on increasing percentiles here. If P has dup then strange but accept, hence >= not >
lastP = P;
double pdouble = 1.0 + P*(rows-1); // following stats:::quantile.default type 7
long pint = (long) pdouble; // 1-based into bin vector
double h = pdouble - pint; // any fraction h to linearly interpolate between?
assert P!=1 || (h==0.0 && pint==rows); // i.e. max
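// Worked example (illustrative): rows=11, P=0.25 gives pdouble = 1 + 0.25*10 = 3.5,
// so pint=3 and h=0.5: the quantile sits halfway between the 3rd and 4th smallest
// values, matching R's stats::quantile type 7.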
while (hsum < pint) hsum += rs._bins[j++];
// j overshot by 1 bin; we added _bins[j-1] and this goes from too low to either exactly right or too big
// pint now falls in bin j-1 (the ++ happened even when hsum==pint), so grab that bin value now
rs._pctiles[i] = base + stride * (j - 1);
if (h>0 && pint==hsum) {
// linearly interpolate between adjacent non-zero bins
// i) pint is the last of (j-1)'s bin count (>1 when either duplicates exist in input, or stride makes dups at lower accuracy)
// AND ii) h>0 so we do need to find the next non-zero bin
if (k < j) k = j; // if j jumped ahead of k for an earlier P, catch k up to j
while (rs._bins[k]==0) k++; // find the next non-zero bin
rs._pctiles[i] += h * stride * (k-j+1);
} // otherwise either h==0 and the bin is exact, or the fraction stays within one bin
// either way the result is within one bin of the exact answer, i.e. within (max-min)/MAX_SIZE
}
installResponse(nnn, rs);
}
},rs,nbins).dfork(vec);
}
}
}