All downloads are free. Search and download functionality uses the official Maven repository.

water.fvec.RollupStats Maven / Gradle / Ivy

There is a newer version: 3.8.2.9
Show newest version
package water.fvec;

import water.Futures;

import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import water.*;
import water.H2O.H2OCallback;
import water.H2O.H2OCountedCompleter;
import water.nbhm.NonBlockingHashMap;
import water.parser.Categorical;
import water.parser.BufferedString;
import water.util.ArrayUtils;

import java.util.Arrays;

/** A class to compute the rollup stats.  These are computed lazily, thrown
 *  away if the Vec is written into, and then recomputed lazily.  Error to ask
 *  for them if the Vec is actively being written into.  It is common for all
 *  cores to ask for the same Vec rollup at once, so it is crucial that it be
 *  computed once across the cluster.
 *
 *  Rollups are kept in the K/V store, which also controls who manages the
 *  rollup work and final results.  Winner of a DKV CAS/PutIfMatch race gets to
 *  manage the M/R job computing the rollups.  Losers block for the same
 *  rollup.  Remote requests *always* forward to the Rollup Key's master.
 */
final class RollupStats extends Iced {
  /** Task that is computing these rollups.  It is assigned in
   *  ComputeRollupsTask.makeComputing (the task itself, or its completer) and
   *  joined in ComputeRollupsTask.compute2 by requests that find the rollups
   *  in the "computing" state. */
  volatile transient ForkJoinTask _tsk;

  // Computed in 1st pass
  /** The count of missing elements.... or -2 if we have active writers and no
   *  rollup info can be computed (because the vector is being rapidly
   *  modified!), or -1 while the rollups are being computed.  Any value >= 0
   *  means the rollups are ready (see isMutating, isComputing, isReady). */
  volatile long _naCnt; //count(isNA(X)), or one of the negative state markers
  // NOTE(review): Roll.postGlobal rewrites _sigma as sqrt(_sigma/(_rows-1)), so
  // _sigma is an accumulator before that step and the standard deviation
  // afterwards; confirm the exact accumulation against map/reduce.
  double _mean, _sigma; //mean and sigma of the non-NA values once postGlobal has run
  long    _rows,        //count(X) for non-NA values
          _nzCnt,       //count(X!=0) for non-NA values
          _size,        //byte size
          _pinfs,       //count(+inf)
          _ninfs;       //count(-inf)
  // Starts out true; map clears it for string columns.  When set, h_stride
  // widens the histogram span by one.
  boolean _isInt=true;
  // Five entries each (sized in the constructor); entry 0 of each is used as
  // the histogram base and span.
  double[] _mins, _maxs;
  // Data checksum; ComputeRollupsTask XORs the Vec length into it after the 1st pass.
  long _checksum;

  // Expensive histogram & percentiles
  // Computed in a 2nd pass, on-demand, by calling computeHisto
  private static final int MAX_SIZE = 1000; // Standard bin count; categoricals can have more bins
  // the choice of MAX_SIZE being a power of 10 (rather than 1024) just aligns-to-the-grid of the common input of fixed decimal
  // precision numbers. It is still an estimate and makes no difference mathematically. It just gives tidier output in some
  // simple cases without penalty.
  // Histogram bin counts; null until a histogram has been computed (see hasHisto).
  volatile long[] _bins;
  // Approximate data value closest to the Xth percentile
  double[] _pctiles;

  /** Reports whether histogram bins are attached to these rollups.
   *  @return true iff {@code _bins} is non-null */
  public boolean hasHisto() {
    final long[] bins = _bins;
    return bins != null;
  }

  /** State check: the Vec is being written to, so rollups cannot be requested.
   *  @return true iff {@code _naCnt} holds the marker value -2 */
  boolean isMutating() {
    final long state = _naCnt;
    return state == -2;
  }
  /** State check: the rollups are currently being computed.
   *  @return true iff {@code _naCnt} holds the marker value -1 */
  private boolean isComputing() {
    final long state = _naCnt;
    return state == -1;
  }
  /** State check: the rollups are available.
   *  @return true iff {@code _naCnt} is a real (non-negative) NA count */
  private boolean isReady() {
    final long state = _naCnt;
    return state >= 0;
  }

  private RollupStats(int mode) {
    _mins = new double[5];
    _maxs = new double[5];
    Arrays.fill(_mins, Double.MAX_VALUE);
    Arrays.fill(_maxs,-Double.MAX_VALUE);
    _pctiles = new double[Vec.PERCENTILES.length];  Arrays.fill(_pctiles, Double.NaN);
    _mean = _sigma = 0;
    _size = 0;
    _naCnt = mode;
  }

  /** Builds a placeholder in the "computing" state ({@code _naCnt == -1}). */
  private static RollupStats makeComputing() {
    final RollupStats placeholder = new RollupStats(-1);
    return placeholder;
  }
  /** Builds a placeholder in the "mutating" state ({@code _naCnt == -2}). */
  static RollupStats makeMutating() {
    final RollupStats placeholder = new RollupStats(-2);
    return placeholder;
  }

  private RollupStats map( Chunk c ) {
    _size = c.byteSize();
    boolean isUUID = c._vec.isUUID();
    boolean isString = c._vec.isString();
    BufferedString tmpStr = new BufferedString();
    if (isString) _isInt = false;
    // Checksum support
    long checksum = 0;
    long start = c._start;
    long l = 81985529216486895L;

    // Check for popular easy cases: All Constant
    double min=c.min(), max=c.max();
    if( min==max  ) {              // All constant or all NaN
      double d = min;             // It's the min, it's the max, it's the alpha and omega
      _checksum = (c.hasFloat()?Double.doubleToRawLongBits(d):(long)d)*c._len;
      Arrays.fill(_mins, d);
      Arrays.fill(_maxs, d);
      if( d == Double.POSITIVE_INFINITY) _pinfs++;
      else if( d == Double.NEGATIVE_INFINITY) _ninfs++;
      else {
        if( Double.isNaN(d)) _naCnt=c._len;
        else if( d != 0 ) _nzCnt=c._len;
        _mean = d;
        _rows=c._len;
      }
      _isInt = ((long)d) == d;
      _sigma = 0;               // No variance for constants
      return this;
    }

    //all const NaNs
    if ((c instanceof C0DChunk && c.isNA_impl(0))) {
      _sigma=0; //count of non-NAs * variance of non-NAs
      _mean = 0; //sum of non-NAs (will get turned into mean)
      _naCnt=c._len;
      _nzCnt=0;
      return this;
    }

    // Check for popular easy cases: Boolean, possibly sparse, possibly NaN
    if( min==0 && max==1 ) {
      int zs = c._len-c.sparseLenZero(); // Easy zeros
      int nans = 0;
      // Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
      for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) )
        if( c.isNA(i) ) nans++;
        else if( c.at8(i)==0 ) zs++;
      int os = c._len-zs-nans;  // Ones
      _nzCnt += os;
      _naCnt += nans;
      for( int i=0; i 0) {
          for( int i=0; i _maxs[i] )
        { double tmp = _maxs[i];  _maxs[i] = d;  d = tmp; }
    return _maxs[_maxs.length-1];
  }

  /** Map/reduce task computing the first-pass rollups of a Vec.  Every chunk
   *  is summarized into its own RollupStats, the partial results are merged in
   *  reduce, and postGlobal finalizes sigma and the min/max arrays.
   *
   *  The task is parameterized on itself so that reduce receives a Roll and
   *  doAll hands back a Roll. */
  private static class Roll extends MRTask<Roll> {
    final Key _rskey;           // Key of the rollups being computed
    RollupStats _rs;            // Result; stays null if map was never called
    Roll( H2OCountedCompleter cmp, Key rskey ) { super(cmp); _rskey=rskey; }
    @Override public void map( Chunk c ) { _rs = new RollupStats(0).map(c); }
    @Override public void reduce( Roll roll ) { _rs.reduce(roll._rs); }
    @Override public void postGlobal() {
      if( _rs == null )
        _rs = new RollupStats(0);
      else {
        // Turn the accumulated value into sigma; a single row has no spread
        _rs._sigma = Math.sqrt(_rs._sigma/(_rs._rows-1));
        if (_rs._rows == 1) _rs._sigma = 0;
        if (_rs._rows < 5) for (int i=0; i<5-_rs._rows; i++) {  // Fix PUBDEV-150 for files under 5 rows
          // Fewer rows than tracked extremes: blank out the unused trailing slots
          _rs._maxs[4-i] = Double.NaN;
          _rs._mins[4-i] = Double.NaN;
        }
      }
      // mean & sigma not allowed on more than 2 classes; for 2 classes the assumption is that it's true/false
      if( _fr.anyVec().isCategorical() && _fr.anyVec().domain().length > 2 )
        _rs._mean = _rs._sigma = Double.NaN;
    }
    // Just toooo common to report always.  Drowning in multi-megabyte log file writes.
    @Override public boolean logVerbose() { return false; }

    /**
     * Added to avoid deadlocks when running from IDEA in debug mode (evaluating
     * toString on an MR task causes rollups to be computed).
     * @return a short description containing only the key of the Vec being rolled up
     */
    @Override public String toString(){return "Roll(" + _fr.anyVec()._key +")";}
  }

  /** Kicks off a rollup computation in the background, unless usable rollups
   *  are already present.  The pending request is registered with {@code fs};
   *  this method does not wait for it.
   *  @param vec          Vec to roll up; InteractionWrappedVecs are ignored
   *  @param fs           collector that receives the pending remote call
   *  @param computeHisto whether a histogram is wanted as well (never for string columns)
   *  @throws RuntimeException if the Vec is no longer in the DKV */
  static void start(final Vec vec, Futures fs, boolean computeHisto) {
    if (vec instanceof InteractionWrappedVec) {
      return;
    }
    if (DKV.get(vec._key) == null) {
      throw new RuntimeException("Rollups not possible, because Vec was deleted: "+vec._key);
    }
    // No histogram for string columns
    final boolean wantHisto = !vec.isString() && computeHisto;
    final Key rskey = vec.rollupStatsKey();
    final RollupStats cached = getOrNull(vec, rskey);
    final boolean upToDate = cached != null && (!wantHisto || cached.hasHisto());
    if (upToDate) {
      return;
    }
    // Ask the home node of the rollup key to do the work
    final RPC request = new RPC(rskey.home_node(), new ComputeRollupsTask(vec, wantHisto));
    fs.add(request.addCompleter(new H2OCallback() {
      @Override public void callback(H2OCountedCompleter h2OCountedCompleter) {
        DKV.get(rskey); // fetch new results via DKV to enable caching of the results.
      }
    }).call());
  }

  /** Rollup requests issued from this node that are still in flight, keyed by
   *  the rollup-stats key.  Lets concurrent callers share a single request. */
  private static final NonBlockingHashMap<Key,RPC> _pendingRollups = new NonBlockingHashMap<>();

  /** Fetches the rollups of {@code vec}, computing them first if they are
   *  missing or lack a requested histogram.  Blocks until they are available.
   *  @param vec          Vec whose rollups are wanted
   *  @param computeHisto whether the histogram is required too (never for string columns)
   *  @return rollups that are ready and, if requested, carry a histogram
   *  @throws IllegalArgumentException if the Vec is being modified
   *  @throws RuntimeException if the Vec was deleted, or wrapping any failure
   *          of the remote computation */
  static RollupStats get(Vec vec, boolean computeHisto) {
    if( DKV.get(vec._key)== null ) throw new RuntimeException("Rollups not possible, because Vec was deleted: "+vec._key);
    if( vec.isString() ) computeHisto = false; // No histogram for string columns
    final Key rskey = vec.rollupStatsKey();
    RollupStats rs = DKV.getGet(rskey);
    while(rs == null || (!rs.isReady() || (computeHisto && !rs.hasHisto()))){
      if(rs != null && rs.isMutating())
        throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (1)");
      // 1. compute only once
      try {
        RPC rpcNew = new RPC(rskey.home_node(),new ComputeRollupsTask(vec, computeHisto));
        RPC rpcOld = _pendingRollups.putIfAbsent(rskey, rpcNew);
        if(rpcOld == null) {  // no prior pending task, need to send this one
          try {
            rpcNew.call().get();
          } finally {
            // Always unregister, also on failure: a failed request left in the
            // map would be handed to every later caller instead of a retry.
            _pendingRollups.remove(rskey);
          }
        } else // rollups computation is already in progress, wait for it to finish
          rpcOld.get();
      } catch( Throwable t ) {
        System.err.println("Remote rollups failed with an exception, wrapping and rethrowing: "+t);
        throw new RuntimeException(t);
      }
      // 2. fetch - done in two steps to go through standard DKV.get and enable local caching
      rs = DKV.getGet(rskey);
    }
    return rs;
  }
  /** Convenience overload: fetches the rollups of {@code vec}, computing them
   *  if necessary, without requesting the histogram. */
  static RollupStats get(Vec vec) {
    final boolean computeHisto = false;
    return get(vec, computeHisto);
  }
  /** Looks up the rollups without ever computing them.
   *  @param vec   Vec the rollups belong to
   *  @param rskey rollup-stats key of {@code vec}
   *  @return the stored rollups if they are ready; fresh empty rollups if
   *          nothing is stored and the Vec has no rows; otherwise null */
  static RollupStats getOrNull(Vec vec, final Key rskey ) {
    final Value stored = DKV.get(rskey);
    if (stored != null) {
      final RollupStats found = stored.get(RollupStats.class);
      if (found.isReady()) {
        return found;
      }
      return null;              // present, but computing or mutating
    }
    // No rollup stats present
    if (vec.length() > 0) {
      return null;              // not computed
    }
    return new RollupStats(0);  // empty vec
  }
  /** Histogram base: the value at which the first bin starts.
   *  @return the smallest tracked minimum, {@code _mins[0]} */
  double h_base() {
    final double smallest = _mins[0];
    return smallest;
  }
  /** Histogram stride (bin width) for the bins currently attached.
   *  Dereferences {@code _bins}, so a histogram must be present. */
  double h_stride() {
    final int nbins = _bins.length;
    return h_stride(nbins);
  }
  /** Bin width when the range between {@code _mins[0]} and {@code _maxs[0]}
   *  is cut into {@code nbins} bins; integer data gets its range widened by one.
   *  @param nbins number of bins
   *  @return the width of a single bin */
  private double h_stride(int nbins) {
    final double range = _maxs[0] - _mins[0];
    final int intPad = _isInt ? 1 : 0;
    return (range + intPad) / nbins;
  }

  /** Second-pass map/reduce task computing the expensive histogram: counts the
   *  non-NaN values of a Vec into {@code nbins} equal-width bins, starting at
   *  the rollups' h_base with a width of h_stride(nbins).
   *
   *  The task is parameterized on itself so that reduce receives a Histo and
   *  doAll hands back a Histo. */
  private static class Histo extends MRTask<Histo> {
    final double _base, _stride; // Inputs
    final int _nbins;            // Inputs
    long[] _bins;                // Outputs
    Histo( H2OCountedCompleter cmp, RollupStats rs, int nbins ) {
      super(cmp);
      _base = rs.h_base();
      _stride = rs.h_stride(nbins);
      _nbins = nbins;
    }
    @Override public void map( Chunk c ) {
      _bins = new long[_nbins];
      // Visit only the entries reported by nextNZ
      for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
        double d = c.atd(i);
        if( !Double.isNaN(d) ) _bins[idx(d)]++;
      }
      // Sparse?  We skipped all the zeros; do them now
      if( c.isSparseZero() )
        _bins[idx(0.0)] += (c._len - c.sparseLenZero());
    }
    // Bin index of a value; anything past the end is folded into the last bin
    private int idx( double d ) { int idx = (int)((d-_base)/_stride); return Math.min(idx,_bins.length-1); }

    @Override public void reduce( Histo h ) { ArrayUtils.add(_bins,h._bins); }
    // Just toooo common to report always.  Drowning in multi-megabyte log file writes.
    @Override public boolean logVerbose() { return false; }
  }


  // Task to compute rollups on its homenode if needed.
  // Only computes the rollups, does not fetch them, caller should fetch them via DKV store (to preserve caching).
  // Only computes the rollups if needed (i.e. are null or do not have histo and histo is required)
  // If rs computation is already in progress, it will wait for it to finish.
  // Throws IAE if the Vec is being modified (or removed) while this task is in progress.
  static final class ComputeRollupsTask extends DTask{
    final Key _vecKey;            // Key of the Vec to roll up
    final Key _rsKey;             // Key under which the rollups are stored
    final boolean _computeHisto;  // Is the histogram required as well?

    /** Creates the task for the given Vec.  The priority is the current
     *  thread's priority plus one when created on an H2O.FJWThr thread, and
     *  H2O.MIN_HI_PRIORITY-3 otherwise.
     *  @param v            Vec to roll up
     *  @param computeHisto whether the histogram is required */
    public ComputeRollupsTask(Vec v, boolean computeHisto){
      super((byte)(Thread.currentThread() instanceof H2O.FJWThr ? currThrPriority()+1 : H2O.MIN_HI_PRIORITY-3));
      _vecKey = v._key;
      _rsKey = v.rollupStatsKey();
      _computeHisto = computeHisto;
    }

    /** Builds the "computing" placeholder Value for the rollup key.  Its _tsk
     *  is this task's completer if there is one, else this task, so that other
     *  requests can join on it. */
    private Value makeComputing(){
      RollupStats newRs = RollupStats.makeComputing();
      CountedCompleter cc = getCompleter(); // should be null or RPCCall
      if(cc != null) assert cc.getCompleter() == null;
      newRs._tsk = cc == null?this:cc;
      return new Value(_rsKey,newRs);
    }
    /** Replaces the placeholder {@code nnn} in the DKV with the finished rollups.
     *  @param nnn the "computing" placeholder this task installed earlier
     *  @param rs  the finished rollups
     *  @throws IllegalArgumentException if the placeholder was no longer the
     *          stored value */
    private void installResponse(Value nnn, RollupStats rs) {
      Futures fs = new Futures();
      Value old = DKV.DputIfMatch(_rsKey, new Value(_rsKey, rs), nnn, fs);
      assert rs.isReady();
      if(old != nnn)
        throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (2)");
      fs.blockForPending();
    }

    /** Runs on the home node of the rollup key.  Loops until the rollups (and
     *  the histogram, if required) are present, either by computing them here
     *  or by waiting for another task that already is. */
    @Override
    public void compute2() {
      assert _rsKey.home();
      final Vec vec = DKV.getGet(_vecKey);
      while(true) {
        Value v = DKV.get(_rsKey);
        RollupStats rs = (v == null) ? null : v.get();
        // Fetched current rs from the DKV, rs can be:
        //   a) computed
        //        a.1) has histo or histo not required => do nothing
        //        a.2) no histo and histo is required  => only compute histo
        //   b) computing => wait for the task computing it to finish and check again
        //   c) mutating  => throw IAE
        //   d) null      => compute new rollups
        if (rs != null) {
          if (rs.isReady()) {
            if (_computeHisto && !rs.hasHisto()) { // a.2 => only compute histo
              CountedCompleter cc = getCompleter(); // should be null or RPCCall
              if(cc != null) assert cc.getCompleter() == null;
              // note: if cc == null then onExceptionalCompletion tasks waiting on this may be woken up before exception handling iff exception is thrown.
              Value nnn = makeComputing();
              Futures fs = new Futures();
              // Try to swap the ready rollups for our "computing" placeholder
              Value oldv = DKV.DputIfMatch(_rsKey, nnn, v, fs);
              fs.blockForPending();
              if(oldv == v){ // got the lock
                computeHisto(rs, vec, nnn);
                break;
              } // else someone else is modifying the rollups => try again
            } else
              break; // a.1 => do nothing
          } else if (rs.isComputing()) { // b) => wait for current computation to finish
            rs._tsk.join();
          } else if(rs.isMutating()) // c) => throw IAE
            throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (3)");
        } else { // d) => compute the rollups
          final Value nnn = makeComputing();
          Futures fs = new Futures();
          // Try to install our "computing" placeholder where nothing was stored
          Value oldv = DKV.DputIfMatch(_rsKey, nnn, v, fs);
          fs.blockForPending();
          if(oldv == v){ // got the lock, compute the rollups
            Roll r = new Roll(null,_rsKey).doAll(vec);
            // computed the stats, now compute histo if needed and install the response and quit
            r._rs._checksum ^= vec.length();
            if(_computeHisto)
              computeHisto(r._rs, vec, nnn);
            else
              installResponse(nnn, r._rs);
            break;
          } // else someone else is modifying the rollups => try again
        }
      }
      tryComplete();
    }

    /** Fills in the histogram bins and percentiles of {@code rs} and installs
     *  the result in place of the placeholder {@code nnn}.
     *  @param rs  ready first-pass rollups to extend
     *  @param vec the Vec being rolled up
     *  @param nnn the "computing" placeholder to replace */
    final void computeHisto(final RollupStats rs, Vec vec, final Value nnn) {
      // All NAs or non-math; histogram has zero bins
      if (rs._naCnt == vec.length() || vec.isUUID()) {
        rs._bins = new long[0];
        installResponse(nnn, rs);
        return;
      }
      // Constant: use a single bin
      double span = rs._maxs[0] - rs._mins[0];
      final long rows = vec.length() - rs._naCnt;
      assert rows > 0 : "rows = " + rows + ", vec.len() = " + vec.length() + ", naCnt = " + rs._naCnt;
      if (span == 0) {
        rs._bins = new long[]{rows};
        installResponse(nnn, rs);
        return;
      }
      // Number of bins: MAX_SIZE by default.  For integers, bins for each unique int
      // - unless the count gets too high; allow a very high count for categoricals.
      int nbins = MAX_SIZE;
      if (rs._isInt && span < Integer.MAX_VALUE) {
        nbins = (int) span + 1;      // 1 bin per int
        int lim = vec.isCategorical() ? Categorical.MAX_CATEGORICAL_COUNT : MAX_SIZE;
        nbins = Math.min(lim, nbins); // Cap nbins at sane levels
      }
      Histo histo = new Histo(null, rs, nbins).doAll(vec);
      assert ArrayUtils.sum(histo._bins) == rows;
      rs._bins = histo._bins;
      // Compute percentiles from histogram
      rs._pctiles = new double[Vec.PERCENTILES.length];
      int j = 0;                 // Histogram bin number
      int k = 0;                 // The next non-zero bin after j
      long hsum = 0;             // Rolling histogram sum
      double base = rs.h_base();
      double stride = rs.h_stride();
      double lastP = -1.0;       // any negative value to pass assert below first time
      for (int i = 0; i < Vec.PERCENTILES.length; i++) {
        final double P = Vec.PERCENTILES[i];
        assert P >= 0 && P <= 1 && P >= lastP;   // rely on increasing percentiles here. If P has dup then strange but accept, hence >= not >
        lastP = P;
        double pdouble = 1.0 + P * (rows - 1);   // following stats:::quantile.default type 7
        long pint = (long) pdouble;          // 1-based into bin vector
        double h = pdouble - pint;           // any fraction h to linearly interpolate between?
        assert P != 1 || (h == 0.0 && pint == rows);  // i.e. max
        while (hsum < pint) hsum += rs._bins[j++];
        // j overshot by 1 bin; we added _bins[j-1] and this goes from too low to either exactly right or too big
        // pint now falls in bin j-1 (the ++ happened even when hsum==pint), so grab that bin value now
        rs._pctiles[i] = base + stride * (j - 1);
        if (h > 0 && pint == hsum) {
          // linearly interpolate between adjacent non-zero bins
          //      i) pint is the last of (j-1)'s bin count (>1 when either duplicates exist in input, or stride makes dups at lower accuracy)
          // AND ii) h>0 so we do need to find the next non-zero bin
          if (k < j) k = j; // if j jumped over the k needed for the last P, catch k up to j
          // Saves potentially winding k forward over the same zero stretch many times
          while (rs._bins[k] == 0) k++;  // find the next non-zero bin
          rs._pctiles[i] += h * stride * (k - j + 1);
        } // otherwise either h==0 and we know which bin, or fraction is between two positions that fall in the same bin
        // this guarantees we are within one bin of the exact answer; i.e. within (max-min)/MAX_SIZE
      }
      installResponse(nnn, rs);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy