hex.quantile.Quantile Maven / Gradle / Ivy

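Below is the source listing. For orientation, here is a minimal caller-side sketch of how this builder is typically driven (a hedged example, not code from this artifact: fr stands in for an already-parsed water.fvec.Frame, and the train-and-get flow is the generic H2O ModelBuilder pattern):

// Hedged usage sketch; assumes a running H2O cloud.
import hex.quantile.Quantile;
import hex.quantile.QuantileModel;
import water.fvec.Frame;

class QuantileExample {
  static double[][] quartiles(Frame fr) {
    QuantileModel.QuantileParameters parms = new QuantileModel.QuantileParameters();
    parms._train = fr._key;                       // frame to summarize
    parms._probs = new double[]{0.25, 0.5, 0.75}; // must lie in [0,1]; validated in init()
    QuantileModel model = new Quantile(parms).trainModel().get(); // block until the driver finishes
    try {
      return model._output._quantiles;            // indexed [column][prob]
    } finally {
      model.delete();                             // drop the model from the K/V store
    }
  }
}
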
package hex.quantile;

import hex.ModelBuilder;
import hex.ModelCategory;
import water.*;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.*;
import water.util.ArrayUtils;
import water.util.Log;

import java.util.Arrays;

/**
 *  Quantile model builder... building a simple QuantileModel
 */
public class Quantile extends ModelBuilder<QuantileModel,QuantileModel.QuantileParameters,QuantileModel.QuantileOutput> {
  private int _ncols;

  @Override protected boolean logMe() { return false; }

  // Called from Nano thread; start the Quantile Job on a F/J thread
  public Quantile( QuantileModel.QuantileParameters parms ) { super(parms); init(false); }
  public Quantile( QuantileModel.QuantileParameters parms, Job job ) { super(parms, job); init(false); }
  @Override public Driver trainModelImpl() { return new QuantileDriver(); }
  @Override public ModelCategory[] can_build() { return new ModelCategory[]{ModelCategory.Unknown}; }
  // any number of chunks is fine - don't rebalance - it's not worth it for a few passes over the data (at most)
  @Override protected int desiredChunks(final Frame original_fr, boolean local) { return 1;  }

  /** Initialize the ModelBuilder, validating all arguments and preparing the
   *  training frame.  This call is expected to be overridden in the subclasses
   *  and each subclass will start with "super.init();".  This call is made
   *  by the front-end whenever the GUI is clicked, and needs to be fast;
   *  heavy-weight prep needs to wait for the trainModel() call.
   *
   *  Validate the probs.
   */
  @Override public void init(boolean expensive) {
    super.init(expensive);
    for( double p : _parms._probs )
      if( p < 0.0 || p > 1.0 )
        error("_probs","Probabilities must be between 0 and 1");
    _ncols = train().numCols()-numSpecialCols(); //offset/weights/nfold - should only ever be weights
    if ( numSpecialCols() == 1 && _weights == null)
      throw new IllegalArgumentException("The only special Vec that is supported for Quantiles is observation weights.");
    if ( numSpecialCols() >1 ) throw new IllegalArgumentException("Cannot handle more than 1 special vec (weights)");
  }
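
  // For example, parms._probs = new double[]{-0.1, 0.5} trips the check above:
  // the -0.1 entry is reported as a validation error on the _probs field, and
  // training fails fast before any pass over the data is made.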

  private static class SumWeights extends MRTask<SumWeights> {
    double sum;
    @Override public void map(Chunk c, Chunk w) {
      for (int i=0; i<c.len(); ++i)
        if (!c.isNA(i)) {          // na.rm: missing values contribute no weight
          double wt = w.atd(i);
          if (wt > 0 && wt < 1) throw new H2OIllegalArgumentException("Quantiles only accepts weights that are either 0 or >= 1.");
          sum += wt;
        }
    }
    @Override public void reduce(SumWeights mrt) { sum+=mrt.sum; }
  }

  // ----------------------
  private class QuantileDriver extends Driver {

    @Override public void compute2() {
      QuantileModel model = null;
      try {
        Scope.enter();
        _parms.read_lock_frames(_job); // Fetch & read-lock source frame
        init(true);

        // The model to be built
        model = new QuantileModel(dest(), _parms, new QuantileModel.QuantileOutput(Quantile.this));
        model._output._parameters = _parms;
        model._output._quantiles = new double[_ncols][_parms._probs.length];
        model.delete_and_lock(_job);


        // ---
        // Run the main Quantile Loop
        Vec vecs[] = train().vecs();
        for( int n=0; n<_ncols; n++ ) {
          if( stop_requested() ) return; // Stopped/cancelled
          Vec vec = vecs[n];
          if (vec.isBad()) {
            model._output._quantiles[n] = new double[_parms._probs.length];
            Arrays.fill(model._output._quantiles[n], Double.NaN);
            continue;
          }
          double sumRows=_weights == null ? vec.length()-vec.naCnt() : new SumWeights().doAll(vec, _weights).sum;
          // Compute top-level histogram
          Histo h1 = new Histo(vec.min(),vec.max(),0,sumRows,vec.isInt());
          h1 = _weights==null ? h1.doAll(vec) : h1.doAll(vec, _weights);

          // For each probability, see if we have it exactly - or else run
          // passes until we do.
          for( int p = 0; p < _parms._probs.length; p++ ) {
            double prob = _parms._probs[p];
            Histo h = h1;  // Start from the first global histogram

            model._output._iterations++; // At least one iter per-prob-per-column
            while( Double.isNaN(model._output._quantiles[n][p] = h.findQuantile(prob,_parms._combine_method)) ) {
              h = _weights == null ? h.refinePass(prob).doAll(vec) : h.refinePass(prob).doAll(vec, _weights); // Full pass at higher resolution
              model._output._iterations++; // also count refinement iterations
            }

            // Update the model
            model.update(_job); // Update model in K/V store
            _job.update(0);     // One unit of work
          }
          StringBuilder sb = new StringBuilder();
          sb.append("Quantile: iter: ").append(model._output._iterations).append(" Qs=").append(Arrays.toString(model._output._quantiles[n]));
          Log.debug(sb);
        }
      } finally {
        if( model != null ) model.unlock(_job);
        _parms.read_unlock_frames(_job);
        Scope.exit(model == null ? null : model._key);
      }
      tryComplete();
    }
  }

  public static class StratifiedQuantilesTask extends H2O.H2OCountedCompleter<StratifiedQuantilesTask> {
    // INPUT
    final double _prob;
    final Vec _response; //vec to compute quantile for
    final Vec _weights; //obs weights
    final Vec _strata; //contiguous integer range mapping into _quantiles[stratum]
    final QuantileModel.CombineMethod _combine_method;

    // OUTPUT
    public double[/*strata*/] _quantiles;

    public StratifiedQuantilesTask(H2O.H2OCountedCompleter cc,
                                   double prob,
                                   Vec response, // response
                                   Vec weights,  // obs weights
                                   Vec strata,   // stratification (can be null)
                                   QuantileModel.CombineMethod combine_method) {
      super(cc); _response = response; _prob=prob; _combine_method=combine_method; _weights=weights; _strata=strata;
    }
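
    // Hedged caller-side sketch (not code from this file): submit on the F/J
    // pool and block for the per-stratum results, e.g. per-leaf medians:
    //   StratifiedQuantilesTask t = new StratifiedQuantilesTask(null, 0.5,
    //       response, null /*weights*/, strata, QuantileModel.CombineMethod.INTERPOLATE);
    //   H2O.submitTask(t).join();
    //   double[] medians = t._quantiles; // entry i holds stratum (strataMin+i)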

    @Override public void compute2() {
      final int strataMin = _strata == null ? 0 : (int) (Math.max(0,_strata.min()));
      final int strataMax = _strata == null ? 0 : (int) (Math.max(0,_strata.max()));
      final int nstrata = strataMax - strataMin + 1;
      Log.info("Computing quantiles for " + nstrata + " different strata.");
      _quantiles = new double[nstrata];
      Vec weights = _weights != null ? _weights : _response.makeCon(1);
      for (int i=strataMin;i<=strataMax;++i) { //loop over nodes
        Vec newWeights = weights.makeCopy();
        //only keep weights for this stratum (node), set rest to 0
        if (_strata!=null) new KeepOnlyOneStrata(i).doAll(_strata, newWeights);
        double sumRows = new SumWeights().doAll(_response, newWeights).sum;
        Histo h = new Histo(_response.min(), _response.max(), 0, sumRows, _response.isInt());
        h.doAll(_response, newWeights);
        while (Double.isNaN(_quantiles[i-strataMin] = h.findQuantile(_prob, _combine_method)))
          h = h.refinePass(_prob).doAll(_response, newWeights);
        newWeights.remove();
      }
      if (_weights != weights) weights.remove();
      tryComplete();
    }

    private static class KeepOnlyOneStrata extends MRTask<KeepOnlyOneStrata> {
      KeepOnlyOneStrata(int stratumToKeep) { this.stratumToKeep = stratumToKeep; }
      int stratumToKeep;
      @Override public void map(Chunk strata, Chunk newW) {
        for (int i=0; i<strata._len; ++i)
          if ((int)strata.at8(i) != stratumToKeep) newW.set(i, 0); // zero out rows outside this stratum
      }
    }
  }

  private final static class Histo extends MRTask<Histo> {
    private static final int NBINS = 1024; // Default bin count
    private final int _nbins;            // Actual  bin count
    private final double _lb;            // Lower bound of bin[0]
    private final double _step;          // Step-size per-bin
    private final double _start_row;     // Starting cumulative count of weighted rows for this lower-bound
    private final double _nrows;         // Total (weighted) rows in the dataset
    private final boolean _isInt;        // Column only holds ints

    // Big Data output result
    double _bins[/*nbins*/];     // Weighted count of rows in each bin
    double _mins[/*nbins*/];     // Smallest element in bin
    double _maxs[/*nbins*/];     // Largest  element in bin

    private Histo(double lb, double ub, double start_row, double nrows, boolean isInt) {
      boolean is_int = (isInt && (ub - lb < NBINS));
      _nbins = is_int ? (int) (ub - lb + 1) : NBINS;
      _lb = lb;
      double ulp = Math.ulp(Math.max(Math.abs(lb), Math.abs(ub)));
      _step = is_int ? 1 : (ub + ulp - lb) / _nbins;
      _start_row = start_row;
      _nrows = nrows;
      _isInt = isInt;
    }
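
    // Worked example of the sizing above: an integer column with lb=0, ub=9
    // gets is_int=true, so _nbins=10 and _step=1 - every value owns its own
    // bin and a single pass is exact.  A real column with lb=0.0, ub=1.0
    // keeps the default NBINS=1024 with _step=(1.0+ulp)/1024; the ulp of
    // padding makes ub map to bin 1023 instead of falling just past the end.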

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("range : " + _lb + " ... " + (_lb + _nbins * _step));
      sb.append("\npsum0 : " + _start_row);
      sb.append("\ncounts: " + Arrays.toString(_bins));
      sb.append("\nmaxs  : " + Arrays.toString(_maxs));
      sb.append("\nmins  : " + Arrays.toString(_mins));
      sb.append("\n");
      return sb.toString();
    }

    @Override
    public void map(Chunk chk, Chunk weight) {
      _bins = new double[_nbins];
      _mins = new double[_nbins];
      _maxs = new double[_nbins];
      Arrays.fill(_mins, Double.MAX_VALUE);
      Arrays.fill(_maxs, -Double.MAX_VALUE);
      double d;
      for (int row = 0; row < chk._len; row++) {
        double w = weight.atd(row);
        if (w == 0) continue;
        if (!Double.isNaN(d = chk.atd(row))) {  // na.rm=true
          double idx = (d - _lb) / _step;
          if (!(0.0 <= idx && idx < _bins.length)) continue;
          int i = (int) idx;
          if (_bins[i] == 0) _mins[i] = _maxs[i] = d; // Capture unique value
          else {
            if (d < _mins[i]) _mins[i] = d;
            if (d > _maxs[i]) _maxs[i] = d;
          }
          _bins[i] += w;               // Bump row counts by row weight
        }
      }
    }

    @Override
    public void map(Chunk chk) {
      map(chk, new C0DChunk(1, chk.len()));
    }

    @Override
    public void reduce(Histo h) {
      for (int i = 0; i < _nbins; i++) { // Keep min/max
        if (_mins[i] > h._mins[i]) _mins[i] = h._mins[i];
        if (_maxs[i] < h._maxs[i]) _maxs[i] = h._maxs[i];
      }
      ArrayUtils.add(_bins, h._bins);
    }

    /** @return Quantile for probability prob, or NaN if another pass is needed. */
    double findQuantile( double prob, QuantileModel.CombineMethod method ) {
      double p2 = prob*(_nrows-1); // Desired fractional row number for this probability
      long r2 = (long)p2;
      int loidx = findBin(r2);  // Find bin holding low value
      double lo = (loidx == _nbins) ? binEdge(_nbins) : _maxs[loidx];
      if( loidx<_nbins && r2==p2 && _mins[loidx]==lo ) return lo; // Exact row number, exact bin?  Then quantile is exact

      long r3 = r2+1;
      int hiidx = findBin(r3);  // Find bin holding high value
      double hi = (hiidx == _nbins) ? binEdge(_nbins) : _mins[hiidx];
      if( loidx==hiidx )        // Somewhere in the same bin?
        return (lo==hi) ? lo : Double.NaN; // Only if bin is constant, otherwise must refine the bin
      // Split across bins - interpolate between the hi of the lo bin and the
      // lo of the hi bin
      return computeQuantile(lo,hi,r2,_nrows,prob,method);
    }
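
    // Worked example: prob=0.5 with _nrows=5 gives p2=2.0, r2=2 - an exact
    // (weighted) row - so if the bin holding row 2 is constant (min==max) the
    // quantile is exact.  With _nrows=4, p2=1.5 straddles rows 1 and 2: if
    // both rows fall in one non-constant bin we return NaN and the caller
    // runs a refinement pass; if they fall in different bins, computeQuantile()
    // combines lo and hi according to the CombineMethod.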

    private double binEdge( int idx ) { return _lb+_step*idx; }

    // bin for row; can be _nbins if just off the end (normally expect 0 to nbins-1)
    // row == position in (weighted) population
    private int findBin( double row ) {
      long sum = (long)_start_row;
      for( int i=0; i<_nbins; i++ )
        if( (long)row < (sum += _bins[i]) )
          return i;
      return _nbins;
    }
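
    // E.g. with _start_row=0 and _bins={2,3,5}: row 4 returns bin 1, since
    // the cumulative count first exceeds 4 at 2+3=5; row 10 is just off the
    // end (10 is not < 2+3+5) and returns _nbins.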

    // Run another pass over the data, with refined endpoints, to home in on
    // the exact elements for this probability.
    Histo refinePass( double prob ) {
      double prow = prob*(_nrows-1); // Desired fractional row number for this probability
      long lorow = (long)prow;       // Lower integral row number
      int loidx = findBin(lorow);    // Find bin holding low value
      // If loidx is the last bin, then the high row must also land in the
      // last bin - so we have an exact quantile (equal to the high bin) and
      // didn't need another refinement pass
      assert loidx < _nbins;
      double lo = _mins[loidx]; // Lower end of range to explore
      // If probability does not hit an exact row, we need the elements on
      // either side - so the next row up from the low row
      long hirow = lorow==prow ? lorow : lorow+1;
      int hiidx = findBin(hirow);    // Find bin holding high value
      // Upper end of range to explore - except at the very high end cap
      double hi = hiidx==_nbins ? binEdge(_nbins) : _maxs[hiidx];

      long sum = (long)_start_row;
      for( int i=0; i<loidx; i++ )
        sum += _bins[i];
      return new Histo(lo,hi,sum,_nrows,_isInt); // Refined histogram spanning [lo,hi]
    }
  }
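
  /** The listing is truncated above; what follows is a hedged reconstruction
   *  sketch of the combine step referenced from findQuantile(), not the
   *  verbatim upstream method.  Given the lo and hi elements bracketing the
   *  desired (weighted) row, it resolves the tie per the requested
   *  CombineMethod, defaulting to linear interpolation. */
  static double computeQuantile( double lo, double hi, double row, double nrows,
                                 double prob, QuantileModel.CombineMethod method ) {
    if( lo == hi ) return lo;   // Exact single element; nothing to combine
    if( method == null ) method = QuantileModel.CombineMethod.INTERPOLATE;
    switch( method ) {
    case LOW:     return lo;            // Take the lower neighbor
    case HIGH:    return hi;            // Take the upper neighbor
    case AVERAGE: return 0.5*(lo+hi);   // Midpoint of the two neighbors
    case INTERPOLATE:
    default:
      // Fractional distance of the desired row prob*(nrows-1) past integral row `row`
      double frac = prob*(nrows-1) - row;
      return lo + frac*(hi-lo);
    }
  }
}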



