All downloads are free. Search and download functionality uses the official Maven repository.

hex.quantile.Quantile Maven / Gradle / Ivy

package hex.quantile;

import hex.ModelBuilder;
import hex.ModelCategory;
import hex.schemas.ModelBuilderSchema;
import hex.schemas.QuantileV3;
import water.DKV;
import water.H2O.H2OCountedCompleter;
import water.Job;
import water.MRTask;
import water.Scope;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.Log;

import java.util.Arrays;

/**
 *  Quantile model builder... building a simple QuantileModel
 */
public class Quantile extends ModelBuilder {

  /**
   * Creates the builder under the name "Quantile" and immediately runs the
   * cheap (non-expensive) parameter validation.
   * Original note: called from the Nano thread; the Quantile job itself is
   * started on a F/J thread.
   *
   * @param parms quantile parameters handed to the ModelBuilder superclass
   */
  public Quantile( QuantileModel.QuantileParameters parms ) {
    super("Quantile", parms);
    init(false);
  }

  /** @return a fresh {@link QuantileV3} schema instance for this builder. */
  public ModelBuilderSchema schema() {
    final QuantileV3 schema = new QuantileV3();
    return schema;
  }

  /**
   * Hands a new {@link QuantileDriver} to {@code start(...)} and returns the
   * result cast back to {@code Quantile}.
   *
   * @param work         amount of work, passed through to {@code start}
   * @param restartTimer passed through to {@code start}
   * @return the value returned by {@code start}, as a {@code Quantile}
   */
  @Override public Quantile trainModelImpl(long work, boolean restartTimer) {
    final QuantileDriver driver = new QuantileDriver();
    return (Quantile) start(driver, work, restartTimer);
  }

  /**
   * Number of progress units for this job: one unit per (column, probability)
   * pair of the training frame.
   *
   * @return training-frame column count multiplied by the number of requested
   *         probabilities
   */
  @Override
  public long progressUnits() {
    // Widen to long BEFORE multiplying; an int*int product would silently
    // wrap around before being converted to the long return type.
    return (long) train().numCols() * _parms._probs.length;
  }


  /** @return a single-element array holding {@link ModelCategory#Unknown}. */
  @Override public ModelCategory[] can_build() {
    final ModelCategory[] categories = { ModelCategory.Unknown };
    return categories;
  }

  /** @return {@code BuilderVisibility.Stable}. */
  @Override public BuilderVisibility builderVisibility() {
    return BuilderVisibility.Stable;
  }

  /** Validate the Quantile parameters.
   *
   *  Delegates to {@code super.init(expensive)} first, then checks every
   *  requested probability.  Each probability that is not inside the closed
   *  interval [0,1] - including NaN - is reported through
   *  {@code error("_probs", ...)}; one error is recorded per offending value.
   *
   *  @param expensive forwarded unchanged to {@code super.init}
   */
  @Override public void init(boolean expensive) {
    super.init(expensive);
    for( double p : _parms._probs ) {
      // Negated in-range test: every comparison with NaN is false, so the
      // form "p < 0.0 || p > 1.0" would let a NaN probability slip through.
      if( !(p >= 0.0 && p <= 1.0) )
        error("_probs","Probabilities must be between 0 and 1");
    }
  }

  // ----------------------
  private class QuantileDriver extends H2OCountedCompleter {

    @Override protected void compute2() {
      QuantileModel model = null;
      try {
        Scope.enter();
        _parms.read_lock_frames(Quantile.this); // Fetch & read-lock source frame
        init(true);

        // The model to be built
        model = new QuantileModel(dest(), _parms, new QuantileModel.QuantileOutput(Quantile.this));
        model._output._parameters = _parms;
        model._output._quantiles = new double[train().numCols()][_parms._probs.length];
        model.delete_and_lock(_key);

        // ---
        // Run the main Quantile Loop
        Vec vecs[] = train().vecs();
        for( int n=0; n {
    final double _probs[];
    final Frame _train;
    final QuantileModel.CombineMethod _combine_method;

    public double[][] _quantiles;
    /**
     * Stores the inputs for a later quantile computation.
     *
     * @param cc             completer passed to the superclass constructor
     * @param probs          probabilities to compute quantiles for
     * @param train          frame whose columns are processed
     * @param combine_method combine method kept in {@code _combine_method}
     */
    public QTask(H2OCountedCompleter cc, double[] probs, Frame train, QuantileModel.CombineMethod combine_method) {
      super(cc);
      _probs = probs;
      _train = train;
      _combine_method = combine_method;
    }
    @Override public void compute2() {
      // Run the main Quantile Loop
      _quantiles = new double[_train.numCols()][_probs.length];
      Vec vecs[] = _train.vecs();
      for( int n=0; n {
    // Histogram configuration; all of it is fixed by the constructor.
    private static final int NBINS=1024; // Default bin count
    private final int _nbins;            // Actual  bin count: NBINS, or ub-lb+1 for a narrow integer range
    private final double _lb;            // Lower bound of bin[0]
    private final double _step;          // Step-size per-bin (1 for a narrow integer range)
    private final long _start_row;       // Starting row number for this lower-bound
    private final long _nrows;           // Total dataset rows
    private final boolean _isInt;        // Column only holds ints

    // Big Data output result; the arrays are allocated in map() and merged in reduce()
    long   _bins[/*nbins*/];     // Rows in each bin
    double _mins[/*nbins*/];     // Smallest element in bin
    double _maxs[/*nbins*/];     // Largest  element in bin

    /**
     * Sets up the binning for the value range [lb,ub].
     *
     * For an integer column whose range is narrower than NBINS, one bin of
     * width 1 is used per integer value.  Otherwise NBINS equal-width bins are
     * used, with the upper bound nudged up by one ulp of the larger endpoint
     * magnitude before the width is computed.
     *
     * @param lb        lower bound of bin 0
     * @param ub        upper bound of the range
     * @param start_row starting row number for this lower bound
     * @param nrows     total number of rows
     * @param isInt     whether the column only holds ints
     */
    private Histo( double lb, double ub, long start_row, long nrows, boolean isInt  ) {
      _lb = lb;
      _start_row = start_row;
      _nrows = nrows;
      _isInt = isInt;
      final double range = ub-lb;
      if( isInt && range < NBINS ) {
        // Narrow integer range: one unit-wide bin per value
        _nbins = (int)(range+1);
        _step = 1;
      } else {
        final double ulp = Math.ulp(Math.max(Math.abs(lb),Math.abs(ub)));
        _nbins = NBINS;
        _step = (ub+ulp-lb)/_nbins;
      }
    }

    @Override public void map( Chunk chk ) {
      long   bins[] = _bins = new long  [_nbins];
      double mins[] = _mins = new double[_nbins];
      double maxs[] = _maxs = new double[_nbins];
      Arrays.fill(_mins, Double.MAX_VALUE);
      Arrays.fill(_maxs,-Double.MAX_VALUE);
      double d;
      for( int row=0; row maxs[i]) maxs[i] = d;
          }
          bins[i]++;               // Bump row counts
        }
      }
    }
    /**
     * Merges another partial histogram into this one: row counts are summed
     * per bin, and the per-bin minimum / maximum are kept.
     *
     * @param h the other partial histogram
     */
    @Override public void reduce( Histo h ) {
      ArrayUtils.add(_bins,h._bins);   // Sum row counts
      final double[] otherMins = h._mins;
      final double[] otherMaxs = h._maxs;
      for( int b=0; b<_nbins; b++ ) {
        // Strict comparisons on purpose; ties leave this histogram's value
        if( otherMins[b] < _mins[b] ) _mins[b] = otherMins[b];
        if( otherMaxs[b] > _maxs[b] ) _maxs[b] = otherMaxs[b];
      }
    }

    /**
     * Tries to resolve the quantile for one probability from the current
     * histogram.
     *
     * @param prob   probability to resolve
     * @param method combine method forwarded to {@code computeQuantile}
     * @return Quantile for probability prob, or NaN if another pass is needed.
     */
    double findQuantile( double prob, QuantileModel.CombineMethod method ) {
      final double fracRow = prob*(_nrows-1); // Desired fractional row number
      final long loRow = (long)fracRow;       // Row at or just below it
      final int loBin = findBin(loRow);       // Bin holding the low row
      final double lo;
      if( loBin == _nbins ) lo = binEdge(_nbins);
      else                  lo = _maxs[loBin];
      // Exact row landing in a single-valued bin: the quantile is that value
      if( loBin<_nbins && loRow==fracRow && _mins[loBin]==lo )
        return lo;

      final long hiRow = loRow+1;             // Next row up
      final int hiBin = findBin(hiRow);       // Bin holding the high row
      final double hi;
      if( hiBin == _nbins ) hi = binEdge(_nbins);
      else                  hi = _mins[hiBin];

      if( loBin != hiBin ) {
        // Rows straddle two bins: interpolate between the max of the low bin
        // and the min of the high bin
        return computeQuantile(lo,hi,loRow,_nrows,prob,method);
      }
      // Both rows in one bin: resolved only if lo and hi agree, else refine
      if( lo==hi ) return lo;
      return Double.NaN;
    }

    /** @return the lower edge of bin {@code idx}, i.e. {@code _lb} plus {@code idx} steps. */
    private double binEdge( int idx ) {
      final double offset = _step*idx;
      return _lb+offset;
    }

    // Index of the bin containing the given row, found by accumulating the
    // per-bin row counts on top of _start_row.  Returns _nbins when the row
    // lies just past the last bin (normally expect 0 to nbins-1).
    private int findBin( long row ) {
      long cumulative = _start_row;
      int bin = 0;
      while( bin < _nbins ) {
        cumulative += _bins[bin];
        if( row < cumulative ) return bin;
        bin++;
      }
      return _nbins;
    }

    // Run another pass over the data, with refined endpoints, to home in on
    // the exact elements for this probability.
    Histo refinePass( double prob ) {
      double prow = prob*(_nrows-1); // Desired fractional row number for this probability
      long lorow = (long)prow;       // Lower integral row number
      int loidx = findBin(lorow);    // Find bin holding low value
      // If loidx is the last bin, then high must be also the last bin - and we
      // have an exact quantile (equal to the high bin) and we didn't need
      // another refinement pass
      assert loidx < _nbins;
      double lo = _mins[loidx]; // Lower end of range to explore
      // If probability does not hit an exact row, we need the elements on
      // either side - so the next row up from the low row
      long hirow = lorow==prow ? lorow : lorow+1;
      int hiidx = findBin(hirow);    // Find bin holding high value
      // Upper end of range to explore - except at the very high end cap
      double hi = hiidx==_nbins ? binEdge(_nbins) : _maxs[hiidx];

      long sum = _start_row;
      for( int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy