hex.tree.Score Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of h2o-algos Show documentation
H2O Algorithms
There is a newer version: 3.46.0.6
package hex.tree;

import hex.*;
import hex.genmodel.GenModel;
import hex.genmodel.utils.DistributionFamily;
import hex.tree.gbm.GBMModel;
import org.apache.log4j.Logger;
import water.Iced;
import water.Key;
import water.fvec.C0DChunk;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.udf.CFuncRef;

/** Score the tree columns, and produce a confusion matrix and AUC
 */
public class Score extends CMetricScoringTask {

  private static final Logger LOG = Logger.getLogger(Score.class);
  
  final SharedTree _bldr;
  final boolean _is_train;      // Scoring on pre-scored training data vs full-score data
  final boolean _oob;           // Computed on OOB
  final Key _kresp;        // Response vector key (might be either train or validation)
  final ModelCategory _mcat;    // Model category (Binomial, Regression, etc)

  final boolean _computeGainsLift;
  final ScoreIncInfo _sii;      // Incremental scoring (on a validation dataset), null indicates full scoring
  final Frame _preds;           // Prediction cache (typically not too many Vecs => it is not too costly embed the object in MRTask)
  final ScoreExtension _ext;    // Optional extension to customize scoring (eg. for Uplift)

  /** Output parameter: Metric builder */
  ModelMetrics.MetricBuilder _mb;

  /** Compute ModelMetrics on the testing dataset.
   *  It expect already adapted validation dataset which is adapted to a model
   *  and contains a response which is adapted to confusion matrix domain.
   */
  public Score(SharedTree bldr, boolean is_train, boolean oob, Vec kresp, ModelCategory mcat, boolean computeGainsLift, Frame preds, CFuncRef customMetricFunc) {
    this(bldr, is_train, null, oob, kresp, mcat, computeGainsLift, preds, customMetricFunc);
  }

  public Score(SharedTree bldr, ScoreIncInfo sii, boolean oob, Vec kresp, ModelCategory mcat, boolean computeGainsLift, Frame preds, CFuncRef customMetricFunc) {
    this(bldr, false, sii, oob, kresp, mcat, computeGainsLift, preds, customMetricFunc);
  }

  private Score(SharedTree bldr, boolean is_train, ScoreIncInfo sii, boolean oob, Vec kresp, ModelCategory mcat, boolean computeGainsLift, Frame preds, CFuncRef customMetricFunc) {
  super(customMetricFunc);
    _bldr = bldr; _is_train = is_train; _sii = sii; _oob = oob; _kresp = kresp != null ? kresp._key : null; _mcat = mcat; _computeGainsLift = computeGainsLift;
    _preds = computeGainsLift ? preds : null; // don't keep the prediction cache if we don't need to compute gainslift
    assert _kresp != null || !_bldr.isSupervised();
    assert (! _is_train) || (_sii == null);
    _ext = _bldr.makeScoreExtension();
  }

  @Override public void map(Chunk allchks[]) {
    final Chunk[] chks = getScoringChunks(allchks);
    Chunk ys; // Response
    if (_bldr.isSupervised()) {
      ys = _bldr.chk_resp(chks);
    } else if (_bldr.isResponseOptional() && _kresp != null) {
      ys = _kresp.get().chunkForChunkIdx(chks[0].cidx());
    } else {
      ys = new C0DChunk(0, chks[0]._len); // Dummy response to simplify code
    }
    SharedTreeModel m = _bldr._model;
    Chunk weightsChunk = m._output.hasWeights() ? chks[m._output.weightsIdx()] : null;
    Chunk offsetChunk = m._output.hasOffset() ? chks[m._output.offsetIdx()] : null;
    // Because of adaption - the validation training set has at least as many
    // classes as the training set (it may have more).  The Confusion Matrix
    // needs to be at least as big as the training set domain.
    final String[] domain;
    if (m._parms._distribution == DistributionFamily.quasibinomial) {
      domain = ((GBMModel) m)._output._quasibinomialDomains;
    } else {
      domain = _kresp != null ? _kresp.get().domain() : null;
    }
    final int nclass = _bldr.nclasses();
    _mb = m.makeMetricBuilder(domain);
    // If this is a score-on-train AND DRF, then oobColIdx makes sense,
    // otherwise this field is unused.
    final int oobColIdx = _bldr.idx_oobt();
    final double[] cdists = _mb._work; // Temp working array for class distributions
    // If working a validation set, need to push thru official model scoring
    // logic which requires a temp array to hold the features.
    final double[] tmp = _is_train && _bldr._ntrees > 0 ? null : new double[_bldr._ncols];

    // Score all Rows
    final int[] responseComplements = _ext == null ? new int[0] : _ext.getResponseComplements(m);
    final float[] val = new float[1 + responseComplements.length];
    for( int row=0; row 2) { // Fill in prediction for multinomial
        cdists[0] = GenModel.getPredictionMultinomial(cdists, m._output._priorClassDist, tmp);
      } else if (nclass == 2) {
        // for binomial the predicted class is not needed
        // and it even cannot be returned because the threshold is calculated based on model metrics that are not known yet
        // (we are just building the metrics)
        cdists[0] = -1;
      }
      val[0] = (float)ys.atd(row);
      if (responseComplements.length > 0) {
        for (int i = 0; i < responseComplements.length; i++) {
          val[1 + i] = (float) chks[responseComplements[i]].atd(row);
        }
      }
      _mb.perRow(cdists, val, weight, offset, m);

      if (_preds != null) {
        _mb.cachePrediction(cdists, allchks, row, chks.length, m);
      }

      // Compute custom metric if necessary
      customMetricPerRow(cdists, val, weight, offset, m);
    }
  }

  // scoring chunks are those chunks that make the input to one of the scoring functions
  private Chunk[] getScoringChunks(Chunk[] allChunks) {
    if (_preds == null)
      return allChunks;
    Chunk[] chks = new Chunk[allChunks.length - _preds.numCols()];
    System.arraycopy(allChunks, 0, chks, 0, chks.length);
    return chks;
  }

  @Override
  protected boolean modifiesVolatileVecs() {
    return _sii != null || _preds != null;
  }

  @Override public void reduce(Score t) {
    super.reduce(t);
    _mb.reduce(t._mb);
  }

  // We need to satsify MB invariant
  @Override protected void postGlobal() {
    super.postGlobal();
    if(_mb != null) {
      _mb.postGlobal(getComputedCustomMetric());
    }
  }

  ModelMetrics scoreAndMakeModelMetrics(SharedTreeModel model, Frame fr, Frame adaptedFr, boolean buildTreeOneNode) {
    Frame input = _preds != null ? new Frame(adaptedFr).add(_preds) : adaptedFr;
    return doAll(input, buildTreeOneNode)
            .makeModelMetrics(model, fr, adaptedFr, _preds);
  }

  // Run after the doAll scoring to convert the MetricsBuilder to a ModelMetrics
  private ModelMetrics makeModelMetrics(SharedTreeModel model, Frame fr, Frame adaptedFr, Frame preds) {
    ModelMetrics mm;
    if ((model._output.nclasses() == 2 && _computeGainsLift) || _ext != null) {
      assert preds != null : "Predictions were pre-created";
      mm = _mb.makeModelMetrics(model, fr, adaptedFr, preds);
    } else {
      boolean calculatePreds = preds == null && model.isDistributionHuber();
      // FIXME: PUBDEV-4992 we should avoid doing full scoring!
      if (calculatePreds) {
        LOG.warn("Going to calculate predictions from scratch. This can be expensive for large models! See PUBDEV-4992");
        preds = model.score(fr);
      }
      mm = _mb.makeModelMetrics(model, fr, null, preds);
      if (calculatePreds && (preds != null))
        preds.remove();
    }
    return mm;
  }

  static Frame makePredictionCache(SharedTreeModel model, Vec templateVec, String[] domain) {
    ModelMetrics.MetricBuilder mb = model.makeMetricBuilder(domain);
    return mb.makePredictionCache(model, templateVec);
  }

  public static class ScoreIncInfo extends Iced {
    public final int _startTree;
    public final int _workspaceColIdx;
    public final int _workspaceColCnt;
    public final int _predsAryOffset;

    public ScoreIncInfo(int startTree, int workspaceColIdx, int workspaceColCnt, int predsAryOffset) {
      _startTree = startTree;
      _workspaceColIdx = workspaceColIdx;
      _workspaceColCnt = workspaceColCnt;
      _predsAryOffset = predsAryOffset;
    }
  }

  public static abstract class ScoreExtension extends Iced {

    /**
     * Get prediction from per class-probabilities or algo-specific data
     *
     * @param cdist prediction array
     * @return prediction
     */
    protected abstract double getPrediction(double[] cdist);

    /**
     * Return indices of columns that need to be extracted from Frame chunks in addition to response
     * @param m instance of SharedTreeModel
     * @return training frame column indices
     */
    protected abstract int[] getResponseComplements(SharedTreeModel m);
  }

}