package hex.tree.isofor;
import hex.ModelCategory;
import hex.ModelMetricsBinomial;
import hex.ScoreKeeper;
import hex.genmodel.utils.DistributionFamily;
import hex.quantile.Quantile;
import hex.tree.*;
import hex.tree.DTree.DecidedNode;
import hex.tree.DTree.LeafNode;
import hex.tree.DTree.UndecidedNode;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import water.Iced;
import water.Job;
import water.Key;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.PrettyPrint;
import water.util.TwoDimTable;
import java.util.*;
import static water.util.RandomUtils.getRNG;
import static hex.tree.isofor.IsolationForestModel.IsolationForestParameters;
import static hex.tree.isofor.IsolationForestModel.IsolationForestOutput;
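// Example usage (a minimal sketch, not part of the original source; assumes a
// running H2O cluster and an already parsed training Frame `train`):
//
//   IsolationForestParameters p = new IsolationForestParameters();
//   p._train = train._key;   // key of the training frame
//   p._ntrees = 100;         // number of isolation trees to grow
//   p._sample_size = 256;    // rows sub-sampled for each tree
//   IsolationForestModel model = new IsolationForest(p).trainModel().get();
//   Frame scored = model.score(train); // includes the "score" column used below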
/**
 * Isolation Forest - an ensemble of randomly grown isolation trees for anomaly detection.
 * Rows that get isolated after fewer random splits (i.e. have a shorter average tree
 * path length) are considered more anomalous.
 */
public class IsolationForest extends SharedTree<IsolationForestModel, IsolationForestParameters, IsolationForestOutput> {
@Override public ModelCategory[] can_build() {
return new ModelCategory[]{
ModelCategory.AnomalyDetection
};
}
@Override public BuilderVisibility builderVisibility() {
return BuilderVisibility.Stable;
}
// Called from an http request
public IsolationForest(IsolationForestParameters parms ) { super(parms ); init(false); }
public IsolationForest(IsolationForestParameters parms, Key key) { super(parms, key); init(false); }
public IsolationForest(IsolationForestParameters parms, Job job ) { super(parms, job); init(false); }
public IsolationForest(boolean startup_once) { super(new IsolationForestParameters(), startup_once); }
@Override protected Driver trainModelImpl() { return new IsolationForestDriver(); }
@Override public boolean scoreZeroTrees() { return false; }
@Override public boolean isSupervised() { return false; }
@Override public boolean isResponseOptional() { return true; }
@Override protected ScoreKeeper.ProblemType getProblemType() { return ScoreKeeper.ProblemType.anomaly_detection; }
private transient VarSplits _var_splits;
@Override public void init(boolean expensive) {
super.init(expensive);
// Initialize local variables
if( _parms._mtries < 1 && _parms._mtries != -1 && _parms._mtries != -2 )
error("_mtries", "mtries must be -1 (converted to sqrt(features)) or -2 (All features) or >= 1 but it is " + _parms._mtries);
if( _train != null ) {
int ncols = _train.numCols();
if( _parms._mtries != -1 && _parms._mtries != -2 && !(1 <= _parms._mtries && _parms._mtries <= ncols))
error("_mtries","Computed mtries should be -1 or -2 or in interval [1," + ncols + "] but it is " + _parms._mtries);
}
if (_parms._distribution != DistributionFamily.AUTO && _parms._distribution != DistributionFamily.gaussian) {
throw new IllegalStateException("Isolation Forest doesn't expect the distribution to be specified by the user");
}
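    // Gaussian is forced internally: the SharedTree machinery expects a concrete
    // distribution, even though Isolation Forest itself does not optimize a loss.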
_parms._distribution = DistributionFamily.gaussian;
if (_parms._contamination != -1 && (_parms._contamination <= 0 || _parms._contamination > 0.5)) {
error("_contamination", "Contamination parameter needs to be in range (0, 0.5] or undefined (-1); but it is " + _parms._contamination);
}
if (_parms._valid != null) {
if (_parms._response_column == null) {
error("_response_column", "Response column needs to be defined when using a validation frame.");
} else if (expensive && vresponse() == null) {
error("_response_column", "Validation frame is missing response column `" + _parms._response_column + "`.");
}
if (_parms._contamination > 0) {
error("_contamination", "Contamination parameter cannot be used together with a validation frame.");
}
} else {
if (_parms._stopping_metric != ScoreKeeper.StoppingMetric.AUTO && _parms._stopping_metric != ScoreKeeper.StoppingMetric.anomaly_score) {
error("_stopping_metric", "Stopping metric `" + _parms._stopping_metric +
"` can only be used when a labeled validation frame is provided.");
}
}
if (expensive) {
if (vresponse() != null) {
      if (!vresponse().isBinary() || vresponse().domain() == null) {
        error("_response_column", "The response column of the validation frame needs to have a binary categorical domain (e.g. \"not anomaly\"/\"anomaly\").");
      }
}
if (response() != null) {
error("_training_frame", "Training frame should not have a response column");
}
}
}
@Override
protected void validateRowSampleRate() {
if (_parms._sample_rate == -1) {
if (_parms._sample_size <= 0) {
error("_sample_size", "Sample size needs to be a positive integer number but it is " + _parms._sample_size);
} else if (_train != null && _train.numRows() > 0) {
_parms._sample_rate = _parms._sample_size / (double) _train.numRows();
}
}
}
@Override
protected boolean validateStoppingMetric() {
return false; // disable the default stopping metric validation
}
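  // Isolation trees have no real target: each iteration fills the "work" column
  // with uniformly distributed random values that act as a synthetic response,
  // which makes the shared histogram-based splitting effectively random.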
private void randomResp(final long seed, final int iteration) {
new MRTask() {
@Override public void map(Chunk chks[]) {
Chunk c = chk_work(chks, 0);
final long chunk_seed = seed + (c.start() * (1 + iteration));
for (int i = 0; i < c._len; i++) {
double rnd = getRNG(chunk_seed + i).nextDouble();
chk_work(chks, 0).set(i, rnd);
}
}
}.doAll(_train);
}
@Override
protected DTree.DecidedNode makeDecided(DTree.UndecidedNode udn, DHistogram hs[], Constraints cs) {
return new IFDecidedNode(udn, hs, cs);
}
private class IFDecidedNode extends DTree.DecidedNode {
private IFDecidedNode(DTree.UndecidedNode n, DHistogram[] hs, Constraints cs) {
super(n, hs, cs, null);
}
@Override
public DTree.Split bestCol(DTree.UndecidedNode u, DHistogram hs[], Constraints cs) {
if( hs == null ) return null;
final int maxCols = u._scoreCols == null /* all cols */ ? hs.length : u._scoreCols.length;
      List<FindSplits> findSplits = new ArrayList<>();
      for (int i = 0; i < maxCols; i++) {
        int col = u._scoreCols == null ? i : u._scoreCols[i];
        if (hs[col] == null || hs[col].nbins() <= 1)
          continue; // nothing to split on in this column
        findSplits.add(new FindSplits(hs, cs, col, u));
      }
      // Isolation trees choose the split column at random: evaluate the candidates
      // in random order and accept the first one that yields a valid split
      Collections.shuffle(findSplits, _rand);
      for (FindSplits fs : findSplits) {
        DTree.Split split = fs.computeSplit();
        if (split != null)
          return split;
      }
      return null;
    }
  }

  @Override
  protected void addCustomInfo(IsolationForestOutput out) {
    if (_var_splits != null) {
      out._var_splits = _var_splits;
      out._variable_splits = _var_splits.toTwoDimTable(out._names, "Variable Splits");
    }
    if (_parms._contamination > 0) {
assert vresponse() == null; // contamination is not compatible with using validation frame
assert _model.outputAnomalyFlag();
Frame fr = _model.score(_train);
try {
Vec score = fr.vec("score");
assert score != null;
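        // The default anomaly threshold is the (1 - contamination) quantile of the
        // training scores, i.e. the top `contamination` fraction of rows is flagged as anomalous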
out._defaultThreshold = Quantile.calcQuantile(score, 1 - _parms._contamination);
} finally {
fr.delete();
}
} else if (_model._output._validation_metrics instanceof ModelMetricsBinomial) {
out._defaultThreshold = ((ModelMetricsBinomial) _model._output._validation_metrics)._auc.defaultThreshold();
}
}
// ----------------------
private class IsolationForestDriver extends Driver {
@Override protected boolean doOOBScoring() { return true; }
@Override protected void initializeModelSpecifics() {
_mtry_per_tree = Math.max(1, (int)(_parms._col_sample_rate_per_tree * _ncols));
if (!(1 <= _mtry_per_tree && _mtry_per_tree <= _ncols)) throw new IllegalArgumentException("Computed mtry_per_tree should be in interval <1,"+_ncols+"> but it is " + _mtry_per_tree);
      if (_parms._mtries == -2) { // -2 means: use all columns for each split, regardless of which columns were dropped during training
        _mtry = _ncols;
      } else if (_parms._mtries == -1) {
        _mtry = (isClassifier() ? Math.max((int) Math.sqrt(_ncols), 1) : Math.max(_ncols / 3, 1)); // classification: mtry=sqrt(ncols), regression: mtry=ncols/3
      } else {
        _mtry = _parms._mtries;
      }
if (!(1 <= _mtry && _mtry <= _ncols)) {
throw new IllegalArgumentException("Computed mtry should be in interval <1," + _ncols + "> but it is " + _mtry);
}
_initialPrediction = 0;
_var_splits = new VarSplits(_ncols);
if ((_parms._contamination > 0) || (vresponse() != null)) {
_model._output._defaultThreshold = 0.5;
assert _model.outputAnomalyFlag();
}
}
// --------------------------------------------------------------------------
    // Build the next tree of the forest (Isolation Forest grows one tree per iteration)
@Override protected boolean buildNextKTrees() {
// Create a Random response
randomResp(_parms._seed, _model._output._ntrees);
final long rseed = _rand.nextLong();
final DTree tree = new DTree(_train, _ncols, _mtry, _mtry_per_tree, rseed, _parms);
final DTree[] ktrees = {tree};
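      // Sub-sample rows for this tree; rows that are not sampled are marked as
      // out-of-bag (OOB) in the NIDs column and are scored against the finished tree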
new Sample(tree, _parms._sample_rate, null)
.dfork(null, new Frame(vec_nids(_train, 0), vec_work(_train, 0)), _parms._build_tree_one_node)
.getResult();
// Assign rows to nodes - fill the "NIDs" column(s)
growTree(rseed, ktrees);
// Reset NIDs
CalculatePaths stats = new CalculatePaths(ktrees[0]).doAll(_train, _parms._build_tree_one_node);
// Grow the model by K-trees
_model._output.addKTrees(ktrees);
_model._output._min_path_length = stats._minPathLength;
_model._output._max_path_length = stats._maxPathLength;
return false; // never stop early
}
    // Assumes that the "Work" column is filled with a copy of a randomly generated response
private void growTree(long rseed, final DTree[] ktrees) {
// Initial set of histograms. All trees; one leaf per tree (the root
// leaf); all columns
DHistogram hcs[][][] = new DHistogram[_nclass][1/*just root leaf*/][_ncols];
// Adjust real bins for the top-levels
int adj_nbins = Math.max(_parms._nbins_top_level,_parms._nbins);
// Initially setup as-if an empty-split had just happened
final DTree tree = ktrees[0];
new UndecidedNode(tree, -1, DHistogram.initialHist(_train, _ncols, adj_nbins, hcs[0][0], rseed, _parms, getGlobalSplitPointsKeys(), null, false, null), null, null); // The "root" node
// ----
// One Big Loop till the ktrees are of proper depth.
// Adds a layer to the trees each pass.
final int[] leafs = new int[1];
for(int depth=0 ; depth<_parms._max_depth; depth++ ) {
hcs = buildLayer(_train, _parms._nbins, ktrees, leafs, hcs, _parms._build_tree_one_node);
// If we did not make any new splits, then the tree is split-to-death
if( hcs == null ) break;
}
// Each tree bottomed-out in a DecidedNode; go 1 more level and insert
// LeafNodes to hold predictions.
int leaf = tree.len();
int depths[] = new int[leaf];
      for (int nid = 0; nid < leaf; nid++) {
        if (tree.node(nid) instanceof DecidedNode) {
          DecidedNode dn = tree.decided(nid);
          if (dn._split == null) { // No decision here, no row should have this NID now
            if (nid == 0) { // Handle the trivial non-splitting tree
              LeafNode ln = new LeafNode(tree, -1, 0);
              ln._pred = 0;
            }
            continue;
          }
          depths[nid] = dn._pid >= 0 ? depths[dn._pid] + 1 : 0;
          for (int i = 0; i < dn._nids.length; i++) {
            int cnid = dn._nids[i];
            if (cnid == -1 || // Bottomed out (predictors or responses known constant)
                tree.node(cnid) instanceof UndecidedNode || // Or chopped off for depth
                (tree.node(cnid) instanceof DecidedNode &&  // Or not possible to split
                 ((DecidedNode) tree.node(cnid))._split == null)) {
              LeafNode ln = new LeafNode(tree, nid);
              ln._pred = depths[nid] + 1; // the leaf "prediction" is its path length from the root
              dn._nids[i] = ln.nid(); // Mark a leaf here
            }
          }
          _var_splits.update(dn._split.col(), dn._split, depths[nid]); // record per-variable split statistics
        }
      }
    }

    // Computes the path length of every row in the newly built tree, tracks the
    // global min/max path lengths, and resets the NIDs column for the next tree
    private class CalculatePaths extends MRTask<CalculatePaths> {
private final DTree _tree;
// OUT
private int _minPathLength = Integer.MAX_VALUE;
private int _maxPathLength = 0;
private CalculatePaths(DTree tree) { _tree = tree; }
@Override public void map(Chunk[] chks) {
final Chunk tree = chk_tree(chks, 0);
final Chunk nids = chk_nids(chks, 0); // Node-ids for this tree/class
final Chunk oobt = chk_oobt(chks);
for (int row = 0; row < nids._len; row++) {
final int rawNid = (int) chk_nids(chks,0).at8(row);
final boolean wasOOBRow = ScoreBuildHistogram.isOOBRow(rawNid);
final int nid = wasOOBRow ? ScoreBuildHistogram.oob2Nid(rawNid) : rawNid;
final int depth = getNodeDepth(chks, row, nid);
if (wasOOBRow) {
double oobcnt = oobt.atd(row) + 1;
oobt.set(row, oobcnt);
}
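          // Add this tree's path length to the row's running total kept in the "tree"
          // column; PathTracker keeps the OOB portion separately retrievable, so the
          // OOB average can be recovered at scoring time (see decodeOOBPathLength in score1)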
final int total_len = PathTracker.encodeNewPathLength(tree, row, depth, wasOOBRow);
_maxPathLength = total_len > _maxPathLength ? total_len : _maxPathLength;
_minPathLength = total_len < _minPathLength ? total_len : _minPathLength;
// reset NIds
nids.set(row, 0);
}
}
@Override public void reduce(CalculatePaths mrt) {
_minPathLength = Math.min(_minPathLength, mrt._minPathLength);
_maxPathLength = Math.max(_maxPathLength, mrt._maxPathLength);
}
int getNodeDepth(Chunk[] chks, int row, int nid) {
if (_tree.root() instanceof LeafNode) {
return 0;
} else {
if (_tree.node(nid) instanceof UndecidedNode) // If we bottomed out the tree
nid = _tree.node(nid).pid(); // Then take parent's decision
DecidedNode dn = _tree.decided(nid); // Must have a decision point
if (dn._split == null) // Unable to decide?
dn = _tree.decided(_tree.node(nid).pid()); // Then take parent's decision
int leafnid = dn.getChildNodeID(chks, row); // Decide down to a leafnode
double depth = ((LeafNode) _tree.node(leafnid)).pred();
assert (int) depth == depth;
return (int) depth;
}
}
}
    @Override protected IsolationForestModel makeModel(Key<IsolationForestModel> modelKey, IsolationForestParameters parms) {
return new IsolationForestModel(modelKey, parms, new IsolationForestOutput(IsolationForest.this));
}
}
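  // Per-row scoring: fs[1] is the row's average per-tree OOB path length and fs[0]
  // the anomaly score. Instead of the c(n) normalization from the original Isolation
  // Forest paper, the path length is normalized against the min/max path lengths
  // recorded by CalculatePaths during training (see IsolationForestModel.normalizePathLength).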
@Override protected double score1(Chunk chks[], double weight, double offset, double fs[/*2*/], int row) {
assert weight == 1;
int len = PathTracker.decodeOOBPathLength(chk_tree(chks, 0), row);
fs[1] = len / chk_oobt(chks).atd(row); // average tree path length
fs[0] = _model.normalizePathLength(fs[1] * _model._output._ntrees); // score
return fs[0];
}
protected TwoDimTable createScoringHistoryTable() {
    List<String> colHeaders = new ArrayList<>();
    List<String> colTypes = new ArrayList<>();
    List<String> colFormat = new ArrayList<>();
colHeaders.add("Timestamp"); colTypes.add("string"); colFormat.add("%s");
colHeaders.add("Duration"); colTypes.add("string"); colFormat.add("%s");
colHeaders.add("Number of Trees"); colTypes.add("long"); colFormat.add("%d");
colHeaders.add("Mean Tree Path Length"); colTypes.add("double"); colFormat.add("%.5f");
colHeaders.add("Mean Anomaly Score"); colTypes.add("double"); colFormat.add("%.5f");
if (_parms._custom_metric_func != null) {
colHeaders.add("Training Custom"); colTypes.add("double"); colFormat.add("%.5f");
}
ScoreKeeper[] sks = _model._output._scored_train;
int rows = 0;
for (int i = 0; i < sks.length; i++) {
if (i != 0 && Double.isNaN(sks[i]._anomaly_score)) continue;
rows++;
}
TwoDimTable table = new TwoDimTable(
"Scoring History", null,
new String[rows],
colHeaders.toArray(new String[0]),
colTypes.toArray(new String[0]),
colFormat.toArray(new String[0]),
"");
int row = 0;
for( int i = 0; i {
public final int[] _splitCounts;
public final float[] _aggSplitRatios;
public final long[] _splitDepths;
private VarSplits(int ncols) {
_splitCounts = new int[ncols];
_aggSplitRatios = new float[ncols];
_splitDepths = new long[ncols];
}
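    // The split "ratio" measures imbalance: |n0 - n1| / (n0 + n1) is 0 for a perfectly
    // balanced split and approaches 1 when almost all rows fall on one side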
void update(int col, DTree.Split split, int depth) {
_aggSplitRatios[col] += Math.abs(split.n0() - split.n1()) / (split.n0() + split.n1());
_splitCounts[col]++;
_splitDepths[col] += depth + 1;
}
public TwoDimTable toTwoDimTable(String[] coef_names, String table_header) {
double[][] dblCellValues = new double[_splitCounts.length][];
for (int i = 0; i < _splitCounts.length; i++) {
dblCellValues[i] = new double[]{_splitCounts[i], _aggSplitRatios[i], _splitDepths[i]};
}
String[] col_headers = {"Count", "Aggregated Split Ratios", "Aggregated Split Depths"};
String[] col_types = {"int", "double", "long"};
String[] col_formats = {"%10d", "%5f", "%10d"};
return new TwoDimTable(table_header, null, coef_names, col_headers, col_types, col_formats,
"Variable", new String[_splitCounts.length][], dblCellValues);
}
}
}