package hex.deeplearning;
import hex.DataInfo;
import hex.genmodel.utils.DistributionFamily;
import static java.lang.Double.isNaN;
import hex.Model;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.*;
import water.fvec.Frame;
import water.util.*;
import java.util.Arrays;
import java.util.Random;
/**
 * This class contains the state of the Deep Learning model.
 * It is shared across the cluster: one instance per node.
 */
final public class DeepLearningModelInfo extends Iced {
public TwoDimTable summaryTable;
public DataInfo data_info;
public DataInfo data_info() {
return data_info;
}
// model is described by parameters and the following arrays
private Storage.DenseRowMatrix[] dense_row_weights; //one 2D weight matrix per layer (stored as a 1D array each)
private Storage.DenseVector[] biases; //one 1D bias array per layer
private Storage.DenseVector[] avg_activations; //one 1D array per hidden layer
// helpers for storing previous step deltas
// Note: These two arrays *could* be made transient and then initialized freshly in makeNeurons() and in DeepLearningTask.initLocal()
// But then, after each reduction, the weights would be lost and would have to restart afresh -> not *exactly* right, but close...
private Storage.DenseRowMatrix[] dense_row_weights_momenta;
private Storage.DenseVector[] biases_momenta;
// helpers for AdaDelta
private Storage.DenseRowMatrix[] dense_row_ada_dx_g;
private Storage.DenseVector[] biases_ada_dx_g;
private boolean[] _saw_missing_cats; // whether missing value was encountered for each categorical predictor - needed for varimp
// compute model size (number of model parameters required for making predictions)
// momenta are not counted here, but they are needed for model building
public long size() {
long siz = 0;
for (Storage.DenseRowMatrix w : dense_row_weights) if (w != null) siz += w.size();
for (Storage.Vector b : biases) siz += b.size();
return siz;
}
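  // Worked example (illustrative numbers): with 784 input columns after one-hot encoding,
  // hidden={200,200} and 10 output classes, units = {784, 200, 200, 10} and
  //   size() = 784*200 + 200    // input    -> hidden 1 (weights + biases)
  //          + 200*200 + 200    // hidden 1 -> hidden 2
  //          + 200*10  + 10     // hidden 2 -> output
  //          = 199,210 parameters; momenta and ADADELTA accumulators are not counted.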
/**
* Check whether a missing value was found for each categorical predictor
* @param cats activation of categorical buckets for a given row
*/
void checkMissingCats(int[] cats) {
if (cats == null) return;
if (_saw_missing_cats == null) return;
    for (int i = 0; i < cats.length; ++i) {
      if (_saw_missing_cats[i]) continue;
      // the missing-value bucket is the last level of categorical i, at offset _catOffsets[i+1]-1
      _saw_missing_cats[i] = (cats[i] == data_info._catOffsets[i + 1] - 1);
    }
  }

  // accessors to (shared) weights, biases and per-weight helper state
  boolean has_momenta() { return get_params()._momentum_start != 0 || get_params()._momentum_stable != 0; }
  boolean adaDelta() { return get_params()._adaptive_rate; }
  public final Storage.DenseRowMatrix get_weights(int i) { return dense_row_weights[i]; }
  public final Storage.DenseVector get_biases(int i) { return biases[i]; }
  public final Storage.DenseRowMatrix get_weights_momenta(int i) { return dense_row_weights_momenta[i]; }
  public final Storage.DenseVector get_biases_momenta(int i) { return biases_momenta[i]; }
  public final Storage.DenseRowMatrix get_ada_dx_g(int i) { return dense_row_ada_dx_g[i]; }

  // model parameters and the key of the owning model
  DeepLearningParameters parameters;
  private Key<Model> _model_id;
public final DeepLearningParameters get_params() { return parameters; }
public final void set_params(DeepLearningParameters p, Key<Model> model_id) {
parameters = (DeepLearningParameters) p.clone();
_model_id = model_id;
}
private double[] mean_rate;
private double[] rms_rate;
private double[] mean_bias;
private double[] rms_bias;
private double[] mean_weight;
public double[] rms_weight;
public double[] mean_a;
private volatile boolean unstable = false;
public boolean isUnstable() { return unstable; }
public void setUnstable() {
if (!unstable) computeStats();
unstable = true;
}
private long processed_global;
public synchronized long get_processed_global() { return processed_global; }
public synchronized void set_processed_global(long p) { processed_global = p; }
public synchronized void add_processed_global(long p) { processed_global += p; }
private long processed_local;
public synchronized long get_processed_local() { return processed_local; }
public synchronized void set_processed_local(long p) { processed_local = p; }
public synchronized void add_processed_local(long p) { processed_local += p; }
public synchronized long get_processed_total() { return processed_global + processed_local; }
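  // Bookkeeping of training progress: processed_global counts training rows seen by the entire
  // cluster since the start of training, while processed_local counts rows seen by this node
  // since the last reduction/averaging step (elastic averaging resets it to 0, see timeAverage()).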
// package local helpers
int[] units; //number of neurons per layer, extracted from parameters and from datainfo
final boolean _classification; // Classification cache (nclasses>1)
final Frame _train; // Prepared training frame
final Frame _valid; // Prepared validation frame
/**
* Dummy constructor, only to be used for deserialization from autobuffer
*/
private DeepLearningModelInfo() {
super(); // key is null
_classification = false;
_train = _valid = null;
}
/**
* Main constructor
* @param params Model parameters
* @param dinfo Data Info
* @param nClasses number of classes (1 for regression, 0 for autoencoder)
* @param train User-given training data frame, prepared by AdaptTestTrain
* @param valid User-specified validation data frame, prepared by AdaptTestTrain
*/
public DeepLearningModelInfo(final DeepLearningParameters params, Key model_id, final DataInfo dinfo, int nClasses, Frame train, Frame valid) {
_classification = nClasses > 1;
_train = train;
_valid = valid;
data_info = dinfo;
parameters = (DeepLearningParameters) params.clone(); //make a copy, don't change model's parameters
_model_id = model_id;
DeepLearningParameters.Sanity.modifyParms(parameters, parameters, nClasses); //sanitize the model_info's parameters
final int num_input = dinfo.fullN();
final int num_output = get_params()._autoencoder ? num_input :
(_classification && parameters._distribution != DistributionFamily.modified_huber ? train.vec(parameters._response_column).cardinality() : 1);
if (!get_params()._autoencoder) assert(num_output == nClasses || parameters._distribution == DistributionFamily.modified_huber );
_saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
assert (num_input > 0);
assert (num_output > 0);
if (has_momenta() && adaDelta())
throw new IllegalArgumentException("Cannot have non-zero momentum and adaptive rate at the same time.");
final int layers = get_params()._hidden.length;
// units (# neurons for each layer)
units = new int[layers + 2];
if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
else
units[0] = num_input;
System.arraycopy(get_params()._hidden, 0, units, 1, layers);
units[layers + 1] = num_output;
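    // Example (illustrative numbers): hidden={200,100} with 50 input columns after one-hot
    // encoding and a 3-class response gives units = {50, 200, 100, 3}; if max_categorical_features
    // is small, units[0] is capped at dinfo._nums + max_categorical_features instead.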
boolean printLevels = units[0] > 1000L;
boolean warn = units[0] > 100000L;
if (printLevels) {
final String[][] domains = dinfo._adaptedFrame.domains();
if (warn) {
Log.warn("===================================================================================================================================");
Log.warn(num_input + " input features" + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "") + ". Can be slow and require a lot of memory.");
}
FrameUtils.printTopCategoricalLevels(dinfo._adaptedFrame, warn, 10);
if (warn) {
Log.warn("Suggestions:");
Log.warn(" *) Limit the size of the first hidden layer");
if (dinfo._cats > 0) {
Log.warn(" *) Limit the total number of one-hot encoded features by setting 'categorical_encoding=\"enum_limited\"'");
Log.warn(" *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features' (experimental)");
Log.warn(" *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/deep-learning.html#faq");
}
Log.warn("===================================================================================================================================");
}
}
    // Maxout keeps two sets of weights per hidden neuron; all other activations keep one
    int[] mult = new int[layers + 1];
    for (int i = 0; i < layers; ++i)
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    mult[layers] = 1; // the output layer is never Maxout
    // weights: one matrix per pair of connected layers
    dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
    for (int i = 0; i <= layers; ++i)
      dense_row_weights[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1] /*rows*/, units[i] /*cols*/);
    // biases: one vector per hidden layer plus one for the output layer
    biases = new Storage.DenseVector[layers + 1];
    for (int i = 0; i <= layers; ++i)
      biases[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    // average activations: hidden layers only, used for sparsity regularization of autoencoders
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
avg_activations = new Storage.DenseVector[layers];
mean_a = new double[layers];
for (int i = 0; i < layers; ++i)
avg_activations[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
}
allocateHelperArrays();
// for diagnostics
mean_rate = new double[units.length-1];
rms_rate = new double[units.length-1];
mean_bias = new double[units.length-1];
rms_bias = new double[units.length-1];
mean_weight = new double[units.length-1];
rms_weight = new double[units.length-1];
}
/**
* Allocate helper arrays for momentum/learning rate, etc.
*/
  void allocateHelperArrays() {
    int[] mult = new int[units.length - 1];
    for (int i = 0; i < units.length - 1; ++i)
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    mult[units.length - 2] = 1; // the output layer is never Maxout
    if (has_momenta()) {
      // momentum buffers mirror the shapes of the weight matrices and bias vectors
      dense_row_weights_momenta = new Storage.DenseRowMatrix[dense_row_weights.length];
      biases_momenta = new Storage.DenseVector[biases.length];
      for (int i = 0; i < dense_row_weights.length; ++i) {
        dense_row_weights_momenta[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], units[i]);
        biases_momenta[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
      }
    } else if (adaDelta()) {
      // ADADELTA keeps two running averages per weight, interleaved at raw()[2*u] and raw()[2*u+1]
      dense_row_ada_dx_g = new Storage.DenseRowMatrix[dense_row_weights.length];
      biases_ada_dx_g = new Storage.DenseVector[biases.length];
      for (int i = 0; i < dense_row_weights.length; ++i) {
        dense_row_ada_dx_g[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], 2 * units[i]);
        biases_ada_dx_g[i] = new Storage.DenseVector(2 * mult[i] * units[i + 1]);
      }
    }
  }

  /**
   * Print a summary of the model state (for diagnostics)
   */
  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    if (get_params()._diagnostics && !get_params()._quiet_mode) {
      if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
for (int k = 0; k < get_params()._hidden.length; k++) {
sb.append("Average activation in hidden layer ").append(k).append(" is ").append(mean_a[k]).append(" \n");
}
}
createSummaryTable();
sb.append(summaryTable.toString(1));
}
return sb.toString();
}
/**
* Debugging printout
* @return String with useful info
*/
public String toStringAll() {
StringBuilder sb = new StringBuilder();
sb.append(toString());
for (int i = 0; i < units.length - 1; ++i)
sb.append("\nweights[").append(i).append("][]=").append(Arrays.toString(get_weights(i).raw()));
for (int i = 0; i < units.length - 1; ++i) {
sb.append("\nbiases[").append(i).append("][]=").append(Arrays.toString(get_biases(i).raw()));
}
if (has_momenta()) {
for (int i = 0; i < units.length - 1; ++i)
sb.append("\nweights_momenta[").append(i).append("][]=").append(Arrays.toString(get_weights_momenta(i).raw()));
}
if (biases_momenta != null) {
for (int i = 0; i < units.length - 1; ++i) {
sb.append("\nbiases_momenta[").append(i).append("][]=").append(Arrays.toString(biases_momenta[i].raw()));
}
}
sb.append("\nunits[]=").append(Arrays.toString(units));
sb.append("\nprocessed global: ").append(get_processed_global());
sb.append("\nprocessed local: ").append(get_processed_local());
sb.append("\nprocessed total: ").append(get_processed_total());
sb.append("\n");
return sb.toString();
}
/**
* Initialize weights/biases
*/
void initializeMembers(Key[] initial_weights, Key[] initial_biases) {
randomizeWeights();
//TODO: determine good/optimal/best initialization scheme for biases
// hidden layers
for (int i = 0; i < get_params()._hidden.length; ++i) {
if (get_params()._activation == DeepLearningParameters.Activation.Rectifier
|| get_params()._activation == DeepLearningParameters.Activation.RectifierWithDropout
|| get_params()._activation == DeepLearningParameters.Activation.Maxout
|| get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout
) {
// Arrays.fill(biases[i], 1.); //old behavior
Arrays.fill(biases[i].raw(), i == 0 ? 0.5f : 1f); //new behavior, might be slightly better
} else if (get_params()._activation == DeepLearningParameters.Activation.Tanh || get_params()._activation == DeepLearningParameters.Activation.TanhWithDropout) {
Arrays.fill(biases[i].raw(), 0f);
}
}
Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer
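    // Net effect, for illustration: with Rectifier or Maxout activations and hidden={200,200},
    // biases[0] starts at 0.5, biases[1] at 1.0 and the output-layer biases at 0; Tanh networks
    // start all biases at 0.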
if (initial_weights!=null || initial_biases!=null) {
Log.info("Initializing initial model state from user-given weights/biases.");
for (int i = 0; i < get_params()._hidden.length+1; ++i) {
if (initial_weights[i] == null) {
Log.info("No user-given weight matrix given for weights #" + (i+1) + ". Initializing those weights randomly.");
continue;
}
if (initial_biases[i] == null) {
Log.info("No user-given bias vector given for biases #" + (i+1) + ". Initializing those biases randomly.");
continue;
}
Frame w = initial_weights[i].get();
if (w==null) {
throw new IllegalArgumentException("User-given weight matrix for weights #" + (i+1) + " '" + initial_weights[i].toString() + "' not found. Initializing those weights randomly.");
}
if (w.numRows() != get_weights(i).rows() || w.numCols() != get_weights(i).cols()) {
throw new IllegalArgumentException("Dimensionality mismatch: initial_weights matrix #" + i +
" should have " + get_weights(i).rows() + " rows and " + get_weights(i).cols()
+ " columns, but has " + w.numRows() + " rows and " + w.numCols() + " columns.");
}
Frame b = initial_biases[i].get();
if (b==null) {
throw new IllegalArgumentException("User-given bias vector for biases #" + (i+1) + " '" + initial_biases[i].toString() + "' not found. Initializing those biases randomly.");
}
if (b.numRows() != get_biases(i).size() || b.numCols() != 1) {
throw new IllegalArgumentException("Dimensionality mismatch: initial_biases vector #" + i +
" should have " + get_biases(i).size() + " rows and 1"
+ " column, but has " + b.numRows() + " rows and " + b.numCols() + " column(s).");
}
        // copy the user-given values into the corresponding weight matrix and bias vector
        for (int c = 0; c < w.numCols(); ++c)
          for (int r = 0; r < w.numRows(); ++r)
            get_weights(i).set(r, c, (float) w.vec(c).at(r));
        for (int r = 0; r < b.numRows(); ++r)
          get_biases(i).set(r, (float) b.vec(0).at(r));
      }
    }
  }

  /**
   * Compute variable importances for the input features from the magnitudes of the connecting
   * weights (Gedeon-style)
   * @return normalized variable importances, one per input column
   */
  public float[] computeVariableImportances() {
    float[] vi = new float[units[0]];
    // Qik: importance of input i on output (or second-layer) unit k
    float[][] Qik = new float[units[0]][units[2]];
    {
      // Qik is accumulated from the connecting weight magnitudes (see the note after this method);
      // the importance of feature i is then the sum over all outputs k of the i->k importances
for (int i = 0; i < units[0]; i++) vi[i] = ArrayUtils.sum(Qik[i]);
}
//normalize importances such that max(vi) = 1
ArrayUtils.div(vi, ArrayUtils.maxValue(vi));
// zero out missing categorical variables if they were never seen
if (_saw_missing_cats != null) {
for (int i = 0; i < _saw_missing_cats.length; ++i) {
assert (data_info._catMissing[i]); //have a missing bucket for each categorical
if (!_saw_missing_cats[i]) vi[data_info._catOffsets[i + 1] - 1] = 0;
}
}
return vi;
}
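  // Sketch of the measure aggregated above (assuming the Gedeon-style weight-magnitude method):
  // Qik, the importance of input i on output k, accumulates products of absolute connecting
  // weights |w_ij| * |w_jk| through the first hidden layer j, normalized per output k; vi[i]
  // then sums Qik over k and is rescaled so that max(vi) == 1.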
/**
* Compute statistics about this model on all nodes
*/
public void computeStats() {
float[][] rate = get_params()._adaptive_rate ? new float[units.length - 1][] : null;
if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
for (int k = 0; k < get_params()._hidden.length; k++) {
mean_a[k] = 0;
for (int j = 0; j < avg_activations[k].size(); j++)
mean_a[k] += avg_activations[k].get(j);
mean_a[k] /= avg_activations[k].size();
}
}
for (int y = 0; y < units.length-1; y++) {
mean_rate[y] = rms_rate[y] = 0;
mean_bias[y] = rms_bias[y] = 0;
mean_weight[y] = rms_weight[y] = 0;
for (int u = 0; u < biases[y].size(); u++) {
mean_bias[y] += biases[y].get(u);
}
if (rate != null) rate[y] = new float[get_weights(y).raw().length];
for (int u = 0; u < get_weights(y).raw().length; u++) {
mean_weight[y] += get_weights(y).raw()[u];
if (rate != null) {
// final float RMS_dx = (float)Math.sqrt(ada[y][2*u]+(float)get_params().epsilon);
// final float invRMS_g = (float)(1/Math.sqrt(ada[y][2*u+1]+(float)get_params().epsilon));
final float RMS_dx = MathUtils.approxSqrt(get_ada_dx_g(y).raw()[2 * u] + (float) get_params()._epsilon);
final float invRMS_g = MathUtils.approxInvSqrt(get_ada_dx_g(y).raw()[2 * u + 1] + (float) get_params()._epsilon);
rate[y][u] = RMS_dx * invRMS_g; //not exactly right, RMS_dx should be from the previous time step -> but close enough for diagnostics.
mean_rate[y] += rate[y][u];
}
}
mean_bias[y] /= biases[y].size();
mean_weight[y] /= get_weights(y).size();
if (rate != null) mean_rate[y] /= rate[y].length;
for (int u = 0; u < biases[y].size(); u++) {
final double db = biases[y].get(u) - mean_bias[y];
rms_bias[y] += db * db;
}
for (int u = 0; u < get_weights(y).size(); u++) {
final double dw = get_weights(y).raw()[u] - mean_weight[y];
rms_weight[y] += dw * dw;
if (rate != null) {
final double drate = rate[y][u] - mean_rate[y];
rms_rate[y] += drate * drate;
}
}
rms_bias[y] = MathUtils.approxSqrt(rms_bias[y] / biases[y].size());
rms_weight[y] = MathUtils.approxSqrt(rms_weight[y] / get_weights(y).size());
if (rate != null) rms_rate[y] = MathUtils.approxSqrt(rms_rate[y]/ rate[y].length);
// rms_bias[y] = (float)Math.sqrt(rms_bias[y]/biases[y].length);
// rms_weight[y] = (float)Math.sqrt(rms_weight[y]/weights[y].length);
// if (rate != null) rms_rate[y] = (float)Math.sqrt(rms_rate[y]/rate[y].length);
// Abort the run if weights or biases are unreasonably large (Note that all input values are normalized upfront)
// This can happen with Rectifier units when L1/L2/max_w2 are all set to 0, especially when using more than 1 hidden layer.
final double thresh = 1e10;
final double bthresh = 1e5;
unstable |= isNaN(mean_bias[y]) || isNaN(rms_bias[y])
|| isNaN(mean_weight[y]) || isNaN(rms_weight[y])
// large weights
|| Math.abs(mean_weight[y]) > thresh
|| rms_weight[y] > thresh
// large biases
|| Math.abs(mean_bias[y]) > bthresh
|| rms_bias[y] > bthresh;
}
}
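  // For reference: the "rms" diagnostics above are standard deviations around the per-layer mean,
  //   rms_weight[y] ~ sqrt( (1/N) * sum_u (w_u - mean_weight[y])^2 )
  // and, with adaptive rate enabled, the per-weight learning rate is estimated ADADELTA-style as
  //   rate ~ RMS[dx]/RMS[g] = sqrt(E[dx^2]+eps) / sqrt(E[g^2]+eps)
  // from the two accumulators stored at raw()[2*u] and raw()[2*u+1].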
/**
* Unique identifier for this model's state, based on raw numbers
*/
protected long checksum_impl() {
computeStats();
Random rng = new Random(0xDECAFBBB);
double cs = Double.longBitsToDouble(get_params()._seed);
cs += size() * get_processed_total();
for (double d : mean_bias) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_bias) cs += (rng.nextDouble() * (d+123.23));
for (double d : mean_weight) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_weight) cs += (rng.nextDouble() * (d+123.23));
for (double d : mean_rate) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_rate) cs += (rng.nextDouble() * (d+123.23));
return Double.doubleToRawLongBits(cs);
}
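  // The checksum mixes the seed, size()*samples, and the RNG-weighted diagnostic moments (means
  // and RMS of weights, biases and rates), giving a cheap, reproducible fingerprint of the model
  // state: identical states hash identically (fixed RNG seed), different states almost surely differ.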
/**
* Time averaging as part of the Elastic Averaging Algorithm.
* Cf. equation 6 of arXiv:1412.6651v5.
* @param nodeAverageModel current average of per-node models
* @return Time-average of node-averages (consensus model, "the" model)
*/
public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
float pa = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
assert(pa > 0 && pa <= 1);
DeepLearningModelInfo elasticAverage = DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey()); //get latest version from DKV
if (elasticAverage == null || pa == 1) {
elasticAverage = IcedUtils.deepCopy(nodeAverageModel);
} else {
nodeAverageModel.mult(pa);
elasticAverage.mult(1 - pa);
elasticAverage.add(nodeAverageModel); //ignore processed local value set here
elasticAverage.set_processed_global(nodeAverageModel.get_processed_global());
}
elasticAverage.set_processed_local(0);
DKV.put(elasticAverage.elasticAverageModelInfoKey(), elasticAverage);
// nodeAverageModel.computeStats();
// elasticAverage.computeStats();
// Log.info("Local Model :\n" + nodeAverageModel.toString());
// Log.info("Elastic Average:\n" + elasticAverage.toString());
return elasticAverage;
}
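  // In equation form (cf. arXiv:1412.6651, eq. 6), with pa = _elastic_averaging_moving_rate:
  //   elasticAverage_new = (1 - pa) * elasticAverage_old + pa * nodeAverageModel
  // which is what the mult()/add() calls above compute; pa == 1 (or no consensus model in the
  // DKV yet) simply replaces the consensus model with a deep copy of the current node average.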
public Key localModelInfoKey(H2ONode node) {
return Key.make(_model_id + ".node" + node.index(), Key.HIDDEN_USER_KEY, true, node);
}
public Key elasticAverageModelInfoKey() {
return Key.make(_model_id + ".elasticaverage", Key.HIDDEN_USER_KEY, true, H2O.CLOUD._memary[0]);
}
static public class GradientCheck {
GradientCheck(int l, int r, int c) { layer=l; row=r; col=c; gradient=0;}
int layer;
int row;
int col;
double gradient;
void apply(int l, int r, int c, double g) {
if (r==row && c==col && l==layer) {
gradient += g;
}
}
}
static public GradientCheck gradientCheck = null;
static public GradientCheck gradientCheckBias = null;
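  // These static hooks are presumably used by gradient-checking tests: point gradientCheck (or
  // gradientCheckBias) at a single (layer, row, col) entry, let a training pass call apply() to
  // accumulate the analytic gradient for that entry, and compare it to a finite-difference estimate.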
}