package hex.deeplearning;
import hex.DataInfo;
import hex.genmodel.utils.DistributionFamily;
import static java.lang.Double.isNaN;
import hex.Model;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.*;
import water.fvec.Frame;
import water.util.*;
import java.util.Arrays;
import java.util.Random;
/**
 * This class contains the state of the Deep Learning model.
 * It is shared across the cluster: one instance per node.
 */
final public class DeepLearningModelInfo extends Iced {
public TwoDimTable summaryTable;
public DataInfo data_info;
public DataInfo data_info() {
return data_info;
}
// model is described by parameters and the following arrays
private Storage.DenseRowMatrix[] dense_row_weights; //one 2D weight matrix per layer (stored as a 1D array each)
private Storage.DenseVector[] biases; //one 1D bias array per layer
private Storage.DenseVector[] avg_activations; //one 1D array per hidden layer
// helpers for storing previous step deltas
// Note: These two arrays *could* be made transient and then initialized freshly in makeNeurons() and in DeepLearningTask.initLocal()
// But then, after each reduction, the weights would be lost and would have to restart afresh -> not *exactly* right, but close...
private Storage.DenseRowMatrix[] dense_row_weights_momenta;
private Storage.DenseVector[] biases_momenta;
// helpers for AdaDelta
private Storage.DenseRowMatrix[] dense_row_ada_dx_g;
private Storage.DenseVector[] biases_ada_dx_g;
private boolean[] _saw_missing_cats; // whether missing value was encountered for each categorical predictor - needed for varimp
// compute model size (number of model parameters required for making predictions)
// momenta are not counted here, but they are needed for model building
public long size() {
long siz = 0;
for (Storage.DenseRowMatrix w : dense_row_weights) if (w != null) siz += w.size();
for (Storage.Vector b : biases) siz += b.size();
return siz;
}
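  // Worked example (illustrative numbers): with 784 input columns after one-hot encoding,
  // hidden={200,200} and 10 output classes, units = {784, 200, 200, 10} and
  //   size() = 784*200 + 200    // input    -> hidden 1 (weights + biases)
  //          + 200*200 + 200    // hidden 1 -> hidden 2
  //          + 200*10  + 10     // hidden 2 -> output
  //          = 199,210 parameters; momenta and ADADELTA accumulators are not counted.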
/**
* Check whether a missing value was found for each categorical predictor
* @param cats activation of categorical buckets for a given row
*/
void checkMissingCats(int[] cats) {
if (cats == null) return;
if (_saw_missing_cats == null) return;
    for (int i = 0; i < cats.length; ++i) {
      if (_saw_missing_cats[i]) continue;
      // the missing-value bucket is the last level of categorical i, at offset _catOffsets[i+1]-1
      _saw_missing_cats[i] = (cats[i] == data_info._catOffsets[i + 1] - 1);
    }
  }

  // accessors to (shared) weights, biases and per-weight helper state
  boolean has_momenta() { return get_params()._momentum_start != 0 || get_params()._momentum_stable != 0; }
  boolean adaDelta() { return get_params()._adaptive_rate; }
  public final Storage.DenseRowMatrix get_weights(int i) { return dense_row_weights[i]; }
  public final Storage.DenseVector get_biases(int i) { return biases[i]; }
  public final Storage.DenseRowMatrix get_weights_momenta(int i) { return dense_row_weights_momenta[i]; }
  public final Storage.DenseVector get_biases_momenta(int i) { return biases_momenta[i]; }
  public final Storage.DenseRowMatrix get_ada_dx_g(int i) { return dense_row_ada_dx_g[i]; }

  // model parameters and the key of the owning model
  DeepLearningParameters parameters;
  private Key<Model> _model_id;
public final DeepLearningParameters get_params() { return parameters; }
public final void set_params(DeepLearningParameters p, Key<Model> model_id) {
parameters = (DeepLearningParameters) p.clone();
_model_id = model_id;
}
private double[] mean_rate;
private double[] rms_rate;
private double[] mean_bias;
private double[] rms_bias;
private double[] mean_weight;
public double[] rms_weight;
public double[] mean_a;
private volatile boolean unstable = false;
public boolean isUnstable() { return unstable; }
public void setUnstable() {
if (!unstable) computeStats();
unstable = true;
}
private long processed_global;
public synchronized long get_processed_global() { return processed_global; }
public synchronized void set_processed_global(long p) { processed_global = p; }
public synchronized void add_processed_global(long p) { processed_global += p; }
private long processed_local;
public synchronized long get_processed_local() { return processed_local; }
public synchronized void set_processed_local(long p) { processed_local = p; }
public synchronized void add_processed_local(long p) { processed_local += p; }
public synchronized long get_processed_total() { return processed_global + processed_local; }
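  // Bookkeeping of training progress: processed_global counts training rows seen by the entire
  // cluster since the start of training, while processed_local counts rows seen by this node
  // since the last reduction/averaging step (elastic averaging resets it to 0, see timeAverage()).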
// package local helpers
int[] units; //number of neurons per layer, extracted from parameters and from datainfo
final boolean _classification; // Classification cache (nclasses>1)
final Frame _train; // Prepared training frame
final Frame _valid; // Prepared validation frame
/**
* Dummy constructor, only to be used for deserialization from autobuffer
*/
private DeepLearningModelInfo() {
super(); // key is null
_classification = false;
_train = _valid = null;
}
/**
* Main constructor
* @param params Model parameters
* @param dinfo Data Info
* @param nClasses number of classes (1 for regression, 0 for autoencoder)
* @param train User-given training data frame, prepared by AdaptTestTrain
* @param valid User-specified validation data frame, prepared by AdaptTestTrain
*/
public DeepLearningModelInfo(final DeepLearningParameters params, Key model_id, final DataInfo dinfo, int nClasses, Frame train, Frame valid) {
_classification = nClasses > 1;
_train = train;
_valid = valid;
data_info = dinfo;
parameters = (DeepLearningParameters) params.clone(); //make a copy, don't change model's parameters
_model_id = model_id;
DeepLearningParameters.Sanity.modifyParms(parameters, parameters, nClasses); //sanitize the model_info's parameters
final int num_input = dinfo.fullN();
final int num_output = get_params()._autoencoder ? num_input :
(_classification && parameters._distribution != DistributionFamily.modified_huber ? train.vec(parameters._response_column).cardinality() : 1);
if (!get_params()._autoencoder) assert(num_output == nClasses || parameters._distribution == DistributionFamily.modified_huber );
_saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
assert (num_input > 0);
assert (num_output > 0);
if (has_momenta() && adaDelta())
throw new IllegalArgumentException("Cannot have non-zero momentum and adaptive rate at the same time.");
final int layers = get_params()._hidden.length;
// units (# neurons for each layer)
units = new int[layers + 2];
if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
else
units[0] = num_input;
System.arraycopy(get_params()._hidden, 0, units, 1, layers);
units[layers + 1] = num_output;
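    // Example (illustrative numbers): hidden={200,100} with 50 input columns after one-hot
    // encoding and a 3-class response gives units = {50, 200, 100, 3}; if max_categorical_features
    // is small, units[0] is capped at dinfo._nums + max_categorical_features instead.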
boolean printLevels = units[0] > 1000L;
boolean warn = units[0] > 100000L;
if (printLevels) {
final String[][] domains = dinfo._adaptedFrame.domains();
if (warn) {
Log.warn("===================================================================================================================================");
Log.warn(num_input + " input features" + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "") + ". Can be slow and require a lot of memory.");
}
FrameUtils.printTopCategoricalLevels(dinfo._adaptedFrame, warn, 10);
if (warn) {
Log.warn("Suggestions:");
Log.warn(" *) Limit the size of the first hidden layer");
if (dinfo._cats > 0) {
Log.warn(" *) Limit the total number of one-hot encoded features by setting 'categorical_encoding=\"enum_limited\"'");
Log.warn(" *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features' (experimental)");
Log.warn(" *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/deep-learning.html#faq");
}
Log.warn("===================================================================================================================================");
}
}
    // Maxout keeps two sets of weights per hidden neuron; all other activations keep one
    int[] mult = new int[layers + 1];
    for (int i = 0; i < layers; ++i)
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    mult[layers] = 1; // the output layer is never Maxout
    // weights: one matrix per pair of connected layers
    dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
    for (int i = 0; i <= layers; ++i)
      dense_row_weights[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1] /*rows*/, units[i] /*cols*/);
    // biases: one vector per hidden layer plus one for the output layer
    biases = new Storage.DenseVector[layers + 1];
    for (int i = 0; i <= layers; ++i)
      biases[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    // average activations: hidden layers only, used for sparsity regularization of autoencoders
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
avg_activations = new Storage.DenseVector[layers];
mean_a = new double[layers];
for (int i = 0; i < layers; ++i)
avg_activations[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
}
allocateHelperArrays();
// for diagnostics
mean_rate = new double[units.length-1];
rms_rate = new double[units.length-1];
mean_bias = new double[units.length-1];
rms_bias = new double[units.length-1];
mean_weight = new double[units.length-1];
rms_weight = new double[units.length-1];
}
/**
* Allocate helper arrays for momentum/learning rate, etc.
*/
  void allocateHelperArrays() {
    int[] mult = new int[units.length - 1];
    for (int i = 0; i < units.length - 1; ++i)
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    mult[units.length - 2] = 1; // the output layer is never Maxout
    if (has_momenta()) {
      // momentum buffers mirror the shapes of the weight matrices and bias vectors
      dense_row_weights_momenta = new Storage.DenseRowMatrix[dense_row_weights.length];
      biases_momenta = new Storage.DenseVector[biases.length];
      for (int i = 0; i < dense_row_weights.length; ++i) {
        dense_row_weights_momenta[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], units[i]);
        biases_momenta[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
      }
    } else if (adaDelta()) {
      // ADADELTA keeps two running averages per weight, interleaved at raw()[2*u] and raw()[2*u+1]
      dense_row_ada_dx_g = new Storage.DenseRowMatrix[dense_row_weights.length];
      biases_ada_dx_g = new Storage.DenseVector[biases.length];
      for (int i = 0; i < dense_row_weights.length; ++i) {
        dense_row_ada_dx_g[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], 2 * units[i]);
        biases_ada_dx_g[i] = new Storage.DenseVector(2 * mult[i] * units[i + 1]);
      }
    }
  }

  /**
   * Print a summary of the model state (for diagnostics)
   */
  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    if (get_params()._diagnostics && !get_params()._quiet_mode) {
      if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
for (int k = 0; k < get_params()._hidden.length; k++) {
sb.append("Average activation in hidden layer ").append(k).append(" is ").append(mean_a[k]).append(" \n");
}
}
createSummaryTable();
sb.append(summaryTable.toString(1));
}
return sb.toString();
}
/**
* Debugging printout
* @return String with useful info
*/
public String toStringAll() {
StringBuilder sb = new StringBuilder();
sb.append(toString());
for (int i = 0; i < units.length - 1; ++i)
sb.append("\nweights[").append(i).append("][]=").append(Arrays.toString(get_weights(i).raw()));
for (int i = 0; i < units.length - 1; ++i) {
sb.append("\nbiases[").append(i).append("][]=").append(Arrays.toString(get_biases(i).raw()));
}
if (has_momenta()) {
for (int i = 0; i < units.length - 1; ++i)
sb.append("\nweights_momenta[").append(i).append("][]=").append(Arrays.toString(get_weights_momenta(i).raw()));
}
if (biases_momenta != null) {
for (int i = 0; i < units.length - 1; ++i) {
sb.append("\nbiases_momenta[").append(i).append("][]=").append(Arrays.toString(biases_momenta[i].raw()));
}
}
sb.append("\nunits[]=").append(Arrays.toString(units));
sb.append("\nprocessed global: ").append(get_processed_global());
sb.append("\nprocessed local: ").append(get_processed_local());
sb.append("\nprocessed total: ").append(get_processed_total());
sb.append("\n");
return sb.toString();
}
/**
* Initialize weights/biases
*/
void initializeMembers(Key[] initial_weights, Key[] initial_biases) {
randomizeWeights();
//TODO: determine good/optimal/best initialization scheme for biases
// hidden layers
for (int i = 0; i < get_params()._hidden.length; ++i) {
if (get_params()._activation == DeepLearningParameters.Activation.Rectifier
|| get_params()._activation == DeepLearningParameters.Activation.RectifierWithDropout
|| get_params()._activation == DeepLearningParameters.Activation.Maxout
|| get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout
) {
// Arrays.fill(biases[i], 1.); //old behavior
Arrays.fill(biases[i].raw(), i == 0 ? 0.5f : 1f); //new behavior, might be slightly better
} else if (get_params()._activation == DeepLearningParameters.Activation.Tanh || get_params()._activation == DeepLearningParameters.Activation.TanhWithDropout) {
Arrays.fill(biases[i].raw(), 0f);
}
}
Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer
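    // Net effect, for illustration: with Rectifier or Maxout activations and hidden={200,200},
    // biases[0] starts at 0.5, biases[1] at 1.0 and the output-layer biases at 0; Tanh networks
    // start all biases at 0.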
if (initial_weights!=null || initial_biases!=null) {
Log.info("Initializing initial model state from user-given weights/biases.");
for (int i = 0; i < get_params()._hidden.length+1; ++i) {
if (initial_weights[i] == null) {
Log.info("No user-given weight matrix given for weights #" + (i+1) + ". Initializing those weights randomly.");
continue;
}
if (initial_biases[i] == null) {
Log.info("No user-given bias vector given for biases #" + (i+1) + ". Initializing those biases randomly.");
continue;
}
Frame w = initial_weights[i].get();
if (w==null) {
throw new IllegalArgumentException("User-given weight matrix for weights #" + (i+1) + " '" + initial_weights[i].toString() + "' not found. Initializing those weights randomly.");
}
if (w.numRows() != get_weights(i).rows() || w.numCols() != get_weights(i).cols()) {
throw new IllegalArgumentException("Dimensionality mismatch: initial_weights matrix #" + i +
" should have " + get_weights(i).rows() + " rows and " + get_weights(i).cols()
+ " columns, but has " + w.numRows() + " rows and " + w.numCols() + " columns.");
}
Frame b = initial_biases[i].get();
if (b==null) {
throw new IllegalArgumentException("User-given bias vector for biases #" + (i+1) + " '" + initial_biases[i].toString() + "' not found. Initializing those biases randomly.");
}
if (b.numRows() != get_biases(i).size() || b.numCols() != 1) {
throw new IllegalArgumentException("Dimensionality mismatch: initial_biases vector #" + i +
" should have " + get_biases(i).size() + " rows and 1"
+ " column, but has " + b.numRows() + " rows and " + b.numCols() + " column(s).");
}
        // copy the user-given values into the corresponding weight matrix and bias vector
        for (int c = 0; c < w.numCols(); ++c)
          for (int r = 0; r < w.numRows(); ++r)
            get_weights(i).set(r, c, (float) w.vec(c).at(r));
        for (int r = 0; r < b.numRows(); ++r)
          get_biases(i).set(r, (float) b.vec(0).at(r));
      }
    }
  }

  /**
   * Compute variable importances for the input features from the magnitudes of the connecting
   * weights (Gedeon-style)
   * @return normalized variable importances, one per input column
   */
  public float[] computeVariableImportances() {
    float[] vi = new float[units[0]];
    // Qik: importance of input i on output (or second-layer) unit k
    float[][] Qik = new float[units[0]][units[2]];
    {
      // Qik is accumulated from the connecting weight magnitudes (see the note after this method);
      // the importance of feature i is then the sum over all outputs k of the i->k importances
for (int i = 0; i < units[0]; i++) vi[i] = ArrayUtils.sum(Qik[i]);
}
//normalize importances such that max(vi) = 1
ArrayUtils.div(vi, ArrayUtils.maxValue(vi));
// zero out missing categorical variables if they were never seen
if (_saw_missing_cats != null) {
for (int i = 0; i < _saw_missing_cats.length; ++i) {
assert (data_info._catMissing[i]); //have a missing bucket for each categorical
if (!_saw_missing_cats[i]) vi[data_info._catOffsets[i + 1] - 1] = 0;
}
}
return vi;
}
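  // Sketch of the measure aggregated above (assuming the Gedeon-style weight-magnitude method):
  // Qik, the importance of input i on output k, accumulates products of absolute connecting
  // weights |w_ij| * |w_jk| through the first hidden layer j, normalized per output k; vi[i]
  // then sums Qik over k and is rescaled so that max(vi) == 1.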
/**
* Compute statistics about this model on all nodes
*/
public void computeStats() {
float[][] rate = get_params()._adaptive_rate ? new float[units.length - 1][] : null;
if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
for (int k = 0; k < get_params()._hidden.length; k++) {
mean_a[k] = 0;
for (int j = 0; j < avg_activations[k].size(); j++)
mean_a[k] += avg_activations[k].get(j);
mean_a[k] /= avg_activations[k].size();
}
}
for (int y = 0; y < units.length-1; y++) {
mean_rate[y] = rms_rate[y] = 0;
mean_bias[y] = rms_bias[y] = 0;
mean_weight[y] = rms_weight[y] = 0;
for (int u = 0; u < biases[y].size(); u++) {
mean_bias[y] += biases[y].get(u);
}
if (rate != null) rate[y] = new float[get_weights(y).raw().length];
for (int u = 0; u < get_weights(y).raw().length; u++) {
mean_weight[y] += get_weights(y).raw()[u];
if (rate != null) {
// final float RMS_dx = (float)Math.sqrt(ada[y][2*u]+(float)get_params().epsilon);
// final float invRMS_g = (float)(1/Math.sqrt(ada[y][2*u+1]+(float)get_params().epsilon));
final float RMS_dx = MathUtils.approxSqrt(get_ada_dx_g(y).raw()[2 * u] + (float) get_params()._epsilon);
final float invRMS_g = MathUtils.approxInvSqrt(get_ada_dx_g(y).raw()[2 * u + 1] + (float) get_params()._epsilon);
rate[y][u] = RMS_dx * invRMS_g; //not exactly right, RMS_dx should be from the previous time step -> but close enough for diagnostics.
mean_rate[y] += rate[y][u];
}
}
mean_bias[y] /= biases[y].size();
mean_weight[y] /= get_weights(y).size();
if (rate != null) mean_rate[y] /= rate[y].length;
for (int u = 0; u < biases[y].size(); u++) {
final double db = biases[y].get(u) - mean_bias[y];
rms_bias[y] += db * db;
}
for (int u = 0; u < get_weights(y).size(); u++) {
final double dw = get_weights(y).raw()[u] - mean_weight[y];
rms_weight[y] += dw * dw;
if (rate != null) {
final double drate = rate[y][u] - mean_rate[y];
rms_rate[y] += drate * drate;
}
}
rms_bias[y] = MathUtils.approxSqrt(rms_bias[y] / biases[y].size());
rms_weight[y] = MathUtils.approxSqrt(rms_weight[y] / get_weights(y).size());
if (rate != null) rms_rate[y] = MathUtils.approxSqrt(rms_rate[y]/ rate[y].length);
// rms_bias[y] = (float)Math.sqrt(rms_bias[y]/biases[y].length);
// rms_weight[y] = (float)Math.sqrt(rms_weight[y]/weights[y].length);
// if (rate != null) rms_rate[y] = (float)Math.sqrt(rms_rate[y]/rate[y].length);
// Abort the run if weights or biases are unreasonably large (Note that all input values are normalized upfront)
// This can happen with Rectifier units when L1/L2/max_w2 are all set to 0, especially when using more than 1 hidden layer.
final double thresh = 1e10;
final double bthresh = 1e5;
unstable |= isNaN(mean_bias[y]) || isNaN(rms_bias[y])
|| isNaN(mean_weight[y]) || isNaN(rms_weight[y])
// large weights
|| Math.abs(mean_weight[y]) > thresh
|| rms_weight[y] > thresh
// large biases
|| Math.abs(mean_bias[y]) > bthresh
|| rms_bias[y] > bthresh;
}
}
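  // For reference: the "rms" diagnostics above are standard deviations around the per-layer mean,
  //   rms_weight[y] ~ sqrt( (1/N) * sum_u (w_u - mean_weight[y])^2 )
  // and, with adaptive rate enabled, the per-weight learning rate is estimated ADADELTA-style as
  //   rate ~ RMS[dx]/RMS[g] = sqrt(E[dx^2]+eps) / sqrt(E[g^2]+eps)
  // from the two accumulators stored at raw()[2*u] and raw()[2*u+1].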
/**
* Unique identifier for this model's state, based on raw numbers
*/
protected long checksum_impl() {
computeStats();
Random rng = new Random(0xDECAFBBB);
double cs = Double.longBitsToDouble(get_params()._seed);
cs += size() * get_processed_total();
for (double d : mean_bias) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_bias) cs += (rng.nextDouble() * (d+123.23));
for (double d : mean_weight) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_weight) cs += (rng.nextDouble() * (d+123.23));
for (double d : mean_rate) cs += (rng.nextDouble() * (d+123.23));
for (double d : rms_rate) cs += (rng.nextDouble() * (d+123.23));
return Double.doubleToRawLongBits(cs);
}
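  // The checksum mixes the seed, size()*samples, and the RNG-weighted diagnostic moments (means
  // and RMS of weights, biases and rates), giving a cheap, reproducible fingerprint of the model
  // state: identical states hash identically (fixed RNG seed), different states almost surely differ.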
/**
* Time averaging as part of the Elastic Averaging Algorithm.
* Cf. equation 6 of arXiv:1412.6651v5.
* @param nodeAverageModel current average of per-node models
* @return Time-average of node-averages (consensus model, "the" model)
*/
public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
float pa = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
assert(pa > 0 && pa <= 1);
DeepLearningModelInfo elasticAverage = DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey()); //get latest version from DKV
if (elasticAverage == null || pa == 1) {
elasticAverage = IcedUtils.deepCopy(nodeAverageModel);
} else {
nodeAverageModel.mult(pa);
elasticAverage.mult(1 - pa);
elasticAverage.add(nodeAverageModel); //ignore processed local value set here
elasticAverage.set_processed_global(nodeAverageModel.get_processed_global());
}
elasticAverage.set_processed_local(0);
DKV.put(elasticAverage.elasticAverageModelInfoKey(), elasticAverage);
// nodeAverageModel.computeStats();
// elasticAverage.computeStats();
// Log.info("Local Model :\n" + nodeAverageModel.toString());
// Log.info("Elastic Average:\n" + elasticAverage.toString());
return elasticAverage;
}
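  // In equation form (cf. arXiv:1412.6651, eq. 6), with pa = _elastic_averaging_moving_rate:
  //   elasticAverage_new = (1 - pa) * elasticAverage_old + pa * nodeAverageModel
  // which is what the mult()/add() calls above compute; pa == 1 (or no consensus model in the
  // DKV yet) simply replaces the consensus model with a deep copy of the current node average.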
public Key localModelInfoKey(H2ONode node) {
return Key.make(_model_id + ".node" + node.index(), Key.HIDDEN_USER_KEY, true, node);
}
public Key elasticAverageModelInfoKey() {
return Key.make(_model_id + ".elasticaverage", Key.HIDDEN_USER_KEY, true, H2O.CLOUD._memary[0]);
}
static public class GradientCheck {
GradientCheck(int l, int r, int c) { layer=l; row=r; col=c; gradient=0;}
int layer;
int row;
int col;
double gradient;
void apply(int l, int r, int c, double g) {
if (r==row && c==col && l==layer) {
gradient += g;
}
}
}
static public GradientCheck gradientCheck = null;
static public GradientCheck gradientCheckBias = null;
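  // These static hooks are presumably used by gradient-checking tests: point gradientCheck (or
  // gradientCheckBias) at a single (layer, row, col) entry, let a training pass call apply() to
  // accumulate the analytic gradient for that entry, and compare it to a finite-difference estimate.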
}