hex.deeplearning.DeepLearningModel Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of h2o-algos Show documentation
H2O Algorithms
There is a newer version: 3.46.0.6
package hex.deeplearning;

import hex.*;
import hex.genmodel.CategoricalEncoding;
import hex.genmodel.utils.DistributionFamily;
import hex.util.EffectiveParametersUtils;
import hex.util.LinearAlgebraUtils;
import water.*;
import water.codegen.CodeGenerator;
import water.codegen.CodeGeneratorPipeline;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.JCodeSB;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.udf.CFuncRef;
import water.util.*;

import java.lang.reflect.Field;
import java.util.Arrays;

import static hex.ModelMetrics.calcVarImp;
import static hex.deeplearning.DeepLearning.makeDataInfo;
import static hex.deeplearning.DeepLearningModel.DeepLearningParameters.Loss.*;
import static hex.genmodel.utils.DistributionFamily.poisson;
import static water.H2O.technote;

/**
 * The Deep Learning model.
 * It contains a DeepLearningModelInfo with the most up-to-date model,
 * a scoring history, as well as some helpers to indicate the progress.
 */

public class DeepLearningModel extends Model implements Model.DeepFeatures {
  /**
   * Returns the {@code ToEigenVec} implementation used by this model, which is the
   * shared {@code LinearAlgebraUtils.toEigen} instance.
   */
  @Override
  public ToEigenVec getToEigenVec() {
    return LinearAlgebraUtils.toEigen;
  }

  /**
   * Output of a Deep Learning model. On top of the metrics kept by Model.Output it carries:
   * 1) the latest scoring record (raw data)
   * 2) keys for the weight/bias frames plus normalization constants (raw data)
   * 3) variable importances (TwoDimTable)
   */
  public static class DeepLearningModelOutput extends Model.Output {
    /** Copied from the builder's parameters at construction time and never changed afterwards. */
    final boolean autoencoder;

    // Latest scoring record.
    DeepLearningScoringInfo errors;
    // Keys of the per-layer weight and bias frames.
    Key[] weights;
    Key[] biases;
    // Normalization constants for predictors and response, and categorical offsets.
    double[] normmul;
    double[] normsub;
    double[] normrespmul;
    double[] normrespsub;
    int[] catoffsets;
    public TwoDimTable _variable_importances;

    /**
     * @param b builder this output belongs to; its {@code _parms._autoencoder} flag is captured here
     */
    public DeepLearningModelOutput(DeepLearning b) {
      super(b);
      autoencoder = b._parms._autoencoder;
      // A builder must be either supervised or an auto-encoder, never both and never neither.
      assert b.isSupervised() != autoencoder;
    }

    @Override
    public boolean isAutoencoder() {
      return autoencoder;
    }

    @Override
    public TwoDimTable getVariableImportances() {
      return _variable_importances;
    }

    /** Auto-encoders report their own category; everything else defers to the superclass. */
    @Override
    public ModelCategory getModelCategory() {
      if (autoencoder) {
        return ModelCategory.AutoEncoder;
      }
      return super.getModelCategory();
    }

    /** A model is treated as supervised exactly when it is not an auto-encoder. */
    @Override
    public boolean isSupervised() {
      return !autoencoder;
    }
  } // DeepLearningModelOutput

    /**
     * Initializes the actual parameter values: first runs the superclass implementation,
     * then calls {@code EffectiveParametersUtils.initFoldAssignment} on {@code _parms}.
     */
    @Override
    public void initActualParamValues() {
      super.initActualParamValues();
      EffectiveParametersUtils.initFoldAssignment(_parms);
    }
    
  /**
   * Replaces the model info held by this model.
   * @param mi new model info; must not be null (enforced by an assertion only)
   */
  void set_model_info(DeepLearningModelInfo mi) {
    assert mi != null;
    this.model_info = mi;
  }

  /** @return the model info currently held by this model */
  public final DeepLearningModelInfo model_info() {
    return model_info;
  }

  /** @return the variable importances stored in the output's latest scoring record */
  public final VarImp varImp() {
    return _output.errors.variable_importances;
  }

  // Current model state; replaced via set_model_info() and read via model_info().
  private volatile DeepLearningModelInfo model_info;

  // timing
  public long total_checkpointed_run_time_ms; //time spent in previous models
  public long total_training_time_ms; //total time spent running (training+scoring, including all previous models)
  public long total_scoring_time_ms; //total time spent scoring (including all previous models)
  public long total_setup_time_ms; //total time spent setting up (including all previous models)
  private long time_of_start_ms; //start time for this model (this cp restart)

  // auto-tuning
  public long actual_train_samples_per_iteration; //adjusted by doScoring() when train_samples_per_iteration is auto-tuned
  public double time_for_communication_us; //helper for auto-tuning: time in microseconds for collective bcast/reduce of the model

  // helpers for diagnostics
  public double epoch_counter; //processed samples divided by training_rows (updated in doScoring())
  public int iterations;
  public boolean stopped_early; //set by doScoring() once a stopping criterion has been met
  public long training_rows;
  public long validation_rows;

  // Keep the best model so far, based on a single criterion (overall class. error or MSE)
  private float _bestLoss = Float.POSITIVE_INFINITY;

  // Key under which the best model so far is stored; the from-scratch constructor sets it to null when _nfolds != 0.
  public Key actual_best_model_key;
  // Assigned a fresh key in the from-scratch constructor. NOTE(review): usage is not visible in this part of the file.
  public Key model_info_key;

  /**
   * Latest scoring record, narrowed to the Deep Learning specific type.
   * @return the result of {@code super.last_scored()} cast to DeepLearningScoringInfo
   */
  public DeepLearningScoringInfo last_scored() {
    return (DeepLearningScoringInfo) super.last_scored();
  }


  /**
   * Parameters that are actually used for model building, as opposed to the user-given ones (_parms).
   * The two can differ because defaults get filled in and some invalid combinations are
   * auto-disabled in modifyParams.
   * @return the parameters held by the current model info
   */
  public final DeepLearningParameters get_params() {
    final DeepLearningModelInfo info = model_info;
    return info.get_params();
  }

  /**
   * Creates the metric builder that matches this model's category.
   * @param domain response domain handed to the binomial and multinomial builders
   * @return a metric builder for Binomial, Multinomial, Regression or AutoEncoder models
   */
  @Override
  public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
    final ModelMetrics.MetricBuilder builder;
    switch (_output.getModelCategory()) {
      case Binomial:
        builder = new ModelMetricsBinomial.MetricBuilderBinomial(domain);
        break;
      case Multinomial:
        builder = new ModelMetricsMultinomial.MetricBuilderMultinomial(_output.nclasses(), domain, get_params()._auc_type);
        break;
      case Regression:
        builder = new ModelMetricsRegression.MetricBuilderRegression();
        break;
      case AutoEncoder:
        builder = new ModelMetricsAutoEncoder.MetricBuilderAutoEncoder(_output.nfeatures());
        break;
      default:
        // any other category is not handled by Deep Learning
        throw H2O.unimpl("Invalid ModelCategory " + _output.getModelCategory());
    }
    return builder;
  }

  /**
   * Prepares the output fields related to exported weights and biases.
   * When export is enabled, one frame key per layer is allocated for weights and for biases and the
   * normalization arrays of the data info are referenced from the output; otherwise all of these
   * output fields are cleared.
   * @param destKey Base destination key for output frames
   */
  private void makeWeightsBiases(Key destKey) {
    final boolean exportRequested = model_info.get_params()._export_weights_and_biases;
    if (exportRequested) {
      // one key per entry of _hidden, plus one
      final int layerCount = get_params()._hidden.length + 1;

      _output.weights = new Key[layerCount];
      for (int layer = 0; layer < _output.weights.length; layer++) {
        _output.weights[layer] = Key.make(destKey + ".weights." + layer);
      }

      _output.biases = new Key[layerCount];
      for (int layer = 0; layer < _output.biases.length; layer++) {
        _output.biases[layer] = Key.make(destKey + ".biases." + layer);
      }

      // expose the normalization constants and categorical offsets next to the frames
      _output.normmul = model_info.data_info._normMul;
      _output.normsub = model_info.data_info._normSub;
      _output.normrespmul = model_info.data_info._normRespMul;
      _output.normrespsub = model_info.data_info._normRespSub;
      _output.catoffsets = model_info.data_info._catOffsets;
    } else {
      // export disabled: make sure nothing stale is left in the output
      _output.weights = null;
      _output.biases = null;
      _output.normmul = null;
      _output.normsub = null;
      _output.normrespmul = null;
      _output.normrespsub = null;
      _output.catoffsets = null;
    }
  }

  /**
   * Constructor to restart from a checkpointed model.
   * Copies the checkpoint's model info, timing counters, progress counters and scoring history so
   * that training can continue without interfering with the checkpoint itself.
   * @param destKey New destination key for the model
   * @param parms User-given parameters for checkpoint restart; if null, a clone of the checkpoint's parameters is used
   * @param cp Checkpoint to restart from
   * @param store_best_model Store only the best model instead of the latest one; when true the passed
   *                         data info is deep-copied, otherwise it is shared
   * @param dataInfo Data info that replaces the one copied from the checkpoint; also provides the
   *                 output's column names, types and domains
   */
  public DeepLearningModel(final Key destKey, final DeepLearningParameters parms, final DeepLearningModel cp, final boolean store_best_model, final DataInfo dataInfo) {
    super(destKey, parms == null ? (DeepLearningParameters)cp._parms.clone() : parms, (DeepLearningModelOutput)cp._output.clone());
    assert(_parms != cp._parms); //make sure we have a clone
    model_info = IcedUtils.deepCopy(cp.model_info); //don't want to interfere with model being built, just make a deep copy and store that
    if (store_best_model) {
      model_info.data_info = IcedUtils.deepCopy(dataInfo); //replace previous data_info with updated version that's passed in (contains enum for classification)
    } else {
      model_info.data_info = dataInfo; //shallow clone is ok
      if (parms != null) {
        assert (_parms == parms);
        assert (_parms._checkpoint == parms._checkpoint);
        assert (_parms._checkpoint == cp._key);
      }
    }
    assert(get_params() != cp.model_info().get_params()); //make sure we have a clone
    _dist = DistributionFactory.getDistribution(get_params());
    assert(_dist._family != DistributionFamily.AUTO); // Note: Must use sanitized parameters via get_params() as this._params can still have defaults AUTO, etc.)
    // NOTE(review): cp.actual_best_model_key is dereferenced without a null check, while the from-scratch
    // constructor sets that field to null when _nfolds != 0 - confirm such models are never used as checkpoints.
    actual_best_model_key = cp.actual_best_model_key;
    if (actual_best_model_key.get() == null) {
      // no best model stored under the checkpoint's key: store a deep copy of the checkpoint under a fresh key
      DeepLearningModel best = IcedUtils.deepCopy(cp);
      //best.model_info.data_info = model_info.data_info; // Note: we currently DO NOT use the checkpoint's data info - as data may change during checkpoint restarts
      actual_best_model_key = Key.make(H2O.SELF);
      DKV.put(actual_best_model_key, best);
    }
    // carry over timing; the checkpoint's total training time becomes this model's "previous models" time
    time_of_start_ms = cp.time_of_start_ms;
    total_training_time_ms = cp.total_training_time_ms;
    total_checkpointed_run_time_ms = cp.total_training_time_ms;
    total_scoring_time_ms = cp.total_scoring_time_ms;
    total_setup_time_ms = cp.total_setup_time_ms;
    training_rows = cp.training_rows; //copy the value to display the right number on the model page before training has started
    validation_rows = cp.validation_rows; //copy the value to display the right number on the model page before training has started
    _bestLoss = cp._bestLoss;
    epoch_counter = cp.epoch_counter;
    iterations = cp.iterations;

    // deep clone scoring history
    scoringInfo = cp.scoringInfo.clone();
    for (int i=0; i< scoringInfo.length;++i)
      scoringInfo[i] = IcedUtils.deepCopy(cp.scoringInfo[i]);
    _output.errors = last_scored();
    makeWeightsBiases(destKey);
    _output._scoring_history = DeepLearningScoringInfo.createScoringHistoryTable(scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
    _output._variable_importances = calcVarImp(last_scored().variable_importances);
    // column names, types and domains come from the passed-in data info, not from the checkpoint
    _output.setNames(dataInfo._adaptedFrame.names(), dataInfo._adaptedFrame.typesStr());
    _output._domains = dataInfo._adaptedFrame.domains();
    assert(Arrays.equals(_key._kb, destKey._kb));
  }

  /**
   * Regular constructor (from scratch).
   * Builds the data info and model info, initializes the scoring history (for non-auto-encoders),
   * allocates weight/bias output keys if requested and finally verifies that the model can be
   * serialized within the size limit of a single Value.
   * @param destKey destination key
   * @param parms DL parameters
   * @param output DL model output
   * @param train Training frame
   * @param valid Validation frame
   * @param nClasses Number of classes (1 for regression or autoencoder)
   * @throws IllegalArgumentException if serializing the model fails (the failure is attached as the
   *         cause) or the serialized size exceeds {@code Value.MAX}
   */
  public DeepLearningModel(final Key destKey, final DeepLearningParameters parms, final DeepLearningModelOutput output, Frame train, Frame valid, int nClasses) {
    super(destKey, parms, output);
    final DataInfo dinfo = makeDataInfo(train, valid, _parms, nClasses);
    DKV.put(dinfo);
    _output.setNames(dinfo._adaptedFrame.names(), dinfo._adaptedFrame.typesStr());
    _output._domains = dinfo._adaptedFrame.domains();
    Log.info("Building the model on " + dinfo.numNums() + " numeric features and " + dinfo.numCats() + " (one-hot encoded) categorical features.");
    model_info = new DeepLearningModelInfo(parms, destKey, dinfo, nClasses, train, valid);
    model_info_key = Key.make(H2O.SELF);
    _dist = DistributionFactory.getDistribution(get_params());
    assert(_dist._family != DistributionFamily.AUTO); // Note: Must use sanitized parameters via get_params() as this._params can still have defaults AUTO, etc.)
    actual_best_model_key = Key.make(H2O.SELF);
    if (parms._nfolds != 0) actual_best_model_key = null; // no best-model key when _nfolds is set
    if (!parms._autoencoder) {
      // seed the scoring history with a single empty record so that last_scored() has something to return
      scoringInfo = new DeepLearningScoringInfo[1];
      scoringInfo[0] = new DeepLearningScoringInfo();
      scoringInfo[0].validation = (parms._valid != null);
      scoringInfo[0].time_stamp_ms = System.currentTimeMillis();
      _output.errors = last_scored();
      _output._scoring_history = DeepLearningScoringInfo.createScoringHistoryTable(scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
      _output._variable_importances = calcVarImp(last_scored().variable_importances);
    }
    time_of_start_ms = System.currentTimeMillis();
    makeWeightsBiases(destKey);
    assert _key.equals(destKey);

    // Probe the serialized size up front. Any failure while serializing is reported as "too large",
    // exactly as before, but the original failure is now kept as the cause instead of being discarded.
    long byte_size;
    try {
      byte_size = new AutoBuffer().put(this).buf().length;
    } catch (Throwable t) {
      throw new IllegalArgumentException(technote(5, "Model is too large"), t);
    }
    if (byte_size > Value.MAX)
      throw new IllegalArgumentException(technote(5, "Model is too large"));
  }

  // Wall-clock timestamps (System.currentTimeMillis()) maintained by doScoring() and progressUpdate()
  public long _timeLastIterationEnter; //time of the most recent doScoring() call
  public long _timeLastScoreStart; //start actual scoring
  private long _timeLastScoreEnd;  //finished actual scoring
  private long _timeLastPrintStart; //last time progressUpdate() printed its progress message

  /**
   * Sanity-checks the timing counters (only effective when assertions are enabled):
   * scoring and setup time must fit into the total training time, the total must cover the
   * checkpointed run time, and the checked counters must be non-negative.
   */
  private void checkTimingConsistency() {
    assert total_training_time_ms >= total_scoring_time_ms;
    assert total_training_time_ms >= total_setup_time_ms;
    assert total_training_time_ms >= total_setup_time_ms + total_scoring_time_ms;
    assert total_checkpointed_run_time_ms <= total_training_time_ms;
    assert 0 <= total_checkpointed_run_time_ms;
    assert 0 <= total_training_time_ms;
    assert 0 <= total_scoring_time_ms;
  }

  /**
   * Recomputes the total training time as the checkpointed run time plus the time elapsed since
   * the given job started, then validates the timing counters.
   * @param job_key key of the job whose start time marks the beginning of the current run
   */
  void updateTiming(Key job_key) {
    final long nowMs = System.currentTimeMillis();
    final long jobStartMs = job_key.get().start_time();
    final long elapsedInThisRunMs = nowMs - jobStartMs;
    total_training_time_ms = total_checkpointed_run_time_ms + elapsedInThisRunMs;
    checkTimingConsistency();
  }

  /**
   * Score this DeepLearning model and update its bookkeeping (timing, epoch counter, scoring history,
   * optional weight/bias frames, best-model copy, early-stopping flag and job progress).
   * Actual scoring only happens when training is about to end, when score_each_iteration is set, or
   * when the score_interval and score_duty_cycle limits allow it.
   * @param fTrain potentially downsampled training data for scoring
   * @param fValid  potentially downsampled validation data for scoring (may be null)
   * @param jobKey key of the owning job
   * @param iteration Map/Reduce iteration count
   * @param finalScoring if true, the steps that normally follow a scoring event (best-model update,
   *                     printing of the model, early-stopping checks) are skipped
   * @return true if model building is ongoing
   */
  boolean doScoring(Frame fTrain, Frame fValid, Key jobKey, int iteration, boolean finalScoring) {
    final long now = System.currentTimeMillis();
    final double time_since_last_iter = now - _timeLastIterationEnter;
    updateTiming(jobKey);
    _timeLastIterationEnter = now;
    // number of processed samples expressed in multiples of the training set size
    epoch_counter = (double)model_info().get_processed_total()/training_rows;

    boolean keep_running;
    // Auto-tuning
    // if multi-node and auto-tuning and at least 10 ms for communication (to avoid doing things on multi-JVM on same node),
    // then adjust the auto-tuning parameter 'actual_train_samples_per_iteration' such that the targeted ratio of comm to comp is achieved
    // Note: actual communication time is estimated by the NetworkTest's collective test.
    if (H2O.CLOUD.size() > 1 && get_params()._train_samples_per_iteration == -2 && iteration > 1) {
      Log.debug("Auto-tuning train_samples_per_iteration.");
      if (time_for_communication_us > 1e4) {
        Log.debug("  Time taken for communication: " + PrettyPrint.usecs((long) time_for_communication_us));
        Log.debug("  Time taken for Map/Reduce iteration: " + PrettyPrint.msecs((long) time_since_last_iter, true));
        // communication time is in microseconds, iteration time in milliseconds: convert before dividing
        final double comm_to_work_ratio = (time_for_communication_us * 1e-3) / time_since_last_iter;
        Log.debug("  Ratio of network communication to computation: " + String.format("%.5f", comm_to_work_ratio));
        Log.debug("  target_comm_to_work: " + get_params()._target_ratio_comm_to_comp);
        Log.debug("Old value of train_samples_per_iteration: " + actual_train_samples_per_iteration);
        double correction = get_params()._target_ratio_comm_to_comp / comm_to_work_ratio;
        correction = Math.max(0.5,Math.min(2, correction)); //it's ok to train up to 2x more training rows per iteration, but not fewer than half.
        if (Math.abs(correction) < 0.8 || Math.abs(correction) > 1.2) { //don't correct unless it's significant (avoid slow drift)
          actual_train_samples_per_iteration /= correction;
          actual_train_samples_per_iteration = Math.max(1, actual_train_samples_per_iteration);
          Log.debug("New value of train_samples_per_iteration: " + actual_train_samples_per_iteration);
        } else {
          Log.debug("Keeping value of train_samples_per_iteration the same (would deviate too little from previous value): " + actual_train_samples_per_iteration);
        }
      } else {
        Log.debug("Communication is faster than 10 ms. Not modifying train_samples_per_iteration: " + actual_train_samples_per_iteration);
      }
    }

    // training continues while the requested number of epochs is not reached and no early stop was triggered
    keep_running = (epoch_counter < get_params()._epochs) && !stopped_early;
    final long sinceLastScore = now -_timeLastScoreStart;

    // this is potentially slow - only do every so often
    if( !keep_running || get_params()._score_each_iteration ||
        (sinceLastScore > get_params()._score_interval *1000 //don't score too often
            &&(double)(_timeLastScoreEnd-_timeLastScoreStart)/sinceLastScore < get_params()._score_duty_cycle) ) { //duty cycle
      jobKey.get().update(0,"Scoring on " + fTrain.numRows() + " training samples" +(fValid != null ? (", " + fValid.numRows() + " validation samples") : ""));
      final boolean printme = !get_params()._quiet_mode;
      _timeLastScoreStart = System.currentTimeMillis();
      model_info().computeStats(); //might not be necessary, but is done to be certain that numbers are good
      // record for this scoring event; note that this local shadows the scoringInfo history array (this.scoringInfo)
      DeepLearningScoringInfo scoringInfo = new DeepLearningScoringInfo();
      scoringInfo.time_stamp_ms = _timeLastScoreStart;
      updateTiming(jobKey);
      scoringInfo.total_training_time_ms = total_training_time_ms;
      scoringInfo.total_scoring_time_ms = total_scoring_time_ms;
      scoringInfo.total_setup_time_ms = total_setup_time_ms;
      scoringInfo.epoch_counter = epoch_counter;
      scoringInfo.iterations = iterations;
      scoringInfo.training_samples = (double)model_info().get_processed_total();
      scoringInfo.validation = fValid != null;
      scoringInfo.score_training_samples = fTrain.numRows();
      scoringInfo.is_classification = _output.isClassifier();
      scoringInfo.is_autoencoder = _output.isAutoencoder();

      if (get_params()._autoencoder) {
        if (printme) Log.info("Scoring the auto-encoder.");
        // training
        {
          // the returned frame is only a by-product; the metrics are picked up from the DKV afterwards
          final Frame mse_frame = scoreAutoEncoder(fTrain, Key.make(), false);
          mse_frame.delete();
          ModelMetrics mtrain = ModelMetrics.getFromDKV(this, fTrain); //updated by model.score
          _output._training_metrics = mtrain;
          scoringInfo.scored_train = new ScoreKeeper(mtrain);
        }
        if (fValid != null) {
          final Frame mse_frame = scoreAutoEncoder(fValid, Key.make(), false);
          mse_frame.delete();
          ModelMetrics mtest = ModelMetrics.getFromDKV(this, fValid); //updated by model.score
          _output._validation_metrics = mtest;
          scoringInfo.scored_valid = new ScoreKeeper(mtest);
        }
      } else {
        if (printme) Log.info("Scoring the model.");
        // compute errors
        final String m = model_info().toString();
        if (m.length() > 0) Log.info(m);

        // For GainsLift and Huber, we need the full predictions to compute the model metrics
        boolean needPreds = _output.nclasses() == 2 /* gains/lift table requires predictions */ ||
                            get_params()._distribution == DistributionFamily.huber;

        // Scoring on training data
        hex.ModelMetrics mtrain;
        Frame preds = null;
        if (needPreds) {
          // allocate predictions since they are needed
          preds = score(fTrain);
          mtrain = ModelMetrics.getFromDKV(this, fTrain);
          if (get_params()._distribution == DistributionFamily.huber) {
            // absolute differences between the response ("a") and the predictions ("p")
            // NOTE(review): (byte)3 is the output Vec type code - presumably numeric; confirm against the Vec type constants
            Vec absdiff = new MathUtils.ComputeAbsDiff().doAll(1, (byte)3,
                new Frame(new String[]{"a","p"}, new Vec[]{fTrain.vec(get_params()._response_column), preds.anyVec()})
            ).outputFrame().anyVec();
            double huberDelta = MathUtils.computeWeightedQuantile(fTrain.vec(get_params()._weights_column), absdiff, get_params()._huber_alpha);
            // the Huber delta is only updated when no gradient check is configured
            if (model_info().gradientCheck == null) _dist.setHuberDelta(huberDelta);
          }
        } else {
          // no need to allocate predictions
          ModelMetrics.MetricBuilder mb = scoreMetrics(fTrain);
          mtrain = mb.makeModelMetrics(this,fTrain,fTrain,null);
        }
        if (preds!=null) preds.remove();
        _output._training_metrics = mtrain;
        scoringInfo.scored_train = new ScoreKeeper(mtrain);
        // NOTE(review): mm1 is never read afterwards; the cast itself still throws if the metrics are not supervised
        hex.ModelMetricsSupervised mm1 = (ModelMetricsSupervised)mtrain;
        if (fTrain.numRows() != training_rows) {
          _output._training_metrics._description = "Metrics reported on temporary training frame with " + fTrain.numRows() + " samples";
        } else if (fTrain._key != null && fTrain._key.toString().contains("chunks")){
          _output._training_metrics._description = "Metrics reported on temporary (load-balanced) training frame";
        } else {
          _output._training_metrics._description = "Metrics reported on full training frame";
        }

        // Scoring on validation data
        hex.ModelMetrics mvalid;
        if (fValid != null) {
          preds = null;
          if (needPreds) {
            // allocate predictions since they are needed
            preds = score(fValid);
            mvalid = ModelMetrics.getFromDKV(this, fValid);
          } else {
            // no need to allocate predictions
            ModelMetrics.MetricBuilder mb = scoreMetrics(fValid);
            mvalid = mb.makeModelMetrics(this, fValid, fValid,null);
          }
          if (preds!=null) preds.remove();
          _output._validation_metrics = mvalid;
          scoringInfo.scored_valid = new ScoreKeeper(mvalid);
          if (mvalid != null) {
            if (fValid.numRows() != validation_rows) {
              _output._validation_metrics._description = "Metrics reported on temporary validation frame with " + fValid.numRows() + " samples";
              if (get_params()._score_validation_sampling == DeepLearningParameters.ClassSamplingMethod.Stratified) {
                _output._validation_metrics._description += " (stratified sampling)";
              }
            } else if (fValid._key != null && fValid._key.toString().contains("chunks")){
              _output._validation_metrics._description = "Metrics reported on temporary (load-balanced) validation frame";
            } else {
              _output._validation_metrics._description = "Metrics reported on full validation frame";
            }
          }
        }
      }
      if (get_params()._variable_importances) {
        if (!get_params()._quiet_mode) Log.info("Computing variable importances.");
        final float[] vi = model_info().computeVariableImportances();
        // pair each importance with a coefficient name; only the first vi.length names are used
        scoringInfo.variable_importances = new VarImp(vi, Arrays.copyOfRange(model_info().data_info().coefNames(), 0, vi.length));
      }

      _timeLastScoreEnd = System.currentTimeMillis();
      long scoringTime = _timeLastScoreEnd - _timeLastScoreStart;
      total_scoring_time_ms += scoringTime;
      updateTiming(jobKey);
      // update the scoringInfo object to report proper speed
      scoringInfo.total_training_time_ms = total_training_time_ms;
      scoringInfo.total_scoring_time_ms = total_scoring_time_ms;
      scoringInfo.this_scoring_time_ms = scoringTime;
      // enlarge the error array by one, push latest score back
      if (this.scoringInfo == null) {
        this.scoringInfo = new DeepLearningScoringInfo[]{scoringInfo};
      } else {
        DeepLearningScoringInfo[] err2 = new DeepLearningScoringInfo[this.scoringInfo.length + 1];
        System.arraycopy(this.scoringInfo, 0, err2, 0, this.scoringInfo.length);
        err2[err2.length - 1] = scoringInfo;
        this.scoringInfo = err2;
      }
      _output.errors = last_scored();
      makeWeightsBiases(_key);
      water.util.Timer t = new Timer();
      // store weights and matrices to Frames
      if (_output.weights != null && _output.biases != null) {
        for (int i = 0; i < _output.weights.length; ++i) {
          Frame f = model_info.get_weights(i).toFrame(_output.weights[i]);
          if (i==0) {
            // only the first weight frame gets its columns named after the coefficients
            f.setNames(model_info.data_info.coefNames());
            DKV.put(f);
          }
        }
        for (int i = 0; i < _output.biases.length; ++i) {
          model_info.get_biases(i).toFrame(_output.biases[i]);
        }
        if (!get_params()._quiet_mode)
          Log.info("Writing weights and biases to Frames took " + t.time()/1000. + " seconds.");
      }
      _output._scoring_history = DeepLearningScoringInfo.createScoringHistoryTable(this.scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
      _output._variable_importances = calcVarImp(last_scored().variable_importances);
      _output._model_summary = model_info.createSummaryTable();

      // always keep a copy of the best model so far (based on the following criterion)
      if (!finalScoring) {
        if (actual_best_model_key != null && get_params()._overwrite_with_best_model && (
                // if we have a best_model in DKV, then compare against its error() (unless it's a different model as judged by the network size)
                (DKV.get(actual_best_model_key) != null && (!(loss() >= DKV.get(actual_best_model_key).get().loss()) || !Arrays.equals(model_info().units, DKV.get(actual_best_model_key).get().model_info().units)))
                        ||
                        // otherwise, compare against our own _bestError
                        (DKV.get(actual_best_model_key) == null && loss() < _bestLoss)
        ) ) {
          _bestLoss = loss();
          putMeAsBestModel(actual_best_model_key);
        }
        // print the freshly scored model to ASCII
        if (keep_running && printme)
          Log.info(toString());
        // stop once the training error is at or below the user-requested target
        if ((_output.isClassifier() && last_scored().scored_train._classError <= get_params()._classification_stop)
                || (!_output.isClassifier() && last_scored().scored_train._mse <= get_params()._regression_stop)) {
          Log.info("Achieved requested predictive accuracy on the training data. Model building completed.");
          stopped_early = true;
        }
        // note: stopping metric should be known at this point and setting problemType is redundant
        ScoreKeeper.ProblemType problemType = get_params()._autoencoder ? 
                ScoreKeeper.ProblemType.autoencoder : ScoreKeeper.ProblemType.forSupervised(_output.isClassifier());
        if (ScoreKeeper.stopEarly(ScoringInfo.scoreKeepers(scoring_history()),
                get_params()._stopping_rounds, problemType, get_params()._stopping_metric, get_params()._stopping_tolerance, "model's last", true
        )) {
          Log.info("Convergence detected based on simple moving average of the loss function for the past " + get_params()._stopping_rounds + " scoring events. Model building completed.");
          stopped_early = true;
        }
        if (printme) Log.info("Time taken for scoring and diagnostics: " + PrettyPrint.msecs(scoringInfo.this_scoring_time_ms, true));
      }
    }
    if (stopped_early) {
      // pretend as if we finished all epochs to get the progress bar pretty (especially for N-fold and grid-search)
      ((Job) DKV.getGet(jobKey)).update((long) (get_params()._epochs * training_rows));
      update(jobKey);
      return false;
    }
    progressUpdate(jobKey, keep_running);
    update(jobKey);
    return keep_running;
  }

  /**
   * Reports training progress to the job and, at most once per score interval (or when training
   * is about to end), to the log.
   * @param job_key key of the owning job
   * @param keep_running whether training will continue; a false value forces the log output
   */
  private void progressUpdate(Key job_key, boolean keep_running) {
    updateTiming(job_key);
    final Job job = job_key.get();
    final double fractionDone = job.progress();

    // throughput is measured against the time spent neither scoring nor setting up
    final long pureTrainingMs = total_training_time_ms - total_scoring_time_ms - total_setup_time_ms;
    final int samplesPerSec = (int) (model_info().get_processed_total() * 1000. / pureTrainingMs);
    assert samplesPerSec >= 0 : "negative speed computed! (total_run_time: " + total_training_time_ms + ", total_scoring_time: " + total_scoring_time_ms + ", total_setup_time: " + total_setup_time_ms + ")";

    final StringBuilder status = new StringBuilder();
    status.append("Iterations: ").append(String.format("%,d", iterations));
    status.append(". Epochs: ").append(String.format("%g", epoch_counter));
    status.append(". Speed: ").append(String.format("%,d", samplesPerSec)).append(" samples/sec.");
    if (fractionDone != 0) {
      // extrapolate the remaining time from the time already spent and the fraction completed
      final long remainingMs = (long) (total_training_time_ms * (1. - fractionDone) / fractionDone);
      status.append(" Estimated time left: ").append(PrettyPrint.msecs(remainingMs, true));
    }
    final String msg = status.toString();
    job.update(actual_train_samples_per_iteration, msg); //mark the amount of work done for the progress bar

    final long nowMs = System.currentTimeMillis();
    final long msSinceLastPrint = nowMs - _timeLastPrintStart;
    //print this after every score_interval, not considering duty cycle
    final boolean printDue = !keep_running || msSinceLastPrint > get_params()._score_interval * 1000;
    if (!printDue) {
      return;
    }
    _timeLastPrintStart = nowMs;
    if (get_params()._quiet_mode) {
      return;
    }
    Log.info(
            "Training time: " + PrettyPrint.msecs(total_training_time_ms, true) + " (scoring: " + PrettyPrint.msecs(total_scoring_time_ms, true) + "). "
            + "Processed " + String.format("%,d", model_info().get_processed_total()) + " samples" + " (" + String.format("%.3f", epoch_counter) + " epochs).\n");
    Log.info(msg);
  }

  /** Make either a prediction or a reconstruction.
   * For non-autoencoder models this delegates to the superclass implementation. For autoencoder
   * models it builds a reconstruction frame whose columns are named {@code "reconstr_" + coefficient name},
   * stores that frame in the DKV and returns it as both frames of the result.
   * @param orig Test dataset
   * @param adaptedFr Test dataset, adapted to the model
   * @param destination_key Key under which the output frame is created
   * @param j Job handle, forwarded to the superclass for non-autoencoder models
   * @param computeMetrics Forwarded to the superclass for non-autoencoder models; not read in the autoencoder branch visible here
   * @param customMetricFunc Forwarded to the superclass for non-autoencoder models
   * @return A result holding the metric builder and the frame containing the prediction or reconstruction
   */
  @Override protected PredictScoreResult predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key, Job j, boolean computeMetrics, CFuncRef customMetricFunc) {
    if (!get_params()._autoencoder) {
      return super.predictScoreImpl(orig, adaptedFr, destination_key, j, computeMetrics, customMetricFunc);
    } else {
      // Reconstruction
      final int len = model_info().data_info().fullN();
      // an autoencoder has no response column
      assert(model_info().data_info()._responses == 0);
      String[] coefnames = model_info().data_info().coefNames();
      assert(len == coefnames.length);
      // one output column per (expanded) input coefficient
      String[] names = new String[len];
      for(int i = 0; i < names.length; ++i) {
        names[i] = "reconstr_" + coefnames[i];
      }
      Frame f = new MRTask() {
        @Override public void map( Chunk chks[], NewChunk recon[] ) {
          double tmp [] = new double[_output._names.length];
          double preds[] = new double [len];
          final Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(model_info);
          // NOTE(review): the next statement is truncated in this copy of the file - the row loop body,
          // the end of the MRTask and the declaration of the output frame `of` are missing. Restore from upstream.
          for( int row=0; rowmake(destination_key), names, f.vecs());
      DKV.put(of);
      // metric builder is created with a null argument
      ModelMetrics.MetricBuilder mb = makeMetricBuilder(null);
      return new PredictScoreResult(mb, of, of);
    }
  }

  /**
   * Predict from raw double values without an offset.
   * Delegates to {@link #score0(double[], double[], double)} with an offset of zero.
   * @param data raw input row
   * @param preds array receiving the predictions
   * @return preds
   */
  @Override
  protected double[] score0(double[] data, double[] preds) {
    final double noOffset = 0;
    return score0(data, preds, noOffset);
  }

  /**
   * Compute the loss function
   * Training neurons are created from the current model info, and per-row response and offset
   * buffers are sized to the mini-batch.
   * @param myRows Mini-Batch Array of denseRow's containing numerical/categorical predictor and response data (standardized)
   * @return loss (the trailing expression suggests the accumulated loss is divided by n when n is positive - TODO confirm against upstream)
   */
  public double meanLoss(DataInfo.Row[] myRows) {
    double loss = 0;
    Neurons[] neurons = DeepLearningTask.makeNeuronsForTraining(model_info());
    //for absolute error, gradient -1/1 matches the derivative of abs(x) without correction term
    long seed = -1; //ignored

    // one response / offset slot per row of the mini-batch
    double[] responses = new double[myRows.length];
    double[] offsets   = new double[myRows.length];
    int n=0;
    // NOTE(review): the next statement is truncated in this copy of the file - the mini-batch loop body
    // and the start of the return expression are missing. Restore from upstream.
    for (int mb=0; mb0?loss/n:loss;
  }

  /**
   * Score a single row of raw double values by forward-propagating it through the network.
   * @param data raw array containing categorical values (horizontalized to 1,0,0,1,0,0 etc.) and numerical values (0.35,1.24,5.3234,etc), both can contain NaNs
   * @param preds output array: predicted label slot and per-class probabilities (for classification), predicted target (regression)
   * @param offset offset for this row, handed to forward propagation
   * @return preds; slot 0 is set to -1 for classification because label assignment happens later
   * @throws UnsupportedOperationException if the model is flagged as unstable
   * @throws RuntimeException if a class probability or the regression prediction is NaN
   */
  @Override
  public double[] score0(double[] data, double[] preds, double offset) {
    final int miniBatchIdx = 0;  // the row is placed at index 0 of the mini-batch
    final int miniBatchSize = 1;

    // refuse to score with a model that was flagged as numerically unstable
    if (model_info().isUnstable()) {
      Log.err(unstable_msg);
      throw new UnsupportedOperationException(unstable_msg);
    }

    final Neurons[] net = DeepLearningTask.makeNeuronsForTesting(model_info);
    ((Neurons.Input) net[0]).setInput(-1, data, miniBatchIdx);
    DeepLearningTask.fpropMiniBatch(-1, net, model_info, null, false, null, new double[]{offset}, miniBatchSize);
    final double[] activations = net[net.length - 1]._a[miniBatchIdx].raw();

    // modified_huber: a single output is mapped through the inverse link to the second class probability
    if (get_params()._distribution == DistributionFamily.modified_huber) {
      preds[0] = -1;
      preds[2] = _dist.linkInv(activations[0]);
      preds[1] = 1 - preds[2];
      return preds;
    }

    if (_output.isClassifier()) {
      assert (preds.length == activations.length + 1);
      // class probabilities start at slot 1
      for (int k = 1; k < preds.length; k++) {
        preds[k] = activations[k - 1];
        if (Double.isNaN(preds[k])) {
          throw new RuntimeException("Predicted class probability NaN!");
        }
      }
      // label assignment happens later - explicitly mark it as invalid here
      preds[0] = -1;
      return preds;
    }

    // regression: undo response standardization when it was applied (either both arrays are null or none)
    final boolean responseNormalized = model_info().data_info()._normRespMul != null;
    preds[0] = responseNormalized
            ? (activations[0] / model_info().data_info()._normRespMul[0] + model_info().data_info()._normRespSub[0])
            : activations[0];
    // transform prediction to response space
    preds[0] = _dist.linkInv(preds[0]);
    if (Double.isNaN(preds[0])) {
      throw new RuntimeException("Predicted regression target NaN!");
    }
    return preds;
  }


  /**
   * Score auto-encoded reconstruction (on-the-fly, without allocating the reconstruction as done in Frame score(Frame fr))
   * @param frame Original data (can contain response, will be ignored)
   * @param destination_key Frame Id for output
   * @param reconstruction_error_per_feature whether to return the squared error per feature
   * @return Frame containing one Vec with reconstruction error (MSE) of each reconstructed row, caller is responsible for deletion
   */
  public Frame scoreAutoEncoder(Frame frame, Key destination_key, final boolean reconstruction_error_per_feature) {
    if (!get_params()._autoencoder)
      throw new H2OIllegalArgumentException("Only for AutoEncoder Deep Learning model.", "");
    final int len = _output._names.length;
    Frame adaptFrm = new Frame(frame);
    adaptTestForTrain(adaptFrm, true, false);
    final int outputcols = reconstruction_error_per_feature ? model_info.data_info.fullN() : 1;
    Frame mse = new MRTask() {
      @Override public void map( Chunk chks[], NewChunk[] mse ) {
        double tmp [] = new double[len];
        double out[] = new double[outputcols];
        final Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(model_info);
        for( int row=0; row= model_info().get_params()._hidden.length)
      throw new H2OIllegalArgumentException("hidden layer (index) to extract must be between " + 0 + " and " + (model_info().get_params()._hidden.length-1),"");
    final int len = _output.nfeatures();
    if (isSupervised()) {
      int ridx = frame.find(_output.responseName());
      if (ridx != -1) { // drop the response for scoring!
        frame = new Frame(frame);
        frame.remove(ridx);
      }
    }
    Frame adaptFrm = new Frame(frame);
    //create new features, will be dense
    final int features = model_info().get_params()._hidden[layer];
    Vec v = adaptFrm.anyVec();
    Vec[] vecs = v!=null ? v.makeZeros(features) : null;
    if (vecs == null) throw new IllegalArgumentException("Cannot create deep features from a frame with no columns.");

    Scope.enter();
    adaptTestForTrain(adaptFrm, true, false);
    for (int j=0; j0) sb.append(",");
      sb.append(prefix).append(coefnames[c]);
    }
    return sb.toString();
  }

  /**
   * Emits the declaration section of the generated Java scoring class: accessor methods
   * (isSupervised, nfeatures, nclasses, categorical encoding), input workspaces (NUMS, CATS),
   * normalization constants and, per layer, the flattened weight arrays.
   * @param sb stream the generated source is written to
   * @param fileCtx code generator pipeline, passed on to the superclass
   * @return the stream that was written to
   * @throws IllegalArgumentException if {@code getGenModelEncoding()} returns null, i.e. the categorical encoding is not supported
   */
  @Override protected SBPrintStream toJavaInit(SBPrintStream sb, CodeGeneratorPipeline fileCtx) {
    sb = super.toJavaInit(sb, fileCtx);
    final String mname = JCodeGen.toJavaId(_key.toString());

    final Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(model_info());
    final DeepLearningParameters p = model_info.get_params();
    CategoricalEncoding encoding = getGenModelEncoding();
    if (encoding == null) {
      throw new IllegalArgumentException("Only default, OneHotInternal, Binary, Eigen, LabelEncoder and SortByResponse categorical_encoding scheme is supported for POJO/MOJO");
    }

    sb.ip("public boolean isSupervised() { return " + isSupervised() + "; }").nl();
    sb.ip("public int nfeatures() { return "+_output.nfeatures()+"; }").nl();
    // for autoencoders, nclasses reports the number of units of the last layer instead of the class count
    sb.ip("public int nclasses() { return "+ (p._autoencoder ? neurons[neurons.length-1].units : _output.nclasses()) + "; }").nl();
    // the encoding accessor is only generated for non-AUTO encodings
    if (encoding != CategoricalEncoding.AUTO) {
      sb.ip("public hex.genmodel.CategoricalEncoding getCategoricalEncoding() { return hex.genmodel.CategoricalEncoding." +
              encoding.name() + "; }").nl();
    }
    if (encoding == CategoricalEncoding.Eigen) {
      sb.ip("public double[] getOrigProjectionArray() { return " + PojoUtils.toJavaDoubleArray(_output._orig_projection_array) + "; }").nl();
    }
    // numeric inputs: workspace plus normalization multiplier/offset arrays
    if (model_info().data_info()._nums > 0) {
      sb.i(0).p("// Thread-local storage for input neuron activation values.").nl();
      sb.i(0).p("final double[] NUMS = new double[" + model_info().data_info()._nums +"];").nl();
      JCodeGen.toClassWithArray(sb, "static", "NORMMUL", model_info().data_info()._normMul);//, "Standardization/Normalization scaling factor for numerical variables.");
      JCodeGen.toClassWithArray(sb, "static", "NORMSUB", model_info().data_info()._normSub);//, "Standardization/Normalization offset for numerical variables.");
    }
    // categorical inputs: workspace for the categorical values
    if (model_info().data_info()._cats > 0) {
      sb.i(0).p("// Thread-local workspace for storing categorical input variables.").nl();
      sb.i(0).p("final int[] CATS = new int[" + model_info().data_info()._cats +"];").nl();
    }
    JCodeGen.toStaticVar(sb, "CATOFFSETS", model_info().data_info()._catOffsets, "Offset into the workspace for categorical variables.");
    // response normalization constants are only emitted when the response was normalized
    if (model_info().data_info()._normRespMul != null) {
      JCodeGen.toStaticVar(sb, "NORMRESPMUL", model_info().data_info()._normRespMul, "Standardization/Normalization scaling factor for response.");
      JCodeGen.toStaticVar(sb, "NORMRESPSUB", model_info().data_info()._normRespSub, "Standardization/Normalization offset for response.");
    }
    if (p._hidden_dropout_ratios != null) {
      JCodeGen.toStaticVar(sb, "HIDDEN_DROPOUT_RATIOS", p._hidden_dropout_ratios, "Hidden layer dropout ratios.");
    }

    final int[] layers = new int[neurons.length];
    // NOTE(review): the next two statements are truncated in this copy of the file - the loop headers and
    // the code between them (including the opening of the construct closed by "});" below) are missing.
    // Restore from upstream.
    for (int i=0;i0) {
            for (int j=0; j 0) {
            out.i().p("// Neuron weights connecting ").
                p(neurons[i - 1].getClass().getSimpleName()).p(" and ").
                p(neurons[i].getClass().getSimpleName()).
                p(" layer").nl();
          }
          // layer 0 has no incoming weights; otherwise allocate rows*cols floats for the matrix feeding layer i
          float[]
              weights =
              i == 0 ? null : new float[model_info().get_weights(i - 1).rows() * model_info()
                  .get_weights(i - 1).cols()];
          if (i > 0) {
            final int rows = model_info().get_weights(i - 1).rows();
            final int cols = model_info().get_weights(i - 1).cols();
            // flatten row by row: element (j,k) lands at index j*cols+k
            for (int j = 0; j < rows; ++j)
              for (int k = 0; k < cols; ++k)
                weights[j * cols + k] = model_info().get_weights(i - 1).get(j, k);
          }
          JCodeGen.toClassWithArray(out, null, colInfoClazz, weights);
        }
      }
    });

    return sb;
  }

  /**
   * Size check used by Java code generation.
   * @return true when {@code model_info.size()} exceeds 1e6
   */
  @Override
  protected boolean toJavaCheckTooBig() {
    final double sizeLimit = 1e6;
    return model_info.size() > sizeLimit;
  }

  private SBPrintStream pureMatVec(final SBPrintStream bodySb) {
    bodySb.i(1).p("int cols = ACTIVATION[i-1].length;").nl();
    bodySb.i(1).p("int rows = ACTIVATION[i].length;").nl();
    bodySb.i(1).p("int extra=cols-cols%8;").nl();
    bodySb.i(1).p("int multiple = (cols/8)*8-1;").nl();
    bodySb.i(1).p("int idx = 0;").nl();
    bodySb.i(1).p("float[] a = WEIGHT[i];").nl();
    bodySb.i(1).p("double[] x = ACTIVATION[i-1];").nl();
    bodySb.i(1).p("double[] y = BIAS[i];").nl();
    bodySb.i(1).p("double[] res = ACTIVATION[i];").nl();
    bodySb.i(1).p("for (int row=0; row 0) bodySb.i().p("java.util.Arrays.fill(NUMS,0);").nl();
    if (cats > 0) bodySb.i().p("java.util.Arrays.fill(CATS,0);").nl();
    bodySb.i().p("int i = 0, ncats = 0;").nl();
    if (cats > 0) {
      bodySb.i().p("for(; i<"+cats+"; ++i) {").nl();
      bodySb.i(1).p("if (!Double.isNaN(data[i])) {").nl();
      bodySb.i(2).p("int c = (int) data[i];").nl();
      if (model_info().data_info()._useAllFactorLevels)
        bodySb.i(2).p("CATS[ncats] = c + CATOFFSETS[i];").nl();
      else {
        bodySb.i(2).p("if (c != 0) {").nl();
        bodySb.i(3).p("CATS[ncats] = c + CATOFFSETS[i] - 1;").nl();
        bodySb.i(2).p("} else {").nl();
        bodySb.i(3).p("CATS[ncats] = -1;").nl();
        bodySb.i(2).p("}").nl();
      }
      bodySb.i(1).p("} else {").nl();  // set CAT level when encountering NAN
      bodySb.i(2).p("CATS[ncats] = CATOFFSETS[i+1]-1;").nl();
      bodySb.i(1).p("}").nl();
      bodySb.i(1).p("ncats++;").nl();
      bodySb.i().p("}").nl();
    }
    if (nums > 0) {
      bodySb.i().p("final int n = data.length;").nl();
      bodySb.i().p("for(; i 0 ? "-" + cats : "") + "] = Double.isNaN(data[i]) ? 0 : ");
      if (model_info().data_info()._normMul != null) {
        bodySb.p("(data[i] - NORMSUB.VALUES[i" + (cats > 0 ? "-" + cats : "") + "])*NORMMUL.VALUES[i" + (cats > 0 ? "-" + cats : "") + "];").nl();
      } else {
        bodySb.p("data[i];").nl();
      }
      bodySb.i(0).p("}").nl();
    }
    bodySb.i().p("java.util.Arrays.fill(ACTIVATION[0],0);").nl();
    if (cats > 0) {
      bodySb.i().p("for (i=0; i= 0) ACTIVATION[0][CATS[i]] = 1;").nl();
      bodySb.i(0).p("}").nl();
    }
    if (nums > 0) {
      bodySb.i().p("for (i=0; i channel[maxK]) maxK=k;").nl();
        bodySb.i(3).p("}").nl();
        bodySb.i(3).p("ACTIVATION[i][r] = channel[maxK];").nl();
    } else {
      // optimized
      pureMatVec(bodySb);
      // Activation function
      bodySb.i(1).p("if " + stopping + " {").nl();
      bodySb.i(2).p("for (int r=0; rmax) max = ACTIVATION[i][r];").nl();
      bodySb.i(2).p("}").nl();
      bodySb.i(2).p("double scale = 0;").nl();
      bodySb.i(2).p("for (int r=0; r 0) {
        int ns = model_info().data_info().numStart();
        bodySb.i(2).p("for (int k=" + ns + "; k<" + model_info().data_info().fullN() + "; ++k) {").nl();
        bodySb.i(3).p("preds[k] = preds[k] / NORMMUL.VALUES[k-" + ns + "] + NORMSUB.VALUES[k-" + ns + "];").nl();
        bodySb.i(2).p("}").nl();
      }
      bodySb.i(1).p("}").nl();
      bodySb.i().p("}").nl();
      // DEBUGGING
//      bodySb.i().p("System.out.println(java.util.Arrays.toString(data));").nl();
//      bodySb.i().p("System.out.println(java.util.Arrays.toString(ACTIVATION[0]));").nl();
//      bodySb.i().p("System.out.println(java.util.Arrays.toString(ACTIVATION[ACTIVATION.length-1]));").nl();
//      bodySb.i().p("System.out.println(java.util.Arrays.toString(preds));").nl();
//      bodySb.i().p("System.out.println(\"\");").nl();
    }
    if (_output.autoencoder) return;
    if (_output.isClassifier()) {
      if (get_params()._balance_classes)
        bodySb.ip("hex.genmodel.GenModel.correctProbabilities(preds, PRIOR_CLASS_DISTRIB, MODEL_CLASS_DISTRIB);").nl();
      bodySb.ip("preds[0] = hex.genmodel.GenModel.getPrediction(preds, PRIOR_CLASS_DISTRIB, data, " + defaultThreshold()+");").nl();
    } else {
      bodySb.ip("preds[0] = preds[1];").nl();
    }
  }

  /**
   * Message that is logged and thrown by {@code score0} when a prediction is requested from a model
   * whose {@code model_info().isUnstable()} flag is set. The text is passed through {@code technote(4, ...)}.
   */
  private final String unstable_msg = technote(4,
      "\n\nTrying to predict with an unstable model." +
          "\nJob was aborted due to observed numerical instability (exponential growth)."
          + "\nEither the weights or the bias values are unreasonably large or lead to large activation values."
          + "\nTry a different initial distribution, a bounded activation function (Tanh), adding regularization"
          + "\n(via max_w2, l1, l2, dropout) or learning rate (either enable adaptive_rate or use a smaller learning rate or faster annealing).");

  /**
   * Combines the superclass checksum with the checksum of the model info.
   * @return product of the two checksums
   */
  @Override
  protected long checksum_impl() {
    final long baseChecksum = super.checksum_impl();
    final long modelInfoChecksum = model_info.checksum_impl();
    return baseChecksum * modelInfoChecksum;
  }

  /**
   * Deep Learning Parameters
   */
  public static class DeepLearningParameters extends Model.Parameters {
    /** @return the algorithm name, {@code "DeepLearning"} */
    public String algoName() {
      return "DeepLearning";
    }
    /** @return the display name of the algorithm, {@code "Deep Learning"} */
    public String fullName() {
      return "Deep Learning";
    }
    /** @return the fully qualified class name of {@link DeepLearningModel} */
    public String javaName() {
      final Class<DeepLearningModel> modelClass = DeepLearningModel.class;
      return modelClass.getName();
    }
    /** @return the default stopping tolerance for Deep Learning, which is zero */
    @Override
    protected double defaultStoppingTolerance() {
      return 0;
    }
    /** Creates the parameter set and sets {@code _stopping_rounds} to 5. */
    public DeepLearningParameters() {
      // the superclass no-arg constructor is invoked implicitly before this assignment
      _stopping_rounds = 5;
    }
    /**
     * Amount of work used for progress reporting.
     * @return epochs times the number of training rows, rounded up; 1 when no training frame is available
     */
    @Override
    public long progressUnits() {
      if (train() != null) {
        final double totalSamples = _epochs * train().numRows();
        return (long) Math.ceil(totalSamples);
      }
      return 1;
    }
    /** @return 0 when {@code _sparse} is enabled, NaN otherwise */
    @Override
    public double missingColumnsType() {
      if (_sparse) {
        return 0;
      }
      return Double.NaN;
    }

    /**
     * If enabled, store the best model under the destination key of this model at the end of training.
     * Only applicable if training is not cancelled.
     */
    public boolean _overwrite_with_best_model = true;

    public boolean _autoencoder = false;

    public boolean _use_all_factor_levels = true;

    /**
     * If enabled, automatically standardize the data. If disabled, the user must provide properly scaled input data.
     */
    public boolean _standardize = true;

  /*Neural Net Topology*/
    /**
     * The activation function (non-linearity) to be used by the neurons in the hidden layers.
     * Tanh: Hyperbolic tangent function (same as scaled and shifted sigmoid).
     * Rectifier: Chooses the maximum of (0, x) where x is the input value.
     * Maxout: Choose the maximum coordinate of the input vector.
     * With Dropout: Zero out a random user-given fraction of the
     * incoming weights to each hidden layer during training, for each
     * training row. This effectively trains exponentially many models at
     * once, and can improve generalization.
     */
    public Activation _activation = Activation.Rectifier;

    /**
     * The number and size of each hidden layer in the model.
     * For example, if a user specifies "100,200,100" a model with 3 hidden
     * layers will be produced, and the middle hidden layer will have 200
     * neurons.
     */
    public int[] _hidden = new int[]{200, 200};

    /**
     * The number of passes over the training dataset to be carried out.
     * It is recommended to start with lower values for initial experiments.
     * This value can be modified during checkpoint restarts and allows continuation
     * of selected models.
     */
    public double _epochs = 10;

    /**
     * The number of training data rows to be processed per iteration. Note that
     * independent of this parameter, each row is used immediately to update the model
     * with (online) stochastic gradient descent. This parameter controls the
     * synchronization period between nodes in a distributed environment and the
     * frequency at which scoring and model cancellation can happen. For example, if
     * it is set to 10,000 on H2O running on 4 nodes, then each node will
     * process 2,500 rows per iteration, sampling randomly from their local data.
     * Then, model averaging between the nodes takes place, and scoring can happen
     * (dependent on scoring interval and duty factor). Special values are 0 for
     * one epoch per iteration, -1 for processing the maximum amount of data
     * per iteration (if **replicate training data** is enabled, N epochs
     * will be trained per iteration on N nodes, otherwise one epoch). Special value
     * of -2 turns on automatic mode (auto-tuning).
     */
    public long _train_samples_per_iteration = -2;

    public double _target_ratio_comm_to_comp = 0.05;

  /*Adaptive Learning Rate*/
    /**
     * The implemented adaptive learning rate algorithm (ADADELTA) automatically
     * combines the benefits of learning rate annealing and momentum
     * training to avoid slow convergence. Specification of only two
     * parameters (rho and epsilon)  simplifies hyper parameter search.
     * In some cases, manually controlled (non-adaptive) learning rate and
     * momentum specifications can lead to better results, but require the
     * specification (and hyper parameter search) of up to 7 parameters.
     * If the model is built on a topology with many local minima or
     * long plateaus, it is possible for a constant learning rate to produce
     * sub-optimal results. Learning rate annealing allows digging deeper into
     * local minima, while rate decay allows specification of different
     * learning rates per layer.  When the gradient is being estimated in
     * a long valley in the optimization landscape, a large learning rate
     * can cause the gradient to oscillate and move in the wrong
     * direction. When the gradient is computed on a relatively flat
     * surface with small learning rates, the model can converge far
     * slower than necessary.
     */
    public boolean _adaptive_rate = true;

    /**
     * The first of two hyper parameters for adaptive learning rate (ADADELTA).
     * It is similar to momentum and relates to the memory to prior weight updates.
     * Typical values are between 0.9 and 0.999.
     * This parameter is only active if adaptive learning rate is enabled.
     */
    public double _rho = 0.99;

    /**
     * The second of two hyper parameters for adaptive learning rate (ADADELTA).
     * It is similar to learning rate annealing during initial training
     * and momentum at later stages where it allows forward progress.
     * Typical values are between 1e-10 and 1e-4.
     * This parameter is only active if adaptive learning rate is enabled.
     */
    public double _epsilon = 1e-8;

  /*Learning Rate*/
    /**
     * When adaptive learning rate is disabled, the magnitude of the weight
     * updates are determined by the user specified learning rate
     * (potentially annealed), and are a function  of the difference
     * between the predicted value and the target value. That difference,
     * generally called delta, is only available at the output layer. To
     * correct the output at each hidden layer, back propagation is
     * used. Momentum modifies back propagation by allowing prior
     * iterations to influence the current update. Using the momentum
     * parameter can aid in avoiding local minima and the associated
     * instability. Too much momentum can lead to instabilities, that's
     * why the momentum is best ramped up slowly.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _rate = .005;

    /**
     * Learning rate annealing reduces the learning rate to "freeze" into
     * local minima in the optimization landscape.  The annealing rate is the
     * inverse of the number of training samples it takes to cut the learning rate in half
     * (e.g., 1e-6 means that it takes 1e6 training samples to halve the learning rate).
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _rate_annealing = 1e-6;

    /**
     * The learning rate decay parameter controls the change of learning rate across layers.
     * For example, assume the rate parameter is set to 0.01, and the rate_decay parameter is set to 0.5.
     * Then the learning rate for the weights connecting the input and first hidden layer will be 0.01,
     * the learning rate for the weights connecting the first and the second hidden layer will be 0.005,
     * and the learning rate for the weights connecting the second and third hidden layer will be 0.0025, etc.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _rate_decay = 1.0;

  /*Momentum*/
    /**
     * The momentum_start parameter controls the amount of momentum at the beginning of training.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _momentum_start = 0;

    /**
     * The momentum_ramp parameter controls the amount of learning for which momentum increases
     * (assuming momentum_stable is larger than momentum_start). The ramp is measured in the number
     * of training samples.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _momentum_ramp = 1e6;

    /**
     * The momentum_stable parameter controls the final momentum value reached after momentum_ramp training samples.
     * The momentum used for training will remain the same for training beyond reaching that point.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    public double _momentum_stable = 0;

    /**
     * The Nesterov accelerated gradient descent method is a modification to
     * traditional gradient descent for convex functions. The method relies on
     * gradient information at various points to build a polynomial approximation that
     * minimizes the residuals in fewer iterations of the descent.
     */
    public boolean _nesterov_accelerated_gradient = true;

  /*Regularization*/
    /**
     * A fraction of the features for each training row to be omitted from training in order
     * to improve generalization (dimension sampling).
     */
    public double _input_dropout_ratio = 0.0;

    /**
     * A fraction of the inputs for each hidden layer to be omitted from training in order
     * to improve generalization. Defaults to 0.5 for each hidden layer if omitted.
     */
    public double[] _hidden_dropout_ratios;

    /**
     * A regularization method that constrains the absolute value of the weights and
     * has the net effect of dropping some weights (setting them to zero) from a model
     * to reduce complexity and avoid overfitting.
     */
    public double _l1 = 0.0;

    /**
     * A regularization method that constrains the sum of the squared
     * weights. This method introduces bias into parameter estimates, but
     * frequently produces substantial gains in modeling as estimate variance is
     * reduced.
     */
    public double _l2 = 0.0;

    /**
     * A maximum on the sum of the squared incoming weights into
     * any one neuron. This tuning parameter is especially useful for unbounded
     * activation functions such as Rectifier.
     */
    public float _max_w2 = Float.MAX_VALUE;

  /*Initialization*/
    /**
     * The distribution from which initial weights are to be drawn. The default
     * option is an optimized initialization that considers the size of the network.
     * The "uniform" option uses a uniform distribution with a mean of 0 and a given
     * interval. The "normal" option draws weights from a normal
     * distribution with a mean of 0 and a given standard deviation.
     */
    public InitialWeightDistribution _initial_weight_distribution = InitialWeightDistribution.UniformAdaptive;

    /**
     * The scale of the distribution function for Uniform or Normal distributions.
     * For Uniform, the values are drawn uniformly from -initial_weight_scale...initial_weight_scale.
     * For Normal, the values are drawn from a Normal distribution with a standard deviation of initial_weight_scale.
     */
    public double _initial_weight_scale = 1.0;

    /**
     * Frame keys for initial weight matrices
     */
    public Key[] _initial_weights;

    /**
     * Frame keys for initial bias vectors
     */
    public Key[] _initial_biases;

    /**
     * The loss (error) function to be minimized by the model.
     * Cross Entropy loss is used when the model output consists of independent
     * hypotheses, and the outputs can be interpreted as the probability that each
     * hypothesis is true. Cross entropy is the recommended loss function when the
     * target values are class labels, and especially for imbalanced data.
     * It strongly penalizes error in the prediction of the actual class label.
     * Mean Square loss is used when the model output are continuous real values, but can
     * be used for classification as well (where it emphasizes the error on all
     * output classes, not just for the actual class).
     */
    public Loss _loss = Automatic;

  /*Scoring*/
    /**
     * The minimum time (in seconds) to elapse between model scoring. The actual
     * interval is determined by the number of training samples per iteration and the scoring duty cycle.
     */
    public double _score_interval = 5;

    /**
     * The number of training dataset points to be used for scoring. Will be
     * randomly sampled. Use 0 for selecting the entire training dataset.
     */
    public long _score_training_samples = 10000L;

    /**
     * The number of validation dataset points to be used for scoring. Can be
     * randomly sampled or stratified (if "balance classes" is set and "score
     * validation sampling" is set to stratify). Use 0 for selecting the entire
     * validation dataset.
     */
    public long _score_validation_samples = 0L;

    /**
     * Maximum fraction of wall clock time spent on model scoring on training and validation samples,
     * and on diagnostics such as computation of feature importances (i.e., not on training).
     */
    public double _score_duty_cycle = 0.1;

    /**
     * The stopping criteria in terms of classification error (1-accuracy) on the
     * training data scoring dataset. When the error is at or below this threshold,
     * training stops.
     */
    public double _classification_stop = 0;

    /**
     * The stopping criteria in terms of regression error (MSE) on the training
     * data scoring dataset. When the error is at or below this threshold, training
     * stops.
     */
    public double _regression_stop = 1e-6;

    /**
     * Enable quiet mode for less output to standard output.
     */
    public boolean _quiet_mode = false;

    /**
     * Method used to sample the validation dataset for scoring, see Score Validation Samples above.
     */
    public ClassSamplingMethod _score_validation_sampling = ClassSamplingMethod.Uniform;

  /*Misc*/
    /**
     * Gather diagnostics for hidden layers, such as mean and RMS values of learning
     * rate, momentum, weights and biases.
     */
    public boolean _diagnostics = true;

    /**
     * Whether to compute variable importances for input features.
     * The implemented method (by Gedeon) considers the weights connecting the
     * input features to the first two hidden layers.
     */
    public boolean _variable_importances = true;

    /**
     * Enable fast mode (minor approximation in back-propagation), should not affect results significantly.
     */
    public boolean _fast_mode = true;

    /**
     * Increase training speed on small datasets by splitting it into many chunks
     * to allow utilization of all cores.
     */
    public boolean _force_load_balance = true;

    /**
     * Replicate the entire training dataset onto every node for faster training on small datasets.
     */
    public boolean _replicate_training_data = true;

    /**
     * Run on a single node for fine-tuning of model parameters. Can be useful for
     * checkpoint resumes after training on multiple nodes for fast initial
     * convergence.
     */
    public boolean _single_node_mode = false;

    /**
     * Enable shuffling of training data (on each node). This option is
     * recommended if training data is replicated on N nodes, and the number of training samples per iteration
     * is close to N times the dataset size, where all nodes train with (almost) all
     * the data. It is automatically enabled if the number of training samples per iteration is set to -1 (or to N
     * times the dataset size or larger).
     */
    public boolean _shuffle_training_data = false;

    public MissingValuesHandling _missing_values_handling = MissingValuesHandling.MeanImputation;

    public boolean _sparse = false;

    public boolean _col_major = false;

    public double _average_activation = 0;

    public double _sparsity_beta = 0;

    /**
     * Max. number of categorical features, enforced via hashing (Experimental)
     */
    public int _max_categorical_features = Integer.MAX_VALUE;

    /**
     * Force reproducibility on small data (will be slow - only uses 1 thread)
     */
    public boolean _reproducible = false;

    public boolean _export_weights_and_biases = false;

    public boolean _elastic_averaging = false;
    public double _elastic_averaging_moving_rate = 0.9;
    public double _elastic_averaging_regularization = 1e-3;

    // stochastic gradient descent: mini-batch size = 1
    // batch gradient descent: mini-batch size = # training rows
    public int _mini_batch_size = 1;

    /** Options for the {@code _missing_values_handling} parameter. */
    public enum MissingValuesHandling {
      MeanImputation,
      Skip
    }

    /** Options for the {@code _score_validation_sampling} parameter. */
    public enum ClassSamplingMethod {
      Uniform,
      Stratified
    }

    /** Options for the {@code _initial_weight_distribution} parameter. */
    public enum InitialWeightDistribution {
      UniformAdaptive,
      Uniform,
      Normal
    }

    /**
     * Activation functions available for the hidden layers.
     * Every function is listed directly before its "WithDropout" variant.
     */
    public enum Activation {
      Tanh,
      TanhWithDropout,
      Rectifier,
      RectifierWithDropout,
      Maxout,
      MaxoutWithDropout,
      ExpRectifier,
      ExpRectifierWithDropout
    }

    /**
     * Loss functions.
     * Regression: Absolute, Quadratic, Huber, Quantile.
     * Classification: Quadratic, ModifiedHuber or CrossEntropy.
     */
    public enum Loss {
      Automatic,
      Quadratic,
      CrossEntropy,
      ModifiedHuber,
      Huber,
      Absolute,
      Quantile
    }

    /**
     * Validate model parameters.
     * <p>
     * Findings are reported through the builder: {@code dl.error} for invalid settings,
     * {@code dl.warn} for questionable ones and {@code dl.hide} for parameters that do not apply to
     * the current configuration. As a side effect, a {@code null} {@code _hidden} is replaced by an
     * empty array (no hidden layers).
     *
     * @param dl DL Model Builder (Driver)
     * @param expensive (whether or not this is the "final" check)
     */
    void validate(DeepLearning dl, boolean expensive) {
      // As long as the response is not known (cheap check, no classes yet), the task type is guessed from the loss.
      boolean classification = expensive || dl.nclasses() != 0 ? dl.isClassifier() : _loss == CrossEntropy || _loss == ModifiedHuber;
      // hidden_dropout_ratios only make sense for the "...WithDropout" activation functions.
      final boolean dropoutActivation = _activation == Activation.TanhWithDropout
              || _activation == Activation.MaxoutWithDropout
              || _activation == Activation.RectifierWithDropout
              || _activation == Activation.ExpRectifierWithDropout;

      if (_loss == ModifiedHuber) dl.error("_loss", "ModifiedHuber loss function is not supported yet.");
      // A network without hidden layers is allowed: treat null as "no hidden layers".
      if (_hidden == null) _hidden = new int[0];
      for (int h : _hidden) if (h <= 0) dl.error("_hidden", "Hidden layer size must be positive.");
      if (_mini_batch_size < 1)
        dl.error("_mini_batch_size", "Mini-batch size must be >= 1");
      if (!_diagnostics)
        dl.warn("_diagnostics", "Deprecated option: Diagnostics are always enabled.");

      if (!_autoencoder) {
        if (_valid == null)
          dl.hide("_score_validation_samples", "score_validation_samples requires a validation frame.");

        if (classification) {
          dl.hide("_regression_stop", "regression_stop is used only with regression.");
        } else {
          dl.hide("_classification_stop", "classification_stop is used only with classification.");
        }
        if (!classification || _valid == null)
          dl.hide("_score_validation_sampling", "score_validation_sampling requires classification and a validation frame.");
      } else {
        if (_nfolds > 1) {
          dl.error("_nfolds", "N-fold cross-validation is not supported for Autoencoder.");
        }
      }

      // categorical encoding
      if (_categorical_encoding==CategoricalEncodingScheme.Enum) {
        dl.error("_categorical_encoding", "Cannot use Enum encoding for categoricals - need numbers!");
      }
      if (_categorical_encoding==CategoricalEncodingScheme.OneHotExplicit) {
        dl.error("_categorical_encoding", "Won't use explicit Enum encoding for categoricals - it's much faster with OneHotInternal!");
      }

      // dropout
      if (!dropoutActivation) {
        dl.hide("_hidden_dropout_ratios", "hidden_dropout_ratios requires a dropout activation function.");
      }
      if (_hidden_dropout_ratios != null) {
        if (_hidden_dropout_ratios.length != _hidden.length) {
          dl.error("_hidden_dropout_ratios", "Must have " + _hidden.length + " hidden layer dropout ratios.");
        } else if (!dropoutActivation) {
          dl.error("_hidden_dropout_ratios", "Cannot specify hidden_dropout_ratios with a non-dropout activation function. Use 'RectifierWithDropout', 'TanhWithDropout', etc.");
        } else if (ArrayUtils.maxValue(_hidden_dropout_ratios) >= 1 || ArrayUtils.minValue(_hidden_dropout_ratios) < 0) {
          dl.error("_hidden_dropout_ratios", "Hidden dropout ratios must be >= 0 and <1.");
        }
      }
      if (_input_dropout_ratio < 0 || _input_dropout_ratio >= 1)
        dl.error("_input_dropout_ratio", "Input dropout must be >= 0 and <1.");
      if (_score_duty_cycle < 0 || _score_duty_cycle > 1)
        dl.error("_score_duty_cycle", "Score duty cycle must be >= 0 and <=1.");
      if (_l1 < 0)
        dl.error("_l1", "L1 penalty must be >= 0.");
      if (_l2 < 0)
        dl.error("_l2", "L2 penalty must be >= 0.");

      // multi-node settings
      if (H2O.CLOUD.size() == 1 && _replicate_training_data)
        dl.hide("_replicate_training_data", "replicate_training_data is only valid with cloud size greater than 1.");
      if (_single_node_mode && (H2O.CLOUD.size() == 1 || !_replicate_training_data))
        dl.hide("_single_node_mode", "single_node_mode is only used with multi-node operation with replicated training data.");
      if (H2O.ARGS.client && _single_node_mode)
        dl.error("_single_node_mode", "Cannot run on a single node in client mode");
      if (_autoencoder)
        dl.hide("_use_all_factor_levels", "use_all_factor_levels is mandatory in combination with autoencoder.");
      if (_nfolds != 0)
        dl.hide("_overwrite_with_best_model", "overwrite_with_best_model is unsupported in combination with n-fold cross-validation.");

      // learning rate: manual rate/momentum settings and adaptive rate settings are mutually exclusive
      if (_adaptive_rate) {
        dl.hide("_rate", "rate is not used with adaptive_rate.");
        dl.hide("_rate_annealing", "rate_annealing is not used with adaptive_rate.");
        dl.hide("_rate_decay", "rate_decay is not used with adaptive_rate.");
        dl.hide("_momentum_start", "momentum_start is not used with adaptive_rate.");
        dl.hide("_momentum_ramp", "momentum_ramp is not used with adaptive_rate.");
        dl.hide("_momentum_stable", "momentum_stable is not used with adaptive_rate.");
        // the literals compared against below are the values treated as "not specified by the user"
        if (_rate!=0.005) dl.warn("_rate", "rate cannot be specified if adaptive_rate is enabled.");
        if (_rate_annealing!=1e-6) dl.warn("_rate_annealing", "rate_annealing cannot be specified if adaptive_rate is enabled.");
        if (_rate_decay!=1) dl.warn("_rate_decay", "rate_decay cannot be specified if adaptive_rate is enabled.");
        if (_momentum_start!=0) dl.warn("_momentum_start", "momentum_start cannot be specified if adaptive_rate is enabled.");
        if (_momentum_ramp!=1e6) dl.warn("_momentum_ramp", "momentum_ramp cannot be specified if adaptive_rate is enabled.");
        if (_momentum_stable!=0) dl.warn("_momentum_stable", "momentum_stable cannot be specified if adaptive_rate is enabled.");
      } else {
        // ! adaptive_rate
        dl.hide("_rho", "rho is only used with adaptive_rate.");
        dl.hide("_epsilon", "epsilon is only used with adaptive_rate.");
      }

      // initial weights and biases
      if (_initial_weight_distribution == InitialWeightDistribution.UniformAdaptive) {
        dl.hide("_initial_weight_scale", "initial_weight_scale is not used if initial_weight_distribution == UniformAdaptive.");
      }
      if ((_initial_weights != null || _initial_biases != null) && _checkpoint != null) {
        dl.error("_checkpoint", "Cannot specify initial weights or biases during checkpoint restart. Will use the checkpoint model's weights and biases.");
      }
      if (_initial_weights != null && _initial_weights.length!=_hidden.length+1) {
        dl.error("_initial_weights", "The number of initial weights matrices must be " + (_hidden.length+1) + " (some weight matrices can be NULL/None/null).");
      }
      if (_initial_biases != null && _initial_biases.length!=_hidden.length+1) {
        dl.error("_initial_biases", "The number of initial bias vectors must be " + (_hidden.length+1) + " (some bias vectors can be NULL/None/null).");
      }

      // loss function
      if (_loss == null) {
        if (expensive || dl.nclasses() != 0) {
          dl.error("_loss", "Loss function must be specified. Try CrossEntropy for categorical response (classification), ModifiedHuber for binomial response, Quadratic, Absolute or Huber for numerical response (regression).");
        }
        //otherwise, we might not know whether classification=true or false (from R, for example, the training data isn't known when init(false) is called).
      } else {
        if (_autoencoder && _loss == CrossEntropy)
          dl.error("_loss", "Cannot use CrossEntropy loss for auto-encoder.");
        // reported exactly once (a CrossEntropy loss implies _loss != null, so this branch always covers it)
        if (!classification && _loss == CrossEntropy)
          dl.error("_loss", technote(2, "For CrossEntropy loss, the response must be categorical."));
      }
      if (classification && (_loss != Automatic && _loss != CrossEntropy && _loss != Quadratic && _loss != ModifiedHuber))
        dl.error("_loss", "For classification tasks, the loss must be one of: Automatic, Quadratic, CrossEntropy or ModifiedHuber.");

      // distribution must match the task type (and, for regression, the loss function)
      if (classification) {
        switch(_distribution) {
          case gaussian:
          case huber:
          case laplace:
          case quantile:
          case tweedie:
          case gamma:
          case poisson:
            dl.error("_distribution", technote(2, _distribution  + " distribution is not allowed for classification."));
            break;
          case AUTO:
          case bernoulli:
          case modified_huber:
          case multinomial:
          default:
            //OK
            break;
        }
      } else {
        switch(_distribution) {
          case multinomial:
          case bernoulli:
          case modified_huber:
            dl.error("_distribution", technote(2, _distribution  + " distribution is not allowed for regression."));
            break;
          case tweedie:
          case gamma:
          case poisson:
            if (_loss != Automatic)
              dl.error("_distribution", "Only Automatic loss (deviance) is allowed for " + _distribution + " distribution.");
            break;
          case laplace:
            if (_loss != Loss.Absolute && _loss != Automatic)
              dl.error("_distribution", "Only Automatic or Absolute loss is allowed for " + _distribution + " distribution.");
            break;
          case quantile:
            if (_loss != Loss.Quantile && _loss != Automatic)
              dl.error("_distribution", "Only Automatic or Quantile loss is allowed for " + _distribution + " distribution.");
            break;
          case huber:
            if (_loss != Loss.Huber && _loss != Automatic)
              dl.error("_distribution", "Only Automatic or Huber loss is allowed for " + _distribution + " distribution.");
            break;
          case AUTO:
          case gaussian:
          default:
            //OK
            break;
        }
      }

      if (_distribution == DistributionFamily.quasibinomial)
        dl.error("_distribution", "Quasibinomial is not supported for deeplearning in current H2O.");
      if (expensive) dl.checkDistributions();

      // scoring
      if (_score_training_samples < 0)
        dl.error("_score_training_samples", "Number of training samples for scoring must be >= 0 (0 for all).");
      if (_score_validation_samples < 0)
        dl.error("_score_validation_samples", "Number of validation samples for scoring must be >= 0 (0 for all).");

      // autoencoder sparsity
      if (_autoencoder && _sparsity_beta > 0) {
        if (_activation == Activation.Tanh || _activation == Activation.TanhWithDropout || _activation == Activation.ExpRectifier || _activation == Activation.ExpRectifierWithDropout) {
          if (_average_activation >= 1 || _average_activation <= -1)
            dl.error("_average_activation", "Tanh average activation must be in (-1,1).");
        } else if (_activation == Activation.Rectifier || _activation == Activation.RectifierWithDropout) {
          if (_average_activation <= 0)
            dl.error("_average_activation", "Rectifier average activation must be positive.");
        }
      }
      if (!_autoencoder && _sparsity_beta != 0)
        dl.error("_sparsity_beta", "Sparsity beta can only be used for autoencoder.");
      if (classification && dl.hasOffsetCol())
        dl.error("_offset_column", "Offset is only supported for regression.");

      // reason for the error message below is that validation might not have the same horizontalized features as the training data (or different order)
      if (_autoencoder && _activation == Activation.Maxout)
        dl.error("_activation", "Maxout activation is not supported for auto-encoder.");
      if (_max_categorical_features < 1)
        dl.error("_max_categorical_features", "max_categorical_features must be at least 1.");
      if (_col_major)
        dl.error("_col_major", "Deprecated: Column major data handling not supported anymore - not faster.");
      if (!_sparse && _col_major) {
        dl.error("_col_major", "Cannot use column major storage for non-sparse data handling.");
      }
      if (_sparse && _elastic_averaging) {
        dl.error("_elastic_averaging", "Cannot use elastic averaging for sparse data handling.");
      }
      if (_max_w2 <= 0) {
        dl.error("_max_w2", "Cannot use max_w2 <= 0.");
      }

      // checks that are only done for the final validation
      if (expensive) {
        if (!classification && _balance_classes) {
          dl.error("_balance_classes", "balance_classes requires classification.");
        }
        if (_class_sampling_factors != null && !_balance_classes) {
          dl.error("_class_sampling_factors", "class_sampling_factors requires balance_classes to be enabled.");
        }
        if (_replicate_training_data && null != train() && train().byteSize() > 0.9*H2O.CLOUD.free_mem()/H2O.CLOUD.size() && H2O.CLOUD.size() > 1) {
          dl.error("_replicate_training_data", "Compressed training dataset takes more than 90% of avg. free available memory per node (" + 0.9*H2O.CLOUD.free_mem()/H2O.CLOUD.size() + "), cannot run with replicate_training_data.");
        }
      }

      // elastic averaging
      if (!_elastic_averaging) {
        dl.hide("_elastic_averaging_moving_rate", "Elastic averaging is required for this parameter.");
        dl.hide("_elastic_averaging_regularization", "Elastic averaging is required for this parameter.");
      } else {
        if (_elastic_averaging_moving_rate > 1 || _elastic_averaging_moving_rate < 0)
          dl.error("_elastic_averaging_moving_rate", "Elastic averaging moving rate must be between 0 and 1.");
        if (_elastic_averaging_regularization < 0)
          dl.error("_elastic_averaging_regularization", "Elastic averaging regularization strength must be >= 0.");
      }
      if (_autoencoder && _stopping_metric != ScoreKeeper.StoppingMetric.AUTO && _stopping_metric != ScoreKeeper.StoppingMetric.MSE) {
        dl.error("_stopping_metric", "Stopping metric must either be AUTO or MSE for autoencoder.");
      }
    }

    static class Sanity {
      // the following parameters can be modified when restarting from a checkpoint
      // (updateParametersDuringCheckpointRestart() copies exactly these fields from the user-given
      // parameters; checkCompleteness() requires every field declared by DeepLearningParameters to
      // be listed either here or in cp_not_modifiable)
      transient static private final String[] cp_modifiable = new String[]{
              "_seed",
              "_checkpoint",
              "_epochs",
              "_score_interval",
              "_train_samples_per_iteration",
              "_target_ratio_comm_to_comp",
              "_score_duty_cycle",
              "_score_training_samples",
              "_score_validation_samples",
              "_score_validation_sampling",
              "_classification_stop",
              "_regression_stop",
              "_stopping_rounds",
              "_stopping_metric",
              "_stopping_tolerance",
              "_quiet_mode",
              "_max_confusion_matrix_size",
              "_diagnostics",
              "_variable_importances",
              "_initial_weight_distribution", //will be ignored anyway
              "_initial_weight_scale", //will be ignored anyway
              "_initial_weights",
              "_initial_biases",
              "_force_load_balance",
              "_replicate_training_data",
              "_shuffle_training_data",
              "_single_node_mode",
              "_fast_mode",
              // Allow modification of the regularization parameters after a checkpoint restart
              "_l1",
              "_l2",
              "_max_w2",
              "_input_dropout_ratio",
              "_hidden_dropout_ratios",
              "_loss",
              "_overwrite_with_best_model",
              "_missing_values_handling",
              "_average_activation",
              "_reproducible",
              "_export_weights_and_biases",
              "_elastic_averaging",
              "_elastic_averaging_moving_rate",
              "_elastic_averaging_regularization",
              "_mini_batch_size",
              "_pretrained_autoencoder"
      };

      // the following parameters must not be modified when restarting from a checkpoint
      // (checkIfParameterChangeAllowed() throws if the value of any of these fields differs between
      // the checkpointed and the user-given parameters)
      transient static private final String[] cp_not_modifiable = new String[]{
              "_drop_na20_cols",
              "_response_column",
              "_activation",
              "_use_all_factor_levels",
              "_standardize",
              "_adaptive_rate",
              "_autoencoder",
              "_rho",
              "_epsilon",
              "_sparse",
              "_sparsity_beta",
              "_col_major",
              "_rate",
              "_rate_annealing",
              "_rate_decay",
              "_momentum_start",
              "_momentum_ramp",
              "_momentum_stable",
              "_nesterov_accelerated_gradient",
              "_ignore_const_cols",
              "_max_categorical_features",
              "_nfolds",
              "_distribution",
              "_quantile_alpha",
              "_huber_alpha",
              "_tweedie_power"
      };

      /**
       * Make sure that every field declared by DeepLearningParameters is classified as either
       * modifiable or not modifiable for checkpoint restarts. The fields "_hidden" and
       * "_ignored_columns" as well as the synthetic code-coverage field are exempt.
       * Throws (via H2O.unimpl) for the first unclassified field.
       */
      static void checkCompleteness() {
        for (Field declared : DeepLearningParameters.class.getDeclaredFields()) {
          final String fieldName = declared.getName();
          final boolean classified = ArrayUtils.contains(cp_not_modifiable, fieldName)
                  || ArrayUtils.contains(cp_modifiable, fieldName);
          if (classified) continue;
          if ("_hidden".equals(fieldName)) continue;
          if ("_ignored_columns".equals(fieldName)) continue;
          if ("$jacocoData".equals(fieldName)) continue; // If code coverage is enabled
          throw H2O.unimpl("Please add " + fieldName + " to either cp_modifiable or cp_not_modifiable");
        }
      }

      /**
       * Check that checkpoint continuation is possible.
       * Throws if cross-validation is requested, if the presence of a validation frame, the response
       * column, the hidden layers or the ignored columns differ from the checkpointed model, or if
       * any parameter listed in cp_not_modifiable has a different value.
       *
       * @param oldP old DL parameters (from checkpoint)
       * @param newP new DL parameters (user-given, to restart from checkpoint)
       */
      static void checkIfParameterChangeAllowed(final DeepLearningParameters oldP, final DeepLearningParameters newP) {
        checkCompleteness();
        if (newP._nfolds != 0)
          throw new UnsupportedOperationException("nfolds must be 0: Cross-validation is not supported during checkpoint restarts.");

        final boolean newHasValid = newP._valid != null;
        final boolean oldHasValid = oldP._valid != null;
        if (newHasValid != oldHasValid) {
          throw new H2OIllegalArgumentException("Presence of validation dataset must agree with the checkpointed model.");
        }
        if (!newP._autoencoder) {
          final boolean sameResponse = newP._response_column != null && newP._response_column.equals(oldP._response_column);
          if (!sameResponse) {
            throw new H2OIllegalArgumentException("Response column (" + newP._response_column + ") is not the same as for the checkpointed model: " + oldP._response_column);
          }
        }
        if (!Arrays.equals(newP._hidden, oldP._hidden)) {
          throw new H2OIllegalArgumentException("Hidden layers (" + Arrays.toString(newP._hidden) + ") is not the same as for the checkpointed model: " + Arrays.toString(oldP._hidden));
        }
        if (!Arrays.equals(newP._ignored_columns, oldP._ignored_columns)) {
          throw new H2OIllegalArgumentException("Ignored columns must be the same as for the checkpointed model.");
        }

        // compare the protected parameters before and after and make sure that none of them was changed
        for (Field fBefore : oldP.getClass().getFields()) {
          if (!ArrayUtils.contains(cp_not_modifiable, fBefore.getName())) continue;
          for (Field fAfter : newP.getClass().getFields()) {
            if (!fBefore.equals(fAfter)) continue;
            try {
              final Object valueAfter = fAfter.get(newP);
              final Object valueBefore = fBefore.get(oldP);
              if (valueBefore == null && valueAfter == null)
                continue; // both unset: nothing was changed
              // values are compared by their string representation; a single null counts as a change
              final boolean unchanged = valueBefore != null && valueAfter != null
                      && valueBefore.toString().equals(valueAfter.toString());
              if (!unchanged)
                throw new H2OIllegalArgumentException("Cannot change parameter: '" + fBefore.getName() + "': " + valueBefore + " -> " + valueAfter);
            } catch (IllegalAccessException e) {
              e.printStackTrace();
            }
          }
        }
      }

      /**
       * Update the parameters from checkpoint to user-specified.
       * Only the parameters listed in cp_modifiable are considered. A parameter counts as modified
       * when exactly one of the two values is null, or when the values differ: arrays are compared
       * by content, everything else by its string representation.
       *
       * @param srcParms     source: user-specified parameters
       * @param tgtParms     target: parameters to be modified
       * @param doIt         whether to overwrite target parameters (or just print the message)
       * @param quiet        whether to suppress the notifications about parameter changes
       */
      static void updateParametersDuringCheckpointRestart(DeepLearningParameters srcParms, DeepLearningParameters tgtParms/*actually used during training*/, boolean doIt, boolean quiet) {
        for (Field fTarget : tgtParms.getClass().getFields()) {
          if (!ArrayUtils.contains(cp_modifiable, fTarget.getName())) continue;
          for (Field fSource : srcParms.getClass().getFields()) {
            if (!fTarget.equals(fSource)) continue;
            try {
              final Object oldValue = fTarget.get(tgtParms);
              final Object newValue = fSource.get(srcParms);
              if (sameParameterValue(oldValue, newValue))
                continue; // nothing to apply
              // _quiet_mode is re-read on every iteration since it is itself one of the modifiable parameters
              if (!tgtParms._quiet_mode && !quiet)
                Log.info("Applying user-requested modification of '" + fTarget.getName() + "': " + describeParameterValue(oldValue) + " -> " + describeParameterValue(newValue));
              if (doIt)
                fTarget.set(tgtParms, newValue);
            } catch (IllegalAccessException e) {
              // best effort: skip the parameter, but report the problem in the H2O log instead of stderr
              Log.err("Cannot access parameter '" + fTarget.getName() + "' during checkpoint restart: " + e);
            }
          }
        }
      }

      /** Whether two parameter values are the same: both null, arrays with equal content, or equal string representations. */
      private static boolean sameParameterValue(Object a, Object b) {
        if (a == null || b == null) return a == b;
        if (a.getClass().isArray() && b.getClass().isArray())
          return Arrays.deepEquals(new Object[]{a}, new Object[]{b}); // wrapping also covers primitive arrays
        return a.toString().equals(b.toString());
      }

      /** Human-readable representation of a parameter value (arrays are printed by content). */
      private static String describeParameterValue(Object value) {
        if (value == null || !value.getClass().isArray()) return String.valueOf(value);
        final String wrapped = Arrays.deepToString(new Object[]{value}); // "[[...]]"
        return wrapped.substring(1, wrapped.length() - 1);
      }

      /**
       * Take user-given parameters and turn them into usable, fully populated parameters (e.g., to be used by Neurons during training)
       * All decisions are based on fromParms; all modifications are written to toParms. Unless
       * fromParms._quiet_mode is set, every automatic adjustment is logged.
       *
       * @param fromParms      raw user-given parameters from the REST API (READ ONLY)
       * @param toParms        modified set of parameters, with defaults filled in (WILL BE MODIFIED)
       * @param nClasses       number of classes (1 for regression or autoencoder)
       */
      static void modifyParms(DeepLearningParameters fromParms, DeepLearningParameters toParms, int nClasses) {
        // hidden dropout ratios: default to 0.5 for dropout activation functions, otherwise copy the user's values
        if (fromParms._hidden_dropout_ratios == null) {
          if (fromParms._activation == Activation.TanhWithDropout
                  || fromParms._activation == Activation.MaxoutWithDropout
                  || fromParms._activation == Activation.RectifierWithDropout
                  || fromParms._activation == Activation.ExpRectifierWithDropout
              ) {
            toParms._hidden_dropout_ratios = new double[fromParms._hidden.length];
            if (!fromParms._quiet_mode)
              Log.info("_hidden_dropout_ratios: Automatically setting all hidden dropout ratios to 0.5.");
            Arrays.fill(toParms._hidden_dropout_ratios, 0.5);
          }
        } else {
          toParms._hidden_dropout_ratios = fromParms._hidden_dropout_ratios.clone();
        }
        if (H2O.CLOUD.size() == 1 && fromParms._replicate_training_data) {
          if (!fromParms._quiet_mode)
            Log.info("_replicate_training_data: Disabling replicate_training_data on 1 node.");
          toParms._replicate_training_data = false;
        }
        if (fromParms._single_node_mode && (H2O.CLOUD.size() == 1 || !fromParms._replicate_training_data)) {
          if (!fromParms._quiet_mode)
            Log.info("_single_node_mode: Disabling single_node_mode (only for multi-node operation with replicated training data).");
          toParms._single_node_mode = false;
        }
        if (!fromParms._use_all_factor_levels && fromParms._autoencoder) {
          if (!fromParms._quiet_mode)
            Log.info("_use_all_factor_levels: Automatically enabling all_factor_levels for auto-encoders.");
          toParms._use_all_factor_levels = true;
        }
        // handled (and logged) exactly once
        if (fromParms._overwrite_with_best_model && fromParms._nfolds != 0) {
          if (!fromParms._quiet_mode)
            Log.info("_overwrite_with_best_model: Disabling overwrite_with_best_model in combination with n-fold cross-validation.");
          toParms._overwrite_with_best_model = false;
        }
        if (fromParms._categorical_encoding==CategoricalEncodingScheme.AUTO) {
          if (!fromParms._quiet_mode)
            Log.info("_categorical_encoding: Automatically enabling OneHotInternal categorical encoding.");
          toParms._categorical_encoding = CategoricalEncodingScheme.OneHotInternal;
        }
        if (fromParms._mini_batch_size > 1) {
          // single message with the same "name: text" format as all the other messages
          Log.warn("_mini_batch_size: Only mini-batch size = 1 is supported right now.");
          toParms._mini_batch_size = 1;
        }
        // zero out whichever set of learning rate parameters is not in use
        if (fromParms._adaptive_rate) {
          if (!fromParms._quiet_mode)
            Log.info("_adaptive_rate: Using automatic learning rate. Ignoring the following input parameters: "
                    + "rate, rate_decay, rate_annealing, momentum_start, momentum_ramp, momentum_stable.");
          toParms._rate = 0;
          toParms._rate_decay = 0;
          toParms._rate_annealing = 0;
          toParms._momentum_start = 0;
          toParms._momentum_ramp = 0;
          toParms._momentum_stable = 0;
        } else {
          if (!fromParms._quiet_mode)
            Log.info("_adaptive_rate: Using manual learning rate. Ignoring the following input parameters: "
                    + "rho, epsilon.");
          toParms._rho = 0;
          toParms._epsilon = 0;
        }
        if (fromParms._activation == Activation.Rectifier || fromParms._activation == Activation.RectifierWithDropout) {
          if (fromParms._max_w2 == Float.POSITIVE_INFINITY) {
            if (!fromParms._quiet_mode)
              Log.info("_max_w2: Automatically setting max_w2 to 1000 to keep (unbounded) Rectifier activation in check.");
            toParms._max_w2 = 1e3f;
          }
        }
        if (fromParms._autoencoder && fromParms._stopping_metric == ScoreKeeper.StoppingMetric.AUTO) {
          if (!fromParms._quiet_mode)
            Log.info("_stopping_metric: Automatically setting stopping_metric to MSE for autoencoder.");
          toParms._stopping_metric = ScoreKeeper.StoppingMetric.MSE;
        }

        // Automatically set the distribution
        if (fromParms._distribution == DistributionFamily.AUTO) {
          // For classification, allow AUTO/bernoulli/multinomial with losses CrossEntropy/Quadratic/Huber/Absolute
          if (nClasses > 1) {
            toParms._distribution = nClasses == 2 ? DistributionFamily.bernoulli : DistributionFamily.multinomial;
          }
          else {
            //regression/autoencoder
            switch(fromParms._loss) {
              case Automatic:
              case Quadratic:
                toParms._distribution = DistributionFamily.gaussian;
                break;
              case Absolute:
                toParms._distribution = DistributionFamily.laplace;
                break;
              case Quantile:
                toParms._distribution = DistributionFamily.quantile;
                break;
              case Huber:
                toParms._distribution = DistributionFamily.huber;
                break;
              case ModifiedHuber:
                toParms._distribution = DistributionFamily.modified_huber;
                break;
              default:
                throw H2O.unimpl();
            }
          }
        }

        // Automatically set the loss function (based on the possibly just updated distribution)
        if (fromParms._loss == Automatic) {
          switch (toParms._distribution) {
            // regression
            case gaussian:
              toParms._loss = Quadratic;
              break;
            case quantile:
              toParms._loss = Loss.Quantile;
              break;
            case laplace:
              toParms._loss = Loss.Absolute;
              break;
            case huber:
              toParms._loss = Loss.Huber;
              break;
            case tweedie:
            case poisson:
            case gamma:
              toParms._loss = Automatic; //deviance
              break;

            // classification
            case multinomial:
            case bernoulli:
              toParms._loss = CrossEntropy;
              break;
            case modified_huber:
              toParms._loss = ModifiedHuber;
              break;
            default:
              throw H2O.unimpl();
          }
        }
        if (fromParms._reproducible) {
          if (!fromParms._quiet_mode)
            Log.info("_reproducibility: Automatically enabling force_load_balancing and score_each_iteration to enforce reproducibility. Turning off replicate_training_data.");
          toParms._force_load_balance = true;
          toParms._score_each_iteration = true;
          toParms._replicate_training_data = false;
          if (fromParms._train_samples_per_iteration == -2) {
            toParms._train_samples_per_iteration = -1;
            if (!fromParms._quiet_mode)
              Log.info("_reproducibility: Also setting train_samples_per_iteration to -1 since auto-tuning (-2) was specified.");
          }
        }
      }
    }

  }

  /**
   * @return a new MOJO writer for this model
   */
  @Override
  public DeepLearningMojoWriter getMojo() {
    final DeepLearningMojoWriter mojoWriter = new DeepLearningMojoWriter(this);
    return mojoWriter;
  }

  /**
   * Without variable importances every feature is reported as used. Otherwise a feature counts
   * as used if it is known to the variable importances and its importance is not zero.
   *
   * @param featureName name of the feature
   * @return whether the feature is considered to be used for predictions
   */
  @Override
  public boolean isFeatureUsedInPredict(String featureName) {
    if (!_parms._variable_importances) {
      return true;
    }
    final int position = ArrayUtils.find(varImp()._names, featureName);
    if (position == -1) {
      return false; // unknown feature
    }
    return (double) varImp()._varimp[position] != 0d;
  }

  /**
   * @return true if the parent class reports a huber distribution, or if the distribution of
   *         the parameters returned by get_params() is huber
   */
  @Override
  public boolean isDistributionHuber() {
    if (super.isDistributionHuber()) {
      return true;
    }
    return DistributionFamily.huber == get_params()._distribution;
  }

  /**
   * Translate the categorical encoding of this model to the corresponding genmodel encoding.
   *
   * @return the genmodel encoding, or null if there is no counterpart for the model's encoding
   */
  @Override protected CategoricalEncoding getGenModelEncoding() {
    final CategoricalEncoding genModelEncoding;
    switch (_parms._categorical_encoding) {
      case Binary:
        genModelEncoding = CategoricalEncoding.Binary;
        break;
      case Eigen:
        genModelEncoding = CategoricalEncoding.Eigen;
        break;
      case LabelEncoder:
        genModelEncoding = CategoricalEncoding.LabelEncoder;
        break;
      case AUTO:
      case SortByResponse:
      case OneHotInternal:
        genModelEncoding = CategoricalEncoding.AUTO;
        break;
      default:
        genModelEncoding = null;
        break;
    }
    return genModelEncoding;
  }


}