
// hex.Model Maven / Gradle / Ivy
package hex;
import org.joda.time.DateTime;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import hex.genmodel.GenModel;
import water.*;
import water.fvec.C0DChunk;
import water.fvec.Chunk;
import water.fvec.EnumWrappedVec;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.JCodeGen;
import water.util.Log;
import water.util.MathUtils;
import water.util.SB;
import water.util.TwoDimTable;
import static hex.ModelMetricsMultinomial.getHitRatioTable;
/**
* A Model models reality (hopefully).
* A model can be used to 'score' a row (make a prediction), or a collection of
* rows on any compatible dataset - meaning the row has all the columns with the
* same names as used to build the model and any enum (categorical) columns can
* be adapted.
*/
public abstract class Model, P extends Model.Parameters, O extends Model.Output> extends Lockable {
/**
 * Optional scoring hooks a model may expose in addition to plain prediction.
 * NOTE(review): no implementer is visible in this part of the file; the
 * per-method notes below are inferred from the signatures only - confirm.
 */
public interface DeepFeatures {
// Presumably scores {@code frame} with an auto-encoder and stores the result
// under {@code destination_key}; the flag presumably selects per-feature
// reconstruction error instead of one error per row - TODO confirm.
Frame scoreAutoEncoder(Frame frame, Key destination_key, boolean reconstruction_error_per_feature);
// Presumably returns the activations of hidden layer {@code layer} for every
// row of {@code frame} - TODO confirm layer indexing base.
Frame scoreDeepFeatures(Frame frame, final int layer);
}
/**
 * Threshold used to turn a binomial probability into a class label.
 * <p>
 * Falls back to 0.5 when the model does not have exactly two classes, when no
 * training metrics exist, or when neither metrics object carries an AUC.
 * Otherwise the AUC-derived threshold of the validation metrics wins over the
 * one of the training metrics.
 *
 * @return threshold in 0...1
 */
public final double defaultThreshold() {
  final double fallback = 0.5;
  final boolean binomial = _output.nclasses() == 2;
  if (!binomial || _output._training_metrics == null) {
    return fallback;
  }
  // Validation metrics, when present and carrying an AUC, take precedence.
  if (_output._validation_metrics != null) {
    ModelMetricsBinomial validation = (ModelMetricsBinomial) _output._validation_metrics;
    if (validation._auc != null) {
      return validation._auc.defaultThreshold();
    }
  }
  // Training metrics are known to be non-null at this point.
  ModelMetricsBinomial training = (ModelMetricsBinomial) _output._training_metrics;
  if (training._auc != null) {
    return training._auc.defaultThreshold();
  }
  return fallback;
}
/** @return whatever the model output reports via {@code _output.isSupervised()} */
public final boolean isSupervised() { return _output.isSupervised(); }
/** Model-specific parameter class. Each model sub-class contains
* instance of one of these containing its builder parameters, with
* model-specific parameters. E.g. KMeansModel extends Model and has a
* KMeansParameters extending Model.Parameters; sample parameters include K,
* whether or not to normalize, max iterations and the initial random seed.
*
* The non-transient fields are input parameters to the model-building
* process, and are considered "first class citizens" by the front-end - the
* front-end will cache Parameters (in the browser, in JavaScript, on disk)
* and rebuild Parameter instances from those caches.
*
* WARNING: Model Parameters is not immutable object and ModelBuilder can modify
* them!
*/
public abstract static class Parameters extends Iced {
/** Maximal number of supported levels in response. */
public static final int MAX_SUPPORTED_LEVELS = 1000;
public Key _model_id; // desired Key for this model (otherwise is autogenerated)
public Key _train; // User-Key of the Frame the Model is trained on
public Key _valid; // User-Key of the Frame the Model is validated on, if any
public int _nfolds;
public boolean _keep_cross_validation_predictions;
public enum FoldAssignmentScheme {
AUTO, Random, Modulo
}
public FoldAssignmentScheme _fold_assignment = FoldAssignmentScheme.AUTO;
public Distribution.Family _distribution = Distribution.Family.AUTO;
public double _tweedie_power = 1.5f;
// TODO: This field belongs in the front-end column-selection process and
// NOT in the parameters - because this requires all model-builders to have
// column strip/ignore code.
public String[] _ignored_columns;// column names to ignore for training
public boolean _ignore_const_cols; // True if dropping constant cols
public String _weights_column;
public String _offset_column;
public String _fold_column;
// Scoring a model on a dataset is not free; sometimes it is THE limiting
// factor to model building. By default, partially built models are only
// scored every so many major model iterations - throttled to limit scoring
// costs to less than 10% of the build time. This flag forces scoring for
// every iteration, allowing e.g. more fine-grained progress reporting.
public boolean _score_each_iteration;
/** Supervised models have an expected response they get to train with! */
public String _response_column; // response column name
/** Should all classes be over/under-sampled to balance the class
* distribution? */
public boolean _balance_classes = false;
/** When classes are being balanced, limit the resulting dataset size to
* the specified multiple of the original dataset size. Maximum relative
* size of the training data after balancing class counts (can be less
* than 1.0) */
public float _max_after_balance_size = 5.0f;
/**
* Desired over/under-sampling ratios per class (lexicographic order).
* Only when balance_classes is enabled.
* If not specified, they will be automatically computed to obtain class balance during training.
*/
public float[] _class_sampling_factors;
/** The maximum number (top K) of predictions to use for hit ratio
* computation (for multi-class only, 0 to disable) */
public int _max_hit_ratio_k = 10;
/** For classification models, the maximum size (in terms of classes) of
* the confusion matrix for it to be printed. This option is meant to
* avoid printing extremely large confusion matrices. */
public int _max_confusion_matrix_size = 20;
/**
* A model key associated with a previously trained Deep Learning
* model. This option allows users to build a new model as a
* continuation of a previously generated model.
*/
public Key extends Model> _checkpoint;
// Public no-arg constructor for reflective creation
public Parameters() { _ignore_const_cols = defaultDropConsCols(); }
/** @return the training frame instance */
public final Frame train() { return _train.get(); }
/** @return the validation frame instance, or null
* if a validation frame was not specified */
public final Frame valid() { return _valid==null ? null : _valid.get(); }
/** Read-Lock both training and validation User frames. */
public void read_lock_frames(Job job) {
Frame tr = train();
if (tr != null)
tr.read_lock(job._key);
if (_valid != null && !_train.equals(_valid))
valid().read_lock(job._key);
}
/** Read-UnLock both training and validation User frames. */
public void read_unlock_frames(Job job) {
Frame tr = train();
if( tr != null ) tr.unlock(job._key);
if( _valid != null && !_train.equals(_valid) )
valid().unlock(job._key);
}
// Override in subclasses to change the default; e.g. true in GLM
protected boolean defaultDropNA20Cols() { return false; }
protected boolean defaultDropConsCols() { return true; }
/** Type of missing columns during adaptation between train/test datasets
* Overload this method for models that have sparse data handling - a zero
* will preserve the sparseness. Otherwise, NaN is used.
* @return real-valued number (can be NaN) */
public double missingColumnsType() { return Double.NaN; }
public boolean hasCheckpoint() { return _checkpoint != null; }
// FIXME: this is really horrible hack, Model.Parameters has method checksum_impl,
// but not checksum, the API is totally random :(
public long checksum() {
return checksum_impl();
}
/**
* Compute a checksum based on all non-transient non-static ice-able assignable fields (incl. inherited ones) which have @API annotations.
* Sort the fields first, since reflection gives us the fields in random order and we don't want the checksum to be affected by the field order.
* NOTE: if a field is added to a Parameters class the checksum will differ even when all the previous parameters have the same value. If
* a client wants backward compatibility they will need to compare parameter values explicitly.
* @return checksum
*/
protected long checksum_impl() {
long xs = 0x600DL;
int count = 0;
Field[] fields = Weaver.getWovenFields(this.getClass());
Arrays.sort(fields,
new Comparator() {
public int compare(Field field1, Field field2) {
return field1.getName().compareTo(field2.getName());
}
});
for (Field f : fields) {
final long P = MathUtils.PRIMES[count % MathUtils.PRIMES.length];
Class> c = f.getType();
if (c.isArray()) {
try {
f.setAccessible(true);
if (f.get(this) != null) {
if (c.getComponentType() == Integer.TYPE){
int[] arr = (int[]) f.get(this);
xs = P * xs + (long) Arrays.hashCode(arr);
} else if (c.getComponentType() == Float.TYPE) {
float[] arr = (float[]) f.get(this);
xs = P * xs + (long) Arrays.hashCode(arr);
} else if (c.getComponentType() == Double.TYPE) {
double[] arr = (double[]) f.get(this);
xs = P * xs + (long) Arrays.hashCode(arr);
} else if (c.getComponentType() == Long.TYPE){
long[] arr = (long[]) f.get(this);
xs = P * xs + (long) Arrays.hashCode(arr);
} else {
Object[] arr = (Object[]) f.get(this);
xs = P * xs + (long) Arrays.deepHashCode(arr);
} //else lead to ClassCastException
} else {
xs = P * xs + P;
}
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
} catch (ClassCastException t) {
throw H2O.fail(); //no support yet for int[][] etc.
}
} else {
try {
f.setAccessible(true);
Object value = f.get(this);
if (value != null) {
xs = P * xs + (long)(value.hashCode());
} else {
xs = P * xs + P;
}
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
}
}
count++;
}
xs ^= train().checksum() * (_valid == null ? 17 : valid().checksum());
return xs;
}
}
public P _parms; // TODO: move things around so that this can be protected
/** Warnings collected for this model; starts empty and only ever grows. */
public String [] _warnings = new String[0];
/** Append {@code s} to {@link #_warnings} by replacing the array with a copy
 *  that is one element longer. */
public void addWarning(String s){
  final int oldLength = _warnings.length;
  final String[] grown = Arrays.copyOf(_warnings, oldLength + 1);
  grown[oldLength] = s;
  _warnings = grown;
}
/** Model-specific output class. Each model sub-class contains an instance
 * of one of these containing its "output": the pieces of the model needed
 * for scoring. E.g. KMeansModel has a KMeansOutput extending Model.Output
 * which contains the cluster centers. The output also includes the names,
 * domains and other fields which are determined at training time. */
public abstract static class Output extends Iced {
  /** Columns used in the model and are used to match up with scoring data
   * columns. The last name is the response column name (if any). */
  public String[] _names;
  /** List of Keys to cross-validation models (non-null iff _parms._nfolds > 1 or _parms._fold_column != null) **/
  public Key[] _cross_validation_models;
  /** List of Keys to cross-validation predictions (if requested) **/
  public Key[] _cross_validation_predictions;

  public Output(){this(false,false,false);}

  public Output(boolean hasWeights, boolean hasOffset, boolean hasFold) {
    _hasWeights = hasWeights;
    _hasOffset = hasOffset;
    _hasFold = hasFold;
  }

  /** Any final prep-work just before model-building starts, but after the
   * user has clicked "go". E.g., converting a response column to an enum
   * touches the entire column (can be expensive), makes a parallel vec
   * (Key/Data leak management issues), and might throw IAE if there are too
   * many classes.
   * @param b the builder to capture names/domains/flags from; with null only
   *          the weights/offset/fold flags are initialised (to false)
   * @throws IllegalArgumentException if the builder reports validation errors */
  public Output( ModelBuilder b ) {
    if( b == null ) {
      _hasOffset = false;
      _hasWeights = false;
      _hasFold = false;
      return;
    }
    _isSupervised = b.isSupervised();
    if( b.error_count() > 0 )
      throw new IllegalArgumentException(b.validationErrors());
    // Capture the data "shape" the model is valid on
    _names = b._train.names ();
    _domains= b._train.domains();
    _hasOffset = b.hasOffsetCol();
    _hasWeights = b.hasWeightCol();
    _hasFold = b.hasFoldCol();
    _distribution = b._distribution;
    _priorClassDist = b._priorClassDist;
  }

  /** Returns number of input features (OK for most supervised methods, need to override for unsupervised!) */
  public int nfeatures() {
    return _names.length - (_hasOffset?1:0) - (_hasWeights?1:0) - (_hasFold?1:0) - (isSupervised()?1:0);
  }

  /** Categorical/factor/enum mappings, per column. Null for non-enum cols.
   * Columns match the post-init cleanup columns. The last column holds the
   * response col enums for SupervisedModels. */
  public String[][] _domains;
  /** List of all the associated ModelMetrics objects, so we can delete them
   * when we delete this model. */
  public Key[] _model_metrics = new Key[0];
  /** Job state (CANCELLED, FAILED, DONE). TODO: Really the whole Job
   * (run-time, etc) but that has to wait until Job is split from
   * ModelBuilder. */
  public Job.JobState _status;
  public long _start_time;
  public long _end_time;
  public long _run_time;
  /**
   * Training set metrics obtained during model training
   */
  public ModelMetrics _training_metrics;
  /**
   * Validation set metrics obtained during model training (if a validation data set was specified)
   */
  public ModelMetrics _validation_metrics;
  /**
   * Cross-Validation metrics obtained during model training
   */
  public ModelMetrics _cross_validation_metrics;
  /**
   * User-facing model summary - Display model type, complexity, size and other useful stats
   */
  public TwoDimTable _model_summary;
  /**
   * User-facing model scoring history - 2D table with modeling accuracy as a function of time/trees/epochs/iterations, etc.
   */
  public TwoDimTable _scoring_history;

  protected boolean _isSupervised;
  public boolean isSupervised() { return _isSupervised; }

  // Weights, offset and fold columns are kept at designated positions in the
  // names array (see the *Idx() methods); we only need to know if we have them.
  protected final boolean _hasOffset;
  protected final boolean _hasWeights;
  protected final boolean _hasFold;
  public boolean hasOffset () { return _hasOffset;}
  public boolean hasWeights () { return _hasWeights;}
  public boolean hasFold () { return _hasFold;}

  /** The name of the response column (which is always the last column), or null if unsupervised. */
  public String responseName() { return isSupervised()?_names[responseIdx()]:null;}
  /** The name of the weights column, or null if the model has none. */
  public String weightsName () { return _hasWeights ?_names[weightsIdx()]:null;}
  /** The name of the offset column, or null if the model has none. */
  public String offsetName () { return _hasOffset ?_names[offsetIdx()]:null;}
  /** The name of the fold column, or null if the model has none. */
  public String foldName () { return _hasFold ?_names[foldIdx()]:null;}

  // Column layout is [c1,c2,...,cn,w?,o?,f?,r?]: cn are predictor cols, then
  // the optional weights, offset and fold columns, then the response (only
  // for supervised models). Each index is computed back from the end.
  public int weightsIdx () {
    if(!_hasWeights) return -1;
    return _names.length - (isSupervised()?1:0) - (hasOffset()?1:0) - 1 - (hasFold()?1:0);
  }
  public int offsetIdx () {
    if(!_hasOffset) return -1;
    return _names.length - (isSupervised()?1:0) - (hasFold()?1:0) - 1;
  }
  public int foldIdx () {
    if(!_hasFold) return -1;
    return _names.length - (isSupervised()?1:0) - 1;
  }
  public int responseIdx () {
    if(!isSupervised()) return -1;
    return _names.length-1;
  }

  /** The names of the levels for an enum (categorical) response column. */
  public String[] classNames() {
    assert isSupervised();
    return _domains[_domains.length-1];
  }

  /** Is this model a classification model? (v. a regression or clustering model) */
  public boolean isClassifier() { return isSupervised() && nclasses() > 1; }

  /** @return number of response levels; 1 when the response has no domain */
  public int nclasses() {
    assert isSupervised();
    String[] cns = classNames();
    return cns==null ? 1 : cns.length;
  }

  public double [] _distribution;
  public double [] _modelClassDist;
  public double [] _priorClassDist;

  // Note: some algorithms MUST redefine this method to return other model categories
  public ModelCategory getModelCategory() {
    if(isSupervised())
      return (isClassifier() ?
              (nclasses() > 2 ? ModelCategory.Multinomial : ModelCategory.Binomial) :
              ModelCategory.Regression);
    return ModelCategory.Unknown;
  }

  /** Store {@code mm} in the DKV and remember its key (once) so the metrics
   *  can be removed together with the model.
   *  @return {@code mm}, for flow coding */
  public synchronized ModelMetrics addModelMetrics(ModelMetrics mm) {
    DKV.put(mm);
    for( Key key : _model_metrics ) // Dup removal
      // Compare by value as well as identity: the same key may be
      // represented by distinct instances.
      if( key == mm._key || (key != null && key.equals(mm._key)) ) return mm;
    _model_metrics = Arrays.copyOf(_model_metrics, _model_metrics.length + 1);
    _model_metrics[_model_metrics.length - 1] = mm._key;
    return mm; // Flow coding
  }

  long checksum_impl() {
    // NOTE(review): multiplying by ordinal() yields 0 for whichever
    // ModelCategory constant is declared first; kept as-is so existing
    // checksum values do not change - confirm whether that is intended.
    return (null == _names ? 13 : Arrays.hashCode(_names)) *
            (null == _domains ? 17 : Arrays.deepHashCode(_domains)) *
            getModelCategory().ordinal();
  }

  /** Append every non-null TwoDimTable held in a field of {@code o} to {@code sb}.
   *  @param sb destination buffer
   *  @param o  object whose fields (as listed by {@code Weaver.getWovenFields}) are inspected */
  public void printTwoDimTables(StringBuilder sb, Object o) {
    for (Field f : Weaver.getWovenFields(o.getClass())) {
      Class<?> c = f.getType();
      if (!c.isAssignableFrom(TwoDimTable.class)) continue;
      try {
        f.setAccessible(true);    // must precede get() for non-public fields
        Object value = f.get(o);  // read from the inspected object, not blindly from this
        // The type filter above also admits supertype-typed fields, so
        // check the runtime type before casting.
        if (value instanceof TwoDimTable)
          sb.append(((TwoDimTable) value).toString(1,false /*don't print the full table if too long*/));
      } catch (IllegalAccessException e) {
        e.printStackTrace();
      }
    }
  }

  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    if (_training_metrics!=null) sb.append(_training_metrics.toString());
    if (_validation_metrics!=null) sb.append(_validation_metrics.toString());
    if (_cross_validation_metrics!=null) sb.append(_cross_validation_metrics.toString());
    printTwoDimTables(sb, this);
    return sb.toString();
  }
} // Output
public O _output; // TODO: move things around so that this can be protected
/** Registers {@code mm} with this model's output; simply forwards to
 *  {@code _output.addModelMetrics(mm)} and returns its result. */
public ModelMetrics addMetrics(ModelMetrics mm) { return _output.addModelMetrics(mm); }
/** Factory for the metric builder used while scoring; it is handed the
 *  prediction domain (see the call in BigScore.map). */
public abstract ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain);
/** Full constructor.
 *  @param selfKey key passed on to the superclass constructor
 *  @param parms   builder parameters; asserted to be non-null
 *  @param output  model output; deliberately not null-checked (the original
 *                 note indicates it may not be set yet at this point) */
public Model( Key selfKey, P parms, O output) {
  super(selfKey);
  assert parms != null;
  _parms = parms;
  _output = output;
}
/**
 * Deviance of a single observation. This base implementation always
 * evaluates the deviance of a gaussian {@code Distribution}.
 * @param w observation weight
 * @param y (actual) response
 * @param f (predicted) response in original response space
 * @return value of the deviance
 */
public double deviance(double w, double y, double f) {
  final Distribution gaussian = new Distribution(Distribution.Family.gaussian);
  return gaussian.deviance(w, y, f);
}
/** Adapt a Test/Validation Frame to be compatible for a Training Frame. The
 * intention here is that ModelBuilders can assume the test set has the same
 * count of columns, and within each factor column the same set of
 * same-numbered levels. Extra levels are renumbered past those in the
 * Train set but will still be present in the Test set, thus requiring
 * range-checking.
 *
 * This routine is used before model building (with no Model made yet) to
 * check for compatible datasets, and also used to prepare a large dataset
 * for scoring (with a Model).
 *
 * Adaption does the following things:
 * - Remove any "extra" Vecs appearing only in the test and not the train
 * - Insert any "missing" Vecs appearing only in the train and not the test
 *   with all NAs (see {@link Parameters#missingColumnsType()}). This will
 *   issue a warning, and if the "expensive" flag is false won't actually make
 *   the replacement column but instead will bail-out on the whole adaption
 *   (but will continue looking for more warnings).
 * - If all columns are missing, issue an error.
 * - Renumber matching cat levels to match the Train levels; this might make
 *   "holes" in the Test set cat levels, if some are not in the Test set.
 * - Extra Test levels are renumbered past the end of the Train set, hence
 *   the train and test levels match up to all the train levels; there might
 *   be extra Test levels past that.
 * - For all mis-matched levels, issue a warning.
 *
 * The {@code test} frame is updated in-place to be compatible, by altering
 * the names and Vecs; make a defensive copy if you do not want it modified.
 * There is a fast-path cutout if the test set is already compatible. Since
 * the test-set is conditionally modified with extra EnumWrappedVec optionally
 * added it is recommended to use a Scope enter/exit to track Vec lifetimes.
 *
 * This overload supplies the training names, domains, special column names
 * and missing-value substitute from this model and delegates to the static
 * overload.
 *
 * @param test Testing Frame, updated in-place
 * @param expensive Try hard to adapt; this might involve the creation of
 *   whole Vecs and thus get expensive. If {@code false}, then only adapt if
 *   no warnings and errors; otherwise just the messages are produced.
 *   Created Vecs have to be deleted by the caller (e.g. Scope.enter/exit).
 * @param computeMetrics forwarded unchanged to the static overload
 * @return Array of warnings; zero length (never null) for no warnings.
 * @throws IllegalArgumentException if no columns are in common, or
 *   if any factor column has no levels in common.
 */
public String[] adaptTestForTrain( Frame test, boolean expensive, boolean computeMetrics) {
  final String[] trainNames = _output._names;
  final String weightsCol = _output.weightsName();
  final String offsetCol = _output.offsetName();
  final String foldCol = _output.foldName();
  final String responseCol = _output.responseName();
  final String[][] trainDomains = _output._domains;
  final double missingFill = _parms.missingColumnsType();
  return adaptTestForTrain(trainNames, weightsCol, offsetCol, foldCol, responseCol,
                           trainDomains, test, missingFill, expensive, computeMetrics);
}
/**
 * Static worker behind the instance-level adaptTestForTrain: adapts
 * {@code test} in place so that its columns line up with the training
 * {@code names}/{@code domains}, and returns the warnings collected on the way.
 * Returns an empty array immediately when {@code test} is null or already has
 * equal names and domains.
 *
 * @param names Training column names
 * @param weights Name of column with observation weights, weights are NOT filled in if missing in test frame
 * @param offset Name of column with offset, if not null (i.e. trained with offset), offset MUST be present in test data as well, otherwise can not score and IAE is thrown.
 * @param fold Name of the fold column, may be null (its handling is in the part of the loop that is missing from this copy - TODO confirm)
 * @param response Name of response column, response is NOT filled in if missing in test frame
 * @param domains Training column levels
 * @param test Frame to adapt; restructured in place on success
 * @param missing Substitute for missing columns; usually NaN
 * @param expensive when true, freshly adapted enum Vecs are kept; when false they are removed again and only the messages are produced
 * @param computeMetrics not referenced in the visible part of the body - TODO confirm its role against the complete source
 * @return warnings, never null
 * @throws IllegalArgumentException on incompatible columns (see the messages below)
 * */
public static String[] adaptTestForTrain(String[] names, String weights, String offset, String fold, String response, String[][] domains, Frame test, double missing, boolean expensive, boolean computeMetrics) throws IllegalArgumentException {
if( test == null) return new String[0];
// Fast path cutout: already compatible
String[][] tdomains = test.domains();
if( names == test._names && domains == tdomains )
return new String[0];
// Fast path cutout: already compatible but needs work to test
if( Arrays.equals(names,test._names) && Arrays.deepEquals(domains,tdomains) )
return new String[0];
// Build the validation set to be compatible with the training set.
// Toss out extra columns, complain about missing ones, remap enums
ArrayList msgs = new ArrayList<>();
Vec vvecs[] = new Vec[names.length];
int good = 0; // Any matching column names, at all?
int convNaN = 0;
// NOTE(review): the next line is truncated in this copy of the source - the
// loop header, the column lookup (vec), the isResponse flag, the handling of
// missing columns (convNaN) and the creation of evec/ds are all missing.
// Restore them from the original file before touching this method.
for( int i=0; i= domains[i].length;
// A response whose adapted domain is exactly "train levels + all test levels"
// shares no level with the model.
if( isResponse && vec.domain() != null && ds.length == domains[i].length+vec.domain().length )
throw new IllegalArgumentException("Test/Validation dataset has a categorical response column '"+names[i]+"' with no levels in common with the model");
// Levels beyond the training domain are reported, not rejected.
if (ds.length > domains[i].length)
msgs.add("Test/Validation dataset column '" + names[i] + "' has levels not trained on: " + Arrays.toString(Arrays.copyOfRange(ds, domains[i].length, ds.length)));
if (expensive) { vec = evec; good++; } // Keep it
else { evec.remove(); vec = null; } // No leaking if not-expensive
} else {
good++;
}
} else if( vec.isEnum() ) {
throw new IllegalArgumentException("Test/Validation dataset has categorical column '"+names[i]+"' which is real-valued in the training data");
} else {
good++; // Assumed compatible; not checking e.g. Strings vs UUID
}
}
vvecs[i] = vec;
}
// convNaN is only incremented in the missing part of the loop; presumably it
// counts columns that had to be filled with the 'missing' value - TODO confirm.
if( good == convNaN )
throw new IllegalArgumentException("Test/Validation dataset has no columns in common with the training set");
// Restructure when every column was matched, or when only the response is absent from the test frame.
if( good == names.length || (response != null && test.find(response) == -1 && good == names.length - 1) ) // Only update if got something for all columns
test.restructure(names,vvecs,good);
return msgs.toArray(new String[msgs.size()]);
}
/**
 * Bulk score the frame, and auto-name the resulting predictions frame.
 * Equivalent to {@code score(fr, null)}.
 * @see #score(Frame, String)
 * @param fr frame which should be scored
 * @return A new frame containing the predicted values. For classification it
 * contains a column with prediction and distribution for all
 * response classes. For regression it contains only one column with
 * predicted values.
 * @throws IllegalArgumentException propagated from {@link #score(Frame, String)}
 */
public Frame score(Frame fr) throws IllegalArgumentException {
return score(fr, null);
}
/** Bulk score the frame {@code fr}, producing a Frame result; the 1st
 * Vec is the predicted class, the remaining Vecs are the probability
 * distributions. For Regression (single-class) models, the 1st and only
 * Vec is the prediction value. The result is in the DKV; caller is
 * responsible for deleting.
 * <p>
 * The frame is adapted on a shallow copy; temporary keys created by the
 * adaption are cleaned up even when scoring fails.
 *
 * @param fr frame which should be scored
 * @param destination_key name for the predictions frame, or null to auto-name
 * @return A new frame containing the predicted values. For classification it
 * contains a column with prediction and distribution for all
 * response classes. For regression it contains only one column with
 * predicted values.
 * @throws IllegalArgumentException if the frame cannot be adapted to the model
 */
public Frame score(Frame fr, String destination_key) throws IllegalArgumentException {
  Frame adaptFr = new Frame(fr);
  // Metrics need an actual response; unsupervised models always compute them.
  boolean computeMetrics = (!isSupervised() || adaptFr.find(_output.responseName()) != -1);
  adaptTestForTrain(adaptFr, true, computeMetrics); // Adapt
  try {
    Frame output = predictScoreImpl(fr, adaptFr, destination_key); // Predict & Score
    Vec predicted = output.vecs()[0];      // Modeled/predicted response
    String[] mdomain = predicted.domain(); // Domain of predictions (union of test and train)
    if (_output.isClassifier() && computeMetrics) {
      // Log modest confusion matrices
      ModelMetrics mm = ModelMetrics.getFromDKV(this, fr);
      if (mm != null) { // defensive: nothing to log if the metrics are not in the DKV
        ConfusionMatrix cm = mm.cm();
        if (cm != null && cm._domain != null // don't print table for regression
            && cm._cm.length < _parms._max_confusion_matrix_size /*Print size limitation*/) {
          Log.info(cm.table().toString(1));
        }
        if (mm.hr() != null) {
          Log.info(getHitRatioTable(mm.hr()));
        }
      }
      // Output is in the model's domain, but needs to be mapped to the scored
      // dataset's domain.
      Vec actual = fr.vec(_output.responseName());
      if (actual != null) { // Predict does not have an actual, scoring does
        String[] sdomain = actual.domain(); // Scored/test domain; can be null
        if (sdomain != null && mdomain != sdomain && !Arrays.equals(mdomain, sdomain))
          output.replace(0, new EnumWrappedVec(actual.group().addVec(), actual.get_espc(), sdomain, predicted._key));
      }
    }
    return output;
  } finally {
    // Always drop the temporary keys made by the adaption, even on failure.
    cleanup_adapt(adaptFr, fr);
  }
}
// Remove temp keys. TODO: Really should use Scope but Scope does not
// currently allow nested-key-keepers.
// NOTE(review): this copy of the source is damaged here - the loop line below
// has lost the rest of cleanup_adapt and the beginning of the following
// method (which, judging by the call in score(Frame,String), is
// predictScoreImpl(fr, adaptFrm, destination_key)). The tail that survives
// runs a BigScore with predictions enabled, builds model metrics when
// computeMetrics is set, and returns the output frame under destination_key
// (or a fresh key when it is null). Restore the missing text from the
// original file before editing either method.
static protected void cleanup_adapt( Frame adaptFr, Frame fr ) {
Key[] keys = adaptFr.keys();
for( int i=0; i= 0,computeMetrics, true /*make preds*/).doAll(ncols,adaptFrm);
if (computeMetrics)
bs._mb.makeModelMetrics(this, fr);
return bs.outputFrame((null == destination_key ? Key.make() : Key.make(destination_key)), names, domains);
}
/** Score an already adapted frame without producing a predictions frame.
 * Returns the MetricBuilder that can be used to make model metrics.
 * @param adaptFrm Already adapted frame
 * @return MetricBuilder accumulated over the whole frame
 */
protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) {
  // Metrics need an actual response; unsupervised models always compute them.
  final boolean computeMetrics = (!isSupervised() || adaptFrm.find(_output.responseName()) != -1);
  final int nc = _output.nclasses();
  final int ncols = nc==1?1:nc+1; // Regression has 1 predict col; classification also has class distribution
  // Only the prediction domain is needed here: no output frame is built, so
  // no column names or per-column domains have to be assembled.
  final String[] predDomain;
  if (nc == 1) {
    predDomain = null;
  } else if (!computeMetrics) {
    predDomain = _output._domains[_output._domains.length-1];
  } else {
    predDomain = adaptFrm.lastVec().domain();
  }
  final double[] means = adaptFrm.means();
  final boolean testHasWeights = _output.hasWeights() && adaptFrm.find(_output.weightsName()) >= 0;
  // Score the dataset, building the class distribution & metrics
  BigScore bs = new BigScore(predDomain, ncols, means, testHasWeights, computeMetrics, false /*no preds*/).doAll(adaptFrm);
  return bs._mb;
}
/** Distributed task that scores every row of an adapted frame with the
 *  enclosing model, optionally accumulating metrics and/or emitting
 *  prediction columns. */
private class BigScore extends MRTask {
  final String[] _domain; // Prediction domain; union of test and train classes
  final int _npredcols; // Number of columns in prediction; nclasses+1 - can be less than the prediction domain
  ModelMetrics.MetricBuilder _mb;
  final double[] _mean; // Column means of test frame
  final boolean _computeMetrics; // Whether per-row metrics are accumulated
  final boolean _hasWeights; // Whether the scored frame carries the weights column
  final boolean _makePreds; // Whether prediction columns are written

  BigScore( String[] domain, int ncols, double[] mean, boolean testHasWeights, boolean computeMetrics, boolean makePreds ) {
    // A model trained with weights cannot compute metrics on unweighted data.
    if (_output._hasWeights && computeMetrics && !testHasWeights)
      throw new IllegalArgumentException("Missing weights when computing validation metrics.");
    _domain = domain;
    _npredcols = ncols;
    _mean = mean;
    _computeMetrics = computeMetrics;
    _hasWeights = testHasWeights;
    _makePreds = makePreds;
  }

  @Override public void map( Chunk chks[], NewChunk cpreds[] ) {
    // Constant chunks stand in for absent weights (1) and offsets (0).
    final Chunk weights = (_hasWeights && _computeMetrics)
            ? chks[_output.weightsIdx()]
            : new C0DChunk(1, chks[0]._len);
    final Chunk offsets = _output.hasOffset()
            ? chks[_output.offsetIdx()]
            : new C0DChunk(0, chks[0]._len);
    final double[] features = new double[_output.nfeatures()];
    _mb = Model.this.makeMetricBuilder(_domain);

    Chunk response = null;
    float[] actual = null;
    if (_computeMetrics) {
      if (isSupervised()) {
        actual = new float[1];
        response = chks[_output.responseIdx()];
      } else {
        actual = new float[chks.length];
      }
    }

    final double[] preds = _mb._work; // Sized for the union of test and train classes
    final int rows = chks[0]._len;
    for (int r = 0; r < rows; r++) {
      final double w = weights.atd(r);
      if (w == 0) {
        // Zero-weight rows are not scored; they only get placeholder predictions.
        if (_makePreds)
          for (int c = 0; c < _npredcols; c++)
            cpreds[c].addNum(0);
        continue;
      }
      final double off = offsets.atd(r);
      final double[] scored = score0(chks, w, off, r, features, preds);
      if (_computeMetrics) {
        if (isSupervised()) {
          actual[0] = (float) response.atd(r);
        } else {
          for (int i = 0; i < actual.length; ++i)
            actual[i] = (float) chks[i].atd(r);
        }
        _mb.perRow(preds, actual, w, off, Model.this);
      }
      if (_makePreds)
        for (int c = 0; c < _npredcols; c++) // Output predictions; sized for train only (excludes extra test classes)
          cpreds[c].addNum(scored[c]);
    }
  }

  @Override public void reduce( BigScore bs ) {
    if (_mb != null) _mb.reduce(bs._mb);
  }

  @Override protected void postGlobal() {
    if (_mb != null) _mb.postGlobal();
  }
}
/** Bulk scoring API for one row. Chunks are all compatible with the model,
 * and expect the last Chunks are for the final distribution and prediction.
 * Default method is to just load the data into the tmp array, then call
 * subclass scoring logic. This overload scores with weight 1 and offset 0. */
public double[] score0( Chunk chks[], int row_in_chunk, double[] tmp, double[] preds ) {
return score0(chks, 1, 0, row_in_chunk, tmp, preds);
}
/** Score one row of a chunk set with an explicit weight and offset.
 *  The first {@code tmp.length} chunks are copied into {@code tmp} and handed
 *  to the subclass scoring logic; for classifiers the probabilities are then
 *  optionally corrected for class balancing and the label is stored in slot 0.
 *  @param chks          chunks holding the row, in model column order
 *  @param weight        observation weight
 *  @param offset        observation offset
 *  @param row_in_chunk  row index within the chunks
 *  @param tmp           reusable feature buffer; must have {@code nfeatures()} entries
 *  @param preds         reusable prediction buffer
 *  @return the array returned by the subclass scoring logic */
public double[] score0( Chunk chks[], double weight, double offset, int row_in_chunk, double[] tmp, double[] preds ) {
  assert(_output.nfeatures() == tmp.length);
  int col = 0;
  while (col < tmp.length) {
    tmp[col] = chks[col].atd(row_in_chunk);
    col++;
  }
  final double[] scored = score0(tmp, preds, weight, offset);
  final boolean classifier = isSupervised() && _output.isClassifier();
  if (!classifier) {
    return scored;
  }
  // Correct probabilities obtained from training on oversampled data back to original distribution
  // C.f. http://gking.harvard.edu/files/0s.pdf Eq.(27)
  if (_parms._balance_classes) {
    GenModel.correctProbabilities(scored, _output._priorClassDist, _output._modelClassDist);
  }
  // assign label at the very end (after potentially correcting probabilities)
  scored[0] = GenModel.getPrediction(scored, _output._priorClassDist, tmp, defaultThreshold());
  return scored;
}
/** Subclasses implement the scoring logic. The data is pre-loaded into a
 * re-used temp array, in the order the model expects. The predictions are
 * loaded into the re-used temp array, which is also returned.
 * @param data  feature values of one row
 * @param preds prediction buffer, documented as sized nclasses+1
 * @return the filled prediction array */
protected abstract double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/]);
/** Override scoring logic for models that handle weight/offset.
 * This default only accepts the trivial case (weight 1, offset 0; enforced by
 * an assertion) and forwards to {@code score0(data, preds)}. */
protected double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/], double weight, double offset) {
assert (weight == 1 && offset == 0) : "Override this method for non-trivial weight/offset!";
return score0(data, preds);
}
// Version where the user has just ponied-up an array of data to be scored.
// Data must be in proper order. Handy for JUnit tests.
// Returns the index of the largest entry of the array produced by score0.
// NOTE(review): the buffer is allocated with nclasses() entries although
// score0 documents preds as nclasses+1 (slot 0 for the label); the max index
// is also taken over the whole array including slot 0 - confirm the intent
// before relying on this outside of tests.
public double score(double[] data){ return ArrayUtils.maxIndex(score0(data, new double[_output.nclasses()])); }
/** Removes every model-metrics key recorded in the output, passing
 *  {@code fs} to each removal.
 *  @return {@code fs}, unchanged reference */
@Override protected Futures remove_impl( Futures fs ) {
  final Key[] metricKeys = _output._model_metrics;
  if (metricKeys == null) {
    return fs;
  }
  for (Key metricKey : metricKeys) {
    metricKey.remove(fs);
  }
  return fs;
}
/** Model checksum: the product of the parameter checksum and the output checksum. */
@Override protected long checksum_impl() { return _parms.checksum_impl() * _output.checksum_impl(); }
// ==========================================================================
/** Return the Java source text generated for this model as a String.
 *  The text is whatever {@link #toJava(SB, boolean)} appends to a fresh
 *  buffer. With the default hooks the generated class has the form:
 *
 *    public class MODELNAME extends GenModel {
 *      public hex.ModelCategory getModelCategory() { ... }
 *      public static final String[] NAMES = ...;        // column names
 *      public static final String[][] DOMAINS = ...;    // column domains
 *      public MODELNAME() { super(NAMES,DOMAINS); }
 *      public String getUUID() { ... }
 *      // Pass in data in a double[], pre-aligned to the Model's requirements.
 *      // Jam predictions into the preds[] array; preds[0] is reserved for the
 *      // main prediction (class for classifiers or value for regression),
 *      // and remaining columns hold a probability distribution for classifiers.
 *      public final double[] score0( double[] data, double[] preds ) { ... }
 *    }
 *
 *  @param preview passed through to {@link #toJava(SB, boolean)}
 *  @return the generated text
 */
public final String toJava(boolean preview) {
  final SB generated = toJava(new SB(), preview);
  return generated.toString();
}
/**
 * Appends the standalone Java source for this model to {@code sb}.
 *
 * The output starts with a comment header (license, build information and
 * download/compile instructions), followed by a note when an offset column
 * is configured (Java export does not support it; a warning is also logged).
 * Then comes the generated class itself and, after it, the helper classes
 * collected in the file-level context buffer.
 *
 * Preview mode: when {@code preview} is true and {@link #toJavaCheckTooBig()}
 * reports true, only the header plus a "too large" note is produced. Otherwise
 * a preview is cut down to the first 100000 characters and roughly the first
 * thousand lines of the generated text (the copy loop stops once its counter
 * exceeds 1000), and is returned in a NEW buffer rather than in {@code sb}.
 *
 * @param sb      buffer the source is appended to
 * @param preview true to produce a size-limited preview
 * @return {@code sb}, or a fresh buffer holding the excerpt for previews
 */
public SB toJava( SB sb, boolean preview ) {
  SB fileContext = new SB(); // preserve file context
  String modelName = JCodeGen.toJavaId(_key.toString());
  toJavaFileHeader(sb, modelName);
  if (_parms._offset_column != null) {
    sb.nl();
    sb.nl();
    sb.p("//").nl();
    sb.p("// NOTE: Java model export does not support offset_column.").nl();
    sb.p("//").nl();
    Log.warn("Java model export does not support offset_column.");
  }
  if (preview && toJavaCheckTooBig()) {
    sb.nl();
    sb.nl();
    sb.p("//").nl();
    sb.p("// NOTE: Java model is too large to preview, please download as shown above.").nl();
    sb.p("//").nl();
    return sb;
  }
  sb.p("import java.util.Map;").nl();
  sb.p("import hex.genmodel.GenModel;").nl();
  sb.nl();
  sb.p("public class ").p(modelName).p(" extends GenModel {").nl().ii(1);
  sb.ip("public hex.ModelCategory getModelCategory() { return hex.ModelCategory."+_output.getModelCategory()+"; }").nl();
  toJavaInit(sb, fileContext).nl();
  toJavaNAMES(sb, fileContext);
  toJavaNCLASSES(sb);
  toJavaDOMAINS(sb, fileContext);
  toJavaPROB(sb);
  toJavaSuper(modelName, sb); //
  sb.p(" public String getUUID() { return Long.toString("+checksum()+"L); }").nl();
  toJavaPredict(sb, fileContext);
  sb.p("}").nl().di(1);
  sb.p(fileContext).nl(); // Append file
  return preview ? toJavaPreviewExcerpt(sb) : sb;
}
// Writes the leading comment header: license, build info and the
// download/compile instructions for the generated file.
private void toJavaFileHeader( SB sb, String modelName ) {
  sb.p("// Licensed under the Apache License, Version 2.0").nl();
  sb.p("// http://www.apache.org/licenses/LICENSE-2.0.html").nl();
  sb.p("//").nl();
  sb.p("// AUTOGENERATED BY H2O at ").p(new DateTime().toString()).nl();
  sb.p("// ").p(H2O.ABV.projectVersion()).nl();
  sb.p("//").nl();
  sb.p("// Standalone prediction code with sample test data for ").p(this.getClass().getSimpleName()).p(" named ").p(modelName).nl();
  sb.p("//").nl();
  sb.p("// How to download, compile and execute:").nl();
  sb.p("// mkdir tmpdir").nl();
  sb.p("// cd tmpdir").nl();
  sb.p("// curl http:/").p(H2O.SELF.toString()).p("/3/h2o-genmodel.jar > h2o-genmodel.jar").nl();
  sb.p("// curl http:/").p(H2O.SELF.toString()).p("/3/Models.java/").pobj(_key).p(" > ").p(modelName).p(".java").nl();
  sb.p("// javac -cp h2o-genmodel.jar -J-Xmx2g -J-XX:MaxPermSize=128m ").p(modelName).p(".java").nl();
  // Intentionally disabled since there is no main method in generated code
  // sb.p("// java -cp h2o-genmodel.jar:. -Xmx2g -XX:MaxPermSize=256m -XX:ReservedCodeCacheSize=256m ").p(modelName).nl();
  sb.p("//").nl();
  sb.p("// (Note: Try java argument -XX:+PrintCompilation to show runtime JIT compiler behavior.)").nl();
}
// Copies the leading part of the generated text into a fresh buffer for
// previews; the full text is materialized only once.
private SB toJavaPreviewExcerpt( SB sb ) {
  final String generated = sb._sb.toString();
  String [] first100k = generated.substring(0, Math.min(generated.length(), 100000)).split("\n");
  SB out = new SB();
  int lines=0;
  for (String line : first100k) {
    out.p(line).nl();
    if (lines++ > 1000) break;
  }
  return out;
}
/**
 * Emits the generated class's no-argument constructor, which passes
 * NAMES and DOMAINS to the super class.
 *
 * @param modelName name of the generated class
 * @param sb        buffer to append to
 * @return the buffer returned by the final append call
 */
protected SB toJavaSuper( String modelName, SB sb ) {
  final String constructor = "public " + modelName + "() { super(NAMES,DOMAINS); }";
  return sb.nl().ip(constructor).nl();
}
/**
 * Emits the NAMES constant into the generated class and, into the file-level
 * buffer, a holder class containing the first {@code _output.nfeatures()}
 * column names.
 *
 * @param sb            buffer of the generated class
 * @param fileContextSB buffer for code appended after the generated class
 * @return {@code sb}
 */
private SB toJavaNAMES(SB sb, SB fileContextSB) {
  final String holderClass = "NamesHolder_" + JCodeGen.toJavaId(_key.toString());
  // Reference from the model class to the holder's array.
  sb.i().p("// ").p("Names of columns used by model.").nl();
  sb.i().p("public static final String[] NAMES = "+holderClass+".VALUES;").nl();
  // The holder class itself lives at file level and fills in the names.
  fileContextSB.i().p("// The class representing training column names").nl();
  JCodeGen.toClassWithArray(fileContextSB, null, holderClass, Arrays.copyOf(_output._names, _output.nfeatures()));
  return sb;
}
/**
 * Emits the NCLASSES static variable for classifiers; other models leave
 * the buffer untouched.
 *
 * @param sb buffer of the generated class
 * @return {@code sb} for non-classifiers, otherwise the result of {@code JCodeGen.toStaticVar}
 */
protected SB toJavaNCLASSES( SB sb ) {
  if( !_output.isClassifier() )
    return sb;
  return JCodeGen.toStaticVar(sb, "NCLASSES", _output.nclasses(), "Number of output classes included in training data response column.");
}
/**
 * Emits the DOMAINS constant: one entry per element of {@code _output._domains},
 * either {@code null} or a reference to a per-column holder class. The holder
 * classes themselves are written to the file-level buffer.
 *
 * @param sb          buffer of the generated class
 * @param fileContext buffer for code appended after the generated class
 * @return the buffer returned by the final append call
 */
private SB toJavaDOMAINS( SB sb, SB fileContext ) {
  final String holderPrefix = JCodeGen.toJavaId(_key.toString()) + "_ColInfo_";
  sb.nl();
  sb.ip("// Column domains. The last array contains domain of response column.").nl();
  sb.ip("public static final String[][] DOMAINS = new String[][] {").nl();
  final int lastCol = _output._domains.length - 1;
  for (int col = 0; col <= lastCol; col++) {
    final String[] levels = _output._domains[col];
    final String holderClass = holderPrefix + col;
    // Array entry, prefixed with the column name as a comment.
    sb.i(1).p("/* ").p(_output._names[col]).p(" */ ");
    if (levels == null) sb.p("null");
    else sb.p(holderClass).p(".VALUES");
    if (col != lastCol) sb.p(',');
    sb.nl();
    // Columns without a domain need no holder class.
    if (levels == null) continue;
    // The holder carries nothing but the String array with the domain.
    fileContext.ip("// The class representing column ").p(_output._names[col]).nl();
    JCodeGen.toClassWithArray(fileContext, null, holderClass, levels);
  }
  return sb.ip("};").nl();
}
/**
 * For supervised models, emits the PRIOR_CLASS_DISTRIB and
 * MODEL_CLASS_DISTRIB static variables; otherwise emits nothing.
 *
 * @param sb buffer of the generated class
 * @return {@code sb}
 */
protected SB toJavaPROB( SB sb) {
  if( !isSupervised() )
    return sb;
  JCodeGen.toStaticVar(sb, "PRIOR_CLASS_DISTRIB", _output._priorClassDist, "Prior class distribution");
  JCodeGen.toStaticVar(sb, "MODEL_CLASS_DISTRIB", _output._modelClassDist, "Class distribution used for model building");
  return sb;
}
/**
 * Tells the preview path of {@code toJava} whether the generated code is too
 * large to show. This default logs a warning and always answers true, so a
 * preview contains only the header and a "too large" note until a subclass
 * overrides it.
 *
 * @return true in this default implementation
 */
protected boolean toJavaCheckTooBig() {
  final boolean tooBig = true;
  Log.warn("toJavaCheckTooBig must be overridden for this model type to render it in the browser");
  return tooBig;
}
/**
 * Hook for subclasses to add model-specific code at the top of the generated
 * class. The default adds nothing.
 *
 * @param sb          buffer of the generated class
 * @param fileContext buffer for code appended after the generated class
 * @return {@code sb}, unchanged in this default
 */
protected SB toJavaInit(SB sb, SB fileContext) {
  return sb;
}
/**
 * Hook for subclasses to emit the body of the generated scoring method.
 * Code written to {@code cls} is appended to the generated top-level class
 * after the scoring method. The default rejects the request.
 *
 * @param body buffer for the inside of the scoring method
 * @param cls  buffer for class-level code following the scoring method
 * @param file buffer for file-level code
 * @throws IllegalArgumentException always, in this default implementation
 */
protected void toJavaPredictBody(SB body, SB cls, SB file) {
  final String reason = "This model type does not support conversion to Java";
  throw new IllegalArgumentException(reason);
}
/**
 * Emits the generated scoring method: explanatory comments, the signature,
 * the model-specific body, the {@code return preds;} statement, and finally
 * any class-level code the body generator produced.
 *
 * @param ccsb class-context buffer of the generated class
 * @param file buffer for file-level code
 * @return {@code ccsb}
 */
private SB toJavaPredict(SB ccsb, SB file) { // ccsb = classContext
  final String[] explanation = {
    "// Pass in data in a double[], pre-aligned to the Model's requirements.",
    "// Jam predictions into the preds[] array; preds[0] is reserved for the",
    "// main prediction (class for classifiers or value for regression),",
    "// and remaining columns hold a probability distribution for classifiers."
  };
  ccsb.nl();
  for (String commentLine : explanation) {
    ccsb.ip(commentLine).nl();
  }
  ccsb.ip("public final double[] score0( double[] data, double[] preds ) {").nl();
  // Class-level code produced while generating the body; appended after the method.
  final SB afterMethod = new SB().ii(1);
  final SB methodBody = ccsb.ii(1);
  toJavaPredictBody(methodBody, afterMethod, file);
  ccsb.ip("return preds;").nl();
  ccsb.di(1).ip("}").nl();
  ccsb.p(afterMethod);
  return ccsb;
}
// Convenience method for testing: build Java, convert it to a class &
// execute it: compare the results of the new class's (JIT'd) scoring with
// the built-in (interpreted) scoring on this dataset. Returns true if all
// is well, false if there are any mismatches. Throws if there is any error
// (typically an AssertionError or unable to compile the POJO).
public boolean testJavaScoring( Frame data, Frame model_predictions, double rel_epsilon) {
assert data.numRows()==model_predictions.numRows();
final Frame fr = new Frame(data);
boolean computeMetrics = data.find(_output.responseName()) != -1;
try {
String[] warns = adaptTestForTrain(fr,true, computeMetrics);
if( warns.length > 0 )
System.err.println(Arrays.toString(warns));
// Output is in the model's domain, but needs to be mapped to the scored
// dataset's domain.
int[] omap = null;
if( _output.isClassifier() ) {
Vec actual = fr.vec(_output.responseName());
String sdomain[] = actual == null ? null : actual.domain(); // Scored/test domain; can be null
String mdomain[] = model_predictions.vec(0).domain(); // Domain of predictions (union of test and train)
if( sdomain != null && mdomain != sdomain && !Arrays.equals(mdomain, sdomain)) {
EnumWrappedVec ewv = new EnumWrappedVec(mdomain,sdomain);
omap = ewv.enum_map(); // Map from model-domain to scoring-domain
ewv.remove();
}
}
String modelName = JCodeGen.toJavaId(_key.toString());
boolean preview = false;
String java_text = toJava(preview);
//System.out.println(java_text);
GenModel genmodel;
try {
Class clz = JCodeGen.compile(modelName,java_text);
genmodel = (GenModel)clz.newInstance();
} catch (Exception e) {
throw H2O.fail("Internal POJO compilation failed",e);
}
Vec[] dvecs = fr.vecs();
Vec[] pvecs = model_predictions.vecs();
double features [] = MemoryManager.malloc8d(genmodel._names.length);
double predictions[] = MemoryManager.malloc8d(genmodel.nclasses() + 1);
// Compare predictions, counting mis-predicts
int miss = 0;
for( int row=0; row getPublishedKeys() {
List p = Arrays.asList(_output._model_metrics);
p.addAll(super.getPublishedKeys());
return p;
}
/**
 * Deletes this model. The cross-validation models referenced by the output
 * are deleted first, then the super class's delete runs.
 */
@Override
public void delete() {
// Cross-validation sub-models go before the model itself.
deleteCrossValidationModels();
super.delete();
}
/**
 * Deletes every cross-validation model referenced by the output. Keys that
 * no longer resolve to a model are skipped; nothing happens when the output
 * has no cross-validation keys.
 */
private void deleteCrossValidationModels( ) {
  if (_output._cross_validation_models == null)
    return;
  for (Key cvKey : _output._cross_validation_models) {
    Model cvModel = DKV.getGet(cvKey);
    if (cvModel == null)
      continue;
    cvModel.delete(); //delete all subparts
  }
}
/** The model's string form is simply the string form of its output. */
@Override public String toString() {
return _output.toString();
}
}