// hex.genmodel.GenModel (h2o-genmodel) — source listing
package hex.genmodel;
import hex.ModelCategory;
import water.genmodel.IGeneratedModel;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.List;
/**
* This is a helper class to support Java generated models.
*/
public abstract class GenModel implements IGenModel, IGeneratedModel, Serializable {
/** Column names; last is response for supervised models */
public final String[] _names;
/** Categorical (factor/enum) mappings, per column. Null for non-enum cols.
* Columns match the post-init cleanup columns. The last column holds the
* response col enums for SupervisedModels. */
public final String[][] _domains;
/** Name of the response column used for training (only for supervised models). */
public final String _responseColumn;
/** Name of the column with offsets (used for certain types of models). */
public String _offsetColumn;
/**
 * Creates a generated model from the training metadata.
 *
 * @param names column names used during training; for supervised models the last entry is the response column
 * @param domains per-column categorical levels, aligned with {@code names}; {@code null} entry for numeric columns
 * @param responseColumn name of the response column used for training ({@code null} for unsupervised models)
 */
public GenModel(String[] names, String[][] domains, String responseColumn) {
_names = names;
_domains = domains;
_responseColumn = responseColumn;
}
/**
 * @deprecated This constructor is deprecated and will be removed in a future version.
 * Use {@link #GenModel(String[], String[][], String)} instead.
 * Note: models built via this constructor have no {@code _responseColumn} set.
 */
@Deprecated
public GenModel(String[] names, String[][] domains) {
this(names, domains, null);
}
//--------------------------------------------------------------------------------------------------------------------
// IGenModel interface
//--------------------------------------------------------------------------------------------------------------------
/** Returns true for supervised models.
 *  Defaults to false; generated supervised models are expected to override this. */
@Override public boolean isSupervised() {
return false;
}
/** Returns number of input features.
 *  NOTE(review): this default returns {@code _names.length}, which per the field doc includes the
 *  response column for supervised models — supervised subclasses presumably override; confirm. */
@Override public int nfeatures() {
return _names.length;
}
/** Returns number of output classes for classifiers, 1 for regression models, and 0 for unsupervised models.
 *  Defaults to 0; classifier/regression subclasses override. */
@Override public int nclasses() {
return 0;
}
/** Returns this model's category (e.g. Binomial, Multinomial, Regression); implemented by each generated model. */
@Override public abstract ModelCategory getModelCategory();
/**
 * Returns the set of categories this model can produce results in.
 * Override this for models that may produce results in different categories.
 *
 * <p>Fix: the return type had collapsed to the raw type {@code EnumSet} (the
 * {@code <ModelCategory>} type argument was lost); the parameterized type is restored,
 * which is source- and binary-compatible for callers.</p>
 *
 * @return set containing at least {@link #getModelCategory()}
 */
@Override public EnumSet<ModelCategory> getModelCategories() {
  return EnumSet.of(getModelCategory());
}
//--------------------------------------------------------------------------------------------------------------------
// IGeneratedModel interface
//--------------------------------------------------------------------------------------------------------------------
/** Returns the unique identifier of the model this generated class was created from. */
@Override public abstract String getUUID();
/** Returns number of columns used as input for training (i.e., exclude response and offset columns).
 *  Delegates to {@link #nfeatures()}. */
@Override public int getNumCols() {
return nfeatures();
}
/** The names of all columns used, including response and offset columns.
 *  Returns the internal array directly (callers must not mutate it). */
@Override public String[] getNames() {
return _names;
}
/**
 * The name of the response column.
 * Falls back to {@code _responseColumn} when the response index lies past the
 * known column names (note: {@code _responseColumn} is null when the deprecated
 * two-argument constructor was used).
 */
@Override public String getResponseName() {
  int responseIdx = getResponseIdx();
  if (responseIdx < _names.length) {
    return _names[responseIdx];
  }
  return _responseColumn;
}
/**
 * Returns the index of the response column inside getDomains().
 * The response is always the last column of {@code _domains}.
 *
 * @throws UnsupportedOperationException for unsupervised models
 */
@Override public int getResponseIdx() {
  if (isSupervised()) {
    return _domains.length - 1;
  }
  throw new UnsupportedOperationException("Cannot provide response index for unsupervised models.");
}
/**
 * Get number of classes in the given column.
 *
 * @param colIdx index of the column
 * @return the domain cardinality if the column is categorical, otherwise -1 for numeric columns
 */
@Override public int getNumClasses(int colIdx) {
  String[] domainValues = getDomainValues(colIdx);
  if (domainValues == null) {
    return -1;
  }
  return domainValues.length;
}
/**
 * Return the number of classes in the response column.
 *
 * @throws UnsupportedOperationException if this model is not a classifier
 */
@Override public int getNumResponseClasses() {
  if (isClassifier()) {
    return nclasses();
  }
  throw new UnsupportedOperationException("Cannot provide number of response classes for non-classifiers.");
}
/** Returns true if this model represents a classifier (Binomial, Multinomial or Ordinal category),
 *  else it is used for regression. */
@Override public boolean isClassifier() {
  switch (getModelCategory()) {
    case Binomial:
    case Multinomial:
    case Ordinal:
      return true;
    default:
      return false;
  }
}
/** Returns true if this model represents an AutoEncoder (model category is {@code AutoEncoder}). */
@Override public boolean isAutoEncoder() {
return getModelCategory() == ModelCategory.AutoEncoder;
}
/**
 * Gets the domain (categorical levels) of the column with the given name.
 *
 * @param name column name
 * @return the column's domain, or null when the column is unknown (or numeric)
 */
@Override public String[] getDomainValues(String name) {
  int idx = getColIdx(name);
  if (idx == -1) {
    return null;
  }
  return getDomainValues(idx);
}
/** Returns domain values for the i-th column ({@code null} entry means a numeric column). */
@Override public String[] getDomainValues(int i) {
return getDomainValues()[i];
}
/** Returns domain values for all columns, including the response column.
 *  Returns the internal array directly (callers must not mutate it). */
@Override public String[][] getDomainValues() {
return _domains;
}
/**
 * Returns the index of the column with the given name, or -1 if no such column exists.
 * Uses exact (case-sensitive) string comparison.
 */
@Override public int getColIdx(String name) {
  String[] allNames = getNames();
  int i = 0;
  while (i < allNames.length) {
    if (allNames[i].equals(name)) {
      return i;
    }
    i++;
  }
  return -1;
}
/**
 * Maps the given column's categorical level to the integer code used by this model.
 *
 * @param colIdx index of the (categorical) column
 * @param enumValue level name to look up
 * @return the level's index within the column's domain, or -1 when the column is
 *         numeric or the level is not part of the domain
 */
@Override public int mapEnum(int colIdx, String enumValue) {
  String[] domain = getDomainValues(colIdx);
  if (domain == null) {
    return -1;
  }
  for (int level = 0; level < domain.length; level++) {
    if (enumValue.equals(domain[level])) {
      return level;
    }
  }
  return -1;
}
/**
 * Returns the expected size of the preds array passed to {@code predict(double[], double[])}:
 * label slot plus one probability per class for classifiers, otherwise 2.
 */
@Override public int getPredsSize() {
  if (isClassifier()) {
    return 1 + getNumResponseClasses();
  }
  return 2;
}
/** Returns the expected preds-array size for the given model category;
 *  DimReduction models use {@link #nclasses()}, all others defer to {@link #getPredsSize()}. */
public int getPredsSize(ModelCategory mc) {
  if (mc == ModelCategory.DimReduction) {
    return nclasses();
  }
  return getPredsSize();
}
/** Derives the auxiliary key for a given key by appending the {@code ".aux"} suffix. */
public static String createAuxKey(String k) {
return k + ".aux";
}
/*
@Override
public float[] predict(double[] data, float[] preds) {
return predict(data, preds, 0);
}
@Override
public float[] predict(double[] data, float[] preds, int maxIters) {
throw new UnsupportedOperationException("Unsupported operation - use score0 method!");
}
*/
//--------------------------------------------------------------------------------------------------------------------
/** Takes a HashMap mapping column names to doubles.
*
* Looks up the column names needed by the model, and places the doubles into
* the data array in the order needed by the model. Missing columns use NaN.
*
*/
/*
public double[] map(Map row, double data[]) {
for (int i = 0; i < nfeatures(); i++) {
Double d = row.get(_names[i]);
data[i] = d==null ? Double.NaN : d;
}
return data;
}
*/
/** Subclasses implement the scoring logic. The data is pre-loaded into a
 * re-used temp array, in the order the model expects. The predictions are
 * loaded into the re-used temp array, which is also returned. This call
 * exactly matches the hex.Model.score0, but uses the light-weight
 * GenModel class.
 *
 * @param row input row, one value per model input column
 * @param preds output buffer of size {@link #getPredsSize()}; filled in place and returned */
public abstract double[] score0(double[] row, double[] preds);
/** Scoring entry point for models trained with an offset column.
 *  The default implementation rejects offsets; models supporting them override this. */
public double[] score0(double[] row, double offset, double[] preds) {
throw new UnsupportedOperationException("`offset` column is not supported");
}
/** Subclasses implement calibration of class probabilities. The input is array of
 * predictions returned by the scoring function (score0). Supports classification
 * models that were trained with calibration enabled. Original probabilities
 * in the predictions array are overwritten by their corresponding calibrated
 * counterparts. Return false if model doesn't support calibration.
 *
 * @param preds predictions array produced by score0; modified in place when calibration is supported
 * @return true if probabilities were calibrated, false otherwise (this default never calibrates)
 */
public boolean calibrateClassProbabilities(double preds[]) {
return false;
}
/*
// Does the mapping lookup for every row, no allocation.
// data and preds arrays are pre-allocated and can be re-used for every row.
public double[] score0(Map row, double[] data, double[] preds) {
Double offset = _offsetColumn == null? null : row.get(_offsetColumn);
return score0(map(row, data), offset == null? 0.0 : offset, preds);
}
// Does the mapping lookup for every row.
// preds array is pre-allocated and can be re-used for every row.
// Allocates a double[] for every row.
public double[] score0(Map row, double[] preds) {
return score0(row, new double[nfeatures()], preds);
}
// Does the mapping lookup for every row.
// Allocates a double[] and a float[] for every row.
public double[] score0(Map row) {
return score0(row, new double[nfeatures()], new double[nclasses()+1]);
}
*/
/**
* Correct a given list of class probabilities produced as a prediction by a model back to prior class distribution
*
* The implementation is based on Eq. (27) in the paper.
*
* @param scored list of class probabilities beginning at index 1
* @param priorClassDist original class distribution
* @param modelClassDist class distribution used for model building (e.g., data was oversampled)
* @return corrected list of probabilities
*/
public static double[] correctProbabilities(double[] scored, double[] priorClassDist, double[] modelClassDist) {
double probsum=0;
for( int c=1; c0) for (int i=1;i= threshold) ? 1 : 0; //no tie-breaking
}
List ties = new ArrayList<>();
ties.add(0);
int best=1, tieCnt=0; // Best class; count of ties
for( int c=2; c> 6; // drop 6 least significants bits of mantissa (layout of long is: 1b sign, 11b exp, 52b mantisa)
if (priorClassDist!=null) {
assert(preds.length==priorClassDist.length+1);
// Tie-breaking based on prior probabilities
// Example: probabilities are 0.4, 0.2, 0.4 for a 3-class problem with priors 0.7, 0.1, 0.2
// Probability of predicting class 1 should be higher than for class 3 based on the priors
double sum = 0;
for (Integer i : ties) { //ties = [0, 2]
sum += priorClassDist[i]; //0.7 + 0.2
}
// sum is now 0.9
Random rng = new Random(hash);
double tie = rng.nextDouble(); //for example 0.4135 -> should pick the first of the ties, since it occupies 0.7777 = 0.7/0.9 of the 0...1 range, and 0.4135 < 0.7777
double partialSum = 0;
for (Integer i : ties) {
partialSum += priorClassDist[i] / sum; //0.7777 at first iteration, 1.0000 at second iteration
if (tie <= partialSum)
return i;
}
}
// Tie-breaking logic (should really never be triggered anymore)
double res = preds[best]; // One of the tied best results
int idx = (int)hash%(tieCnt+1); // Which of the ties we'd like to keep
for( best=1; best= 0 && idx < nbits): "Must have "+bitoff+" <= idx <= " + (bitoff+nbits-1) + ": " + idx;
return (bits[idx >> 3] & ((byte)1 << (idx & 7))) != 0;
}
/**
 * Checks whether the (integral) value {@code dnum} falls inside the range covered by a bit set.
 *
 * @param nbits number of bits in the set
 * @param bitoff index of the first bit represented by the set
 * @param dnum value to test (must not be NaN)
 * @return true iff {@code dnum} maps to a bit index in [0, nbits)
 */
public static boolean bitSetIsInRange(int nbits, int bitoff, double dnum) {
  assert !Double.isNaN(dnum);
  final int idx = ((int) dnum) - bitoff;
  return idx >= 0 && idx < nbits;
}
// Todo: Done for K-means but we should really unify for all models.
/** Applies K-means preprocessing (mean imputation / mode imputation and optional
 *  standardization) to every element of {@code data} in place, delegating to the
 *  per-element overload. */
public static void Kmeans_preprocessData(double [] data, double [] means, double [] mults, int[] modes){
for(int i = 0; i < data.length; i++) {
data[i] = Kmeans_preprocessData(data[i], i, means, mults, modes);
}
}
/**
 * Preprocesses one value for K-means scoring.
 * Numeric columns ({@code modes[i] == -1}): NaN is imputed with the column mean, and when
 * {@code mults} is provided the value is standardized as {@code (d - means[i]) * mults[i]}.
 * Categorical columns: NaN is imputed with the column mode.
 *
 * @param d raw value
 * @param i column index
 * @param means per-column means (numeric columns)
 * @param mults per-column scale factors, or null for no standardization
 * @param modes per-column modes; -1 marks a non-categorical column
 * @return the preprocessed value
 */
public static double Kmeans_preprocessData(double d, int i, double [] means, double [] mults, int[] modes){
  boolean categorical = modes[i] != -1;
  if (categorical) {
    return Double.isNaN(d) ? modes[i] : d;
  }
  double value = Double.isNaN(d) ? means[i] : d;
  if (mults != null) {
    value = (value - means[i]) * mults[i];
  }
  return value;
}
// --------------------------------------------------------------------------
// KMeans utilities
// For KMeansModel scoring; just the closest cluster center
/**
 * Finds the cluster center closest to {@code point}.
 *
 * @return the index of the nearest center, or -1 when {@code centers} is empty
 */
public static int KMeans_closest(double[][] centers, double[] point, String[][] domains) {
  int best = -1;
  double bestSqr = Double.MAX_VALUE;
  for (int cluster = 0; cluster < centers.length; cluster++) {
    double sqr = KMeans_distance(centers[cluster], point, domains);
    if (sqr < bestSqr) { // Record nearest cluster center
      best = cluster;
      bestSqr = sqr;
    }
  }
  return best;
}
// Outputs distances from a given point to all cluster centers, returns index of the closest cluster center
/**
 * Fills {@code distances} with the distance from {@code point} to every cluster center.
 *
 * @param distances output array, one slot per center; filled in place
 * @return the index of the nearest center, or -1 when {@code centers} is empty
 */
public static int KMeans_distances(double[][] centers, double[] point, String[][] domains, double[] distances) {
  int nearest = -1;
  double nearestSqr = Double.MAX_VALUE;
  for (int cluster = 0; cluster < centers.length; cluster++) {
    double sqr = KMeans_distance(centers[cluster], point, domains);
    distances[cluster] = sqr;
    if (sqr < nearestSqr) { // Record nearest cluster center
      nearestSqr = sqr;
      nearest = cluster;
    }
  }
  return nearest;
}
// only used for GLRM initialization - inverse of distance to each cluster center normalized to sum to one
/**
 * Computes per-cluster weights for a point: inverse distance to each cluster center,
 * normalized to sum to one. Degenerate cases are handled explicitly:
 * all-zero distances pick a random cluster; a single zero distance assigns
 * the full weight to that cluster (avoiding division by zero below).
 *
 * @return weights over clusters, summing to one
 */
public static double[] KMeans_simplex(double[][] centers, double[] point, String[][] domains) {
double[] dist = new double[centers.length];
double sum = 0, inv_sum = 0;
for( int cluster = 0; cluster < centers.length; cluster++ ) {
dist[cluster] = KMeans_distance(centers[cluster],point,domains);
sum += dist[cluster];
// NOTE: inv_sum becomes Infinity when dist[cluster] == 0; that case is routed to the
// direct-assignment branch below, so the infinite value is never used.
inv_sum += 1.0 / dist[cluster];
}
double[] ratios = new double[centers.length];
if (sum == 0) { // In degenerate case where all cluster centers identical to point, pick one at random
Random rng = new Random();
int idx = rng.nextInt(centers.length);
ratios[idx] = 1;
} else {
// Is the point identical to an existing cluster center?
int idx = -1;
for (int cluster = 0; cluster < centers.length; cluster++) {
if(dist[cluster] == 0) {
idx = cluster;
break;
}
}
if(idx == -1) { // If not, take ratios as inverse of cluster distance normalized to sum to one
for (int cluster = 0; cluster < centers.length; cluster++)
ratios[cluster] = 1.0 / (dist[cluster] * inv_sum);
} else // Otherwise, just assign directly to closest cluster
ratios[idx] = 1;
}
return ratios;
}
// only used for metric builder - uses float[] and fills up colSum & colSumSq arrays, otherwise the same as method below.
// WARNING - if changing this code - also change the code below
public static double KMeans_distance(double[] center, float[] point, int [] modes,
double[] colSum, double[] colSumSq) {
double sqr = 0; // Sum of dimensional distances
int pts = point.length; // Count of valid points
for(int column = 0; column < center.length; column++) {
float d = point[column];
if( Float.isNaN(d) ) { pts--; continue; }
if( modes[column] != -1 ) { // Categorical?
if( d != center[column] ) {
sqr += 1.0; // Manhattan distance
}
if(d != modes[column]) {
colSum[column] += 1;
}
} else { // Euclidean distance
double delta = d - center[column];
sqr += delta * delta;
colSum[column] += d;
colSumSq[column] += d*d;
}
}
// Scale distance by ratio of valid dimensions to all dimensions - since
// we did not add any error term for the missing point, the sum of errors
// is small - ratio up "as if" the missing error term is equal to the
// average of other error terms. Same math another way:
// double avg_dist = sqr / pts; // average distance per feature/column/dimension
// sqr = sqr * point.length; // Total dist is average*#dimensions
if( 0 < pts && pts < point.length ) {
double scale = ((double) point.length) / pts;
sqr *= scale;
// for (int i=0; i= _catOffsets[i + 1])
cats[i] = (_catOffsets[i + 1] - 1);
}
}
for (int i = _cats; i < from.length; ++i) {
double d = from[i];
if ((_normMul != null) && (_normMul.length >0)) {
d = (d - _normSub[i - _cats]) * _normMul[i - _cats];
}
nums[i - _cats] = d; //can be NaN for missing numerical data
}
}
public static float[] convertDouble2Float(double[] input) {
int arraySize = input.length;
float[] output = new float[arraySize];
for (int index=0; index