hex.genmodel.GenModel

package hex.genmodel;

import hex.ModelCategory;
import water.genmodel.IGeneratedModel;

import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import java.util.*;

/**
 * This is a helper class to support Java generated models.
 */
public abstract class GenModel implements IGenModel, IGeneratedModel, Serializable {

  /** Column names; last is response for supervised models */
  public final String[] _names;

  /** Categorical (factor/enum) mappings, per column.  Null for non-enum cols.
   *  Columns match the post-init cleanup columns.  The last column holds the
   *  response col enums for SupervisedModels.  */
  public final String[][] _domains;

  /** Name of the response column used for training (only for supervised models). */
  public final String _responseColumn;

  /** Name of the column with offsets (used for certain types of models). */
  public String _offsetColumn;
  public String _foldColumn;
  
  /** Name of the column that determines the treatment group; currently only used by UpliftDRF models. */
  public String _treatmentColumn;

  public GenModel(String[] names, String[][] domains, String responseColumn) {
    _names = names;
    _domains = domains;
    _responseColumn = responseColumn;
    _treatmentColumn = null;
  }

  public GenModel(String[] names, String[][] domains, String responseColumn, String treatmentColumn) {
    _names = names;
    _domains = domains;
    _responseColumn = responseColumn;
    _treatmentColumn = treatmentColumn;
  }

  /**
   * @deprecated This constructor is deprecated and will be removed in a future version.
   *             Use {@link #GenModel(String[], String[][], String)} instead.
   */
  @Deprecated
  public GenModel(String[] names, String[][] domains) {
    this(names, domains, null);
  }

  public boolean requiresOffset() {
    return false;
  }
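
  /*
   * Illustrative sketch only (hypothetical subclass, not part of this source): generated POJOs
   * extend GenModel roughly like the hand-written example below, which assumes a made-up
   * two-feature binomial model named MyTinyModel.
   *
   * public class MyTinyModel extends GenModel {
   *   public MyTinyModel() {
   *     super(new String[]{"age", "income", "bought"},    // feature names, response last
   *           new String[][]{null, null, {"NO", "YES"}},  // domains; null for numeric columns
   *           "bought");                                  // response column
   *   }
   *   @Override public boolean isSupervised() { return true; }
   *   @Override public int nfeatures() { return _names.length - 1; } // exclude the response
   *   @Override public int nclasses() { return 2; }
   *   @Override public ModelCategory getModelCategory() { return ModelCategory.Binomial; }
   *   @Override public String getUUID() { return "example-uuid"; }
   *   @Override public double[] score0(double[] row, double[] preds) {
   *     double p1 = 1.0 / (1.0 + Math.exp(-(0.01 * row[0] + 1e-5 * row[1] - 1.0))); // toy logit
   *     preds[1] = 1.0 - p1;
   *     preds[2] = p1;
   *     preds[0] = getPrediction(preds, null, row, 0.5);
   *     return preds;
   *   }
   * }
   */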

  //--------------------------------------------------------------------------------------------------------------------
  // IGenModel interface
  //--------------------------------------------------------------------------------------------------------------------

  /** Returns true for supervised models. */
  @Override public boolean isSupervised() {
    return false;
  }

  /** Returns number of input features. */
  @Override public int nfeatures() {
    return _names.length;
  }

  public int nCatFeatures() {
    int nCat = 0;
    String[][] domainValues = getDomainValues();
    for (int i = 0; i < nfeatures(); i++) {
      if (domainValues[i] != null)
        nCat++;
    }
    return nCat;
  }

  /** Returns names of input features. */
  @Override public String[] features() {
    return Arrays.copyOf(_names, nfeatures());
  }

  /** Returns number of output classes for classifiers, 1 for regression models, and 0 for unsupervised models. */
  @Override public int nclasses() {
    return 0;
  }

  /** Returns this model category. */
  @Override public abstract ModelCategory getModelCategory();

  public String[] getOutputNames() {
    final ModelCategory category = getModelCategory();
    final String[] outputNames;
    // Emit outputCSV column names.
    switch (category) {
      case AutoEncoder:
        List<String> onames = new LinkedList<>();
        final String[] cnames = getNames();
        final int numCats = nCatFeatures();
        final String[][] domainValues = getDomainValues();

        for (int index = 0; index <= numCats - 1; index++) { // add names for categorical columns
          String[] tdomains = domainValues[index];
          int tdomainLen = tdomains.length-1;
          for (int index2 = 0; index2 <= tdomainLen; index2++) {
            onames.add("reconstr_" + cnames[index] + "." + tdomains[index2]);
          }
          onames.add("reconstr_" + cnames[index] + ".missing(NA)");
        }
        for (int index = numCats; index < cnames.length; index++) {  // add the numerical column names
          onames.add("reconstr_" + cnames[index]);
        }
        outputNames = onames.toArray(new String[0]);
        break;
      case Binomial:
      case Multinomial:
      case Ordinal:
        final String[] responseDomainValues = getDomainValues(getResponseIdx());
        outputNames = new String[1 + responseDomainValues.length];
        outputNames[0] = "predict";
        System.arraycopy(responseDomainValues, 0, outputNames, 1, outputNames.length - 1);
        // turn integer class labels such as 0, 1, etc. into p0, p1, etc.
        for (int i = 1; i < outputNames.length; i++) {
          try {
            Integer.valueOf(outputNames[i]);
            outputNames[i] = "p" + outputNames[i];
          } catch (Exception e) {
            // do nothing, non-integer names are fine already
          }
        }
        break;

      case Clustering:
        outputNames = new String[]{"cluster"};
        break;

      case Regression:
        outputNames = new String[]{"predict"};
        break;
        
      case CoxPH:
        outputNames = new String[]{"lp"};
        break;
      case BinomialUplift:
        outputNames = new String[]{"uplift_predict", "p_y1_with_treatment", "p_y1_without_treatment"};
        break;
      default:
        throw new UnsupportedOperationException("Getting output column names for model category '" + 
                category + "' is not supported.");
    }

    return outputNames;
  }
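
  /*
   * Example (hypothetical model): for a binomial response with domain {"NO", "YES"},
   * getOutputNames() returns {"predict", "NO", "YES"}; if the labels were the integers
   * {"0", "1"}, they would be rewritten to {"predict", "p0", "p1"}.  In the AutoEncoder case,
   * a categorical column "color" with domain {"blue", "red"} contributes
   * "reconstr_color.blue", "reconstr_color.red" and "reconstr_color.missing(NA)".
   */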

  /**
   * Companion method to getOutputNames. For each output column specifies
   * what is the domain of the column.
   * 
   * @return array of domain values for each output column, if the type of the column is not categorical use null
   */
  public String[][] getOutputDomains() {
    final ModelCategory category = getModelCategory();
    final String[][] outputDomains = new String[getOutputNames().length][];
    switch (category) {
      case Binomial:
      case Multinomial:
      case Ordinal:
        outputDomains[0] = getDomainValues(getResponseIdx());
        break;
      case Regression:
      case HGLMRegression:
      case Clustering:
      case AutoEncoder:
      case TargetEncoder:
      case DimReduction:
      case WordEmbedding:
      case CoxPH:
      case AnomalyDetection:
      case KLime:
      case BinomialUplift:
        // all numeric, nothing to set here
        break;
      default:
        throw new UnsupportedOperationException("Getting output domains for model category '" +
                category + "' is not yet supported.");
    }
    return outputDomains;
  }

  /** Override this for models that may produce results in different categories. */
  @Override public EnumSet<ModelCategory> getModelCategories() {
    return EnumSet.of(getModelCategory());
  }


  //--------------------------------------------------------------------------------------------------------------------
  // IGeneratedModel interface
  //--------------------------------------------------------------------------------------------------------------------

  @Override public abstract String getUUID();

  /** Returns the number of columns used as input for training (i.e., excluding the response and offset columns). */
  @Override public int getNumCols() {
    return nfeatures();
  }

  /** The names of all columns used, including response and offset columns. */
  @Override public String[] getNames() {
    return _names;
  }
  
  public int getOrigNumCols() {
    String[] origNames = getOrigNames();
    if (origNames == null || origNames.length == 0)
      return 0;
    boolean hasResponse = false;
    if (isSupervised()) {
      String responseName = getResponseName();
      hasResponse = origNames[origNames.length - 1].equals(responseName);
    } 
    return hasResponse ? origNames.length - 1 : origNames.length;
  }
  
  /** The original names of all columns used, including response and offset columns. */
  @Override public String[] getOrigNames() {
    return null;
  }
  
  /** The name of the response column. */
  @Override public String getResponseName() {
    // Note: _responseColumn is not set when deprecated constructor GenModel(String[] names, String[][] domains) is used
    int r = getResponseIdx();
    return r < _names.length ? _names[r] : _responseColumn;
  }

  /** Returns the index of the response column inside getDomains(). */
  @Override public int getResponseIdx() {
    if (!isSupervised())
      throw new UnsupportedOperationException("Cannot provide response index for unsupervised models.");
    return _domains.length - 1;
  }

  @Override public String getOffsetName() {
    return _offsetColumn;
  }

  /** Gets the number of classes in the given column.
   *  Returns a number greater than zero if the column is categorical, or -1 if the column is numeric. */
  @Override public int getNumClasses(int colIdx) {
    String[] domval = getDomainValues(colIdx);
    return domval != null ? domval.length : -1;
  }

  /** Returns the number of classes in the response column. */
  @Override public int getNumResponseClasses() {
    if (!isClassifier())
      throw new UnsupportedOperationException("Cannot provide number of response classes for non-classifiers.");
    return nclasses();
  }

  /** Return type of encoding expected by the model implementation. */
  @Override public CategoricalEncoding getCategoricalEncoding() {
    return CategoricalEncoding.AUTO; // by default model handles the encoding
  }

  /** Returns true if this model represents a classifier; otherwise the model is used for regression. */
  @Override public boolean isClassifier() {
    ModelCategory cat = getModelCategory();
    return cat == ModelCategory.Binomial || cat == ModelCategory.Multinomial || cat == ModelCategory.Ordinal;
  }

  /** Returns true if this model represents an AutoEncoder. */
  @Override public boolean isAutoEncoder() {
    return getModelCategory() == ModelCategory.AutoEncoder;
  }

  /** Gets domain of the given column. */
  @Override public String[] getDomainValues(String name) {
    int colIdx = getColIdx(name);
    return colIdx != -1 ? getDomainValues(colIdx) : null;
  }

  /** Returns domain values for the i-th column. */
  @Override public String[] getDomainValues(int i) {
    return getDomainValues()[i];
  }

  /** Returns domain values for all columns, including the response column. */
  @Override public String[][] getDomainValues() {
    return _domains;
  }

  @Override
  public String[][] getOrigDomainValues() {
    return null;
  }

  /** Returns original Eigen encoder projections array for all columns. */
  @Override
  public double[] getOrigProjectionArray() {return null;}

  /** Returns index of a column with given name, or -1 if the column is not found. */
  @Override public int getColIdx(String name) {
    String[] names = getNames();
    for (int i = 0; i < names.length; i++) if (names[i].equals(name)) return i;
    return -1;
  }

  /** Maps a categorical value of the given column to the integer used by this model (returns -1 if the mapping is not found). */
  @Override public int mapEnum(int colIdx, String enumValue) {
    String[] domain = getDomainValues(colIdx);
    if (domain != null)
      for (int i = 0; i < domain.length; i++)
        if (enumValue.equals(domain[i]))
          return i;
    return -1;
  }
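
  /*
   * Illustrative usage sketch (hypothetical columns "color" and "age"): categorical inputs must
   * be converted to the model's integer codes before scoring, e.g.
   *
   *   GenModel m = ...;                       // any concrete generated model
   *   double[] row = new double[m.nfeatures()];
   *   row[m.getColIdx("color")] = m.mapEnum(m.getColIdx("color"), "red"); // -1 for an unseen level
   *   row[m.getColIdx("age")] = 42;
   *   double[] preds = m.score0(row, new double[m.getPredsSize()]);
   */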

  /** Returns the expected size of the preds array that is passed to the {@code score0(double[], double[])} function. */
  @Override public int getPredsSize() { // fractional binomial has numerical response
    return isClassifier() ? (1 + getNumResponseClasses()) : 2;
  }

  public int getPredsSize(ModelCategory mc) {
    return (mc == ModelCategory.DimReduction) ? nclasses() : getPredsSize();
  }
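
  /*
   * Example of the resulting sizes (hypothetical models): a 3-class classifier gets a preds
   * array of length 1 + 3 = 4 (preds[0] = predicted class, preds[1..3] = class probabilities);
   * a regression model gets length 2 (preds[0] holds the prediction); a DimReduction model is
   * sized by nclasses() via getPredsSize(ModelCategory).
   */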

  public static String createAuxKey(String k) {
    return k + ".aux";
  }

  /*
  @Override
  public float[] predict(double[] data, float[] preds) {
    return predict(data, preds, 0);
  }

  @Override
  public float[] predict(double[] data, float[] preds, int maxIters) {
    throw new UnsupportedOperationException("Unsupported operation - use score0 method!");
  }
  */


  //--------------------------------------------------------------------------------------------------------------------

  /** Takes a HashMap mapping column names to doubles.
   *
   *  Looks up the column names needed by the model, and places the doubles into
   *  the data array in the order needed by the model. Missing columns use NaN.
   */
  /*
  public double[] map(Map<String, Double> row, double data[]) {
    for (int i = 0; i < nfeatures(); i++) {
      Double d = row.get(_names[i]);
      data[i] = d == null ? Double.NaN : d;
    }
    return data;
  }
  */

  /** Subclasses implement the scoring logic.  The data is pre-loaded into a
   *  re-used temp array, in the order the model expects.  The predictions are
   *  loaded into the re-used temp array, which is also returned.  This call
   *  exactly matches the hex.Model.score0, but uses the light-weight
   *  GenModel class. */
  public abstract double[] score0(double[] row, double[] preds);

  public double[] score0(double[] row, double offset, double[] preds) {
    throw new UnsupportedOperationException("`offset` column is not supported");
  }

  /** Subclasses implement calibration of class probabilities.  The input is an array of
   *  predictions returned by the scoring function (score0).  Supports classification
   *  models that were trained with calibration enabled.  Original probabilities
   *  in the predictions array are overwritten by their corresponding calibrated
   *  counterparts.  Returns false if the model doesn't support calibration. */
  public boolean calibrateClassProbabilities(double preds[]) {
    return false;
  }

  /*
  // Does the mapping lookup for every row, no allocation.
  // data and preds arrays are pre-allocated and can be re-used for every row.
  public double[] score0(Map<String, Double> row, double[] data, double[] preds) {
    Double offset = _offsetColumn == null ? null : row.get(_offsetColumn);
    return score0(map(row, data), offset == null ? 0.0 : offset, preds);
  }

  // Does the mapping lookup for every row.
  // preds array is pre-allocated and can be re-used for every row.
  // Allocates a double[] for every row.
  public double[] score0(Map<String, Double> row, double[] preds) {
    return score0(row, new double[nfeatures()], preds);
  }

  // Does the mapping lookup for every row.
  // Allocates a double[] and a float[] for every row.
  public double[] score0(Map<String, Double> row) {
    return score0(row, new double[nfeatures()], new double[nclasses()+1]);
  }
  */

  /** Correct a given list of class probabilities produced as a prediction by a model back to prior class distribution.
   *
   *  The implementation is based on Eq. (27) in the paper.
   *
   *  @param scored list of class probabilities beginning at index 1
   *  @param priorClassDist original class distribution
   *  @param modelClassDist class distribution used for model building (e.g., data was oversampled)
   *  @return corrected list of probabilities
   */
  public static double[] correctProbabilities(double[] scored, double[] priorClassDist, double[] modelClassDist) {
    double probsum = 0;
    for (int c = 1; c < scored.length; c++) {
      final double original_fraction = priorClassDist[c-1];
      final double oversampled_fraction = modelClassDist[c-1];
      assert (!Double.isNaN(scored[c])) : "Predicted NaN class probability";
      if (original_fraction != 0 && oversampled_fraction != 0)
        scored[c] *= original_fraction / oversampled_fraction;
      probsum += scored[c];
    }
    if (probsum > 0)
      for (int i = 1; i < scored.length; ++i)
        scored[i] /= probsum;
    return scored;
  }

  /** Utility to pick the best prediction from a class probability distribution
   *  (ties are broken by sampling according to the prior class distribution). */
  public static int getPrediction(double[] preds, double[] priorClassDist, double[] data, double threshold) {
    if (preds.length == 3) {
      return getPredictionBinomial(preds, threshold);
    } else {
      return getPredictionMultinomial(preds, priorClassDist, data);
    }
  }

  public static int getPredictionBinomial(double[] preds, double threshold) {
    return (preds[2] >= threshold) ? 1 : 0; // no tie-breaking
  }

  public static int getPredictionMultinomial(double[] preds, double[] priorClassDist, double[] data) {
    List<Integer> ties = new ArrayList<>();
    ties.add(0);
    int best = 1, tieCnt = 0;   // Best class; count of ties
    for (int c = 2; c < preds.length; c++) {
      if (preds[best] < preds[c]) {
        best = c;               // take the max index
        tieCnt = 0;             // No ties
      } else if (preds[best] == preds[c]) {
        tieCnt++;               // Ties
        ties.add(c - 1);
      }
    }
    if (tieCnt == 0) return best - 1; // Return zero-based best class

    long hash = 0;              // hash for tie-breaking
    if (data != null)
      for (double d : data) hash ^= Double.doubleToRawLongBits(d) >> 6; // drop 6 least significant bits of mantissa (layout of long is: 1b sign, 11b exp, 52b mantissa)

    if (priorClassDist != null) {
      assert (preds.length == priorClassDist.length + 1);
      // Tie-breaking based on prior probabilities
      // Example: probabilities are 0.4, 0.2, 0.4 for a 3-class problem with priors 0.7, 0.1, 0.2
      // Probability of predicting class 1 should be higher than for class 3 based on the priors
      double sum = 0;
      for (Integer i : ties) { // ties = [0, 2]
        sum += priorClassDist[i]; // 0.7 + 0.2
      }
      // sum is now 0.9
      Random rng = new Random(hash);
      double tie = rng.nextDouble(); // for example 0.4135 -> should pick the first of the ties, since it occupies 0.7777 = 0.7/0.9 of the 0...1 range, and 0.4135 < 0.7777
      double partialSum = 0;
      for (Integer i : ties) {
        partialSum += priorClassDist[i] / sum; // 0.7777 at first iteration, 1.0000 at second iteration
        if (tie <= partialSum)
          return i;
      }
    }

    // Tie-breaking logic (should really never be triggered anymore)
    double res = preds[best];            // One of the tied best results
    int idx = (int) hash % (tieCnt + 1); // Which of the ties we'd like to keep
    for (best = 1; best < preds.length; best++)
      if (res == preds[best] && --idx < 0)
        return best - 1;                 // Return zero-based best class
    throw new RuntimeException("Should Not Reach Here");
  }

  // Categorical bitset lookup used by generated tree models.
  public static boolean bitSetContains(byte[] bits, int nbits, int bitoff, double dnum) {
    assert (!Double.isNaN(dnum));
    int idx = (int) dnum;
    idx -= bitoff;
    assert (idx >= 0 && idx < nbits) : "Must have " + bitoff + " <= idx <= " + (bitoff + nbits - 1) + ": " + idx;
    return (bits[idx >> 3] & ((byte) 1 << (idx & 7))) != 0;
  }

  public static boolean bitSetIsInRange(int nbits, int bitoff, double dnum) {
    assert (!Double.isNaN(dnum));
    int idx = (int) dnum;
    idx -= bitoff;
    return (idx >= 0 && idx < nbits);
  }

  // Todo: Done for K-means but we should really unify for all models.
  public static void Kmeans_preprocessData(double[] data, double[] means, double[] mults, int[] modes) {
    for (int i = 0; i < data.length; i++) {
      data[i] = Kmeans_preprocessData(data[i], i, means, mults, modes);
    }
  }

  public static double Kmeans_preprocessData(double d, int i, double[] means, double[] mults, int[] modes) {
    if (modes[i] == -1) {    // Mode = -1 for non-categorical cols
      if (Double.isNaN(d))
        d = means[i];
      if (mults != null) {
        d -= means[i];
        d *= mults[i];
      }
    } else {
      if (Double.isNaN(d))
        d = modes[i];
    }
    return d;
  }

  // --------------------------------------------------------------------------
  // KMeans utilities

  // For KMeansModel scoring; just the closest cluster center
  public static int KMeans_closest(double[][] centers, double[] point, String[][] domains) {
    int min = -1;
    double minSqr = Double.MAX_VALUE;
    for (int cluster = 0; cluster < centers.length; cluster++) {
      double sqr = KMeans_distance(centers[cluster], point, domains);
      if (sqr < minSqr) {      // Record nearest cluster center
        min = cluster;
        minSqr = sqr;
      }
    }
    return min;
  }

  // Outputs distances from a given point to all cluster centers, returns index of the closest cluster center
  public static int KMeans_distances(double[][] centers, double[] point, String[][] domains, double[] distances) {
    int min = -1;
    double minSqr = Double.MAX_VALUE;
    for (int cluster = 0; cluster < centers.length; cluster++) {
      distances[cluster] = KMeans_distance(centers[cluster], point, domains);
      if (distances[cluster] < minSqr) {      // Record nearest cluster center
        min = cluster;
        minSqr = distances[cluster];
      }
    }
    return min;
  }

  // only used for GLRM initialization - inverse of distance to each cluster center normalized to sum to one
  public static double[] KMeans_simplex(double[][] centers, double[] point, String[][] domains) {
    double[] dist = new double[centers.length];
    double sum = 0, inv_sum = 0;
    for (int cluster = 0; cluster < centers.length; cluster++) {
      dist[cluster] = KMeans_distance(centers[cluster], point, domains);
      sum += dist[cluster];
      inv_sum += 1.0 / dist[cluster];
    }

    double[] ratios = new double[centers.length];
    if (sum == 0) {   // In degenerate case where all cluster centers identical to point, pick one at random
      Random rng = new Random();
      int idx = rng.nextInt(centers.length);
      ratios[idx] = 1;
    } else {
      // Is the point identical to an existing cluster center?
      int idx = -1;
      for (int cluster = 0; cluster < centers.length; cluster++) {
        if (dist[cluster] == 0) {
          idx = cluster;
          break;
        }
      }

      if (idx == -1) {  // If not, take ratios as inverse of cluster distance normalized to sum to one
        for (int cluster = 0; cluster < centers.length; cluster++)
          ratios[cluster] = 1.0 / (dist[cluster] * inv_sum);
      } else            // Otherwise, just assign directly to closest cluster
        ratios[idx] = 1;
    }
    return ratios;
  }

  // only used for metric builder - uses float[] and fills up colSum & colSumSq arrays, otherwise the same as the method below.
  // WARNING - if changing this code - also change the code below
  public static double KMeans_distance(double[] center, float[] point, int[] modes, double[] colSum, double[] colSumSq) {
    double sqr = 0;             // Sum of dimensional distances
    int pts = point.length;     // Count of valid points

    for (int column = 0; column < center.length; column++) {
      float d = point[column];
      if (Float.isNaN(d)) {
        pts--;
        continue;
      }
      if (modes[column] != -1) { // Categorical?
        if (d != center[column]) {
          sqr += 1.0;           // Manhattan distance
        }
        if (d != modes[column]) {
          colSum[column] += 1;
        }
      } else {                  // Euclidean distance
        double delta = d - center[column];
        sqr += delta * delta;
        colSum[column] += d;
        colSumSq[column] += d * d;
      }
    }
    // Scale distance by ratio of valid dimensions to all dimensions - since
    // we did not add any error term for the missing point, the sum of errors
    // is small - ratio up "as if" the missing error term is equal to the
    // average of other error terms.  Same math another way:
    //   double avg_dist = sqr / pts; // average distance per feature/column/dimension
    //   sqr = sqr * point.length;    // Total dist is average*#dimensions
    if (0 < pts && pts < point.length) {
      double scale = ((double) point.length) / pts;
      sqr *= scale;
      // for (int i = 0; i < colSum.length; ++i) { colSum[i] *= scale; colSumSq[i] *= scale; }
    }
    return sqr;
  }

  // WARNING - if changing this code - also change the code above
  public static double KMeans_distance(double[] center, double[] point, String[][] domains) {
    double sqr = 0;             // Sum of dimensional distances
    int pts = point.length;     // Count of valid points
    for (int column = 0; column < center.length; column++) {
      double d = point[column];
      if (Double.isNaN(d)) {
        pts--;
        continue;
      }
      if (domains[column] != null) { // Categorical?
        if (d != center[column])
          sqr += 1.0;           // Manhattan distance
      } else {                  // Euclidean distance
        double delta = d - center[column];
        sqr += delta * delta;
      }
    }
    // Scale distance by ratio of valid dimensions to all dimensions (see comment above).
    if (0 < pts && pts < point.length)
      sqr *= point.length / (double) pts;
    return sqr;
  }

  // Helper for Deeplearning, note: we assume nums and cats are allocated already and being re-used
  static public void setInput(final double[] from, float[] to, double[] nums, int[] cats, int _nums, int _cats,
                              int[] _catOffsets, double[] _normMul, double[] _normSub, boolean useAllFactorLevels,
                              boolean replaceMissingWithZero) {
    setCats(from, nums, cats, _cats, _catOffsets, _normMul, _normSub, useAllFactorLevels);

    assert (to.length == _nums + _catOffsets[_cats]);
    Arrays.fill(to, 0f);
    for (int i = 0; i < _cats; ++i)
      if (cats[i] >= 0)
        to[cats[i]] = 1f; // one-hot encode categoricals
    for (int i = 0; i < _nums; ++i)
      to[_catOffsets[_cats] + i] = Double.isNaN(nums[i]) ? (replaceMissingWithZero ? 0 : Float.NaN) : (float) nums[i];
  }

  // Helper for Deeplearning, note: we assume nums and cats are allocated already and being re-used
  static public void setInput(final double[] from, double[] to, double[] nums, int[] cats, int _nums, int _cats,
                              int[] _catOffsets, double[] _normMul, double[] _normSub, boolean useAllFactorLevels,
                              boolean replaceMissingWithZero) {
    setCats(from, nums, cats, _cats, _catOffsets, _normMul, _normSub, useAllFactorLevels);

    assert (to.length == _nums + _catOffsets[_cats]);
    Arrays.fill(to, 0d);
    for (int i = 0; i < _cats; ++i)
      if (cats[i] >= 0)
        to[cats[i]] = 1d; // one-hot encode categoricals
    for (int i = 0; i < _nums; ++i)
      to[_catOffsets[_cats] + i] = Double.isNaN(nums[i]) ? (replaceMissingWithZero ? 0 : Double.NaN) : nums[i];
  }

  // Helper for XGBoost Java
  static public void setCats(final double[] from, double[] nums, int[] cats, int _cats, int[] _catOffsets,
                             double[] _normMul, double[] _normSub, boolean useAllFactorLevels) {
    setCats(from, cats, _cats, _catOffsets, useAllFactorLevels);
    for (int i = _cats; i < from.length; ++i) {
      double d = from[i];
      if ((_normMul != null) && (_normMul.length > 0)) {
        d = (d - _normSub[i - _cats]) * _normMul[i - _cats];
      }
      nums[i - _cats] = d; // can be NaN for missing numerical data
    }
  }

  static public void setCats(final double[] from, int[] to, int cats, int[] catOffsets, boolean useAllFactorLevels) {
    for (int i = 0; i < cats; ++i) {
      if (Double.isNaN(from[i])) {
        to[i] = (catOffsets[i + 1] - 1); // use the extra level for NAs made during training
      } else {
        int c = (int) from[i];
        if (useAllFactorLevels)
          to[i] = c + catOffsets[i];
        else {
          if (c != 0)
            to[i] = c - 1 + catOffsets[i];
          else
            to[i] = -1;
        }
        if (to[i] >= catOffsets[i + 1])
          to[i] = (catOffsets[i + 1] - 1);
      }
    }
  }

  public static float[] convertDouble2Float(double[] input) {
    int arraySize = input.length;
    float[] output = new float[arraySize];
    for (int index = 0; index < arraySize; index++)
      output[index] = (float) input[index];
    return output;
  }
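
  /*
   * Worked example for correctProbabilities/getPrediction above (made-up numbers): suppose a
   * binomial model was trained on data oversampled to a 50/50 balance (modelClassDist =
   * {0.5, 0.5}) while the true prior is 90/10 (priorClassDist = {0.9, 0.1}), and score0
   * returned preds = {0, 0.6, 0.4}.  correctProbabilities(preds, priorClassDist, modelClassDist)
   * rescales the class probabilities to {0.6*0.9/0.5, 0.4*0.1/0.5} = {1.08, 0.08} and
   * renormalizes them to about {0.931, 0.069}; getPrediction(preds, priorClassDist, row, 0.5)
   * then reports class 0 because preds[2] < 0.5.
   */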




