package hex.naivebayes;

import hex.Model;
import hex.ModelMetrics;
import hex.ModelMetricsBinomial;
import hex.ModelMetricsMultinomial;
import hex.genmodel.GenModel;
import hex.schemas.NaiveBayesModelV3;
import hex.util.EffectiveParametersUtils;
import water.H2O;
import water.Key;
import water.api.schemas3.ModelSchemaV3;
import water.codegen.CodeGenerator;
import water.codegen.CodeGeneratorPipeline;
import water.exceptions.JCodeSB;
import water.util.JCodeGen;
import water.util.SBPrintStream;
import water.util.TwoDimTable;

public class NaiveBayesModel extends Model<NaiveBayesModel, NaiveBayesModel.NaiveBayesParameters, NaiveBayesModel.NaiveBayesOutput> {

  public static class NaiveBayesParameters extends Model.Parameters {
    public double _laplace = 0;         // Laplace smoothing parameter
    public double _eps_sdev = 0;   // Cutoff below which standard deviation is replaced with _min_sdev
    public double _min_sdev = 0.001;   // Minimum standard deviation to use for observations without enough data
    public double _eps_prob = 0;   // Cutoff below which probability is replaced with _min_prob
    public double _min_prob = 0.001;   // Minimum conditional probability to use for observations without enough data
    public boolean _compute_metrics = true;   // Should a second pass be made through data to compute metrics?
    public String algoName() { return "NaiveBayes"; }
    public String fullName() { return "Naive Bayes"; }
    public String javaName() { return NaiveBayesModel.class.getName(); }
    @Override public long progressUnits() { return 6; }
  }
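  // A minimal configuration sketch (hedged; the value choices below are hypothetical,
  // but the fields are the ones declared above):
  //   NaiveBayesParameters parms = new NaiveBayesParameters();
  //   parms._laplace = 1.0;           // add-one smoothing for unseen categorical levels
  //   parms._compute_metrics = true;  // second pass over the data for training metrics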

  public static class NaiveBayesOutput extends Model.Output {
    // Class distribution of the response
    public TwoDimTable _apriori;
    public double[/*res level*/] _apriori_raw;

    // For every predictor, a table providing, for each attribute level, the conditional probabilities given the target class
    public TwoDimTable[/*predictor*/] _pcond;
    public double[/*predictor*/][/*res level*/][/*pred level*/] _pcond_raw;

    // Count of response levels
    public int[] _rescnt;

    // Domain of the response
    public String[] _levels;

    // Number of categorical predictors
    public int _ncats;

    public NaiveBayesOutput(NaiveBayes b) { super(b); }
  }

  public NaiveBayesModel(Key selfKey, NaiveBayesParameters parms, NaiveBayesOutput output) { 
    super(selfKey,parms,output);
  }
  
  @Override
  public void initActualParamValues() {
    super.initActualParamValues();
    EffectiveParametersUtils.initFoldAssignment(_parms);
  }

  public ModelSchemaV3 schema() {
    return new NaiveBayesModelV3();
  }

  // TODO: Constant response shouldn't be regression. Need to override getModelCategory()
  @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
    switch(_output.getModelCategory()) {
      case Binomial:    return new ModelMetricsBinomial.MetricBuilderBinomial(domain);
      case Multinomial: return new ModelMetricsMultinomial.MetricBuilderMultinomial(domain.length,domain, _parms._auc_type);
      default: throw H2O.unimpl();
    }
  }

  // Note: For small probabilities, the product may underflow to zero. We avoid this by taking logs.
  @Override protected double[] score0(double[] data, double[] preds) {
    double[] nums = new double[_output._levels.length];    // log(p(x,y)) for all levels of y
    assert preds.length >= (_output._levels.length + 1);   // Note: First column of preds is predicted response class

    // Compute joint probability of predictors for every response class
    for(int rlevel = 0; rlevel < _output._levels.length; rlevel++) {
      // Take logs to avoid overflow: p(x,y) = p(x|y)*p(y) -> log(p(x,y)) = log(p(x|y)) + log(p(y))
      nums[rlevel] = Math.log(_output._apriori_raw[rlevel]);

      for(int col = 0; col < _output._ncats; col++) {
        if(Double.isNaN(data[col])) continue;   // Skip predictor in joint x_1,...,x_m if NA
        int plevel = (int)data[col];
        double prob = plevel < _output._pcond_raw[col][rlevel].length ? _output._pcond_raw[col][rlevel][plevel] :
                _parms._laplace / ((double)_output._rescnt[rlevel] + _parms._laplace * _output._domains[col].length);   // Laplace smoothing if predictor level unobserved in training set
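        // Worked example of the smoothing fallback (hypothetical numbers): with _laplace = 1,
        // _rescnt[rlevel] = 40, and a 3-level domain, an unseen level gets
        // 1/(40 + 1*3) ~= 0.023 instead of a hard zero that would sink the sum of logs.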
        nums[rlevel] += Math.log(prob <= _parms._eps_prob ? _parms._min_prob : prob);   // log(p(x|y)) = \sum_{j = 1}^m log(p(x_j|y))
      }

      // For numeric predictors, assume Gaussian distribution with sample mean and variance from model
      for(int col = _output._ncats; col < data.length; col++) {
        if(Double.isNaN(data[col])) continue;   // Skip predictor in joint x_1,...,x_m if NA
        double x = data[col];
        double mean = Double.isNaN(_output._pcond_raw[col][rlevel][0]) ? 0 : _output._pcond_raw[col][rlevel][0];
        double stddev = Double.isNaN(_output._pcond_raw[col][rlevel][1]) ? 1.0 :
          (_output._pcond_raw[col][rlevel][1] <= _parms._eps_sdev ? _parms._min_sdev : _output._pcond_raw[col][rlevel][1]);
        // double prob = Math.exp(new NormalDistribution(mean, stddev).density(data[col])); // slower
        double prob = Math.exp(-((x-mean)*(x-mean))/(2.*stddev*stddev)) / (stddev*Math.sqrt(2.*Math.PI)); // faster
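        // e.g. (hypothetical values): x = 2, mean = 0, stddev = 1 gives exp(-2)/sqrt(2*pi) ~= 0.054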
        nums[rlevel] += Math.log(prob <= _parms._eps_prob ? _parms._min_prob : prob);
      }
    }

    // Numerically unstable:
    // p(x,y) = exp(log(p(x,y))), p(x) = \sum_{r = levels of y} exp(log(p(x,y = r))) -> p(y|x) = p(x,y)/p(x)
    // Instead, we rewrite using a more stable form:
    // p(y|x) = p(x,y)/p(x) = exp(log(p(x,y))) / (\sum_{r = levels of y} exp(log(p(x,y = r))))
    //        = 1 / ( exp(-log(p(x,y))) * \sum_{r = levels of y} exp(log(p(x,y = r))) )
    //        = 1 / ( \sum_{r = levels of y} exp( log(p(x,y = r)) - log(p(x,y)) ) )
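    // Worked example (hypothetical values): with nums = {log(0.03), log(0.01)},
    // i = 0 gives sum = e^0 + e^(log(0.01) - log(0.03)) = 1 + 1/3, so preds[1] = 0.75;
    // i = 1 gives sum = 3 + 1, so preds[2] = 0.25.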
    for(int i = 0; i < nums.length; i++) {
      double sum = 0;
      for(int j = 0; j < nums.length; j++)
        sum += Math.exp(nums[j] - nums[i]);
      preds[i+1] = 1/sum;
    }

    // Select class with highest conditional probability
    preds[0] = GenModel.getPrediction(preds, _output._priorClassDist, data, defaultThreshold());
    return preds;
  }

  @Override protected SBPrintStream toJavaInit(SBPrintStream sb, CodeGeneratorPipeline fileCtx) {
    sb = super.toJavaInit(sb, fileCtx);
    sb.ip("public boolean isSupervised() { return " + isSupervised() + "; }").nl();
    sb.ip("public int nfeatures() { return " + _output.nfeatures() + "; }").nl();
    sb.ip("public int nclasses() { return " + _output.nclasses() + "; }").nl();

    // This is the model name
    final String mname = JCodeGen.toJavaId(_key.toString());

    fileCtx.add(new CodeGenerator() {
      @Override
      public void generate(JCodeSB out) {
        JCodeGen.toClassWithArray(out, null, mname + "_RESCNT", _output._rescnt,
                                  "Count of categorical levels in response.");
        JCodeGen.toClassWithArray(out, null, mname + "_APRIORI", _output._apriori_raw,
                                  "Apriori class distribution of the response.");
        JCodeGen.toClassWithArray(out, null, mname + "_PCOND", _output._pcond_raw,
                                  "Conditional probability of predictors.");
        double[] dlen = null;
        if (_output._ncats > 0) {
          dlen = new double[_output._ncats];
          for (int i = 0; i < _output._ncats; i++)
            dlen[i] = _output._domains[i].length;
        }
        JCodeGen.toClassWithArray(out, null, mname + "_DOMLEN", dlen,
                                  "Number of unique levels for each categorical predictor.");
      }
    });

    return sb;
  }

  @Override protected void toJavaPredictBody(SBPrintStream bodySb,
                                             CodeGeneratorPipeline classCtx,
                                             CodeGeneratorPipeline fileCtx,
                                             final boolean verboseCode) {
    // This is the model name
    final String mname = JCodeGen.toJavaId(_key.toString());

    bodySb.i().p("java.util.Arrays.fill(preds,0);").nl();
    bodySb.i().p("double mean, sdev, prob;").nl();
    bodySb.i().p("double[] nums = new double[" + _output._levels.length + "];").nl();

    bodySb.i().p("for(int i = 0; i < " + _output._levels.length + "; i++) {").nl();
    bodySb.i(1).p("nums[i] = Math.log(").pj(mname+"_APRIORI", "VALUES").p("[i]);").nl();
    bodySb.i(1).p("for(int j = 0; j < " + _output._ncats + "; j++) {").nl();
    bodySb.i(2).p("if(Double.isNaN(data[j])) continue;").nl();
    bodySb.i(2).p("int level = (int)data[j];").nl();
    bodySb.i(2).p("prob = level < ").p(_output._pcond_raw.length).p(" ? " + mname + "_PCOND.VALUES[j][i][level] : ")
                .p(_parms._laplace == 0 ? "0" : _parms._laplace + "/("+mname+"_RESCNT.VALUES[i] + " + _parms._laplace
                                              + "*" + mname + "_DOMLEN.VALUES[j])").p(";").nl();
    bodySb.i(2).p("nums[i] += Math.log(prob <= " + _parms._eps_prob + " ? " + _parms._min_prob + " : prob);").nl();
    bodySb.i(1).p("}").nl();

    bodySb.i(1).p("for(int j = " + _output._ncats + "; j < data.length; j++) {").nl();
    bodySb.i(2).p("if(Double.isNaN(data[j])) continue;").nl();
    bodySb.i(2).p("mean = Double.isNaN("+mname+"_PCOND.VALUES[j][i][0]) ? 0 : "+mname+"_PCOND.VALUES[j][i][0];").nl();
    bodySb.i(2).p("sdev = Double.isNaN("+mname+"_PCOND.VALUES[j][i][1]) ? 1 : ("+mname+"_PCOND.VALUES[j][i][1] <= " + _parms._eps_sdev + " ? "
            + _parms._min_sdev + " : "+mname+"_PCOND.VALUES[j][i][1]);").nl();
    bodySb.i(2).p("prob = Math.exp(-((data[j]-mean)*(data[j]-mean))/(2.*sdev*sdev)) / (sdev*Math.sqrt(2.*Math.PI));").nl();
    bodySb.i(2).p("nums[i] += Math.log(prob <= " + _parms._eps_prob + " ? " + _parms._min_prob + " : prob);").nl();
    bodySb.i(1).p("}").nl();
    bodySb.i().p("}").nl();

    bodySb.i().p("double sum;").nl();
    bodySb.i().p("for(int i = 0; i < nums.length; i++) {").nl();
    bodySb.i(1).p("sum = 0;").nl();
    bodySb.i(1).p("for(int j = 0; j < nums.length; j++) {").nl();
    bodySb.i(2).p("sum += Math.exp(nums[j]-nums[i]);").nl();
    bodySb.i(1).p("}").nl();
    bodySb.i(1).p("preds[i+1] = 1/sum;").nl();
    bodySb.i().p("}").nl();

    bodySb.i().p("preds[0] = hex.genmodel.GenModel.getPrediction(preds, PRIOR_CLASS_DISTRIB, data, " + defaultThreshold()+");").nl();
  }
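  // For reference, the generated POJO predict body has roughly this shape (a sketch
  // assembled from the statements above; <nclasses> and <threshold> stand in for the
  // values baked in at generation time):
  //   java.util.Arrays.fill(preds,0);
  //   double[] nums = new double[<nclasses>];
  //   for(int i = 0; i < <nclasses>; i++) { /* a priori + per-predictor log terms */ }
  //   /* log-sum-exp normalization into preds[1..nclasses] */
  //   preds[0] = hex.genmodel.GenModel.getPrediction(preds, PRIOR_CLASS_DISTRIB, data, <threshold>);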

  @Override
  protected boolean isFeatureUsedInPredict(int featureIdx) {
    /**
     * Naive Bayes treats each feature independently, so even if two features were identical
     * to the target, NB would use both. For constant input columns, hex.Model#adaptTestForTrain
     * takes care of removing them, and the default logic works fine when `_ignore_const_cols`
     * is true; otherwise we check whether the feature is independent of the response,
     * i.e. P(x|resp=A) = P(x|resp=B) = ... (In practice, exact "numerical" independence
     * should be rare.)
     */
    for (int response = 0; response < _output._pcond_raw[featureIdx].length; response++) {
      double val = _output._pcond_raw[featureIdx][response][0];
      for (double p : _output._pcond_raw[featureIdx][response]) {
        if (val != p)
          return true;
      }
    }
    return false;
  }
}
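// A minimal end-to-end training sketch (hedged; assumes the standard H2O-3 ModelBuilder
// workflow with a parsed Frame `fr` and a response column named "class", both hypothetical):
//   NaiveBayesModel.NaiveBayesParameters parms = new NaiveBayesModel.NaiveBayesParameters();
//   parms._train = fr._key;
//   parms._response_column = "class";
//   parms._laplace = 1.0;
//   NaiveBayesModel model = new NaiveBayes(parms).trainModel().get();
//   // scoring with model.score(fr) then yields per-class probabilities in columns 1..nclasses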