
edu.stanford.nlp.classify.NBLinearClassifierFactory Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools that take raw English text and give the base forms of words, their parts of speech, and whether they are names of companies, people, etc.; normalize dates, times, and numeric quantities; mark up the structure of sentences in terms of phrases and word dependencies; and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.
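For orientation, here is a minimal sketch of driving the analysis tools described above through the standard StanfordCoreNLP pipeline API. The class name PipelineDemo and the input sentence are invented for illustration, and the CoreNLP models jar is assumed to be on the classpath.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
  public static void main(String[] args) {
    // Request tokenization, sentence splitting, POS tagging, lemmatization,
    // and named entity recognition (which also normalizes dates and numbers).
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Stanford University was founded in 1885.");
    pipeline.annotate(document);
    pipeline.prettyPrint(document, System.out); // dump the annotations
  }
}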

package edu.stanford.nlp.classify;

import edu.stanford.nlp.ling.BasicDatum;
import edu.stanford.nlp.optimization.GoldenSectionLineSearch;

import java.util.function.Function;


import edu.stanford.nlp.util.logging.Redwood;

/**
 * Provides a medium-weight implementation of Bernoulli (or binary)
 * Naive Bayes via a linear classifier.  It's medium weight in that
 * it uses dense arrays for counts and calculation (but, hey, NB is
 * efficient to estimate).  Each feature is treated as an independent
 * binary variable.
 * <p>
 * CDM Jun 2003: I added a dirty trick so that if there is a feature
 * that is always on in input examples, then its weight is turned into
 * a prior feature! (This will work well iff it is also always on at
 * test time.) In fact, this is done for each such feature, so by
 * having several such features, one can even get an integral prior
 * boost out of this.
 *
 * @author Dan Klein
 * @author Sarah Spikes ([email protected]) (Templatization)
 *
 * @param <L> The type of the labels in the Classifier
 * @param <F> The type of the features in the Classifier
 */
public class NBLinearClassifierFactory<L, F> extends AbstractLinearClassifierFactory<L, F> {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(NBLinearClassifierFactory.class);

  private static final boolean VERBOSE = false;

  private double sigma;   // amount of add-k smoothing of evidence
  private final boolean interpretAlwaysOnFeatureAsPrior;
  private static final double epsilon = 1e-30; // fudge to keep nonzero
  private boolean tuneSigma = false;
  private int folds;

  final static Redwood.RedwoodChannels logger = Redwood.channels(NBLinearClassifierFactory.class);

  @Override
  protected double[][] trainWeights(GeneralDataset<L, F> data) {
    return trainWeights(data.getDataArray(), data.getLabelsArray());
  }

  /**
   * Train weights.
   * If tuneSigma is true, the optimal sigma value is found using cross-validation:
   * the number of folds is determined by the folds variable;
   * if there are fewer training examples than folds, leave-one-out is used.
   */
  double[][] trainWeights(int[][] data, int[] labels) {
    if (tuneSigma) {
      tuneSigma(data, labels);
    }
    if (VERBOSE) {
      logger.info("NB CF: " + data.length + " data items ");
      for (int i = 0; i < data.length; i++) {
        log.info("Datum " + i + ": " + labels[i] + ":");
        for (int j = 0; j < data[i].length; j++) {
          log.info(" " + data[i][j]);
        }
        logger.info("");
      }
    }
    int numFeatures = numFeatures();
    int numClasses = numClasses();
    double[][] weights = new double[numFeatures][numClasses]; // find P(C|F)/P(C)
    int num = 0;
    double[] numc = new double[numClasses];
    double n = 0;                                           // num active features in whole dataset
    double[] n_c = new double[numClasses];                  // num active features in class c items
    double[] n_f = new double[numFeatures];                 // num data items for which feature is active
    double[][] n_fc = new double[numFeatures][numClasses];  // num times feature active in class c
    for (int d = 0; d < data.length; d++) {
      num++;
      numc[labels[d]]++;
      for (int i = 0; i < data[d].length; i++) {
        n++;
        n_c[labels[d]]++;
        n_f[data[d][i]]++;
        n_fc[data[d][i]][labels[d]]++;
      }
    }
    for (int c = 0; c < numClasses; c++) {
      for (int f = 0; f < numFeatures; f++) {
        if (interpretAlwaysOnFeatureAsPrior && n_f[f] == data.length) {
          // interpret always on feature as prior!
          weights[f][c] = Math.log(numc[c] / num);
        } else {
          // p_c_f = (N(f,c)+k)/(N(f)+|C|k) = Paddk(c|f)
          // set lambda = log (P()/P())
          double p_c = (n_c[c] + epsilon) / (n + numClasses * epsilon);
          double p_c_f = (n_fc[f][c] + sigma) / (n_f[f] + sigma * numClasses);
          if (VERBOSE) {
            logger.info("Prob ratio(f=" + f + ",c=" + c + ") = " + p_c_f / p_c +
                " (nc=" + n_c[c] + ", nf=" + n_f[f] + ", nfc=" + n_fc[f][c] + ")");
          }
          weights[f][c] = Math.log(p_c_f / p_c);
        }
      }
    }
    return weights;
  }

  double[][] weights(int[][] data, int[] labels, int testMin, int testMax, double trialSigma, int foldSize) {
    int numFeatures = numFeatures();
    int numClasses = numClasses();
    double[][] weights = new double[numFeatures][numClasses]; // find P(C|F)/P(C)
    int num = 0;
    double[] numc = new double[numClasses];
    double n = 0;                                           // num active features in whole dataset
    double[] n_c = new double[numClasses];                  // num active features in class c items
    double[] n_f = new double[numFeatures];                 // num data items for which feature is active
    double[][] n_fc = new double[numFeatures][numClasses];  // num times feature active in class c
    for (int d = 0; d < data.length; d++) {
      if (d == testMin) {
        d = testMax - 1;
        continue;
      }
      num++;
      numc[labels[d]]++;
      for (int i = 0; i < data[d].length; i++) {
        if (i == testMin) {
          i = testMax - 1;
          continue;
        }
        n++;
        n_c[labels[d]]++;
        n_f[data[d][i]]++;
        n_fc[data[d][i]][labels[d]]++;
      }
    }
    for (int c = 0; c < numClasses; c++) {
      for (int f = 0; f < numFeatures; f++) {
        if (interpretAlwaysOnFeatureAsPrior && n_f[f] == data.length - foldSize) {
          // interpret always on feature as prior!
          weights[f][c] = Math.log(numc[c] / num);
        } else {
          // p_c_f = (N(f,c)+k)/(N(f)+|C|k) = Paddk(c|f)
          // set lambda = log (P()/P())
          double p_c = (n_c[c] + epsilon) / (n + numClasses * epsilon);
          double p_c_f = (n_fc[f][c] + trialSigma) / (n_f[f] + trialSigma * numClasses);
          weights[f][c] = Math.log(p_c_f / p_c);
        }
      }
    }
    return weights;
  }

  private void tuneSigma(final int[][] data, final int[] labels) {
    Function<Double, Double> CVSigmaToPerplexity = trialSigma -> {
      double score = 0.0;
      double sumScore = 0.0;
      int foldSize, nbCV;
      logger.info("Trying sigma = " + trialSigma);
      // test if enough training data
      if (data.length >= folds) {
        foldSize = data.length / folds;
        nbCV = folds;
      } else { // leave-one-out
        foldSize = 1;
        nbCV = data.length;
      }
      for (int j = 0; j < nbCV; j++) {
        //System.out.println("CV j: "+ j);
        int testMin = j * foldSize;
        int testMax = testMin + foldSize;
        LinearClassifier<L, F> c = new LinearClassifier<>(weights(data, labels, testMin, testMax, trialSigma, foldSize), featureIndex, labelIndex);
        for (int i = testMin; i < testMax; i++) {
          //System.out.println("test i: "+ i + " "+ new BasicDatum(featureIndex.objects(data[i])));
          score -= c.logProbabilityOf(new BasicDatum<>(featureIndex.objects(data[i]))).getCount(labelIndex.get(labels[i]));
        }
        //System.err.printf("%d: %8g%n", j, score);
        sumScore += score;
      }
      System.err.printf(": %8g%n", sumScore);
      return sumScore;
    };
    GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true);
    sigma = gsls.minimize(CVSigmaToPerplexity, 0.01, 0.0001, 2.0);
    System.out.println("Sigma used: " + sigma);
  }

  /**
   * Create a ClassifierFactory.
   */
  public NBLinearClassifierFactory() {
    this(1.0);
  }

  /**
   * Create a ClassifierFactory.
   *
   * @param sigma The amount of add-sigma smoothing of evidence
   */
  public NBLinearClassifierFactory(double sigma) {
    this(sigma, false);
  }

  /**
   * Create a ClassifierFactory.
   *
   * @param sigma The amount of add-sigma smoothing of evidence
   * @param interpretAlwaysOnFeatureAsPrior If true, a feature that is in every
   *     data item is interpreted as an indication to include a prior
   *     factor over classes. (If there are multiple such features, an
   *     integral "prior boost" will occur.) If false, an always-on
   *     feature is interpreted as an evidence feature (and, following
   *     the standard math) will have no effect on the model.
   */
  public NBLinearClassifierFactory(double sigma, boolean interpretAlwaysOnFeatureAsPrior) {
    this.sigma = sigma;
    this.interpretAlwaysOnFeatureAsPrior = interpretAlwaysOnFeatureAsPrior;
  }

  /**
   * setTuneSigmaCV sets the tuneSigma flag: when turned on,
   * sigma is tuned by cross-validation.
   * If there is less data than the number of folds, leave-one-out is used.
   * The default for tuneSigma is false.
   *
   * @param folds Number of folds for cross-validation
   */
  public void setTuneSigmaCV(int folds) {
    tuneSigma = true;
    this.folds = folds;
  }

  private static final long serialVersionUID = 1;

}
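As a usage note (not part of the original source), here is a minimal sketch of training with this factory via the companion Dataset, BasicDatum, and LinearClassifier classes from the same packages. The class name NBDemo and the toy feature/label strings are invented for illustration; the always-on "PRIOR" feature exercises the interpretAlwaysOnFeatureAsPrior trick described in the class Javadoc, and setTuneSigmaCV enables the cross-validated sigma search implemented above.

import java.util.Arrays;

import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.classify.NBLinearClassifierFactory;
import edu.stanford.nlp.ling.BasicDatum;

public class NBDemo {
  public static void main(String[] args) {
    // Toy training set: each datum is a set of binary features plus a label.
    // "PRIOR" is on in every datum, so with interpretAlwaysOnFeatureAsPrior
    // it is folded into a class prior rather than treated as evidence.
    Dataset<String, String> train = new Dataset<>();
    train.add(new BasicDatum<>(Arrays.asList("rain", "wind", "PRIOR"), "bad"));
    train.add(new BasicDatum<>(Arrays.asList("sun", "warm", "PRIOR"), "good"));
    train.add(new BasicDatum<>(Arrays.asList("sun", "calm", "PRIOR"), "good"));

    NBLinearClassifierFactory<String, String> factory =
        new NBLinearClassifierFactory<>(1.0, true); // add-1 smoothing, prior trick on
    factory.setTuneSigmaCV(3); // tune sigma by 3-fold CV (leave-one-out if data < folds)

    LinearClassifier<String, String> classifier = factory.trainClassifier(train);
    System.out.println(classifier.classOf(
        new BasicDatum<>(Arrays.asList("sun", "wind", "PRIOR"))));
  }
}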




