// weka.classifiers.bayes.NaiveBayes Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    NaiveBayes.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.bayes;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.estimators.DiscreteEstimator;
import weka.estimators.Estimator;
import weka.estimators.KernelEstimator;
import weka.estimators.NormalEstimator;

/**
 * Class for a Naive Bayes classifier using estimator
 * classes. Numeric estimator precision values are chosen based on analysis of
 * the training data. For this reason, the classifier is not an
 * UpdateableClassifier (which in typical usage are initialized with zero
 * training instances) -- if you need the UpdateableClassifier functionality,
 * use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable classifier
 * will use a default precision of 0.1 for numeric attributes when
 * buildClassifier is called with zero training instances.
 * <p>
 * For more information on Naive Bayes classifiers, see
 * <p>
 * George H. John, Pat Langley: Estimating Continuous Distributions in Bayesian
 * Classifiers. In: Eleventh Conference on Uncertainty in Artificial
 * Intelligence, San Mateo, 338-345, 1995.
 * <p>
 * BibTeX:
 *
 * <pre>
 * &#64;inproceedings{John1995,
 *    address = {San Mateo},
 *    author = {George H. John and Pat Langley},
 *    booktitle = {Eleventh Conference on Uncertainty in Artificial Intelligence},
 *    pages = {338-345},
 *    publisher = {Morgan Kaufmann},
 *    title = {Estimating Continuous Distributions in Bayesian Classifiers},
 *    year = {1995}
 * }
 * </pre>
 * <p>
 * Valid options are:
 *
 * <pre>
 * -K
 *  Use kernel density estimator rather than normal
 *  distribution for numeric attributes
 * </pre>
 *
 * <pre>
 * -D
 *  Use supervised discretization to process numeric attributes
 * </pre>
 *
 * <pre>
 * -O
 *  Display model in old format (good when there are many classes)
 * </pre>
 *
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 15232 $
 */
public class NaiveBayes extends AbstractClassifier implements OptionHandler,
  WeightedInstancesHandler, WeightedAttributesHandler, TechnicalInformationHandler,
  Aggregateable {

  /** Serialization version identifier. */
  static final long serialVersionUID = 5995231201785697655L;

  /**
   * The attribute estimators, indexed as [attribute][class]. The first index
   * counts the attributes in the order produced by enumerateAttributes() on
   * the training header; the second index is the class value.
   */
  protected Estimator[][] m_Distributions;

  /**
   * The class estimator. It is updated with the class value and weight of
   * every training instance and supplies the starting probability for each
   * class at prediction time.
   */
  protected Estimator m_ClassDistribution;

  /**
   * Whether to use a kernel density estimator rather than a normal
   * distribution for numeric attributes.
   */
  protected boolean m_UseKernelEstimator = false;

  /**
   * Whether to use supervised discretization rather than a normal
   * distribution for numeric attributes.
   */
  protected boolean m_UseDiscretization = false;

  /** The number of classes (or 1 for numeric class). */
  protected int m_NumClasses;

  /**
   * The dataset header for the purposes of printing out a semi-intelligible
   * model. During buildClassifier it temporarily holds the (possibly
   * discretized) training data and is cut back to an empty copy at the end.
   */
  protected Instances m_Instances;

  /**
   * The fallback precision for numeric attribute estimators. buildClassifier
   * uses it when no precision can be derived from the training data, e.g.
   * when fewer than two distinct values of the attribute were observed.
   */
  protected static final double DEFAULT_NUM_PRECISION = 0.01;

  /**
   * The discretization filter; null while discretization is not in use.
   */
  protected weka.filters.supervised.attribute.Discretize m_Disc = null;

  /**
   * Whether to display the model in the old format. Presumably controlled by
   * the -O option; the accessor is not visible in this part of the file.
   */
  protected boolean m_displayModelInOldFormat = false;

  /**
   * Builds the human-readable description of this classifier, followed by
   * the textual form of its technical reference.
   * 
   * @return a description of the classifier suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    StringBuilder description = new StringBuilder();

    // What the classifier is and how numeric precision is chosen.
    description.append("Class for a Naive Bayes classifier using estimator classes.");
    description.append(" Numeric estimator precision values are chosen based on analysis of the  training data.");

    // Why it is not updateable, and which classifier to use instead.
    description.append(" For this reason, the classifier is not an UpdateableClassifier");
    description.append(" (which in typical usage are initialized with zero training instances)");
    description.append(" -- if you need the UpdateableClassifier functionality,");
    description.append(" use the NaiveBayesUpdateable classifier.");
    description.append(" The NaiveBayesUpdateable classifier will  use a default precision of 0.1");
    description.append(" for numeric attributes when buildClassifier is called with zero training instances.\n\n");

    // Pointer to the literature.
    description.append("For more information on Naive Bayes classifiers, see\n\n");
    description.append(getTechnicalInformation().toString());

    return description.toString();
  }

  /**
   * Describes the publication this classifier is based on (John and Langley,
   * 1995) as a TechnicalInformation record of type INPROCEEDINGS.
   * 
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    final TechnicalInformation paper =
      new TechnicalInformation(Type.INPROCEEDINGS);

    // Who wrote what, and where it appeared.
    paper.setValue(Field.AUTHOR, "George H. John and Pat Langley");
    paper.setValue(Field.TITLE,
      "Estimating Continuous Distributions in Bayesian Classifiers");
    paper.setValue(Field.BOOKTITLE,
      "Eleventh Conference on Uncertainty in Artificial Intelligence");

    // Publication details.
    paper.setValue(Field.YEAR, "1995");
    paper.setValue(Field.PAGES, "338-345");
    paper.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    paper.setValue(Field.ADDRESS, "San Mateo");

    return paper;
  }

  /**
   * Returns the default capabilities of the classifier: nominal and numeric
   * attributes (missing values allowed), a nominal class (missing class
   * values allowed), and no minimum number of training instances.
   * 
   * @return the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    final Capabilities caps = super.getCapabilities();

    // Start from a clean slate and switch on only what is supported.
    caps.disableAll();

    final Capability[] supported = {
      // attributes
      Capability.NOMINAL_ATTRIBUTES,
      Capability.NUMERIC_ATTRIBUTES,
      Capability.MISSING_VALUES,
      // class
      Capability.NOMINAL_CLASS,
      Capability.MISSING_CLASS_VALUES
    };
    for (Capability capability : supported) {
      caps.enable(capability);
    }

    // instances: an empty training set is accepted
    caps.setMinimumNumberInstances(0);

    return caps;
  }

  /**
   * Generates the classifier.
   * <p>
   * The training data is copied, instances with a missing class are dropped
   * and, if requested, the data is passed through the supervised
   * discretization filter. One estimator is then created per (attribute,
   * class) pair and every training instance is fed through
   * {@link #updateClassifier(Instance)}. Afterwards only an empty copy of the
   * dataset (the header) is retained.
   * 
   * @param instances set of instances serving as training data
   * @exception IllegalArgumentException if both kernel density estimation and
   *              supervised discretization are switched on
   * @exception Exception if the classifier has not been generated successfully
   */
  @Override
  public void buildClassifier(Instances instances) throws Exception {

    if (getUseKernelEstimator() && getUseSupervisedDiscretization()) {
      throw new IllegalArgumentException("Cannot use both kernel density estimation and discretization!");
    }
    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class (working on a copy)
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_NumClasses = instances.numClasses();

    // Copy the instances
    m_Instances = new Instances(instances);

    // Discretize instances if required
    if (m_UseDiscretization) {
      m_Disc = new weka.filters.supervised.attribute.Discretize();
      m_Disc.setInputFormat(m_Instances);
      m_Instances = weka.filters.Filter.useFilter(m_Instances, m_Disc);
    } else {
      m_Disc = null;
    }

    // Reserve space for the distributions: one row per non-class attribute,
    // one column per class value
    m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances
      .numClasses()];
    m_ClassDistribution = new DiscreteEstimator(m_Instances.numClasses(), true);
    int attIndex = 0;
    Enumeration<Attribute> enu = m_Instances.enumerateAttributes();
    while (enu.hasMoreElements()) {
      Attribute attribute = enu.nextElement();

      // If the attribute is numeric, determine the estimator
      // numeric precision from differences between adjacent values
      double numPrecision = DEFAULT_NUM_PRECISION;
      if (attribute.type() == Attribute.NUMERIC) {
        // Sorting reorders m_Instances in place; the scan below stops at the
        // first missing value it meets.
        m_Instances.sort(attribute);
        if ((m_Instances.numInstances() > 0)
          && !m_Instances.instance(0).isMissing(attribute)) {
          double lastVal = m_Instances.instance(0).value(attribute);
          double currentVal, deltaSum = 0;
          int distinct = 0;
          for (int i = 1; i < m_Instances.numInstances(); i++) {
            Instance currentInst = m_Instances.instance(i);
            if (currentInst.isMissing(attribute)) {
              break;
            }
            currentVal = currentInst.value(attribute);
            if (currentVal != lastVal) {
              deltaSum += currentVal - lastVal;
              lastVal = currentVal;
              distinct++;
            }
          }
          // Mean gap between adjacent distinct values; keep the default when
          // there is no gap to measure.
          if (distinct > 0) {
            numPrecision = deltaSum / distinct;
          }
        }
      }

      // One estimator per class value for this attribute
      for (int j = 0; j < m_Instances.numClasses(); j++) {
        switch (attribute.type()) {
        case Attribute.NUMERIC:
          if (m_UseKernelEstimator) {
            m_Distributions[attIndex][j] = new KernelEstimator(numPrecision);
          } else {
            m_Distributions[attIndex][j] = new NormalEstimator(numPrecision);
          }
          break;
        case Attribute.NOMINAL:
          m_Distributions[attIndex][j] = new DiscreteEstimator(
            attribute.numValues(), true);
          break;
        default:
          throw new Exception("Attribute type unknown to NaiveBayes");
        }
      }
      attIndex++;
    }

    // Compute counts
    Enumeration<Instance> enumInsts = m_Instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      Instance instance = enumInsts.nextElement();
      updateClassifier(instance);
    }

    // Save space: keep only an empty copy of the dataset
    m_Instances = new Instances(m_Instances, 0);
  }

  /**
   * Updates the classifier with the given instance. Instances whose class
   * value is missing are ignored. For every other instance, each non-missing
   * attribute value is added, with the instance's weight, to the estimator of
   * that attribute for the instance's class, and the class estimator is
   * updated as well.
   * 
   * @param instance the new training instance to include in the model
   * @exception Exception if the instance could not be incorporated in the
   *              model.
   */
  public void updateClassifier(Instance instance) throws Exception {

    if (!instance.classIsMissing()) {
      // Both values are the same for every attribute of this instance.
      int classIndex = (int) instance.classValue();
      double weight = instance.weight();

      // attIndex follows the enumeration order, which is how
      // m_Distributions is laid out.
      Enumeration<Attribute> enumAtts = m_Instances.enumerateAttributes();
      int attIndex = 0;
      while (enumAtts.hasMoreElements()) {
        Attribute attribute = enumAtts.nextElement();
        if (!instance.isMissing(attribute)) {
          m_Distributions[attIndex][classIndex].addValue(
            instance.value(attribute), weight);
        }
        attIndex++;
      }
      m_ClassDistribution.addValue(instance.classValue(), weight);
    }
  }

  /**
   * Calculates the class membership probabilities for the given test instance.
   * <p>
   * Starting from the class estimator's probability for each class, the
   * running product is multiplied, for every non-missing attribute, by that
   * attribute's estimated probability raised to the power of the attribute's
   * weight (floored at 1e-75). The result is normalized before it is
   * returned. If discretization is in use, the instance is first passed
   * through the discretization filter.
   * 
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @exception Exception if there is a problem generating the prediction
   */
  @Override
  public double[] distributionForInstance(Instance instance) throws Exception {

    if (m_UseDiscretization) {
      m_Disc.input(instance);
      instance = m_Disc.output();
    }
    double[] probs = new double[m_NumClasses];
    for (int j = 0; j < m_NumClasses; j++) {
      probs[j] = m_ClassDistribution.getProbability(j);
    }
    Enumeration<Attribute> enumAtts = instance.enumerateAttributes();
    int attIndex = 0;
    while (enumAtts.hasMoreElements()) {
      Attribute attribute = enumAtts.nextElement();
      if (!instance.isMissing(attribute)) {
        double attValue = instance.value(attribute);
        // attIndex counts only the non-class attributes, so it is the right
        // index into m_Distributions but NOT into the dataset header when the
        // class attribute is not the last one. Look the weight up by the
        // attribute's own position in the dataset instead.
        double attWeight = m_Instances.attribute(attribute.index()).weight();
        double temp, max = 0;
        for (int j = 0; j < m_NumClasses; j++) {
          temp = Math.max(1e-75, Math.pow(m_Distributions[attIndex][j]
            .getProbability(attValue), attWeight));
          probs[j] *= temp;
          if (probs[j] > max) {
            max = probs[j];
          }
          if (Double.isNaN(probs[j])) {
            throw new Exception("NaN returned from estimator for attribute "
              + attribute.name() + ":\n"
              + m_Distributions[attIndex][j].toString());
          }
        }
        if ((max > 0) && (max < 1e-75)) { // Danger of probability underflow
          // Rescale every class by the same factor; the common factor is
          // expected to drop out in the normalization below.
          for (int j = 0; j < m_NumClasses; j++) {
            probs[j] *= 1e75;
          }
        }
      }
      attIndex++;
    }

    // Normalize the products into a probability distribution
    Utils.normalize(probs);
    return probs;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration