All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.classifiers.bayes.NaiveBayes Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    NaiveBayes.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.bayes;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.estimators.DiscreteEstimator;
import weka.estimators.Estimator;
import weka.estimators.KernelEstimator;
import weka.estimators.NormalEstimator;

/**
 *  Class for a Naive Bayes classifier using estimator
 * classes. Numeric estimator precision values are chosen based on analysis of
 * the training data. For this reason, the classifier is not an
 * UpdateableClassifier (which in typical usage are initialized with zero
 * training instances) -- if you need the UpdateableClassifier functionality,
 * use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable classifier
 * will use a default precision of 0.1 for numeric attributes when
 * buildClassifier is called with zero training instances.
*
* For more information on Naive Bayes classifiers, see
*
* George H. John, Pat Langley: Estimating Continuous Distributions in Bayesian * Classifiers. In: Eleventh Conference on Uncertainty in Artificial * Intelligence, San Mateo, 338-345, 1995. *

* * * BibTeX: * *

 * @inproceedings{John1995,
 *    address = {San Mateo},
 *    author = {George H. John and Pat Langley},
 *    booktitle = {Eleventh Conference on Uncertainty in Artificial Intelligence},
 *    pages = {338-345},
 *    publisher = {Morgan Kaufmann},
 *    title = {Estimating Continuous Distributions in Bayesian Classifiers},
 *    year = {1995}
 * }
 * 
*

* * * Valid options are: *

* *

 * -K
 *  Use kernel density estimator rather than normal
 *  distribution for numeric attributes
 * 
* *
 * -D
 *  Use supervised discretization to process numeric attributes
 * 
* *
 * -O
 *  Display model in old format (good when there are many classes)
 * 
* * * * @author Len Trigg ([email protected]) * @author Eibe Frank ([email protected]) * @version $Revision: 15232 $ */ public class NaiveBayes extends AbstractClassifier implements OptionHandler, WeightedInstancesHandler, WeightedAttributesHandler, TechnicalInformationHandler, Aggregateable { /** for serialization */ static final long serialVersionUID = 5995231201785697655L; /** The attribute estimators. */ protected Estimator[][] m_Distributions; /** The class estimator. */ protected Estimator m_ClassDistribution; /** * Whether to use kernel density estimator rather than normal distribution for * numeric attributes */ protected boolean m_UseKernelEstimator = false; /** * Whether to use discretization than normal distribution for numeric * attributes */ protected boolean m_UseDiscretization = false; /** The number of classes (or 1 for numeric class) */ protected int m_NumClasses; /** * The dataset header for the purposes of printing out a semi-intelligible * model */ protected Instances m_Instances; /*** The precision parameter used for numeric attributes */ protected static final double DEFAULT_NUM_PRECISION = 0.01; /** * The discretization filter. */ protected weka.filters.supervised.attribute.Discretize m_Disc = null; protected boolean m_displayModelInOldFormat = false; /** * Returns a string describing this classifier * * @return a description of the classifier suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Class for a Naive Bayes classifier using estimator classes. Numeric" + " estimator precision values are chosen based on analysis of the " + " training data. For this reason, the classifier is not an" + " UpdateableClassifier (which in typical usage are initialized with zero" + " training instances) -- if you need the UpdateableClassifier functionality," + " use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable" + " classifier will use a default precision of 0.1 for numeric attributes" + " when buildClassifier is called with zero training instances.\n\n" + "For more information on Naive Bayes classifiers, see\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "George H. John and Pat Langley"); result.setValue(Field.TITLE, "Estimating Continuous Distributions in Bayesian Classifiers"); result.setValue(Field.BOOKTITLE, "Eleventh Conference on Uncertainty in Artificial Intelligence"); result.setValue(Field.YEAR, "1995"); result.setValue(Field.PAGES, "338-345"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann"); result.setValue(Field.ADDRESS, "San Mateo"); return result; } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable( Capability.MISSING_VALUES ); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(0); return result; } /** * Generates the classifier. * * @param instances set of instances serving as training data * @exception Exception if the classifier has not been generated successfully */ @Override public void buildClassifier(Instances instances) throws Exception { if (getUseKernelEstimator() && getUseSupervisedDiscretization()) { throw new IllegalArgumentException("Cannot use both kernel density estimation and discretization!"); } // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_NumClasses = instances.numClasses(); // Copy the instances m_Instances = new Instances(instances); // Discretize instances if required if (m_UseDiscretization) { m_Disc = new weka.filters.supervised.attribute.Discretize(); m_Disc.setInputFormat(m_Instances); m_Instances = weka.filters.Filter.useFilter(m_Instances, m_Disc); } else { m_Disc = null; } // Reserve space for the distributions m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances .numClasses()]; m_ClassDistribution = new DiscreteEstimator(m_Instances.numClasses(), true); int attIndex = 0; Enumeration enu = m_Instances.enumerateAttributes(); while (enu.hasMoreElements()) { Attribute attribute = enu.nextElement(); // If the attribute is numeric, determine the estimator // numeric precision from differences between adjacent values double numPrecision = DEFAULT_NUM_PRECISION; if (attribute.type() == Attribute.NUMERIC) { m_Instances.sort(attribute); if ((m_Instances.numInstances() > 0) && !m_Instances.instance(0).isMissing(attribute)) { double lastVal = m_Instances.instance(0).value(attribute); double currentVal, deltaSum = 0; int distinct = 0; for (int i = 1; i < m_Instances.numInstances(); i++) { Instance currentInst = m_Instances.instance(i); if (currentInst.isMissing(attribute)) { break; } currentVal = currentInst.value(attribute); if (currentVal != lastVal) { deltaSum += currentVal - lastVal; lastVal = currentVal; distinct++; } } if (distinct > 0) { numPrecision = deltaSum / distinct; } } } for (int j = 0; j < m_Instances.numClasses(); j++) { switch (attribute.type()) { case Attribute.NUMERIC: if (m_UseKernelEstimator) { m_Distributions[attIndex][j] = new KernelEstimator(numPrecision); } else { m_Distributions[attIndex][j] = new NormalEstimator(numPrecision); } break; case Attribute.NOMINAL: m_Distributions[attIndex][j] = new DiscreteEstimator( attribute.numValues(), true); break; default: throw new Exception("Attribute type unknown to NaiveBayes"); } } attIndex++; } // Compute counts Enumeration enumInsts = m_Instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { Instance instance = enumInsts.nextElement(); updateClassifier(instance); } // Save space m_Instances = new Instances(m_Instances, 0); } /** * Updates the classifier with the given instance. * * @param instance the new training instance to include in the model * @exception Exception if the instance could not be incorporated in the * model. */ public void updateClassifier(Instance instance) throws Exception { if (!instance.classIsMissing()) { Enumeration enumAtts = m_Instances.enumerateAttributes(); int attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (!instance.isMissing(attribute)) { m_Distributions[attIndex][(int) instance.classValue()].addValue( instance.value(attribute), instance.weight()); } attIndex++; } m_ClassDistribution.addValue(instance.classValue(), instance.weight()); } } /** * Calculates the class membership probabilities for the given test instance. * * @param instance the instance to be classified * @return predicted class probability distribution * @exception Exception if there is a problem generating the prediction */ @Override public double[] distributionForInstance(Instance instance) throws Exception { if (m_UseDiscretization) { m_Disc.input(instance); instance = m_Disc.output(); } double[] probs = new double[m_NumClasses]; for (int j = 0; j < m_NumClasses; j++) { probs[j] = m_ClassDistribution.getProbability(j); } Enumeration enumAtts = instance.enumerateAttributes(); int attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (!instance.isMissing(attribute)) { double temp, max = 0; for (int j = 0; j < m_NumClasses; j++) { temp = Math.max(1e-75, Math.pow(m_Distributions[attIndex][j] .getProbability(instance.value(attribute)), m_Instances.attribute(attIndex).weight())); probs[j] *= temp; if (probs[j] > max) { max = probs[j]; } if (Double.isNaN(probs[j])) { throw new Exception("NaN returned from estimator for attribute " + attribute.name() + ":\n" + m_Distributions[attIndex][j].toString()); } } if ((max > 0) && (max < 1e-75)) { // Danger of probability underflow for (int j = 0; j < m_NumClasses; j++) { probs[j] *= 1e75; } } } attIndex++; } // Display probabilities Utils.normalize(probs); return probs; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy