weka.classifiers.meta.Decorate Maven / Gradle / Ivy
Show all versions of decorate Show documentation
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Decorate.java
* Copyright (C) 2002-2012 Prem Melville
*
*/
package weka.classifiers.meta;
import weka.classifiers.Classifier;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.RandomizableIteratedSingleClassifierEnhancer;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.UnsupportedClassTypeException;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* DECORATE is a meta-learner for building diverse ensembles of classifiers by using specially constructed artificial training examples. Comprehensive experiments have demonstrated that this technique is consistently more accurate than the base classifier, Bagging and Random Forests. Decorate also obtains higher accuracy than Boosting on small training sets, and achieves comparable performance on larger training sets.
*
* For more details see:
*
* P. Melville, R. J. Mooney: Constructing Diverse Classifier Ensembles Using Artificial Training Examples. In: Eighteenth International Joint Conference on Artificial Intelligence, 505-510, 2003.
*
* P. Melville, R. J. Mooney (2004). Creating Diversity in Ensembles Using Artificial Data. Information Fusion: Special Issue on Diversity in Multiclassifier Systems..
*
*
* BibTeX:
*
* @inproceedings{Melville2003,
* author = {P. Melville and R. J. Mooney},
* booktitle = {Eighteenth International Joint Conference on Artificial Intelligence},
* pages = {505-510},
* title = {Constructing Diverse Classifier Ensembles Using Artificial Training Examples},
* year = {2003}
* }
*
* @article{Melville2004,
* author = {P. Melville and R. J. Mooney},
* journal = {Information Fusion: Special Issue on Diversity in Multiclassifier Systems},
* note = {submitted},
* title = {Creating Diversity in Ensembles Using Artificial Data},
* year = {2004}
* }
*
*
*
* Valid options are:
*
* -E
* Desired size of ensemble.
* (default 15)
*
* -R
* Factor that determines number of artificial examples to generate.
* Specified proportional to training set size.
* (default 1.0)
*
* -S <num>
* Random number seed.
* (default 1)
*
* -I <num>
* Number of iterations.
* (default 50)
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
* -W
* Full name of base classifier.
* (default: weka.classifiers.trees.J48)
*
*
* Options specific to classifier weka.classifiers.trees.J48:
*
*
* -U
* Use unpruned tree.
*
* -C <pruning confidence>
* Set confidence threshold for pruning.
* (default 0.25)
*
* -M <minimum number of instances>
* Set minimum number of instances per leaf.
* (default 2)
*
* -R
* Use reduced error pruning.
*
* -N <number of folds>
* Set number of folds for reduced error
* pruning. One fold is used as pruning set.
* (default 3)
*
* -B
* Use binary splits only.
*
* -S
* Don't perform subtree raising.
*
* -L
* Do not clean up after the tree has been built.
*
* -A
* Laplace smoothing for predicted probabilities.
*
* -Q <seed>
* Seed for random data shuffling (default 1).
*
*
* Options after -- are passed to the designated classifier.
*
* @author Prem Melville ([email protected])
* @version $Revision: 8038 $
*/
public class Decorate
extends RandomizableIteratedSingleClassifierEnhancer
implements TechnicalInformationHandler {
/** Fixed serialization version id, so that serialized models stay readable across recompiles. */
static final long serialVersionUID = -6020193348750269931L;
/**
 * Classifiers that make up the committee/ensemble. Stays null until
 * buildClassifier creates a fresh Vector and adds the first trained member.
 */
protected Vector m_Committee = null;
/**
 * The desired ensemble size (15 unless changed through setDesiredSize or the
 * -E option). Also passed to setMinimumNumberInstances in getCapabilities.
 */
protected int m_DesiredSize = 15;
/**
 * Amount of artificial/random instances to use - specified as a
 * fraction of the training data size. buildClassifier multiplies its
 * absolute value by the number of training instances and uses at least one
 * artificial instance.
 */
protected double m_ArtSize = 1.0 ;
/**
 * The random number generator. Seeded with 0 here; buildClassifier replaces
 * it with one derived from m_Seed (or an unseeded one when m_Seed is -1).
 */
protected Random m_Random = new Random(0);
/**
 * Attribute statistics - used for generating artificial examples.
 * NOTE(review): presumably populated by computeStats during buildClassifier;
 * that method is not visible here - confirm.
 */
protected Vector m_AttributeStats = null;
/**
 * Creates a Decorate meta-classifier with its default configuration:
 * J48 as the base learner and an upper bound of 50 iterations.
 */
public Decorate() {
  // Iteration cap first; the two assignments are independent of each other.
  this.m_NumIterations = 50;
  this.m_Classifier = new weka.classifiers.trees.J48();
}
/**
 * Names the base classifier that is used when none is specified.
 *
 * @return the fully qualified class name of the default base classifier
 */
protected String defaultClassifierString() {
  final String defaultBaseLearner = "weka.classifiers.trees.J48";
  return defaultBaseLearner;
}
/**
 * Returns an enumeration describing the available options.
 *
 * The two Decorate-specific options (-E and -R) come first, followed by the
 * options reported by the superclass, minus the entry at position 4.
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions() {
  Vector newVector = new Vector(8);

  // The advertised default must match m_DesiredSize and the fallback used
  // by setOptions, which are both 15.
  newVector.addElement(new Option(
      "\tDesired size of ensemble.\n"
      + "\t(default 15)",
      "E", 1, "-E"));
  newVector.addElement(new Option(
      "\tFactor that determines number of artificial examples to generate.\n"
      + "\tSpecified proportional to training set size.\n"
      + "\t(default 1.0)",
      "R", 1, "-R"));

  Enumeration enu = super.listOptions();
  while (enu.hasMoreElements()) {
    newVector.addElement(enu.nextElement());
  }

  // remove the super class num iterations option because
  // we have a different default (50)
  // NOTE(review): the removal is positional, so it silently depends on the
  // order in which the superclass lists its options - confirm that index 4
  // is still the num-iterations entry.
  newVector.remove(4);
  return newVector.elements();
}
/**
 * Parses a given list of options. <p/>
 *
 * Valid options are: <p/>
 *
 * <pre> -E
 *  Desired size of ensemble.
 *  (default 15)</pre>
 *
 * <pre> -R
 *  Factor that determines number of artificial examples to generate.
 *  Specified proportional to training set size.
 *  (default 1.0)</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *
 * <pre> -I &lt;num&gt;
 *  Number of iterations.
 *  (the value used when -I is absent is chosen by the superclass parser;
 *  the no-argument constructor itself starts at 50)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.trees.J48)</pre>
 *
 * <pre>
 * Options specific to classifier weka.classifiers.trees.J48:
 * </pre>
 *
 * <pre> -U
 *  Use unpruned tree.</pre>
 *
 * <pre> -C &lt;pruning confidence&gt;
 *  Set confidence threshold for pruning.
 *  (default 0.25)</pre>
 *
 * <pre> -M &lt;minimum number of instances&gt;
 *  Set minimum number of instances per leaf.
 *  (default 2)</pre>
 *
 * <pre> -R
 *  Use reduced error pruning.</pre>
 *
 * <pre> -N &lt;number of folds&gt;
 *  Set number of folds for reduced error
 *  pruning. One fold is used as pruning set.
 *  (default 3)</pre>
 *
 * <pre> -B
 *  Use binary splits only.</pre>
 *
 * <pre> -S
 *  Don't perform subtree raising.</pre>
 *
 * <pre> -L
 *  Do not clean up after the tree has been built.</pre>
 *
 * <pre> -A
 *  Laplace smoothing for predicted probabilities.</pre>
 *
 * <pre> -Q &lt;seed&gt;
 *  Seed for random data shuffling (default 1).</pre>
 *
 * Options after -- are passed to the designated classifier.<p/>
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
public void setOptions(String[] options) throws Exception {
  // -E: ensemble size; falls back to 15 when the flag is absent.
  final String sizeArg = Utils.getOption('E', options);
  setDesiredSize(sizeArg.length() == 0 ? 15 : Integer.parseInt(sizeArg));

  // -R: artificial-data factor; falls back to 1.0 when the flag is absent.
  final String factorArg = Utils.getOption('R', options);
  setArtificialSize(factorArg.length() == 0 ? 1.0 : Double.parseDouble(factorArg));

  // Everything else is left to the superclass parser.
  super.setOptions(options);
}
/**
 * Gets the current settings of the Classifier.
 *
 * The result starts with the Decorate-specific pairs "-E &lt;size&gt;" and
 * "-R &lt;factor&gt;", followed by the superclass options in their original
 * order.
 *
 * @return an array of strings suitable for passing to setOptions
 */
public String [] getOptions() {
  String[] superOptions = super.getOptions();

  // Exactly four extra slots are needed (two flags plus their values), so
  // the array is completely filled and no padding pass is required.
  String[] options = new String[superOptions.length + 4];
  options[0] = "-E";
  options[1] = String.valueOf(getDesiredSize());
  options[2] = "-R";
  options[3] = String.valueOf(getArtificialSize());
  System.arraycopy(superOptions, 0, options, 4, superOptions.length);
  return options;
}
/**
 * Returns the tip text for the desiredSize property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String desiredSizeTipText() {
  StringBuilder tip = new StringBuilder();
  tip.append("the desired number of member classifiers in the Decorate ensemble. ");
  tip.append("Decorate may terminate before this size is reached (depending on the value of numIterations). ");
  tip.append("Larger ensemble sizes usually lead to more accurate models, but increases training time and model complexity.");
  return tip.toString();
}
/**
 * Returns the tip text for the numIterations property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String numIterationsTipText() {
  return "the maximum number of Decorate iterations to run. Each iteration generates a classifier, "
      + "but does not necessarily add it to the ensemble. Decorate stops when the desired ensemble "
      + "size is reached. This parameter should be greater than or "
      + "equal to the desiredSize. If the desiredSize is not being reached it may help to "
      + "increase this value.";
}
/**
 * Returns the tip text for the artificialSize property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String artificialSizeTipText() {
  StringBuilder tip = new StringBuilder();
  tip.append("determines the number of artificial examples to use during training. ");
  tip.append("Specified as a proportion of the training data. ");
  tip.append("Higher values can increase ensemble diversity.");
  return tip.toString();
}
/**
 * Returns a string describing the classifier: a short summary of the
 * DECORATE technique followed by the textual form of the references
 * returned by getTechnicalInformation().
 *
 * @return a description suitable for
 * displaying in the explorer/experimenter gui
 */
public String globalInfo() {
  return "DECORATE is a meta-learner for building diverse ensembles of "
      + "classifiers by using specially constructed artificial training "
      + "examples. Comprehensive experiments have demonstrated that this "
      + "technique is consistently more accurate than the base classifier, Bagging and Random Forests. "
      + "Decorate also obtains higher accuracy than Boosting on small training sets, and achieves "
      + "comparable performance on larger training sets. \n\n"
      + "For more details see: \n\n"
      + getTechnicalInformation().toString();
}
/**
 * Builds the bibliographic record for this class: the IJCAI 2003
 * conference paper as the primary entry, with the 2004 Information Fusion
 * article attached to it as an additional entry.
 *
 * @return the technical information about this class
 */
public TechnicalInformation getTechnicalInformation() {
  final String authors = "P. Melville and R. J. Mooney";

  // Primary reference: the conference paper.
  TechnicalInformation conferencePaper = new TechnicalInformation(Type.INPROCEEDINGS);
  conferencePaper.setValue(Field.AUTHOR, authors);
  conferencePaper.setValue(Field.TITLE,
      "Constructing Diverse Classifier Ensembles Using Artificial Training Examples");
  conferencePaper.setValue(Field.BOOKTITLE,
      "Eighteenth International Joint Conference on Artificial Intelligence");
  conferencePaper.setValue(Field.YEAR, "2003");
  conferencePaper.setValue(Field.PAGES, "505-510");

  // Secondary reference: the journal article, hung off the primary entry.
  TechnicalInformation journalArticle = conferencePaper.add(Type.ARTICLE);
  journalArticle.setValue(Field.AUTHOR, authors);
  journalArticle.setValue(Field.TITLE,
      "Creating Diversity in Ensembles Using Artificial Data");
  journalArticle.setValue(Field.JOURNAL,
      "Information Fusion: Special Issue on Diversity in Multiclassifier Systems");
  journalArticle.setValue(Field.YEAR, "2004");
  journalArticle.setValue(Field.NOTE, "submitted");

  return conferencePaper;
}
/**
 * Gets the factor that determines how many artificial examples are
 * generated.
 *
 * @return the current artificial-size factor
 */
public double getArtificialSize() {
  final double factor = this.m_ArtSize;
  return factor;
}
/**
 * Stores the factor that determines how many artificial examples are
 * generated. The value is stored as given, without validation.
 *
 * @param newArtSize factor that determines number of artificial examples to generate
 */
public void setArtificialSize(double newArtSize) {
  this.m_ArtSize = newArtSize;
}
/**
 * Gets the target number of members for the committee.
 *
 * @return the desired size of the committee
 */
public int getDesiredSize() {
  final int size = this.m_DesiredSize;
  return size;
}
/**
 * Stores the target number of members for the committee. The value is
 * stored as given, without validation.
 *
 * @param newDesiredSize the desired size of the committee
 */
public void setDesiredSize(int newDesiredSize) {
  this.m_DesiredSize = newDesiredSize;
}
/**
 * Returns the capabilities of this classifier: those reported by the
 * superclass, restricted to nominal class attributes, with the minimum
 * number of training instances set to the desired ensemble size.
 *
 * @return the capabilities of this classifier
 */
public Capabilities getCapabilities() {
  final Capabilities caps = super.getCapabilities();

  // Class attribute: clear whatever the superclass allowed, then permit
  // nominal classes only.
  caps.disableAllClasses();
  caps.disableAllClassDependencies();
  caps.enable(Capability.NOMINAL_CLASS);

  // Instances: the lower bound is tied to the desired ensemble size.
  caps.setMinimumNumberInstances(this.m_DesiredSize);

  return caps;
}
/**
* Build Decorate classifier
*
* @param data the training data to be used for generating the classifier
* @throws Exception if the classifier could not be built successfully
*/
public void buildClassifier(Instances data) throws Exception {
if(m_Classifier == null) {
throw new Exception("A base classifier has not been specified!");
}
// can classifier handle the data?
getCapabilities().testWithFail(data);
// remove instances with missing class
data = new Instances(data);
data.deleteWithMissingClass();
//initialize random number generator
if(m_Seed==-1) m_Random = new Random();
else m_Random = new Random(m_Seed);
int i = 1;//current committee size
int numTrials = 1;//number of Decorate iterations
Instances divData = new Instances(data);//local copy of data - diversity data
Instances artData = null;//artificial data
//compute number of artficial instances to add at each iteration
int artSize = (int) (Math.abs(m_ArtSize)*divData.numInstances());
if(artSize==0) artSize=1;//atleast add one random example
computeStats(data);//Compute training data stats for creating artificial examples
//initialize new committee
m_Committee = new Vector();
Classifier newClassifier = m_Classifier;
newClassifier.buildClassifier(divData);
m_Committee.add(newClassifier);
double eComm = computeError(divData);//compute ensemble error
if(m_Debug) System.out.println("Initialize:\tClassifier "+i+" added to ensemble. Ensemble error = "+eComm);
//repeat till desired committee size is reached OR the max number of iterations is exceeded
while(i cdf[index]){
index++;
}
return index;
}
/**
 * Deletes instances from the end of the given set, working backwards from
 * the last index.
 *
 * @param data given instances
 * @param numRemove number of instances to delete from the given instances
 */
protected void removeInstances(Instances data, int numRemove){
  final int lastIndex = data.numInstances() - 1;
  // Exclusive lower bound: deletion stops once the cursor reaches it.
  final int stopIndex = lastIndex - numRemove;

  int cursor = lastIndex;
  while (cursor > stopIndex) {
    data.delete(cursor);
    cursor--;
  }
}
/**
* Add new instances to the given set of instances.
*
* @param data given instances
* @param newData set of instances to add to given instances
*/
protected void addInstances(Instances data, Instances newData){
for(int i=0; i