All downloads are free. The search and download functionality uses the official Maven repository.

weka.classifiers.trees.J48 Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

The newest version!
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    J48.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.trees;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Sourcable;
import weka.classifiers.trees.j48.BinC45ModelSelection;
import weka.classifiers.trees.j48.C45ModelSelection;
import weka.classifiers.trees.j48.C45PruneableClassifierTree;
import weka.classifiers.trees.j48.ClassifierTree;
import weka.classifiers.trees.j48.ModelSelection;
import weka.classifiers.trees.j48.PruneableClassifierTree;
import weka.core.AdditionalMeasureProducer;
import weka.core.Capabilities;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Matchable;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.PartitionGenerator;
import weka.core.RevisionUtils;
import weka.core.Summarizable;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;

/**
 *  Class for generating a pruned or unpruned C4.5
 * decision tree. For more information, see
*
* Ross Quinlan (1993). C4.5: Programs for Machine Learning. Morgan Kaufmann * Publishers, San Mateo, CA. *

* * * BibTeX: * *

 * @book{Quinlan1993,
 *    address = {San Mateo, CA},
 *    author = {Ross Quinlan},
 *    publisher = {Morgan Kaufmann Publishers},
 *    title = {C4.5: Programs for Machine Learning},
 *    year = {1993}
 * }
 * 
*

* * * Valid options are: *

* *

 * -U
 *  Use unpruned tree.
 * 
* *
 * -O
 *  Do not collapse tree.
 * 
* *
 * -C <pruning confidence>
 *  Set confidence threshold for pruning.
 *  (default 0.25)
 * 
* *
 * -M <minimum number of instances>
 *  Set minimum number of instances per leaf.
 *  (default 2)
 * 
* *
 * -R
 *  Use reduced error pruning.
 * 
* *
 * -N <number of folds>
 *  Set number of folds for reduced error
 *  pruning. One fold is used as pruning set.
 *  (default 3)
 * 
* *
 * -B
 *  Use binary splits only.
 * 
* *
 * -S
 *  Don't perform subtree raising.
 * 
* *
 * -L
 *  Do not clean up after the tree has been built.
 * 
* *
 * -A
 *  Laplace smoothing for predicted probabilities.
 * 
* *
 * -J
 *  Do not use MDL correction for info gain on numeric attributes.
 * 
* *
 * -Q <seed>
 *  Seed for random data shuffling (default 1).
 * 
* *
 * -doNotMakeSplitPointActualValue
 *  Do not make split point actual value.
 * 
* * * * @author Eibe Frank ([email protected]) * @version $Revision: 15519 $ */ public class J48 extends AbstractClassifier implements OptionHandler, Drawable, Matchable, Sourcable, WeightedInstancesHandler, Summarizable, AdditionalMeasureProducer, TechnicalInformationHandler, PartitionGenerator { /** for serialization */ static final long serialVersionUID = -217733168393644444L; /** The decision tree */ protected ClassifierTree m_root; /** Unpruned tree? */ protected boolean m_unpruned = false; /** Collapse tree? */ protected boolean m_collapseTree = true; /** Confidence level */ protected float m_CF = 0.25f; /** Minimum number of instances */ protected int m_minNumObj = 2; /** Use MDL correction? */ protected boolean m_useMDLcorrection = true; /** * Determines whether probabilities are smoothed using Laplace correction when * predictions are generated */ protected boolean m_useLaplace = false; /** Use reduced error pruning? */ protected boolean m_reducedErrorPruning = false; /** Number of folds for reduced error pruning. */ protected int m_numFolds = 3; /** Binary splits on nominal attributes? */ protected boolean m_binarySplits = false; /** Subtree raising to be performed? */ protected boolean m_subtreeRaising = true; /** Cleanup after the tree has been built. */ protected boolean m_noCleanup = false; /** Random number seed for reduced-error pruning. */ protected int m_Seed = 1; /** Do not relocate split point to actual data value */ protected boolean m_doNotMakeSplitPointActualValue; /** * Returns a string describing classifier * * @return a description suitable for displaying in the explorer/experimenter * gui */ public String globalInfo() { return "Class for generating a pruned or unpruned C4.5 decision tree. 
For more " + "information, see\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.BOOK); result.setValue(Field.AUTHOR, "Ross Quinlan"); result.setValue(Field.YEAR, "1993"); result.setValue(Field.TITLE, "C4.5: Programs for Machine Learning"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann Publishers"); result.setValue(Field.ADDRESS, "San Mateo, CA"); return result; } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result; result = new Capabilities(this); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(0); return result; } /** * Generates the classifier. 
* * @param instances the data to train the classifier with * @throws Exception if classifier can't be built successfully */ @Override public void buildClassifier(Instances instances) throws Exception { if ((m_unpruned) && (!m_subtreeRaising)) { throw new Exception("Subtree raising does not need to be unset for unpruned trees!"); } if ((m_unpruned) && (m_reducedErrorPruning)) { throw new Exception("Unpruned tree and reduced error pruning cannot be selected simultaneously!"); } if ((m_unpruned) && (m_CF != 0.25f)) { throw new Exception("It does not make sense to change the confidence for an unpruned tree!"); } if ((m_reducedErrorPruning) && (m_CF != 0.25f)) { throw new Exception("Changing the confidence does not make sense for reduced error pruning."); } if ((!m_reducedErrorPruning) && (m_numFolds != 3)) { throw new Exception("Changing the number of folds does not make sense if" + " reduced error pruning is not selected."); } if ((!m_reducedErrorPruning) && (m_Seed != 1)) { throw new Exception("Changing the seed does not make sense if" + " reduced error pruning is not selected."); } if ((m_CF <= 0) || (m_CF >= 1)) { throw new Exception("Confidence has to be greater than zero and smaller than one!"); } getCapabilities().testWithFail(instances); ModelSelection modSelection; if (m_binarySplits) { modSelection = new BinC45ModelSelection(m_minNumObj, instances, m_useMDLcorrection, m_doNotMakeSplitPointActualValue); } else { modSelection = new C45ModelSelection(m_minNumObj, instances, m_useMDLcorrection, m_doNotMakeSplitPointActualValue); } if (!m_reducedErrorPruning) { m_root = new C45PruneableClassifierTree(modSelection, !m_unpruned, m_CF, m_subtreeRaising, !m_noCleanup, m_collapseTree); } else { m_root = new PruneableClassifierTree(modSelection, !m_unpruned, m_numFolds, !m_noCleanup, m_Seed); } m_root.buildClassifier(instances); if (m_binarySplits) { ((BinC45ModelSelection) modSelection).cleanup(); } else { ((C45ModelSelection) modSelection).cleanup(); } } /** * 
Classifies an instance. * * @param instance the instance to classify * @return the classification for the instance * @throws Exception if instance can't be classified successfully */ @Override public double classifyInstance(Instance instance) throws Exception { return m_root.classifyInstance(instance); } /** * Returns class probabilities for an instance. * * @param instance the instance to calculate the class probabilities for * @return the class probabilities * @throws Exception if distribution can't be computed successfully */ @Override public final double[] distributionForInstance(Instance instance) throws Exception { return m_root.distributionForInstance(instance, m_useLaplace); } /** * Returns the type of graph this classifier represents. * * @return Drawable.TREE */ @Override public int graphType() { return Drawable.TREE; } /** * Returns graph describing the tree. * * @return the graph describing the tree * @throws Exception if graph can't be computed */ @Override public String graph() throws Exception { return m_root.graph(); } /** * Returns tree in prefix order. * * @return the tree in prefix order * @throws Exception if something goes wrong */ @Override public String prefix() throws Exception { return m_root.prefix(); } /** * Returns tree as an if-then statement. * * @param className the name of the Java class * @return the tree as a Java if-then type statement * @throws Exception if something goes wrong */ @Override public String toSource(String className) throws Exception { StringBuffer[] source = m_root.toSource(className); return "class " + className + " {\n\n" + " public static double classify(Object[] i)\n" + " throws Exception {\n\n" + " double p = Double.NaN;\n" + source[0] // Assignment code + " return p;\n" + " }\n" + source[1] // Support code + "}\n"; } /** * Returns an enumeration describing the available options. * * Valid options are: *

* * -U
* Use unpruned tree. *

* * -C confidence
* Set confidence threshold for pruning. (Default: 0.25) *

* * -M number
* Set minimum number of instances per leaf. (Default: 2) *

* * -R
* Use reduced error pruning. No subtree raising is performed. *

* * -N number
* Set number of folds for reduced error pruning. One fold is used as the * pruning set. (Default: 3) *

* * -B
* Use binary splits for nominal attributes. *

* * -S
* Don't perform subtree raising. *

* * -L
* Do not clean up after the tree has been built. * * -A
* If set, Laplace smoothing is used for predicted probabilites. *

* * -Q
* The seed for reduced-error pruning. *

* * @return an enumeration of all the available options. */ @Override public Enumeration

* *

   * -U
   *  Use unpruned tree.
   * 
* *
   * -O
   *  Do not collapse tree.
   * 
* *
   * -C <pruning confidence>
   *  Set confidence threshold for pruning.
   *  (default 0.25)
   * 
* *
   * -M <minimum number of instances>
   *  Set minimum number of instances per leaf.
   *  (default 2)
   * 
* *
   * -R
   *  Use reduced error pruning.
   * 
* *
   * -N <number of folds>
   *  Set number of folds for reduced error
   *  pruning. One fold is used as pruning set.
   *  (default 3)
   * 
* *
   * -B
   *  Use binary splits only.
   * 
* *
   * -S
   *  Don't perform subtree raising.
   * 
* *
   * -L
   *  Do not clean up after the tree has been built.
   * 
* *
   * -A
   *  Laplace smoothing for predicted probabilities.
   * 
* *
   * -J
   *  Do not use MDL correction for info gain on numeric attributes.
   * 
* *
   * -Q <seed>
   *  Seed for random data shuffling (default 1).
   * 
* *
   * -doNotMakeSplitPointActualValue
   *  Do not make split point actual value.
   * 
* * * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { // Other options String minNumString = Utils.getOption('M', options); if (minNumString.length() != 0) { m_minNumObj = Integer.parseInt(minNumString); } else { m_minNumObj = 2; } m_binarySplits = Utils.getFlag('B', options); m_useLaplace = Utils.getFlag('A', options); m_useMDLcorrection = !Utils.getFlag('J', options); // Pruning options m_unpruned = Utils.getFlag('U', options); m_collapseTree = !Utils.getFlag('O', options); m_subtreeRaising = !Utils.getFlag('S', options); m_noCleanup = Utils.getFlag('L', options); m_doNotMakeSplitPointActualValue = Utils.getFlag("doNotMakeSplitPointActualValue", options); m_reducedErrorPruning = Utils.getFlag('R', options); String confidenceString = Utils.getOption('C', options); if (confidenceString.length() != 0) { setConfidenceFactor((new Float(confidenceString)).floatValue()); } else { m_CF = 0.25f; } String numFoldsString = Utils.getOption('N', options); if (numFoldsString.length() != 0) { m_numFolds = Integer.parseInt(numFoldsString); } else { m_numFolds = 3; } String seedString = Utils.getOption('Q', options); if (seedString.length() != 0) { m_Seed = Integer.parseInt(seedString); } else { m_Seed = 1; } super.setOptions(options); } /** * Gets the current settings of the Classifier. 
* * @return an array of strings suitable for passing to setOptions */ @Override public String[] getOptions() { Vector options = new Vector(); // Issue some warnings for the current configuration if necessary if (m_unpruned) { if (!m_subtreeRaising) { System.err.println("WARNING: Subtree raising does not need to be unset for an unpruned tree!"); } if (m_reducedErrorPruning) { System.err.println("WARNING: Unpruned tree and reduced error pruning cannot be selected simultaneously!"); } } if (m_unpruned || m_reducedErrorPruning) { if (m_CF != 0.25f) { System.err.println("WARNING: Changing the confidence will only affect error-based pruning!"); } } if (m_unpruned || !m_reducedErrorPruning) { if (m_Seed != 1) { System.err.println("WARNING: Changing the seed only makes sense when using reduced error pruning"); } if (m_numFolds != 3) { System.err.println("WARNING: Changing the number of folds does not make sense if " + "reduced error pruning is not selected."); } } if (m_noCleanup) { options.add("-L"); } if (!m_collapseTree) { options.add("-O"); } if (m_unpruned) { options.add("-U"); } if (!m_subtreeRaising) { options.add("-S"); } if (m_reducedErrorPruning) { options.add("-R"); } if (m_binarySplits) { options.add("-B"); } if (m_useLaplace) { options.add("-A"); } if (!m_useMDLcorrection) { options.add("-J"); } if (m_doNotMakeSplitPointActualValue) { options.add("-doNotMakeSplitPointActualValue"); } if (m_reducedErrorPruning) { options.add("-N"); options.add("" + m_numFolds); options.add("-Q"); options.add("" + m_Seed); } else if (!m_unpruned) { options.add("-C"); options.add("" + m_CF); } options.add("-M"); options.add("" + m_minNumObj); Collections.addAll(options, super.getOptions()); return options.toArray(new String[0]); } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String seedTipText() { return "The seed used for randomizing the data " + "when reduced-error 
pruning is used."; } /** * Get the value of Seed. * * @return Value of Seed. */ public int getSeed() { return m_Seed; } /** * Set the value of Seed. * * @param newSeed Value to assign to Seed. */ public void setSeed(int newSeed) { m_Seed = newSeed; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useLaplaceTipText() { return "Whether counts at leaves are smoothed based on Laplace."; } /** * Get the value of useLaplace. * * @return Value of useLaplace. */ public boolean getUseLaplace() { return m_useLaplace; } /** * Set the value of useLaplace. * * @param newuseLaplace Value to assign to useLaplace. */ public void setUseLaplace(boolean newuseLaplace) { m_useLaplace = newuseLaplace; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useMDLcorrectionTipText() { return "Whether MDL correction is used when finding splits on numeric attributes."; } /** * Get the value of useMDLcorrection. * * @return Value of useMDLcorrection. */ public boolean getUseMDLcorrection() { return m_useMDLcorrection; } /** * Set the value of useMDLcorrection. * * @param newuseMDLcorrection Value to assign to useMDLcorrection. */ public void setUseMDLcorrection(boolean newuseMDLcorrection) { m_useMDLcorrection = newuseMDLcorrection; } /** * Returns a description of the classifier. 
* * @return a description of the classifier */ @Override public String toString() { if (m_root == null) { return "No classifier built"; } if (m_unpruned) { return "J48 unpruned tree\n------------------\n" + m_root.toString(); } else { return "J48 pruned tree\n------------------\n" + m_root.toString(); } } /** * Returns a superconcise version of the model * * @return a summary of the model */ @Override public String toSummaryString() { return "Number of leaves: " + m_root.numLeaves() + "\n" + "Size of the tree: " + m_root.numNodes() + "\n"; } /** * Returns the size of the tree * * @return the size of the tree */ public double measureTreeSize() { return m_root.numNodes(); } /** * Returns the number of leaves * * @return the number of leaves */ public double measureNumLeaves() { return m_root.numLeaves(); } /** * Returns the number of rules (same as number of leaves) * * @return the number of rules */ public double measureNumRules() { return m_root.numLeaves(); } /** * Returns an enumeration of the additional measure names * * @return an enumeration of the measure names */ @Override public Enumeration enumerateMeasures() { Vector newVector = new Vector(3); newVector.addElement("measureTreeSize"); newVector.addElement("measureNumLeaves"); newVector.addElement("measureNumRules"); return newVector.elements(); } /** * Returns the value of the named measure * * @param additionalMeasureName the name of the measure to query for its value * @return the value of the named measure * @throws IllegalArgumentException if the named measure is not supported */ @Override public double getMeasure(String additionalMeasureName) { if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) { return measureNumRules(); } else if (additionalMeasureName.compareToIgnoreCase("measureTreeSize") == 0) { return measureTreeSize(); } else if (additionalMeasureName.compareToIgnoreCase("measureNumLeaves") == 0) { return measureNumLeaves(); } else { throw new 
IllegalArgumentException(additionalMeasureName + " not supported (j48)"); } } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String unprunedTipText() { return "Whether pruning is performed."; } /** * Get the value of unpruned. * * @return Value of unpruned. */ public boolean getUnpruned() { return m_unpruned; } /** * Set the value of unpruned. Turns reduced-error pruning off if set. * * @param v Value to assign to unpruned. */ public void setUnpruned(boolean v) { m_unpruned = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String collapseTreeTipText() { return "Whether parts are removed that do not reduce training error."; } /** * Get the value of collapseTree. * * @return Value of collapseTree. */ public boolean getCollapseTree() { return m_collapseTree; } /** * Set the value of collapseTree. * * @param v Value to assign to collapseTree. */ public void setCollapseTree(boolean v) { m_collapseTree = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String confidenceFactorTipText() { return "The confidence factor used for pruning (smaller values incur " + "more pruning)."; } /** * Get the value of CF. * * @return Value of CF. */ public float getConfidenceFactor() { return m_CF; } /** * Set the value of CF. * * @param v Value to assign to CF. */ public void setConfidenceFactor(float v) { m_CF = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minNumObjTipText() { return "The minimum number of instances per leaf."; } /** * Get the value of minNumObj. * * @return Value of minNumObj. 
*/ public int getMinNumObj() { return m_minNumObj; } /** * Set the value of minNumObj. * * @param v Value to assign to minNumObj. */ public void setMinNumObj(int v) { m_minNumObj = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String reducedErrorPruningTipText() { return "Whether reduced-error pruning is used instead of C.4.5 pruning."; } /** * Get the value of reducedErrorPruning. * * @return Value of reducedErrorPruning. */ public boolean getReducedErrorPruning() { return m_reducedErrorPruning; } /** * Set the value of reducedErrorPruning. Turns unpruned trees off if set. * * @param v Value to assign to reducedErrorPruning. */ public void setReducedErrorPruning(boolean v) { m_reducedErrorPruning = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String numFoldsTipText() { return "Determines the amount of data used for reduced-error pruning. " + " One fold is used for pruning, the rest for growing the tree."; } /** * Get the value of numFolds. * * @return Value of numFolds. */ public int getNumFolds() { return m_numFolds; } /** * Set the value of numFolds. * * @param v Value to assign to numFolds. */ public void setNumFolds(int v) { m_numFolds = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String binarySplitsTipText() { return "Whether to use binary splits on nominal attributes when " + "building the trees."; } /** * Get the value of binarySplits. * * @return Value of binarySplits. */ public boolean getBinarySplits() { return m_binarySplits; } /** * Set the value of binarySplits. * * @param v Value to assign to binarySplits. 
*/ public void setBinarySplits(boolean v) { m_binarySplits = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String subtreeRaisingTipText() { return "Whether to consider the subtree raising operation when pruning."; } /** * Get the value of subtreeRaising. * * @return Value of subtreeRaising. */ public boolean getSubtreeRaising() { return m_subtreeRaising; } /** * Set the value of subtreeRaising. * * @param v Value to assign to subtreeRaising. */ public void setSubtreeRaising(boolean v) { m_subtreeRaising = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String saveInstanceDataTipText() { return "Whether to save the training data for visualization."; } /** * Check whether instance data is to be saved. * * @return true if instance data is saved */ public boolean getSaveInstanceData() { return m_noCleanup; } /** * Set whether instance data is to be saved. * * @param v true if instance data is to be saved */ public void setSaveInstanceData(boolean v) { m_noCleanup = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String doNotMakeSplitPointActualValueTipText() { return "If true, the split point is not relocated to an actual data value." + " This can yield substantial speed-ups for large datasets with numeric attributes."; } /** * Gets the value of doNotMakeSplitPointActualValue. * * @return the value */ public boolean getDoNotMakeSplitPointActualValue() { return m_doNotMakeSplitPointActualValue; } /** * Sets the value of doNotMakeSplitPointActualValue. 
* * @param m_doNotMakeSplitPointActualValue the value to set */ public void setDoNotMakeSplitPointActualValue( boolean m_doNotMakeSplitPointActualValue) { this.m_doNotMakeSplitPointActualValue = m_doNotMakeSplitPointActualValue; } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 15519 $"); } /** * Builds the classifier to generate a partition. */ @Override public void generatePartition(Instances data) throws Exception { buildClassifier(data); } /** * Computes an array that indicates node membership. */ @Override public double[] getMembershipValues(Instance inst) throws Exception { return m_root.getMembershipValues(inst); } /** * Returns the number of elements in the partition. */ @Override public int numElements() throws Exception { return m_root.numNodes(); } /** * Main method for testing this class * * @param argv the commandline options */ public static void main(String[] argv) { runClassifier(new J48(), argv); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy