All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.classifiers.trees.HoeffdingTree Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other breaking updates.

There is a newer version: 3.8.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    HoeffdingTree.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.trees;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.UpdateableClassifier;
import weka.classifiers.trees.ht.ActiveHNode;
import weka.classifiers.trees.ht.GiniSplitMetric;
import weka.classifiers.trees.ht.HNode;
import weka.classifiers.trees.ht.InactiveHNode;
import weka.classifiers.trees.ht.InfoGainSplitMetric;
import weka.classifiers.trees.ht.LeafNode;
import weka.classifiers.trees.ht.LearningNode;
import weka.classifiers.trees.ht.NBNode;
import weka.classifiers.trees.ht.NBNodeAdaptive;
import weka.classifiers.trees.ht.SplitCandidate;
import weka.classifiers.trees.ht.SplitMetric;
import weka.classifiers.trees.ht.SplitNode;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;

/**
  
 * A Hoeffding tree (VFDT) is an incremental, anytime
 * decision tree induction algorithm that is capable of learning from massive
 * data streams, assuming that the distribution generating examples does not
 * change over time. Hoeffding trees exploit the fact that a small sample can
 * often be enough to choose an optimal splitting attribute. This idea is
 * supported mathematically by the Hoeffding bound, which quantifies the number
 * of observations (in our case, examples) needed to estimate some statistics
 * within a prescribed precision (in our case, the goodness of an attribute).
*
 * <p>
 * A theoretically appealing feature of Hoeffding Trees not shared by other
 * incremental decision tree learners is that it has sound guarantees of
 * performance. Using the Hoeffding bound one can show that its output is
 * asymptotically nearly identical to that of a non-incremental learner using
 * infinitely many examples. For more information see:
 * <p>
 * Geoff Hulten, Laurie Spencer, Pedro Domingos: Mining time-changing data
 * streams. In: ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining,
 * 97-106, 2001.

 * <p>
 * BibTeX:
 *
 * <pre>
 * &#64;inproceedings{Hulten2001,
 *    author = {Geoff Hulten and Laurie Spencer and Pedro Domingos},
 *    booktitle = {ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining},
 *    pages = {97-106},
 *    publisher = {ACM Press},
 *    title = {Mining time-changing data streams},
 *    year = {2001}
 * }
 * </pre>
 *

* * Valid options are: *

* *

 * -L
 *  The leaf prediction strategy to use. 0 = majority class, 1 = naive Bayes, 2 = naive Bayes adaptive.
 *  (default = 2)
 * 
* *
 * -S
 *  The splitting criterion to use. 0 = Gini, 1 = Info gain
 *  (default = 1)
 * 
* *
 * -E
 *  The allowable error in a split decision - values closer to zero will take longer to decide
 *  (default = 1e-7)
 * 
* *
 * -H
 *  Threshold below which a split will be forced to break ties
 *  (default = 0.05)
 * 
* *
 * -M
 *  Minimum fraction of weight required down at least two branches for info gain splitting
 *  (default = 0.01)
 * 
* *
 * -G
 *  Grace period - the number of instances a leaf should observe between split attempts
 *  (default = 200)
 * 
* *
 * -N
 *  The number of instances (weight) a leaf should observe before allowing naive Bayes to make predictions (NB or NB adaptive only)
 *  (default = 0)
 * 
* *
 * -P
 *  Print leaf models when using naive Bayes at the leaves.
 * 
* * * @author Richard Kirkby ([email protected]) * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 15233 $ */ public class HoeffdingTree extends AbstractClassifier implements UpdateableClassifier, WeightedInstancesHandler, OptionHandler, RevisionHandler, TechnicalInformationHandler, Drawable, Serializable { /** * For serialization */ private static final long serialVersionUID = 7117521775722396251L; protected Instances m_header; protected HNode m_root; /** The number of instances a leaf should observe between split attempts */ protected double m_gracePeriod = 200; /** * The allowable error in a split decision. Values closer to zero will take * longer to decide */ protected double m_splitConfidence = 0.0000001; /** Threshold below which a split will be forced to break ties */ protected double m_hoeffdingTieThreshold = 0.05; /** * The minimum fraction of weight required down at least two branches for info * gain splitting */ protected double m_minFracWeightForTwoBranchesGain = 0.01; /** The splitting metric to use */ protected int m_selectedSplitMetric = INFO_GAIN_SPLIT; protected SplitMetric m_splitMetric = new InfoGainSplitMetric( m_minFracWeightForTwoBranchesGain); /** The leaf prediction strategy to use */ protected int m_leafStrategy = LEAF_NB_ADAPTIVE; /** * The number of instances (total weight) a leaf should observe before * allowing naive Bayes to make predictions */ protected double m_nbThreshold = 0; protected int m_activeLeafCount; protected int m_inactiveLeafCount; protected int m_decisionNodeCount; public static final int GINI_SPLIT = 0; public static final int INFO_GAIN_SPLIT = 1; public static final Tag[] TAGS_SELECTION = { new Tag(GINI_SPLIT, "Gini split"), new Tag(INFO_GAIN_SPLIT, "Info gain split") }; public static final int LEAF_MAJ_CLASS = 0; public static final int LEAF_NB = 1; public static final int LEAF_NB_ADAPTIVE = 2; public static final Tag[] TAGS_SELECTION2 = { new Tag(LEAF_MAJ_CLASS, "Majority class"), new 
Tag(LEAF_NB, "Naive Bayes"),
    new Tag(LEAF_NB_ADAPTIVE, "Naive Bayes adaptive") };

  /**
   * Print out leaf models in the case of naive Bayes or naive Bayes adaptive
   * leaves
   */
  protected boolean m_printLeafModels;

  /**
   * Returns a string describing this classifier. The text ends with the
   * string form of the reference returned by
   * {@link #getTechnicalInformation()}.
   * 
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {
    // The original text ran "other" and "incremental" together; the words
    // are now separated in the displayed description.
    return "A Hoeffding tree (VFDT) is an incremental, anytime decision tree induction algorithm"
      + " that is capable of learning from massive data streams, assuming that the"
      + " distribution generating examples does not change over time. Hoeffding trees"
      + " exploit the fact that a small sample can often be enough to choose an optimal"
      + " splitting attribute. This idea is supported mathematically by the Hoeffding"
      + " bound, which quantifies the number of observations (in our case, examples)"
      + " needed to estimate some statistics within a prescribed precision (in our"
      + " case, the goodness of an attribute).\n\nA theoretically appealing feature "
      + " of Hoeffding Trees not shared by other incremental decision tree learners is that "
      + " it has sound guarantees of performance. Using the Hoeffding bound one can show that "
      + " its output is asymptotically nearly identical to that of a non-incremental learner "
      + " using infinitely many examples. For more information see: \n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing detailed
   * information about the technical background of this class, e.g., paper
   * reference or book this class is based on.
*
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result = new TechnicalInformation(Type.INPROCEEDINGS);

    result.setValue(Field.AUTHOR,
      "Geoff Hulten and Laurie Spencer and Pedro Domingos");
    result.setValue(Field.TITLE, "Mining time-changing data streams");
    result.setValue(Field.BOOKTITLE,
      "ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining");
    result.setValue(Field.YEAR, "2001");
    result.setValue(Field.PAGES, "97-106");
    result.setValue(Field.PUBLISHER, "ACM Press");

    return result;
  }

  /**
   * Discards the current tree: clears the root node and sets the active
   * leaf, inactive leaf and decision node counters back to zero.
   */
  protected void reset() {
    m_root = null;

    m_activeLeafCount = 0;
    m_inactiveLeafCount = 0;
    m_decisionNodeCount = 0;
  }

  /**
   * Returns default capabilities of the classifier. Starting from the
   * superclass capabilities with everything disabled, it enables nominal,
   * date and numeric attributes, missing values, a nominal class and missing
   * class values, and sets the minimum number of instances to zero.
   * 
   * @return the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    // (this end-of-line comment must stay on its own line, otherwise it
    // comments out the statements that follow it)
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    // instances
    result.setMinimumNumberInstances(0);

    return result;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy