weka.classifiers.trees.HoeffdingTree Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    HoeffdingTree.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.trees;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.UpdateableClassifier;
import weka.classifiers.trees.ht.ActiveHNode;
import weka.classifiers.trees.ht.GiniSplitMetric;
import weka.classifiers.trees.ht.HNode;
import weka.classifiers.trees.ht.InactiveHNode;
import weka.classifiers.trees.ht.InfoGainSplitMetric;
import weka.classifiers.trees.ht.LeafNode;
import weka.classifiers.trees.ht.LearningNode;
import weka.classifiers.trees.ht.NBNode;
import weka.classifiers.trees.ht.NBNodeAdaptive;
import weka.classifiers.trees.ht.SplitCandidate;
import weka.classifiers.trees.ht.SplitMetric;
import weka.classifiers.trees.ht.SplitNode;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;

/**
  
 * A Hoeffding tree (VFDT) is an incremental, anytime
 * decision tree induction algorithm that is capable of learning from massive
 * data streams, assuming that the distribution generating examples does not
 * change over time. Hoeffding trees exploit the fact that a small sample can
 * often be enough to choose an optimal splitting attribute. This idea is
 * supported mathematically by the Hoeffding bound, which quantifies the number
 * of observations (in our case, examples) needed to estimate some statistics
 * within a prescribed precision (in our case, the goodness of an attribute).

 * 

 * A theoretically appealing feature of Hoeffding Trees not shared by
 * other incremental decision tree learners is that it has sound guarantees of
 * performance. Using the Hoeffding bound one can show that its output is
 * asymptotically nearly identical to that of a non-incremental learner using
 * infinitely many examples. For more information see: 

 * 

 * Geoff Hulten, Laurie Spencer, Pedro Domingos: Mining time-changing data
 * streams. In: ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining,
 * 97-106, 2001.
 * 
 
 * 
  
 * BibTeX:
 * 
 * 
 * @inproceedings{Hulten2001,
 *    author = {Geoff Hulten and Laurie Spencer and Pedro Domingos},
 *    booktitle = {ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining},
 *    pages = {97-106},
 *    publisher = {ACM Press},
 *    title = {Mining time-changing data streams},
 *    year = {2001}
 * }
 * 
 * 
 
 * 
  
 * Valid options are:
 * 

 * 
 * 
 * -L
 *  The leaf prediction strategy to use. 0 = majority class, 1 = naive Bayes, 2 = naive Bayes adaptive.
 *  (default = 2)
 * 
 * 
 *  * -S
 *  The splitting criterion to use. 0 = Gini, 1 = Info gain
 *  (default = 1)
 * 
 * 
 *  * -E
 *  The allowable error in a split decision - values closer to zero will take longer to decide
 *  (default = 1e-7)
 * 
 * 
 *  * -H
 *  Threshold below which a split will be forced to break ties
 *  (default = 0.05)
 * 
 * 
 *  * -M
 *  Minimum fraction of weight required down at least two branches for info gain splitting
 *  (default = 0.01)
 * 
 * 
 *  * -G
 *  Grace period - the number of instances a leaf should observe between split attempts
 *  (default = 200)
 * 
 * 
 *  * -N
 *  The number of instances (weight) a leaf should observe before allowing naive Bayes to make predictions (NB or NB adaptive only)
 *  (default = 0)
 * 
 * 
 *  * -P
 *  Print leaf models when using naive Bayes at the leaves.
 * 
 * 
 
 * 
 * @author Richard Kirkby ([email protected])
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 15519 $
 */
public class HoeffdingTree extends AbstractClassifier implements
    UpdateableClassifier, WeightedInstancesHandler, OptionHandler,
    RevisionHandler, TechnicalInformationHandler, Drawable, Serializable {

  /**
   * For serialization
   */
  private static final long serialVersionUID = 7117521775722396251L;

  /**
   * NOTE(review): presumably the header (structure only) of the data the tree
   * is trained on - confirm against the training code.
   */
  protected Instances m_header;

  /** The root node of the tree; set to null by reset() */
  protected HNode m_root;

  /** The number of instances a leaf should observe between split attempts */
  protected double m_gracePeriod = 200;

  /**
   * The allowable error in a split decision. Values closer to zero will take
   * longer to decide
   */
  protected double m_splitConfidence = 0.0000001;

  /** Threshold below which a split will be forced to break ties */
  protected double m_hoeffdingTieThreshold = 0.05;

  /**
   * The minimum fraction of weight required down at least two branches for info
   * gain splitting
   */
  protected double m_minFracWeightForTwoBranchesGain = 0.01;

  /**
   * The splitting metric to use, as one of the GINI_SPLIT / INFO_GAIN_SPLIT
   * constants. Defaults to INFO_GAIN_SPLIT.
   */
  protected int m_selectedSplitMetric = INFO_GAIN_SPLIT;

  /**
   * The split metric object. Initialised to an InfoGainSplitMetric that is
   * constructed with the initial value of m_minFracWeightForTwoBranchesGain.
   */
  protected SplitMetric m_splitMetric = new InfoGainSplitMetric(
      m_minFracWeightForTwoBranchesGain);

  /**
   * The leaf prediction strategy to use, as one of the LEAF_* constants.
   * Defaults to LEAF_NB_ADAPTIVE.
   */
  protected int m_leafStrategy = LEAF_NB_ADAPTIVE;

  /**
   * The number of instances (total weight) a leaf should observe before
   * allowing naive Bayes to make predictions
   */
  protected double m_nbThreshold = 0;

  // Node counters; all three are set back to zero by reset().
  // NOTE(review): going by the names these count active leaves, inactive
  // leaves and decision (split) nodes - confirm where they are updated.
  protected int m_activeLeafCount;
  protected int m_inactiveLeafCount;
  protected int m_decisionNodeCount;

  /** Identifier of the Gini splitting criterion */
  public static final int GINI_SPLIT = 0;
  /** Identifier of the info gain splitting criterion */
  public static final int INFO_GAIN_SPLIT = 1;

  /** Tags pairing each splitting criterion identifier with a readable name */
  public static final Tag[] TAGS_SELECTION = {
      new Tag(GINI_SPLIT, "Gini split"),
      new Tag(INFO_GAIN_SPLIT, "Info gain split") };

  /** Identifier of the majority class leaf prediction strategy */
  public static final int LEAF_MAJ_CLASS = 0;
  /** Identifier of the naive Bayes leaf prediction strategy */
  public static final int LEAF_NB = 1;
  /** Identifier of the adaptive naive Bayes leaf prediction strategy */
  public static final int LEAF_NB_ADAPTIVE = 2;

  /**
   * Tags pairing each leaf prediction strategy identifier with a readable name
   */
  public static final Tag[] TAGS_SELECTION2 = {
      new Tag(LEAF_MAJ_CLASS, "Majority class"),
      new Tag(LEAF_NB, "Naive Bayes"),
      new Tag(LEAF_NB_ADAPTIVE, "Naive Bayes adaptive") };

  /**
   * Print out leaf models in the case of naive Bayes or naive Bayes adaptive
   * leaves
   */
  protected boolean m_printLeafModels;

  /**
   * Returns a string describing this classifier. The text is a fixed
   * description of the Hoeffding tree algorithm followed by the string form of
   * {@link #getTechnicalInformation()}.
   * 
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {
    // Each fragment starts with exactly one space and ends without one, so
    // the concatenated text has single spaces between words.
    return "A Hoeffding tree (VFDT) is an incremental, anytime decision tree induction algorithm"
        + " that is capable of learning from massive data streams, assuming that the"
        + " distribution generating examples does not change over time. Hoeffding trees"
        + " exploit the fact that a small sample can often be enough to choose an optimal"
        + " splitting attribute. This idea is supported mathematically by the Hoeffding"
        + " bound, which quantifies the number of observations (in our case, examples)"
        + " needed to estimate some statistics within a prescribed precision (in our"
        + " case, the goodness of an attribute).\n\nA theoretically appealing feature"
        + " of Hoeffding Trees not shared by other incremental decision tree learners is that"
        + " it has sound guarantees of performance. Using the Hoeffding bound one can show that"
        + " its output is asymptotically nearly identical to that of a non-incremental learner"
        + " using infinitely many examples. For more information see: \n\n"
        + getTechnicalInformation().toString();
  }

  /**
   * Builds the bibliographic reference for the paper this classifier is based
   * on: an INPROCEEDINGS entry carrying author, title, book title, year, pages
   * and publisher.
   * 
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    final TechnicalInformation paper =
        new TechnicalInformation(Type.INPROCEEDINGS);

    paper.setValue(Field.AUTHOR,
        "Geoff Hulten and Laurie Spencer and Pedro Domingos");
    paper.setValue(Field.TITLE, "Mining time-changing data streams");
    paper.setValue(Field.BOOKTITLE,
        "ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining");
    paper.setValue(Field.YEAR, "2001");
    paper.setValue(Field.PAGES, "97-106");
    paper.setValue(Field.PUBLISHER, "ACM Press");

    return paper;
  }

  /**
   * Clears the learned state: zeroes the three node counters and drops the
   * reference to the root node.
   */
  protected void reset() {
    m_activeLeafCount = m_inactiveLeafCount = m_decisionNodeCount = 0;
    m_root = null;
  }

  /**
   * Returns the capabilities of this classifier. Starting from the superclass
   * capabilities with everything disabled, it enables nominal, date and
   * numeric attributes, missing attribute values, a nominal class and missing
   * class values, and sets the minimum number of instances to zero.
   * 
   * @return the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    final Capabilities caps = super.getCapabilities();
    caps.disableAll();

    // Enabled in a fixed order: attribute capabilities first, then the class
    // capabilities.
    final Capability[] supported = {
        Capability.NOMINAL_ATTRIBUTES,
        Capability.DATE_ATTRIBUTES,
        Capability.NUMERIC_ATTRIBUTES,
        Capability.MISSING_VALUES,
        Capability.NOMINAL_CLASS,
        Capability.MISSING_CLASS_VALUES };
    for (Capability capability : supported) {
      caps.enable(capability);
    }

    caps.setMinimumNumberInstances(0);

    return caps;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration