weka.classifiers.trees.j48.C45ModelSelection Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    C45ModelSelection.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.trees.j48;

import java.util.Enumeration;

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 * Class for selecting a C4.5-type split for a given dataset.
 * 
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 15122 $
 */
public class C45ModelSelection extends ModelSelection {

  /** for serialization */
  private static final long serialVersionUID = 3372204862440821989L;

  /** Minimum number of objects in interval. */
  protected final int m_minNoObj;

  /** Use MDL correction? */
  protected final boolean m_useMDLcorrection;

  /** All the training data; set to null by {@link #cleanup()}. */
  protected Instances m_allData;

  /** Do not relocate split point to actual data value */
  protected final boolean m_doNotMakeSplitPointActualValue;

  /**
   * Initializes the split selection method with the given parameters.
   * 
   * @param minNoObj minimum number of instances that have to occur in at least
   *          two subsets induced by split
   * @param allData FULL training dataset (necessary for selection of split
   *          points).
   * @param useMDLcorrection whether to use MDL adjustment when finding splits
   *          on numeric attributes
   * @param doNotMakeSplitPointActualValue if true, split point is not relocated
   *          by scanning the entire dataset for the closest data value
   */
  public C45ModelSelection(int minNoObj, Instances allData,
    boolean useMDLcorrection, boolean doNotMakeSplitPointActualValue) {
    m_minNoObj = minNoObj;
    m_allData = allData;
    m_useMDLcorrection = useMDLcorrection;
    m_doNotMakeSplitPointActualValue = doNotMakeSplitPointActualValue;
  }

  /**
   * Sets reference to training data to null.
   */
  public void cleanup() {

    m_allData = null;
  }

  /**
   * Selects C4.5-type split for the given dataset.
   * 
   * A candidate split is built for every non-class attribute. Among the
   * candidates whose information gain is at least the average gain (minus a
   * tolerance of 1E-3), the one with the largest gain ratio is chosen.
   * 
   * @param data the instances to find a split for
   * @return the selected split; a {@link NoSplit} model if the data is too
   *         small to split, all weight belongs to one class, or no useful
   *         split exists; {@code null} if an exception occurred (its stack
   *         trace is printed)
   */
  @Override
  public ClassifierSplitModel selectModel(Instances data) {

    C45Split bestModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;

    try {

      // Check if all Instances belong to one class or if not
      // enough Instances to split.
      Distribution checkDistribution = new Distribution(data);
      NoSplit noSplitModel = new NoSplit(checkDistribution);
      if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj)
        || Utils.eq(checkDistribution.total(),
          checkDistribution.perClass(checkDistribution.maxClass()))) {
        return noSplitModel;
      }

      // Check if all attributes are nominal and have a
      // lot of values. multiVal stays true only if no attribute is numeric
      // or has fewer values than 30% of the full training set size.
      if (m_allData != null) {
        Enumeration<Attribute> enu = data.enumerateAttributes();
        while (enu.hasMoreElements()) {
          Attribute attribute = enu.nextElement();
          if ((attribute.isNumeric())
            || (Utils.sm(attribute.numValues(),
              (0.3 * m_allData.numInstances())))) {
            multiVal = false;
            break;
          }
        }
      }

      C45Split[] currentModel = new C45Split[data.numAttributes()];
      double sumOfWeights = data.sumOfWeights();

      // For each attribute.
      for (int i = 0; i < data.numAttributes(); i++) {

        // Apart from class attribute (its slot stays null).
        if (i == data.classIndex()) {
          currentModel[i] = null;
          continue;
        }

        // Get models for current attribute.
        currentModel[i] = new C45Split(i, m_minNoObj, sumOfWeights,
          m_useMDLcorrection);
        currentModel[i].buildClassifier(data);

        // Check if useful split for current attribute
        // exists and check for enumerated attributes with
        // a lot of values. Without the full training data every useful
        // split counts towards the average.
        if (currentModel[i].checkModel()
          && (m_allData == null
            || data.attribute(i).isNumeric()
            || multiVal
            || Utils.sm(data.attribute(i).numValues(),
              (0.3 * m_allData.numInstances())))) {
          averageInfoGain = averageInfoGain + currentModel[i].infoGain();
          validModels++;
        }
      }

      // Check if any useful split was found.
      if (validModels == 0) {
        return noSplitModel;
      }
      averageInfoGain = averageInfoGain / validModels;

      // Find "best" attribute to split on.
      double minResult = 0;
      for (int i = 0; i < data.numAttributes(); i++) {
        if ((i != data.classIndex()) && (currentModel[i].checkModel())) {
          // Use 1E-3 here to get a closer approximation to the original
          // implementation.
          if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
            && Utils.gr(currentModel[i].gainRatio(), minResult)) {
            bestModel = currentModel[i];
            minResult = currentModel[i].gainRatio();
          }
        }
      }

      // Check if useful split was found. A non-zero minResult implies that
      // bestModel has been assigned in the loop above.
      if (Utils.eq(minResult, 0)) {
        return noSplitModel;
      }

      // Add all Instances with unknown values for the corresponding
      // attribute to the distribution for the model, so that
      // the complete distribution is stored with the model.
      bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

      // Set the split point analogue to C45 if attribute numeric.
      if ((m_allData != null) && (!m_doNotMakeSplitPointActualValue)) {
        bestModel.setSplitPoint(m_allData);
      }
      return bestModel;
    } catch (Exception e) {
      // Existing contract: report the failure and signal it with null.
      e.printStackTrace();
    }
    return null;
  }

  /**
   * Selects C4.5-type split for the given dataset.
   * 
   * @param train the instances to find a split for
   * @param test ignored; selection is based on the training data only
   * @return the result of {@link #selectModel(Instances)} on {@code train}
   */
  @Override
  public final ClassifierSplitModel selectModel(Instances train, Instances test) {

    return selectModel(train);
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 15122 $");
  }
}