weka.attributeSelection.ClassifierSubsetEval Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ClassifierSubsetEval.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import java.io.File;
import java.util.BitSet;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.evaluation.AbstractEvaluationMetric;
import weka.classifiers.evaluation.InformationRetrievalEvaluationMetric;
import weka.classifiers.rules.ZeroR;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Classifier subset evaluator:<br>
 * <br>
 * Evaluates attribute subsets on training data or a separate hold out testing
 * set. Uses a classifier to estimate the 'merit' of a set of attributes.
 * <p>
 *
 * Valid options are:
 * <p>
 *
 * <pre>
 *  -B &lt;classifier&gt;
 *  class name of the classifier to use for accuracy estimation.
 *  Place any classifier options LAST on the command line
 *  following a "--". eg.:
 *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
 *  (default: weka.classifiers.rules.ZeroR)
 * </pre>
 *
 * <pre>
 *  -T
 *  Use the training data to estimate accuracy.
 * </pre>
 *
 * <pre>
 *  -H &lt;filename&gt;
 *  Name of the hold out/test set to
 *  estimate accuracy on.
 * </pre>
 *
 * <pre>
 *  -percentage-split
 *  Perform a percentage split on the training data.
 *  Use in conjunction with -T.
 * </pre>
 *
 * <pre>
 *  -P
 *  Split percentage to use (default = 90).
 * </pre>
 *
 * <pre>
 *  -S
 *  Random seed for percentage split (default = 1).
 * </pre>
 *
 * <pre>
 *  -E &lt;DEFAULT|ACC|RMSE|MAE|F-MEAS|AUC|AUPRC|CORR-COEFF&gt;
 *  Performance evaluation measure to use for selecting attributes.
 *  (Default = default: accuracy for discrete class and rmse for numeric class)
 * </pre>
 *
 * <pre>
 *  -IRclass &lt;label | index&gt;
 *  Optional class value (label or 1-based index) to use in conjunction with
 *  IR statistics (f-meas, auc or auprc). Omitting this option will use
 *  the class-weighted average.
 * </pre>
 *
 * <pre>
 * Options specific to scheme weka.classifiers.rules.ZeroR:
 * </pre>
 *
 * <pre>
 *  -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 *
 * <pre>
 *  -do-not-check-capabilities
 *  If set, classifier capabilities are not checked before classifier is built
 *  (use with caution).
 * </pre>
 *
 * <pre>
 *  -num-decimal-places
 *  The number of decimal places for the output of numbers in the model (default 2).
 * </pre>
 *
 * <pre>
 *  -batch-size
 *  The desired batch size for batch prediction  (default 100).
 * </pre>
 *
 * @author Mark Hall ([email protected])
 * @version $Revision: 10332 $
 */
public class ClassifierSubsetEval extends HoldOutSubsetEvaluator implements
  OptionHandler, ErrorBasedMeritEvaluator {

  /** Version identifier used for serialization. */
  static final long serialVersionUID = 7532217899385278710L;

  /** The training instances. */
  private Instances m_trainingInstances;

  /** Index of the class attribute. */
  private int m_classIndex;

  /** Number of attributes in the training data. */
  private int m_numAttribs;

  // Number of training instances -- the field is retained only as a comment:
  // private int m_numInstances; NOT USED

  /**
   * Holds the template classifier to use for error estimates. Initialised to
   * a ZeroR instance.
   */
  private Classifier m_ClassifierTemplate = new ZeroR();

  /**
   * Holds the classifier used when evaluating single hold-out instances - this
   * is used by RaceSearch and the trained classifier may need to persist
   * between calls to that particular method. Initialised to a ZeroR instance.
   */
  private Classifier m_Classifier = new ZeroR();

  /**
   * The file that contains the hold out/test instances. Initialised to a
   * placeholder path whose name is a prompt string rather than a real file.
   */
  private File m_holdOutFile = new File("Click to set hold out or "
    + "test instances");

  /** The instances to test on. */
  private Instances m_holdOutInstances;

  /**
   * Evaluate on the training data rather than on a separate hold out/test
   * set. Enabled by default.
   */
  private boolean m_useTraining = true;

  /** Whether to hold out a percentage of the training data. */
  protected boolean m_usePercentageSplit;

  /** Seed for randomizing prior to splitting the training data (default 1). */
  protected int m_seed = 1;

  /**
   * The split to use if doing a percentage split, kept as a string
   * (default "90").
   */
  protected String m_splitPercent = "90";

  /** Tag ID of the "default" evaluation measure. */
  public static final int EVAL_DEFAULT = 1;
  /** Tag ID of the accuracy ("acc") evaluation measure. */
  public static final int EVAL_ACCURACY = 2;
  /** Tag ID of the "rmse" evaluation measure. */
  public static final int EVAL_RMSE = 3;
  /** Tag ID of the "mae" evaluation measure. */
  public static final int EVAL_MAE = 4;
  /** Tag ID of the F-measure ("f-meas") evaluation measure. */
  public static final int EVAL_FMEASURE = 5;
  /** Tag ID of the "auc" (area under the ROC curve) evaluation measure. */
  public static final int EVAL_AUC = 6;
  /** Tag ID of the "auprc" (area under the precision-recall curve) measure. */
  public static final int EVAL_AUPRC = 7;
  /** Tag ID of the correlation coefficient ("corr-coeff") measure. */
  public static final int EVAL_CORRELATION = 8;
  /**
   * NOTE(review): not referenced in the part of the file shown here;
   * presumably the lowest ID reserved for plugin-metric tags - confirm
   * against its usages.
   */
  public static final int EVAL_PLUGIN = 9;

  /**
   * The plugin evaluation metrics obtained from
   * {@link AbstractEvaluationMetric#getPluginMetrics()}. Code in this class
   * null-checks the list before iterating over it.
   *
   * The element type is declared explicitly: this class iterates the list
   * with an {@code AbstractEvaluationMetric} loop variable, which does not
   * compile against a raw {@code List}.
   */
  protected static List<AbstractEvaluationMetric> PLUGIN_METRICS =
    AbstractEvaluationMetric.getPluginMetrics();

  /**
   * Every selectable evaluation measure: the eight built-in metrics first,
   * followed by one tag for each statistic name of each plugin metric.
   */
  public static final Tag[] TAGS_EVALUATION;

  static {
    // Each statistic exposed by a plugin metric needs its own slot.
    int pluginStatCount = 0;
    if (PLUGIN_METRICS != null) {
      for (AbstractEvaluationMetric metric : PLUGIN_METRICS) {
        pluginStatCount += metric.getStatisticNames().size();
      }
    }

    // The built-in measures, in the order they occupy the final array.
    final Tag[] builtIn = {
      new Tag(EVAL_DEFAULT, "default",
        "Default: accuracy (discrete class); RMSE (numeric class)"),
      new Tag(EVAL_ACCURACY, "acc", "Accuracy (discrete class only)"),
      new Tag(EVAL_RMSE, "rmse",
        "RMSE (of the class probabilities for discrete class)"),
      new Tag(EVAL_MAE, "mae",
        "MAE (of the class probabilities for discrete class)"),
      new Tag(EVAL_FMEASURE, "f-meas", "F-measure (discrete class only)"),
      new Tag(EVAL_AUC, "auc",
        "AUC (area under the ROC curve - discrete class only)"),
      new Tag(EVAL_AUPRC, "auprc",
        "AUPRC (area under the precision-recall curve - discrete class only)"),
      new Tag(EVAL_CORRELATION, "corr-coeff",
        "Correlation coefficient - numeric class only")
    };

    final Tag[] tags = new Tag[builtIn.length + pluginStatCount];
    System.arraycopy(builtIn, 0, tags, 0, builtIn.length);

    if (PLUGIN_METRICS != null) {
      int slot = builtIn.length;
      for (AbstractEvaluationMetric metric : PLUGIN_METRICS) {
        for (String statName : metric.getStatisticNames()) {
          // A plugin tag's ID is its array slot plus two, so the first plugin
          // tag (slot 8) gets ID 10. This matches the long-standing behaviour
          // of "array[index++] = new PluginTag(index + 1, ...)", where the
          // index is incremented before the right-hand side is evaluated.
          tags[slot] =
            new WrapperSubsetEval.PluginTag(slot + 2, metric, statName);
          slot++;
        }
      }
    }

    TAGS_EVALUATION = tags;
  }

  /**
   * The evaluation measure to use. Initialised to the first entry of
   * TAGS_EVALUATION, which is the "default" tag.
   */
  protected Tag m_evaluationMeasure = TAGS_EVALUATION[0];

  /**
   * If >= 0, and an IR metric is being used, then evaluate with respect to this
   * class value (0-based index). Initialised to -1.
   */
  protected int m_IRClassVal = -1;

  /**
   * User supplied option for IR class value (either name or 1-based index).
   * Initialised to the empty string.
   */
  protected String m_IRClassValS = "";

  /**
   * Provides a short, human-readable description of this attribute evaluator.
   * 
   * @return the description text, suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // The text (including its spelling) is reproduced exactly as it has
    // always been returned.
    final String heading = "Classifier subset evaluator:\n\n";
    final String body =
      "Evaluates attribute subsets on training data or a seperate hold out testing set. "
        + "Uses a classifier to estimate the 'merit' of a set of attributes.";
    return heading + body;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   **/
  @Override
  public Enumeration