Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
weka.attributeSelection.CfsSubsetEval Maven / Gradle / Ivy
Go to download
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CfsSubsetEval.java
* Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.ThreadSafe;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
/**
* CfsSubsetEval :
*
* Evaluates the worth of a subset of attributes by considering the individual
* predictive ability of each feature along with the degree of redundancy
* between them.
*
* Subsets of features that are highly correlated with the class while having
* low intercorrelation are preferred.
*
* For more information see:
*
* M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Hamilton, New Zealand.
*
*
*
* BibTeX:
*
*
* @phdthesis{Hall1998,
* address = {Hamilton, New Zealand},
* author = {M. A. Hall},
* school = {University of Waikato},
* title = {Correlation-based Feature Subset Selection for Machine Learning},
* year = {1998}
* }
*
*
*
*
* Valid options are:
*
*
*
* -M
* Treat missing values as a separate value.
*
*
*
* -L
* Don't include locally predictive attributes.
*
*
*
* -Z
* Precompute the full correlation matrix at the outset, rather than compute correlations lazily (as needed) during the search. Use this in conjunction with parallel processing in order to speed up a backward search.
*
*
*
* -P <int>
* The size of the thread pool, for example, the number of cores in the CPU. (default 1)
*
*
*
* -E <int>
* The number of threads to use, which should be >= size of thread pool. (default 1)
*
*
*
* -D
* Output debugging info.
*
*
*
*
* @author Mark Hall ([email protected] )
* @version $Revision: 11215 $
* @see Discretize
*/
public class CfsSubsetEval extends ASEvaluation implements SubsetEvaluator,
ThreadSafe, OptionHandler, TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = 747878400813276317L;
/** The training instances (class-missing rows removed; discretized when the class is nominal) */
private Instances m_trainInstances;
/** Discretize filter applied to the data when the class is nominal */
private Discretize m_disTransform;
/** The class index of the training data */
private int m_classIndex;
/** True if the class attribute is numeric */
private boolean m_isNumeric;
/** Number of attributes in the training data */
private int m_numAttribs;
/** Number of instances in the training data */
private int m_numInstances;
/** Treat missing values as separate values (instead of distributing their counts) */
private boolean m_missingSeparate;
/** Include locally predictive attributes during post-processing */
private boolean m_locallyPredictive;
/**
 * Lower-triangular matrix of cached attribute correlations; row i has i+1
 * entries. The sentinel -999 marks an entry not yet computed.
 */
private float[][] m_corr_matrix;
/** Standard deviations of attributes (used as weights when the class is numeric) */
private double[] m_std_devs;
/** Threshold for admitting locally predictive features */
private double m_c_Threshold;
/** Output debugging info */
protected boolean m_debug;
/** Number of off-diagonal entries in the correlation matrix */
protected int m_numEntries;
/** Number of correlations actually computed so far (thread-safe counter) */
protected AtomicInteger m_numFilled;
/** Whether to precompute the full correlation matrix up front */
protected boolean m_preComputeCorrelationMatrix;
/**
 * The number of threads used to compute the correlation matrix. Used when
 * correlation matrix is precomputed
 */
protected int m_numThreads = 1;
/**
 * The size of the thread pool. Usually set equal to the number of CPUs or CPU
 * cores available
 */
protected int m_poolSize = 1;
/** Thread pool (transient: not serialized; recreated in buildEvaluator when needed) */
protected transient ExecutorService m_pool = null;
/**
 * Returns a string describing this attribute evaluator.
 *
 * @return a description of the evaluator suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  String description =
    "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\n"
      + "Subsets of features that are highly correlated with the class "
      + "while having low intercorrelation are preferred.\n\n"
      + "For more information see:\n\n";
  return description + getTechnicalInformation().toString();
}
/**
 * Returns an instance of a TechnicalInformation object, containing detailed
 * information about the technical background of this class, e.g., paper
 * reference or book this class is based on.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  // Hall's PhD thesis is the canonical reference for CFS
  TechnicalInformation info = new TechnicalInformation(Type.PHDTHESIS);
  info.setValue(Field.TITLE,
    "Correlation-based Feature Subset Selection for Machine Learning");
  info.setValue(Field.AUTHOR, "M. A. Hall");
  info.setValue(Field.SCHOOL, "University of Waikato");
  info.setValue(Field.ADDRESS, "Hamilton, New Zealand");
  info.setValue(Field.YEAR, "1998");
  return info;
}
/**
 * Constructor. Initializes the evaluator to its default option settings.
 */
public CfsSubsetEval() {
resetOptions();
}
/**
 * Returns an enumeration describing the available options.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {
  // Generic type parameters restored: the scraped source had lost them
  // (raw Vector/Enumeration), which contradicts the generics used elsewhere.
  Vector<Option> newVector = new Vector<Option>(6);

  newVector.addElement(new Option("\tTreat missing values as a separate "
    + "value.", "M", 0, "-M"));
  newVector.addElement(new Option(
    "\tDon't include locally predictive attributes" + ".", "L", 0, "-L"));
  newVector.addElement(new Option(
    "\t" + preComputeCorrelationMatrixTipText(), "Z", 0, "-Z"));
  newVector.addElement(new Option(
    "\t" + poolSizeTipText() + " (default 1)\n", "P", 1, "-P "));
  newVector.addElement(new Option("\t" + numThreadsTipText()
    + " (default 1)\n", "E", 1, "-E "));
  newVector.addElement(new Option("\tOutput debugging info" + ".", "D", 0,
    "-D"));

  return newVector.elements();
}
/**
 * Parses and sets a given list of options.
 * <p>
 * Valid options are:
 * <ul>
 * <li>-M : treat missing values as a separate value</li>
 * <li>-L : don't include locally predictive attributes</li>
 * <li>-Z : precompute the full correlation matrix at the outset</li>
 * <li>-P &lt;int&gt; : the size of the thread pool (default 1)</li>
 * <li>-E &lt;int&gt; : the number of threads to use (default 1)</li>
 * <li>-D : output debugging info</li>
 * </ul>
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  resetOptions();

  setMissingSeparate(Utils.getFlag('M', options));
  // note: -L *disables* the (default on) locally-predictive behaviour
  setLocallyPredictive(!Utils.getFlag('L', options));
  setPreComputeCorrelationMatrix(Utils.getFlag('Z', options));

  String poolSize = Utils.getOption('P', options);
  setPoolSize(poolSize.length() != 0 ? Integer.parseInt(poolSize) : 1);

  String numThreads = Utils.getOption('E', options);
  setNumThreads(numThreads.length() != 0 ? Integer.parseInt(numThreads) : 1);

  setDebug(Utils.getFlag('D', options));
}
/**
 * Returns the tip text for the pre-compute option.
 *
 * @return a string to describe the option
 */
public String preComputeCorrelationMatrixTipText() {
  // fixed typo in the user-visible text: "conjuction" -> "conjunction"
  return "Precompute the full correlation matrix at the outset, "
    + "rather than compute correlations lazily (as needed) "
    + "during the search. Use this in conjunction with "
    + "parallel processing in order to speed up a backward " + "search.";
}
/**
 * Set whether to pre-compute the full correlation matrix at the outset,
 * rather than computing individual correlations lazily (as needed) during the
 * search.
 *
 * @param p true if the correlation matrix is to be pre-computed at the outset
 */
public void setPreComputeCorrelationMatrix(boolean p) {
m_preComputeCorrelationMatrix = p;
}
/**
 * Get whether to pre-compute the full correlation matrix at the outset,
 * rather than computing individual correlations lazily (as needed) during the
 * search.
 *
 * @return true if the correlation matrix is to be pre-computed at the outset
 */
public boolean getPreComputeCorrelationMatrix() {
return m_preComputeCorrelationMatrix;
}
/**
 * Returns the tip text for the number-of-threads option.
 *
 * @return a string to describe the option
 */
public String numThreadsTipText() {
return "The number of threads to use, which should be >= size of thread pool.";
}
/**
 * Gets the number of threads used when pre-computing the correlation matrix.
 *
 * @return the number of threads
 */
public int getNumThreads() {
return m_numThreads;
}
/**
 * Sets the number of threads used when pre-computing the correlation matrix.
 *
 * @param nT the number of threads to use
 */
public void setNumThreads(int nT) {
m_numThreads = nT;
}
/**
 * Returns the tip text for the pool-size option.
 *
 * @return a string to describe the option
 */
public String poolSizeTipText() {
return "The size of the thread pool, for example, the number of cores in the CPU.";
}
/**
 * Gets the size of the thread pool.
 *
 * @return the pool size
 */
public int getPoolSize() {
return m_poolSize;
}
/**
 * Sets the size of the thread pool.
 *
 * @param nT the pool size
 */
public void setPoolSize(int nT) {
m_poolSize = nT;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String locallyPredictiveTipText() {
return "Identify locally predictive attributes. Iteratively adds "
+ "attributes with the highest correlation with the class as long "
+ "as there is not already an attribute in the subset that has a "
+ "higher correlation with the attribute in question";
}
/**
 * Include locally predictive attributes
 *
 * @param b true or false
 */
public void setLocallyPredictive(boolean b) {
m_locallyPredictive = b;
}
/**
 * Return true if including locally predictive attributes
 *
 * @return true if locally predictive attributes are to be used
 */
public boolean getLocallyPredictive() {
return m_locallyPredictive;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String missingSeparateTipText() {
return "Treat missing as a separate value. Otherwise, counts for missing "
+ "values are distributed across other values in proportion to their "
+ "frequency.";
}
/**
 * Treat missing as a separate value
 *
 * @param b true or false
 */
public void setMissingSeparate(boolean b) {
m_missingSeparate = b;
}
/**
 * Return true if missing is treated as a separate value
 *
 * @return true if missing is to be treated as a separate value
 */
public boolean getMissingSeparate() {
return m_missingSeparate;
}
/**
 * Set whether to output debugging info
 *
 * @param d true if debugging info is to be output
 */
public void setDebug(boolean d) {
m_debug = d;
}
/**
 * Get whether to output debugging info
 *
 * @return true if debugging info is to be output
 */
public boolean getDebug() {
return m_debug;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String debugTipText() {
return "Output debugging info";
}
/**
 * Gets the current settings of CfsSubsetEval.
 *
 * @return an array of strings suitable for passing to setOptions()
 */
@Override
public String[] getOptions() {
  // Generic type parameter restored (the scraped source had a raw Vector).
  Vector<String> options = new Vector<String>();

  if (getMissingSeparate()) {
    options.add("-M");
  }
  if (!getLocallyPredictive()) {
    options.add("-L");
  }
  if (getPreComputeCorrelationMatrix()) {
    options.add("-Z");
  }
  options.add("-P");
  options.add("" + getPoolSize());
  options.add("-E");
  options.add("" + getNumThreads());
  if (getDebug()) {
    options.add("-D");
  }

  return options.toArray(new String[0]);
}
/**
 * Returns the capabilities of this evaluator.
 *
 * @return the capabilities of this evaluator
 * @see Capabilities
 */
@Override
public Capabilities getCapabilities() {
  Capabilities result = super.getCapabilities();
  result.disableAll();

  // attribute capabilities
  result.enable(Capability.NOMINAL_ATTRIBUTES);
  result.enable(Capability.NUMERIC_ATTRIBUTES);
  result.enable(Capability.DATE_ATTRIBUTES);
  result.enable(Capability.MISSING_VALUES);

  // class capabilities
  result.enable(Capability.NOMINAL_CLASS);
  result.enable(Capability.NUMERIC_CLASS);
  result.enable(Capability.DATE_CLASS);
  result.enable(Capability.MISSING_CLASS_VALUES);

  return result;
}
/**
* Generates a attribute evaluator. Has to initialize all fields of the
* evaluator that are not being set via options.
*
* CFS also discretises attributes (if necessary) and initializes the
* correlation matrix.
*
* @param data set of instances serving as training data
* @throws Exception if the evaluator has not been generated successfully
*/
@Override
public void buildEvaluator(Instances data) throws Exception {
// can evaluator handle data?
getCapabilities().testWithFail(data);
m_numEntries = 0;
m_numFilled = new AtomicInteger();
m_trainInstances = new Instances(data);
m_trainInstances.deleteWithMissingClass();
m_classIndex = m_trainInstances.classIndex();
m_numAttribs = m_trainInstances.numAttributes();
m_numInstances = m_trainInstances.numInstances();
m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();
if (!m_isNumeric) {
m_disTransform = new Discretize();
m_disTransform.setUseBetterEncoding(true);
m_disTransform.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
if (m_debug) {
System.err.println("Finished discretizing input data");
}
}
m_std_devs = new double[m_numAttribs];
m_corr_matrix = new float[m_numAttribs][];
for (int i = 0; i < m_numAttribs; i++) {
m_corr_matrix[i] = new float[i + 1];
m_numEntries += (i + 1);
}
m_numEntries -= m_numAttribs;
for (int i = 0; i < m_corr_matrix.length; i++) {
m_corr_matrix[i][i] = 1.0f;
m_std_devs[i] = 1.0;
}
for (int i = 0; i < m_numAttribs; i++) {
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
m_corr_matrix[i][j] = -999;
}
}
if (m_preComputeCorrelationMatrix && m_poolSize > 1) {
m_pool = Executors.newFixedThreadPool(m_poolSize);
Set> results = new HashSet>();
int numEntriesPerThread = (m_numEntries + m_numAttribs) / m_numThreads;
numEntriesPerThread = numEntriesPerThread < 1 ? 1 : numEntriesPerThread;
int startRow = 0;
int startCol = 0;
int count = 0;
for (int i = 0; i < m_corr_matrix.length; i++) {
for (int j = 0; j < m_corr_matrix[i].length; j++) {
count++;
if (count == numEntriesPerThread
|| (i == m_corr_matrix.length - 1 && j == m_corr_matrix[i].length - 1)) {
final int sR = startRow;
final int sC = startCol;
final int eR = i;
final int eC = j;
startRow = i;
startCol = j;
count = 0;
Future future = m_pool.submit(new Callable() {
@Override
public Void call() throws Exception {
if (m_debug) {
System.err
.println("Starting correlation computation task...");
}
for (int i = sR; i <= eR; i++) {
for (int j = (i == sR ? sC : 0); j < (i == eR ? eC
: m_corr_matrix[i].length); j++) {
if (m_corr_matrix[i][j] == -999) {
float corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
}
}
}
if (m_debug) {
System.err
.println("Percentage of correlation matrix computed: "
+ Utils.doubleToString(((double) m_numFilled.get()
/ m_numEntries * 100.0), 2) + "%");
}
return null;
}
});
results.add(future);
}
}
}
for (Future f : results) {
f.get();
}
// shut down the thread pool
m_pool.shutdown();
}
}
/**
 * Evaluates a subset of attributes with the CFS merit:
 * the sum of (weighted) attribute-class correlations divided by the square
 * root of the sum of (weighted) pairwise inter-attribute correlations.
 * Correlations are looked up in the lower-triangular cache m_corr_matrix
 * (larger index = row, smaller index = column) and computed lazily when an
 * entry still holds the -999 sentinel. The m_std_devs weights are 1.0 unless
 * set by the numeric correlation routines.
 *
 * @param subset a bitset representing the attribute subset to be evaluated
 * @return the merit (absolute value; 0.0 when the denominator is zero)
 * @throws Exception if the subset could not be evaluated
 */
@Override
public double evaluateSubset(BitSet subset) throws Exception {
double num = 0.0;
double denom = 0.0;
float corr;
int larger, smaller;
// numerator: sum of correlations between each selected attribute and the class
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
// map (i, class) into the lower triangle: row = larger index
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
if (m_corr_matrix[larger][smaller] == -999) {
// not cached yet: compute and store
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
num += (m_std_devs[i] * corr);
} else {
num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
}
}
}
}
// denominator: self terms plus twice each pairwise inter-correlation
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
denom += (1.0 * m_std_devs[i] * m_std_devs[i]);
// j runs over the strictly-lower part of row i (excludes the diagonal)
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
if (subset.get(j)) {
if (m_corr_matrix[i][j] == -999) {
corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
} else {
denom +=
(2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
}
}
}
}
}
}
// guard against a negative or zero denominator
if (denom < 0.0) {
denom *= -1.0;
}
if (denom == 0.0) {
return (0.0);
}
double merit = (num / Math.sqrt(denom));
// merit is reported as an absolute value
if (merit < 0.0) {
merit *= -1.0;
}
return merit;
}
/**
 * Computes the correlation between two attributes, dispatching on the class
 * type and the types of the two attributes. For a nominal class (data has
 * been discretized) symmetrical uncertainty is used; otherwise the
 * appropriate numeric/nominal correlation routine is selected. Also bumps
 * the computed-entries counter.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the correlation as a float
 */
private float correlate(int att1, int att2) {
  m_numFilled.addAndGet(1);

  // nominal class: everything has been discretized
  if (!m_isNumeric) {
    return (float) symmUncertCorr(att1, att2);
  }

  final boolean firstIsNumeric = m_trainInstances.attribute(att1).isNumeric();
  final boolean secondIsNumeric = m_trainInstances.attribute(att2).isNumeric();

  if (firstIsNumeric && secondIsNumeric) {
    return (float) num_num(att1, att2);
  }
  if (secondIsNumeric) {
    // att1 nominal, att2 numeric
    return (float) num_nom2(att1, att2);
  }
  if (firstIsNumeric) {
    // att2 nominal, att1 numeric: swap so the nominal comes first
    return (float) num_nom2(att2, att1);
  }
  // both nominal
  return (float) nom_nom(att1, att2);
}
/**
 * Computes the symmetrical uncertainty between two nominal (or discretized)
 * attributes from their contingency table. An extra row/column collects
 * missing values; unless missing is treated as a separate value, those
 * counts are redistributed over the known values in proportion to their
 * frequencies before the measure is computed.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the symmetrical uncertainty; when it is (numerically) zero the
 *         result is 0.0 if either attribute is the class, 1.0 otherwise
 */
private double symmUncertCorr(int att1, int att2) {
int i, j, ii, jj;
int ni, nj;
double sum = 0.0;
double sumi[], sumj[];
double counts[][];
Instance inst;
double corr_measure;
boolean flag = false;
double temp = 0.0;
// remember whether one of the pair is the class attribute
if (att1 == m_classIndex || att2 == m_classIndex) {
flag = true;
}
// one extra row/column for missing values
ni = m_trainInstances.attribute(att1).numValues() + 1;
nj = m_trainInstances.attribute(att2).numValues() + 1;
counts = new double[ni][nj];
sumi = new double[ni];
sumj = new double[nj];
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
counts[i][j] = 0.0;
}
}
// Fill the contingency table (missing maps to the last row/column)
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
ii = ni - 1;
} else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
jj = nj - 1;
} else {
jj = (int) inst.value(att2);
}
counts[ii][jj]++;
}
// get the row totals (and the overall total)
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumi[i] += counts[i][j];
sum += counts[i][j];
}
}
// get the column totals
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
for (i = 0; i < ni; i++) {
sumj[j] += counts[i][j];
}
}
// distribute missing counts proportionally (unless missing is a separate
// value, or every instance is missing on one of the attributes)
if (!m_missingSeparate && (sumi[ni - 1] < m_numInstances)
&& (sumj[nj - 1] < m_numInstances)) {
// snapshot the original totals/counts before redistributing
double[] i_copy = new double[sumi.length];
double[] j_copy = new double[sumj.length];
double[][] counts_copy = new double[sumi.length][sumj.length];
for (i = 0; i < ni; i++) {
System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
}
System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
double total_missing =
(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
// redistribute counts where only att1 is missing
if (sumi[ni - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
if (counts[ni - 1][j] > 0.0) {
for (i = 0; i < ni - 1; i++) {
temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
counts[i][j] += temp;
sumi[i] += temp;
}
counts[ni - 1][j] = 0.0;
}
}
}
sumi[ni - 1] = 0.0;
// redistribute counts where only att2 is missing
if (sumj[nj - 1] > 0.0) {
for (i = 0; i < ni - 1; i++) {
if (counts[i][nj - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
counts[i][j] += temp;
sumj[j] += temp;
}
counts[i][nj - 1] = 0.0;
}
}
}
sumj[nj - 1] = 0.0;
// redistribute counts where both attributes are missing
if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
for (i = 0; i < ni - 1; i++) {
for (j = 0; j < nj - 1; j++) {
temp = (counts_copy[i][j] / (sum - total_missing))
* counts_copy[ni - 1][nj - 1];
counts[i][j] += temp;
sumi[i] += temp;
sumj[j] += temp;
}
}
counts[ni - 1][nj - 1] = 0.0;
}
}
corr_measure = ContingencyTables.symmetricalUncertainty(counts);
// a zero measure is "maximally bad": 0.0 involving the class, else 1.0
if (Utils.eq(corr_measure, 0.0)) {
if (flag == true) {
return (0.0);
} else {
return (1.0);
}
} else {
return (corr_measure);
}
}
/**
 * Computes the (absolute) Pearson-style correlation between two numeric
 * attributes. Missing values contribute a zero deviation from the mean. As a
 * side effect, records the attributes' standard deviations in m_std_devs the
 * first time they are seen (entries still at the 1.0 default).
 *
 * @param att1 index of the first (numeric) attribute
 * @param att2 index of the second (numeric) attribute
 * @return |r|; if either attribute has zero variance, 1.0 when neither is
 *         the class and 0.0 otherwise
 */
private double num_num(int att1, int att2) {
int i;
Instance inst;
double r, diff1, diff2, num = 0.0, sx = 0.0, sy = 0.0;
double mx = m_trainInstances.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
// accumulate cross products and sums of squared deviations
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
diff1 = (inst.isMissing(att1)) ? 0.0 : (inst.value(att1) - mx);
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
num += (diff1 * diff2);
sx += (diff1 * diff1);
sy += (diff2 * diff2);
}
// record standard deviations on first computation (default entry is 1.0)
if (sx != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt((sx / m_numInstances));
}
}
if (sy != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((sy / m_numInstances));
}
}
if ((sx * sy) > 0.0) {
r = (num / (Math.sqrt(sx * sy)));
return ((r < 0.0) ? -r : r);
} else {
// zero variance: maximally redundant (1.0) unless the class is involved
if (att1 != m_classIndex && att2 != m_classIndex) {
return 1.0;
} else {
return 0.0;
}
}
}
/**
 * Computes a weighted (absolute) correlation between a nominal attribute and
 * a numeric attribute: the nominal attribute is expanded into one binary
 * indicator per value, each indicator's correlation with the numeric
 * attribute is computed, and the results are averaged weighted by the value
 * frequencies. Missing nominal values map to the mode (or to a separate
 * value when m_missingSeparate is set); missing numeric values contribute a
 * zero deviation. Also records the attributes' standard deviations in
 * m_std_devs the first time they are seen.
 *
 * @param att1 index of the nominal attribute
 * @param att2 index of the numeric attribute
 * @return the weighted absolute correlation; 1.0 is substituted when the
 *         result is zero and neither attribute is the class
 */
private double num_nom2(int att1, int att2) {
int i, ii, k;
double temp;
Instance inst;
int mx = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
double stdv_num = 0.0;
double diff1, diff2;
double r = 0.0, rr;
// number of indicator variables; one extra when missing is a separate value
int nx = (!m_missingSeparate) ? m_trainInstances.attribute(att1)
.numValues() : m_trainInstances.attribute(att1).numValues() + 1;
double[] prior_nom = new double[nx];
double[] stdvs_nom = new double[nx];
double[] covs = new double[nx];
for (i = 0; i < nx; i++) {
stdvs_nom[i] = covs[i] = prior_nom[i] = 0.0;
}
// calculate frequencies (and means) of the values of the nominal
// attribute
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
// missing maps to the mode
ii = mx;
} else {
// missing is its own (last) value
ii = nx - 1;
}
} else {
ii = (int) inst.value(att1);
}
// increment freq for nominal
prior_nom[ii]++;
}
// accumulate variances and covariances of each binary indicator vs. att2
for (k = 0; k < m_numInstances; k++) {
inst = m_trainInstances.instance(k);
// std dev of numeric attribute
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
stdv_num += (diff2 * diff2);
//
for (i = 0; i < nx; i++) {
// temp is the binary indicator for value i of the nominal attribute
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp = (i == mx) ? 1.0 : 0.0;
} else {
temp = (i == (nx - 1)) ? 1.0 : 0.0;
}
} else {
temp = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp - (prior_nom[i] / m_numInstances));
stdvs_nom[i] += (diff1 * diff1);
covs[i] += (diff1 * diff2);
}
}
// calculate weighted correlation
for (i = 0, temp = 0.0; i < nx; i++) {
// calculate the weighted variance of the nominal
temp +=
((prior_nom[i] / m_numInstances) * (stdvs_nom[i] / m_numInstances));
if ((stdvs_nom[i] * stdv_num) > 0.0) {
rr = (covs[i] / (Math.sqrt(stdvs_nom[i] * stdv_num)));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i] / m_numInstances) * rr);
}
/*
 * if there is zero variance for the numeric att at a specific level of
 * the catergorical att then if neither is the class then make this
 * correlation at this level maximally bad i.e. 1.0. If either is the
 * class then maximally bad correlation is 0.0
 */
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i] / m_numInstances) * 1.0);
}
}
}
// set the standard deviations for these attributes if necessary
if (temp != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp);
}
}
if (stdv_num != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((stdv_num / m_numInstances));
}
}
// zero overall correlation is maximally redundant unless the class is involved
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
/**
 * Computes a weighted (absolute) correlation between two nominal attributes:
 * both are expanded into binary indicators (one per value), the correlation
 * between every indicator pair is computed, and the results are averaged
 * weighted by the joint value frequencies. Missing values map to the mode of
 * the respective attribute (or to a separate value when m_missingSeparate is
 * set). Also records the attributes' weighted standard deviations in
 * m_std_devs the first time they are seen.
 *
 * @param att1 index of the first nominal attribute
 * @param att2 index of the second nominal attribute
 * @return the weighted absolute correlation; 1.0 is substituted when the
 *         result is zero and neither attribute is the class
 */
private double nom_nom(int att1, int att2) {
int i, j, ii, jj, z;
double temp1, temp2;
Instance inst;
int mx = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att1));
int my = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att2));
double diff1, diff2;
double r = 0.0, rr;
// number of indicator variables; one extra when missing is a separate value
int nx = (!m_missingSeparate) ? m_trainInstances.attribute(att1)
.numValues() : m_trainInstances.attribute(att1).numValues() + 1;
int ny = (!m_missingSeparate) ? m_trainInstances.attribute(att2)
.numValues() : m_trainInstances.attribute(att2).numValues() + 1;
double[][] prior_nom = new double[nx][ny];
double[] sumx = new double[nx];
double[] sumy = new double[ny];
double[] stdvsx = new double[nx];
double[] stdvsy = new double[ny];
double[][] covs = new double[nx][ny];
for (i = 0; i < nx; i++) {
sumx[i] = stdvsx[i] = 0.0;
}
for (j = 0; j < ny; j++) {
sumy[j] = stdvsy[j] = 0.0;
}
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
covs[i][j] = prior_nom[i][j] = 0.0;
}
}
// calculate frequencies (and means) of the values of the nominal
// attribute
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
// missing maps to the mode
ii = mx;
} else {
// missing is its own (last) value
ii = nx - 1;
}
} else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
jj = my;
} else {
jj = ny - 1;
}
} else {
jj = (int) inst.value(att2);
}
// increment freq for nominal
prior_nom[ii][jj]++;
sumx[ii]++;
sumy[jj]++;
}
// accumulate indicator variances and pairwise covariances
for (z = 0; z < m_numInstances; z++) {
inst = m_trainInstances.instance(z);
// variance contributions for att2's indicators
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
} else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
} else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
stdvsy[j] += (diff2 * diff2);
}
// variance contributions for att1's indicators and the covariances
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp1 = (i == mx) ? 1.0 : 0.0;
} else {
temp1 = (i == (nx - 1)) ? 1.0 : 0.0;
}
} else {
temp1 = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp1 - (sumx[i] / m_numInstances));
stdvsx[i] += (diff1 * diff1);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
} else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
} else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
covs[i][j] += (diff1 * diff2);
}
}
}
// calculate weighted correlation
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
if ((stdvsx[i] * stdvsy[j]) > 0.0) {
rr = (covs[i][j] / (Math.sqrt(stdvsx[i] * stdvsy[j])));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i][j] / m_numInstances) * rr);
}
// if there is zero variance for either of the categorical atts then if
// neither is the class then make this
// correlation at this level maximally bad i.e. 1.0. If either is
// the class then maximally bad correlation is 0.0
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i][j] / m_numInstances) * 1.0);
}
}
}
}
// calculate weighted standard deviations for these attributes
// (if necessary)
for (i = 0, temp1 = 0.0; i < nx; i++) {
temp1 += ((sumx[i] / m_numInstances) * (stdvsx[i] / m_numInstances));
}
if (temp1 != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp1);
}
}
for (j = 0, temp2 = 0.0; j < ny; j++) {
temp2 += ((sumy[j] / m_numInstances) * (stdvsy[j] / m_numInstances));
}
if (temp2 != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt(temp2);
}
}
// zero overall correlation is maximally redundant unless the class is involved
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
/**
 * Returns a string describing CFS.
 *
 * @return the description as a string
 */
@Override
public String toString() {
  if (m_trainInstances == null) {
    return "CFS subset evaluator has not been built yet\n";
  }

  StringBuilder text = new StringBuilder("\tCFS Subset Evaluator\n");
  if (m_missingSeparate) {
    text.append("\tTreating missing values as a separate value\n");
  }
  if (m_locallyPredictive) {
    text.append("\tIncluding locally predictive attributes\n");
  }
  return text.toString();
}
/**
 * Iteratively augments best_group with "locally predictive" attributes:
 * repeatedly takes the unselected attribute with the highest class
 * correlation and adds it, unless some attribute already in best_group
 * correlates with it more strongly than (its class correlation minus
 * m_c_Threshold). Correlations are read from / lazily written to
 * m_corr_matrix (-999 marks a not-yet-computed entry). Mutates best_group
 * in place; terminates when no candidate remains.
 *
 * @param best_group the attribute subset to augment (modified in place)
 */
private void addLocallyPredictive(BitSet best_group) {
int i, j;
boolean done = false;
boolean ok = true;
double temp_best = -1.0;
float corr;
j = 0;
// temp_group tracks every candidate ever considered, so each attribute
// is examined at most once even if it is rejected
BitSet temp_group = (BitSet) best_group.clone();
int larger, smaller;
while (!done) {
temp_best = -1.0;
// find the not-yet-considered attribute best correlated with the class
for (i = 0; i < m_numAttribs; i++) {
// map (i, class) into the lower triangle: row = larger index
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
if ((!temp_group.get(i)) && (i != m_classIndex)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best) {
temp_best = m_corr_matrix[larger][smaller];
j = i;
}
}
}
if (temp_best == -1.0) {
// no candidates left
done = true;
} else {
ok = true;
// mark j as considered whether or not it gets accepted
temp_group.set(j);
// check the best against correlations with others already
// in group
for (i = 0; i < m_numAttribs; i++) {
if (i > j) {
larger = i;
smaller = j;
} else {
larger = j;
smaller = i;
}
if (best_group.get(i)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, j);
m_corr_matrix[larger][smaller] = corr;
}
// reject j if an existing member "covers" it within the threshold
if (m_corr_matrix[larger][smaller] > temp_best - m_c_Threshold) {
ok = false;
break;
}
}
}
// if ok then add to best_group
if (ok) {
best_group.set(j);
}
}
}
}
/**
 * Calls addLocallyPredictive in order to include locally predictive
 * attributes (if requested), then releases the training data (keeping only
 * the header).
 *
 * @param attributeSet the set of attributes found by the search
 * @return a possibly augmented list of postprocessed attributes
 * @throws Exception if postprocessing fails for some reason
 */
@Override
public int[] postProcess(int[] attributeSet) throws Exception {
if (m_debug) {
System.err.println("Percentage of correlation matrix computed "
+ "over the search: "
+ Utils.doubleToString(
((double) m_numFilled.get() / m_numEntries * 100.0), 2) + "%");
}
int j = 0;
if (!m_locallyPredictive) {
// free the instance data; keep only the header
m_trainInstances = new Instances(m_trainInstances, 0);
return attributeSet;
}
BitSet bestGroup = new BitSet(m_numAttribs);
for (int element : attributeSet) {
bestGroup.set(element);
}
addLocallyPredictive(bestGroup);
// count how many are set
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
j++;
}
}
// convert the bitset back to an index array
int[] newSet = new int[j];
j = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
newSet[j++] = i;
}
}
// free the instance data; keep only the header
m_trainInstances = new Instances(m_trainInstances, 0);
return newSet;
}
/**
 * Resets the core options to their defaults. NOTE: only these four are
 * reset here; the thread-pool/debug/precompute settings are (re)assigned
 * explicitly by setOptions().
 */
protected void resetOptions() {
m_trainInstances = null;
m_missingSeparate = false;
m_locallyPredictive = true;
m_c_Threshold = 0.0;
}
/**
 * Returns the revision string.
 *
 * @return the revision
 */
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11215 $");
}
/**
 * Main method for testing this class from the command line.
 *
 * @param args the options
 */
public static void main(String[] args) {
runEvaluator(new CfsSubsetEval(), args);
}
}