weka.attributeSelection.SubsetSizeForwardSelection Maven / Gradle / Ivy
Show all versions of weka-stable Show documentation
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* SubsetSizeForwardSelection.java
* Copyright (C) 2007 Martin Guetlein
*
*/
package weka.attributeSelection;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.Utils;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* SubsetSizeForwardSelection:
*
* Extension of LinearForwardSelection. The search performs an interior cross-validation (seed and number of folds can be specified). A LinearForwardSelection is performed on each foldto determine the optimal subset-size (using the given SubsetSizeEvaluator). Finally, a LinearForwardSelection up to the optimal subset-size is performed on the whole data.
*
* For more information see:
*
* Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany.
*
*
* Valid options are:
*
* -I
* Perform initial ranking to select the
* top-ranked attributes.
*
* -K <num>
* Number of top-ranked attributes that are
* taken into account by the search.
*
* -T <0 = fixed-set | 1 = fixed-width>
* Type of Linear Forward Selection (default = 0).
*
* -S <num>
* Size of lookup cache for evaluated subsets.
* Expressed as a multiple of the number of
* attributes in the data set. (default = 1)
*
* -E <subset evaluator>
* Subset-evaluator used for subset-size determination.-- -M
*
* -F <num>
* Number of cross validation folds
* for subset size determination (default = 5).
*
* -R <num>
* Seed for cross validation
* subset size determination. (default = 1)
*
* -Z
* verbose on/off
*
*
* Options specific to evaluator weka.attributeSelection.ClassifierSubsetEval:
*
*
* -B <classifier>
* class name of the classifier to use for accuracy estimation.
* Place any classifier options LAST on the command line
* following a "--". eg.:
* -B weka.classifiers.bayes.NaiveBayes ... -- -K
* (default: weka.classifiers.rules.ZeroR)
*
* -T
* Use the training data to estimate accuracy.
*
* -H <filename>
* Name of the hold out/test set to
* estimate accuracy on.
*
*
* Options specific to scheme weka.classifiers.rules.ZeroR:
*
*
* -D
* If set, classifier is run in debug mode and
* may output additional info to the console
*
*
* @author Martin Guetlein ([email protected])
* @version $Revision: 5605 $
*/
public class SubsetSizeForwardSelection extends ASSearch
implements OptionHandler {
/** search directions */
protected static final int TYPE_FIXED_SET = 0;
protected static final int TYPE_FIXED_WIDTH = 1;
public static final Tag[] TAGS_TYPE = {
new Tag(TYPE_FIXED_SET, "Fixed-set"),
new Tag(TYPE_FIXED_WIDTH, "Fixed-width"),
};
// member variables
/** perform initial ranking to select top-ranked attributes */
protected boolean m_performRanking;
/**
* number of top-ranked attributes that are taken into account for the
* search
*/
protected int m_numUsedAttributes;
/** 0 == fixed-set, 1 == fixed-width */
protected int m_linearSelectionType;
/** the subset evaluator to use for subset size determination */
private ASEvaluation m_setSizeEval;
/**
* Number of cross validation folds for subset size determination (default =
* 5).
*/
protected int m_numFolds;
/** Seed for cross validation subset size determination. (default = 1) */
protected int m_seed;
/** number of attributes in the data */
protected int m_numAttribs;
/** total number of subsets evaluated during a search */
protected int m_totalEvals;
/** for debugging */
protected boolean m_verbose;
/** holds the merit of the best subset found */
protected double m_bestMerit;
/** holds the maximum size of the lookup cache for evaluated subsets */
protected int m_cacheSize;
/**
* Constructor
*/
public SubsetSizeForwardSelection() {
resetOptions();
}
/**
* Returns a string describing this search method
*
* @return a description of the search method suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "SubsetSizeForwardSelection:\n\n" +
"Extension of LinearForwardSelection. The search performs an interior " +
"cross-validation (seed and number of folds can be specified). A " +
"LinearForwardSelection is performed on each foldto determine the optimal " +
"subset-size (using the given SubsetSizeEvaluator). Finally, a " +
"LinearForwardSelection up to the optimal subset-size is performed on " +
"the whole data.\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing
* detailed information about the technical background of this class,
* e.g., paper reference or book this class is based on.
*
* @return the technical information about this class
*/
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall");
result.setValue(Field.YEAR, "2009");
result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining");
result.setValue(Field.PAGES, "332-339");
result.setValue(Field.PUBLISHER, "IEEE");
additional = result.add(Type.MASTERSTHESIS);
additional.setValue(Field.AUTHOR, "Martin Guetlein");
additional.setValue(Field.YEAR, "2006");
additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet");
additional.setValue(Field.ADDRESS, "Freiburg, Germany");
return result;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*
*/
public Enumeration listOptions() {
Vector newVector = new Vector(9);
newVector.addElement(new Option("\tPerform initial ranking to select the" +
"\n\ttop-ranked attributes.", "I", 0, "-I"));
newVector.addElement(new Option(
"\tNumber of top-ranked attributes that are " +
"\n\ttaken into account by the search.", "K", 1, "-K "));
newVector.addElement(new Option(
"\tType of Linear Forward Selection (default = 0).", "T", 1,
"-T <0 = fixed-set | 1 = fixed-width>"));
newVector.addElement(new Option(
"\tSize of lookup cache for evaluated subsets." +
"\n\tExpressed as a multiple of the number of" +
"\n\tattributes in the data set. (default = 1)", "S", 1, "-S "));
newVector.addElement(new Option(
"\tSubset-evaluator used for subset-size determination." + "-- -M",
"E", 1, "-E "));
newVector.addElement(new Option("\tNumber of cross validation folds" +
"\n\tfor subset size determination (default = 5).", "F", 1, "-F "));
newVector.addElement(new Option("\tSeed for cross validation" +
"\n\tsubset size determination. (default = 1)", "R", 1, "-R "));
newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z"));
if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
newVector.addElement(new Option("", "", 0,
"\nOptions specific to " + "evaluator " +
m_setSizeEval.getClass().getName() + ":"));
Enumeration enu = ((OptionHandler) m_setSizeEval).listOptions();
while (enu.hasMoreElements()) {
newVector.addElement(enu.nextElement());
}
}
return newVector.elements();
}
/**
* Parses a given list of options.
*
* Valid options are:
*
*
* -I
* Perform initial ranking to select top-ranked attributes.
*
*
* -K
* Number of top-ranked attributes that are taken into account.
*
*
* -T <0 = fixed-set | 1 = fixed-width>
* Typ of Linear Forward Selection (default = 0).
*
*
* -S
* Size of lookup cache for evaluated subsets. Expressed as a multiple of
* the number of attributes in the data set. (default = 1).
*
*
* -E
* class name of subset evaluator to use for subset size determination
* (default = null, same subset evaluator as for ranking and final forward
* selection is used). Place any evaluator options LAST on the command line
* following a "--". eg. -A weka.attributeSelection.ClassifierSubsetEval ... --
* -M
*
*
*
* -F * Number of cross validation folds for subset size determination (default = * 5). *
*
* -R
*
* -Z
*
* @param options
* the list of options as an array of strings
* @exception Exception
* if an option is not supported
*
*/
public void setOptions(String[] options) throws Exception {
String optionString;
resetOptions();
setPerformRanking(Utils.getFlag('I', options));
optionString = Utils.getOption('K', options);
if (optionString.length() != 0) {
setNumUsedAttributes(Integer.parseInt(optionString));
}
optionString = Utils.getOption('T', options);
if (optionString.length() != 0) {
setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE));
} else {
setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE));
}
optionString = Utils.getOption('S', options);
if (optionString.length() != 0) {
setLookupCacheSize(Integer.parseInt(optionString));
}
optionString = Utils.getOption('E', options);
if (optionString.length() == 0) {
System.out.println(
"No subset size evaluator given, using evaluator that is used for final search.");
m_setSizeEval = null;
} else {
setSubsetSizeEvaluator(ASEvaluation.forName(optionString,
Utils.partitionOptions(options)));
}
optionString = Utils.getOption('F', options);
if (optionString.length() != 0) {
setNumSubsetSizeCVFolds(Integer.parseInt(optionString));
}
optionString = Utils.getOption('R', options);
if (optionString.length() != 0) {
setSeed(Integer.parseInt(optionString));
}
m_verbose = Utils.getFlag('Z', options);
}
/**
* Set the maximum size of the evaluated subset cache (hashtable). This is
* expressed as a multiplier for the number of attributes in the data set.
* (default = 1).
*
* @param size
* the maximum size of the hashtable
*/
public void setLookupCacheSize(int size) {
if (size >= 0) {
m_cacheSize = size;
}
}
/**
* Return the maximum size of the evaluated subset cache (expressed as a
* multiplier for the number of attributes in a data set.
*
* @return the maximum size of the hashtable.
*/
public int getLookupCacheSize() {
return m_cacheSize;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String lookupCacheSizeTipText() {
return "Set the maximum size of the lookup cache of evaluated subsets. This is " +
"expressed as a multiplier of the number of attributes in the data set. " +
"(default = 1).";
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String performRankingTipText() {
return "Perform initial ranking to select top-ranked attributes.";
}
/**
* Perform initial ranking to select top-ranked attributes.
*
* @param b
* true if initial ranking should be performed
*/
public void setPerformRanking(boolean b) {
m_performRanking = b;
}
/**
* Get boolean if initial ranking should be performed to select the
* top-ranked attributes
*
* @return true if initial ranking should be performed
*/
public boolean getPerformRanking() {
return m_performRanking;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String numUsedAttributesTipText() {
return "Set the amount of top-ranked attributes that are taken into account by the search process.";
}
/**
* Set the number of top-ranked attributes that taken into account by the
* search process.
*
* @param k
* the number of attributes
* @exception Exception
* if k is less than 2
*/
public void setNumUsedAttributes(int k) throws Exception {
if (k < 2) {
throw new Exception("Value of -K must be >= 2.");
}
m_numUsedAttributes = k;
}
/**
* Get the number of top-ranked attributes that taken into account by the
* search process.
*
* @return the number of top-ranked attributes that taken into account
*/
public int getNumUsedAttributes() {
return m_numUsedAttributes;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String typeTipText() {
return "Set the type of the search.";
}
/**
* Set the type
*
* @param t
* the Linear Forward Selection type
*/
public void setType(SelectedTag t) {
if (t.getTags() == TAGS_TYPE) {
m_linearSelectionType = t.getSelectedTag().getID();
}
}
/**
* Get the type
*
* @return the Linear Forward Selection type
*/
public SelectedTag getType() {
return new SelectedTag(m_linearSelectionType, TAGS_TYPE);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String subsetSizeEvaluatorTipText() {
return "Subset evaluator to use for subset size determination.";
}
/**
* Set the subset evaluator to use for subset size determination.
*
* @param eval
* the subset evaluator to use for subset size determination.
*/
public void setSubsetSizeEvaluator(ASEvaluation eval)
throws Exception {
if (!(eval instanceof SubsetEvaluator)) {
throw new Exception(eval.getClass().getName() +
" is no subset evaluator.");
}
m_setSizeEval = eval;
}
/**
* Get the subset evaluator used for subset size determination.
*
* @return the evaluator used for subset size determination.
*/
public ASEvaluation getSubsetSizeEvaluator() {
return m_setSizeEval;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String numSubsetSizeCVFoldsTipText() {
return "Number of cross validation folds for subset size determination";
}
/**
* Set the number of cross validation folds for subset size determination
* (default = 5).
*
* @param f
* number of folds
*/
public void setNumSubsetSizeCVFolds(int f) {
m_numFolds = f;
}
/**
* Get the number of cross validation folds for subset size determination
* (default = 5).
*
* @return number of folds
*/
public int getNumSubsetSizeCVFolds() {
return m_numFolds;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String seedTipText() {
return "Seed for cross validation subset size determination. (default = 1)";
}
/**
* Seed for cross validation subset size determination. (default = 1)
*
* @param s
* seed
*/
public void setSeed(int s) {
m_seed = s;
}
/**
* Seed for cross validation subset size determination. (default = 1)
*
* @return seed
*/
public int getSeed() {
return m_seed;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String verboseTipText() {
return "Turn on verbose output for monitoring the search's progress.";
}
/**
* Set whether verbose output should be generated.
*
* @param d
* true if output is to be verbose.
*/
public void setVerbose(boolean b) {
m_verbose = b;
}
/**
* Get whether output is to be verbose
*
* @return true if output will be verbose
*/
public boolean getVerbose() {
return m_verbose;
}
/**
* Gets the current settings of LinearForwardSelection.
*
* @return an array of strings suitable for passing to setOptions()
*/
public String[] getOptions() {
String[] evaluatorOptions = new String[0];
if ((m_setSizeEval != null) && (m_setSizeEval instanceof OptionHandler)) {
evaluatorOptions = ((OptionHandler) m_setSizeEval).getOptions();
}
String[] options = new String[15 + evaluatorOptions.length];
int current = 0;
if (m_performRanking) {
options[current++] = "-I";
}
options[current++] = "-K";
options[current++] = "" + m_numUsedAttributes;
options[current++] = "-T";
options[current++] = "" + m_linearSelectionType;
options[current++] = "-F";
options[current++] = "" + m_numFolds;
options[current++] = "-S";
options[current++] = "" + m_seed;
options[current++] = "-Z";
options[current++] = "" + m_verbose;
if (m_setSizeEval != null) {
options[current++] = "-E";
options[current++] = m_setSizeEval.getClass().getName();
}
options[current++] = "--";
System.arraycopy(evaluatorOptions, 0, options, current,
evaluatorOptions.length);
current += evaluatorOptions.length;
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* returns a description of the search as a String
*
* @return a description of the search
*/
public String toString() {
StringBuffer LFSString = new StringBuffer();
LFSString.append("\tSubset Size Forward Selection.\n");
LFSString.append("\tLinear Forward Selection Type: ");
if (m_linearSelectionType == TYPE_FIXED_SET) {
LFSString.append("fixed-set\n");
} else {
LFSString.append("fixed-width\n");
}
LFSString.append("\tNumber of top-ranked attributes that are used: " +
m_numUsedAttributes + "\n");
LFSString.append(
"\tNumber of cross validation folds for subset size determination: " +
m_numFolds + "\n");
LFSString.append("\tSeed for cross validation subset size determination: " +
m_seed + "\n");
LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals +
"\n");
LFSString.append("\tMerit of best subset found: " +
Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n");
return LFSString.toString();
}
/**
* Searches the attribute subset space by subset size forward selection
*
* @param ASEvaluator
* the attribute evaluator to guide the search
* @param data
* the training instances.
* @return an array (not necessarily ordered) of selected attribute indexes
* @exception Exception
* if the search can't be completed
*/
public int[] search(ASEvaluation ASEval, Instances data)
throws Exception {
m_totalEvals = 0;
if (!(ASEval instanceof SubsetEvaluator)) {
throw new Exception(ASEval.getClass().getName() + " is not a " +
"Subset evaluator!");
}
if (m_setSizeEval == null) {
m_setSizeEval = ASEval;
}
m_numAttribs = data.numAttributes();
if (m_numUsedAttributes > m_numAttribs) {
System.out.println(
"Decreasing number of top-ranked attributes to total number of attributes: " +
data.numAttributes());
m_numUsedAttributes = m_numAttribs;
}
Instances[] trainData = new Instances[m_numFolds];
Instances[] testData = new Instances[m_numFolds];
LFSMethods[] searchResults = new LFSMethods[m_numFolds];
Random random = new Random(m_seed);
Instances dataCopy = new Instances(data);
dataCopy.randomize(random);
if (dataCopy.classAttribute().isNominal()) {
dataCopy.stratify(m_numFolds);
}
for (int f = 0; f < m_numFolds; f++) {
trainData[f] = dataCopy.trainCV(m_numFolds, f, random);
testData[f] = dataCopy.testCV(m_numFolds, f);
}
LFSMethods LSF = new LFSMethods();
int[] ranking;
if (m_performRanking) {
ASEval.buildEvaluator(data);
ranking = LSF.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose);
} else {
ranking = new int[m_numAttribs];
for (int i = 0; i < ranking.length; i++) {
ranking[i] = i;
}
}
int maxSubsetSize = 0;
for (int f = 0; f < m_numFolds; f++) {
if (m_verbose) {
System.out.println("perform search on internal fold: " + (f + 1) + "/" +
m_numFolds);
}
m_setSizeEval.buildEvaluator(trainData[f]);
searchResults[f] = new LFSMethods();
searchResults[f].forwardSearch(m_cacheSize, new BitSet(m_numAttribs),
ranking, m_numUsedAttributes,
m_linearSelectionType == TYPE_FIXED_WIDTH, 1, -1, trainData[f],
(SubsetEvaluator)m_setSizeEval, m_verbose);
maxSubsetSize = Math.max(maxSubsetSize,
searchResults[f].getBestGroup().cardinality());
}
if (m_verbose) {
System.out.println(
"continue searches on internal folds to maxSubsetSize (" +
maxSubsetSize + ")");
}
for (int f = 0; f < m_numFolds; f++) {
if (m_verbose) {
System.out.print("perform search on internal fold: " + (f + 1) + "/" +
m_numFolds + " with starting set ");
LFSMethods.printGroup(searchResults[f].getBestGroup(),
trainData[f].numAttributes());
}
if (searchResults[f].getBestGroup().cardinality() < maxSubsetSize) {
m_setSizeEval.buildEvaluator(trainData[f]);
searchResults[f].forwardSearch(m_cacheSize,
searchResults[f].getBestGroup(), ranking, m_numUsedAttributes,
m_linearSelectionType == TYPE_FIXED_WIDTH, 1, maxSubsetSize,
trainData[f], (SubsetEvaluator)m_setSizeEval, m_verbose);
}
}
double[][] testMerit = new double[m_numFolds][maxSubsetSize + 1];
for (int f = 0; f < m_numFolds; f++) {
for (int s = 1; s <= maxSubsetSize; s++) {
if (HoldOutSubsetEvaluator.class.isInstance(m_setSizeEval)) {
m_setSizeEval.buildEvaluator(trainData[f]);
testMerit[f][s] = ((HoldOutSubsetEvaluator) m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize(
s), testData[f]);
} else {
m_setSizeEval.buildEvaluator(testData[f]);
testMerit[f][s] = ((SubsetEvaluator)m_setSizeEval).evaluateSubset(searchResults[f].getBestGroupOfSize(
s));
}
}
}
double[] avgTestMerit = new double[maxSubsetSize + 1];
int finalSubsetSize = -1;
for (int s = 1; s <= maxSubsetSize; s++) {
for (int f = 0; f < m_numFolds; f++) {
avgTestMerit[s] = ((avgTestMerit[s] * f) + testMerit[f][s]) / (double) (f +
1);
}
if ((finalSubsetSize == -1) ||
(avgTestMerit[s] > avgTestMerit[finalSubsetSize])) {
finalSubsetSize = s;
}
if (m_verbose) {
System.out.println("average merit for subset-size " + s + ": " +
avgTestMerit[s]);
}
}
if (m_verbose) {
System.out.println("performing final forward selection to subset-size: " +
finalSubsetSize);
}
ASEval.buildEvaluator(data);
LSF.forwardSearch(m_cacheSize, new BitSet(m_numAttribs), ranking,
m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, 1,
finalSubsetSize, data, (SubsetEvaluator) ASEval, m_verbose);
m_totalEvals = LSF.getNumEvalsTotal();
m_bestMerit = LSF.getBestMerit();
return attributeList(LSF.getBestGroup());
}
/**
* Reset options to default values
*/
protected void resetOptions() {
m_performRanking = true;
m_numUsedAttributes = 50;
m_linearSelectionType = TYPE_FIXED_SET;
m_setSizeEval = new ClassifierSubsetEval();
m_numFolds = 5;
m_seed = 1;
m_totalEvals = 0;
m_cacheSize = 1;
m_verbose = false;
}
/**
* converts a BitSet into a list of attribute indexes
*
* @param group
* the BitSet to convert
* @return an array of attribute indexes
*/
protected int[] attributeList(BitSet group) {
int count = 0;
// count how many were selected
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
count++;
}
}
int[] list = new int[count];
count = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
list[count++] = i;
}
}
return list;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 5605 $");
}
}
* Seed for cross validation subset size determination. (default = 1)
*
* verbose on/off.
*