weka.attributeSelection.LinearForwardSelection Maven / Gradle / Ivy
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* LinearForwardSelection.java
* Copyright (C) 2007 Martin Guetlein
*
*/
package weka.attributeSelection;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
/**
* LinearForwardSelection:
*
* Extension of BestFirst. Takes a restricted number of k attributes into account. Fixed-set selects a fixed number k of attributes, whereas k is increased in each step when fixed-width is selected. The search uses either the initial ordering to select the top k attributes, or performs a ranking (with the same evalutator the search uses later on). The search direction can be forward, or floating forward selection (with opitional backward search steps).
*
* For more information see:
*
* Martin Guetlein (2006). Large Scale Attribute Selection Using Wrappers. Freiburg, Germany.
*
*
* Valid options are:
*
* -P <start set>
* Specify a starting set of attributes.
* Eg. 1,3,5-7.
*
* -D <0 = forward selection | 1 = floating forward selection>
* Forward selection method. (default = 0).
*
* -N <num>
* Number of non-improving nodes to
* consider before terminating search.
*
* -I
* Perform initial ranking to select the
* top-ranked attributes.
*
* -K <num>
* Number of top-ranked attributes that are
* taken into account by the search.
*
* -T <0 = fixed-set | 1 = fixed-width>
* Type of Linear Forward Selection (default = 0).
*
* -S <num>
* Size of lookup cache for evaluated subsets.
* Expressed as a multiple of the number of
* attributes in the data set. (default = 1)
*
* -Z
* verbose on/off
*
*
* @author Martin Guetlein ([email protected])
* @version $Revision: 6161 $
*/
public class LinearForwardSelection
extends ASSearch
implements OptionHandler,
StartSetHandler,
TechnicalInformationHandler {
/** search directions */
protected static final int SEARCH_METHOD_FORWARD = 0;
protected static final int SEARCH_METHOD_FLOATING = 1;
public static final Tag[] TAGS_SEARCH_METHOD = {
new Tag(SEARCH_METHOD_FORWARD, "Forward selection"),
new Tag(SEARCH_METHOD_FLOATING, "Floating forward selection"),
};
/** search directions */
protected static final int TYPE_FIXED_SET = 0;
protected static final int TYPE_FIXED_WIDTH = 1;
public static final Tag[] TAGS_TYPE = {
new Tag(TYPE_FIXED_SET, "Fixed-set"),
new Tag(TYPE_FIXED_WIDTH, "Fixed-width"),
};
// member variables
/** maximum number of stale nodes before terminating search */
protected int m_maxStale;
/** 0 == forward selection, 1 == floating forward search */
protected int m_forwardSearchMethod;
/** perform initial ranking to select top-ranked attributes */
protected boolean m_performRanking;
/**
* number of top-ranked attributes that are taken into account for the
* search
*/
protected int m_numUsedAttributes;
/** 0 == fixed-set, 1 == fixed-width */
protected int m_linearSelectionType;
/** holds an array of starting attributes */
protected int[] m_starting;
/** holds the start set for the search as a Range */
protected Range m_startRange;
/** does the data have a class */
protected boolean m_hasClass;
/** holds the class index */
protected int m_classIndex;
/** number of attributes in the data */
protected int m_numAttribs;
/** total number of subsets evaluated during a search */
protected int m_totalEvals;
/** for debugging */
protected boolean m_verbose;
/** holds the merit of the best subset found */
protected double m_bestMerit;
/** holds the maximum size of the lookup cache for evaluated subsets */
protected int m_cacheSize;
/**
* Constructor
*/
public LinearForwardSelection() {
resetOptions();
}
/**
* Returns a string describing this search method
*
* @return a description of the search method suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "LinearForwardSelection:\n\n" +
"Extension of BestFirst. Takes a restricted number of k attributes " +
"into account. Fixed-set selects a fixed number k of attributes, " +
"whereas k is increased in each step when fixed-width is selected. " +
"The search uses either the initial ordering to select the " +
"top k attributes, or performs a ranking (with the same evalutator the " +
"search uses later on). The search direction can be forward, " +
"or floating forward selection (with opitional backward search steps).\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing
* detailed information about the technical background of this class,
* e.g., paper reference or book this class is based on.
*
* @return the technical information about this class
*/
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR, "Martin Guetlein and Eibe Frank and Mark Hall and Andreas Karwath");
result.setValue(Field.YEAR, "2009");
result.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
result.setValue(Field.BOOKTITLE, "Proc IEEE Symposium on Computational Intelligence and Data Mining");
result.setValue(Field.PAGES, "332-339");
result.setValue(Field.PUBLISHER, "IEEE");
additional = result.add(Type.MASTERSTHESIS);
additional.setValue(Field.AUTHOR, "Martin Guetlein");
additional.setValue(Field.YEAR, "2006");
additional.setValue(Field.TITLE, "Large Scale Attribute Selection Using Wrappers");
additional.setValue(Field.SCHOOL, "Albert-Ludwigs-Universitaet");
additional.setValue(Field.ADDRESS, "Freiburg, Germany");
return result;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*
*/
public Enumeration listOptions() {
Vector newVector = new Vector(8);
newVector.addElement(new Option("\tSpecify a starting set of attributes." +
"\n\tEg. 1,3,5-7.", "P", 1, "-P "));
newVector.addElement(new Option(
"\tForward selection method. (default = 0).", "D", 1,
"-D <0 = forward selection | 1 = floating forward selection>"));
newVector.addElement(new Option("\tNumber of non-improving nodes to" +
"\n\tconsider before terminating search.", "N", 1, "-N "));
newVector.addElement(new Option("\tPerform initial ranking to select the" +
"\n\ttop-ranked attributes.", "I", 0, "-I"));
newVector.addElement(new Option(
"\tNumber of top-ranked attributes that are " +
"\n\ttaken into account by the search.", "K", 1, "-K "));
newVector.addElement(new Option(
"\tType of Linear Forward Selection (default = 0).", "T", 1,
"-T <0 = fixed-set | 1 = fixed-width>"));
newVector.addElement(new Option(
"\tSize of lookup cache for evaluated subsets." +
"\n\tExpressed as a multiple of the number of" +
"\n\tattributes in the data set. (default = 1)", "S", 1, "-S "));
newVector.addElement(new Option("\tverbose on/off", "Z", 0, "-Z"));
return newVector.elements();
}
/**
* Parses a given list of options.
*
* Valid options are:
*
*
* -P
* Specify a starting set of attributes. Eg 1,4,7-9.
*
*
* -D <0 = forward selection | 1 = floating forward selection>
* Forward selection method of the search. (default = 0).
*
*
* -N
* Number of non improving nodes to consider before terminating search.
* (default = 5).
*
*
* -I
* Perform initial ranking to select top-ranked attributes.
*
*
* -K
* Number of top-ranked attributes that are taken into account.
*
*
* -T <0 = fixed-set | 1 = fixed-width>
* Typ of Linear Forward Selection (default = 0).
*
*
* -S
* Size of lookup cache for evaluated subsets. Expressed as a multiple of
* the number of attributes in the data set. (default = 1).
*
*
* -Z
* verbose on/off.
*
*
* @param options
* the list of options as an array of strings
* @exception Exception
* if an option is not supported
*
*/
public void setOptions(String[] options) throws Exception {
String optionString;
resetOptions();
optionString = Utils.getOption('P', options);
if (optionString.length() != 0) {
setStartSet(optionString);
}
optionString = Utils.getOption('D', options);
if (optionString.length() != 0) {
setForwardSelectionMethod(new SelectedTag(Integer.parseInt(optionString),
TAGS_SEARCH_METHOD));
} else {
setForwardSelectionMethod(new SelectedTag(SEARCH_METHOD_FORWARD,
TAGS_SEARCH_METHOD));
}
optionString = Utils.getOption('N', options);
if (optionString.length() != 0) {
setSearchTermination(Integer.parseInt(optionString));
}
setPerformRanking(Utils.getFlag('I', options));
optionString = Utils.getOption('K', options);
if (optionString.length() != 0) {
setNumUsedAttributes(Integer.parseInt(optionString));
}
optionString = Utils.getOption('T', options);
if (optionString.length() != 0) {
setType(new SelectedTag(Integer.parseInt(optionString), TAGS_TYPE));
} else {
setType(new SelectedTag(TYPE_FIXED_SET, TAGS_TYPE));
}
optionString = Utils.getOption('S', options);
if (optionString.length() != 0) {
setLookupCacheSize(Integer.parseInt(optionString));
}
m_verbose = Utils.getFlag('Z', options);
}
/**
* Set the maximum size of the evaluated subset cache (hashtable). This is
* expressed as a multiplier for the number of attributes in the data set.
* (default = 1).
*
* @param size
* the maximum size of the hashtable
*/
public void setLookupCacheSize(int size) {
if (size >= 0) {
m_cacheSize = size;
}
}
/**
* Return the maximum size of the evaluated subset cache (expressed as a
* multiplier for the number of attributes in a data set.
*
* @return the maximum size of the hashtable.
*/
public int getLookupCacheSize() {
return m_cacheSize;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String lookupCacheSizeTipText() {
return "Set the maximum size of the lookup cache of evaluated subsets. This is " +
"expressed as a multiplier of the number of attributes in the data set. " +
"(default = 1).";
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String startSetTipText() {
return "Set the start point for the search. This is specified as a comma " +
"seperated list off attribute indexes starting at 1. It can include " +
"ranges. Eg. 1,2,5-9,17.";
}
/**
* Sets a starting set of attributes for the search. It is the search
* method's responsibility to report this start set (if any) in its
* toString() method.
*
* @param startSet
* a string containing a list of attributes (and or ranges), eg.
* 1,2,6,10-15.
* @exception Exception
* if start set can't be set.
*/
public void setStartSet(String startSet) throws Exception {
m_startRange.setRanges(startSet);
}
/**
* Returns a list of attributes (and or attribute ranges) as a String
*
* @return a list of attributes (and or attribute ranges)
*/
public String getStartSet() {
return m_startRange.getRanges();
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String searchTerminationTipText() {
return "Set the amount of backtracking. Specify the number of ";
}
/**
* Set the numnber of non-improving nodes to consider before terminating
* search.
*
* @param t
* the number of non-improving nodes
* @exception Exception
* if t is less than 1
*/
public void setSearchTermination(int t) throws Exception {
if (t < 1) {
throw new Exception("Value of -N must be > 0.");
}
m_maxStale = t;
}
/**
* Get the termination criterion (number of non-improving nodes).
*
* @return the number of non-improving nodes
*/
public int getSearchTermination() {
return m_maxStale;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String performRankingTipText() {
return "Perform initial ranking to select top-ranked attributes.";
}
/**
* Perform initial ranking to select top-ranked attributes.
*
* @param b
* true if initial ranking should be performed
*/
public void setPerformRanking(boolean b) {
m_performRanking = b;
}
/**
* Get boolean if initial ranking should be performed to select the
* top-ranked attributes
*
* @return true if initial ranking should be performed
*/
public boolean getPerformRanking() {
return m_performRanking;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String numUsedAttributesTipText() {
return "Set the amount of top-ranked attributes that are taken into account by the search process.";
}
/**
* Set the number of top-ranked attributes that taken into account by the
* search process.
*
* @param k
* the number of attributes
* @exception Exception
* if k is less than 2
*/
public void setNumUsedAttributes(int k) throws Exception {
if (k < 2) {
throw new Exception("Value of -K must be >= 2.");
}
m_numUsedAttributes = k;
}
/**
* Get the number of top-ranked attributes that taken into account by the
* search process.
*
* @return the number of top-ranked attributes that taken into account
*/
public int getNumUsedAttributes() {
return m_numUsedAttributes;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String forwardSelectionMethodTipText() {
return "Set the direction of the search.";
}
/**
* Set the search direction
*
* @param d
* the direction of the search
*/
public void setForwardSelectionMethod(SelectedTag d) {
if (d.getTags() == TAGS_SEARCH_METHOD) {
m_forwardSearchMethod = d.getSelectedTag().getID();
}
}
/**
* Get the search direction
*
* @return the direction of the search
*/
public SelectedTag getForwardSelectionMethod() {
return new SelectedTag(m_forwardSearchMethod, TAGS_SEARCH_METHOD);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String typeTipText() {
return "Set the type of the search.";
}
/**
* Set the type
*
* @param t
* the Linear Forward Selection type
*/
public void setType(SelectedTag t) {
if (t.getTags() == TAGS_TYPE) {
m_linearSelectionType = t.getSelectedTag().getID();
}
}
/**
* Get the type
*
* @return the Linear Forward Selection type
*/
public SelectedTag getType() {
return new SelectedTag(m_linearSelectionType, TAGS_TYPE);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String verboseTipText() {
return "Turn on verbose output for monitoring the search's progress.";
}
/**
* Set whether verbose output should be generated.
*
* @param d
* true if output is to be verbose.
*/
public void setVerbose(boolean b) {
m_verbose = b;
}
/**
* Get whether output is to be verbose
*
* @return true if output will be verbose
*/
public boolean getVerbose() {
return m_verbose;
}
/**
* Gets the current settings of LinearForwardSelection.
*
* @return an array of strings suitable for passing to setOptions()
*/
public String[] getOptions() {
String[] options = new String[13];
int current = 0;
if (!(getStartSet().equals(""))) {
options[current++] = "-P";
options[current++] = "" + startSetToString();
}
options[current++] = "-D";
options[current++] = "" + m_forwardSearchMethod;
options[current++] = "-N";
options[current++] = "" + m_maxStale;
if (m_performRanking) {
options[current++] = "-I";
}
options[current++] = "-K";
options[current++] = "" + m_numUsedAttributes;
options[current++] = "-T";
options[current++] = "" + m_linearSelectionType;
if (m_verbose)
options[current++] = "-Z";
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* converts the array of starting attributes to a string. This is used by
* getOptions to return the actual attributes specified as the starting set.
* This is better than using m_startRanges.getRanges() as the same start set
* can be specified in different ways from the command line---eg 1,2,3 ==
* 1-3. This is to ensure that stuff that is stored in a database is
* comparable.
*
* @return a comma seperated list of individual attribute numbers as a
* String
*/
private String startSetToString() {
StringBuffer FString = new StringBuffer();
boolean didPrint;
if (m_starting == null) {
return getStartSet();
}
for (int i = 0; i < m_starting.length; i++) {
didPrint = false;
if ((m_hasClass == false) ||
((m_hasClass == true) && (i != m_classIndex))) {
FString.append((m_starting[i] + 1));
didPrint = true;
}
if (i == (m_starting.length - 1)) {
FString.append("");
} else {
if (didPrint) {
FString.append(",");
}
}
}
return FString.toString();
}
/**
* returns a description of the search as a String
*
* @return a description of the search
*/
public String toString() {
StringBuffer LFSString = new StringBuffer();
LFSString.append("\tLinear Forward Selection.\n\tStart set: ");
if (m_starting == null) {
LFSString.append("no attributes\n");
} else {
LFSString.append(startSetToString() + "\n");
}
LFSString.append("\tForward selection method: ");
if (m_forwardSearchMethod == SEARCH_METHOD_FORWARD) {
LFSString.append("forward selection\n");
} else {
LFSString.append("floating forward selection\n");
}
LFSString.append("\tStale search after " + m_maxStale +
" node expansions\n");
LFSString.append("\tLinear Forward Selection Type: ");
if (m_linearSelectionType == TYPE_FIXED_SET) {
LFSString.append("fixed-set\n");
} else {
LFSString.append("fixed-width\n");
}
LFSString.append("\tNumber of top-ranked attributes that are used: " +
m_numUsedAttributes + "\n");
LFSString.append("\tTotal number of subsets evaluated: " + m_totalEvals +
"\n");
LFSString.append("\tMerit of best subset found: " +
Utils.doubleToString(Math.abs(m_bestMerit), 8, 3) + "\n");
return LFSString.toString();
}
/**
* Searches the attribute subset space by linear forward selection
*
* @param ASEvaluator
* the attribute evaluator to guide the search
* @param data
* the training instances.
* @return an array (not necessarily ordered) of selected attribute indexes
* @exception Exception
* if the search can't be completed
*/
public int[] search(ASEvaluation ASEval, Instances data)
throws Exception {
m_totalEvals = 0;
if (!(ASEval instanceof SubsetEvaluator)) {
throw new Exception(ASEval.getClass().getName() + " is not a " +
"Subset evaluator!");
}
if (ASEval instanceof UnsupervisedSubsetEvaluator) {
m_hasClass = false;
} else {
m_hasClass = true;
m_classIndex = data.classIndex();
}
((ASEvaluation) ASEval).buildEvaluator(data);
m_numAttribs = data.numAttributes();
if (m_numUsedAttributes > m_numAttribs) {
System.out.println(
"Decreasing number of top-ranked attributes to total number of attributes: " +
data.numAttributes());
m_numUsedAttributes = m_numAttribs;
}
BitSet start_group = new BitSet(m_numAttribs);
m_startRange.setUpper(m_numAttribs - 1);
if (!(getStartSet().equals(""))) {
m_starting = m_startRange.getSelection();
}
// If a starting subset has been supplied, then initialise the bitset
if (m_starting != null) {
for (int i = 0; i < m_starting.length; i++) {
if ((m_starting[i]) != m_classIndex) {
start_group.set(m_starting[i]);
}
}
}
LFSMethods LFS = new LFSMethods();
int[] ranking;
if (m_performRanking) {
ranking = LFS.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose);
} else {
ranking = new int[m_numAttribs];
for (int i = 0; i < ranking.length; i++) {
ranking[i] = i;
}
}
if (m_forwardSearchMethod == SEARCH_METHOD_FORWARD) {
LFS.forwardSearch(m_cacheSize, start_group, ranking, m_numUsedAttributes,
m_linearSelectionType == TYPE_FIXED_WIDTH, m_maxStale, -1, data,
(SubsetEvaluator) ASEval, m_verbose);
} else if (m_forwardSearchMethod == SEARCH_METHOD_FLOATING) {
LFS.floatingForwardSearch(m_cacheSize, start_group, ranking,
m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH,
m_maxStale, data, (SubsetEvaluator) ASEval, m_verbose);
}
m_totalEvals = LFS.getNumEvalsTotal();
m_bestMerit = LFS.getBestMerit();
return attributeList(LFS.getBestGroup());
}
/**
* Reset options to default values
*/
protected void resetOptions() {
m_maxStale = 5;
m_forwardSearchMethod = SEARCH_METHOD_FORWARD;
m_performRanking = true;
m_numUsedAttributes = 50;
m_linearSelectionType = TYPE_FIXED_SET;
m_starting = null;
m_startRange = new Range();
m_classIndex = -1;
m_totalEvals = 0;
m_cacheSize = 1;
m_verbose = false;
}
/**
* converts a BitSet into a list of attribute indexes
*
* @param group
* the BitSet to convert
* @return an array of attribute indexes
*/
protected int[] attributeList(BitSet group) {
int count = 0;
// count how many were selected
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
count++;
}
}
int[] list = new int[count];
count = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (group.get(i)) {
list[count++] = i;
}
}
return list;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 6161 $");
}
}