Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
weka.attributeSelection.CfsSubsetEval Maven / Gradle / Ivy
Go to download
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CfsSubsetEval.java
* Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.ThreadSafe;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
/**
* CfsSubsetEval :
*
* Evaluates the worth of a subset of attributes by considering the individual
* predictive ability of each feature along with the degree of redundancy
* between them.
*
* Subsets of features that are highly correlated with the class while having
* low intercorrelation are preferred.
*
* For more information see:
*
* M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Hamilton, New Zealand.
*
*
*
* BibTeX:
*
*
* @phdthesis{Hall1998,
* address = {Hamilton, New Zealand},
* author = {M. A. Hall},
* school = {University of Waikato},
* title = {Correlation-based Feature Subset Selection for Machine Learning},
* year = {1998}
* }
*
*
*
*
* Valid options are:
*
*
*
* -M
* Treat missing values as a separate value.
*
*
*
* -L
* Don't include locally predictive attributes.
*
*
*
* -Z
* Precompute the full correlation matrix at the outset, rather than compute correlations lazily (as needed) during the search. Use this in conjunction with parallel processing in order to speed up a backward search.
*
*
*
* -P <int>
* The size of the thread pool, for example, the number of cores in the CPU. (default 1)
*
*
*
* -E <int>
* The number of threads to use, which should be >= size of thread pool. (default 1)
*
*
*
* -D
* Output debugging info.
*
*
*
*
* @author Mark Hall ([email protected] )
* @version $Revision: 11215 $
* @see Discretize
*/
public class CfsSubsetEval extends ASEvaluation implements SubsetEvaluator,
ThreadSafe, OptionHandler, TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = 747878400813276317L;
/** The training instances (class-missing rows removed; discretized when the class is nominal) */
private Instances m_trainInstances;
/** Discretize filter applied to the data when the class is nominal */
private Discretize m_disTransform;
/** The class index of the training data */
private int m_classIndex;
/** True if the class attribute is numeric */
private boolean m_isNumeric;
/** Number of attributes in the training data */
private int m_numAttribs;
/** Number of instances in the training data */
private int m_numInstances;
/** Treat missing values as separate values (instead of distributing their counts) */
private boolean m_missingSeparate;
/** Include locally predictive attributes during post-processing */
private boolean m_locallyPredictive;
/**
 * Lower-triangular matrix of cached attribute correlations; row i has i+1
 * entries. The sentinel -999 marks an entry not yet computed.
 */
private float[][] m_corr_matrix;
/** Standard deviations of attributes (used as weights when the class is numeric) */
private double[] m_std_devs;
/** Threshold for admitting locally predictive features */
private double m_c_Threshold;
/** Output debugging info */
protected boolean m_debug;
/** Number of off-diagonal entries in the correlation matrix */
protected int m_numEntries;
/** Number of correlations actually computed so far (thread-safe counter) */
protected AtomicInteger m_numFilled;
/** Whether to precompute the full correlation matrix up front */
protected boolean m_preComputeCorrelationMatrix;
/**
 * The number of threads used to compute the correlation matrix. Used when
 * correlation matrix is precomputed
 */
protected int m_numThreads = 1;
/**
 * The size of the thread pool. Usually set equal to the number of CPUs or CPU
 * cores available
 */
protected int m_poolSize = 1;
/** Thread pool (transient: not serialized; recreated in buildEvaluator when needed) */
protected transient ExecutorService m_pool = null;
/**
 * Returns a string describing this attribute evaluator.
 *
 * @return a description of the evaluator suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  String description =
    "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\n"
      + "Subsets of features that are highly correlated with the class "
      + "while having low intercorrelation are preferred.\n\n"
      + "For more information see:\n\n";
  return description + getTechnicalInformation().toString();
}
/**
 * Returns an instance of a TechnicalInformation object, containing detailed
 * information about the technical background of this class, e.g., paper
 * reference or book this class is based on.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  // Hall's PhD thesis is the canonical reference for CFS
  TechnicalInformation info = new TechnicalInformation(Type.PHDTHESIS);
  info.setValue(Field.TITLE,
    "Correlation-based Feature Subset Selection for Machine Learning");
  info.setValue(Field.AUTHOR, "M. A. Hall");
  info.setValue(Field.SCHOOL, "University of Waikato");
  info.setValue(Field.ADDRESS, "Hamilton, New Zealand");
  info.setValue(Field.YEAR, "1998");
  return info;
}
/**
 * Constructor. Initializes the evaluator to its default option settings.
 */
public CfsSubsetEval() {
resetOptions();
}
/**
 * Returns an enumeration describing the available options.
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration<Option> listOptions() {
  // Generic type parameters restored: the scraped source had lost them
  // (raw Vector/Enumeration), which contradicts the generics used elsewhere.
  Vector<Option> newVector = new Vector<Option>(6);

  newVector.addElement(new Option("\tTreat missing values as a separate "
    + "value.", "M", 0, "-M"));
  newVector.addElement(new Option(
    "\tDon't include locally predictive attributes" + ".", "L", 0, "-L"));
  newVector.addElement(new Option(
    "\t" + preComputeCorrelationMatrixTipText(), "Z", 0, "-Z"));
  newVector.addElement(new Option(
    "\t" + poolSizeTipText() + " (default 1)\n", "P", 1, "-P "));
  newVector.addElement(new Option("\t" + numThreadsTipText()
    + " (default 1)\n", "E", 1, "-E "));
  newVector.addElement(new Option("\tOutput debugging info" + ".", "D", 0,
    "-D"));

  return newVector.elements();
}
/**
 * Parses and sets a given list of options.
 * <p>
 * Valid options are:
 * <ul>
 * <li>-M : treat missing values as a separate value</li>
 * <li>-L : don't include locally predictive attributes</li>
 * <li>-Z : precompute the full correlation matrix at the outset</li>
 * <li>-P &lt;int&gt; : the size of the thread pool (default 1)</li>
 * <li>-E &lt;int&gt; : the number of threads to use (default 1)</li>
 * <li>-D : output debugging info</li>
 * </ul>
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  resetOptions();

  setMissingSeparate(Utils.getFlag('M', options));
  // note: -L *disables* the (default on) locally-predictive behaviour
  setLocallyPredictive(!Utils.getFlag('L', options));
  setPreComputeCorrelationMatrix(Utils.getFlag('Z', options));

  String poolSize = Utils.getOption('P', options);
  setPoolSize(poolSize.length() != 0 ? Integer.parseInt(poolSize) : 1);

  String numThreads = Utils.getOption('E', options);
  setNumThreads(numThreads.length() != 0 ? Integer.parseInt(numThreads) : 1);

  setDebug(Utils.getFlag('D', options));
}
/**
 * Returns the tip text for the pre-compute option.
 *
 * @return a string to describe the option
 */
public String preComputeCorrelationMatrixTipText() {
  // fixed typo in the user-visible text: "conjuction" -> "conjunction"
  return "Precompute the full correlation matrix at the outset, "
    + "rather than compute correlations lazily (as needed) "
    + "during the search. Use this in conjunction with "
    + "parallel processing in order to speed up a backward " + "search.";
}
/**
 * Set whether to pre-compute the full correlation matrix at the outset,
 * rather than computing individual correlations lazily (as needed) during the
 * search.
 *
 * @param p true if the correlation matrix is to be pre-computed at the outset
 */
public void setPreComputeCorrelationMatrix(boolean p) {
m_preComputeCorrelationMatrix = p;
}
/**
 * Get whether to pre-compute the full correlation matrix at the outset,
 * rather than computing individual correlations lazily (as needed) during the
 * search.
 *
 * @return true if the correlation matrix is to be pre-computed at the outset
 */
public boolean getPreComputeCorrelationMatrix() {
return m_preComputeCorrelationMatrix;
}
/**
 * Returns the tip text for the number-of-threads option.
 *
 * @return a string to describe the option
 */
public String numThreadsTipText() {
return "The number of threads to use, which should be >= size of thread pool.";
}
/**
 * Gets the number of threads used when pre-computing the correlation matrix.
 *
 * @return the number of threads
 */
public int getNumThreads() {
return m_numThreads;
}
/**
 * Sets the number of threads used when pre-computing the correlation matrix.
 *
 * @param nT the number of threads to use
 */
public void setNumThreads(int nT) {
m_numThreads = nT;
}
/**
 * Returns the tip text for the pool-size option.
 *
 * @return a string to describe the option
 */
public String poolSizeTipText() {
return "The size of the thread pool, for example, the number of cores in the CPU.";
}
/**
 * Gets the size of the thread pool.
 *
 * @return the pool size
 */
public int getPoolSize() {
return m_poolSize;
}
/**
 * Sets the size of the thread pool.
 *
 * @param nT the pool size
 */
public void setPoolSize(int nT) {
m_poolSize = nT;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String locallyPredictiveTipText() {
return "Identify locally predictive attributes. Iteratively adds "
+ "attributes with the highest correlation with the class as long "
+ "as there is not already an attribute in the subset that has a "
+ "higher correlation with the attribute in question";
}
/**
 * Include locally predictive attributes
 *
 * @param b true or false
 */
public void setLocallyPredictive(boolean b) {
m_locallyPredictive = b;
}
/**
 * Return true if including locally predictive attributes
 *
 * @return true if locally predictive attributes are to be used
 */
public boolean getLocallyPredictive() {
return m_locallyPredictive;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String missingSeparateTipText() {
return "Treat missing as a separate value. Otherwise, counts for missing "
+ "values are distributed across other values in proportion to their "
+ "frequency.";
}
/**
 * Treat missing as a separate value
 *
 * @param b true or false
 */
public void setMissingSeparate(boolean b) {
m_missingSeparate = b;
}
/**
 * Return true if missing is treated as a separate value
 *
 * @return true if missing is to be treated as a separate value
 */
public boolean getMissingSeparate() {
return m_missingSeparate;
}
/**
 * Set whether to output debugging info
 *
 * @param d true if debugging info is to be output
 */
public void setDebug(boolean d) {
m_debug = d;
}
/**
 * Get whether to output debugging info
 *
 * @return true if debugging info is to be output
 */
public boolean getDebug() {
return m_debug;
}
/**
 * Returns the tip text for this property
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String debugTipText() {
return "Output debugging info";
}
/**
 * Gets the current settings of CfsSubsetEval.
 *
 * @return an array of strings suitable for passing to setOptions()
 */
@Override
public String[] getOptions() {
  // Generic type parameter restored (the scraped source had a raw Vector).
  Vector<String> options = new Vector<String>();

  if (getMissingSeparate()) {
    options.add("-M");
  }
  if (!getLocallyPredictive()) {
    options.add("-L");
  }
  if (getPreComputeCorrelationMatrix()) {
    options.add("-Z");
  }
  options.add("-P");
  options.add("" + getPoolSize());
  options.add("-E");
  options.add("" + getNumThreads());
  if (getDebug()) {
    options.add("-D");
  }

  return options.toArray(new String[0]);
}
/**
 * Returns the capabilities of this evaluator.
 *
 * @return the capabilities of this evaluator
 * @see Capabilities
 */
@Override
public Capabilities getCapabilities() {
  Capabilities result = super.getCapabilities();
  result.disableAll();

  // attribute capabilities
  result.enable(Capability.NOMINAL_ATTRIBUTES);
  result.enable(Capability.NUMERIC_ATTRIBUTES);
  result.enable(Capability.DATE_ATTRIBUTES);
  result.enable(Capability.MISSING_VALUES);

  // class capabilities
  result.enable(Capability.NOMINAL_CLASS);
  result.enable(Capability.NUMERIC_CLASS);
  result.enable(Capability.DATE_CLASS);
  result.enable(Capability.MISSING_CLASS_VALUES);

  return result;
}
/**
* Generates a attribute evaluator. Has to initialize all fields of the
* evaluator that are not being set via options.
*
* CFS also discretises attributes (if necessary) and initializes the
* correlation matrix.
*
* @param data set of instances serving as training data
* @throws Exception if the evaluator has not been generated successfully
*/
@Override
public void buildEvaluator(Instances data) throws Exception {
// can evaluator handle data?
getCapabilities().testWithFail(data);
m_numEntries = 0;
m_numFilled = new AtomicInteger();
m_trainInstances = new Instances(data);
m_trainInstances.deleteWithMissingClass();
m_classIndex = m_trainInstances.classIndex();
m_numAttribs = m_trainInstances.numAttributes();
m_numInstances = m_trainInstances.numInstances();
m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();
if (!m_isNumeric) {
m_disTransform = new Discretize();
m_disTransform.setUseBetterEncoding(true);
m_disTransform.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
if (m_debug) {
System.err.println("Finished discretizing input data");
}
}
m_std_devs = new double[m_numAttribs];
m_corr_matrix = new float[m_numAttribs][];
for (int i = 0; i < m_numAttribs; i++) {
m_corr_matrix[i] = new float[i + 1];
m_numEntries += (i + 1);
}
m_numEntries -= m_numAttribs;
for (int i = 0; i < m_corr_matrix.length; i++) {
m_corr_matrix[i][i] = 1.0f;
m_std_devs[i] = 1.0;
}
for (int i = 0; i < m_numAttribs; i++) {
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
m_corr_matrix[i][j] = -999;
}
}
if (m_preComputeCorrelationMatrix && m_poolSize > 1) {
m_pool = Executors.newFixedThreadPool(m_poolSize);
Set> results = new HashSet>();
int numEntriesPerThread = (m_numEntries + m_numAttribs) / m_numThreads;
numEntriesPerThread = numEntriesPerThread < 1 ? 1 : numEntriesPerThread;
int startRow = 0;
int startCol = 0;
int count = 0;
for (int i = 0; i < m_corr_matrix.length; i++) {
for (int j = 0; j < m_corr_matrix[i].length; j++) {
count++;
if (count == numEntriesPerThread
|| (i == m_corr_matrix.length - 1 && j == m_corr_matrix[i].length - 1)) {
final int sR = startRow;
final int sC = startCol;
final int eR = i;
final int eC = j;
startRow = i;
startCol = j;
count = 0;
Future future = m_pool.submit(new Callable() {
@Override
public Void call() throws Exception {
if (m_debug) {
System.err
.println("Starting correlation computation task...");
}
for (int i = sR; i <= eR; i++) {
for (int j = (i == sR ? sC : 0); j < (i == eR ? eC
: m_corr_matrix[i].length); j++) {
if (m_corr_matrix[i][j] == -999) {
float corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
}
}
}
if (m_debug) {
System.err
.println("Percentage of correlation matrix computed: "
+ Utils.doubleToString(((double) m_numFilled.get()
/ m_numEntries * 100.0), 2) + "%");
}
return null;
}
});
results.add(future);
}
}
}
for (Future f : results) {
f.get();
}
// shut down the thread pool
m_pool.shutdown();
}
}
/**
 * Evaluates a subset of attributes with the CFS merit:
 * the sum of (weighted) attribute-class correlations divided by the square
 * root of the sum of (weighted) pairwise inter-attribute correlations.
 * Correlations are looked up in the lower-triangular cache m_corr_matrix
 * (larger index = row, smaller index = column) and computed lazily when an
 * entry still holds the -999 sentinel. The m_std_devs weights are 1.0 unless
 * set by the numeric correlation routines.
 *
 * @param subset a bitset representing the attribute subset to be evaluated
 * @return the merit (absolute value; 0.0 when the denominator is zero)
 * @throws Exception if the subset could not be evaluated
 */
@Override
public double evaluateSubset(BitSet subset) throws Exception {
double num = 0.0;
double denom = 0.0;
float corr;
int larger, smaller;
// numerator: sum of correlations between each selected attribute and the class
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
// map (i, class) into the lower triangle: row = larger index
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
if (m_corr_matrix[larger][smaller] == -999) {
// not cached yet: compute and store
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
num += (m_std_devs[i] * corr);
} else {
num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
}
}
}
}
// denominator: self terms plus twice each pairwise inter-correlation
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
denom += (1.0 * m_std_devs[i] * m_std_devs[i]);
// j runs over the strictly-lower part of row i (excludes the diagonal)
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
if (subset.get(j)) {
if (m_corr_matrix[i][j] == -999) {
corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
} else {
denom +=
(2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
}
}
}
}
}
}
// guard against a negative or zero denominator
if (denom < 0.0) {
denom *= -1.0;
}
if (denom == 0.0) {
return (0.0);
}
double merit = (num / Math.sqrt(denom));
// merit is reported as an absolute value
if (merit < 0.0) {
merit *= -1.0;
}
return merit;
}
/**
 * Computes the correlation between two attributes, dispatching on the class
 * type and the types of the two attributes. For a nominal class (data has
 * been discretized) symmetrical uncertainty is used; otherwise the
 * appropriate numeric/nominal correlation routine is selected. Also bumps
 * the computed-entries counter.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the correlation as a float
 */
private float correlate(int att1, int att2) {
  m_numFilled.addAndGet(1);

  // nominal class: everything has been discretized
  if (!m_isNumeric) {
    return (float) symmUncertCorr(att1, att2);
  }

  final boolean firstIsNumeric = m_trainInstances.attribute(att1).isNumeric();
  final boolean secondIsNumeric = m_trainInstances.attribute(att2).isNumeric();

  if (firstIsNumeric && secondIsNumeric) {
    return (float) num_num(att1, att2);
  }
  if (secondIsNumeric) {
    // att1 nominal, att2 numeric
    return (float) num_nom2(att1, att2);
  }
  if (firstIsNumeric) {
    // att2 nominal, att1 numeric: swap so the nominal comes first
    return (float) num_nom2(att2, att1);
  }
  // both nominal
  return (float) nom_nom(att1, att2);
}
/**
 * Computes the symmetrical uncertainty between two nominal (or discretized)
 * attributes from their contingency table. An extra row/column collects
 * missing values; unless missing is treated as a separate value, those
 * counts are redistributed over the known values in proportion to their
 * frequencies before the measure is computed.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the symmetrical uncertainty; when it is (numerically) zero the
 *         result is 0.0 if either attribute is the class, 1.0 otherwise
 */
private double symmUncertCorr(int att1, int att2) {
int i, j, ii, jj;
int ni, nj;
double sum = 0.0;
double sumi[], sumj[];
double counts[][];
Instance inst;
double corr_measure;
boolean flag = false;
double temp = 0.0;
// remember whether one of the pair is the class attribute
if (att1 == m_classIndex || att2 == m_classIndex) {
flag = true;
}
// one extra row/column for missing values
ni = m_trainInstances.attribute(att1).numValues() + 1;
nj = m_trainInstances.attribute(att2).numValues() + 1;
counts = new double[ni][nj];
sumi = new double[ni];
sumj = new double[nj];
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
counts[i][j] = 0.0;
}
}
// Fill the contingency table (missing maps to the last row/column)
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
ii = ni - 1;
} else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
jj = nj - 1;
} else {
jj = (int) inst.value(att2);
}
counts[ii][jj]++;
}
// get the row totals (and the overall total)
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumi[i] += counts[i][j];
sum += counts[i][j];
}
}
// get the column totals
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
for (i = 0; i < ni; i++) {
sumj[j] += counts[i][j];
}
}
// distribute missing counts proportionally (unless missing is a separate
// value, or every instance is missing on one of the attributes)
if (!m_missingSeparate && (sumi[ni - 1] < m_numInstances)
&& (sumj[nj - 1] < m_numInstances)) {
// snapshot the original totals/counts before redistributing
double[] i_copy = new double[sumi.length];
double[] j_copy = new double[sumj.length];
double[][] counts_copy = new double[sumi.length][sumj.length];
for (i = 0; i < ni; i++) {
System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
}
System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
double total_missing =
(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
// redistribute counts where only att1 is missing
if (sumi[ni - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
if (counts[ni - 1][j] > 0.0) {
for (i = 0; i < ni - 1; i++) {
temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
counts[i][j] += temp;
sumi[i] += temp;
}
counts[ni - 1][j] = 0.0;
}
}
}
sumi[ni - 1] = 0.0;
// redistribute counts where only att2 is missing
if (sumj[nj - 1] > 0.0) {
for (i = 0; i < ni - 1; i++) {
if (counts[i][nj - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
counts[i][j] += temp;
sumj[j] += temp;
}
counts[i][nj - 1] = 0.0;
}
}
}
sumj[nj - 1] = 0.0;
// redistribute counts where both attributes are missing
if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
for (i = 0; i < ni - 1; i++) {
for (j = 0; j < nj - 1; j++) {
temp = (counts_copy[i][j] / (sum - total_missing))
* counts_copy[ni - 1][nj - 1];
counts[i][j] += temp;
sumi[i] += temp;
sumj[j] += temp;
}
}
counts[ni - 1][nj - 1] = 0.0;
}
}
corr_measure = ContingencyTables.symmetricalUncertainty(counts);
// a zero measure is "maximally bad": 0.0 involving the class, else 1.0
if (Utils.eq(corr_measure, 0.0)) {
if (flag == true) {
return (0.0);
} else {
return (1.0);
}
} else {
return (corr_measure);
}
}
/**
 * Computes the (absolute) Pearson-style correlation between two numeric
 * attributes. Missing values contribute a zero deviation from the mean. As a
 * side effect, records the attributes' standard deviations in m_std_devs the
 * first time they are seen (entries still at the 1.0 default).
 *
 * @param att1 index of the first (numeric) attribute
 * @param att2 index of the second (numeric) attribute
 * @return |r|; if either attribute has zero variance, 1.0 when neither is
 *         the class and 0.0 otherwise
 */
private double num_num(int att1, int att2) {
int i;
Instance inst;
double r, diff1, diff2, num = 0.0, sx = 0.0, sy = 0.0;
double mx = m_trainInstances.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
// accumulate cross products and sums of squared deviations
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
diff1 = (inst.isMissing(att1)) ? 0.0 : (inst.value(att1) - mx);
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
num += (diff1 * diff2);
sx += (diff1 * diff1);
sy += (diff2 * diff2);
}
// record standard deviations on first computation (default entry is 1.0)
if (sx != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt((sx / m_numInstances));
}
}
if (sy != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((sy / m_numInstances));
}
}
if ((sx * sy) > 0.0) {
r = (num / (Math.sqrt(sx * sy)));
return ((r < 0.0) ? -r : r);
} else {
// zero variance: maximally redundant (1.0) unless the class is involved
if (att1 != m_classIndex && att2 != m_classIndex) {
return 1.0;
} else {
return 0.0;
}
}
}
/**
 * Computes a weighted (absolute) correlation between a nominal attribute and
 * a numeric attribute: the nominal attribute is expanded into one binary
 * indicator per value, each indicator's correlation with the numeric
 * attribute is computed, and the results are averaged weighted by the value
 * frequencies. Missing nominal values map to the mode (or to a separate
 * value when m_missingSeparate is set); missing numeric values contribute a
 * zero deviation. Also records the attributes' standard deviations in
 * m_std_devs the first time they are seen.
 *
 * @param att1 index of the nominal attribute
 * @param att2 index of the numeric attribute
 * @return the weighted absolute correlation; 1.0 is substituted when the
 *         result is zero and neither attribute is the class
 */
private double num_nom2(int att1, int att2) {
int i, ii, k;
double temp;
Instance inst;
int mx = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
double stdv_num = 0.0;
double diff1, diff2;
double r = 0.0, rr;
// number of indicator variables; one extra when missing is a separate value
int nx = (!m_missingSeparate) ? m_trainInstances.attribute(att1)
.numValues() : m_trainInstances.attribute(att1).numValues() + 1;
double[] prior_nom = new double[nx];
double[] stdvs_nom = new double[nx];
double[] covs = new double[nx];
for (i = 0; i < nx; i++) {
stdvs_nom[i] = covs[i] = prior_nom[i] = 0.0;
}
// calculate frequencies (and means) of the values of the nominal
// attribute
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
// missing maps to the mode
ii = mx;
} else {
// missing is its own (last) value
ii = nx - 1;
}
} else {
ii = (int) inst.value(att1);
}
// increment freq for nominal
prior_nom[ii]++;
}
// accumulate variances and covariances of each binary indicator vs. att2
for (k = 0; k < m_numInstances; k++) {
inst = m_trainInstances.instance(k);
// std dev of numeric attribute
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
stdv_num += (diff2 * diff2);
//
for (i = 0; i < nx; i++) {
// temp is the binary indicator for value i of the nominal attribute
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp = (i == mx) ? 1.0 : 0.0;
} else {
temp = (i == (nx - 1)) ? 1.0 : 0.0;
}
} else {
temp = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp - (prior_nom[i] / m_numInstances));
stdvs_nom[i] += (diff1 * diff1);
covs[i] += (diff1 * diff2);
}
}
// calculate weighted correlation
for (i = 0, temp = 0.0; i < nx; i++) {
// calculate the weighted variance of the nominal
temp +=
((prior_nom[i] / m_numInstances) * (stdvs_nom[i] / m_numInstances));
if ((stdvs_nom[i] * stdv_num) > 0.0) {
rr = (covs[i] / (Math.sqrt(stdvs_nom[i] * stdv_num)));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i] / m_numInstances) * rr);
}
/*
 * if there is zero variance for the numeric att at a specific level of
 * the catergorical att then if neither is the class then make this
 * correlation at this level maximally bad i.e. 1.0. If either is the
 * class then maximally bad correlation is 0.0
 */
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i] / m_numInstances) * 1.0);
}
}
}
// set the standard deviations for these attributes if necessary
if (temp != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp);
}
}
if (stdv_num != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((stdv_num / m_numInstances));
}
}
// zero overall correlation is maximally redundant unless the class is involved
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
/**
 * Computes a weighted (absolute) correlation between two nominal attributes:
 * both are expanded into binary indicators (one per value), the correlation
 * between every indicator pair is computed, and the results are averaged
 * weighted by the joint value frequencies. Missing values map to the mode of
 * the respective attribute (or to a separate value when m_missingSeparate is
 * set). Also records the attributes' weighted standard deviations in
 * m_std_devs the first time they are seen.
 *
 * @param att1 index of the first nominal attribute
 * @param att2 index of the second nominal attribute
 * @return the weighted absolute correlation; 1.0 is substituted when the
 *         result is zero and neither attribute is the class
 */
private double nom_nom(int att1, int att2) {
int i, j, ii, jj, z;
double temp1, temp2;
Instance inst;
int mx = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att1));
int my = (int) m_trainInstances
.meanOrMode(m_trainInstances.attribute(att2));
double diff1, diff2;
double r = 0.0, rr;
// number of indicator variables; one extra when missing is a separate value
int nx = (!m_missingSeparate) ? m_trainInstances.attribute(att1)
.numValues() : m_trainInstances.attribute(att1).numValues() + 1;
int ny = (!m_missingSeparate) ? m_trainInstances.attribute(att2)
.numValues() : m_trainInstances.attribute(att2).numValues() + 1;
double[][] prior_nom = new double[nx][ny];
double[] sumx = new double[nx];
double[] sumy = new double[ny];
double[] stdvsx = new double[nx];
double[] stdvsy = new double[ny];
double[][] covs = new double[nx][ny];
for (i = 0; i < nx; i++) {
sumx[i] = stdvsx[i] = 0.0;
}
for (j = 0; j < ny; j++) {
sumy[j] = stdvsy[j] = 0.0;
}
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
covs[i][j] = prior_nom[i][j] = 0.0;
}
}
// calculate frequencies (and means) of the values of the nominal
// attribute
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
// missing maps to the mode
ii = mx;
} else {
// missing is its own (last) value
ii = nx - 1;
}
} else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
jj = my;
} else {
jj = ny - 1;
}
} else {
jj = (int) inst.value(att2);
}
// increment freq for nominal
prior_nom[ii][jj]++;
sumx[ii]++;
sumy[jj]++;
}
// accumulate indicator variances and pairwise covariances
for (z = 0; z < m_numInstances; z++) {
inst = m_trainInstances.instance(z);
// variance contributions for att2's indicators
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
} else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
} else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
stdvsy[j] += (diff2 * diff2);
}
// variance contributions for att1's indicators and the covariances
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp1 = (i == mx) ? 1.0 : 0.0;
} else {
temp1 = (i == (nx - 1)) ? 1.0 : 0.0;
}
} else {
temp1 = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp1 - (sumx[i] / m_numInstances));
stdvsx[i] += (diff1 * diff1);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
} else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
} else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
covs[i][j] += (diff1 * diff2);
}
}
}
// calculate weighted correlation
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
if ((stdvsx[i] * stdvsy[j]) > 0.0) {
rr = (covs[i][j] / (Math.sqrt(stdvsx[i] * stdvsy[j])));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i][j] / m_numInstances) * rr);
}
// if there is zero variance for either of the categorical atts then if
// neither is the class then make this
// correlation at this level maximally bad i.e. 1.0. If either is
// the class then maximally bad correlation is 0.0
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i][j] / m_numInstances) * 1.0);
}
}
}
}
// calculate weighted standard deviations for these attributes
// (if necessary)
for (i = 0, temp1 = 0.0; i < nx; i++) {
temp1 += ((sumx[i] / m_numInstances) * (stdvsx[i] / m_numInstances));
}
if (temp1 != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp1);
}
}
for (j = 0, temp2 = 0.0; j < ny; j++) {
temp2 += ((sumy[j] / m_numInstances) * (stdvsy[j] / m_numInstances));
}
if (temp2 != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt(temp2);
}
}
// zero overall correlation is maximally redundant unless the class is involved
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
/**
 * Returns a string describing CFS.
 *
 * @return the description as a string
 */
@Override
public String toString() {
  if (m_trainInstances == null) {
    return "CFS subset evaluator has not been built yet\n";
  }

  StringBuilder text = new StringBuilder("\tCFS Subset Evaluator\n");
  if (m_missingSeparate) {
    text.append("\tTreating missing values as a separate value\n");
  }
  if (m_locallyPredictive) {
    text.append("\tIncluding locally predictive attributes\n");
  }
  return text.toString();
}
/**
 * Iteratively augments best_group with "locally predictive" attributes:
 * repeatedly takes the unselected attribute with the highest class
 * correlation and adds it, unless some attribute already in best_group
 * correlates with it more strongly than (its class correlation minus
 * m_c_Threshold). Correlations are read from / lazily written to
 * m_corr_matrix (-999 marks a not-yet-computed entry). Mutates best_group
 * in place; terminates when no candidate remains.
 *
 * @param best_group the attribute subset to augment (modified in place)
 */
private void addLocallyPredictive(BitSet best_group) {
int i, j;
boolean done = false;
boolean ok = true;
double temp_best = -1.0;
float corr;
j = 0;
// temp_group tracks every candidate ever considered, so each attribute
// is examined at most once even if it is rejected
BitSet temp_group = (BitSet) best_group.clone();
int larger, smaller;
while (!done) {
temp_best = -1.0;
// find the not-yet-considered attribute best correlated with the class
for (i = 0; i < m_numAttribs; i++) {
// map (i, class) into the lower triangle: row = larger index
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
if ((!temp_group.get(i)) && (i != m_classIndex)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best) {
temp_best = m_corr_matrix[larger][smaller];
j = i;
}
}
}
if (temp_best == -1.0) {
// no candidates left
done = true;
} else {
ok = true;
// mark j as considered whether or not it gets accepted
temp_group.set(j);
// check the best against correlations with others already
// in group
for (i = 0; i < m_numAttribs; i++) {
if (i > j) {
larger = i;
smaller = j;
} else {
larger = j;
smaller = i;
}
if (best_group.get(i)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, j);
m_corr_matrix[larger][smaller] = corr;
}
// reject j if an existing member "covers" it within the threshold
if (m_corr_matrix[larger][smaller] > temp_best - m_c_Threshold) {
ok = false;
break;
}
}
}
// if ok then add to best_group
if (ok) {
best_group.set(j);
}
}
}
}
/**
 * Calls addLocallyPredictive in order to include locally predictive
 * attributes (if requested), then releases the training data (keeping only
 * the header).
 *
 * @param attributeSet the set of attributes found by the search
 * @return a possibly augmented list of postprocessed attributes
 * @throws Exception if postprocessing fails for some reason
 */
@Override
public int[] postProcess(int[] attributeSet) throws Exception {
if (m_debug) {
System.err.println("Percentage of correlation matrix computed "
+ "over the search: "
+ Utils.doubleToString(
((double) m_numFilled.get() / m_numEntries * 100.0), 2) + "%");
}
int j = 0;
if (!m_locallyPredictive) {
// free the instance data; keep only the header
m_trainInstances = new Instances(m_trainInstances, 0);
return attributeSet;
}
BitSet bestGroup = new BitSet(m_numAttribs);
for (int element : attributeSet) {
bestGroup.set(element);
}
addLocallyPredictive(bestGroup);
// count how many are set
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
j++;
}
}
// convert the bitset back to an index array
int[] newSet = new int[j];
j = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
newSet[j++] = i;
}
}
// free the instance data; keep only the header
m_trainInstances = new Instances(m_trainInstances, 0);
return newSet;
}
/**
 * Resets the core options to their defaults. NOTE: only these four are
 * reset here; the thread-pool/debug/precompute settings are (re)assigned
 * explicitly by setOptions().
 */
protected void resetOptions() {
m_trainInstances = null;
m_missingSeparate = false;
m_locallyPredictive = true;
m_c_Threshold = 0.0;
}
/**
 * Returns the revision string.
 *
 * @return the revision
 */
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11215 $");
}
/**
 * Main method for testing this class from the command line.
 *
 * @param args the options
 */
public static void main(String[] args) {
runEvaluator(new CfsSubsetEval(), args);
}
}