weka.filters.unsupervised.instance.RemoveMisclassified Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* RemoveMisclassified.java
* Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.filters.unsupervised.instance;
import java.util.Enumeration;
import java.util.Vector;
import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.core.*;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
/**
* A filter that removes instances which are
* incorrectly classified. Useful for removing outliers.
*
*
*
* Valid options are:
*
*
*
* -W <classifier specification>
* Full class name of classifier to use, followed
* by scheme options. eg:
* "weka.classifiers.bayes.NaiveBayes -D"
* (default: weka.classifiers.rules.ZeroR)
*
*
*
* -C <class index>
* Attribute on which misclassifications are based.
* If < 0 will use any current set class or default to the last attribute.
*
*
*
* -F <number of folds>
* The number of folds to use for cross-validation cleansing.
* (<2 = no cross-validation - default).
*
*
*
* -T <threshold>
* Threshold for the max error when predicting numeric class.
* (Value should be >= 0, default = 0.1).
*
*
*
* -I
* The maximum number of cleansing iterations to perform.
* (<1 = until fully cleansed - default)
*
*
*
* -V
* Invert the match so that correctly classified instances are discarded.
*
*
*
*
* @author Richard Kirkby ([email protected])
* @author Malcolm Ware ([email protected])
* @version $Revision: 14508 $
*/
public class RemoveMisclassified extends Filter implements UnsupervisedFilter,
OptionHandler, WeightedAttributesHandler, WeightedInstancesHandler {
/** Version identifier used for serialization. */
static final long serialVersionUID = 5469157004717663171L;
/**
 * The classifier that is trained and then used to decide which instances are
 * misclassified. Initialised to a ZeroR instance.
 */
protected Classifier m_cleansingClassifier = new weka.classifiers.rules.ZeroR();
/**
 * Index of the attribute to treat as the class while cleansing. When negative,
 * the cleansing methods fall back to the class index already set on the data
 * and, if that is also negative, to the last attribute.
 */
protected int m_classIndex = -1;
/**
 * The number of cross-validation folds to perform. Values below 2 mean no
 * cross-validation: the classifier is trained and tested on the same data.
 */
protected int m_numOfCrossValidationFolds = 0;
/**
 * The maximum number of cleansing iterations to perform. Values below 1 mean
 * iterate until a pass removes no further instances.
 */
protected int m_numOfCleansingIterations = 0;
/**
 * Tolerance for a numeric class: a prediction counts as correct when it lies
 * within plus or minus this value of the actual class value (bounds inclusive).
 */
protected double m_numericClassifyThreshold = 0.1;
/**
 * Whether to invert the match, so that the correctly classified instances are
 * discarded and the misclassified ones are returned instead.
 */
protected boolean m_invertMatching = false;
/**
 * Whether the first batch (i.e. the training data) has been processed. Reset
 * when a new input format is set; once true, incoming instances are passed
 * straight to the output queue instead of being buffered.
 */
protected boolean m_firstBatchFinished = false;
/**
 * Returns the Capabilities of this filter.
 * <p>
 * When {@code getClassifier()} yields a classifier, its capabilities are
 * used; otherwise the superclass capabilities are taken with everything
 * disabled. In both cases the minimum number of instances is set to 0.
 *
 * @return the capabilities of this object
 * @see Capabilities
 */
@Override
public Capabilities getCapabilities() {
  final Capabilities caps;
  if (getClassifier() != null) {
    // delegate to whatever the configured classifier can handle
    caps = getClassifier().getCapabilities();
  } else {
    // no classifier available: start from the filter defaults and switch all off
    caps = super.getCapabilities();
    caps.disableAll();
  }
  caps.setMinimumNumberInstances(0);
  return caps;
}
/**
 * Sets the format of the input instances. The output format is set to the
 * same structure, and the filter is put back into its "first batch not yet
 * processed" state.
 *
 * @param instanceInfo an Instances object containing the input instance
 *          structure (any instances contained in the object are ignored -
 *          only the structure is required).
 * @return always {@code true}: the output format may be collected immediately
 * @throws Exception if the inputFormat can't be set successfully
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  // Let the superclass register the structure first; if it throws, the
  // state below is left untouched.
  super.setInputFormat(instanceInfo);

  // Output structure equals input structure.
  setOutputFormat(instanceInfo);

  // The next batch received is treated as the first (training) batch again.
  m_firstBatchFinished = false;

  return true;
}
/**
 * Cleanses the data based on misclassifications when the classifier is
 * evaluated on the same data it was trained on.
 * <p>
 * Each pass builds the cleansing classifier on the current set and keeps only
 * the instances it predicts correctly: for a numeric class the prediction must
 * lie within {@code m_numericClassifyThreshold} of the class value (bounds
 * inclusive), for a nominal class it must equal the class value. Passes repeat
 * until a pass removes nothing or the configured maximum number of iterations
 * is exceeded. When matching is inverted, the instances rejected over all
 * passes are returned instead of the survivors.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data, with its class index set to that of {@code data}
 * @throws Exception if something goes wrong
 */
private Instances cleanseTrain(Instances data) throws Exception {
  // Class attribute: explicit setting, else the data's own, else the last one.
  int targetIndex = m_classIndex;
  if (targetIndex < 0) {
    targetIndex = data.classIndex();
  }
  if (targetIndex < 0) {
    targetIndex = data.numAttributes() - 1;
  }

  Instances remaining = new Instances(data);
  final Instances rejected = new Instances(data, data.numInstances());
  int previousSize = 0;
  int pass = 0;

  // keep going until a pass leaves the set unchanged
  while (previousSize != remaining.numInstances()) {
    pass++;
    if (m_numOfCleansingIterations > 0 && pass > m_numOfCleansingIterations) {
      break; // iteration limit reached
    }

    // train on everything that is still left
    previousSize = remaining.numInstances();
    remaining.setClassIndex(targetIndex);
    m_cleansingClassifier.buildClassifier(remaining);

    // evaluate on the very same instances
    final Instances kept = new Instances(remaining, remaining.numInstances());
    for (int i = 0; i < remaining.numInstances(); i++) {
      final Instance current = remaining.instance(i);
      final double predicted = m_cleansingClassifier.classifyInstance(current);

      final boolean correct;
      if (remaining.classAttribute().isNumeric()) {
        correct = predicted >= current.classValue() - m_numericClassifyThreshold
          && predicted <= current.classValue() + m_numericClassifyThreshold;
      } else { // class is nominal
        correct = predicted == current.classValue();
      }

      if (correct) {
        kept.add(current);
      } else if (m_invertMatching) {
        rejected.add(current);
      }
    }
    remaining = kept;
  }

  // hand back whichever set was asked for, with the caller's class index
  final Instances result = m_invertMatching ? rejected : remaining;
  result.setClassIndex(data.classIndex());
  return result;
}
/**
 * Cleanses the data based on misclassifications when performing
 * cross-validation.
 * <p>
 * Each pass splits the current set into {@code m_numOfCrossValidationFolds}
 * folds (stratified first when the class is nominal), builds the cleansing
 * classifier on each training split and keeps the test-split instances it
 * predicts correctly. Passes repeat until a pass removes nothing, fewer
 * instances than folds remain, or the configured maximum number of iterations
 * is exceeded. When matching is inverted, the instances rejected over all
 * passes are returned instead of the survivors.
 *
 * @param data the data to train with and cleanse
 * @return the cleansed data, with its class index set to that of {@code data}
 * @throws Exception if something goes wrong
 */
private Instances cleanseCross(Instances data) throws Exception {
  // Class attribute: explicit setting, else the data's own, else the last one.
  int targetIndex = m_classIndex;
  if (targetIndex < 0) {
    targetIndex = data.classIndex();
  }
  if (targetIndex < 0) {
    targetIndex = data.numAttributes() - 1;
  }

  Instances remaining = new Instances(data);
  final Instances rejected = new Instances(data, data.numInstances());
  int previousSize = 0;
  int pass = 0;

  // keep going until a pass leaves the set unchanged or it gets too small
  while (previousSize != remaining.numInstances()
    && remaining.numInstances() >= m_numOfCrossValidationFolds) {
    previousSize = remaining.numInstances();

    pass++;
    if (m_numOfCleansingIterations > 0 && pass > m_numOfCleansingIterations) {
      break; // iteration limit reached
    }

    remaining.setClassIndex(targetIndex);
    if (remaining.classAttribute().isNominal()) {
      remaining.stratify(m_numOfCrossValidationFolds);
    }

    // run every fold, collecting the correctly predicted test instances
    final Instances kept = new Instances(remaining, remaining.numInstances());
    for (int fold = 0; fold < m_numOfCrossValidationFolds; fold++) {
      final Instances train = remaining.trainCV(m_numOfCrossValidationFolds, fold);
      m_cleansingClassifier.buildClassifier(train);
      final Instances test = remaining.testCV(m_numOfCrossValidationFolds, fold);

      for (int i = 0; i < test.numInstances(); i++) {
        final Instance current = test.instance(i);
        final double predicted = m_cleansingClassifier.classifyInstance(current);

        final boolean correct;
        if (remaining.classAttribute().isNumeric()) {
          correct = predicted >= current.classValue() - m_numericClassifyThreshold
            && predicted <= current.classValue() + m_numericClassifyThreshold;
        } else { // class is nominal
          correct = predicted == current.classValue();
        }

        if (correct) {
          kept.add(current);
        } else if (m_invertMatching) {
          rejected.add(current);
        }
      }
    }
    remaining = kept;
  }

  // hand back whichever set was asked for, with the caller's class index
  final Instances result = m_invertMatching ? rejected : remaining;
  result.setClassIndex(data.classIndex());
  return result;
}
/**
 * Input an instance for filtering. Until the first batch has been finished,
 * instances are only buffered; afterwards every instance is pushed straight
 * to the output queue.
 *
 * @param instance the input instance
 * @return true if the filtered instance may now be collected with output().
 * @throws NullPointerException if the input format has not been defined.
 * @throws Exception if the input instance was not of the correct format or if
 *           there was a problem with the filtering.
 */
@Override
public boolean input(Instance instance) throws Exception {
  if (inputFormatPeek() == null) {
    throw new NullPointerException("No input instance format defined");
  }

  // first instance of a fresh batch: clear the output queue
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  // still collecting the first batch: hold the instance back for cleansing
  if (!m_firstBatchFinished) {
    bufferInput(instance);
    return false;
  }

  // later batches go straight through
  push(instance);
  return true;
}
/**
 * Signify that this batch of input to the filter is finished. On the first
 * batch the buffered instances are cleansed (with cross-validation when at
 * least 2 folds are configured, otherwise by training and testing on the same
 * data) and the result is pushed to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 * @throws Exception if the cleansing fails
 */
@Override
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  if (!m_firstBatchFinished) {
    final Instances cleansed = (m_numOfCrossValidationFolds >= 2)
      ? cleanseCross(getInputFormat())
      : cleanseTrain(getInputFormat());

    for (int idx = 0; idx < cleansed.numInstances(); idx++) {
      push(cleansed.instance(idx), false); // No need to copy
    }

    m_firstBatchFinished = true;
    flushInput();
  }

  m_NewBatch = true;
  return numPendingOutput() != 0;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration