All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.filters.unsupervised.instance.RemoveMisclassified Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    RemoveMisclassified.java
 *    Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.instance;

import java.util.Enumeration;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.core.*;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

/**
 *  A filter that removes instances which are
 * incorrectly classified. Useful for removing outliers.
 * 

* * * Valid options are: *

* *

 * -W <classifier specification>
 *  Full class name of classifier to use, followed
 *  by scheme options. eg:
 *   "weka.classifiers.bayes.NaiveBayes -D"
 *  (default: weka.classifiers.rules.ZeroR)
 * 
* *
 * -C <class index>
 *  Attribute on which misclassifications are based.
 *  If < 0 will use any current set class or default to the last attribute.
 * 
* *
 * -F <number of folds>
 *  The number of folds to use for cross-validation cleansing.
 *  (<2 = no cross-validation - default).
 * 
* *
 * -T <threshold>
 *  Threshold for the max error when predicting numeric class.
 *  (Value should be >= 0, default = 0.1).
 * 
* *
 * -I
 *  The maximum number of cleansing iterations to perform.
 *  (<1 = until fully cleansed - default)
 * 
* *
 * -V
 *  Invert the match so that correctly classified instances are discarded.
 * 
* * * * @author Richard Kirkby ([email protected]) * @author Malcolm Ware ([email protected]) * @version $Revision: 14508 $ */ public class RemoveMisclassified extends Filter implements UnsupervisedFilter, OptionHandler, WeightedAttributesHandler, WeightedInstancesHandler { /** for serialization */ static final long serialVersionUID = 5469157004717663171L; /** The classifier used to do the cleansing */ protected Classifier m_cleansingClassifier = new weka.classifiers.rules.ZeroR(); /** The attribute to treat as the class for purposes of cleansing. */ protected int m_classIndex = -1; /** * The number of cross validation folds to perform (<2 = no cross * validation) */ protected int m_numOfCrossValidationFolds = 0; /** * The maximum number of cleansing iterations to perform (<1 = until fully * cleansed) */ protected int m_numOfCleansingIterations = 0; /** The threshold for deciding when a numeric value is correctly classified */ protected double m_numericClassifyThreshold = 0.1; /** * Whether to invert the match so the correctly classified instances are * discarded */ protected boolean m_invertMatching = false; /** Have we processed the first batch (i.e. training data)? */ protected boolean m_firstBatchFinished = false; /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ @Override public Capabilities getCapabilities() { Capabilities result; if (getClassifier() == null) { result = super.getCapabilities(); result.disableAll(); } else { result = getClassifier().getCapabilities(); } result.setMinimumNumberInstances(0); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance * structure (any instances contained in the object are ignored - * only the structure is required). 
* @return true if the outputFormat may be collected immediately * @throws Exception if the inputFormat can't be set successfully */ @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); setOutputFormat(instanceInfo); m_firstBatchFinished = false; return true; } /** * Cleanses the data based on misclassifications when used training data. * * @param data the data to train with and cleanse * @return the cleansed data * @throws Exception if something goes wrong */ private Instances cleanseTrain(Instances data) throws Exception { Instance inst; Instances buildSet = new Instances(data); Instances temp; Instances inverseSet = new Instances(data, data.numInstances()); int count = 0; double ans; int iterations = 0; int classIndex = m_classIndex; if (classIndex < 0) { classIndex = data.classIndex(); } if (classIndex < 0) { classIndex = data.numAttributes() - 1; } // loop until perfect while (count != buildSet.numInstances()) { // check if hit maximum number of iterations iterations++; if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) { break; } // build classifier count = buildSet.numInstances(); buildSet.setClassIndex(classIndex); m_cleansingClassifier.buildClassifier(buildSet); temp = new Instances(buildSet, buildSet.numInstances()); // test on training data for (int i = 0; i < buildSet.numInstances(); i++) { inst = buildSet.instance(i); ans = m_cleansingClassifier.classifyInstance(inst); if (buildSet.classAttribute().isNumeric()) { if (ans >= inst.classValue() - m_numericClassifyThreshold && ans <= inst.classValue() + m_numericClassifyThreshold) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } else { // class is nominal if (ans == inst.classValue()) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } } buildSet = temp; } if (m_invertMatching) { inverseSet.setClassIndex(data.classIndex()); return inverseSet; } else { 
buildSet.setClassIndex(data.classIndex()); return buildSet; } } /** * Cleanses the data based on misclassifications when performing * cross-validation. * * @param data the data to train with and cleanse * @return the cleansed data * @throws Exception if something goes wrong */ private Instances cleanseCross(Instances data) throws Exception { Instance inst; Instances crossSet = new Instances(data); Instances temp = new Instances(data, data.numInstances()); Instances inverseSet = new Instances(data, data.numInstances()); int count = 0; double ans; int iterations = 0; int classIndex = m_classIndex; if (classIndex < 0) { classIndex = data.classIndex(); } if (classIndex < 0) { classIndex = data.numAttributes() - 1; } // loop until perfect while (count != crossSet.numInstances() && crossSet.numInstances() >= m_numOfCrossValidationFolds) { count = crossSet.numInstances(); // check if hit maximum number of iterations iterations++; if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) { break; } crossSet.setClassIndex(classIndex); if (crossSet.classAttribute().isNominal()) { crossSet.stratify(m_numOfCrossValidationFolds); } // do the folds temp = new Instances(crossSet, crossSet.numInstances()); for (int fold = 0; fold < m_numOfCrossValidationFolds; fold++) { Instances train = crossSet.trainCV(m_numOfCrossValidationFolds, fold); m_cleansingClassifier.buildClassifier(train); Instances test = crossSet.testCV(m_numOfCrossValidationFolds, fold); // now test for (int i = 0; i < test.numInstances(); i++) { inst = test.instance(i); ans = m_cleansingClassifier.classifyInstance(inst); if (crossSet.classAttribute().isNumeric()) { if (ans >= inst.classValue() - m_numericClassifyThreshold && ans <= inst.classValue() + m_numericClassifyThreshold) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } else { // class is nominal if (ans == inst.classValue()) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } } } crossSet 
= temp; } if (m_invertMatching) { inverseSet.setClassIndex(data.classIndex()); return inverseSet; } else { crossSet.setClassIndex(data.classIndex()); return crossSet; } } /** * Input an instance for filtering. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @throws NullPointerException if the input format has not been defined. * @throws Exception if the input instance was not of the correct format or if * there was a problem with the filtering. */ @Override public boolean input(Instance instance) throws Exception { if (inputFormatPeek() == null) { throw new NullPointerException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (m_firstBatchFinished) { push(instance); return true; } else { bufferInput(instance); return false; } } /** * Signify that this batch of input to the filter is finished. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ @Override public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (!m_firstBatchFinished) { Instances filtered; if (m_numOfCrossValidationFolds < 2) { filtered = cleanseTrain(getInputFormat()); } else { filtered = cleanseCross(getInputFormat()); } for (int i = 0; i < filtered.numInstances(); i++) { push(filtered.instance(i), false); // No need to copy } m_firstBatchFinished = true; flushInput(); } m_NewBatch = true; return (numPendingOutput() != 0); } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy