Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
weka.filters.unsupervised.attribute.AddNoise Maven / Gradle / Ivy
Go to download
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the stable version. Apart from bugfixes, this version
does not receive any other breaking updates.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
/*
* AddNoise.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*/
package weka.filters.unsupervised.attribute;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
import weka.gui.ProgrammaticProperty;
/**
* An instance filter that changes a percentage of a
* given attribute's values. The attribute must be nominal. Missing value can be
* treated as a distinct separate value.
*
*
*
* Valid options are:
*
*
*
* -C <col>
* Index of the attribute to be changed
* (default last attribute)
*
*
*
* -M
* Treat missing values as an extra value
*
*
*
* -P <num>
* Specify the percentage of values that are changed (default 10)
*
*
*
* -S <num>
* Specify the random number seed (default 1)
*
*
*
*
* @author Gabi Schmidberger ([email protected] )
* @version $Revision: 14534 $
*/
public class AddNoise extends Filter implements UnsupervisedFilter,
OptionHandler, Randomizable, WeightedAttributesHandler {
/** for serialization */
static final long serialVersionUID = -8499673222857299082L;
/** The attribute's index setting. */
private final SingleIndex m_AttIndex = new SingleIndex("last");
/** Flag if missing values are taken as value. */
private boolean m_UseMissing = false;
/** The subsample size, percent of original set, default 10% */
private int m_Percent = 10;
/** The random number generator seed */
private int m_RandomSeed = 1;
/**
* Returns a string describing this filter
*
* @return a description of the filter suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "An instance filter that changes a percentage of a given"
+ " attribute's values. The attribute must be nominal."
+ " Missing value can be treated as as a distinct separate value.";
}
/**
* Returns an enumeration describing the available options
*
* @return an enumeration of all the available options
*/
@Override
public Enumeration listOptions() {
Vector newVector = new Vector (4);
newVector.addElement(new Option("\tIndex of the attribute to be changed \n"
+ "\t(default last attribute)", "C", 1, "-C "));
newVector.addElement(new Option(
"\tTreat missing values as an extra value \n", "M", 1, "-M"));
newVector.addElement(new Option(
"\tSpecify the percentage of values that are changed (default 10)", "P", 1, "-P "));
newVector.addElement(new Option(
"\tSpecify the random number seed (default 1)", "S", 1, "-S "));
return newVector.elements();
}
/**
* Parses a given list of options.
*
*
* Valid options are:
*
*
*
* -C <col>
* Index of the attribute to be changed
* (default last attribute)
*
*
*
* -M
* Treat missing values as an extra value
*
*
*
* -P <num>
* Specify the percentage of values that are changed (default 10)
*
*
*
* -S <num>
* Specify the random number seed (default 1)
*
*
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
@Override
public void setOptions(String[] options) throws Exception {
String indexString = Utils.getOption('C', options);
if (indexString.length() != 0) {
setAttributeIndex(indexString);
} else {
setAttributeIndex("last");
}
if (Utils.getFlag('M', options)) {
setUseMissing(true);
}
String percentString = Utils.getOption('P', options);
if (percentString.length() != 0) {
setPercent((int) Double.valueOf(percentString).doubleValue());
} else {
setPercent(10);
}
String seedString = Utils.getOption('S', options);
if (seedString.length() != 0) {
setRandomSeed(Integer.parseInt(seedString));
} else {
setRandomSeed(1);
}
Utils.checkForRemainingOptions(options);
}
/**
* Gets the current settings of the filter.
*
* @return an array of strings suitable for passing to setOptions
*/
@Override
public String[] getOptions() {
Vector options = new Vector();
options.add("-C");
options.add("" + getAttributeIndex());
if (getUseMissing()) {
options.add("-M");
}
options.add("-P");
options.add("" + getPercent());
options.add("-S");
options.add("" + getRandomSeed());
return options.toArray(new String[0]);
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String useMissingTipText() {
return "Flag to set if missing values are treated as separate values.";
}
/**
* Gets the flag if missing values are treated as extra values.
*
* @return the flag missing values.
*/
public boolean getUseMissing() {
return m_UseMissing;
}
/**
* Sets the flag if missing values are treated as extra values.
*
* @param newUseMissing the new flag value.
*/
public void setUseMissing(boolean newUseMissing) {
m_UseMissing = newUseMissing;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String randomSeedTipText() {
return "Random number seed.";
}
/**
* Gets the random number seed.
*
* @return the random number seed.
*/
public int getRandomSeed() {
return m_RandomSeed;
}
/**
* Sets the random number seed.
*
* @param newSeed the new random number seed.
*/
public void setRandomSeed(int newSeed) {
m_RandomSeed = newSeed;
}
@ProgrammaticProperty
public void setSeed(int seed) {
setRandomSeed(seed);
}
@ProgrammaticProperty
public int getSeed() {
return getRandomSeed();
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String percentTipText() {
return "Percentage of values that are changed.";
}
/**
* Gets the size of noise data as a percentage of the original set.
*
* @return the noise data size
*/
public int getPercent() {
return m_Percent;
}
/**
* Sets the size of noise data, as a percentage of the original set.
*
* @param newPercent the subsample set size, between 0 and 100.
*/
public void setPercent(int newPercent) {
m_Percent = newPercent;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String attributeIndexTipText() {
return "Index of the attribute that is to changed.";
}
/**
* Get the index of the attribute used.
*
* @return the index of the attribute
*/
public String getAttributeIndex() {
return m_AttIndex.getSingleIndex();
}
/**
* Sets index of the attribute used.
*
* @param attIndex the index of the attribute
*/
public void setAttributeIndex(String attIndex) {
m_AttIndex.setSingleIndex(attIndex);
}
/**
* Returns the Capabilities of this filter.
*
* @return the capabilities of this object
* @see Capabilities
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.disableAll();
// attributes
result.enableAllAttributes();
result.enable(Capability.MISSING_VALUES);
// class
result.enableAllClasses();
result.enable(Capability.MISSING_CLASS_VALUES);
result.enable(Capability.NO_CLASS);
return result;
}
/**
* Sets the format of the input instances.
*
* @param instanceInfo an Instances object containing the input instance
* structure (any instances contained in the object are ignored -
* only the structure is required).
* @return true if the outputFormat may be collected immediately
* @throws Exception if the input format can't be set successfully
*/
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
super.setInputFormat(instanceInfo);
// set input format
// m_InputFormat = new Instances(instanceInfo, 0);
m_AttIndex.setUpper(getInputFormat().numAttributes() - 1);
// set index of attribute to be changed
// test if nominal
if (!getInputFormat().attribute(m_AttIndex.getIndex()).isNominal()) {
throw new Exception("Adding noise is not possible:"
+ "Chosen attribute is numeric.");
}
// test if two values are given
if ((getInputFormat().attribute(m_AttIndex.getIndex()).numValues() < 2)
&& (!m_UseMissing)) {
throw new Exception("Adding noise is not possible:"
+ "Chosen attribute has less than two values.");
}
setOutputFormat(getInputFormat());
m_NewBatch = true;
return false;
}
/**
* Input an instance for filtering.
*
* @param instance the input instance
* @return true if the filtered instance may now be collected with output().
* @throws Exception if the input format was not set
*/
@Override
public boolean input(Instance instance) throws Exception {
// check if input format is defined
if (getInputFormat() == null) {
throw new Exception("No input instance format defined");
}
if (m_NewBatch) {
resetQueue();
m_NewBatch = false;
}
if (isFirstBatchDone()) {
push(instance);
return true;
} else {
bufferInput(instance);
return false;
}
}
/**
* Signify that this batch of input to the filter is finished. If the filter
* requires all instances prior to filtering, output() may now be called to
* retrieve the filtered instances.
*
* @return true if there are instances pending output
* @throws Exception if no input structure has been defined
*/
@Override
public boolean batchFinished() throws Exception {
if (getInputFormat() == null) {
throw new Exception("No input instance format defined");
}
// Do the subsample, and clear the input instances.
addNoise(getInputFormat(), m_RandomSeed, m_Percent, m_AttIndex.getIndex(),
m_UseMissing);
for (int i = 0; i < getInputFormat().numInstances(); i++) {
push((Instance) getInputFormat().instance(i).copy(), false); // No need to copy instance
}
flushInput();
m_NewBatch = true;
m_FirstBatchDone = true;
return (numPendingOutput() != 0);
}
/**
* add noise to the dataset
*
* a given percentage of the instances are changed in the way that a set of
* instances are randomly selected using seed. The attribute given by its
* index is changed from its current value to one of the other possibly ones,
* also randomly. This is done while leaving the proportion the same. If
* m_UseMissing is true, missing value is used as a value of its own
*
* @param instances is the dataset
* @param seed used for random function
* @param percent percentage of instances that are changed
* @param attIndex index of the attribute changed
* @param useMissing if true missing values are treated as extra value
*/
public void addNoise(Instances instances, int seed, int percent,
int attIndex, boolean useMissing) {
int indexList[];
int partition_count[];
int partition_max[];
double splitPercent = percent; // percentage used for splits
// fill array with the indexes
indexList = new int[instances.numInstances()];
for (int i = 0; i < instances.numInstances(); i++) {
indexList[i] = i;
}
// randomize list of indexes
Random random = new Random(seed);
for (int i = instances.numInstances() - 1; i >= 0; i--) {
int hValue = indexList[i];
int hIndex = (int) (random.nextDouble() * i);
indexList[i] = indexList[hIndex];
indexList[hIndex] = hValue;
}
// initialize arrays that are used to count instances
// of each value and to keep the amount of instances of that value
// that has to be changed
// this is done for the missing values in the two variables
// missing_count and missing_max
int numValues = instances.attribute(attIndex).numValues();
partition_count = new int[numValues];
partition_max = new int[numValues];
int missing_count = 0;
int missing_max = 0;
for (int i = 0; i < numValues; i++) {
partition_count[i] = 0;
partition_max[i] = 0;
}
// go through the dataset and count all occurrences of values
// and all missing values using temporarily .._max arrays and
// variable missing_max
for (Object element : instances) {
Instance instance = (Instance) element;
if (instance.isMissing(attIndex)) {
missing_max++;
} else {
instance.value(attIndex);
partition_max[(int) instance.value(attIndex)]++;
}
}
// use given percentage to calculate
// how many have to be changed per split and
// how many of the missing values
if (!useMissing) {
missing_max = missing_count;
} else {
missing_max = (int) (((double) missing_max / 100) * splitPercent + 0.5);
}
int sum_max = missing_max;
for (int i = 0; i < numValues; i++) {
partition_max[i] = (int) (((double) partition_max[i] / 100)
* splitPercent + 0.5);
sum_max = sum_max + partition_max[i];
}
// initialize sum_count to zero, use this variable to see if
// everything is done already
int sum_count = 0;
// add noise
// using the randomized index-array
//
Random randomValue = new Random(seed);
int numOfValues = instances.attribute(attIndex).numValues();
for (int i = 0; i < instances.numInstances(); i++) {
if (sum_count >= sum_max) {
break;
} // finished
Instance currInstance = instances.instance(indexList[i]);
// if value is missing then...
if (currInstance.isMissing(attIndex)) {
if (missing_count < missing_max) {
changeValueRandomly(randomValue, numOfValues, attIndex, currInstance,
useMissing);
missing_count++;
sum_count++;
}
} else {
int vIndex = (int) currInstance.value(attIndex);
if (partition_count[vIndex] < partition_max[vIndex]) {
changeValueRandomly(randomValue, numOfValues, attIndex, currInstance,
useMissing);
partition_count[vIndex]++;
sum_count++;
}
}
}
}
/**
* method to set a new value
*
* @param r random function
* @param numOfValues
* @param instance
* @param useMissing
*/
private void changeValueRandomly(Random r, int numOfValues, int indexOfAtt,
Instance instance, boolean useMissing) {
int currValue;
// get current value
// if value is missing set current value to number of values
// whiche is the highest possible value plus one
if (instance.isMissing(indexOfAtt)) {
currValue = numOfValues;
} else {
currValue = (int) instance.value(indexOfAtt);
}
// with only two possible values it is easier
if ((numOfValues == 2) && (!instance.isMissing(indexOfAtt))) {
instance.setValue(indexOfAtt, (currValue + 1) % 2);
} else {
// get randomly a new value not equal to the current value
// if missing values are used as values they must be treated
// in a special way
while (true) {
int newValue;
if (useMissing) {
newValue = (int) (r.nextDouble() * (numOfValues + 1));
} else {
newValue = (int) (r.nextDouble() * numOfValues);
}
// have we found a new value?
if (newValue != currValue) {
// the value 1 above the highest possible value (=numOfValues)
// is used as missing value
if (newValue == numOfValues) {
instance.setMissing(indexOfAtt);
} else {
instance.setValue(indexOfAtt, newValue);
}
break;
}
}
}
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 14534 $");
}
/**
* Main method for testing this class.
*
* @param argv should contain arguments to the filter: use -h for help
*/
public static void main(String[] argv) {
runFilter(new AddNoise(), argv);
}
}