weka.filters.unsupervised.attribute.RemoveUseless Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* RemoveUseless.java
* Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.filters.unsupervised.attribute;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
/**
* This filter removes attributes that do not vary at
* all or that vary too much. All constant attributes are deleted automatically,
* along with any that exceed the maximum percentage of variance parameter. The
* maximum variance test is only applied to nominal attributes.
*
*
*
* Valid options are:
*
*
*
* -M <max variance %>
* Maximum variance percentage allowed (default 99)
*
*
*
*
* @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
* @version $Revision: 10215 $
*/
public class RemoveUseless extends Filter implements UnsupervisedFilter,
OptionHandler {
/** for serialization */
static final long serialVersionUID = -8659417851407640038L;
/**
 * The filter used to remove attributes. It is null until batchFinished() has
 * built it from the first batch, and is reset to null by setInputFormat().
 */
protected Remove m_removeFilter = null;
/**
 * The maximum variance percentage allowed for a nominal attribute: the number
 * of distinct values divided by the number of non-missing values, times 100.
 * Nominal attributes exceeding this threshold are removed.
 */
protected double m_maxVariancePercentage = 99.0;
/**
 * Describes what kind of data this filter is able to process.
 *
 * Starts from the capabilities reported by the superclass and then switches
 * on the attribute, missing-value and class capabilities listed below.
 *
 * @return the capabilities of this filter
 * @see Capabilities
 */
@Override
public Capabilities getCapabilities() {
  final Capabilities caps = super.getCapabilities();

  // attribute-related capabilities, enabled in this exact order
  final Capability[] attributeCaps = {
    Capability.NOMINAL_ATTRIBUTES,
    Capability.NUMERIC_ATTRIBUTES,
    Capability.DATE_ATTRIBUTES,
    Capability.STRING_ATTRIBUTES,
    Capability.MISSING_VALUES
  };
  for (Capability cap : attributeCaps) {
    caps.enable(cap);
  }

  // class-related capabilities
  caps.enableAllClasses();
  caps.enable(Capability.MISSING_CLASS_VALUES);
  caps.enable(Capability.NO_CLASS);

  return caps;
}
/**
 * Registers the structure of the incoming data and resets this filter.
 *
 * Any previously built Remove filter is discarded, so the set of attributes
 * to delete is established again in batchFinished() from the next batch.
 *
 * @param instanceInfo an Instances object describing the input structure
 *          (passed on unchanged to the superclass)
 * @return always false: the output format is only set once the first batch
 *         has been finished
 * @throws Exception if the superclass cannot set the input format
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  super.setInputFormat(instanceInfo);

  // forget the attribute selection made for any earlier input format
  this.m_removeFilter = null;

  return false;
}
/**
 * Feeds a single instance into the filter.
 *
 * Until the first batch has been finished there is no Remove filter yet, so
 * the instance is only buffered. Afterwards the instance is passed through
 * the Remove filter and the result is queued for output straight away.
 *
 * @param instance the input instance
 * @return true if a filtered instance was queued and may be collected with
 *         output(), false if the instance was merely buffered
 * @throws IllegalStateException if no input format has been defined
 */
@Override
public boolean input(Instance instance) {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // a fresh batch starts with an empty output queue
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  // attributes to remove are not known yet: keep the instance for later
  if (m_removeFilter == null) {
    bufferInput(instance);
    return false;
  }

  // attributes to remove are known: filter and publish immediately
  m_removeFilter.input(instance);
  Instance filtered = m_removeFilter.output();
  filtered.setDataset(getOutputFormat());
  copyValues(filtered, false, instance.dataset(), getOutputFormat());
  push(filtered);
  return true;
}
/**
 * Signals that the current batch of input is complete.
 *
 * On the first batch (no Remove filter built yet) the buffered data is
 * examined attribute by attribute, skipping the class attribute. An
 * attribute is marked for deletion when
 * - all of its values are missing, or
 * - it has fewer than two distinct values, or
 * - it is nominal and its distinct-value count, as a percentage of its
 *   non-missing values, exceeds the maximum variance percentage.
 * A Remove filter is then configured with these attributes, the buffered
 * instances are pushed through it, and the results are queued for output.
 * On later batches only the input buffer is flushed.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input format has been defined
 * @throws Exception if the internal Remove filter fails
 */
@Override
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  if (m_removeFilter == null) {
    // first batch: work out which attributes are useless
    final Instances buffered = getInputFormat();
    final int attCount = buffered.numAttributes();
    final int classIdx = buffered.classIndex();

    final int[] candidates = new int[attCount];
    int removeCount = 0;

    for (int att = 0; att < attCount; att++) {
      if (att == classIdx) {
        continue; // the class attribute is never removed
      }

      final AttributeStats stats = buffered.attributeStats(att);
      final boolean useless;
      if (stats.missingCount == buffered.numInstances()
        || stats.distinctCount < 2) {
        // entirely missing, or constant
        useless = true;
      } else if (buffered.attribute(att).isNominal()) {
        // nominal attribute: check whether it varies too much
        final double variancePercent = (double) stats.distinctCount
          / (double) (stats.totalCount - stats.missingCount) * 100.0;
        useless = variancePercent > m_maxVariancePercentage;
      } else {
        useless = false;
      }

      if (useless) {
        candidates[removeCount++] = att;
      }
    }

    // trim the index array to the number of attributes actually selected
    final int[] toRemove = new int[removeCount];
    System.arraycopy(candidates, 0, toRemove, 0, removeCount);

    // set up the Remove filter and run the buffered data through it
    m_removeFilter = new Remove();
    m_removeFilter.setAttributeIndicesArray(toRemove);
    m_removeFilter.setInvertSelection(false);
    m_removeFilter.setInputFormat(buffered);

    final int instanceCount = buffered.numInstances();
    for (int row = 0; row < instanceCount; row++) {
      m_removeFilter.input(buffered.instance(row));
    }
    m_removeFilter.batchFinished();

    final Instances reducedFormat = m_removeFilter.getOutputFormat();
    // restore old relation name to hide attribute filter stamp
    reducedFormat.setRelationName(buffered.relationName());
    setOutputFormat(reducedFormat);

    // move everything the Remove filter produced into our own output queue
    for (Instance filtered = m_removeFilter.output(); filtered != null;
      filtered = m_removeFilter.output()) {
      filtered.setDataset(reducedFormat);
      push(filtered);
    }
  }

  flushInput();
  m_NewBatch = true;
  return numPendingOutput() != 0;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration