Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
weka.filters.unsupervised.attribute.ReplaceMissingWithUserConstant Maven / Gradle / Ivy
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* ReplaceMissingWithUserConstant.java
* Copyright (C) 2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.filters.unsupervised.attribute;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.StreamableFilter;
import weka.filters.UnsupervisedFilter;
/**
* Replaces all missing values for nominal, string,
* numeric and date attributes in the dataset with user-supplied constant
* values.
*
*
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 12037 $
*/
public class ReplaceMissingWithUserConstant extends PotentialClassIgnorer
implements UnsupervisedFilter, StreamableFilter, EnvironmentHandler {
/** For serialization */
private static final long serialVersionUID = -7334039452189350356L;
/**
 * Environment variables used to resolve variable references in the
 * user-supplied settings. Transient, so it is not serialized; when it is
 * null at setInputFormat() time the system-wide environment is used.
 */
protected transient Environment m_env;
/** Range of columns to consider (built from the resolved range specification) */
protected Range m_selectedRange;
/**
 * User-supplied specification of the attributes to process: either a range
 * of indices or a comma separated list of attribute names
 */
protected String m_range = "first-last";
/** Attribute specification after resolving environment vars */
protected String m_resolvedRange = "";
/** Constant for replacing missing values in nominal/string atts with */
protected String m_nominalStringConstant = "";
/** Replacement value for nominal/string atts after resolving environment vars */
protected String m_resolvedNominalStringConstant = "";
/** Constant for replacing missing values in numeric attributes with */
protected String m_numericConstant = "0";
/** Replacement value for numeric atts after resolving environment vars */
protected String m_resolvedNumericConstant = "";
/** Parsed numeric constant value */
protected double m_numericConstVal = 0;
/** Constant for replacing missing values in date attributes with */
protected String m_dateConstant = "";
/** Replacement value for date atts after resolving environment vars */
protected String m_resolvedDateConstant = "";
/** Parsed date value as a double (milliseconds as returned by Date.getTime()) */
protected double m_dateConstVal = 0;
/** Formatting string to use for parsing the date constant */
protected String m_defaultDateFormat = "yyyy-MM-dd'T'HH:mm:ss";
/** Formatting string after resolving environment vars */
protected String m_resolvedDateFormat = "";
/**
 * Describes what this filter does.
 *
 * @return a short, human-readable description of the filter, suitable for
 *         display in the explorer/experimenter gui
 */
public String globalInfo() {
  final String description =
    "Replaces all missing values for nominal, string, numeric and date attributes "
      + "in the dataset with user-supplied constant values.";
  return description;
}
/**
 * Returns the capabilities of this filter: starting from the superclass
 * capabilities, everything is disabled and then all attribute types, all
 * class types, missing values (in attributes and class) and data without a
 * class are enabled.
 *
 * @return the capabilities of this filter
 */
@Override
public Capabilities getCapabilities() {
  final Capabilities caps = super.getCapabilities();
  caps.disableAll();

  // attribute side
  caps.enableAllAttributes();
  caps.enable(Capability.MISSING_VALUES);

  // class side
  caps.enableAllClasses();
  caps.enable(Capability.MISSING_CLASS_VALUES);
  caps.enable(Capability.NO_CLASS);

  return caps;
}
/**
 * Returns an enumeration describing the available options: the five options
 * specific to this filter (-A, -N, -R, -D, -F) followed by the options of the
 * superclass.
 *
 * @return an enumeration of all the available options
 */
@Override
public Enumeration<Option> listOptions() {
  Vector<Option> opts = new Vector<Option>(5);

  // The attribute list is parsed from the -A flag in setOptions(), so the
  // option must be registered under the name "A" (not "R", which belongs to
  // the numeric replacement constant below).
  opts.addElement(new Option(
    "\tSpecify list of attributes to replace missing values for "
      + "\n\t(as weka range list of indices or a comma separated list of attribute names).\n"
      + "\t(default: consider all attributes)", "A", 1,
    "-A <index1,index2-index4,... | att-name1,att-name2,...>"));
  opts.addElement(new Option(
    "\tSpecify the replacement constant for nominal/string attributes", "N",
    1, "-N"));
  opts.addElement(new Option(
    "\tSpecify the replacement constant for numeric attributes"
      + "\n\t(default: 0)", "R", 1, "-R"));
  opts.addElement(new Option(
    "\tSpecify the replacement constant for date attributes", "D", 1, "-D"));
  opts.addElement(new Option(
    "\tSpecify the date format for parsing the replacement date constant"
      + "\n\t(default: yyyy-MM-dd'T'HH:mm:ss)", "F", 1, "-F"));

  opts.addAll(Collections.list(super.listOptions()));
  return opts.elements();
}
/**
 * Parses a given list of options. An option of this filter whose value is
 * absent or empty leaves the corresponding setting untouched. The remaining
 * options are handed to the superclass, and any option left over after that
 * is reported as an error.
 * <p>
 * Valid options are:
 * </p>
 *
 * <pre>
 * -A &lt;index1,index2-index4,... | att-name1,att-name2,...&gt;
 *  Specify list of attributes to replace missing values for
 *  (as weka range list of indices or a comma separated list of attribute names).
 *  (default: consider all attributes)
 * </pre>
 *
 * <pre>
 * -N
 *  Specify the replacement constant for nominal/string attributes
 * </pre>
 *
 * <pre>
 * -R
 *  Specify the replacement constant for numeric attributes
 *  (default: 0)
 * </pre>
 *
 * <pre>
 * -D
 *  Specify the replacement constant for date attributes
 * </pre>
 *
 * <pre>
 * -F
 *  Specify the date format for parsing the replacement date constant
 *  (default: yyyy-MM-dd'T'HH:mm:ss)
 * </pre>
 *
 * <pre>
 * -unset-class-temporarily
 *  Unsets the class index temporarily before the filter is
 *  applied to the data.
 *  (default: no)
 * </pre>
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  final String attributeSpec = Utils.getOption('A', options);
  if (!attributeSpec.isEmpty()) {
    setAttributes(attributeSpec);
  }

  final String nominalValue = Utils.getOption('N', options);
  if (!nominalValue.isEmpty()) {
    setNominalStringReplacementValue(nominalValue);
  }

  final String numericValue = Utils.getOption('R', options);
  if (!numericValue.isEmpty()) {
    setNumericReplacementValue(numericValue);
  }

  final String dateValue = Utils.getOption('D', options);
  if (!dateValue.isEmpty()) {
    setDateReplacementValue(dateValue);
  }

  final String datePattern = Utils.getOption('F', options);
  if (!datePattern.isEmpty()) {
    setDateFormat(datePattern);
  }

  super.setOptions(options);
  Utils.checkForRemainingOptions(options);
}
/**
 * Gets the current settings of the filter as an option array. Each setting
 * of this filter that is non-empty is emitted as a flag/value pair, followed
 * by the options of the superclass.
 *
 * @return an array of strings suitable for passing to setOptions; never
 *         contains null entries
 */
@Override
public String[] getOptions() {
  List<String> options = new ArrayList<String>();

  appendOption(options, "-A", getAttributes());
  appendOption(options, "-N", getNominalStringReplacementValue());
  appendOption(options, "-R", getNumericReplacementValue());
  appendOption(options, "-D", getDateReplacementValue());
  appendOption(options, "-F", getDateFormat());

  Collections.addAll(options, super.getOptions());

  // A zero-length seed array makes toArray() size the result exactly; a
  // one-element seed would yield {null} when there are no options at all.
  return options.toArray(new String[0]);
}

/**
 * Appends a flag and its value to the option list, unless the value is null
 * or empty.
 *
 * @param options the list to append to
 * @param flag the option flag, including the leading dash
 * @param value the option value
 */
private static void appendOption(List<String> options, String flag,
  String value) {
  if (value != null && value.length() > 0) {
    options.add(flag);
    options.add(value);
  }
}
/**
 * Tool tip for the "attributes" property, for display in the GUI.
 *
 * @return the tip text describing how to specify the attributes to act on
 */
public String attributesTipText() {
  final StringBuilder tip = new StringBuilder();
  tip.append("Specify range of attributes to act on.");
  tip.append(" This is a comma separated list of attribute indices, with"
    + " \"first\" and \"last\" valid values.");
  tip.append(" Specify an inclusive range with \"-\".");
  tip.append(" E.g: \"first-3,5,6-10,last\".");
  tip.append(" Can alternatively specify a comma separated list of attribute names.");
  tip.append(" Note that  you can't mix indices and attribute names in the same list");
  return tip.toString();
}
/**
 * Sets the specification of the attributes whose missing values are to be
 * replaced. The string is stored as given; it is interpreted when the input
 * format is set.
 *
 * @param range the list of attributes to consider
 */
public void setAttributes(String range) {
  this.m_range = range;
}
/**
 * Returns the specification of the attributes whose missing values are to
 * be replaced, exactly as it was set.
 *
 * @return the list of attributes to consider
 */
public String getAttributes() {
  return this.m_range;
}
/**
 * Tool tip for the "nominalStringReplacementValue" property, for display in
 * the GUI.
 *
 * @return the tip text for this property
 */
public String nominalStringReplacementValueTipText() {
  final String tip =
    "The constant to replace missing values in nominal/string attributes with";
  return tip;
}
/**
 * Returns the constant used to replace missing values in nominal and string
 * attributes, exactly as it was set.
 *
 * @return the nominal/string replacement value
 */
public String getNominalStringReplacementValue() {
  return this.m_nominalStringConstant;
}
/**
 * Sets the constant used to replace missing values in nominal and string
 * attributes.
 *
 * @param nominalStringConstant the nominal/string constant to use
 */
public void setNominalStringReplacementValue(String nominalStringConstant) {
  this.m_nominalStringConstant = nominalStringConstant;
}
/**
 * Tool tip for the "numericReplacementValue" property, for display in the
 * GUI.
 *
 * @return the tip text for this property
 */
public String numericReplacementValueTipText() {
  final String tip =
    "The constant to replace missing values in numeric attributes with";
  return tip;
}
/**
 * Returns the constant used to replace missing values in numeric
 * attributes, as the string that was set.
 *
 * @return the numeric replacement value
 */
public String getNumericReplacementValue() {
  return this.m_numericConstant;
}
/**
 * Sets the constant used to replace missing values in numeric attributes.
 * The string is stored as given; it is parsed when the input format is set.
 *
 * @param numericConstant the numeric replacement value
 */
public void setNumericReplacementValue(String numericConstant) {
  this.m_numericConstant = numericConstant;
}
/**
 * Tool tip for the "dateReplacementValue" property, for display in the GUI.
 *
 * @return the tip text for this property
 */
public String dateReplacementValueTipText() {
  final String tip =
    "The constant to replace missing values in date attributes with";
  return tip;
}
/**
 * Sets the constant used to replace missing values in date attributes. The
 * string is stored as given; it is parsed when the input format is set.
 *
 * @param dateConstant the date replacement value
 */
public void setDateReplacementValue(String dateConstant) {
  this.m_dateConstant = dateConstant;
}
/**
 * Returns the constant used to replace missing values in date attributes,
 * as the string that was set.
 *
 * @return the date replacement value
 */
public String getDateReplacementValue() {
  return this.m_dateConstant;
}
/**
 * Tool tip for the "dateFormat" property, for display in the GUI.
 *
 * @return the tip text for this property
 */
public String dateFormatTipText() {
  final String tip =
    "The formatting string to use for parsing the date replacement value";
  return tip;
}
/**
 * Sets the format string used to parse the date replacement constant.
 *
 * @param dateFormat the date format to use
 */
public void setDateFormat(String dateFormat) {
  this.m_defaultDateFormat = dateFormat;
}
/**
 * Returns the format string used to parse the date replacement constant.
 *
 * @return the date format to use
 */
public String getDateFormat() {
  return this.m_defaultDateFormat;
}
/**
 * Sets the format of the input instances and prepares the filter for use:
 * <ol>
 * <li>resolves environment variables in all user-supplied settings,</li>
 * <li>builds the range of selected attributes, first interpreting the
 * specification as a range of indices and, if that fails, as a comma
 * separated list of attribute names,</li>
 * <li>checks that a usable replacement constant exists for every attribute
 * type that occurs in the selected range and parses the numeric and date
 * constants,</li>
 * <li>builds the output format, in which the replacement constant is added
 * as the first label of every selected, non-class nominal attribute that
 * does not already contain it.</li>
 * </ol>
 *
 * @param instanceInfo the header of the incoming instances
 * @return always true; the output format is set before returning
 * @throws Exception if a named attribute cannot be found, if a required
 *           replacement constant is missing or cannot be parsed, or if the
 *           superclass rejects the input format
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  super.setInputFormat(instanceInfo);

  if (m_env == null) {
    m_env = Environment.getSystemWide();
  }

  // Each setting is resolved on its own, so that a variable that cannot be
  // resolved in one setting does not prevent the others from being resolved.
  m_resolvedNominalStringConstant =
    substituteBestEffort(m_nominalStringConstant);
  m_resolvedNumericConstant = substituteBestEffort(m_numericConstant);
  m_resolvedDateConstant = substituteBestEffort(m_dateConstant);
  m_resolvedDateFormat = substituteBestEffort(m_defaultDateFormat);
  m_resolvedRange = substituteBestEffort(m_range);

  // try and set up a Range first directly from the supplied string
  m_selectedRange = new Range(m_resolvedRange);
  try {
    m_selectedRange.setUpper(instanceInfo.numAttributes() - 1);
  } catch (IllegalArgumentException e) {
    // now try as a list of named attributes
    String[] parts = m_resolvedRange.split(",");
    if (parts.length == 0) {
      throw new Exception(
        "Must specify which attributes to replace missing values for!");
    }

    StringBuilder indexList = new StringBuilder();
    for (String att : parts) {
      att = att.trim();
      Attribute a = instanceInfo.attribute(att);
      if (a == null) {
        throw new Exception("I can't find the requested attribute '" + att
          + "' in the incoming instances.");
      }
      if (indexList.length() > 0) {
        indexList.append(',');
      }
      // Range specifications are 1-based
      indexList.append(a.index() + 1);
    }

    m_selectedRange = new Range(indexList.toString());
    m_selectedRange.setUpper(instanceInfo.numAttributes() - 1);
  }

  // find out which attribute types occur in the selected range
  boolean hasNominalOrString = false;
  boolean hasNumeric = false;
  boolean hasDate = false;
  for (int i = 0; i < instanceInfo.numAttributes(); i++) {
    if (!m_selectedRange.isInRange(i)) {
      continue;
    }
    Attribute current = instanceInfo.attribute(i);
    if (current.isNominal() || current.isString()) {
      hasNominalOrString = true;
    } else if (current.isDate()) {
      // must be tested before isNumeric(), as in the original ordering
      hasDate = true;
    } else if (current.isNumeric()) {
      hasNumeric = true;
    }
  }

  if (hasNominalOrString
    && (m_resolvedNominalStringConstant == null
      || m_resolvedNominalStringConstant.length() == 0)) {
    if (m_resolvedNumericConstant != null
      && m_resolvedNumericConstant.length() > 0) {
      // use the supplied numeric constant as a nominal value
      m_resolvedNominalStringConstant = "" + m_resolvedNumericConstant;
    } else {
      throw new Exception("Data contains nominal/string attributes and no "
        + "replacement constant has been supplied");
    }
  }

  if (hasNumeric) {
    // Test the resolved value (as the nominal branch above does), because
    // that is the value that gets parsed below.
    if (m_resolvedNumericConstant == null
      || m_resolvedNumericConstant.length() == 0) {
      if (m_resolvedNominalStringConstant != null
        && m_resolvedNominalStringConstant.length() > 0) {
        // use the supplied nominal constant as numeric replacement
        // value (if we can parse it as a number)
        try {
          Double.parseDouble(m_resolvedNominalStringConstant);
          m_resolvedNumericConstant = m_resolvedNominalStringConstant;
        } catch (NumberFormatException e) {
          throw new Exception(
            "Data contains numeric attributes and no numeric "
              + "constant has been supplied. Unable to parse nominal "
              + "constant as a number either.", e);
        }
      } else {
        throw new Exception("Data contains numeric attributes and no "
          + "replacement constant has been supplied");
      }
    }

    try {
      m_numericConstVal = Double.parseDouble(m_resolvedNumericConstant);
    } catch (NumberFormatException e) {
      throw new Exception("Unable to parse numeric constant", e);
    }
  }

  if (hasDate) {
    if (m_resolvedDateConstant == null
      || m_resolvedDateConstant.length() == 0) {
      throw new Exception(
        "Data contains date attributes and no replacement constant has been "
          + "supplied");
    }
    SimpleDateFormat sdf = new SimpleDateFormat(m_resolvedDateFormat);
    Date d = sdf.parse(m_resolvedDateConstant);
    m_dateConstVal = d.getTime();
  }

  // Build the output header in a single pass. A selected, non-class nominal
  // attribute that lacks the replacement constant is replaced by a copy
  // that has the constant as its first label; every other attribute is
  // copied unchanged and keeps its position.
  ArrayList<Attribute> atts =
    new ArrayList<Attribute>(instanceInfo.numAttributes());
  boolean headerChanged = false;
  for (int i = 0; i < instanceInfo.numAttributes(); i++) {
    Attribute current = instanceInfo.attribute(i);
    boolean needsNewLabel = i != instanceInfo.classIndex()
      && m_selectedRange.isInRange(i) && current.isNominal()
      && current.indexOfValue(m_resolvedNominalStringConstant) < 0;

    if (needsNewLabel) {
      List<String> values = new ArrayList<String>(current.numValues() + 1);
      values.add(m_resolvedNominalStringConstant);
      for (int j = 0; j < current.numValues(); j++) {
        values.add(current.value(j));
      }
      Attribute extended = new Attribute(current.name(), values);
      extended.setWeight(current.weight());
      atts.add(extended);
      headerChanged = true;
    } else {
      atts.add((Attribute) current.copy());
    }
  }

  Instances outputFormat;
  if (headerChanged) {
    outputFormat = new Instances(instanceInfo.relationName(), atts, 0);
    outputFormat.setClassIndex(getInputFormat().classIndex());
  } else {
    outputFormat = new Instances(instanceInfo, 0);
  }

  setOutputFormat(outputFormat);
  return true;
}

/**
 * Resolves environment variables in the supplied setting on a best-effort
 * basis.
 *
 * @param value the setting to resolve, may be null or empty
 * @return the resolved setting, or the setting unchanged if it is null or
 *         empty or if the substitution fails
 */
private String substituteBestEffort(String value) {
  if (value == null || value.length() == 0) {
    return value;
  }
  try {
    return m_env.substitute(value);
  } catch (Exception ignored) {
    // Deliberately best-effort: fall back to the raw value. Later
    // validation/parsing reports a problem if the value is unusable.
    return value;
  }
}
/**
 * Inputs an instance for filtering. Missing values of selected, non-class
 * attributes are replaced with the configured constants; all other values
 * are carried over. The resulting instance is pushed onto the output queue.
 *
 * @param inst the input instance
 * @return always true; the filtered instance is pushed onto the output queue
 *         before returning
 * @throws IllegalStateException if no input format has been defined
 * @throws Exception if the instance cannot be processed
 */
@Override
public boolean input(Instance inst) throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  double[] vals = new double[inst.numAttributes()];
  for (int i = 0; i < vals.length; i++) {
    // Default: carry the value over unchanged. This covers attributes
    // outside the selected range, the class attribute, and attribute types
    // that are not handled below.
    vals[i] = inst.value(i);
    if (!m_selectedRange.isInRange(i)) {
      continue;
    }

    Attribute att = inst.attribute(i);
    boolean isClass = (i == inst.classIndex());

    if (inst.isMissing(i)) {
      if (isClass) {
        // missing class values are left alone
        continue;
      }
      if (att.isDate()) {
        vals[i] = m_dateConstVal;
      } else if (att.isNumeric()) {
        vals[i] = m_numericConstVal;
      } else if (att.isNominal()) {
        // If the constant is already a label, the header is unchanged and
        // the label keeps its index; otherwise the constant was added as
        // the first label of the output attribute.
        int existing = att.indexOfValue(m_resolvedNominalStringConstant);
        vals[i] = (existing >= 0) ? existing : 0;
      } else if (att.isString()) {
        // a bit of a hack here to try and detect if we're running in
        // streaming mode or batch mode. If the string attribute has only
        // one value in the header then it is likely that we're running in
        // streaming mode (where only one value, the current instance's
        // value, is maintained in memory)
        if (att.numValues() <= 1) {
          outputFormatPeek().attribute(i).setStringValue(
            m_resolvedNominalStringConstant);
          vals[i] = 0;
        } else {
          vals[i] = outputFormatPeek().attribute(i).addStringValue(
            m_resolvedNominalStringConstant);
        }
      }
    } else if (att.isString()) {
      // in range but not missing: same streaming/batch detection as above
      String value = inst.stringValue(i);
      if (att.numValues() <= 1) {
        outputFormatPeek().attribute(i).setStringValue(value);
      } else {
        outputFormatPeek().attribute(i).addStringValue(value);
      }
      vals[i] = outputFormatPeek().attribute(i).indexOfValue(value);
    } else if (att.isNominal() && !isClass
      && att.indexOfValue(m_resolvedNominalStringConstant) < 0) {
      // The constant was inserted as the first label of the output
      // attribute, so existing labels moved up by one. When the constant
      // was already a label the header is unchanged and no shift applies.
      vals[i] = inst.value(i) + 1;
    }
  }

  Instance newInst;
  if (inst instanceof SparseInstance) {
    newInst = new SparseInstance(inst.weight(), vals);
  } else {
    newInst = new DenseInstance(inst.weight(), vals);
  }
  newInst.setDataset(getOutputFormat());

  push(newInst, false); // No need to copy
  return true;
}
/**
 * Sets the environment variables to use when resolving variable references
 * in the filter's settings.
 *
 * @param env the environment variables to use
 */
@Override
public void setEnvironment(Environment env) {
  this.m_env = env;
}
/**
 * Returns the revision string of this class, extracted from the version
 * control keyword.
 *
 * @return the revision
 */
@Override
public String getRevision() {
  final String revisionKeyword = "$Revision: 12037 $";
  return RevisionUtils.extract(revisionKeyword);
}
/**
 * Command line entry point: runs a new instance of this filter with the
 * given arguments.
 *
 * @param args should contain arguments to the filter: use -h for help
 */
public static void main(String[] args) {
  final ReplaceMissingWithUserConstant filter =
    new ReplaceMissingWithUserConstant();
  runFilter(filter, args);
}
}