All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.filters.unsupervised.attribute.ReplaceMissingWithUserConstant Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ReplaceMissingWithUserConstant.java
 *    Copyright (C) 2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.attribute;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.StreamableFilter;
import weka.filters.UnsupervisedFilter;

/**
 *  Replaces all missing values for nominal, string,
 * numeric and date attributes in the dataset with user-supplied constant
 * values.
 * 

* * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 12037 $ */ public class ReplaceMissingWithUserConstant extends PotentialClassIgnorer implements UnsupervisedFilter, StreamableFilter, EnvironmentHandler { /** For serialization */ private static final long serialVersionUID = -7334039452189350356L; /** Environment variables */ protected transient Environment m_env; /** Range of columns to consider */ protected Range m_selectedRange; protected String m_range = "first-last"; protected String m_resolvedRange = ""; /** Constant for replacing missing values in nominal/string atts with */ protected String m_nominalStringConstant = ""; /** Replacement value for nominal/string atts after resolving environment vars */ protected String m_resolvedNominalStringConstant = ""; /** Constant for replacing missing values in numeric attributes with */ protected String m_numericConstant = "0"; /** Replacement value for numeric atts after resolving environment vars */ protected String m_resolvedNumericConstant = ""; /** Parsed numeric constant value */ protected double m_numericConstVal = 0; /** Constant for replacing missing values in date attributes with */ protected String m_dateConstant = ""; /** Replacement value for date atts after resolving environment vars */ protected String m_resolvedDateConstant = ""; /** Parsed date value as a double */ protected double m_dateConstVal = 0; /** Formatting string to use for parsing the date constant */ protected String m_defaultDateFormat = "yyyy-MM-dd'T'HH:mm:ss"; /** Formatting string after resolving environment vars */ protected String m_resolvedDateFormat = ""; /** * Returns a string describing this filter * * @return a description of the filter suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Replaces all missing values for nominal, string, numeric and date " + "attributes in the dataset with user-supplied constant values."; } @Override public Capabilities 
getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } @Override public Enumeration

* * Valid options are: *

* *

   * -A <index1,index2-index4,... | att-name1,att-name2,...>
   *  Specify list of attributes to replace missing values for 
   *  (as weka range list of indices or a comma separated list of attribute names).
   *  (default: consider all attributes)
   * 
* *
   * -N
   *  Specify the replacement constant for nominal/string attributes
   * 
* *
   * -R
   *  Specify the replacement constant for numeric attributes
   *  (default: 0)
   * 
* *
   * -D
   *  Specify the replacement constant for date attributes
   * 
* *
   * -F
   *  Specify the date format for parsing the replacement date constant
   *  (default: yyyy-MM-dd'T'HH:mm:ss)
   * 
* *
   * -unset-class-temporarily
   *  Unsets the class index temporarily before the filter is
   *  applied to the data.
   *  (default: no)
   * 
* * * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String atts = Utils.getOption('A', options); if (atts.length() > 0) { setAttributes(atts); } String nomString = Utils.getOption('N', options); if (nomString.length() > 0) { setNominalStringReplacementValue(nomString); } String numString = Utils.getOption('R', options); if (numString.length() > 0) { setNumericReplacementValue(numString); } String dateString = Utils.getOption('D', options); if (dateString.length() > 0) { setDateReplacementValue(dateString); } String formatString = Utils.getOption('F', options); if (formatString.length() > 0) { setDateFormat(formatString); } super.setOptions(options); Utils.checkForRemainingOptions(options); } @Override public String[] getOptions() { ArrayList options = new ArrayList(); if (getAttributes().length() > 0) { options.add("-A"); options.add(getAttributes()); } if (getNominalStringReplacementValue().length() > 0) { options.add("-N"); options.add(getNominalStringReplacementValue()); } if (getNumericReplacementValue().length() > 0) { options.add("-R"); options.add(getNumericReplacementValue()); } if (getDateReplacementValue().length() > 0) { options.add("-D"); options.add(getDateReplacementValue()); } if (getDateFormat().length() > 0) { options.add("-F"); options.add(getDateFormat()); } Collections.addAll(options, super.getOptions()); return options.toArray(new String[1]); } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String attributesTipText() { return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". E.g: \"first-3,5,6-10,last\". Can alternatively" + " specify a comma separated list of attribute names. 
Note that " + " you can't mix indices and attribute names in the same list"; } /** * Set the list of attributes to consider for replacing missing values * * @param range the list of attributes to consider */ public void setAttributes(String range) { m_range = range; } /** * Get the list of attributes to consider for replacing missing values * * @return the list of attributes to consider */ public String getAttributes() { return m_range; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String nominalStringReplacementValueTipText() { return "The constant to replace missing values in nominal/string attributes with"; } /** * Get the nominal/string replacement value * * @return the nominal/string replacement value */ public String getNominalStringReplacementValue() { return m_nominalStringConstant; } /** * Set the nominal/string replacement value * * @param m_nominalStringConstant the nominal/string constant to use */ public void setNominalStringReplacementValue(String nominalStringConstant) { m_nominalStringConstant = nominalStringConstant; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String numericReplacementValueTipText() { return "The constant to replace missing values in numeric attributes with"; } /** * Get the numeric replacement value * * @return the numeric replacement value */ public String getNumericReplacementValue() { return m_numericConstant; } /** * Set the numeric replacement value * * @param numericConstant the numeric replacement value */ public void setNumericReplacementValue(String numericConstant) { m_numericConstant = numericConstant; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. 
*/ public String dateReplacementValueTipText() { return "The constant to replace missing values in date attributes with"; } /** * Set the date replacement value * * @param dateConstant the date replacement value */ public void setDateReplacementValue(String dateConstant) { m_dateConstant = dateConstant; } /** * Get the date replacement value * * @return the date replacement value */ public String getDateReplacementValue() { return m_dateConstant; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String dateFormatTipText() { return "The formatting string to use for parsing the date replacement value"; } /** * Set the date format to use for parsing the date replacement constant * * @param dateFormat the date format to use */ public void setDateFormat(String dateFormat) { m_defaultDateFormat = dateFormat; } /** * Get the date format to use for parsing the date replacement constant * * @return the date format to use */ public String getDateFormat() { return m_defaultDateFormat; } @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_resolvedNominalStringConstant = m_nominalStringConstant; m_resolvedNumericConstant = m_numericConstant; m_resolvedDateConstant = m_dateConstant; m_resolvedDateFormat = m_defaultDateFormat; m_resolvedRange = m_range; if (m_env == null) { m_env = Environment.getSystemWide(); } try { if (m_resolvedNominalStringConstant != null && m_resolvedNominalStringConstant.length() > 0) { m_resolvedNominalStringConstant = m_env .substitute(m_resolvedNominalStringConstant); } if (m_resolvedNumericConstant != null && m_resolvedNumericConstant.length() > 0) { m_resolvedNumericConstant = m_env.substitute(m_resolvedNumericConstant); } if (m_resolvedDateConstant != null && m_resolvedDateConstant.length() > 0) { m_resolvedDateConstant = m_env.substitute(m_resolvedDateConstant); } if (m_resolvedDateFormat != null && 
m_resolvedDateFormat.length() > 0) { m_resolvedDateFormat = m_env.substitute(m_resolvedDateFormat); } if (m_resolvedRange != null && m_resolvedRange.length() > 0) { m_resolvedRange = m_env.substitute(m_resolvedRange); } } catch (Exception ex) { } // try and set up a Range first directly from the supplied string m_selectedRange = new Range(m_resolvedRange); try { m_selectedRange.setUpper(instanceInfo.numAttributes() - 1); } catch (IllegalArgumentException e) { // now try as a list of named attributes String[] parts = m_resolvedRange.split(","); if (parts.length == 0) { throw new Exception( "Must specify which attributes to replace missing values for!"); } StringBuffer indexList = new StringBuffer(); for (String att : parts) { att = att.trim(); Attribute a = instanceInfo.attribute(att); if (a == null) { throw new Exception("I can't find the requested attribute '" + att + "' in the incoming instances."); } indexList.append(",").append(a.index() + 1); } String result = indexList.toString(); result = result.substring(1, result.length()); m_selectedRange = new Range(result); m_selectedRange.setUpper(instanceInfo.numAttributes() - 1); } boolean hasNominal = false; boolean hasString = false; boolean hasNumeric = false; boolean hasDate = false; for (int i = 0; i < instanceInfo.numAttributes(); i++) { if (m_selectedRange.isInRange(i)) { if (instanceInfo.attribute(i).isNominal()) { hasNominal = true; } else if (instanceInfo.attribute(i).isString()) { hasString = true; } else if (instanceInfo.attribute(i).isDate()) { hasDate = true; } else if (instanceInfo.attribute(i).isNumeric()) { hasNumeric = true; } } } if (hasNominal || hasString) { if (m_resolvedNominalStringConstant == null || m_resolvedNominalStringConstant.length() == 0) { if (m_resolvedNumericConstant != null && m_resolvedNumericConstant.length() > 0) { // use the supplied numeric constant as a nominal value m_resolvedNominalStringConstant = "" + m_resolvedNumericConstant; } else { throw new Exception("Data contains 
nominal/string attributes and no " + "replacement constant has been supplied"); } } } if (hasNumeric) { if (m_numericConstant == null || m_numericConstant.length() == 0) { if (m_resolvedNominalStringConstant != null && m_resolvedNominalStringConstant.length() > 0) { // use the supplied nominal constant as numeric replacement // value (if we can parse it as a number) try { Double.parseDouble(m_resolvedNominalStringConstant); m_resolvedNumericConstant = m_resolvedNominalStringConstant; } catch (NumberFormatException e) { throw new Exception( "Data contains numeric attributes and no numeric " + "constant has been supplied. Unable to parse nominal " + "constant as a number either."); } } else { throw new Exception("Data contains numeric attributes and no " + "replacement constant has been supplied"); } } try { m_numericConstVal = Double.parseDouble(m_resolvedNumericConstant); } catch (NumberFormatException e) { throw new Exception("Unable to parse numeric constant"); } } if (hasDate) { if (m_resolvedDateConstant == null || m_resolvedDateConstant.length() == 0) { throw new Exception( "Data contains date attributes and no replacement constant has been " + "supplied"); } SimpleDateFormat sdf = new SimpleDateFormat(m_resolvedDateFormat); Date d = sdf.parse(m_resolvedDateConstant); m_dateConstVal = d.getTime(); } Instances outputFormat = new Instances(instanceInfo, 0); // check for nominal attributes and add the supplied constant to the // list of legal values (if necessary) ArrayList updatedNoms = new ArrayList(); for (int i = 0; i < instanceInfo.numAttributes(); i++) { if (i != instanceInfo.classIndex() && m_selectedRange.isInRange(i)) { Attribute temp = instanceInfo.attribute(i); if (temp.isNominal()) { if (temp.indexOfValue(m_resolvedNominalStringConstant) < 0) { List values = new ArrayList(); values.add(m_resolvedNominalStringConstant); for (int j = 0; j < temp.numValues(); j++) { values.add(temp.value(j)); } Attribute newAtt = new Attribute(temp.name(), values); 
newAtt.setWeight(temp.weight()); updatedNoms.add(newAtt); } } } } if (updatedNoms.size() > 0) { int nomCount = 0; ArrayList atts = new ArrayList(); for (int i = 0; i < instanceInfo.numAttributes(); i++) { if (i != instanceInfo.classIndex() && m_selectedRange.isInRange(i)) { if (instanceInfo.attribute(i).isNominal()) { atts.add(updatedNoms.get(nomCount++)); } else { atts.add((Attribute) instanceInfo.attribute(i).copy()); } } else { atts.add((Attribute) instanceInfo.attribute(i).copy()); } } outputFormat = new Instances(instanceInfo.relationName(), atts, 0); outputFormat.setClassIndex(getInputFormat().classIndex()); } setOutputFormat(outputFormat); return true; } @Override public boolean input(Instance inst) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } double[] vals = new double[inst.numAttributes()]; for (int i = 0; i < inst.numAttributes(); i++) { if (inst.isMissing(i) && m_selectedRange.isInRange(i)) { if (i != inst.classIndex()) { if (inst.attribute(i).isDate()) { vals[i] = m_dateConstVal; } else if (inst.attribute(i).isNumeric()) { vals[i] = m_numericConstVal; } else if (inst.attribute(i).isNominal()) { // vals[i] = inst.attribute(i).numValues(); int temp = inst.attribute(i).indexOfValue( m_resolvedNominalStringConstant); // vals[i] = (temp >= 0) ? temp : inst.attribute(i).numValues(); vals[i] = (temp >= 0) ? temp : 0; } else if (inst.attribute(i).isString()) { // a bit of a hack here to try and detect if we're running in // streaming // mode or batch mode. 
If the string attribute has only one value in // the // header then it is likely that we're running in streaming mode // (where only one // value, the current instance's value, is maintained in memory) if (inst.attribute(i).numValues() <= 1) { outputFormatPeek().attribute(i).setStringValue( m_resolvedNominalStringConstant); vals[i] = 0; } else { vals[i] = outputFormatPeek().attribute(i).addStringValue( m_resolvedNominalStringConstant); } } else { vals[i] = inst.value(i); } } else { vals[i] = inst.value(i); } } else { if (m_selectedRange.isInRange(i)) { // in range but not missing if (inst.attribute(i).isString()) { // a bit of a hack here to try and detect if we're running in // streaming // mode or batch mode. If the string attribute has only one value in // the // header then it is likely that we're running in streaming mode // (where only one // value, the current instance's value, is maintained in memory) if (inst.attribute(i).numValues() <= 1) { outputFormatPeek().attribute(i).setStringValue( inst.stringValue(i)); } else { outputFormatPeek().attribute(i).addStringValue( inst.stringValue(i)); } vals[i] = outputFormatPeek().attribute(i).indexOfValue( inst.stringValue(i)); } else if (inst.attribute(i).isNominal() && i != inst.classIndex()) { vals[i] = inst.value(i) + 1; } else { vals[i] = inst.value(i); } } else { // missing but not in range vals[i] = inst.value(i); } } } Instance newInst = null; if (inst instanceof SparseInstance) { newInst = new SparseInstance(inst.weight(), vals); } else { newInst = new DenseInstance(inst.weight(), vals); } newInst.setDataset(getOutputFormat()); /* * copyValues(newInst, false, inst.dataset(), getOutputFormat()); * newInst.setDataset(getOutputFormat()); */ push(newInst, false); // No need to copy return true; } @Override public void setEnvironment(Environment env) { m_env = env; } /** * Returns the revision string. 
* * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 12037 $"); } /** * Main method for testing this class. * * @param args should contain arguments to the filter: use -h for help */ public static void main(String[] args) { runFilter(new ReplaceMissingWithUserConstant(), args); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy