weka.core.converters.CSVLoader Maven / Gradle / Ivy

Go to download
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StreamTokenizer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 *  Reads a source that is in comma separated or tab
 * separated format. Assumes that the first row in the file determines the
 * number of and names of the attributes.
 * 
 * 
 * 
 *  Valid options are:
 * 

 * 
 * 
 * -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -S <range>
 *  The range of attribute to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -D <range>
 *  The range of attribute to force type to be DATE.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -format <date format>
 *  The date formatting string to use to parse date values.
 *  (default: "yyyy-MM-dd'T'HH:mm:ss")
 * 
 * 
 *  * -M <str>
 *  The string representing a missing value.
 *  (default: ?)
 * 
 * 
 *  * -E <enclosures>
 *  The enclosure character(s) to use for strings.
 *  Specify as a comma separated list (e.g. ",' (default: '"')
 * 
 * 
 * 
 * 
 * @author Mark Hall ([email protected])
 * @version $Revision: 10372 $
 * @see Loader
 */
public class CSVLoader extends AbstractFileLoader implements BatchConverter,
  OptionHandler {

  /** for serialization. */
  static final long serialVersionUID = 5607529739745491340L;

  /** the file extension. */
  public static String FILE_EXTENSION = ".csv";

  /**
   * A list of hash tables for accumulating nominal values during parsing.
   */
  protected FastVector m_cumulativeStructure;

  /**
   * Holds instances accumulated so far.
   */
  protected FastVector m_cumulativeInstances;

  /** The reader for the data. */
  protected transient BufferedReader m_sourceReader;

  /** Tokenizer for the data. */
  protected transient StreamTokenizer m_st;

  /** The range of attributes to force to type nominal. */
  protected Range m_NominalAttributes = new Range();

  /** The range of attributes to force to type string. */
  protected Range m_StringAttributes = new Range();

  /** The range of attributes to force to type date */
  protected Range m_dateAttributes = new Range();

  /** The formatting string to use to parse dates */
  protected String m_dateFormat = "";

  /** The formatter to use on dates */
  protected SimpleDateFormat m_formatter;

  /** The placeholder for missing values. */
  protected String m_MissingValue = "?";

  /** whether the first row has been read. */
  protected boolean m_FirstCheck;

  /** enclosure character(s) to use for strings */
  protected String m_Enclosures = "\",\'";

  /**
   * default constructor.
   */
  public CSVLoader() {
    // No instances retrieved yet
    setRetrieval(NONE);
  }

  /**
   * Get the file extension used for arff files.
   * 
   * @return the file extension
   */
  @Override
  public String getFileExtension() {
    return FILE_EXTENSION;
  }

  /**
   * Returns a description of the file type.
   * 
   * @return a short file description
   */
  @Override
  public String getFileDescription() {
    return "CSV data files";
  }

  /**
   * Gets all the file extensions used for this type of file.
   * 
   * @return the file extensions
   */
  @Override
  public String[] getFileExtensions() {
    return new String[] { getFileExtension() };
  }

  /**
   * Returns a string describing this attribute evaluator.
   * 
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Reads a source that is in comma separated or tab separated format. "
      + "Assumes that the first row in the file determines the number of "
      + "and names of the attributes.";
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration listOptions() {
    Vector result = new Vector();

    result.addElement(new Option(
      "\tThe range of attributes to force type to be NOMINAL.\n"
        + "\t'first' and 'last' are accepted as well.\n"
        + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
        + "\t(default: -none-)", "N", 1, "-N "));

    result.addElement(new Option(
      "\tThe range of attribute to force type to be STRING.\n"
        + "\t'first' and 'last' are accepted as well.\n"
        + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
        + "\t(default: -none-)", "S", 1, "-S "));

    result.add(new Option(
      "\tThe range of attribute to force type to be DATE.\n"
        + "\t'first' and 'last' are accepted as well.\n"
        + "\tExamples: \"first-last\", \"1,4,5-27,50-last\"\n"
        + "\t(default: -none-)", "D", 1, "-D "));

    result.add(new Option(
      "\tThe date formatting string to use to parse date values.\n"
        + "\t(default: \"yyyy-MM-dd'T'HH:mm:ss\")", "format", 1,
      "-format "));

    result.addElement(new Option("\tThe string representing a missing value.\n"
      + "\t(default: ?)", "M", 1, "-M "));

    result
      .addElement(new Option(
        "\tThe enclosure character(s) to use for strings.\n"
          + "\tSpecify as a comma separated list (e.g. \",'"
          + " (default: \",')", "E", 1, "-E "));

    return result.elements();
  }

  /**
   * Parses a given list of options.
   * 
   * 
   *  Valid options are:
   * 

   * 
   * 
   * -N <range>
   *  The range of attributes to force type to be NOMINAL.
   *  'first' and 'last' are accepted as well.
   *  Examples: "first-last", "1,4,5-27,50-last"
   *  (default: -none-)
   * 
   * 
   *    * -S <range>
   *  The range of attribute to force type to be STRING.
   *  'first' and 'last' are accepted as well.
   *  Examples: "first-last", "1,4,5-27,50-last"
   *  (default: -none-)
   * 
   * 
   *    * -D <range>
   *  The range of attribute to force type to be DATE.
   *  'first' and 'last' are accepted as well.
   *  Examples: "first-last", "1,4,5-27,50-last"
   *  (default: -none-)
   * 
   * 
   *    * -format <date format>
   *  The date formatting string to use to parse date values.
   *  (default: "yyyy-MM-dd'T'HH:mm:ss")
   * 
   * 
   *    * -M <str>
   *  The string representing a missing value.
   *  (default: ?)
   * 
   * 
   *    * -E <enclosures>
   *  The enclosure character(s) to use for strings.
   *  Specify as a comma separated list (e.g. ",' (default: '"')
   * 
   * 
   * 
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    String tmpStr;

    tmpStr = Utils.getOption('N', options);
    if (tmpStr.length() != 0)
      setNominalAttributes(tmpStr);
    else
      setNominalAttributes("");

    tmpStr = Utils.getOption('S', options);
    if (tmpStr.length() != 0)
      setStringAttributes(tmpStr);
    else
      setStringAttributes("");

    tmpStr = Utils.getOption('M', options);
    if (tmpStr.length() != 0)
      setMissingValue(tmpStr);
    else
      setMissingValue("?");

    tmpStr = Utils.getOption('D', options);
    if (tmpStr.length() > 0) {
      setDateAttributes(tmpStr);
    }
    tmpStr = Utils.getOption("format", options);
    if (tmpStr.length() > 0) {
      setDateFormat(tmpStr);
    }
    tmpStr = Utils.getOption("E", options);
    if (tmpStr.length() > 0) {
      setEnclosureCharacters(tmpStr);
    }
  }

  /**
   * Gets the current settings of the Classifier.
   * 
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    Vector result;

    result = new Vector();

    if (getNominalAttributes().length() > 0) {
      result.add("-N");
      result.add(getNominalAttributes());
    }

    if (getStringAttributes().length() > 0) {
      result.add("-S");
      result.add(getStringAttributes());
    }

    if (getDateAttributes().length() > 0) {
      result.add("-D");
      result.add(getDateAttributes());
      result.add("-format");
      result.add(getDateFormat());
    }

    result.add("-M");
    result.add(getMissingValue());

    result.add("-E");
    result.add(getEnclosureCharacters());

    return result.toArray(new String[result.size()]);
  }

  /**
   * Sets the attribute range to be forced to type nominal.
   * 
   * @param value the range
   */
  public void setNominalAttributes(String value) {
    m_NominalAttributes.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type nominal.
   * 
   * @return the range
   */
  public String getNominalAttributes() {
    return m_NominalAttributes.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalAttributesTipText() {
    return "The range of attributes to force to be of type NOMINAL, example "
      + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }

  /**
   * Sets the attribute range to be forced to type string.
   * 
   * @param value the range
   */
  public void setStringAttributes(String value) {
    m_StringAttributes.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type string.
   * 
   * @return the range
   */
  public String getStringAttributes() {
    return m_StringAttributes.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String stringAttributesTipText() {
    return "The range of attributes to force to be of type STRING, example "
      + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }

  /**
   * Set the attribute range to be forced to type date.
   * 
   * @param value the range
   */
  public void setDateAttributes(String value) {
    m_dateAttributes.setRanges(value);
  }

  /**
   * Returns the current attribute range to be forced to type date.
   * 
   * @return the range.
   */
  public String getDateAttributes() {
    return m_dateAttributes.getRanges();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateAttributesTipText() {
    return "The range of attributes to force to type STRING, example "
      + "ranges: 'first-last', '1,4,7-14, 50-last'.";
  }

  /**
   * Set the format to use for parsing date values.
   * 
   * @param value the format to use.
   */
  public void setDateFormat(String value) {
    m_dateFormat = value;
    m_formatter = null;
  }

  /**
   * Get the format to use for parsing date values.
   * 
   * @return the format to use for parsing date values.
   * 
   */
  public String getDateFormat() {
    return m_dateFormat;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateFormatTipText() {
    return "The format to use for parsing date values.";
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String enclosureCharactersTipText() {
    return "The characters to use as enclosures for strings. E.g. \",'";
  }

  /**
   * Set the character(s) to use/recognize as string enclosures
   * 
   * @param enclosure the characters to use as string enclosures
   */
  public void setEnclosureCharacters(String enclosure) {
    m_Enclosures = enclosure;
  }

  /**
   * Get the character(s) to use/recognize as string enclosures
   * 
   * @return the characters to use as string enclosures
   */
  public String getEnclosureCharacters() {
    return m_Enclosures;
  }

  /**
   * Sets the placeholder for missing values.
   * 
   * @param value the placeholder
   */
  public void setMissingValue(String value) {
    m_MissingValue = value;
  }

  /**
   * Returns the current placeholder for missing values.
   * 
   * @return the placeholder
   */
  public String getMissingValue() {
    return m_MissingValue;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String missingValueTipText() {
    return "The placeholder for missing values, default is '?'.";
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied Stream object.
   * 
   * @param input the input stream
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(InputStream input) throws IOException {
    m_structure = null;
    m_sourceFile = null;
    m_File = null;
    m_FirstCheck = true;

    m_sourceReader = new BufferedReader(new InputStreamReader(input));
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied File object.
   * 
   * @param file the source file.
   * @exception IOException if an error occurs
   */
  @Override
  public void setSource(File file) throws IOException {
    super.setSource(file);
  }

  /**
   * Determines and returns (if possible) the structure (internally the header)
   * of the data set as an empty set of instances.
   * 
   * @return the structure of the data set as an empty set of Instances
   * @exception IOException if an error occurs
   */
  @Override
  public Instances getStructure() throws IOException {
    if ((m_sourceFile == null) && (m_sourceReader == null)) {
      throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
      try {
        m_st = new StreamTokenizer(m_sourceReader);
        initTokenizer(m_st);
        readStructure(m_st);
      } catch (FileNotFoundException ex) {
      }
    }

    return m_structure;
  }

  /**
   * reads the structure.
   * 
   * @param st the stream tokenizer to read from
   * @throws IOException if reading fails
   */
  private void readStructure(StreamTokenizer st) throws IOException {
    readHeader(st);
  }

  /**
   * Return the full data set. If the structure hasn't yet been determined by a
   * call to getStructure then method should do so before processing the rest of
   * the data set.
   * 
   * @return the structure of the data set as an empty set of Instances
   * @exception IOException if there is no source or parsing fails
   */
  @Override
  public Instances getDataSet() throws IOException {
    if ((m_sourceFile == null) && (m_sourceReader == null)) {
      throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
      getStructure();
    }

    if (m_st == null) {
      m_st = new StreamTokenizer(m_sourceReader);
      initTokenizer(m_st);
    }

    m_st.ordinaryChar(',');
    m_st.ordinaryChar('\t');

    m_cumulativeStructure = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      m_cumulativeStructure.addElement(new Hashtable());
    }

    m_cumulativeInstances = new FastVector();
    FastVector current;
    while ((current = getInstance(m_st)) != null) {
      m_cumulativeInstances.addElement(current);
    }

    FastVector atts = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      String attname = m_structure.attribute(i).name();
      Hashtable tempHash = ((Hashtable) m_cumulativeStructure.elementAt(i));
      if (tempHash.size() == 0) {
        if (m_dateAttributes.isInRange(i)) {
          atts.addElement(new Attribute(attname, m_dateFormat));
        } else {
          atts.addElement(new Attribute(attname));
        }
      } else {
        if (m_StringAttributes.isInRange(i)) {
          atts.addElement(new Attribute(attname, (FastVector) null));
        } else {
          FastVector values = new FastVector(tempHash.size());
          // add dummy objects in order to make the FastVector's size ==
          // capacity
          for (int z = 0; z < tempHash.size(); z++) {
            values.addElement("dummy");
          }
          Enumeration e = tempHash.keys();
          while (e.hasMoreElements()) {
            Object ob = e.nextElement();
            // if (ob instanceof Double) {
            int index = ((Integer) tempHash.get(ob)).intValue();
            String s = ob.toString();
            if (s.startsWith("'") || s.startsWith("\""))
              s = s.substring(1, s.length() - 1);
            values.setElementAt(new String(s), index);
            // }
          }
          atts.addElement(new Attribute(attname, values));
        }
      }
    }

    // make the instances
    String relationName;
    if (m_sourceFile != null)
      relationName = (m_sourceFile.getName())
        .replaceAll("\\.[cC][sS][vV]$", "");
    else
      relationName = "stream";
    Instances dataSet = new Instances(relationName, atts,
      m_cumulativeInstances.size());

    for (int i = 0; i < m_cumulativeInstances.size(); i++) {
      current = ((FastVector) m_cumulativeInstances.elementAt(i));
      double[] vals = new double[dataSet.numAttributes()];
      for (int j = 0; j < current.size(); j++) {
        Object cval = current.elementAt(j);
        if (cval instanceof String) {
          if (((String) cval).compareTo(m_MissingValue) == 0) {
            vals[j] = Instance.missingValue();
          } else {
            if (dataSet.attribute(j).isString()) {
              vals[j] = dataSet.attribute(j).addStringValue((String) cval);
            } else if (dataSet.attribute(j).isNominal()) {
              // find correct index
              Hashtable lookup = (Hashtable) m_cumulativeStructure.elementAt(j);
              int index = ((Integer) lookup.get(cval)).intValue();
              vals[j] = index;
            } else {
              throw new IllegalStateException(
                "Wrong attribute type at position " + (i + 1) + "!!!");
            }
          }
        } else if (dataSet.attribute(j).isNominal()) {
          // find correct index
          Hashtable lookup = (Hashtable) m_cumulativeStructure.elementAt(j);
          int index = ((Integer) lookup.get(cval)).intValue();
          vals[j] = index;
        } else if (dataSet.attribute(j).isString()) {
          vals[j] = dataSet.attribute(j).addStringValue("" + cval);
        } else {
          vals[j] = ((Double) cval).doubleValue();
        }
      }
      dataSet.add(new Instance(1.0, vals));
    }
    m_structure = new Instances(dataSet, 0);
    setRetrieval(BATCH);
    m_cumulativeStructure = null; // conserve memory
    m_cumulativeInstances = null;

    // close the stream
    m_sourceReader.close();

    return dataSet;
  }

  /**
   * CSVLoader is unable to process a data set incrementally.
   * 
   * @param structure ignored
   * @return never returns without throwing an exception
   * @exception IOException always. CSVLoader is unable to process a data set
   *              incrementally.
   */
  @Override
  public Instance getNextInstance(Instances structure) throws IOException {
    throw new IOException("CSVLoader can't read data sets incrementally.");
  }

  /**
   * Attempts to parse a line of the data set.
   * 
   * @param tokenizer the tokenizer
   * @return a FastVector containg String and Double objects representing the
   *         values of the instance.
   * @exception IOException if an error occurs
   * 
   *                 * 
   *    private_normal_behavior
   *      requires: tokenizer != null;
   *      ensures: \result  != null;
   *  also
   *    private_exceptional_behavior
   *      requires: tokenizer == null
   *                || (* unsucessful parse *);
   *      signals: (IOException);
   * 
   * 
   */
  private FastVector getInstance(StreamTokenizer tokenizer) throws IOException {

    FastVector current = new FastVector();

    // Check if end of file reached.
    ConverterUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      return null;
    }
    boolean first = true;
    boolean wasSep;

    while (tokenizer.ttype != StreamTokenizer.TT_EOL
      && tokenizer.ttype != StreamTokenizer.TT_EOF) {

      // Get next token
      if (!first) {
        ConverterUtils.getToken(tokenizer);
      }

      if (tokenizer.ttype == ',' || tokenizer.ttype == '\t'
        || tokenizer.ttype == StreamTokenizer.TT_EOL) {
        current.addElement(m_MissingValue);
        wasSep = true;
      } else {
        wasSep = false;
        if (tokenizer.sval.equals(m_MissingValue)
          || tokenizer.sval.trim().length() == 0) {
          current.addElement(new String(m_MissingValue));
        } else {
          // try to parse as a number
          try {
            double val = Double.valueOf(tokenizer.sval).doubleValue();
            current.addElement(new Double(val));
          } catch (NumberFormatException e) {
            // otherwise assume its an enumerated value
            current.addElement(new String(tokenizer.sval));
          }
        }
      }

      if (!wasSep) {
        ConverterUtils.getToken(tokenizer);
      }
      first = false;
    }

    // check number of values read
    if (current.size() != m_structure.numAttributes()) {
      ConverterUtils.errms(tokenizer,
        "wrong number of values. Read " + current.size() + ", expected "
          + m_structure.numAttributes());
    }

    // check for structure update
    try {
      checkStructure(current);
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    return current;
  }

  /**
   * Checks the current instance against what is known about the structure of
   * the data set so far. If there is a nominal value for an attribute that was
   * beleived to be numeric then all previously seen values for this attribute
   * are stored in a Hashtable.
   * 
   * @param current a FastVector value
   * @exception Exception if an error occurs
   * 
   *                 * 
   *    private_normal_behavior
   *      requires: current != null;
   *  also
   *    private_exceptional_behavior
   *      requires: current == null
   *                || (* unrecognized object type in current *);
   *      signals: (Exception);
   * 
   * 
   */
  private void checkStructure(FastVector current) throws Exception {
    if (current == null) {
      throw new Exception("current shouldn't be null in checkStructure");
    }

    // initialize ranges, if necessary
    if (m_FirstCheck) {
      m_NominalAttributes.setUpper(current.size() - 1);
      m_StringAttributes.setUpper(current.size() - 1);
      m_dateAttributes.setUpper(current.size() - 1);
      m_FirstCheck = false;
    }

    for (int i = 0; i < current.size(); i++) {
      Object ob = current.elementAt(i);
      if ((ob instanceof String) || (m_NominalAttributes.isInRange(i))
        || (m_StringAttributes.isInRange(i)) || m_dateAttributes.isInRange(i)) {
        if (ob.toString().compareTo(m_MissingValue) == 0) {
          // do nothing
        } else {

          boolean notDate = true;
          if (m_dateAttributes.isInRange(i)) {
            // try to parse date string
            if (m_formatter == null) {
              m_formatter = new SimpleDateFormat(m_dateFormat);
            }

            try {
              long time = m_formatter.parse(ob.toString()).getTime();
              Double timeL = new Double(time);
              current.setElementAt(timeL, i);
              notDate = false;
            } catch (ParseException e) {
              notDate = true;
            }
          }

          if (notDate) {
            Hashtable tempHash = (Hashtable) m_cumulativeStructure.elementAt(i);
            if (!tempHash.containsKey(ob)) {
              // may have found a nominal value in what was previously thought
              // to
              // be a numeric variable.
              if (tempHash.size() == 0) {
                for (int j = 0; j < m_cumulativeInstances.size(); j++) {
                  FastVector tempUpdate = ((FastVector) m_cumulativeInstances
                    .elementAt(j));
                  Object tempO = tempUpdate.elementAt(i);
                  if (tempO instanceof String) {
                    // must have been a missing value
                  } else {
                    if (!tempHash.containsKey(tempO)) {
                      tempHash.put(new Double(((Double) tempO).doubleValue()),
                        new Integer(tempHash.size()));
                    }
                  }
                }
              }
              int newIndex = tempHash.size();
              tempHash.put(ob, new Integer(newIndex));
            }
          }
        }
      } else if (ob instanceof Double) {
        Hashtable tempHash = (Hashtable) m_cumulativeStructure.elementAt(i);
        if (tempHash.size() != 0) {
          if (!tempHash.containsKey(ob)) {
            int newIndex = tempHash.size();
            tempHash.put(new Double(((Double) ob).doubleValue()), new Integer(
              newIndex));
          }
        }
      } else {
        throw new Exception("Wrong object type in checkStructure!");
      }
    }
  }

  /**
   * Assumes the first line of the file contains the attribute names. Assumes
   * all attributes are String attributes (Reading the full data set with
   * getDataSet will establish the true structure).
   * 
   * @param tokenizer a StreamTokenizer value
   * @exception IOException if an error occurs
   * 
   *                 * 
   *    private_normal_behavior
   *      requires: tokenizer != null;
   *      modifiable: m_structure;
   *      ensures: m_structure != null;
   *  also
   *    private_exceptional_behavior
   *      requires: tokenizer == null
   *                || (* unsucessful parse *);
   *      signals: (IOException);
   * 
   * 
   */
  private void readHeader(StreamTokenizer tokenizer) throws IOException {

    FastVector attribNames = new FastVector();
    ConverterUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      ConverterUtils.errms(tokenizer, "premature end of file");
    }

    while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
      attribNames.addElement(new Attribute(tokenizer.sval, (FastVector) null));
      ConverterUtils.getToken(tokenizer);
    }
    String relationName;
    if (m_sourceFile != null)
      relationName = (m_sourceFile.getName())
        .replaceAll("\\.[cC][sS][vV]$", "");
    else
      relationName = "stream";
    m_structure = new Instances(relationName, attribNames, 0);
  }

  /**
   * Initializes the stream tokenizer.
   * 
   * @param tokenizer the tokenizer to initialize
   */
  private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars(0, (' ' - 1));
    tokenizer.wordChars(' ', '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.whitespaceChars('\t', '\t');
    tokenizer.commentChar('%');

    String[] parts = m_Enclosures.split(",");
    for (String e : parts) {
      if (e.length() > 1 || e.length() == 0) {
        throw new IllegalArgumentException(
          "Enclosures can only be single characters");
      }
      tokenizer.quoteChar(e.charAt(0));
    }

    tokenizer.eolIsSignificant(true);
  }

  /**
   * Resets the Loader ready to read a new data set or the same data set again.
   * 
   * @throws IOException if something goes wrong
   */
  @Override
  public void reset() throws IOException {
    m_structure = null;
    m_cumulativeStructure = null;
    m_cumulativeInstances = null;
    m_st = null;
    setRetrieval(NONE);

    if (m_File != null) {
      setFile(new File(m_File));
    }
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 10372 $");
  }

  /**
   * Main method.
   * 
   * @param args should contain the name of an input file.
   */
  public static void main(String[] args) {
    runFileLoader(new CSVLoader(), args);
  }
}