weka.core.converters.CSVLoader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;

/**
  
 * Reads a source that is in comma separated format
 * (the default). One can also change the column separator from comma to tab or
 * another character, specify string enclosures, specify whether a header row is
 * present or not and specify which attributes are to be forced to be nominal or
 * date. Can operate in batch or incremental mode. In batch mode, a buffer is
 * used to process a fixed number of rows in memory at any one time and the data
 * is dumped to a temporary file. This allows the legal values for nominal
 * attributes to be automatically determined. The final ARFF file is produced in
 * a second pass over the temporary file using the structure determined on the
 * first pass. In incremental mode, the first buffer full of rows is used to
 * determine the structure automatically. Following this all rows are read and
 * output incrementally. An error will occur if a row containing nominal values
 * not seen in the initial buffer is encountered. In this case, the size of the
 * initial buffer can be increased, or the user can explicitly provide the legal
 * values of all nominal attributes using the -L (setNominalLabelSpecs) option.
 * <p/>
 * 
 * Valid options are:
 * 

 * 
 * 
 * -H
 *  No header row present in the data.
 * 
 * 
 *  * -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -L <nominal label spec>
 *  Optional specification of legal labels for nominal
 *  attributes. May be specified multiple times.
 *  Batch mode can determine this
 *  automatically (and so can incremental mode if
 *  the first in memory buffer load of instances
 *  contains an example of each legal value). The
 *  spec contains two parts separated by a ":". The
 *  first part can be a range of attribute indexes or
 *  a comma-separated list of attribute names; the
 *  second part is a comma-separated list of labels. E.g
 *  "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
 * 
 * 
 *  * -S <range>
 *  The range of attributes to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -D <range>
 *  The range of attributes to force type to be DATE.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
 * 
 *  * -format <date format>
 *  The date formatting string to use to parse date values.
 *  (default: "yyyy-MM-dd'T'HH:mm:ss")
 * 
 * 
 *  * -M <str>
 *  The string representing a missing value.
 *  (default: ?)
 * 
 * 
 *  * -F <separator>
 *  The field separator to be used.
 *  '\t' can be used as well.
 *  (default: ',')
 * 
 * 
 *  * -E <enclosures>
 *  The enclosure character(s) to use for strings.
 *  Specify as a comma separated list (e.g. ",') (default: ",')
 * 
 * 
 *  * -B <num>
 *  The size of the in memory buffer (in rows).
 *  (default: 100)
 * 
 * 
 
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 10920 $
 */
public class CSVLoader extends AbstractFileLoader implements BatchConverter,
  IncrementalConverter, OptionHandler {

  /** For serialization */
  private static final long serialVersionUID = -1300595850715808438L;

  /** the file extension. */
  public static String FILE_EXTENSION = ".csv";

  /** The reader for the data. */
  protected transient BufferedReader m_sourceReader;

  /** Tokenizer for the data. */
  protected transient StreamTokenizer m_st;

  protected transient File m_tempFile;
  protected transient PrintWriter m_dataDumper;

  /** the field separator. */
  protected String m_FieldSeparator = ",";

  /** The placeholder for missing values. */
  protected String m_MissingValue = "?";

  /** The range of attributes to force to type nominal. */
  protected Range m_NominalAttributes = new Range();

  /** The user-supplied legal nominal values - each entry in the list is a spec */
  protected List m_nominalLabelSpecs = new ArrayList();

  /** The range of attributes to force to type string. */
  protected Range m_StringAttributes = new Range();

  /** The range of attributes to force to type date */
  protected Range m_dateAttributes = new Range();

  /** The formatting string to use to parse dates */
  protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss";

  /** The formatter to use on dates */
  protected SimpleDateFormat m_formatter;

  /** whether the csv file contains a header row with att names */
  protected boolean m_noHeaderRow = false;

  /** enclosure character(s) to use for strings */
  protected String m_Enclosures = "\",\'";

  /** The in memory row buffer */
  protected List m_rowBuffer;

  /** The maximum number of rows to hold in memory at any one time */
  protected int m_bufferSize = 100;

  /** Lookup for nominal values */
  protected Map> m_nominalVals;

  /** Reader used to process and output data incrementally */
  protected ArffReader m_incrementalReader;

  protected transient int m_rowCount;

  /**
   * Array holding field separator and enclosures to pass through to the
   * underlying ArffReader
   */
  protected String[] m_fieldSeparatorAndEnclosures;

  /**
   * Returns a string describing this loader.
   * 
   * @return a description of the loader suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // Each fragment ends with a space so that words do not run together
    // where two fragments are joined.
    return "Reads a source that is in comma separated format (the default). "
      + "One can also change the column separator from comma to tab or "
      + "another character, specify string enclosures, specify whether a "
      + "header row is present or not and specify which attributes are to be "
      + "forced to be nominal or date. Can operate in batch or incremental mode. "
      + "In batch mode, a buffer is used to process a fixed number of rows in "
      + "memory at any one time and the data is dumped to a temporary file. This "
      + "allows the legal values for nominal attributes to be automatically "
      + "determined. The final ARFF file is produced in a second pass over the "
      + "temporary file using the structure determined on the first pass. In "
      + "incremental mode, the first buffer full of rows is used to determine "
      + "the structure automatically. Following this all rows are read and output "
      + "incrementally. An error will occur if a row containing nominal values not "
      + "seen in the initial buffer is encountered. In this case, the size of the "
      + "initial buffer can be increased, or the user can explicitly provide the "
      + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) "
      + "option.";
  }

  /**
   * Creates a loader and marks its retrieval mode as NONE.
   */
  public CSVLoader() {
    // Nothing has been read so far, so no retrieval mode is active.
    this.setRetrieval(NONE);
  }

  /**
   * Returns the value of {@code FILE_EXTENSION}.
   * 
   * @return the file extension
   */
  @Override
  public String getFileExtension() {
    final String extension = FILE_EXTENSION;
    return extension;
  }

  /**
   * Returns a one-element array holding the result of
   * {@link #getFileExtension()}.
   * 
   * @return the file extensions
   */
  @Override
  public String[] getFileExtensions() {
    final String[] extensions = new String[1];
    extensions[0] = getFileExtension();
    return extensions;
  }

  /**
   * Returns a short description of the file type.
   * 
   * @return the description text
   */
  @Override
  public String getFileDescription() {
    final String description = "CSV data files";
    return description;
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    final String revision = "$Revision: 10920 $";
    return revision;
  }

  /**
   * Tip text for the "no header row present" property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String noHeaderRowPresentTipText() {
    final String tip = "First row of data does not contain attribute names";
    return tip;
  }

  /**
   * Records whether the data lacks a header row.
   * 
   * @param b true if the data has no header row
   */
  public void setNoHeaderRowPresent(boolean b) {
    this.m_noHeaderRow = b;
  }

  /**
   * Reports whether the data lacks a header row.
   * 
   * @return true if the data has no header row
   */
  public boolean getNoHeaderRowPresent() {
    final boolean noHeader = m_noHeaderRow;
    return noHeader;
  }

  /**
   * Stores the string that stands for a missing value.
   * 
   * @param value the placeholder
   */
  public void setMissingValue(String value) {
    this.m_MissingValue = value;
  }

  /**
   * Returns the string that stands for a missing value.
   * 
   * @return the placeholder
   */
  public String getMissingValue() {
    final String placeholder = m_MissingValue;
    return placeholder;
  }

  /**
   * Tip text for the missing value property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String missingValueTipText() {
    final String tip = "The placeholder for missing values, default is '?'.";
    return tip;
  }

  /**
   * Passes the given range string on to the range of attributes that are
   * forced to type string.
   * 
   * @param value the range
   */
  public void setStringAttributes(String value) {
    this.m_StringAttributes.setRanges(value);
  }

  /**
   * Returns the range string of the attributes that are forced to type
   * string.
   * 
   * @return the range
   */
  public String getStringAttributes() {
    final String ranges = m_StringAttributes.getRanges();
    return ranges;
  }

  /**
   * Tip text for the string attributes property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String stringAttributesTipText() {
    final StringBuilder tip = new StringBuilder();
    tip.append("The range of attributes to force to be of type STRING, example ");
    tip.append("ranges: 'first-last', '1,4,7-14,50-last'.");
    return tip.toString();
  }

  /**
   * Passes the given range string on to the range of attributes that are
   * forced to type nominal.
   * 
   * @param value the range
   */
  public void setNominalAttributes(String value) {
    this.m_NominalAttributes.setRanges(value);
  }

  /**
   * Returns the range string of the attributes that are forced to type
   * nominal.
   * 
   * @return the range
   */
  public String getNominalAttributes() {
    final String ranges = m_NominalAttributes.getRanges();
    return ranges;
  }

  /**
   * Tip text for the nominal attributes property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalAttributesTipText() {
    final StringBuilder tip = new StringBuilder();
    tip.append("The range of attributes to force to be of type NOMINAL, example ");
    tip.append("ranges: 'first-last', '1,4,7-14,50-last'.");
    return tip.toString();
  }

  /**
   * Stores the pattern used for parsing date values and discards the current
   * formatter.
   * 
   * @param value the format to use.
   */
  public void setDateFormat(String value) {
    // Drop the existing formatter; presumably a new one is built from the
    // new pattern elsewhere - that code is outside this part of the file.
    this.m_formatter = null;
    this.m_dateFormat = value;
  }

  /**
   * Returns the pattern used for parsing date values.
   * 
   * @return the format to use for parsing date values.
   */
  public String getDateFormat() {
    final String pattern = m_dateFormat;
    return pattern;
  }

  /**
   * Tip text for the date format property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateFormatTipText() {
    final String tip = "The format to use for parsing date values.";
    return tip;
  }

  /**
   * Passes the given range string on to the range of attributes that are
   * forced to type date.
   * 
   * @param value the range
   */
  public void setDateAttributes(String value) {
    this.m_dateAttributes.setRanges(value);
  }

  /**
   * Returns the range string of the attributes that are forced to type date.
   * 
   * @return the range.
   */
  public String getDateAttributes() {
    final String ranges = m_dateAttributes.getRanges();
    return ranges;
  }

  /**
   * Returns the tip text for the date attributes property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dateAttributesTipText() {
    // The example range is written without embedded whitespace, matching the
    // examples given by the string and nominal attribute tip texts.
    return "The range of attributes to force to type DATE, example "
      + "ranges: 'first-last', '1,4,7-14,50-last'.";
  }

  /**
   * Tip text for the enclosure characters property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String enclosureCharactersTipText() {
    final String tip = "The characters to use as enclosures for strings. E.g. \",'";
    return tip;
  }

  /**
   * Stores the character(s) to use/recognize as string enclosures.
   * 
   * @param enclosure the characters to use as string enclosures
   */
  public void setEnclosureCharacters(String enclosure) {
    this.m_Enclosures = enclosure;
  }

  /**
   * Returns the character(s) to use/recognize as string enclosures.
   * 
   * @return the characters to use as string enclosures
   */
  public String getEnclosureCharacters() {
    final String enclosures = m_Enclosures;
    return enclosures;
  }

  /**
   * Sets the character used as column separator.
   * <p>
   * The value is first passed through {@code Utils.unbackQuoteChars}
   * (presumably turning an escaped form such as backslash-t into the actual
   * character - confirm against Utils). If the result is not exactly one
   * character long, the separator is reset to "," and a warning is printed to
   * System.err.
   * 
   * @param value the character to use
   */
  public void setFieldSeparator(String value) {
    m_FieldSeparator = Utils.unbackQuoteChars(value);
    if (m_FieldSeparator.length() != 1) {
      m_FieldSeparator = ",";
      // The backslash is escaped so the message shows the two characters
      // backslash and t rather than an actual TAB.
      System.err
        .println("Field separator can only be a single character (exception being '\\t'), "
          + "defaulting back to '" + m_FieldSeparator + "'!");
    }
  }

  /**
   * Returns the column separator after passing it through
   * {@code Utils.backQuoteChars}.
   * 
   * @return the character to use
   */
  public String getFieldSeparator() {
    final String quoted = Utils.backQuoteChars(m_FieldSeparator);
    return quoted;
  }

  /**
   * Tip text for the field separator property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String fieldSeparatorTipText() {
    final String tip =
      "The character to use as separator for the columns/fields (use '\\t' for TAB).";
    return tip;
  }

  /**
   * Stores the buffer size, i.e. the number of rows to load and process in
   * memory at any one time.
   * 
   * @param buff the buffer size (number of rows)
   */
  public void setBufferSize(int buff) {
    this.m_bufferSize = buff;
  }

  /**
   * Returns the buffer size, i.e. the number of rows to load and process in
   * memory at any one time.
   * 
   * @return the buffer size (number of rows)
   */
  public int getBufferSize() {
    final int rows = m_bufferSize;
    return rows;
  }

  /**
   * Tip text for the buffer size property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String bufferSizeTipText() {
    final String tip = "The number of rows to process in memory at any one time.";
    return tip;
  }

  /**
   * Replaces the stored nominal label specifications with the string form of
   * each element of the given array.
   * 
   * @param specs an array of label specifications
   */
  public void setNominalLabelSpecs(Object[] specs) {
    // Existing specs are discarded before the new ones are added.
    m_nominalLabelSpecs.clear();
    for (int i = 0; i < specs.length; i++) {
      m_nominalLabelSpecs.add(specs[i].toString());
    }
  }

  /**
   * Returns the stored nominal label specifications as an array.
   * 
   * @return an array of label specifications
   */
  public Object[] getNominalLabelSpecs() {
    final String[] arrayType = new String[0];
    return m_nominalLabelSpecs.toArray(arrayType);
  }

  /**
   * Returns the tip text for the nominal label specs property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalLabelSpecsTipText() {
    return "Optional specification of legal labels for nominal "
      + "attributes. May be specified multiple times. "
      + "Batch mode can determine this "
      + "automatically (and so can incremental mode if "
      + "the first in memory buffer load of instances "
      + "contains an example of each legal value). The "
      + "spec contains two parts separated by a \":\". The "
      + "first part can be a range of attribute indexes or "
      + "a comma-separated list of attribute names; the "
      + "second part is a comma-separated list of labels. E.g "
      + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\"";
  }

  @Override
  public Enumeration