All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.converters.CSVLoader Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;

/**
  
 * Reads a source that is in comma separated format
 * (the default). One can also change the column separator from comma to tab or
 * another character, specify string enclosures, specify whether aheader row is
 * present or not and specify which attributes are to beforced to be nominal or
 * date. Can operate in batch or incremental mode. In batch mode, a buffer is
 * used to process a fixed number of rows in memory at any one time and the data
 * is dumped to a temporary file. This allows the legal values for nominal
 * attributes to be automatically determined. The final ARFF file is produced in
 * a second pass over the temporary file using the structure determined on the
 * first pass. In incremental mode, the first buffer full of rows is used to
 * determine the structure automatically. Following this all rows are read and
 * output incrementally. An error will occur if a row containing nominal values
 * not seen in the initial buffer is encountered. In this case, the size of the
 * initial buffer can be increased, or the user can explicitly provide the legal
 * values of all nominal attributes using the -L (setNominalLabelSpecs) option.
 * *
 * 

* * Valid options are: *

* *

 * -H
 *  No header row present in the data.
 * 
* *
 * -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
* *
 * -L <nominal label spec>
 *  Optional specification of legal labels for nominal
 *  attributes. May be specified multiple times.
 *  Batch mode can determine this
 *  automatically (and so can incremental mode if
 *  the first in memory buffer load of instances
 *  contains an example of each legal value). The
 *  spec contains two parts separated by a ":". The
 *  first part can be a range of attribute indexes or
 *  a comma-separated list off attruibute names; the
 *  second part is a comma-separated list of labels. E.g
 *  "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
 * 
* *
 * -S <range>
 *  The range of attribute to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
* *
 * -D <range>
 *  The range of attribute to force type to be DATE.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
 * 
* *
 * -format <date format>
 *  The date formatting string to use to parse date values.
 *  (default: "yyyy-MM-dd'T'HH:mm:ss")
 * 
* *
 * -M <str>
 *  The string representing a missing value.
 *  (default: ?)
 * 
* *
 * -F <separator>
 *  The field separator to be used.
 *  '\t' can be used as well.
 *  (default: ',')
 * 
* *
 * -E <enclosures>
 *  The enclosure character(s) to use for strings.
 *  Specify as a comma separated list (e.g. ",' (default: ",')
 * 
* *
 * -B <num>
 *  The size of the in memory buffer (in rows).
 *  (default: 100)
 * 
* * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 10920 $ */ public class CSVLoader extends AbstractFileLoader implements BatchConverter, IncrementalConverter, OptionHandler { /** For serialization */ private static final long serialVersionUID = -1300595850715808438L; /** the file extension. */ public static String FILE_EXTENSION = ".csv"; /** The reader for the data. */ protected transient BufferedReader m_sourceReader; /** Tokenizer for the data. */ protected transient StreamTokenizer m_st; protected transient File m_tempFile; protected transient PrintWriter m_dataDumper; /** the field separator. */ protected String m_FieldSeparator = ","; /** The placeholder for missing values. */ protected String m_MissingValue = "?"; /** The range of attributes to force to type nominal. */ protected Range m_NominalAttributes = new Range(); /** The user-supplied legal nominal values - each entry in the list is a spec */ protected List m_nominalLabelSpecs = new ArrayList(); /** The range of attributes to force to type string. 
*/ protected Range m_StringAttributes = new Range(); /** The range of attributes to force to type date */ protected Range m_dateAttributes = new Range(); /** The formatting string to use to parse dates */ protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss"; /** The formatter to use on dates */ protected SimpleDateFormat m_formatter; /** whether the csv file contains a header row with att names */ protected boolean m_noHeaderRow = false; /** enclosure character(s) to use for strings */ protected String m_Enclosures = "\",\'"; /** The in memory row buffer */ protected List m_rowBuffer; /** The maximum number of rows to hold in memory at any one time */ protected int m_bufferSize = 100; /** Lookup for nominal values */ protected Map> m_nominalVals; /** Reader used to process and output data incrementally */ protected ArffReader m_incrementalReader; protected transient int m_rowCount; /** * Array holding field separator and enclosures to pass through to the * underlying ArffReader */ protected String[] m_fieldSeparatorAndEnclosures; /** * Returns a string describing this attribute evaluator. * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Reads a source that is in comma separated format (the default). " + "One can also change the column separator from comma to tab or " + "another character, specify string enclosures, specify whether a" + "header row is present or not and specify which attributes are to be" + "forced to be nominal or date. Can operate in batch or incremental mode. " + "In batch mode, a buffer is used to process a fixed number of rows in " + "memory at any one time and the data is dumped to a temporary file. This " + "allows the legal values for nominal attributes to be automatically " + "determined. The final ARFF file is produced in a second pass over the " + "temporary file using the structure determined on the first pass. 
In " + "incremental mode, the first buffer full of rows is used to determine " + "the structure automatically. Following this all rows are read and output " + "incrementally. An error will occur if a row containing nominal values not " + "seen in the initial buffer is encountered. In this case, the size of the " + "initial buffer can be increased, or the user can explicitly provide the " + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) " + "option."; } /** * default constructor. */ public CSVLoader() { // No instances retrieved yet setRetrieval(NONE); } @Override public String getFileExtension() { return FILE_EXTENSION; } @Override public String[] getFileExtensions() { return new String[] { getFileExtension() }; } @Override public String getFileDescription() { return "CSV data files"; } @Override public String getRevision() { return "$Revision: 10920 $"; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String noHeaderRowPresentTipText() { return "First row of data does not contain attribute names"; } /** * Set whether there is no header row in the data. * * @param b true if there is no header row in the data */ public void setNoHeaderRowPresent(boolean b) { m_noHeaderRow = b; } /** * Get whether there is no header row in the data. * * @return true if there is no header row in the data */ public boolean getNoHeaderRowPresent() { return m_noHeaderRow; } /** * Sets the placeholder for missing values. * * @param value the placeholder */ public void setMissingValue(String value) { m_MissingValue = value; } /** * Returns the current placeholder for missing values. * * @return the placeholder */ public String getMissingValue() { return m_MissingValue; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String missingValueTipText() { return "The placeholder for missing values, default is '?'."; } /** * Sets the attribute range to be forced to type string. * * @param value the range */ public void setStringAttributes(String value) { m_StringAttributes.setRanges(value); } /** * Returns the current attribute range to be forced to type string. * * @return the range */ public String getStringAttributes() { return m_StringAttributes.getRanges(); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stringAttributesTipText() { return "The range of attributes to force to be of type STRING, example " + "ranges: 'first-last', '1,4,7-14,50-last'."; } /** * Sets the attribute range to be forced to type nominal. * * @param value the range */ public void setNominalAttributes(String value) { m_NominalAttributes.setRanges(value); } /** * Returns the current attribute range to be forced to type nominal. * * @return the range */ public String getNominalAttributes() { return m_NominalAttributes.getRanges(); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String nominalAttributesTipText() { return "The range of attributes to force to be of type NOMINAL, example " + "ranges: 'first-last', '1,4,7-14,50-last'."; } /** * Set the format to use for parsing date values. * * @param value the format to use. */ public void setDateFormat(String value) { m_dateFormat = value; m_formatter = null; } /** * Get the format to use for parsing date values. * * @return the format to use for parsing date values. * */ public String getDateFormat() { return m_dateFormat; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String dateFormatTipText() { return "The format to use for parsing date values."; } /** * Set the attribute range to be forced to type date. * * @param value the range */ public void setDateAttributes(String value) { m_dateAttributes.setRanges(value); } /** * Returns the current attribute range to be forced to type date. * * @return the range. */ public String getDateAttributes() { return m_dateAttributes.getRanges(); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String dateAttributesTipText() { return "The range of attributes to force to type DATE, example " + "ranges: 'first-last', '1,4,7-14, 50-last'."; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String enclosureCharactersTipText() { return "The characters to use as enclosures for strings. E.g. \",'"; } /** * Set the character(s) to use/recognize as string enclosures * * @param enclosure the characters to use as string enclosures */ public void setEnclosureCharacters(String enclosure) { m_Enclosures = enclosure; } /** * Get the character(s) to use/recognize as string enclosures * * @return the characters to use as string enclosures */ public String getEnclosureCharacters() { return m_Enclosures; } /** * Sets the character used as column separator. * * @param value the character to use */ public void setFieldSeparator(String value) { m_FieldSeparator = Utils.unbackQuoteChars(value); if (m_FieldSeparator.length() != 1) { m_FieldSeparator = ","; System.err .println("Field separator can only be a single character (exception being '\t'), " + "defaulting back to '" + m_FieldSeparator + "'!"); } } /** * Returns the character used as column separator. 
* * @return the character to use */ public String getFieldSeparator() { return Utils.backQuoteChars(m_FieldSeparator); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String fieldSeparatorTipText() { return "The character to use as separator for the columns/fields (use '\\t' for TAB)."; } /** * Set the buffer size to use - i.e. the number of rows to load and process in * memory at any one time * * @param buff the buffer size (number of rows) */ public void setBufferSize(int buff) { m_bufferSize = buff; } /** * Get the buffer size to use - i.e. the number of rows to load and process in * memory at any one time * * @return */ public int getBufferSize() { return m_bufferSize; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String bufferSizeTipText() { return "The number of rows to process in memory at any one time."; } /** * Set label specifications for nominal attributes. * * @param specs an array of label specifications */ public void setNominalLabelSpecs(Object[] specs) { m_nominalLabelSpecs.clear(); for (Object s : specs) { m_nominalLabelSpecs.add(s.toString()); } } /** * Get label specifications for nominal attributes. * * @return an array of label specifications */ public Object[] getNominalLabelSpecs() { return m_nominalLabelSpecs.toArray(new String[0]); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String nominalLabelSpecsTipText() { return "Optional specification of legal labels for nominal " + "attributes. May be specified multiple times. " + "Batch mode can determine this " + "automatically (and so can incremental mode if " + "the first in memory buffer load of instances " + "contains an example of each legal value). 
The " + "spec contains two parts separated by a \":\". The " + "first part can be a range of attribute indexes or " + "a comma-separated list off attruibute names; the " + "second part is a comma-separated list of labels. E.g " + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\""; } @Override public Enumeration