All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.converters.CSVLoader Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CSVLoader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;

/**
 
 * Reads a source that is in comma separated format (the default). One can also change the column separator from comma to tab or another character, specify string enclosures, specify whether a header row is present or not and specify which attributes are to be forced to be nominal or date. Can operate in batch or incremental mode. In batch mode, a buffer is used to process a fixed number of rows in memory at any one time and the data is dumped to a temporary file. This allows the legal values for nominal attributes to be automatically determined. The final ARFF file is produced in a second pass over the temporary file using the structure determined on the first pass. In incremental mode, the first buffer full of rows is used to determine the structure automatically. Following this all rows are read and output incrementally. An error will occur if a row containing nominal values not seen in the initial buffer is encountered. In this case, the size of the initial buffer can be increased, or the user can explicitly provide the legal values of all nominal attributes using the -L (setNominalLabelSpecs) option.
 * 

* * Valid options are:

* *

 -H
 *  No header row present in the data.
* *
 -N <range>
 *  The range of attributes to force type to be NOMINAL.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
* *
 -L <nominal label spec>
 *  Optional specification of legal labels for nominal
 *  attributes. May be specified multiple times.
 *  Batch mode can determine this
 *  automatically (and so can incremental mode if
 *  the first in memory buffer load of instances
 *  contains an example of each legal value). The
 *  spec contains two parts separated by a ":". The
 *  first part can be a range of attribute indexes or
 *  a comma-separated list of attribute names; the
 *  second part is a comma-separated list of labels. E.g
 *  "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
* *
 -S <range>
 *  The range of attributes to force type to be STRING.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
* *
 -D <range>
 *  The range of attributes to force type to be DATE.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
* *
 -format <date format>
 *  The date formatting string to use to parse date values.
 *  (default: "yyyy-MM-dd'T'HH:mm:ss")
* *
 -R <range>
 *  The range of attributes to force type to be NUMERIC.
 *  'first' and 'last' are accepted as well.
 *  Examples: "first-last", "1,4,5-27,50-last"
 *  (default: -none-)
* *
 -M <str>
 *  The string representing a missing value.
 *  (default: ?)
* *
 -F <separator>
 *  The field separator to be used.
 *  '\t' can be used as well.
 *  (default: ',')
* *
 -E <enclosures>
 *  The enclosure character(s) to use for strings.
 *  Specify as a comma separated list (e.g. ",') (default: ",')
* *
 -B <num>
 *  The size of the in memory buffer (in rows).
 *  (default: 100)
* * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 14115 $ */ public class CSVLoader extends AbstractFileLoader implements BatchConverter, IncrementalConverter, OptionHandler { /** For serialization */ private static final long serialVersionUID = -1300595850715808438L; /** the file extension. */ public static String FILE_EXTENSION = ".csv"; /** The reader for the data. */ protected transient BufferedReader m_sourceReader; /** Tokenizer for the data. */ protected transient StreamTokenizer m_st; protected transient File m_tempFile; protected transient PrintWriter m_dataDumper; /** the field separator. */ protected String m_FieldSeparator = ","; /** The placeholder for missing values. */ protected String m_MissingValue = "?"; /** The range of attributes to force to type nominal. */ protected Range m_NominalAttributes = new Range(); /** The user-supplied legal nominal values - each entry in the list is a spec */ protected List m_nominalLabelSpecs = new ArrayList(); /** The range of attributes to force to type string. 
*/ protected Range m_StringAttributes = new Range(); /** The range of attributes to force to type date */ protected Range m_dateAttributes = new Range(); /** The range of attributes to force to type numeric */ protected Range m_numericAttributes = new Range(); /** The formatting string to use to parse dates */ protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss"; /** The formatter to use on dates */ protected SimpleDateFormat m_formatter; /** whether the csv file contains a header row with att names */ protected boolean m_noHeaderRow = false; /** enclosure character(s) to use for strings */ protected String m_Enclosures = "\",\'"; /** The in memory row buffer */ protected List m_rowBuffer; /** The maximum number of rows to hold in memory at any one time */ protected int m_bufferSize = 100; /** Lookup for nominal values */ protected Map> m_nominalVals; /** Reader used to process and output data incrementally */ protected ArffReader m_incrementalReader; protected transient int m_rowCount; /** * Array holding field separator and enclosures to pass through to the * underlying ArffReader */ protected String[] m_fieldSeparatorAndEnclosures; protected ArrayList m_current; protected TYPE[] m_types; private int m_numBufferedRows; /** * default constructor. */ public CSVLoader() { // No instances retrieved yet setRetrieval(NONE); } /** * Main method. * * @param args should contain the name of an input file. */ public static void main(String[] args) { runFileLoader(new CSVLoader(), args); } /** * Returns a string describing this attribute evaluator. * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Reads a source that is in comma separated format (the default). 
" + "One can also change the column separator from comma to tab or " + "another character, specify string enclosures, specify whether a" + "header row is present or not and specify which attributes are to be" + "forced to be nominal or date. Can operate in batch or incremental mode. " + "In batch mode, a buffer is used to process a fixed number of rows in " + "memory at any one time and the data is dumped to a temporary file. This " + "allows the legal values for nominal attributes to be automatically " + "determined. The final ARFF file is produced in a second pass over the " + "temporary file using the structure determined on the first pass. In " + "incremental mode, the first buffer full of rows is used to determine " + "the structure automatically. Following this all rows are read and output " + "incrementally. An error will occur if a row containing nominal values not " + "seen in the initial buffer is encountered. In this case, the size of the " + "initial buffer can be increased, or the user can explicitly provide the " + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) " + "option."; } @Override public String getFileExtension() { return FILE_EXTENSION; } @Override public String[] getFileExtensions() { return new String[] { getFileExtension() }; } @Override public String getFileDescription() { return "CSV data files"; } @Override public String getRevision() { return "$Revision: 14115 $"; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String noHeaderRowPresentTipText() { return "First row of data does not contain attribute names"; } /** * Get whether there is no header row in the data. * * @return true if there is no header row in the data */ public boolean getNoHeaderRowPresent() { return m_noHeaderRow; } /** * Set whether there is no header row in the data. 
* * @param b true if there is no header row in the data */ public void setNoHeaderRowPresent(boolean b) { m_noHeaderRow = b; } /** * Returns the current placeholder for missing values. * * @return the placeholder */ public String getMissingValue() { return m_MissingValue; } /** * Sets the placeholder for missing values. * * @param value the placeholder */ public void setMissingValue(String value) { m_MissingValue = value; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String missingValueTipText() { return "The placeholder for missing values, default is '?'."; } /** * Returns the current attribute range to be forced to type string. * * @return the range */ public String getStringAttributes() { return m_StringAttributes.getRanges(); } /** * Sets the attribute range to be forced to type string. * * @param value the range */ public void setStringAttributes(String value) { m_StringAttributes.setRanges(value); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stringAttributesTipText() { return "The range of attributes to force to be of type STRING, example " + "ranges: 'first-last', '1,4,7-14,50-last'."; } /** * Returns the current attribute range to be forced to type nominal. * * @return the range */ public String getNominalAttributes() { return m_NominalAttributes.getRanges(); } /** * Sets the attribute range to be forced to type nominal. * * @param value the range */ public void setNominalAttributes(String value) { m_NominalAttributes.setRanges(value); } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String nominalAttributesTipText() { return "The range of attributes to force to be of type NOMINAL, example " + "ranges: 'first-last', '1,4,7-14,50-last'."; } /** * Gets the attribute range to be forced to type numeric * * @return the range */ public String getNumericAttributes() { return m_numericAttributes.getRanges(); } /** * Sets the attribute range to be forced to type numeric * * @param value the range */ public void setNumericAttributes(String value) { m_numericAttributes.setRanges(value); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String numericAttributesTipText() { return "The range of attributes to force to be of type NUMERIC, example " + "ranges: 'first-last', '1,4,7-14,50-last'."; } /** * Get the format to use for parsing date values. * * @return the format to use for parsing date values. * */ public String getDateFormat() { return m_dateFormat; } /** * Set the format to use for parsing date values. * * @param value the format to use. */ public void setDateFormat(String value) { m_dateFormat = value; m_formatter = null; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String dateFormatTipText() { return "The format to use for parsing date values."; } /** * Returns the current attribute range to be forced to type date. * * @return the range. */ public String getDateAttributes() { return m_dateAttributes.getRanges(); } /** * Set the attribute range to be forced to type date. * * @param value the range */ public void setDateAttributes(String value) { m_dateAttributes.setRanges(value); } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String dateAttributesTipText() { return "The range of attributes to force to type DATE, example " + "ranges: 'first-last', '1,4,7-14, 50-last'."; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String enclosureCharactersTipText() { return "The characters to use as enclosures for strings. E.g. \",'"; } /** * Get the character(s) to use/recognize as string enclosures * * @return the characters to use as string enclosures */ public String getEnclosureCharacters() { return m_Enclosures; } /** * Set the character(s) to use/recognize as string enclosures * * @param enclosure the characters to use as string enclosures */ public void setEnclosureCharacters(String enclosure) { m_Enclosures = enclosure; } /** * Returns the character used as column separator. * * @return the character to use */ public String getFieldSeparator() { return Utils.backQuoteChars(m_FieldSeparator); } /** * Sets the character used as column separator. * * @param value the character to use */ public void setFieldSeparator(String value) { m_FieldSeparator = Utils.unbackQuoteChars(value); if (m_FieldSeparator.length() != 1) { m_FieldSeparator = ","; System.err .println("Field separator can only be a single character (exception being '\t'), " + "defaulting back to '" + m_FieldSeparator + "'!"); } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String fieldSeparatorTipText() { return "The character to use as separator for the columns/fields (use '\\t' for TAB)."; } /** * Get the buffer size to use - i.e. the number of rows to load and process in * memory at any one time * * @return */ public int getBufferSize() { return m_bufferSize; } /** * Set the buffer size to use - i.e. 
* the number of rows to load and process in
   * memory at any one time
   *
   * @param buff the buffer size (number of rows)
   */
  public void setBufferSize(int buff) {
    m_bufferSize = buff;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String bufferSizeTipText() {
    return "The number of rows to process in memory at any one time.";
  }

  /**
   * Get label specifications for nominal attributes. The stored specs are
   * copied into a new String array.
   *
   * @return an array of label specifications
   */
  public Object[] getNominalLabelSpecs() {
    return m_nominalLabelSpecs.toArray(new String[0]);
  }

  /**
   * Set label specifications for nominal attributes. Any previously stored
   * specs are discarded; each supplied element is stored via its toString()
   * value.
   *
   * @param specs an array of label specifications
   */
  public void setNominalLabelSpecs(Object[] specs) {
    m_nominalLabelSpecs.clear();
    for (Object s : specs) {
      m_nominalLabelSpecs.add(s.toString());
    }
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String nominalLabelSpecsTipText() {
    return "Optional specification of legal labels for nominal "
      + "attributes. May be specified multiple times. "
      + "Batch mode can determine this "
      + "automatically (and so can incremental mode if "
      + "the first in memory buffer load of instances "
      + "contains an example of each legal value). The "
      + "spec contains two parts separated by a \":\". The "
      + "first part can be a range of attribute indexes or "
      + "a comma-separated list of attribute names; the "
      + "second part is a comma-separated list of labels. E.g "
      + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\"";
  }

  @Override
  public Enumeration