weka.core.converters.CSVLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CSVLoader.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.core.converters.ArffLoader.ArffReader;
/**
* Reads a source that is in comma separated format
* (the default). One can also change the column separator from comma to tab or
* another character, specify string enclosures, specify whether a header row is
* present or not and specify which attributes are to be forced to be nominal or
* date. Can operate in batch or incremental mode. In batch mode, a buffer is
* used to process a fixed number of rows in memory at any one time and the data
* is dumped to a temporary file. This allows the legal values for nominal
* attributes to be automatically determined. The final ARFF file is produced in
* a second pass over the temporary file using the structure determined on the
* first pass. In incremental mode, the first buffer full of rows is used to
* determine the structure automatically. Following this all rows are read and
* output incrementally. An error will occur if a row containing nominal values
* not seen in the initial buffer is encountered. In this case, the size of the
* initial buffer can be increased, or the user can explicitly provide the legal
* values of all nominal attributes using the -L (setNominalLabelSpecs) option.
*
*
*
* Valid options are:
*
*
*
* -H
* No header row present in the data.
*
*
*
* -N <range>
* The range of attributes to force type to be NOMINAL.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
*
*
*
* -L <nominal label spec>
* Optional specification of legal labels for nominal
* attributes. May be specified multiple times.
* Batch mode can determine this
* automatically (and so can incremental mode if
* the first in memory buffer load of instances
* contains an example of each legal value). The
* spec contains two parts separated by a ":". The
* first part can be a range of attribute indexes or
* a comma-separated list of attribute names; the
* second part is a comma-separated list of labels. E.g
* "1,2,4-6:red,green,blue" or "att1,att2:red,green,blue"
*
*
*
* -S <range>
* The range of attributes to force type to be STRING.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
*
*
*
* -D <range>
* The range of attributes to force type to be DATE.
* 'first' and 'last' are accepted as well.
* Examples: "first-last", "1,4,5-27,50-last"
* (default: -none-)
*
*
*
* -format <date format>
* The date formatting string to use to parse date values.
* (default: "yyyy-MM-dd'T'HH:mm:ss")
*
*
*
* -M <str>
* The string representing a missing value.
* (default: ?)
*
*
*
* -F <separator>
* The field separator to be used.
* '\t' can be used as well.
* (default: ',')
*
*
*
* -E <enclosures>
* The enclosure character(s) to use for strings.
* Specify as a comma separated list (e.g. ",') (default: ",')
*
*
*
* -B <num>
* The size of the in memory buffer (in rows).
* (default: 100)
*
*
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 10920 $
*/
public class CSVLoader extends AbstractFileLoader implements BatchConverter,
IncrementalConverter, OptionHandler {
/** For serialization */
private static final long serialVersionUID = -1300595850715808438L;
/** the file extension. */
public static String FILE_EXTENSION = ".csv";
/** The reader for the data. */
protected transient BufferedReader m_sourceReader;
/** Tokenizer for the data. */
protected transient StreamTokenizer m_st;
protected transient File m_tempFile;
protected transient PrintWriter m_dataDumper;
/** the field separator. */
protected String m_FieldSeparator = ",";
/** The placeholder for missing values. */
protected String m_MissingValue = "?";
/** The range of attributes to force to type nominal. */
protected Range m_NominalAttributes = new Range();
/**
 * The user-supplied legal nominal values - each entry in the list is one
 * label specification string.
 */
protected List<String> m_nominalLabelSpecs = new ArrayList<String>();
/** The range of attributes to force to type string. */
protected Range m_StringAttributes = new Range();
/** The range of attributes to force to type date */
protected Range m_dateAttributes = new Range();
/** The formatting string to use to parse dates */
protected String m_dateFormat = "yyyy-MM-dd'T'HH:mm:ss";
/** The formatter to use on dates */
protected SimpleDateFormat m_formatter;
/** whether the csv file contains a header row with att names */
protected boolean m_noHeaderRow = false;
/** enclosure character(s) to use for strings */
protected String m_Enclosures = "\",\'";
/** The in memory row buffer */
protected List m_rowBuffer;
/** The maximum number of rows to hold in memory at any one time */
protected int m_bufferSize = 100;
/** Lookup for nominal values */
protected Map> m_nominalVals;
/** Reader used to process and output data incrementally */
protected ArffReader m_incrementalReader;
protected transient int m_rowCount;
/**
* Array holding field separator and enclosures to pass through to the
* underlying ArffReader
*/
protected String[] m_fieldSeparatorAndEnclosures;
/**
 * Returns a string describing this loader.
 *
 * @return a description of the loader suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  // Every fragment ends with a space where the sentence continues, so that
  // words are not fused together at the concatenation points.
  return "Reads a source that is in comma separated format (the default). "
    + "One can also change the column separator from comma to tab or "
    + "another character, specify string enclosures, specify whether a "
    + "header row is present or not and specify which attributes are to be "
    + "forced to be nominal or date. Can operate in batch or incremental mode. "
    + "In batch mode, a buffer is used to process a fixed number of rows in "
    + "memory at any one time and the data is dumped to a temporary file. This "
    + "allows the legal values for nominal attributes to be automatically "
    + "determined. The final ARFF file is produced in a second pass over the "
    + "temporary file using the structure determined on the first pass. In "
    + "incremental mode, the first buffer full of rows is used to determine "
    + "the structure automatically. Following this all rows are read and output "
    + "incrementally. An error will occur if a row containing nominal values not "
    + "seen in the initial buffer is encountered. In this case, the size of the "
    + "initial buffer can be increased, or the user can explicitly provide the "
    + "legal values of all nominal attributes using the -L (setNominalLabelSpecs) "
    + "option.";
}
/**
 * Creates a new loader. The retrieval mode is initialised to NONE because
 * no instances have been retrieved at construction time.
 */
public CSVLoader() {
  super();
  // nothing has been read so far
  this.setRetrieval(NONE);
}
/**
 * Returns the file extension associated with this loader.
 *
 * @return the current value of FILE_EXTENSION
 */
@Override
public String getFileExtension() {
  final String extension = FILE_EXTENSION;
  return extension;
}
/**
 * Returns all file extensions handled by this loader.
 *
 * @return a one-element array holding the result of getFileExtension()
 */
@Override
public String[] getFileExtensions() {
  final String[] extensions = new String[1];
  extensions[0] = getFileExtension();
  return extensions;
}
/**
 * Returns a short description of the kind of file this loader reads.
 *
 * @return the description text
 */
@Override
public String getFileDescription() {
  final String description = "CSV data files";
  return description;
}
/**
 * Returns the revision string of this class.
 *
 * @return the revision keyword string
 */
@Override
public String getRevision() {
  final String revision = "$Revision: 10920 $";
  return revision;
}
/**
 * Tip text for the "no header row present" property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String noHeaderRowPresentTipText() {
  final String tip = "First row of data does not contain attribute names";
  return tip;
}
/**
 * Declares whether the data lacks a header row.
 *
 * @param b true if the data does NOT start with a header row
 */
public void setNoHeaderRowPresent(boolean b) {
  this.m_noHeaderRow = b;
}
/**
 * Reports whether the data lacks a header row.
 *
 * @return true if the data does NOT start with a header row
 */
public boolean getNoHeaderRowPresent() {
  return this.m_noHeaderRow;
}
/**
 * Stores the string that stands in for a missing value.
 *
 * @param value the placeholder string
 */
public void setMissingValue(String value) {
  this.m_MissingValue = value;
}
/**
 * Gets the string that stands in for a missing value.
 *
 * @return the placeholder string
 */
public String getMissingValue() {
  return this.m_MissingValue;
}
/**
 * Tip text for the missing value property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String missingValueTipText() {
  final String tip = "The placeholder for missing values, default is '?'.";
  return tip;
}
/**
 * Defines which attributes are forced to type string by handing the range
 * expression to the underlying Range object.
 *
 * @param value the range expression
 */
public void setStringAttributes(String value) {
  final Range stringRange = m_StringAttributes;
  stringRange.setRanges(value);
}
/**
 * Gets the range expression of the attributes forced to type string.
 *
 * @return the range expression as reported by the underlying Range object
 */
public String getStringAttributes() {
  final String ranges = m_StringAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the string attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String stringAttributesTipText() {
  final String description =
    "The range of attributes to force to be of type STRING, example ";
  final String examples = "ranges: 'first-last', '1,4,7-14,50-last'.";
  return description + examples;
}
/**
 * Defines which attributes are forced to type nominal by handing the range
 * expression to the underlying Range object.
 *
 * @param value the range expression
 */
public void setNominalAttributes(String value) {
  final Range nominalRange = m_NominalAttributes;
  nominalRange.setRanges(value);
}
/**
 * Gets the range expression of the attributes forced to type nominal.
 *
 * @return the range expression as reported by the underlying Range object
 */
public String getNominalAttributes() {
  final String ranges = m_NominalAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the nominal attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String nominalAttributesTipText() {
  final String description =
    "The range of attributes to force to be of type NOMINAL, example ";
  final String examples = "ranges: 'first-last', '1,4,7-14,50-last'.";
  return description + examples;
}
/**
 * Stores the pattern used for parsing date values and clears the current
 * formatter reference.
 *
 * @param value the date format pattern to use
 */
public void setDateFormat(String value) {
  // drop the existing formatter - presumably so that it gets rebuilt from
  // the new pattern on next use (verify against the parsing code)
  this.m_formatter = null;
  this.m_dateFormat = value;
}
/**
 * Gets the pattern used for parsing date values.
 *
 * @return the date format pattern
 */
public String getDateFormat() {
  return this.m_dateFormat;
}
/**
 * Tip text for the date format property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String dateFormatTipText() {
  final String tip = "The format to use for parsing date values.";
  return tip;
}
/**
 * Defines which attributes are forced to type date by handing the range
 * expression to the underlying Range object.
 *
 * @param value the range expression
 */
public void setDateAttributes(String value) {
  final Range dateRange = m_dateAttributes;
  dateRange.setRanges(value);
}
/**
 * Gets the range expression of the attributes forced to type date.
 *
 * @return the range expression as reported by the underlying Range object
 */
public String getDateAttributes() {
  final String ranges = m_dateAttributes.getRanges();
  return ranges;
}
/**
 * Tip text for the date attributes property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String dateAttributesTipText() {
  final String description =
    "The range of attributes to force to type DATE, example ";
  final String examples = "ranges: 'first-last', '1,4,7-14, 50-last'.";
  return description + examples;
}
/**
 * Tip text for the enclosure characters property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String enclosureCharactersTipText() {
  final String tip =
    "The characters to use as enclosures for strings. E.g. \",'";
  return tip;
}
/**
 * Stores the character(s) that are recognised as string enclosures.
 *
 * @param enclosure the enclosure character specification
 */
public void setEnclosureCharacters(String enclosure) {
  this.m_Enclosures = enclosure;
}
/**
 * Gets the character(s) that are recognised as string enclosures.
 *
 * @return the enclosure character specification
 */
public String getEnclosureCharacters() {
  return this.m_Enclosures;
}
/**
 * Sets the character used as column separator.
 *
 * The supplied value is first passed through Utils.unbackQuoteChars. If the
 * result is not exactly one character long, the separator is reset to ","
 * and a warning is printed to System.err.
 *
 * @param value the character to use (the escape sequence "\t" may be given
 *          for TAB)
 */
public void setFieldSeparator(String value) {
  m_FieldSeparator = Utils.unbackQuoteChars(value);
  if (m_FieldSeparator.length() != 1) {
    m_FieldSeparator = ",";
    // '\\t' prints the two-character escape the user may type, rather than
    // embedding a raw TAB character in the warning
    System.err
      .println("Field separator can only be a single character (exception being '\\t'), "
        + "defaulting back to '" + m_FieldSeparator + "'!");
  }
}
/**
 * Gets the character used as column separator, passed through
 * Utils.backQuoteChars before being returned.
 *
 * @return the separator in its back-quoted form
 */
public String getFieldSeparator() {
  final String quoted = Utils.backQuoteChars(m_FieldSeparator);
  return quoted;
}
/**
 * Tip text for the field separator property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String fieldSeparatorTipText() {
  final String tip =
    "The character to use as separator for the columns/fields (use '\\t' for TAB).";
  return tip;
}
/**
 * Stores the buffer size, i.e. the number of rows to load and process in
 * memory at any one time.
 *
 * @param buff the buffer size (number of rows)
 */
public void setBufferSize(int buff) {
  this.m_bufferSize = buff;
}
/**
 * Gets the buffer size, i.e. the number of rows to load and process in
 * memory at any one time.
 *
 * @return the buffer size (number of rows)
 */
public int getBufferSize() {
  return this.m_bufferSize;
}
/**
 * Tip text for the buffer size property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String bufferSizeTipText() {
  final String tip = "The number of rows to process in memory at any one time.";
  return tip;
}
/**
 * Replaces the stored label specifications for nominal attributes. The
 * existing list is emptied first, then the string form of every supplied
 * element is appended in order.
 *
 * @param specs an array of label specifications
 */
public void setNominalLabelSpecs(Object[] specs) {
  m_nominalLabelSpecs.clear();
  for (int i = 0; i < specs.length; i++) {
    final String spec = specs[i].toString();
    m_nominalLabelSpecs.add(spec);
  }
}
/**
 * Gets the stored label specifications for nominal attributes.
 *
 * @return an array holding the label specifications in list order
 */
public Object[] getNominalLabelSpecs() {
  final String[] arrayType = new String[0];
  return m_nominalLabelSpecs.toArray(arrayType);
}
/**
 * Returns the tip text for the nominal label specs property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String nominalLabelSpecsTipText() {
  return "Optional specification of legal labels for nominal "
    + "attributes. May be specified multiple times. "
    + "Batch mode can determine this "
    + "automatically (and so can incremental mode if "
    + "the first in memory buffer load of instances "
    + "contains an example of each legal value). The "
    + "spec contains two parts separated by a \":\". The "
    + "first part can be a range of attribute indexes or "
    + "a comma-separated list of attribute names; the "
    + "second part is a comma-separated list of labels. E.g "
    + "\"1,2,4-6:red,green,blue\" or \"att1,att2:red,green,blue\"";
}
@Override
public Enumeration
© 2015 - 2024 Weber Informatics LLC | Privacy Policy