weka.core.converters.C45Loader Maven / Gradle / Ivy
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* C45Loader.java
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
/**
* Reads a file that is C45 format. Can take a filestem or filestem with .names or .data appended. Assumes that path/<filestem>.names and path/<filestem>.data exist and contain the names and data respectively.
*
*
* @author Mark Hall ([email protected])
* @version $Revision: 1.16 $
* @see Loader
*/
public class C45Loader
extends AbstractFileLoader
implements BatchConverter, IncrementalConverter {
/** for serialization */
static final long serialVersionUID = 5454329403218219L;
/** the file extension */
public static String FILE_EXTENSION = ".names";
/**
 * The .data file that accompanies the .names source file
 * (same directory and filestem).
 */
private File m_sourceFileData = null;
/**
* Reader for names file
*/
private transient Reader m_namesReader = null;
/**
* Reader for data file
*/
private transient Reader m_dataReader = null;
/**
* Holds the filestem.
*/
private String m_fileStem;
/**
* Number of attributes in the data (including ignore and label attributes).
*/
private int m_numAttribs;
/**
* Which attributes are ignore or label. These are *not* included in the
* arff representation.
*/
private boolean [] m_ignore;
/**
 * Returns a string describing this loader.
 *
 * @return a description of the loader suitable for
 * displaying in the explorer/experimenter gui
 */
public String globalInfo() {
  // The filestem placeholder is part of the message: without it the text
  // reads "path/.names", which contradicts the class description.
  return "Reads a file that is C45 format. Can take a filestem or filestem "
    + "with .names or .data appended. Assumes that path/<filestem>.names and "
    + "path/<filestem>.data exist and contain the names and data "
    + "respectively.";
}
/**
 * Puts the loader back into its initial state so that a new data set, or
 * the current one again, can be read: the cached structure is dropped, the
 * retrieval mode is set to NONE and, when a file name is known, that file
 * is set again via setFile.
 *
 * @throws IOException if something goes wrong
 */
public void reset() throws IOException {
  m_structure = null;
  setRetrieval(NONE);

  // nothing to re-open when no file has been set so far
  if (m_File == null) {
    return;
  }
  File previous = new File(m_File);
  setFile(previous);
}
/**
 * Gets the default file extension of this loader, i.e. the current value
 * of FILE_EXTENSION (initialised to ".names").
 *
 * @return the file extension
 */
public String getFileExtension() {
  return C45Loader.FILE_EXTENSION;
}
/**
 * Gets every file extension that belongs to this type of file: ".names"
 * followed by ".data". A fresh array is returned on each call.
 *
 * @return the file extensions
 */
public String[] getFileExtensions() {
  final String[] extensions = new String[2];
  extensions[0] = ".names";
  extensions[1] = ".data";
  return extensions;
}
/**
 * Gets a short, human-readable description of the file type handled by
 * this loader.
 *
 * @return a short file description
 */
public String getFileDescription() {
  final String description = "C4.5 data files";
  return description;
}
/**
 * Resets the Loader object and sets the source of the data set to be
 * the supplied File object.
 *
 * The supplied file may be a bare filestem or carry any extension; whatever
 * follows the last '.' in the file name is stripped, and the files
 * filestem.names and filestem.data in the same directory are opened.
 * Readers that were opened by an earlier call are closed first so that
 * repeated calls do not leak file handles.
 *
 * @param file the source file.
 * @exception IOException if file is null or if either the .names or the
 * .data file cannot be found
 */
public void setSource(File file) throws IOException {
  m_structure = null;
  setRetrieval(NONE);

  if (file == null) {
    throw new IOException("Source file object is null!");
  }

  // The readers of a previous source are replaced below; release them now.
  closeQuietly(m_namesReader);
  m_namesReader = null;
  closeQuietly(m_dataReader);
  m_dataReader = null;

  String fname = file.getName();
  String parent = file.getParent();
  String path = (parent != null) ? parent + File.separator : "";

  int lastDot = fname.lastIndexOf('.');
  String fileStem = (lastDot < 0) ? fname : fname.substring(0, lastDot);
  m_fileStem = fileStem;

  String namesPath = path + fileStem + ".names";
  File namesFile = new File(namesPath);
  m_sourceFile = namesFile;
  try {
    m_namesReader = new BufferedReader(new FileReader(namesFile));
  } catch (FileNotFoundException ex) {
    IOException ioe = new IOException("File not found : " + namesPath);
    ioe.initCause(ex);
    throw ioe;
  }

  String dataPath = path + fileStem + ".data";
  m_sourceFileData = new File(dataPath);
  try {
    m_dataReader = new BufferedReader(new FileReader(m_sourceFileData));
  } catch (FileNotFoundException ex) {
    // do not keep the names file open when the pair is incomplete
    closeQuietly(m_namesReader);
    m_namesReader = null;
    // report the file that is actually missing (the .data file)
    IOException ioe = new IOException("File not found : " + dataPath);
    ioe.initCause(ex);
    throw ioe;
  }

  m_File = namesFile.getAbsolutePath();
}

/**
 * Closes the given reader, ignoring a null argument and any error raised
 * while closing. Only used for readers that are being discarded.
 *
 * @param reader the reader to close, may be null
 */
private static void closeQuietly(Reader reader) {
  if (reader == null) {
    return;
  }
  try {
    reader.close();
  } catch (IOException ignored) {
    // best effort: the reader is abandoned either way
  }
}
/**
 * Determines and returns (if possible) the structure (internally the
 * header) of the data set as an empty set of instances.
 *
 * When no structure is cached, the source is (re)opened via setSource and
 * the header is parsed from the names reader. The names reader is closed
 * afterwards, whether or not parsing succeeded, because nothing else in
 * this class reads from it.
 *
 * @return the structure of the data set as an empty set of Instances
 * @exception IOException if no source has been specified or the header
 * cannot be read
 */
public Instances getStructure() throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified");
  }

  if (m_structure == null) {
    setSource(m_sourceFile);
    try {
      StreamTokenizer st = new StreamTokenizer(m_namesReader);
      initTokenizer(st);
      readHeader(st);
    } finally {
      if (m_namesReader != null) {
        try {
          m_namesReader.close();
        } catch (IOException ignored) {
          // closing a reader that was only read from; nothing to recover
        }
      }
    }
  }

  return m_structure;
}
/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then this method does so before processing
 * the rest of the data set.
 *
 * The data reader is closed when this method finishes, also when parsing
 * fails part-way through.
 *
 * @return the full data set as a set of Instances
 * @exception IOException if there is no source, if instances have already
 * been retrieved incrementally, or if parsing fails
 */
public Instances getDataSet() throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified");
  }
  if (getRetrieval() == INCREMENTAL) {
    throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
  }
  setRetrieval(BATCH);

  if (m_structure == null) {
    getStructure();
  }

  Instances result = new Instances(m_structure);
  try {
    StreamTokenizer st = new StreamTokenizer(m_dataReader);
    initTokenizer(st);

    Instance current = getInstance(st);
    while (current != null) {
      result.add(current);
      current = getInstance(st);
    }
  } finally {
    // close the stream even if a row could not be parsed
    if (m_dataReader != null) {
      try {
        m_dataReader.close();
      } catch (IOException ex) {
        ex.printStackTrace();
      }
    }
  }

  return result;
}
/**
 * Tokenizer that is kept between calls of getNextInstance. A
 * StreamTokenizer holds the character that terminated the last word; by
 * reusing the tokenizer that character is not lost between two rows.
 */
private transient StreamTokenizer m_incrementalTokenizer = null;

/**
 * The reader m_incrementalTokenizer was created on. Compared against
 * m_dataReader to notice that the source has been (re)opened.
 */
private transient Reader m_incrementalTokenizerReader = null;

/**
 * Read the data set incrementally---get the next instance in the data
 * set or returns null if there are no
 * more instances to get. If the structure hasn't yet been
 * determined by a call to getStructure then this method does so before
 * returning the next instance in the data set.
 *
 * The same tokenizer is used for all rows of one data reader. Creating a
 * new tokenizer per row would drop the character read ahead after a row's
 * last value, so that e.g. a '|' comment following that value directly
 * would be parsed as data.
 *
 * When the end of the data is reached the data reader is closed.
 *
 * @param structure the dataset header information; not read by this
 * implementation, returned instances are attached to the loader's own
 * structure
 * @return the next instance in the data set as an Instance object or null
 * if there are no more instances to be read
 * @exception IOException if there is no source, if instances have already
 * been retrieved in batch mode, or if there is an error during parsing
 */
public Instance getNextInstance(Instances structure) throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified");
  }
  if (getRetrieval() == BATCH) {
    throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
  }
  setRetrieval(INCREMENTAL);

  if (m_structure == null) {
    getStructure();
  }

  // (re)create the tokenizer only when the underlying reader has changed
  if (m_incrementalTokenizer == null
      || m_incrementalTokenizerReader != m_dataReader) {
    m_incrementalTokenizer = new StreamTokenizer(m_dataReader);
    initTokenizer(m_incrementalTokenizer);
    m_incrementalTokenizerReader = m_dataReader;
  }

  Instance nextI = getInstance(m_incrementalTokenizer);
  if (nextI != null) {
    nextI.setDataset(m_structure);
  } else {
    // end of data: drop the tokenizer and close the stream
    m_incrementalTokenizer = null;
    m_incrementalTokenizerReader = null;
    try {
      m_dataReader.close();
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
  return nextI;
}
/**
 * Reads an instance using the supplied tokenizer.
 *
 * One token is consumed per column of the data file (m_numAttribs in
 * total); tokens of ignored columns are skipped. Values are trimmed before
 * they are interpreted, matching the trimming applied to the values read
 * from the names file, and a trailing period is removed from the value of
 * the last column. A value of "?" is stored as a missing value.
 *
 * @param tokenizer the tokenizer to use
 * @return an Instance or null if there are no more instances to read
 * @exception IOException if an error occurs, e.g. a line ends before all
 * columns have been read, a nominal value is not declared in the header or
 * a numeric value cannot be parsed
 */
private Instance getInstance(StreamTokenizer tokenizer)
  throws IOException {
  double[] instance = new double[m_structure.numAttributes()];

  ConverterUtils.getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return null;
  }

  int counter = 0;
  for (int i = 0; i < m_numAttribs; i++) {
    if (i > 0) {
      ConverterUtils.getToken(tokenizer);
    }
    if (m_ignore[i]) {
      continue;
    }

    // Check if value is missing.
    if (tokenizer.ttype == '?') {
      instance[counter++] = Instance.missingValue();
      continue;
    }

    String val = tokenizer.sval;
    if (val == null) {
      // No text token, e.g. the line ended early. errms is relied upon
      // here, as elsewhere in this class, to abort by throwing.
      ConverterUtils.errms(tokenizer, "premature end of line");
    }
    val = val.trim();
    if (i == m_numAttribs - 1 && val.length() > 0) {
      val = removeTrailingPeriod(val);
    }
    if (val.equals("?")) {
      // "?" surrounded by blanks or followed by the closing period
      instance[counter++] = Instance.missingValue();
      continue;
    }

    Attribute att = m_structure.attribute(counter);
    if (att.isNominal()) {
      int index = att.indexOfValue(val);
      if (index == -1) {
        ConverterUtils.errms(tokenizer, "nominal value not declared in "
                             + "header :" + val + " column " + i);
      }
      instance[counter++] = (double) index;
    } else if (att.isNumeric()) {
      try {
        instance[counter++] = Double.valueOf(val).doubleValue();
      } catch (NumberFormatException e) {
        ConverterUtils.errms(tokenizer, "number expected");
      }
    } else {
      // readHeader only creates nominal and numeric attributes; fail loudly
      // instead of terminating the whole JVM
      throw new IllegalStateException("Shouldn't get here: attribute '"
                                      + att.name()
                                      + "' is neither nominal nor numeric");
    }
  }

  return new Instance(1.0, instance);
}
/**
 * Removes a single trailing period from the given string. Strings that do
 * not end with a period, including the empty string, are returned
 * unchanged; null is returned as null.
 *
 * @param val the string to work on
 * @return the processed string
 */
private String removeTrailingPeriod(String val) {
  // endsWith is safe for the empty string, unlike charAt(length() - 1)
  if (val != null && val.endsWith(".")) {
    return val.substring(0, val.length() - 1);
  }
  return val;
}
/**
 * Reads header (from the names file) using the supplied tokenizer and
 * stores the result in m_structure, m_numAttribs and m_ignore.
 *
 * The first non-empty line supplies the class values. Every following line
 * supplies an attribute name followed by its type: a type starting with
 * "ignore" or "label" marks a column that is skipped when reading data, a
 * type starting with "continuous" yields a numeric attribute, and anything
 * else is read as the list of values of a nominal attribute. If the first
 * line holds exactly one value and that value is the name of one of the
 * attributes, that attribute becomes the class; otherwise a nominal
 * attribute named "Class" is appended and used as the class.
 *
 * @param tokenizer the tokenizer to use
 * @exception IOException if an error occurs, e.g. the file is empty or an
 * attribute name is not followed by a type
 */
private void readHeader(StreamTokenizer tokenizer) throws IOException {
  FastVector attribDefs = new FastVector();
  FastVector ignores = new FastVector();

  ConverterUtils.getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    ConverterUtils.errms(tokenizer, "premature end of file");
  }

  // Read the class values. Stop at end of file as well: a names file may
  // end right after the class line without a line terminator.
  FastVector classVals = new FastVector();
  while (tokenizer.ttype != StreamTokenizer.TT_EOL
         && tokenizer.ttype != StreamTokenizer.TT_EOF) {
    String val = tokenizer.sval.trim();
    if (val.length() > 0) {
      val = removeTrailingPeriod(val);
      classVals.addElement(val);
    }
    ConverterUtils.getToken(tokenizer);
  }

  // read the attribute names and types
  int counter = 0;
  while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
    ConverterUtils.getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      break;
    }

    String attribName = tokenizer.sval;
    ConverterUtils.getToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOL
        || tokenizer.ttype == StreamTokenizer.TT_EOF) {
      ConverterUtils.errms(tokenizer, "premature end of line. Expected "
                           + "attribute type.");
    }

    // fixed locale: the keywords must be recognised independently of the
    // default locale's case-mapping rules
    String temp = tokenizer.sval.toLowerCase(java.util.Locale.ENGLISH).trim();
    if (temp.startsWith("ignore") || temp.startsWith("label")) {
      ignores.addElement(Integer.valueOf(counter));
    } else if (temp.startsWith("continuous")) {
      attribDefs.addElement(new Attribute(attribName));
    } else {
      // read the values of the attribute; the current token is the first
      FastVector attribVals = new FastVector();
      while (tokenizer.ttype != StreamTokenizer.TT_EOL
             && tokenizer.ttype != StreamTokenizer.TT_EOF) {
        String val = tokenizer.sval.trim();
        if (val.length() > 0) {
          val = removeTrailingPeriod(val);
          attribVals.addElement(val);
        }
        ConverterUtils.getToken(tokenizer);
      }
      attribDefs.addElement(new Attribute(attribName, attribVals));
    }
    counter++;
  }

  // look to see if the single class "value" is really an attribute name
  // (ala c5 names file style)
  int classIndex = -1;
  if (classVals.size() == 1) {
    String className = (String) classVals.elementAt(0);
    for (int j = 0; j < attribDefs.size(); j++) {
      if (((Attribute) attribDefs.elementAt(j)).name().equals(className)) {
        classIndex = j;
        break;
      }
    }
  }
  if (classIndex < 0) {
    attribDefs.addElement(new Attribute("Class", classVals));
  }

  m_structure = new Instances(m_fileStem, attribDefs, 0);
  try {
    if (classIndex < 0) {
      m_structure.setClassIndex(m_structure.numAttributes() - 1);
    } else {
      m_structure.setClassIndex(classIndex);
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }

  // columns in the data file = attributes kept + columns ignored
  m_numAttribs = m_structure.numAttributes() + ignores.size();
  m_ignore = new boolean[m_numAttribs];
  for (int j = 0; j < ignores.size(); j++) {
    m_ignore[((Integer) ignores.elementAt(j)).intValue()] = true;
  }
}
/**
 * Configures the syntax table of the given stream tokenizer: characters
 * below the blank are whitespace, the blank up to \u00FF are word
 * characters, comma, colon and tab separate tokens, '|' starts a comment,
 * double and single quotes delimit quoted strings, and line ends are
 * reported as tokens.
 *
 * @param tokenizer the tokenizer to initialize
 */
private void initTokenizer(StreamTokenizer tokenizer) {
  tokenizer.resetSyntax();
  tokenizer.whitespaceChars(0, (' ' - 1));
  tokenizer.wordChars(' ', '\u00FF');

  // characters that separate tokens
  final char[] separators = {',', ':', '\t'};
  for (int k = 0; k < separators.length; k++) {
    tokenizer.whitespaceChars(separators[k], separators[k]);
  }

  tokenizer.commentChar('|');

  final char[] quotes = {'"', '\''};
  for (int k = 0; k < quotes.length; k++) {
    tokenizer.quoteChar(quotes[k]);
  }

  tokenizer.eolIsSignificant(true);
}
/**
 * Returns the revision string of this class, as extracted by
 * RevisionUtils from the embedded revision keyword.
 *
 * @return the revision
 */
public String getRevision() {
  final String revisionKeyword = "$Revision: 1.16 $";
  return RevisionUtils.extract(revisionKeyword);
}
/**
 * Main method for testing this class: hands a new C45Loader and the
 * command line arguments to runFileLoader.
 *
 * @param args should contain {@code <filestem>[.names | data]}
 */
public static void main(String[] args) {
  C45Loader loader = new C45Loader();
  runFileLoader(loader, args);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy