/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* C45Loader.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.Utils;
/**
* Reads a file that is in C4.5 format. Can take a
* filestem, or a filestem with .names or .data appended. Assumes that
* path/<filestem>.names and path/<filestem>.data exist and contain
* the names and data respectively.
*
*
* @author Mark Hall ([email protected])
* @version $Revision: 9290 $
* @see Loader
*/
public class C45Loader extends AbstractFileLoader implements BatchConverter,
IncrementalConverter {
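/*
 * Minimal batch usage sketch (the filestem and path are hypothetical);
 * the loader resolves the stem "weather" to weather.names and
 * weather.data in the same directory:
 *
 *   C45Loader loader = new C45Loader();
 *   loader.setSource(new File("/some/path/weather"));
 *   Instances data = loader.getDataSet();
 */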
/** for serialization */
static final long serialVersionUID = 5454329403218219L;
/** the file extension */
public static String FILE_EXTENSION = ".names";
/**
* The data (.data) file.
*/
private File m_sourceFileData = null;
/**
* Reader for names file
*/
private transient Reader m_namesReader = null;
/**
* Reader for data file
*/
private transient Reader m_dataReader = null;
/**
* Holds the filestem.
*/
private String m_fileStem;
/**
* Number of attributes in the data (including ignore and label attributes).
*/
private int m_numAttribs;
/**
* Which attributes are ignore or label. These are *not* included in the arff
* representation.
*/
private boolean[] m_ignore;
/**
* Returns a string describing this Loader
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Reads a file that is C45 format. Can take a filestem or filestem "
+ "with .names or .data appended. Assumes that path/.names and "
+ "path/.data exist and contain the names and data "
+ "respectively.";
}
/**
* Resets the Loader ready to read a new data set or the same data set again.
*
* @throws IOException if something goes wrong
*/
@Override
public void reset() throws IOException {
m_structure = null;
setRetrieval(NONE);
if (m_File != null) {
setFile(new File(m_File));
}
}
/**
* Get the default file extension for C4.5 files
*
* @return the file extension
*/
@Override
public String getFileExtension() {
return FILE_EXTENSION;
}
/**
* Gets all the file extensions used for this type of file
*
* @return the file extensions
*/
@Override
public String[] getFileExtensions() {
return new String[] { ".names", ".data" };
}
/**
* Returns a description of the file type.
*
* @return a short file description
*/
@Override
public String getFileDescription() {
return "C4.5 data files";
}
/**
* Resets the Loader object and sets the source of the data set to be the
* supplied File object.
*
* @param file the source file.
* @exception IOException if an error occurs
*/
@Override
public void setSource(File file) throws IOException {
m_structure = null;
setRetrieval(NONE);
if (file == null) {
throw new IOException("Source file object is null!");
}
String fname = file.getName();
String fileStem;
String path = file.getParent();
if (path != null) {
path += File.separator;
} else {
path = "";
}
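// Derive the filestem: "golf", "golf.names" and "golf.data" all resolve
// to the stem "golf"; the .names file is always opened first.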
if (fname.indexOf('.') < 0) {
fileStem = fname;
fname += ".names";
} else {
fileStem = fname.substring(0, fname.lastIndexOf('.'));
fname = fileStem + ".names";
}
m_fileStem = fileStem;
file = new File(path + fname);
m_sourceFile = file;
try {
BufferedReader br = new BufferedReader(new FileReader(file));
m_namesReader = br;
} catch (FileNotFoundException ex) {
throw new IOException("File not found : " + (path + fname));
}
m_sourceFileData = new File(path + fileStem + ".data");
try {
BufferedReader br = new BufferedReader(new FileReader(m_sourceFileData));
m_dataReader = br;
} catch (FileNotFoundException ex) {
throw new IOException("File not found : " + (path + fname));
}
m_File = file.getAbsolutePath();
}
/**
* Determines and returns (if possible) the structure (internally the header)
* of the data set as an empty set of instances.
*
* @return the structure of the data set as an empty set of Instances
* @exception IOException if an error occurs
*/
@Override
public Instances getStructure() throws IOException {
if (m_sourceFile == null) {
throw new IOException("No source has beenspecified");
}
if (m_structure == null) {
setSource(m_sourceFile);
StreamTokenizer st = new StreamTokenizer(m_namesReader);
initTokenizer(st);
readHeader(st);
}
return m_structure;
}
/**
* Return the full data set. If the structure hasn't yet been determined by a
* call to getStructure then this method should do so before processing the
* rest of the data set.
*
* @return the structure of the data set as an empty set of Instances
* @exception IOException if there is no source or parsing fails
*/
@Override
public Instances getDataSet() throws IOException {
if (m_sourceFile == null) {
throw new IOException("No source has been specified");
}
if (getRetrieval() == INCREMENTAL) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(BATCH);
if (m_structure == null) {
getStructure();
}
StreamTokenizer st = new StreamTokenizer(m_dataReader);
initTokenizer(st);
// st.ordinaryChar('.');
Instances result = new Instances(m_structure);
Instance current = getInstance(st);
while (current != null) {
result.add(current);
current = getInstance(st);
}
try {
// close the stream
m_dataReader.close();
// reset();
} catch (Exception ex) {
ex.printStackTrace();
}
return result;
}
/**
* Read the data set incrementally---get the next instance in the data set or
* returns null if there are no more instances to get. If the structure hasn't
* yet been determined by a call to getStructure then this method should do so
* before returning the next instance in the data set.
*
* If it is not possible to read the data set incrementally (i.e. in cases
* where the data set structure cannot be fully established before all
* instances have been seen) then an exception should be thrown.
*
* @param structure the dataset header information, will get updated in case
* of string or relational attributes
* @return the next instance in the data set as an Instance object or null if
* there are no more instances to be read
* @exception IOException if there is an error during parsing
*/
@Override
public Instance getNextInstance(Instances structure) throws IOException {
if (m_sourceFile == null) {
throw new IOException("No source has been specified");
}
if (getRetrieval() == BATCH) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(INCREMENTAL);
if (m_structure == null) {
getStructure();
}
StreamTokenizer st = new StreamTokenizer(m_dataReader);
initTokenizer(st);
// st.ordinaryChar('.');
Instance nextI = getInstance(st);
if (nextI != null) {
nextI.setDataset(m_structure);
} else {
try {
// close the stream
m_dataReader.close();
// reset();
} catch (Exception ex) {
ex.printStackTrace();
}
}
return nextI;
}
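/*
 * Incremental usage sketch (filestem hypothetical): instances are read
 * one at a time from the .data file until null signals the end:
 *
 *   C45Loader loader = new C45Loader();
 *   loader.setSource(new File("/some/path/weather"));
 *   Instances structure = loader.getStructure();
 *   for (Instance inst = loader.getNextInstance(structure); inst != null;
 *     inst = loader.getNextInstance(structure)) {
 *     // process inst
 *   }
 */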
/**
* Reads an instance using the supplied tokenizer.
*
* @param tokenizer the tokenizer to use
* @return an Instance or null if there are no more instances to read
* @exception IOException if an error occurs
*/
private Instance getInstance(StreamTokenizer tokenizer) throws IOException {
double[] instance = new double[m_structure.numAttributes()];
StreamTokenizerUtils.getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
return null;
}
int counter = 0;
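// counter indexes the attributes kept in the converted header; it lags
// behind i whenever an ignore/label column is skipped.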
for (int i = 0; i < m_numAttribs; i++) {
if (i > 0) {
StreamTokenizerUtils.getToken(tokenizer);
}
if (!m_ignore[i]) {
// Check if value is missing.
if (tokenizer.ttype == '?') {
instance[counter++] = Utils.missingValue();
} else {
String val = tokenizer.sval;
if (i == m_numAttribs - 1) {
// remove trailing period
val = removeTrailingPeriod(val);
}
if (m_structure.attribute(counter).isNominal()) {
int index = m_structure.attribute(counter).indexOfValue(val);
if (index == -1) {
StreamTokenizerUtils.errms(tokenizer,
"nominal value not declared in header: " + val + " column " + i);
}
instance[counter++] = index;
} else if (m_structure.attribute(counter).isNumeric()) {
try {
instance[counter++] = Double.valueOf(val).doubleValue();
} catch (NumberFormatException e) {
StreamTokenizerUtils.errms(tokenizer, "number expected");
}
} else {
// should not happen: readHeader only creates nominal and numeric
// attributes
throw new IOException("unexpected attribute type, column " + i);
}
}
}
}
return new DenseInstance(1.0, instance);
}
/**
* removes the trailing period
*
* @param val the string to work on
* @return the processed string
*/
private String removeTrailingPeriod(String val) {
// remove trailing period
if (val.charAt(val.length() - 1) == '.') {
val = val.substring(0, val.length() - 1);
}
return val;
}
/**
* Reads header (from the names file) using the supplied tokenizer
*
* @param tokenizer the tokenizer to use
* @exception IOException if an error occurs
*/
private void readHeader(StreamTokenizer tokenizer) throws IOException {
ArrayList<Attribute> attribDefs = new ArrayList<Attribute>();
ArrayList<Integer> ignores = new ArrayList<Integer>();
StreamTokenizerUtils.getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
StreamTokenizerUtils.errms(tokenizer, "premature end of file");
}
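// A .names file lists the class values (or, C5-style, the name of the
// class attribute) on its first line, followed by one "attname: type"
// entry per attribute.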
m_numAttribs = 1;
// Read the class values
ArrayList<String> classVals = new ArrayList<String>();
while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
String val = tokenizer.sval.trim();
if (val.length() > 0) {
val = removeTrailingPeriod(val);
classVals.add(val);
}
StreamTokenizerUtils.getToken(tokenizer);
}
// read the attribute names and types
int counter = 0;
while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
StreamTokenizerUtils.getFirstToken(tokenizer);
if (tokenizer.ttype != StreamTokenizer.TT_EOF) {
String attribName = tokenizer.sval;
StreamTokenizerUtils.getToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
StreamTokenizerUtils.errms(tokenizer,
"premature end of line. Expected " + "attribute type.");
}
String temp = tokenizer.sval.toLowerCase().trim();
if (temp.startsWith("ignore") || temp.startsWith("label")) {
ignores.add(Integer.valueOf(counter));
counter++;
} else if (temp.startsWith("continuous")) {
attribDefs.add(new Attribute(attribName));
counter++;
} else {
counter++;
// read the values of the attribute
ArrayList<String> attribVals = new ArrayList<String>();
while (tokenizer.ttype != StreamTokenizer.TT_EOL
&& tokenizer.ttype != StreamTokenizer.TT_EOF) {
String val = tokenizer.sval.trim();
if (val.length() > 0) {
val = removeTrailingPeriod(val);
attribVals.add(val);
}
StreamTokenizerUtils.getToken(tokenizer);
}
attribDefs.add(new Attribute(attribName, attribVals));
}
}
}
boolean ok = true;
int i = -1;
if (classVals.size() == 1) {
// look to see if this is an attribute name (ala c5 names file style)
for (i = 0; i < attribDefs.size(); i++) {
if (attribDefs.get(i).name().compareTo(classVals.get(0)) == 0) {
ok = false;
m_numAttribs--;
break;
}
}
}
if (ok) {
attribDefs.add(new Attribute("Class", classVals));
}
m_structure = new Instances(m_fileStem, attribDefs, 0);
try {
if (ok) {
m_structure.setClassIndex(m_structure.numAttributes() - 1);
} else {
m_structure.setClassIndex(i);
}
} catch (Exception ex) {
ex.printStackTrace();
}
m_numAttribs = m_structure.numAttributes() + ignores.size();
m_ignore = new boolean[m_numAttribs];
for (i = 0; i < ignores.size(); i++) {
m_ignore[ignores.get(i).intValue()] = true;
}
}
/**
* Initializes the stream tokenizer
*
* @param tokenizer the tokenizer to initialize
*/
private void initTokenizer(StreamTokenizer tokenizer) {
tokenizer.resetSyntax();
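// C4.5 files separate values with commas and attribute names from types
// with colons, so both are treated as whitespace; '|' starts a comment
// that runs to the end of the line.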
tokenizer.whitespaceChars(0, (' ' - 1));
tokenizer.wordChars(' ', '\u00FF');
tokenizer.whitespaceChars(',', ',');
tokenizer.whitespaceChars(':', ':');
// tokenizer.whitespaceChars('.','.');
tokenizer.commentChar('|');
tokenizer.whitespaceChars('\t', '\t');
tokenizer.quoteChar('"');
tokenizer.quoteChar('\'');
tokenizer.eolIsSignificant(true);
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 9290 $");
}
/**
* Main method for testing this class.
*
* @param args should contain <filestem>[.names | .data]
*/
public static void main(String[] args) {
runFileLoader(new C45Loader(), args);
}
}