/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* ArffLoader.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
/**
* Reads a source that is in arff (attribute relation
* file format) format.
*
*
*
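* Typical batch usage (a minimal sketch; the file path is hypothetical):
*
* <pre>
* ArffLoader loader = new ArffLoader();
* loader.setFile(new File("/some/where/file.arff"));
* Instances data = loader.getDataSet();
* data.setClassIndex(data.numAttributes() - 1);
* </pre>
*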
* @author Mark Hall ([email protected])
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 11136 $
* @see Loader
*/
public class ArffLoader extends AbstractFileLoader implements BatchConverter,
IncrementalConverter, URLSourcedLoader {
/** for serialization */
static final long serialVersionUID = 2726929550544048587L;
/** the file extension */
public static String FILE_EXTENSION = Instances.FILE_EXTENSION;
/** the file extension for compressed files */
public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";
/** the url */
protected String m_URL = "http://";
/** The reader for the source file. */
protected transient Reader m_sourceReader = null;
/** The parser for the ARFF file */
protected transient ArffReader m_ArffReader = null;
/**
* Whether the values of string attributes should be retained in memory when
* reading incrementally
*/
protected boolean m_retainStringVals;
/**
* Reads data from an ARFF file, either in incremental or batch mode.
*
*
* Typical code for batch usage:
*
* <pre>
* BufferedReader reader =
*   new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader);
* Instances data = arff.getData();
* data.setClassIndex(data.numAttributes() - 1);
* </pre>
*
* Typical code for incremental usage:
*
* <pre>
* BufferedReader reader =
*   new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader, 1000);
* Instances data = arff.getStructure();
* data.setClassIndex(data.numAttributes() - 1);
* Instance inst;
* while ((inst = arff.readInstance(data)) != null) {
*   data.add(inst);
* }
* </pre>
*
* @author Eibe Frank ([email protected])
* @author Len Trigg ([email protected])
* @author fracpete (fracpete at waikato dot ac dot nz)
* @version $Revision: 11136 $
*/
public static class ArffReader implements RevisionHandler {
/** the tokenizer for reading the stream */
protected StreamTokenizer m_Tokenizer;
/** Buffer of values for sparse instance */
protected double[] m_ValueBuffer;
/** Buffer of indices for sparse instance */
protected int[] m_IndicesBuffer;
/** the indices of string attributes in the structure */
protected List<Integer> m_stringAttIndices;
/** the actual data */
protected Instances m_Data;
/** the number of lines read so far */
protected int m_Lines;
/** whether the data is read in batch mode */
protected boolean m_batchMode = true;
/**
* Whether the values for string attributes will accumulate in the header
* when reading incrementally
*/
protected boolean m_retainStringValues = false;
/** Field separator (single character string) to use instead of the defaults */
protected String m_fieldSeparator;
/** List of (single character) enclosures to use instead of the defaults */
protected List<String> m_enclosures;
/**
* Reads the data completely from the reader. The data can be accessed via
* the getData() method.
*
* @param reader the reader to use
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader) throws IOException {
m_retainStringValues = true;
m_batchMode = true;
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(1000);
initBuffers();
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
}
compactify();
}
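/**
* Reads only the header and reserves the specified space for instances,
* reading in batch mode; delegates to the three-argument constructor with
* batch set to true. Further instances can be read via readInstance().
*
* @param reader the reader to use
* @param capacity the capacity of the new dataset
* @throws IOException if something goes wrong
* @see #readInstance(Instances)
*/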
public ArffReader(Reader reader, int capacity) throws IOException {
this(reader, capacity, true);
}
/**
* Reads only the header and reserves the specified space for instances.
* Further instances can be read via readInstance().
*
* @param reader the reader to use
* @param capacity the capacity of the new dataset
* @param batch true if reading in batch mode
* @throws IOException if something goes wrong
* @throws IllegalArgumentException if capacity is negative
* @see #getStructure()
* @see #readInstance(Instances)
*/
public ArffReader(Reader reader, int capacity, boolean batch)
throws IOException {
m_batchMode = batch;
if (batch) {
m_retainStringValues = true;
}
if (capacity < 0) {
throw new IllegalArgumentException("Capacity has to be non-negative!");
}
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(capacity);
initBuffers();
}
/**
* Reads the data without header according to the specified template. The
* data can be accessed via the getData() method.
*
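* For example, to read tab-separated fields with single-quote enclosures
* (a sketch; reader and template are assumed to exist):
*
* <pre>
* ArffReader arff = new ArffReader(reader, template, 0, "\t", "'");
* </pre>
*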
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @param fieldSepAndEnclosures an optional array of Strings containing the
* field separator and enclosures to use instead of the defaults.
* The first entry in the array is expected to be the single
* character field separator to use; the remaining entries (if any)
* are enclosure characters to use.
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines,
String... fieldSepAndEnclosures) throws IOException {
this(reader, template, lines, 100, true, fieldSepAndEnclosures);
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
}
compactify();
}
/**
* Initializes the reader without reading the header according to the
* specified template. The data must be read via the
* readInstance() method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @param capacity the capacity of the new dataset
* @param fieldSepAndEnclosures an optional array of Strings containing the
* field separator and enclosures to use instead of the defaults.
* The first entry in the array is expected to be the single
* character field separator to use; the remaining entries (if any)
* are enclosure characters to use.
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines,
int capacity, String... fieldSepAndEnclosures) throws IOException {
this(reader, template, lines, capacity, false, fieldSepAndEnclosures);
}
/**
* Initializes the reader without reading the header according to the
* specified template. The data must be read via the
* readInstance() method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @param capacity the capacity of the new dataset
* @param batch true if the data is going to be read in batch mode
* @param fieldSepAndEnclosures an optional array of Strings containing the
* field separator and enclosures to use instead of the defaults.
* The first entry in the array is expected to be the single
* character field separator to use; the remaining entries (if any)
* are enclosure characters to use.
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines,
int capacity, boolean batch, String... fieldSepAndEnclosures)
throws IOException {
m_batchMode = batch;
if (batch) {
m_retainStringValues = true;
}
if (fieldSepAndEnclosures != null && fieldSepAndEnclosures.length > 0) {
if (fieldSepAndEnclosures[0] != null
&& fieldSepAndEnclosures[0].length() > 0) {
m_fieldSeparator = fieldSepAndEnclosures[0];
}
if (fieldSepAndEnclosures.length > 1) {
// the rest are assumed to be enclosure characters
m_enclosures = new ArrayList<String>();
for (int i = 1; i < fieldSepAndEnclosures.length; i++) {
if (fieldSepAndEnclosures[i] != null
&& fieldSepAndEnclosures[i].length() > 0) {
m_enclosures.add(fieldSepAndEnclosures[i]);
}
}
if (m_enclosures.size() == 0) {
m_enclosures = null;
}
}
}
m_Lines = lines;
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
m_Data = new Instances(template, capacity);
initBuffers();
}
/**
* initializes the buffers for sparse instances to be read
*
* @see #m_ValueBuffer
* @see #m_IndicesBuffer
*/
protected void initBuffers() {
m_ValueBuffer = new double[m_Data.numAttributes()];
m_IndicesBuffer = new int[m_Data.numAttributes()];
m_stringAttIndices = new ArrayList<Integer>();
if (m_Data.checkForStringAttributes()) {
for (int i = 0; i < m_Data.numAttributes(); i++) {
if (m_Data.attribute(i).isString()) {
m_stringAttIndices.add(i);
}
}
}
}
/**
* compactifies the data
*/
protected void compactify() {
if (m_Data != null) {
m_Data.compactify();
}
}
/**
* Throws error message with line number and last token read.
*
* @param msg the error message to be thrown
* @throws IOException containing the error message
*/
protected void errorMessage(String msg) throws IOException {
String str = msg + ", read " + m_Tokenizer.toString();
if (m_Lines > 0) {
int line = Integer.parseInt(str.replaceAll(".* line ", ""));
str = str.replaceAll(" line .*", " line " + (m_Lines + line - 1));
}
throw new IOException(str);
}
/**
* returns the current line number
*
* @return the current line number
*/
public int getLineNo() {
return m_Lines + m_Tokenizer.lineno();
}
/**
* Gets next token, skipping empty lines.
*
* @throws IOException if reading the next token fails
*/
protected void getFirstToken() throws IOException {
while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
}
if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (m_Tokenizer.sval.equals("?"))) {
m_Tokenizer.ttype = '?';
}
}
/**
* Gets index, checking for a premature end of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getIndex() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
}
/**
* Gets token and checks if it is the end of line.
*
* @param endOfFileOk whether EOF is OK
* @throws IOException if it doesn't find an end of line
*/
protected void getLastToken(boolean endOfFileOk) throws IOException {
if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL)
&& ((m_Tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
errorMessage("end of line expected");
}
}
/**
* Gets the value of an instance's weight (if one exists)
*
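* In an ARFF data row, a weight is an optional trailing "{w}" after the
* values, e.g.:
*
* <pre>
* sunny,85,85,FALSE,no, {2}
* </pre>
*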
* @return the value of the instance's weight, or NaN if no weight has been
* supplied in the file
* @throws IOException if reading from the tokenizer fails
*/
protected double getInstanceWeight() throws IOException {
double weight = Double.NaN;
m_Tokenizer.nextToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL
|| m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
return weight;
}
// see if we can read an instance weight
// m_Tokenizer.pushBack();
if (m_Tokenizer.ttype == '{') {
m_Tokenizer.nextToken();
String weightS = m_Tokenizer.sval;
// try to parse weight as a double
try {
weight = Double.parseDouble(weightS);
} catch (NumberFormatException e) {
// quietly ignore
return weight;
}
// see if we have the closing brace
m_Tokenizer.nextToken();
if (m_Tokenizer.ttype != '}') {
errorMessage("Problem reading instance weight");
}
}
return weight;
}
/**
* Gets next token, checking for a premature end of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getNextToken() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
} else if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (m_Tokenizer.sval.equals("?"))) {
m_Tokenizer.ttype = '?';
}
}
/**
* Initializes the StreamTokenizer used for reading the ARFF file.
*/
protected void initTokenizer() {
m_Tokenizer.resetSyntax();
m_Tokenizer.whitespaceChars(0, ' ');
m_Tokenizer.wordChars(' ' + 1, '\u00FF');
if (m_fieldSeparator != null) {
m_Tokenizer.whitespaceChars(m_fieldSeparator.charAt(0),
m_fieldSeparator.charAt(0));
} else {
m_Tokenizer.whitespaceChars(',', ',');
}
m_Tokenizer.commentChar('%');
if (m_enclosures != null && m_enclosures.size() > 0) {
for (String e : m_enclosures) {
m_Tokenizer.quoteChar(e.charAt(0));
}
} else {
m_Tokenizer.quoteChar('"');
m_Tokenizer.quoteChar('\'');
}
m_Tokenizer.ordinaryChar('{');
m_Tokenizer.ordinaryChar('}');
m_Tokenizer.eolIsSignificant(true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated in case
* of string or relational attributes
* @return the instance, or null if the end of the file has been reached
* @throws IOException if the information is not read successfully
*/
public Instance readInstance(Instances structure) throws IOException {
return readInstance(structure, true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated in case
* of string or relational attributes
* @param flag whether the method should check for an end of line after each instance
* @return the instance, or null if the end of the file has been reached
* @throws IOException if the information is not read successfully
*/
public Instance readInstance(Instances structure, boolean flag)
throws IOException {
return getInstance(structure, flag);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated in case
* of string or relational attributes
* @param flag whether the method should check for an end of line after each instance
* @return the instance, or null if the end of the file has been reached
* @throws IOException if the information is not read successfully
*/
protected Instance getInstance(Instances structure, boolean flag)
throws IOException {
m_Data = structure;
// Check if any attributes have been declared.
if (m_Data.numAttributes() == 0) {
errorMessage("no header information available");
}
// Check if end of file reached.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
return null;
}
// Parse instance
if (m_Tokenizer.ttype == '{') {
return getInstanceSparse(flag);
} else {
return getInstanceFull(flag);
}
}
/**
* Reads a single instance using the tokenizer and returns it.
*
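* A sparse row gives ordered index-value pairs inside braces, e.g.:
*
* <pre>
* {1 85, 3 FALSE, 4 no}
* </pre>
*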
* @param flag whether the method should check for an end of line after each instance
* @return the instance, or null if the end of the file has been reached
* @throws IOException if the information is not read successfully
*/
protected Instance getInstanceSparse(boolean flag) throws IOException {
int valIndex, numValues = 0, maxIndex = -1;
// If reading incrementally and we have string values, make sure that all
// string attributes are initialized to "0" with the dummy first value
if (!m_batchMode && !m_retainStringValues && m_stringAttIndices != null) {
for (int i = 0; i < m_stringAttIndices.size(); i++) {
m_Data.attribute(m_stringAttIndices.get(i)).setStringValue(
Attribute.DUMMY_STRING_VAL);
}
}
// Get values
do {
// Get index
getIndex();
if (m_Tokenizer.ttype == '}') {
break;
}
// Is index valid?
try {
m_IndicesBuffer[numValues] =
Integer.valueOf(m_Tokenizer.sval).intValue();
} catch (NumberFormatException e) {
errorMessage("index number expected");
}
if (m_IndicesBuffer[numValues] <= maxIndex) {
errorMessage("indices have to be ordered");
}
if ((m_IndicesBuffer[numValues] < 0)
|| (m_IndicesBuffer[numValues] >= m_Data.numAttributes())) {
errorMessage("index out of bounds");
}
maxIndex = m_IndicesBuffer[numValues];
// Get value
getNextToken();
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
m_ValueBuffer[numValues] = Utils.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
valIndex =
m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(
m_Tokenizer.sval);
if (valIndex == -1) {
errorMessage("nominal value not declared in header");
}
m_ValueBuffer[numValues] = valIndex;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try {
m_ValueBuffer[numValues] =
Double.valueOf(m_Tokenizer.sval).doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
if (m_batchMode || m_retainStringValues) {
m_ValueBuffer[numValues] =
m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(
m_Tokenizer.sval);
} else {
m_ValueBuffer[numValues] = 1;
m_Data.attribute(m_IndicesBuffer[numValues]).setStringValue(
Attribute.DUMMY_STRING_VAL);
m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(
m_Tokenizer.sval);
}
break;
case Attribute.DATE:
try {
m_ValueBuffer[numValues] =
m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(
m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff =
new ArffReader(new StringReader(m_Tokenizer.sval), m_Data
.attribute(m_IndicesBuffer[numValues]).relation(), 0);
Instances data = arff.getData();
m_ValueBuffer[numValues] =
m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(data);
} catch (Exception e) {
throw new IOException(e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage("unknown attribute type in column "
+ m_IndicesBuffer[numValues]);
}
}
numValues++;
} while (true);
double weight = 1.0;
if (flag) {
// check for an instance weight
weight = getInstanceWeight();
if (!Double.isNaN(weight)) {
getLastToken(true);
} else {
weight = 1.0;
}
}
// Add instance to dataset
double[] tempValues = new double[numValues];
int[] tempIndices = new int[numValues];
System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
Instance inst =
new SparseInstance(weight, tempValues, tempIndices,
m_Data.numAttributes());
inst.setDataset(m_Data);
return inst;
}
/**
* Reads a single instance using the tokenizer and returns it.
*
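* A full (dense) row gives one value per attribute, in attribute order, e.g.:
*
* <pre>
* sunny,85,85,FALSE,no
* </pre>
*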
* @param flag whether the method should check for an end of line after each instance
* @return the instance, or null if the end of the file has been reached
* @throws IOException if the information is not read successfully
*/
protected Instance getInstanceFull(boolean flag) throws IOException {
double[] instance = new double[m_Data.numAttributes()];
int index;
// Get values for all attributes.
for (int i = 0; i < m_Data.numAttributes(); i++) {
// Get next token
if (i > 0) {
getNextToken();
}
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
instance[i] = Utils.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(i).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval);
if (index == -1) {
errorMessage("nominal value not declared in header");
}
instance[i] = index;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try {
instance[i] = Double.valueOf(m_Tokenizer.sval).doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
if (m_batchMode || m_retainStringValues) {
instance[i] =
m_Data.attribute(i).addStringValue(m_Tokenizer.sval);
} else {
instance[i] = 0;
m_Data.attribute(i).setStringValue(m_Tokenizer.sval);
}
break;
case Attribute.DATE:
try {
instance[i] = m_Data.attribute(i).parseDate(m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff =
new ArffReader(new StringReader(m_Tokenizer.sval), m_Data
.attribute(i).relation(), 0);
Instances data = arff.getData();
instance[i] = m_Data.attribute(i).addRelation(data);
} catch (Exception e) {
throw new IOException(e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage("unknown attribute type in column " + i);
}
}
}
double weight = 1.0;
if (flag) {
// check for an instance weight
weight = getInstanceWeight();
if (!Double.isNaN(weight)) {
getLastToken(true);
} else {
weight = 1.0;
}
}
// Add instance to dataset
Instance inst = new DenseInstance(weight, instance);
inst.setDataset(m_Data);
return inst;
}
/**
* Reads and stores header of an ARFF file.
*
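* A minimal header might look as follows:
*
* <pre>
* &#64;relation weather
* &#64;attribute outlook {sunny, overcast, rainy}
* &#64;attribute temperature numeric
* &#64;data
* </pre>
*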
* @param capacity the number of instances to reserve in the data structure
* @throws IOException if the information is not read successfully
*/
protected void readHeader(int capacity) throws IOException {
m_Lines = 0;
String relationName = "";
// Get name of relation.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
getNextToken();
relationName = m_Tokenizer.sval;
getLastToken(false);
} else {
errorMessage("keyword " + Instances.ARFF_RELATION + " expected");
}
// Create vectors to hold information temporarily.
ArrayList<Attribute> attributes = new ArrayList<Attribute>();
// Get attribute declarations.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
}
// Check if data part follows. We can't easily check for EOL.
if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage("keyword " + Instances.ARFF_DATA + " expected");
}
// Check if any attributes have been declared.
if (attributes.size() == 0) {
errorMessage("no attributes declared");
}
m_Data = new Instances(relationName, attributes, capacity);
}
/**
* Parses the attribute declaration.
*
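* Examples of declarations this method accepts:
*
* <pre>
* &#64;attribute temperature numeric
* &#64;attribute description string
* &#64;attribute timestamp date "yyyy-MM-dd HH:mm:ss"
* &#64;attribute bag relational
*   &#64;attribute val numeric
* &#64;end bag
* </pre>
*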
* @param attributes the current attributes vector
* @return the new attributes vector
* @throws IOException if the information is not read successfully
*/
protected ArrayList<Attribute> parseAttribute(
ArrayList<Attribute> attributes) throws IOException {
String attributeName;
ArrayList<String> attributeValues;
// Get attribute name.
getNextToken();
attributeName = m_Tokenizer.sval;
getNextToken();
// Check if attribute is nominal.
if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, or string.
if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL)
|| m_Tokenizer.sval
.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER)
|| m_Tokenizer.sval
.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
attributes.add(new Attribute(attributeName, attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval
.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
attributes.add(new Attribute(attributeName, (ArrayList<String>) null,
attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval
.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
String format = null;
if (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD)
&& (m_Tokenizer.ttype != '\'') && (m_Tokenizer.ttype != '\"')) {
errorMessage("not a valid date format");
}
format = m_Tokenizer.sval;
readTillEOL();
} else {
m_Tokenizer.pushBack();
}
attributes
.add(new Attribute(attributeName, format, attributes.size()));
} else if (m_Tokenizer.sval
.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
readTillEOL();
// Read attributes for subrelation
// First, save current set of attributes
ArrayList<Attribute> atts = attributes;
attributes = new ArrayList<Attribute>();
// Now, read attributes until we hit end of declaration of relational
// value
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
do {
if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
} else if (Attribute.ARFF_END_SUBRELATION
.equalsIgnoreCase(m_Tokenizer.sval)) {
getNextToken();
if (!attributeName.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage("declaration of subrelation " + attributeName
+ " must be terminated by " + "@end " + attributeName);
}
break;
} else {
errorMessage("declaration of subrelation " + attributeName
+ " must be terminated by " + "@end " + attributeName);
}
} while (true);
// Make relation and restore original set of attributes
Instances relation = new Instances(attributeName, attributes, 0);
attributes = atts;
attributes.add(new Attribute(attributeName, relation, attributes
.size()));
} else {
errorMessage("no valid attribute type or invalid " + "enumeration");
}
} else {
// Attribute is nominal.
attributeValues = new ArrayList<String>();
m_Tokenizer.pushBack();
// Get values for nominal attribute.
if (m_Tokenizer.nextToken() != '{') {
errorMessage("{ expected at beginning of enumeration");
}
while (m_Tokenizer.nextToken() != '}') {
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL) {
errorMessage("} expected at end of enumeration");
} else {
attributeValues.add(m_Tokenizer.sval);
}
}
attributes.add(new Attribute(attributeName, attributeValues, attributes
.size()));
}
getLastToken(false);
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
return attributes;
}
/**
* Reads and skips all tokens before next end of line token.
*
* @throws IOException in case something goes wrong
*/
protected void readTillEOL() throws IOException {
while (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
}
m_Tokenizer.pushBack();
}
/**
* Returns the header format
*
* @return the header format
*/
public Instances getStructure() {
return new Instances(m_Data, 0);
}
/**
* Returns the data that was read
*
* @return the data
*/
public Instances getData() {
return m_Data;
}
/**
* Set whether to retain the values of string attributes in memory (in the
* header) when reading incrementally.
*
* @param retain true if string values are to be retained in memory when
* reading incrementally
*/
public void setRetainStringValues(boolean retain) {
m_retainStringValues = retain;
}
/**
* Get whether to retain the values of string attributes in memory (in the
* header) when reading incrementally.
*
* @return true if string values are to be retained in memory when reading
* incrementally
*/
public boolean getRetainStringValues() {
return m_retainStringValues;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11136 $");
}
}
/**
* Returns a string describing this Loader
*
* @return a description of the Loader suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Reads a source that is in arff (attribute relation file format) "
+ "format. ";
}
/**
* Tool tip text for this property
*
* @return the tool tip for this property
*/
public String retainStringValsTipText() {
return "If true then the values of string attributes are "
+ "retained in memory when reading incrementally. Leave this "
+ "set to false when using incremental classifiers in the "
+ "Knowledge Flow.";
}
/**
* Set whether to retain the values of string attributes in memory (in the
* header) when reading incrementally.
*
* @param retain true if string values are to be retained in memory when
* reading incrementally
*/
public void setRetainStringVals(boolean retain) {
m_retainStringVals = retain;
}
/**
* Get whether to retain the values of string attributes in memory (in the
* header) when reading incrementally.
*
* @return true if string values are to be retained in memory when reading
* incrementally
*/
public boolean getRetainStringVals() {
return m_retainStringVals;
}
/**
* Get the file extension used for arff files
*
* @return the file extension
*/
@Override
public String getFileExtension() {
return FILE_EXTENSION;
}
/**
* Gets all the file extensions used for this type of file
*
* @return the file extensions
*/
@Override
public String[] getFileExtensions() {
return new String[] { FILE_EXTENSION, FILE_EXTENSION_COMPRESSED };
}
/**
* Returns a description of the file type.
*
* @return a short file description
*/
@Override
public String getFileDescription() {
return "Arff data files";
}
/**
* Resets the Loader ready to read a new data set or the same data set again.
*
* @throws IOException if something goes wrong
*/
@Override
public void reset() throws IOException {
m_structure = null;
m_ArffReader = null;
setRetrieval(NONE);
if (m_File != null && !(new File(m_File).isDirectory())) {
setFile(new File(m_File));
} else if (m_URL != null && !m_URL.equals("http://")) {
setURL(m_URL);
}
}
/**
* Resets the Loader object and sets the source of the data set to be the
* supplied url.
*
* @param url the source url.
* @throws IOException if an error occurs
*/
public void setSource(URL url) throws IOException {
m_structure = null;
setRetrieval(NONE);
setSource(url.openStream());
m_URL = url.toString();
// make sure that the file is null so that any calls to
// reset() work properly
m_File = null;
}
/**
* get the File specified as the source
*
* @return the source file
*/
@Override
public File retrieveFile() {
return new File(m_File);
}
/**
* sets the source File
*
* @param file the source file
* @throws IOException if an error occurs
*/
@Override
public void setFile(File file) throws IOException {
m_File = file.getPath();
setSource(file);
}
/**
* Set the url to load from
*
* @param url the url to load from
* @throws IOException if the url can't be set.
*/
@Override
public void setURL(String url) throws IOException {
m_URL = url;
setSource(new URL(url));
}
/**
* Return the current url
*
* @return the current url
*/
@Override
public String retrieveURL() {
return m_URL;
}
/**
* Resets the Loader object and sets the source of the data set to be the
* supplied InputStream.
*
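* For example (a sketch; the URL is hypothetical):
*
* <pre>
* loader.setSource(new URL("http://example.org/data.arff"));
* </pre>
*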
* @param in the source InputStream.
* @throws IOException if an error occurs
*/
@Override
public void setSource(InputStream in) throws IOException {
m_File = (new File(System.getProperty("user.dir"))).getAbsolutePath();
m_URL = "http://";
m_sourceReader = new BufferedReader(new InputStreamReader(in));
}
/**
* Determines and returns (if possible) the structure (internally the header)
* of the data set as an empty set of instances.
*
* @return the structure of the data set as an empty set of Instances
* @throws IOException if an error occurs
*/
@Override
public Instances getStructure() throws IOException {
if (m_structure == null) {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
try {
m_ArffReader =
new ArffReader(m_sourceReader, 1, (getRetrieval() == BATCH));
m_ArffReader.setRetainStringValues(getRetainStringVals());
m_structure = m_ArffReader.getStructure();
} catch (Exception ex) {
throw new IOException("Unable to determine structure as arff (Reason: "
+ ex.toString() + ").");
}
}
return new Instances(m_structure, 0);
}
/**
* Return the full data set. If the structure hasn't yet been determined by a
* call to getStructure, this method determines it before processing the rest
* of the data set.
*
* @return the full data set as a set of Instances
* @throws IOException if there is no source or parsing fails
*/
@Override
public Instances getDataSet() throws IOException {
Instances insts = null;
try {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
if (getRetrieval() == INCREMENTAL) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(BATCH);
if (m_structure == null) {
getStructure();
}
// Read all instances
insts = new Instances(m_structure, 0);
Instance inst;
while ((inst = m_ArffReader.readInstance(m_structure)) != null) {
insts.add(inst);
}
// Instances readIn = new Instances(m_structure);
} finally {
if (m_sourceReader != null) {
// close the stream
m_sourceReader.close();
}
}
return insts;
}
/**
* Read the data set incrementally, i.e., get the next instance in the data
* set, or null if there are no more instances to get. If the structure hasn't
* yet been determined by a call to getStructure, this method determines it
* before returning the next instance in the data set.
*
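* A minimal incremental sketch (the file path is hypothetical):
*
* <pre>
* ArffLoader loader = new ArffLoader();
* loader.setFile(new File("/some/where/file.arff"));
* Instances structure = loader.getStructure();
* Instance inst;
* while ((inst = loader.getNextInstance(structure)) != null) {
*   // process inst here
* }
* </pre>
*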
* @param structure the dataset header information, will get updated in case
* of string or relational attributes
* @return the next instance in the data set as an Instance object or null if
* there are no more instances to be read
* @throws IOException if there is an error during parsing
*/
@Override
public Instance getNextInstance(Instances structure) throws IOException {
m_structure = structure;
if (getRetrieval() == BATCH) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(INCREMENTAL);
Instance current = null;
if (m_sourceReader != null) {
current = m_ArffReader.readInstance(m_structure);
}
if ((m_sourceReader != null) && (current == null)) {
try {
// close the stream
m_sourceReader.close();
m_sourceReader = null;
// reset();
} catch (Exception ex) {
ex.printStackTrace();
}
}
return current;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11136 $");
}
/**
* Main method.
*
* @param args should contain the name of an input file.
*/
public static void main(String[] args) {
runFileLoader(new ArffLoader(), args);
}
}