weka.core.converters.ConverterUtils (weka-dev)
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* ConverterUtils.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.WekaPackageClassLoaderManager;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;
/**
* Utility routines for the converter package.
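*
* <p>
* The nested {@link DataSource} and {@link DataSink} helpers can be combined
* for simple batch conversion; a minimal sketch (file names are illustrative,
* and a converter must be registered for the input extension):
*
* <pre>{@code
* Instances data = DataSource.read("/some/where/input.csv");
* DataSink.write("/some/where/output.arff", data);
* }</pre>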
*
* @author Mark Hall ([email protected])
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 15656 $
* @see Serializable
*/
public class ConverterUtils implements Serializable, RevisionHandler {
/** for serialization. */
static final long serialVersionUID = -2460855349276148760L;
/**
* Helper class for loading data from files and URLs. Via the ConverterUtils
* class it determines which converter to use for loading the data into
* memory. If the chosen converter is an incremental one, then the data will
* be loaded incrementally, otherwise as batch. In both cases the same
* interface will be used (<code>hasMoreElements</code>,
* <code>nextElement</code>). Before the data can be read again, one has to
* call the <code>reset</code> method. The data source can also be initialized
* with an Instances object, in order to provide a unified interface to files
* and already loaded datasets.
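*
* <p>
* A minimal reading sketch, using only the methods of this class (the file
* name is illustrative):
*
* <pre>{@code
* DataSource source = new DataSource("/some/where/data.arff");
* Instances structure = source.getStructure();
* while (source.hasMoreElements(structure)) {
*   Instance inst = source.nextElement(structure);
*   // process inst ...
* }
* }</pre>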
*
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 15656 $
* @see #hasMoreElements(Instances)
* @see #nextElement(Instances)
* @see #reset()
* @see DataSink
*/
public static class DataSource implements Serializable, RevisionHandler {
/** for serialization. */
private static final long serialVersionUID = -613122395928757332L;
/** the file to load. */
protected File m_File;
/** the URL to load. */
protected URL m_URL;
/** the loader. */
protected Loader m_Loader;
/** whether the loader is incremental. */
protected boolean m_Incremental;
/** the instance counter for the batch case. */
protected int m_BatchCounter;
/** the last internally read instance. */
protected Instance m_IncrementalBuffer;
/** the batch buffer. */
protected Instances m_BatchBuffer;
/**
* Tries to load the data from the file. Can be either a regular file or a
* web location (http://, https://, ftp:// or file://).
*
* @param location the name of the file to load
* @throws Exception if initialization fails
*/
public DataSource(String location) throws Exception {
super();
// file or URL?
if (location.startsWith("http://") || location.startsWith("https://")
|| location.startsWith("ftp://") || location.startsWith("file://")) {
m_URL = new URL(location);
} else {
m_File = new File(location);
}
// quick check: is it ARFF?
if (isArff(location)) {
m_Loader = new ArffLoader();
} else {
if (m_File != null) {
m_Loader = ConverterUtils.getLoaderForFile(location);
} else {
m_Loader = ConverterUtils.getURLLoaderForFile(location);
}
// do we have a converter?
if (m_Loader == null) {
throw new IllegalArgumentException(
"No suitable converter found for '" + location + "'!");
}
}
// incremental loader?
m_Incremental = (m_Loader instanceof IncrementalConverter);
reset();
}
/**
* Initializes the datasource with the given dataset.
*
* @param inst the dataset to use
*/
public DataSource(Instances inst) {
super();
m_BatchBuffer = inst;
m_Loader = null;
m_File = null;
m_URL = null;
m_Incremental = false;
}
/**
* Initializes the datasource with the given Loader.
*
* @param loader the Loader to use
*/
public DataSource(Loader loader) {
super();
m_BatchBuffer = null;
m_Loader = loader;
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* Initializes the datasource with the given input stream. This stream is
* always interpreted as ARFF.
*
* @param stream the stream to use
*/
public DataSource(InputStream stream) {
super();
m_BatchBuffer = null;
m_Loader = new ArffLoader();
try {
m_Loader.setSource(stream);
} catch (Exception e) {
m_Loader = null;
}
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* initializes the batch buffer if necessary, i.e., for non-incremental
* loaders.
*/
protected void initBatchBuffer() {
try {
if (!isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* returns whether the extension of the location is likely to be of ARFF
* format, i.e., ending in ".arff" or ".arff.gz" (case-insensitive).
*
* @param location the file location to check
* @return true if the location seems to be of ARFF format
*/
public static boolean isArff(String location) {
if (location.toLowerCase().endsWith(
ArffLoader.FILE_EXTENSION.toLowerCase())
|| location.toLowerCase().endsWith(
ArffLoader.FILE_EXTENSION_COMPRESSED.toLowerCase())) {
return true;
} else {
return false;
}
}
/**
* returns whether the loader is an incremental one.
*
* @return true if the loader is a true incremental one
*/
public boolean isIncremental() {
return m_Incremental;
}
/**
* returns the determined loader, null if the DataSource was initialized
* with data alone and not a file/URL.
*
* @return the loader used for retrieving the data
*/
public Loader getLoader() {
return m_Loader;
}
/**
* returns the full dataset, can be null in case of an error.
*
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet() throws Exception {
Instances result;
result = null;
// reset the loader
reset();
try {
if (m_BatchBuffer == null) {
result = m_Loader.getDataSet();
} else {
result = m_BatchBuffer;
}
} catch (Exception e) {
e.printStackTrace();
result = null;
}
return result;
}
/**
* returns the full dataset with the specified class index set, can be null
* in case of an error.
*
* @param classIndex the class index for the dataset
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet(int classIndex) throws Exception {
Instances result;
result = getDataSet();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* resets the loader.
*
* @throws Exception if resetting fails
*/
public void reset() throws Exception {
if (m_File != null) {
((AbstractFileLoader) m_Loader).setFile(m_File);
} else if (m_URL != null) {
((URLSourcedLoader) m_Loader).setURL(m_URL.toString());
} else if (m_Loader != null) {
m_Loader.reset();
}
m_BatchCounter = 0;
m_IncrementalBuffer = null;
if (m_Loader != null) {
if (!isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
}
}
/**
* returns the structure of the data.
*
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure() throws Exception {
if (m_BatchBuffer == null) {
return m_Loader.getStructure();
} else {
return new Instances(m_BatchBuffer, 0);
}
}
/**
* returns the structure of the data, with the defined class index.
*
* @param classIndex the class index for the dataset
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure(int classIndex) throws Exception {
Instances result;
result = getStructure();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* returns whether there are more Instance objects in the data.
*
* @param structure the structure of the dataset
* @return true if there are more Instance objects available
* @see #nextElement(Instances)
*/
public boolean hasMoreElements(Instances structure) {
boolean result;
result = false;
if (isIncremental()) {
// user still hasn't collected the last one?
if (m_IncrementalBuffer != null) {
result = true;
} else {
try {
m_IncrementalBuffer = m_Loader.getNextInstance(structure);
result = (m_IncrementalBuffer != null);
} catch (Exception e) {
e.printStackTrace();
result = false;
}
}
} else {
result = (m_BatchCounter < m_BatchBuffer.numInstances());
}
return result;
}
/**
* returns the next element and sets the specified dataset, null if none
* available.
*
* @param dataset the dataset to set for the instance
* @return the next Instance
*/
public Instance nextElement(Instances dataset) {
Instance result;
result = null;
if (isIncremental()) {
// is there still an instance in the buffer?
if (m_IncrementalBuffer != null) {
result = m_IncrementalBuffer;
m_IncrementalBuffer = null;
} else {
try {
result = m_Loader.getNextInstance(dataset);
} catch (Exception e) {
e.printStackTrace();
result = null;
}
}
} else {
if (m_BatchCounter < m_BatchBuffer.numInstances()) {
result = m_BatchBuffer.instance(m_BatchCounter);
m_BatchCounter++;
}
}
if (result != null) {
result.setDataset(dataset);
}
return result;
}
/**
* convenience method for loading a dataset in batch mode.
*
* @param location the dataset to load
* @return the dataset
* @throws Exception if loading fails
*/
public static Instances read(String location) throws Exception {
DataSource source;
Instances result;
source = new DataSource(location);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode from a stream.
*
* @param stream the stream to load the dataset from
* @return the dataset
* @throws Exception if loading fails
*/
public static Instances read(InputStream stream) throws Exception {
DataSource source;
Instances result;
source = new DataSource(stream);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode.
*
* @param loader the loader to get the dataset from
* @return the dataset
* @throws Exception if loading fails
*/
public static Instances read(Loader loader) throws Exception {
DataSource source;
Instances result;
source = new DataSource(loader);
result = source.getDataSet();
return result;
}
/**
* for testing only - takes a data file as input.
*
* @param args the commandline arguments
* @throws Exception if something goes wrong
*/
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("\nUsage: " + DataSource.class.getName()
+ " \n");
System.exit(1);
}
DataSource loader = new DataSource(args[0]);
System.out.println("Incremental? " + loader.isIncremental());
System.out.println("Loader: " + loader.getLoader().getClass().getName());
System.out.println("Data:\n");
Instances structure = loader.getStructure();
System.out.println(structure);
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(structure));
}
Instances inst = loader.getDataSet();
loader = new DataSource(inst);
System.out.println("\n\nProxy-Data:\n");
System.out.println(loader.getStructure());
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(inst));
}
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 15656 $");
}
}
/**
* Helper class for saving data to files. Via the ConverterUtils class it
* determines which converter to use for saving the data. It is the logical
* counterpart to <code>DataSource</code>.
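*
* <p>
* A minimal usage sketch (the file names are illustrative):
*
* <pre>{@code
* Instances data = DataSource.read("/some/where/input.arff");
* DataSink sink = new DataSink("/some/where/output.arff");
* sink.write(data);
* }</pre>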
*
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 15656 $
* @see DataSource
*/
public static class DataSink implements Serializable, RevisionHandler {
/** for serialization. */
private static final long serialVersionUID = -1504966891136411204L;
/** the saver to use for storing the data. */
protected Saver m_Saver = null;
/** the stream to store the data in (always in ARFF format). */
protected OutputStream m_Stream = null;
/**
* initializes the sink to save the data to the given file.
*
* @param filename the file to save data to
* @throws Exception if set of saver fails
*/
public DataSink(String filename) throws Exception {
m_Stream = null;
if (DataSource.isArff(filename)) {
m_Saver = new ArffSaver();
} else {
m_Saver = getSaverForFile(filename);
}
((AbstractFileSaver) m_Saver).setFile(new File(filename));
}
/**
* initializes the sink to save the data to the given Saver (expected to be
* fully configured).
*
* @param saver the saver to use for saving the data
*/
public DataSink(Saver saver) {
m_Saver = saver;
m_Stream = null;
}
/**
* initializes the sink to save the data in the stream (always in ARFF
* format).
*
* @param stream the output stream to use for storing the data in ARFF
* format
*/
public DataSink(OutputStream stream) {
m_Saver = null;
m_Stream = stream;
}
/**
* writes the given data either via the saver or to the defined output
* stream (depending on the constructor). In case of the stream, the stream
* is only flushed, but not closed.
*
* @param data the data to save
* @throws Exception if saving fails
*/
public void write(Instances data) throws Exception {
if (m_Saver != null) {
m_Saver.setInstances(data);
m_Saver.writeBatch();
} else {
m_Stream.write(data.toString().getBytes());
m_Stream.flush();
}
}
/**
* writes the data to the given file.
*
* @param filename the file to write the data to
* @param data the data to store
* @throws Exception if writing fails
*/
public static void write(String filename, Instances data) throws Exception {
DataSink sink;
sink = new DataSink(filename);
sink.write(data);
}
/**
* writes the data via the given saver.
*
* @param saver the saver to use for writing the data
* @param data the data to store
* @throws Exception if writing fails
*/
public static void write(Saver saver, Instances data) throws Exception {
DataSink sink;
sink = new DataSink(saver);
sink.write(data);
}
/**
* writes the data to the given stream (always in ARFF format).
*
* @param stream the stream to write the data to (ARFF format)
* @param data the data to store
* @throws Exception if writing fails
*/
public static void write(OutputStream stream, Instances data)
throws Exception {
DataSink sink;
sink = new DataSink(stream);
sink.write(data);
}
/**
* for testing only - takes a data file as input and a data file for the
* output.
*
* @param args the commandline arguments
* @throws Exception if something goes wrong
*/
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.out.println("\nUsage: " + DataSource.class.getName()
+ " \n");
System.exit(1);
}
// load data
Instances data = DataSource.read(args[0]);
// save data
DataSink.write(args[1], data);
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 15656 $");
}
}
/** all available loaders (extension <-> classname). */
protected static Hashtable<String, List<String>> m_FileLoaders;
/** all available URL loaders (extension <-> classname). */
protected static Hashtable<String, List<String>> m_URLFileLoaders;
/** all available savers (extension <-> classname). */
protected static Hashtable<String, List<String>> m_FileSavers;
// determine all loaders/savers
static {
initialize();
}
/**
* Initializes (or re-initializes) the lookup tables of available file
* loaders, URL loaders and savers.
*/
public static void initialize() {
ConverterResources.initialize();
m_FileLoaders = ConverterResources.getFileLoaders();
m_URLFileLoaders = ConverterResources.getURLFileLoaders();
m_FileSavers = ConverterResources.getFileSavers();
}
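// A lookup sketch: the public helpers getLoaderForFile and getSaverForFile
// (used by the DataSource/DataSink classes above) resolve a converter from
// these tables by file extension; the return values are assumed to be
// assignable to Loader/Saver, e.g.:
//
//   Loader loader = ConverterUtils.getLoaderForFile("some.csv");
//   Saver saver = ConverterUtils.getSaverForFile("some.arff");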
/**
* Gets token, skipping empty lines.
*
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getFirstToken(StreamTokenizer tokenizer)
throws IOException {
StreamTokenizerUtils.getFirstToken(tokenizer);
}
/**
* Gets token.
*
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getToken(StreamTokenizer tokenizer) throws IOException {
StreamTokenizerUtils.getToken(tokenizer);
}
/**
* Throws error message with line number and last token read.
*
* @param theMsg the error message to be thrown
* @param tokenizer the stream tokenizer
* @throws IOException containing the error message
*/
public static void errms(StreamTokenizer tokenizer, String theMsg)
throws IOException {
throw new IOException(theMsg + ", read " + tokenizer.toString());
}
/**
* returns a vector with the classnames of all the loaders from the given
* hashtable.
*
* @param ht the hashtable with the extension/converter relation
* @return the classnames of the loaders
*/
protected static Vector<String> getConverters(Hashtable<String, List<String>> ht) {
Vector<String> result;
Enumeration<List<String>> enm;
List<String> convs;
result = new Vector<String>();
// get all classnames
enm = ht.elements();
while (enm.hasMoreElements()) {
convs = enm.nextElement();
for (String converter: convs) {
if (!result.contains(converter)) {
result.add(converter);
}
}
}
// sort names
Collections.sort(result);
return result;
}
/**
* tries to determine the converter to use for this kind of file, returns null
* if none can be found in the given hashtable.
*
* @param filename the file to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForFile(String filename,
Hashtable<String, List<String>> ht) {
Object result;
String extension;
int index;
result = null;
index = filename.lastIndexOf('.');
if (index > -1) {
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
// is it a compressed format?
if (extension.equals(".gz") && result == null) {
index = filename.lastIndexOf('.', index - 1);
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
}
}
return result;
}
/**
* tries to determine the loader to use for this kind of extension, returns
* null if none can be found.
*
* @param extension the file extension to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForExtension(String extension,
Hashtable<String, List<String>> ht) {
Object result;
List<String> classnames;
String classname;
result = null;
classnames = ht.get(extension);
if ((classnames != null) && !classnames.isEmpty()) {
try {
result = WekaPackageClassLoaderManager.forName(classnames.get(0)).newInstance();
} catch (Exception e) {
result = null;
e.printStackTrace();
}
}
return result;
}
/**
* tries to determine the converter to use for this kind of file, returns null
* if none can be found in the given hashtable.
*
* @param filename the file to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static List