All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.datagenerators.DataGenerator Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 * DataGenerator.java
 * Copyright (C) 2005-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators;

import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Random;
import java.util.Vector;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Randomizable;
import weka.core.RevisionHandler;
import weka.core.Utils;

/**
 * Abstract superclass for data generators that generate data for classifiers
 * and clusterers.
 * 
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 15437 $
 */
public abstract class DataGenerator implements OptionHandler, Randomizable,
  Serializable, RevisionHandler {

  /** for serialization */
  private static final long serialVersionUID = -3698585946221802578L;

  /** Debugging mode */
  protected boolean m_Debug = false;

  /** The format for the generated dataset */
  protected Instances m_DatasetFormat = null;

  /** Relation name specified by the user (relation name will be auto-generated if empty) */
  protected String m_RelationName = "";

  /**
   * Number of instances that should be produced into the dataset this number is
   * by default m_NumExamples, but can be reset by the generator
   */
  protected int m_NumExamplesAct;

  /** default output (stdout) */
  protected transient PrintWriter m_DefaultOutput = new PrintWriter(
    new java.io.OutputStreamWriter(System.out));

  /** PrintWriter for outputting the generated data */
  protected transient PrintWriter m_Output = m_DefaultOutput;

  /** random number generator seed */
  protected int m_Seed;

  /** random number generator */
  protected Random m_Random = null;

  /** This flag is no longer used (left here to maintain compatibility for serialization) */
  protected boolean m_CreatingRelationName = false;

  /**
   * a black list for options not to be listed (for derived generators) in the
   * makeOptionString method
   * 
   * @see #makeOptionString(DataGenerator)
   */
  protected static HashSet m_OptionBlacklist;
  static {
    m_OptionBlacklist = new HashSet();
  }

  /**
   * initializes with default settings. 
* Note: default values are set via a default<name> method. These * default methods are also used in the listOptions method and in the * setOptions method. Why? Derived generators can override the return value of * these default methods, to avoid exceptions. */ public DataGenerator() { clearBlacklist(); setNumExamplesAct(defaultNumExamplesAct()); setSeed(defaultSeed()); } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ @Override public Enumeration

* * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String tmpStr; // remove unwanted options options = removeBlacklist(options); tmpStr = Utils.getOption('r', options); if (tmpStr.length() != 0) { setRelationName(Utils.unquote(tmpStr)); } else { setRelationName(""); } tmpStr = Utils.getOption('o', options); if (tmpStr.length() != 0) { setOutput(new PrintWriter(new FileOutputStream(tmpStr))); } else if (getOutput() == null) { throw new Exception("No Output defined!"); } setDebug(Utils.getFlag('d', options)); tmpStr = Utils.getOption('S', options); if (tmpStr.length() != 0) { setSeed(Integer.parseInt(tmpStr)); } else { setSeed(defaultSeed()); } } /** * Gets the current settings of the datagenerator RDG1. Removing of * blacklisted options has to be done in the derived class, that defines the * blacklist-entry. * * @return an array of strings suitable for passing to setOptions * @see #removeBlacklist(String[]) */ @Override public String[] getOptions() { Vector result = new Vector(); if (getRelationName().length() > 0) { result.add("-r"); result.add(Utils.quote(getRelationName())); } if (getDebug()) { result.add("-d"); } result.add("-S"); result.add("" + getSeed()); return result.toArray(new String[result.size()]); } /** * Constructs the Instances object representing the format of the generated data. * * This default implementation simply returns the Instances object that holds the dataset format * currently stored in m_DatasetFormat. * * @return the format for the dataset * @throws Exception if the generating of the format failed * @see #defaultRelationName() */ public Instances defineDataFormat() throws Exception { return m_DatasetFormat; } /** * Generates one example of the dataset. * * @return the generated example * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExamples which * means in non single mode */ public abstract Instance generateExample() throws Exception; /** * Generates all examples of the dataset. * * @return the generated dataset * @throws Exception if the format of the dataset is not yet defined * @throws Exception if the generator only works with generateExample, which * means in single mode */ public abstract Instances generateExamples() throws Exception; /** * Generates a comment string that documentates the data generator. By default * this string is added at the beginning of the produced output as ARFF file * type, next after the options. * * @return string contains info about the generated rules * @throws Exception if the generating of the documentation fails */ public abstract String generateStart() throws Exception; /** * Generates a comment string that documentates the data generator. By default * this string is added at the end of the produced output as ARFF file type. * * @return string contains info about the generated rules * @throws Exception if the generating of the documentation fails */ public abstract String generateFinished() throws Exception; /** * Return if single mode is set for the given data generator mode depends on * option setting and or generator type. * * @return single mode flag * @throws Exception if mode is not set yet */ public abstract boolean getSingleModeFlag() throws Exception; /** * Sets the debug flag. * * @param debug the new debug flag */ public void setDebug(boolean debug) { m_Debug = debug; } /** * Gets the debug flag. * * @return the debug flag */ public boolean getDebug() { return m_Debug; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String debugTipText() { return "Whether the generator is run in debug mode or not."; } /** * Sets the relation name the dataset should have. * * @param relationName the new relation name */ public void setRelationName(String relationName) { m_RelationName = relationName; } /** * returns a relation name based on the options * * @return a relation name based on the options */ protected String defaultRelationName() { StringBuffer result; String[] options; String option; int i; result = new StringBuffer(this.getClass().getName()); options = getOptions(); for (i = 0; i < options.length; i++) { option = options[i].trim(); if (i > 0) { result.append("_"); } result.append(option.replaceAll(" ", "_")); } return result.toString(); } /** * returns the relation name to use, i.e., in case the currently set relation * name is empty, a generic one is returned. Must be used in * defineDataFormat() * * @return the relation name * @see #defaultRelationName() * @see #defineDataFormat() */ protected String getRelationNameToUse() { String result; result = getRelationName(); if (result.length() == 0) { result = defaultRelationName(); } return result; } /** * Gets the relation name the dataset should have. * * @return the relation name the dataset should have */ public String getRelationName() { return m_RelationName; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String relationNameTipText() { return "The relation name of the generated data (if empty, a generic one will be supplied)."; } /** * returns the default number of actual examples * * @return the default number of actual examples */ protected int defaultNumExamplesAct() { return 0; } /** * Sets the number of examples the dataset should have. * * @param numExamplesAct the new number of examples */ protected void setNumExamplesAct(int numExamplesAct) { m_NumExamplesAct = numExamplesAct; } /** * Gets the number of examples the dataset should have. * * @return the number of examples the dataset should have */ public int getNumExamplesAct() { return m_NumExamplesAct; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ protected String numExamplesActTipText() { return "The actual number of examples to generate."; } /** * Sets the print writer. * * @param newOutput the new print writer */ public void setOutput(PrintWriter newOutput) { m_Output = newOutput; m_DefaultOutput = null; } /** * Gets the print writer. * * @return print writer object */ public PrintWriter getOutput() { return m_Output; } /** * Gets writer, which is used for outputting to stdout. A workaround for the * problem of closing stdout when closing the associated Printwriter. * * @return writer object */ public PrintWriter defaultOutput() { return m_DefaultOutput; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String outputTipText() { return "The output writer to use for printing the generated data."; } /** * Sets the format of the dataset that is to be generated. * * @param newFormat the new dataset format of the dataset */ public void setDatasetFormat(Instances newFormat) { m_DatasetFormat = new Instances(newFormat, 0); } /** * Gets the format of the dataset that is to be generated. * * @return the dataset format of the dataset */ public Instances getDatasetFormat() { if (m_DatasetFormat != null) { return new Instances(m_DatasetFormat, 0); } else { return null; } } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String formatTipText() { return "The data format to use."; } /** * returns the default seed * * @return the default seed */ protected int defaultSeed() { return 1; } /** * Gets the random number seed. * * @return the random number seed. */ @Override public int getSeed() { return m_Seed; } /** * Sets the random number seed. * * @param newSeed the new random number seed. */ @Override public void setSeed(int newSeed) { m_Seed = newSeed; m_Random = new Random(newSeed); } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String seedTipText() { return "The seed value for the random number generator."; } /** * Gets the random generator. * * @return the random generator */ public Random getRandom() { if (m_Random == null) { m_Random = new Random(getSeed()); } return m_Random; } /** * Sets the random generator. * * @param newRandom is the random generator. */ public void setRandom(Random newRandom) { m_Random = newRandom; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String randomTipText() { return "The random number generator to use."; } /** * Returns a string representing the dataset in the instance queue. * * @return the string representing the output data format */ protected String toStringFormat() { if (m_DatasetFormat == null) { return ""; } return m_DatasetFormat.toString(); } /** * removes all entries from the options blacklist */ protected static void clearBlacklist() { m_OptionBlacklist.clear(); } /** * adds the given option, e.g., for "-V" use "V", to the blacklist of options * that are not to be output via the makeOptionString method * * @param option the option to exclude from listing * @see #makeOptionString(DataGenerator) */ protected static void addToBlacklist(String option) { m_OptionBlacklist.add(option); } /** * checks, whether the given option is in the blacklist of options not to be * output by makeOptionString * * @param option the option to check * @return true if the option is on the blacklist * @see #makeOptionString(DataGenerator) */ protected static boolean isOnBlacklist(String option) { return m_OptionBlacklist.contains(option); } /** * removes all the options from the options array that are blacklisted * * @param options the options to remove from the blacklist * @return the processed options array */ protected String[] removeBlacklist(String[] options) { Hashtable pool; Option option; // retrieve options that are on blacklist Enumeration





© 2015 - 2024 Weber Informatics LLC | Privacy Policy