weka.experiment.ExplicitTestsetResultProducer Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ExplicitTestsetResultProducer.java
 *    Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.experiment;

import java.io.File;
import java.util.Calendar;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.TimeZone;
import java.util.Vector;

import weka.core.AdditionalMeasureProducer;
import weka.core.Environment;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.WekaException;
import weka.core.converters.ConverterUtils.DataSource;

/**
 *  Loads the external test set and calls the
 * appropriate SplitEvaluator to generate some results.

 * The filename of the test set is constructed as follows:

 * <dir> + / + <prefix> + <relation-name> + <suffix>

 * The relation-name can be modified by using the regular expression to replace
 * the matching sub-string with a specified replacement string. In order to get
 * rid of the string that the Weka filters add to the end of the relation name,
 * just use '.*-weka' as the regular expression to find.

 * The suffix determines the type of file to load, i.e., one is not restricted
 * to ARFF files. As long as Weka recognizes the extension specified in the
 * suffix, the data will be loaded with one of Weka's converters.
 * 
 * 
 * 
 *  Valid options are:
 * 

 * 
 * 
 * -D
 * Save raw split evaluator output.
 * 
 * 
 *  * -O <file/directory name/path>
 *  The filename where raw output will be stored.
 *  If a directory name is specified then individual
 *  outputs will be gzipped, otherwise all output will be
 *  zipped to the named file. Use in conjunction with -D.
 *  (default: splitEvalutorOut.zip)
 * 
 * 
 *  * -W <class name>
 *  The full class name of a SplitEvaluator.
 *  eg: weka.experiment.ClassifierSplitEvaluator
 * 
 * 
 *  * -R
 *  Set when data is to be randomized.
 * 
 * 
 *  * -dir <directory>
 *  The directory containing the test sets.
 *  (default: current directory)
 * 
 * 
 *  * -prefix <string>
 *  An optional prefix for the test sets (before the relation name).
 * (default: empty string)
 * 
 * 
 *  * -suffix <string>
 *  The suffix to append to the test set.
 *  (default: _test.arff)
 * 
 * 
 *  * -find <regular expression>
 *  The regular expression to search the relation name with.
 *  Not used if an empty string.
 *  (default: empty string)
 * 
 * 
 *  * -replace <string>
 *  The replacement string for all the matches of '-find'.
 *  (default: empty string)
 * 
 * 
 *  * Options specific to split evaluator weka.experiment.ClassifierSplitEvaluator:
 * 
 * 
 *  * -W <class name>
 *  The full class name of the classifier.
 *  eg: weka.classifiers.bayes.NaiveBayes
 * 
 * 
 *  * -C <index>
 *  The index of the class for which IR statistics
 *  are to be output. (default 1)
 * 
 * 
 *  * -I <index>
 *  The index of an attribute to output in the
 *  results. This attribute should identify an
 *  instance in order to know which instances are
 *  in the test set of a cross validation. if 0
 *  no output (default 0).
 * 
 * 
 *  * -P
 *  Add target and prediction columns to the result
 *  for each fold.
 * 
 * 
 *  * Options specific to classifier weka.classifiers.rules.ZeroR:
 * 
 * 
 *  * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * 
 * 
 * 
 * 
 * All options after -- will be passed to the split evaluator.
 * 
 * @author Len Trigg ([email protected])
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 10203 $
 */
public class ExplicitTestsetResultProducer implements ResultProducer,
  OptionHandler, AdditionalMeasureProducer, RevisionHandler {

  /** Version identifier used for serialization. */
  private static final long serialVersionUID = 2613585409333652530L;

  /** The default test set suffix; initial value of {@link #m_TestsetSuffix}. */
  public final static String DEFAULT_SUFFIX = "_test.arff";

  /** The dataset of interest. */
  protected Instances m_Instances;

  /** The ResultListener to send results to (a CSVResultListener by default). */
  protected ResultListener m_ResultListener = new CSVResultListener();

  /**
   * The directory containing all the test sets. Initialized from the
   * "user.dir" system property.
   */
  protected File m_TestsetDir = new File(System.getProperty("user.dir"));

  /** The prefix for all the test sets (empty by default). */
  protected String m_TestsetPrefix = "";

  /** The suffix for all the test sets (DEFAULT_SUFFIX by default). */
  protected String m_TestsetSuffix = DEFAULT_SUFFIX;

  /** The regular expression to search for in the relation name. */
  protected String m_RelationFind = "";

  /** The string to use to replace the matches of the regular expression. */
  protected String m_RelationReplace = "";

  /** Whether dataset is to be randomized (off by default). */
  protected boolean m_randomize = false;

  /**
   * The SplitEvaluator used to generate results (a ClassifierSplitEvaluator
   * by default).
   */
  protected SplitEvaluator m_SplitEvaluator = new ClassifierSplitEvaluator();

  /** The names of any additional measures to look for in SplitEvaluators. */
  protected String[] m_AdditionalMeasures = null;

  /** Save raw output of split evaluators --- for debugging purposes. */
  protected boolean m_debugOutput = false;

  /** The output zipper to use for saving raw splitEvaluator output. */
  protected OutputZipper m_ZipDest = null;

  /**
   * The destination output file/directory for raw output. Defaults to
   * "splitEvalutorOut.zip" (spelling as in the literal below) inside the
   * directory named by the "user.dir" system property.
   */
  protected File m_OutputFile = new File(new File(
    System.getProperty("user.dir")), "splitEvalutorOut.zip");

  /** The name of the key field containing the dataset name. */
  public static String DATASET_FIELD_NAME = "Dataset";

  /** The name of the key field containing the run number. */
  public static String RUN_FIELD_NAME = "Run";

  /** The name of the result field containing the timestamp. */
  public static String TIMESTAMP_FIELD_NAME = "Date_time";

  /**
   * Environment instance; declared transient, so it is skipped by default
   * Java serialization. NOTE(review): presumably used for resolving
   * environment variables in option values -- confirm against its usage.
   */
  protected transient Environment m_env;

  /**
   * Returns a string describing this result producer.
   * 
   * The description includes the pattern from which the test set filename is
   * built: {@code <dir> + / + <prefix> + <relation-name> + <suffix>}.
   * 
   * @return a description of the result producer suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Loads the external test set and calls the appropriate "
      + "SplitEvaluator to generate some results.\n"
      + "The filename of the test set is constructed as follows:\n"
      // The placeholder names must be spelled out here; without them the
      // line degenerates to a meaningless "+ / + + +".
      + "   <dir> + / + <prefix> + <relation-name> + <suffix>\n"
      + "The relation-name can be modified by using the regular expression "
      + "to replace the matching sub-string with a specified replacement "
      + "string. In order to get rid of the string that the Weka filters "
      + "add to the end of the relation name, just use '.*-weka' as the "
      + "regular expression to find.\n"
      + "The suffix determines the type of file to load, i.e., one is "
      + "not restricted to ARFF files. As long as Weka recognizes the "
      + "extension specified in the suffix, the data will be loaded with "
      + "one of Weka's converters.";
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration