/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* CheckEstimator.java
* Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.estimators;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TestInstances;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
/**
* Class for examining the capabilities and finding problems with estimators. If
* you implement an estimator using the WEKA libraries, you should run the checks
* on it to ensure robustness and correct operation. Passing all the tests of
* this object does not mean bugs in the estimator don't exist, but this will
* help find some common ones.
*
*
* Typical usage:
*
* java weka.estimators.CheckEstimator -W estimator_name
* estimator_options
*
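* The same checks can also be run programmatically; a minimal sketch (using
* the NormalEstimator that also appears in the option examples below) would
* be:
*
* CheckEstimator check = new CheckEstimator();
* check.setEstimator(new weka.estimators.NormalEstimator(0.000001));
* check.doTests();
*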
*
* This class uses code from the CheckEstimatorClass. ATTENTION! Current
* estimators can only 1. split on a nominal class attribute, 2. build estimators
* for nominal and numeric attributes, and 3. build estimators independently of
* the class type. The functionality to test on other class and attribute types
* is left in large parts in the code.
*
* CheckEstimator reports on the following:
*
* Estimator abilities
*
* Possible command line options to the estimator
* Whether the estimator can predict nominal, numeric, string, date or
* relational class attributes. Warnings will be displayed if performance is
* worse than ZeroR
* Whether the estimator can be trained incrementally
* Whether the estimator can build estimates for numeric attributes
* Whether the estimator can handle nominal attributes
* Whether the estimator can handle string attributes
* Whether the estimator can handle date attributes
* Whether the estimator can handle relational attributes
* Whether the estimator builds estimates for multi-instance data
* Whether the estimator can handle missing attribute values
* Whether the estimator can handle missing class values
* Whether a nominal estimator only handles 2 class problems
* Whether the estimator can handle instance weights
*
*
* Correct functioning
*
* Correct initialisation during addValues (i.e. no result changes when
* addValues called repeatedly)
* Whether incremental training produces the same results as during
* non-incremental training (which may or may not be OK)
* Whether the estimator alters the data passed to it (number of instances,
* instance order, instance weights, etc)
*
*
* Degenerate cases
*
* building estimator with zero training instances
* all but one attribute value missing
* all attribute values missing
* all but one class values missing
* all class values missing
*
*
*
* Running CheckEstimator with the debug option set will output the training and
* test datasets for any failed tests.
*
*
* The weka.estimators.AbstractEstimatorTest uses this class to
* test all the estimators. Any changes here have to be checked in that
* abstract test class, too.
*
*
* Valid options are:
*
*
*
* -D
* Turn on debugging output.
*
*
*
* -S
* Silent mode - prints nothing to stdout.
*
*
*
* -N <num>
* The number of instances in the datasets (default 100).
*
*
*
* -W
* Full name of the estimator analysed.
* eg: weka.estimators.NormalEstimator
*
*
*
* Options specific to estimator weka.estimators.NormalEstimator:
*
*
*
* -D
* If set, estimator is run in debug mode and
* may output additional info to the console
*
*
*
*
* Options after -- are passed to the designated estimator.
*
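* For example, a hypothetical invocation that checks NormalEstimator on
* datasets of 200 instances and passes the estimator's own -D flag through
* could look like:
*
* java weka.estimators.CheckEstimator -N 200 -W weka.estimators.NormalEstimator -- -D
*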
*
* @author Len Trigg ([email protected] )
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 11247 $
* @see TestInstances
*/
public class CheckEstimator implements OptionHandler, RevisionHandler {
/*
* Note about test methods: - methods return array of booleans - first index:
* success or not - second index: acceptable or not (e.g., Exception is OK) -
* in case the performance is worse than that of ZeroR both indices are true
*
* FracPete (fracpete at waikato dot ac dot nz)
*/
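// For example (illustrative only): a check that fails but throws an exception
// whose message matches one of the accepted strings returns {false, true},
// while a check that passes returns {true, false}.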
/**
* a class for postprocessing the test-data
*/
public class PostProcessor implements RevisionHandler {
/**
* Provides a hook for derived classes to further modify the data.
* Currently, the data is just passed through.
*
* @param data the data to process
* @return the processed data
*/
protected Instances process(Instances data) {
return data;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11247 $");
}
}
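// A hypothetical subclass (sketch only, assuming it is declared as a further
// inner class of CheckEstimator) could, for instance, shuffle the generated
// data before the checks are run:
//
//   public class ShufflingPostProcessor extends PostProcessor {
//     @Override
//     protected Instances process(Instances data) {
//       Instances result = new Instances(data); // work on a copy
//       result.randomize(new Random(1));
//       return result;
//     }
//   }
//
// It would be installed via setPostProcessor(new ShufflingPostProcessor()).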
/** The estimator to be examined */
protected Estimator m_Estimator = new weka.estimators.NormalEstimator(
0.000001);
/** The options to be passed to the base estimator. */
protected String[] m_EstimatorOptions;
/** The results of the analysis as a string */
protected String m_AnalysisResults;
/** Debugging mode, gives extra output if true */
protected boolean m_Debug = false;
/** Silent mode, for no output at all to stdout */
protected boolean m_Silent = false;
/** The number of instances in the datasets */
protected int m_NumInstances = 100;
/** for post-processing the data even further */
protected PostProcessor m_PostProcessor = null;
/** whether classpath problems occurred */
protected boolean m_ClasspathProblems = false;
/**
* Class that contains info about the attribute types the estimator can
* estimate. Estimators work on one attribute only.
*/
public static class AttrTypes implements RevisionHandler {
boolean nominal = false;
boolean numeric = false;
boolean string = false;
boolean date = false;
boolean relational = false;
AttrTypes() {
}
AttrTypes(AttrTypes newTypes) {
nominal = newTypes.nominal;
numeric = newTypes.numeric;
string = newTypes.string;
date = newTypes.date;
relational = newTypes.relational;
}
AttrTypes(int type) {
if (type == Attribute.NOMINAL) {
nominal = true;
}
if (type == Attribute.NUMERIC) {
numeric = true;
}
if (type == Attribute.STRING) {
string = true;
}
if (type == Attribute.DATE) {
date = true;
}
if (type == Attribute.RELATIONAL) {
relational = true;
}
}
int getSetType() throws Exception {
int sum = 0;
int type = -1;
if (nominal) {
sum++;
type = Attribute.NOMINAL;
}
if (numeric) {
sum++;
type = Attribute.NUMERIC;
}
if (string) {
sum++;
type = Attribute.STRING;
}
if (date) {
sum++;
type = Attribute.DATE;
}
if (relational) {
sum++;
type = Attribute.RELATIONAL;
}
if (sum > 1) {
throw new Exception("Expected to have only one type set used wrongly.");
}
if (type < 0) {
throw new Exception("No type set.");
}
return type;
}
boolean oneIsSet() {
return (nominal || numeric || string || date || relational);
}
public Vector<Integer> getVectorOfAttrTypes() {
Vector<Integer> attrs = new Vector<Integer>();
if (nominal) {
attrs.add(new Integer(Attribute.NOMINAL));
}
if (numeric) {
attrs.add(new Integer(Attribute.NUMERIC));
}
if (string) {
attrs.add(new Integer(Attribute.STRING));
}
if (date) {
attrs.add(new Integer(Attribute.DATE));
}
if (relational) {
attrs.add(new Integer(Attribute.RELATIONAL));
}
return attrs;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11247 $");
}
}
/**
* Public class that contains info about the estimator type (incremental,
* weighted, supervised). Estimators work on one attribute only.
*/
public static class EstTypes implements RevisionHandler {
boolean incremental = false;
boolean weighted = false;
boolean supervised = false;
/**
* Constructor
*/
public EstTypes() {
}
/**
* Constructor
*/
public EstTypes(boolean i, boolean w, boolean s) {
incremental = i;
weighted = w;
supervised = s;
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11247 $");
}
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration<Option> listOptions() {
Vector<Option> newVector = new Vector<Option>(4);
newVector
.addElement(new Option("\tTurn on debugging output.", "D", 0, "-D"));
newVector.addElement(new Option(
"\tSilent mode - prints nothing to stdout.", "S", 0, "-S"));
newVector.addElement(new Option(
"\tThe number of instances in the datasets (default 100).", "N", 1,
"-N <num>"));
newVector.addElement(new Option("\tFull name of the estimator analysed.\n"
+ "\teg: weka.estimators.NormalEstimator", "W", 1, "-W"));
if ((m_Estimator != null) && (m_Estimator instanceof OptionHandler)) {
newVector.addElement(new Option("", "", 0,
"\nOptions specific to estimator " + m_Estimator.getClass().getName()
+ ":"));
newVector.addAll(Collections.list(((OptionHandler) m_Estimator)
.listOptions()));
}
return newVector.elements();
}
/**
* Parses a given list of options.
*
* Valid options are:
*
*
*
* -D
* Turn on debugging output.
*
*
*
* -S
* Silent mode - prints nothing to stdout.
*
*
*
* -N <num>
* The number of instances in the datasets (default 100).
*
*
*
* -W
* Full name of the estimator analysed.
* eg: weka.estimators.NormalEstimator
*
*
*
* Options specific to estimator weka.estimators.NormalEstimator:
*
*
*
* -D
* If set, estimator is run in debug mode and
* may output additional info to the console
*
*
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
@Override
public void setOptions(String[] options) throws Exception {
String tmpStr;
setDebug(Utils.getFlag('D', options));
setSilent(Utils.getFlag('S', options));
tmpStr = Utils.getOption('N', options);
if (tmpStr.length() != 0) {
setNumInstances(Integer.parseInt(tmpStr));
} else {
setNumInstances(100);
}
tmpStr = Utils.getOption('W', options);
if (tmpStr.length() == 0) {
throw new Exception("A estimator must be specified with the -W option.");
}
setEstimator(Estimator.forName(tmpStr, Utils.partitionOptions(options)));
}
/**
* Gets the current settings of the CheckEstimator.
*
* @return an array of strings suitable for passing to setOptions
*/
@Override
public String[] getOptions() {
Vector<String> result = new Vector<String>();
if (getDebug()) {
result.add("-D");
}
if (getSilent()) {
result.add("-S");
}
result.add("-N");
result.add("" + getNumInstances());
if (getEstimator() != null) {
result.add("-W");
result.add(getEstimator().getClass().getName());
}
if ((m_Estimator != null) && (m_Estimator instanceof OptionHandler)) {
String[] options = ((OptionHandler) m_Estimator).getOptions();
if (options.length > 0) {
result.add("--");
Collections.addAll(result, options);
}
}
return result.toArray(new String[result.size()]);
}
/**
* sets the PostProcessor to use
*
* @param value the new PostProcessor
* @see #m_PostProcessor
*/
public void setPostProcessor(PostProcessor value) {
m_PostProcessor = value;
}
/**
* returns the current PostProcessor, can be null
*
* @return the current PostProcessor
*/
public PostProcessor getPostProcessor() {
return m_PostProcessor;
}
/**
* returns TRUE if the estimator returned a "not in classpath" Exception
*
* @return true if CLASSPATH problems occurred
*/
public boolean hasClasspathProblems() {
return m_ClasspathProblems;
}
/**
* Begin the tests, reporting results to System.out
*/
public void doTests() {
if (getEstimator() == null) {
println("\n=== No estimator set ===");
return;
}
println("\n=== Check on Estimator: " + getEstimator().getClass().getName()
+ " ===\n");
m_ClasspathProblems = false;
// Start tests with test for options
canTakeOptions();
// test what type of estimator it is
EstTypes estTypes = new EstTypes();
estTypes.incremental = incrementalEstimator()[0];
estTypes.weighted = weightedInstancesHandler()[0];
estTypes.supervised = supervisedEstimator()[0];
// in none of the estimators yet does the functionality depend on the class
// type; since this could change, the basic structure taken from
// CheckClassifier is kept here
int classType = Attribute.NOMINAL;
AttrTypes attrTypes = testsPerClassType(classType, estTypes);
// only nominal class can be split up so far
canSplitUpClass(attrTypes, classType);
}
/**
* Set debugging mode
*
* @param debug true if debug output should be printed
*/
public void setDebug(boolean debug) {
m_Debug = debug;
// disable silent mode, if necessary
if (getDebug()) {
setSilent(false);
}
}
/**
* Get whether debugging is turned on
*
* @return true if debugging output is on
*/
public boolean getDebug() {
return m_Debug;
}
/**
* Set silent mode, i.e., no output at all to stdout
*
* @param value whether silent mode is active or not
*/
public void setSilent(boolean value) {
m_Silent = value;
}
/**
* Get whether silent mode is turned on
*
* @return true if silent mode is on
*/
public boolean getSilent() {
return m_Silent;
}
/**
* Sets the number of instances to use in the datasets (some estimators might
* require more instances).
*
* @param value the number of instances to use
*/
public void setNumInstances(int value) {
m_NumInstances = value;
}
/**
* Gets the current number of instances to use for the datasets.
*
* @return the number of instances
*/
public int getNumInstances() {
return m_NumInstances;
}
/**
* Set the estimator to be checked.
*
* @param newEstimator the Estimator to use.
*/
public void setEstimator(Estimator newEstimator) {
m_Estimator = newEstimator;
}
/**
* Get the estimator being checked.
*
* @return the estimator being checked
*/
public Estimator getEstimator() {
return m_Estimator;
}
/**
* prints the given message to stdout, if not silent mode
*
* @param msg the text to print to stdout
*/
protected void print(Object msg) {
if (!getSilent()) {
System.out.print(msg);
}
}
/**
* prints the given message (+ LF) to stdout, if not silent mode
*
* @param msg the message to println to stdout
*/
protected void println(Object msg) {
print(msg + "\n");
}
/**
* prints a LF to stdout, if not silent mode
*/
protected void println() {
print("\n");
}
/**
* Run a battery of tests for a given class attribute type
*
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param estTypes types the estimator is, like incremental, weighted,
* supervised etc
* @return attribute types estimator can work with
*/
protected AttrTypes testsPerClassType(int classType, EstTypes estTypes) {
// in none of the estimators yet does the estimation depend on the class
// type; since this could change, the basic structure taken from
// CheckClassifier is kept here
// test A: simple test - if can estimate
AttrTypes attrTypes = new AttrTypes();
AttrTypes at = new AttrTypes(Attribute.NOMINAL);
attrTypes.nominal = canEstimate(at, estTypes.supervised, classType)[0];
at = new AttrTypes(Attribute.NUMERIC);
attrTypes.numeric = canEstimate(at, estTypes.supervised, classType)[0];
attrTypes.string = false;
attrTypes.date = false;
attrTypes.relational = false;
// if (!multiInstance)
// PRel = canEstimate(false, false, false, false, true, classType)[0];
// else
// PRel = false;
// one of the attribute types succeeded
if (attrTypes.oneIsSet()) {
Vector<Integer> attributesSet = attrTypes.getVectorOfAttrTypes();
// make tests for each attribute
for (int i = 0; i < attributesSet.size(); i++) {
AttrTypes workAttrTypes = new AttrTypes(attributesSet.elementAt(i)
.intValue());
// test B: weights change estimate or not
if (estTypes.weighted) {
instanceWeights(workAttrTypes, classType);
}
if (classType == Attribute.NOMINAL) {
int numClasses = 4;
canHandleNClasses(workAttrTypes, numClasses);
}
// tests with class not the last attribute and the attribute not the
// first
// if (!multiInstance) {
int numAtt = 4;
canHandleClassAsNthAttribute(workAttrTypes, numAtt, 0, classType, 1);
// TODO: canHandleAttrAsNthAttribute(workAttrTypes, numAtt, 2,
// classType);
// }
canHandleZeroTraining(workAttrTypes, classType);
boolean handleMissingAttributes = canHandleMissing(workAttrTypes,
classType, true, false, 20)[0];
if (handleMissingAttributes) {
canHandleMissing(workAttrTypes, classType, true, false, 100);
}
boolean handleMissingClass = canHandleMissing(workAttrTypes, classType,
false, true, 20)[0];
if (handleMissingClass) {
canHandleMissing(workAttrTypes, classType, false, true, 100);
}
correctBuildInitialisation(workAttrTypes, classType);
datasetIntegrity(workAttrTypes, classType, handleMissingAttributes,
handleMissingClass);
if (estTypes.incremental) {
incrementingEquality(workAttrTypes, classType);
}
}
}
return attrTypes;
}
/**
* Checks whether the scheme can take command line options.
*
* @return index 0 is true if the estimator can take options
*/
protected boolean[] canTakeOptions() {
boolean[] result = new boolean[2];
print("options...");
if (m_Estimator instanceof OptionHandler) {
println("yes");
if (m_Debug) {
println("\n=== Full report ===");
Enumeration<Option> enu = ((OptionHandler) m_Estimator).listOptions();
while (enu.hasMoreElements()) {
Option option = enu.nextElement();
print(option.synopsis() + "\n" + option.description() + "\n");
}
println("\n");
}
result[0] = true;
} else {
println("no");
result[0] = false;
}
return result;
}
/**
* Checks whether the scheme can build models incrementally.
*
* @return index 0 is true if the estimator can train incrementally
*/
protected boolean[] incrementalEstimator() {
boolean[] result = new boolean[2];
print("incremental estimator...");
if (m_Estimator instanceof IncrementalEstimator) {
println("yes");
result[0] = true;
} else {
println("no");
result[0] = false;
}
return result;
}
/**
* Checks whether the scheme says it can handle instance weights.
*
* @return true if the estimator handles instance weights
*/
protected boolean[] weightedInstancesHandler() {
boolean[] result = new boolean[2];
print("weighted instances estimator...");
if (m_Estimator instanceof WeightedInstancesHandler) {
println("yes");
result[0] = true;
} else {
println("no");
result[0] = false;
}
return result;
}
/**
* Checks whether the estimator is supervised.
*
* @return index 0 is true if the estimator is supervised
*/
protected boolean[] supervisedEstimator() {
boolean[] result = new boolean[2];
result[0] = false;
return result;
}
/**
* Checks basic estimation of one attribute of the scheme, for simple
* non-troublesome datasets.
*
* @param attrTypes the types the estimator can work with
* @param classType the class type (NOMINAL, NUMERIC, etc.)
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] canEstimate(AttrTypes attrTypes, boolean supervised,
int classType) {
// supervised is ignored, no supervised estimators used yet
print("basic estimation");
printAttributeSummary(attrTypes, classType);
print("...");
ArrayList<String> accepts = new ArrayList<String>();
accepts.add("nominal");
accepts.add("numeric");
accepts.add("string");
accepts.add("date");
accepts.add("relational");
accepts.add("not in classpath");
int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
int numAtts = 1, attrIndex = 0;
return runBasicTest(attrTypes, numAtts, attrIndex, classType, missingLevel,
attributeMissing, classMissing, numTrain, numTest, numClasses, accepts);
}
/**
* Checks basic estimation of one attribute of the scheme, for simple
* non-troublesome datasets.
*
* @param attrTypes the types the estimator can work with
* @param classType the class type (NOMINAL, NUMERIC, etc.)
*/
protected void canSplitUpClass(AttrTypes attrTypes, int classType) {
if (attrTypes.nominal) {
canSplitUpClass(Attribute.NOMINAL, classType);
}
if (attrTypes.numeric) {
canSplitUpClass(Attribute.NUMERIC, classType);
}
}
/**
* Checks basic estimation of one attribute of the scheme, for simple
* non-troublesome datasets.
*
* @param attrType the attribute type (NUMERIC, NOMINAL, etc.)
* @param classType the class type (NOMINAL, NUMERIC, etc.)
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] canSplitUpClass(int attrType, int classType) {
boolean[] result = new boolean[2];
ArrayList<String> accepts = new ArrayList<String>();
accepts.add("not in classpath");
// supervised is ignored, no supervised estimators used yet
print("split per class type ");
printAttributeSummary(attrType, Attribute.NOMINAL);
print("...");
int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2;
int numAtts = 3, attrIndex = 0, classIndex = 1;
Instances train = null;
Vector<Double> test;
Estimator estimator = null;
boolean built = false;
try {
AttrTypes at = new AttrTypes(attrType);
train = makeTestDataset(42, numTrain, numAtts, at, numClasses, classType,
classIndex);
// prepare training data set and test value list
test = makeTestValueList(24, numTest, train, attrIndex, attrType);
estimator = Estimator.makeCopies(getEstimator(), 1)[0];
} catch (Exception ex) {
ex.printStackTrace();
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
estimator.addValues(train, attrIndex, classType, classIndex);
built = true;
testWithTestValues(estimator, test);
println("yes");
result[0] = true;
} catch (Exception ex) {
boolean acceptable = false;
String msg;
if (ex.getMessage() == null) {
msg = "";
} else {
msg = ex.getMessage().toLowerCase();
}
if (msg.indexOf("not in classpath") > -1) {
m_ClasspathProblems = true;
}
for (int i = 0; i < accepts.size(); i++) {
if (msg.indexOf(accepts.get(i)) >= 0) {
acceptable = true;
}
}
println("no" + (acceptable ? " (OK error message)" : ""));
result[1] = acceptable;
if (m_Debug) {
println("\n=== Full Report ===");
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
println(": " + ex.getMessage() + "\n");
if (!acceptable) {
if (accepts.size() > 0) {
print("Error message doesn't mention ");
for (int i = 0; i < accepts.size(); i++) {
if (i != 0) {
print(" or ");
}
print('"' + accepts.get(i) + '"');
}
}
println("here are the datasets:\n");
println("=== Train Dataset ===\n" + train.toString() + "\n");
println("=== Test Dataset ===\n" + test.toString() + "\n\n");
}
}
}
return result;
}
/**
* Checks whether nominal schemes can handle more than two classes. If a
* scheme is only designed for two-class problems it should throw an
* appropriate exception for multi-class problems.
*
* @param attrTypes attribute types the estimator accepts
* @param numClasses the number of classes to test
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] canHandleNClasses(AttrTypes attrTypes, int numClasses) {
print("more than two class problems");
printAttributeSummary(attrTypes, Attribute.NOMINAL);
print("...");
ArrayList<String> accepts = new ArrayList<String>();
accepts.add("number");
accepts.add("class");
int numTrain = getNumInstances(), numTest = getNumInstances(), missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
int numAttr = 1, attrIndex = 0;
return runBasicTest(attrTypes, numAttr, attrIndex, Attribute.NOMINAL,
missingLevel, attributeMissing, classMissing, numTrain, numTest,
numClasses, accepts);
}
/**
* Checks whether the scheme can handle class attributes as Nth attribute.
*
* @param attrTypes the attribute types the estimator accepts
* @param numAtts of attributes
* @param attrIndex the index of the attribute
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param classIndex the index of the class attribute (0-based, -1 means last
* attribute)
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
* @see TestInstances#CLASS_IS_LAST
*/
protected boolean[] canHandleClassAsNthAttribute(AttrTypes attrTypes,
int numAtts, int attrIndex, int classType, int classIndex) {
if (classIndex == TestInstances.CLASS_IS_LAST) {
print("class attribute as last attribute");
} else {
print("class attribute as " + (classIndex + 1) + ". attribute");
}
printAttributeSummary(attrTypes, classType);
print("...");
ArrayList<String> accepts = new ArrayList<String>();
int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
return runBasicTest(attrTypes, numAtts, attrIndex, classType, classIndex,
missingLevel, attributeMissing, classMissing, numTrain, numTest,
numClasses, accepts);
}
/**
* Checks whether the scheme can handle zero training instances.
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] canHandleZeroTraining(AttrTypes attrTypes, int classType) {
print("handle zero training instances");
printAttributeSummary(attrTypes, classType);
print("...");
ArrayList<String> accepts = new ArrayList<String>();
accepts.add("train");
accepts.add("value");
int numTrain = 0, numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
int numAtts = 1;
int attrIndex = 0;
return runBasicTest(attrTypes, numAtts, attrIndex, classType, missingLevel,
attributeMissing, classMissing, numTrain, numTest, numClasses, accepts);
}
/**
* Checks whether the scheme correctly initialises models when buildEstimator
* is called. This test calls buildEstimator with one training dataset and
* records performance on a test set. buildEstimator is then called on a
* training set with different structure, and then again with the original
* training set. The performance on the test set is compared with the original
* results and any performance difference noted as incorrect build
* initialisation.
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @return index 0 is true if the test was passed, index 1 is true if the
* scheme performs worse than ZeroR, but without error (index 0 is
* false)
*/
protected boolean[] correctBuildInitialisation(AttrTypes attrTypes,
int classType) {
boolean[] result = new boolean[2];
print("correct initialisation during buildEstimator");
printAttributeSummary(attrTypes, classType);
print("...");
int numTrain = getNumInstances();
getNumInstances();
int numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
Instances train1 = null;
Instances train2 = null;
Estimator estimator = null;
Estimator estimator1 = null;
boolean built = false;
int stage = 0;
int attrIndex1 = 1;
int attrIndex2 = 2;
try {
// Make two sets of train/test splits with different
// numbers of attributes
train1 = makeTestDataset(42, numTrain, 2, attrTypes, numClasses,
classType);
train2 = makeTestDataset(84, numTrain, 3, attrTypes, numClasses,
classType);
if (missingLevel > 0) {
addMissing(train1, missingLevel, attributeMissing, classMissing,
attrIndex1);
addMissing(train2, missingLevel, attributeMissing, classMissing,
attrIndex2);
}
estimator = Estimator.makeCopies(getEstimator(), 1)[0];
} catch (Exception ex) {
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
// TESTING??
stage = 0;
estimator.addValues(train1, attrIndex1);
built = true;
estimator1 = Estimator.makeCopies(getEstimator(), 1)[0];
stage = 1;
built = false;
estimator.addValues(train2, attrIndex2);
built = true;
stage = 2;
built = false;
estimator.addValues(train1, attrIndex1);
built = true;
stage = 3;
if (!estimator.equals(estimator1)) {
if (m_Debug) {
println("\n=== Full report ===\n" + "\nFirst build estimator\n"
+ estimator.toString() + "\n\n");
println("\nSecond build estimator\n" + estimator.toString() + "\n\n");
}
throw new Exception("Results differ between buildEstimator calls");
}
println("yes");
result[0] = true;
} catch (Exception ex) {
String msg = ex.getMessage().toLowerCase();
if (msg.indexOf("worse than zeror") >= 0) {
println("warning: performs worse than ZeroR");
result[0] = true;
result[1] = true;
} else {
println("no");
result[0] = false;
}
if (m_Debug) {
println("\n=== Full Report ===");
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
switch (stage) {
case 0:
print(" of dataset 1");
break;
case 1:
print(" of dataset 2");
break;
case 2:
print(" of dataset 1 (2nd build)");
break;
case 3:
print(", comparing results from builds of dataset 1");
break;
}
println(": " + ex.getMessage() + "\n");
println("here are the datasets:\n");
println("=== Train1 Dataset ===\n" + train1.toString() + "\n");
println("=== Train2 Dataset ===\n" + train2.toString() + "\n");
}
}
return result;
}
/**
* Checks basic missing value handling of the scheme. If the missing values
* cause an exception to be thrown by the scheme, this will be recorded.
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param attributeMissing true if the missing values may be in the attributes
* @param classMissing true if the missing values may be in the class
* @param missingLevel the percentage of missing values
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] canHandleMissing(AttrTypes attrTypes, int classType,
boolean attributeMissing, boolean classMissing, int missingLevel) {
if (missingLevel == 100) {
print("100% ");
}
print("missing");
if (attributeMissing) {
print(" attribute");
if (classMissing) {
print(" and");
}
}
if (classMissing) {
print(" class");
}
print(" values");
printAttributeSummary(attrTypes, classType);
print("...");
ArrayList<String> accepts = new ArrayList<String>();
accepts.add("missing");
accepts.add("value");
accepts.add("train");
int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2;
int numAtts = 1, attrIndex = 0;
return runBasicTest(attrTypes, numAtts, attrIndex, classType, missingLevel,
attributeMissing, classMissing, numTrain, numTest, numClasses, accepts);
}
/**
* Checks whether an incremental scheme produces the same model when trained
* incrementally as when batch trained. The model itself cannot be compared,
* so we compare the evaluation on test data for both models. It is possible
* to get a false positive on this test (likelihood depends on the estimator).
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @return index 0 is true if the test was passed
*/
protected boolean[] incrementingEquality(AttrTypes attrTypes, int classType) {
print("incremental training produces the same results"
+ " as batch training");
printAttributeSummary(attrTypes, classType);
print("...");
int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
boolean[] result = new boolean[2];
Instances train = null;
Estimator[] estimators = null;
boolean built = false;
int attrIndex = 0;
Vector<Double> test;
try {
train = makeTestDataset(42, numTrain, 1, attrTypes, numClasses, classType);
// prepare training data set and test value list
test = makeTestValueList(24, numTest, train, attrIndex,
attrTypes.getSetType());
if (missingLevel > 0) {
addMissing(train, missingLevel, attributeMissing, classMissing,
attrIndex);
}
estimators = Estimator.makeCopies(getEstimator(), 2);
estimators[0].addValues(train, attrIndex);
} catch (Exception ex) {
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
for (int i = 0; i < train.numInstances(); i++) {
((IncrementalEstimator) estimators[1]).addValue(train.instance(i)
.value(attrIndex), 1.0);
}
built = true;
if (!estimators[0].equals(estimators[1])) {
println("no");
result[0] = false;
if (m_Debug) {
println("\n=== Full Report ===");
println("Results differ between batch and "
+ "incrementally built models.\n"
+ "Depending on the estimator, this may be OK");
println("Here are the results:\n");
println("batch built results\n" + estimators[0].toString());
println("incrementally built results\n" + estimators[1].toString());
println("Here are the datasets:\n");
println("=== Train Dataset ===\n" + train.toString() + "\n");
println("=== Test Dataset ===\n" + test.toString() + "\n\n");
}
} else {
println("yes");
result[0] = true;
}
} catch (Exception ex) {
result[0] = false;
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
println(": " + ex.getMessage() + "\n");
}
return result;
}
/**
* Checks whether the estimator can handle instance weights. This test
* compares the estimator performance on two datasets that are identical
* except for the training weights. If the results change, then the estimator
* must be using the weights. It may be possible to get a false positive from
* this test if the weight changes aren't significant enough to induce a
* change in estimator performance (but the weights are chosen to minimize the
* likelihood of this).
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @return index 0 true if the test was passed
*/
protected boolean[] instanceWeights(AttrTypes attrTypes, int classType) {
print("estimator uses instance weights");
printAttributeSummary(attrTypes, classType);
print("...");
int numTrain = 2 * getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
boolean attributeMissing = false, classMissing = false;
boolean[] result = new boolean[2];
Instances train = null;
Vector<Double> test = null;
Estimator[] estimators = null;
Vector<Double> resultProbsO = null;
Vector<Double> resultProbsW = null;
boolean built = false;
boolean evalFail = false;
int attrIndex = 0;
try {
train = makeTestDataset(42, numTrain, 1, attrTypes, numClasses, classType);
// prepare training data set and test value list
test = makeTestValueList(24, numTest, train, attrIndex,
attrTypes.getSetType());
if (missingLevel > 0) {
addMissing(train, missingLevel, attributeMissing, classMissing,
attrIndex);
}
estimators = Estimator.makeCopies(getEstimator(), 2);
estimators[0].addValues(train, attrIndex);
resultProbsO = testWithTestValues(estimators[0], test);
} catch (Exception ex) {
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
// Now modify instance weights and re-built
for (int i = 0; i < train.numInstances(); i++) {
train.instance(i).setWeight(0);
}
Random random = new Random(1);
for (int i = 0; i < train.numInstances() / 2; i++) {
int inst = random.nextInt(train.numInstances());
int weight = random.nextInt(10) + 1;
train.instance(inst).setWeight(weight);
}
estimators[1].addValues(train, attrIndex);
resultProbsW = testWithTestValues(estimators[1], test);
built = true;
if (resultProbsO.equals(resultProbsW)) {
// println("no");
evalFail = true;
throw new Exception("evalFail");
}
println("yes");
result[0] = true;
} catch (Exception ex) {
println("no");
result[0] = false;
if (m_Debug) {
println("\n=== Full Report ===");
if (evalFail) {
println("Results don't differ between non-weighted and "
+ "weighted instance models.");
println("Here are the results:\n");
println(probsToString(resultProbsO));
} else {
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
println(": " + ex.getMessage() + "\n");
}
println("Here are the datasets:\n");
println("=== Train Dataset ===\n" + train.toString() + "\n");
println("=== Train Weights ===\n");
for (int i = 0; i < train.numInstances(); i++) {
println(" " + (i + 1) + " " + train.instance(i).weight());
}
println("=== Test Dataset ===\n" + test.toString() + "\n\n");
println("(test weights all 1.0\n");
}
}
return result;
}
/**
* Checks whether the scheme alters the training dataset during training. If
* the scheme needs to modify the training data it should take a copy of the
* training data. Currently checks for changes to header structure, number of
* instances, order of instances, instance weights.
*
* @param attrTypes attribute types that can be estimated
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param attributeMissing true if we know the estimator can handle (at least)
* moderate missing attribute values
* @param classMissing true if we know the estimator can handle (at least)
* moderate missing class values
* @return index 0 is true if the test was passed
*/
protected boolean[] datasetIntegrity(AttrTypes attrTypes, int classType,
boolean attributeMissing, boolean classMissing) {
Estimator estimator = null;
print("estimator doesn't alter original datasets");
printAttributeSummary(attrTypes, classType);
print("...");
int numTrain = getNumInstances();
getNumInstances();
int numClasses = 2, missingLevel = 100;
boolean[] result = new boolean[2];
Instances train = null;
boolean built = false;
try {
train = makeTestDataset(42, numTrain, 1, attrTypes, numClasses, classType);
int attrIndex = 0;
if (missingLevel > 0) {
addMissing(train, missingLevel, attributeMissing, classMissing,
attrIndex);
}
estimator = Estimator.makeCopies(getEstimator(), 1)[0];
} catch (Exception ex) {
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
Instances trainCopy = new Instances(train);
int attrIndex = 0;
estimator.addValues(trainCopy, attrIndex);
compareDatasets(train, trainCopy);
built = true;
println("yes");
result[0] = true;
} catch (Exception ex) {
println("no");
result[0] = false;
if (m_Debug) {
println("\n=== Full Report ===");
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
println(": " + ex.getMessage() + "\n");
println("Here are the datasets:\n");
println("=== Train Dataset ===\n" + train.toString() + "\n");
}
}
return result;
}
/**
* Runs a test on the datasets with the given characteristics.
*
* @param attrTypes attribute types that can be estimated
* @param numAtts number of attributes
* @param attrIndex attribute index
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param missingLevel the percentage of missing values
* @param attributeMissing true if the missing values may be in the attributes
* @param classMissing true if the missing values may be in the class
* @param numTrain the number of instances in the training set
* @param numTest the number of instances in the test set
* @param numClasses the number of classes
* @param accepts the acceptable string in an exception
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] runBasicTest(AttrTypes attrTypes, int numAtts,
int attrIndex, int classType, int missingLevel, boolean attributeMissing,
boolean classMissing, int numTrain, int numTest, int numClasses,
ArrayList<String> accepts) {
return runBasicTest(attrTypes, numAtts, attrIndex, classType,
TestInstances.CLASS_IS_LAST, missingLevel, attributeMissing,
classMissing, numTrain, numTest, numClasses, accepts);
}
/**
* Runs a test on the datasets with the given characteristics.
*
* @param attrTypes attribute types that can be estimated
* @param numAtts number of attributes
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param classIndex the attribute index of the class
* @param missingLevel the percentage of missing values
* @param attributeMissing true if the missing values may be in the attributes
* @param classMissing true if the missing values may be in the class
* @param numTrain the number of instances in the training set
* @param numTest the number of instances in the test set
* @param numClasses the number of classes
* @param accepts the acceptable string in an exception
* @return index 0 is true if the test was passed, index 1 is true if test was
* acceptable
*/
protected boolean[] runBasicTest(AttrTypes attrTypes, int numAtts,
int attrIndex, int classType, int classIndex, int missingLevel,
boolean attributeMissing, boolean classMissing, int numTrain, int numTest,
int numClasses, ArrayList<String> accepts) {
boolean[] result = new boolean[2];
Instances train = null;
Vector<Double> test = null;
Estimator estimator = null;
boolean built = false;
try {
train = makeTestDataset(42, numTrain, numAtts, attrTypes, numClasses,
classType, classIndex);
// prepare training data set and test value list
if (numTrain > 0) {
test = makeTestValueList(24, numTest, train, attrIndex,
attrTypes.getSetType());
} else {
double min = -10.0;
double max = 8.0;
test = makeTestValueList(24, numTest, min, max, attrTypes.getSetType());
}
if (missingLevel > 0) {
addMissing(train, missingLevel, attributeMissing, classMissing,
attrIndex);
}
estimator = Estimator.makeCopies(getEstimator(), 1)[0];
} catch (Exception ex) {
ex.printStackTrace();
throw new Error("Error setting up for tests: " + ex.getMessage());
}
try {
estimator.addValues(train, attrIndex);
built = true;
testWithTestValues(estimator, test);
println("yes");
result[0] = true;
} catch (Exception ex) {
boolean acceptable = false;
String msg;
if (ex.getMessage() == null) {
msg = "";
} else {
msg = ex.getMessage().toLowerCase();
}
if (msg.indexOf("not in classpath") > -1) {
m_ClasspathProblems = true;
}
for (int i = 0; i < accepts.size(); i++) {
if (msg.indexOf(accepts.get(i)) >= 0) {
acceptable = true;
}
}
println("no" + (acceptable ? " (OK error message)" : ""));
result[1] = acceptable;
if (m_Debug) {
println("\n=== Full Report ===");
print("Problem during");
if (built) {
print(" testing");
} else {
print(" training");
}
println(": " + ex.getMessage() + "\n");
if (!acceptable) {
if (accepts.size() > 0) {
print("Error message doesn't mention ");
for (int i = 0; i < accepts.size(); i++) {
if (i != 0) {
print(" or ");
}
print('"' + accepts.get(i) + '"');
}
}
println("here are the datasets:\n");
println("=== Train Dataset ===\n" + train.toString() + "\n");
println("=== Test Dataset ===\n" + test.toString() + "\n\n");
}
}
}
return result;
}
/**
* Compare two datasets to see if they differ.
*
* @param data1 one set of instances
* @param data2 the other set of instances
* @throws Exception if the datasets differ
*/
protected void compareDatasets(Instances data1, Instances data2)
throws Exception {
if (!data2.equalHeaders(data1)) {
throw new Exception("header has been modified\n"
+ data2.equalHeadersMsg(data1));
}
if (!(data2.numInstances() == data1.numInstances())) {
throw new Exception("number of instances has changed");
}
for (int i = 0; i < data2.numInstances(); i++) {
Instance orig = data1.instance(i);
Instance copy = data2.instance(i);
for (int j = 0; j < orig.numAttributes(); j++) {
if (orig.isMissing(j)) {
if (!copy.isMissing(j)) {
throw new Exception("instances have changed");
}
} else if (orig.value(j) != copy.value(j)) {
throw new Exception("instances have changed");
}
if (orig.weight() != copy.weight()) {
throw new Exception("instance weights have changed");
}
}
}
}
/**
* Add missing values to a dataset.
*
* @param data the instances to add missing values to
* @param level the level of missing values to add (if positive, this is the
* probability that a value will be set to missing, if negative all
* but one value will be set to missing (not yet implemented))
* @param attributeMissing if true, attributes will be modified
* @param classMissing if true, the class attribute will be modified
* @param attrIndex index of the attribute
*/
protected void addMissing(Instances data, int level,
boolean attributeMissing, boolean classMissing, int attrIndex) {
int classIndex = data.classIndex();
Random random = new Random(1);
for (int i = 0; i < data.numInstances(); i++) {
Instance current = data.instance(i);
for (int j = 0; j < data.numAttributes(); j++) {
if (((j == classIndex) && classMissing)
|| ((j == attrIndex) && attributeMissing)) {
if (random.nextInt(100) < level) {
current.setMissing(j);
}
}
}
}
}
/**
* Make a simple set of instances, which can later be modified for use in
* specific tests.
*
* @param seed the random number seed
* @param numInstances the number of instances to generate
* @param numAttr the number of attributes
* @param attrTypes the attribute types
* @param numClasses the number of classes (if nominal class)
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @return the test dataset
* @throws Exception if the dataset couldn't be generated
* @see #process(Instances)
*/
protected Instances makeTestDataset(int seed, int numInstances, int numAttr,
AttrTypes attrTypes, int numClasses, int classType) throws Exception {
return makeTestDataset(seed, numInstances, numAttr, attrTypes, numClasses,
classType, TestInstances.CLASS_IS_LAST);
}
/**
* Make a simple set of instances with variable position of the class
* attribute, which can later be modified for use in specific tests.
*
* @param seed the random number seed
* @param numInstances the number of instances to generate
* @param numAttr the number of attributes to generate
* @param attrTypes the type of attribute that is accepted
* @param numClasses the number of classes (if nominal class)
* @param classType the class type (NUMERIC, NOMINAL, etc.)
* @param classIndex the index of the class (0-based, -1 as last)
* @return the test dataset
* @throws Exception if the dataset couldn't be generated
* @see TestInstances#CLASS_IS_LAST
* @see #process(Instances)
*/
protected Instances makeTestDataset(int seed, int numInstances, int numAttr,
AttrTypes attrTypes, int numClasses, int classType, int classIndex)
throws Exception {
TestInstances dataset = new TestInstances();
dataset.setSeed(seed);
dataset.setNumInstances(numInstances);
dataset.setNumNominal(attrTypes.nominal ? numAttr : 0);
dataset.setNumNumeric(attrTypes.numeric ? numAttr : 0);
dataset.setNumString(attrTypes.string ? numAttr : 0);
dataset.setNumDate(attrTypes.date ? numAttr : 0);
dataset.setNumRelational(attrTypes.relational ? numAttr : 0);
dataset.setNumClasses(numClasses);
dataset.setClassType(classType);
dataset.setClassIndex(classIndex);
return process(dataset.generate());
}
/**
* Make a simple set of values. Only one of the num'type' parameters should be
* larger than 0 (just to keep the parameters similar to the makeTestDataset
* parameters).
*
* @param seed the random number seed
* @param numValues the number of values to generate
* @param data the dataset to make test examples for
* @param attrIndex index of the attribute
* @param attrType the class type (NUMERIC, NOMINAL, etc.)
* @throws Exception if the dataset couldn't be generated
* @see #process(Instances)
*/
protected Vector<Double> makeTestValueList(int seed, int numValues,
Instances data, int attrIndex, int attrType) throws Exception {
// get min max
double[] minMax = getMinimumMaximum(data, attrIndex);
double minValue = minMax[0];
double maxValue = minMax[1];
// make value list and put into a VECTOR
double range = maxValue - minValue;
Vector<Double> values = new Vector<Double>(numValues);
Random random = new Random(seed);
if (attrType == Attribute.NOMINAL) {
for (int i = 0; i < numValues; i++) {
Double v = new Double(random.nextInt((int) range)
+ (int) minValue);
values.add(v);
}
}
if (attrType == Attribute.NUMERIC) {
for (int i = 0; i < numValues; i++) {
Double v = new Double(random.nextDouble() * range + minValue);
values.add(v);
}
}
return values;
}
/**
* Make a simple set of values. Only one of the num'type' parameters should be
* larger than 0 (just to keep the parameters similar to the makeTestDataset
* parameters).
*
* @param seed the random number seed
* @param numValues the number of values to generate
* @param minValue the minimal data value
* @param maxValue the maximal data value
* @param attrType the class type (NUMERIC, NOMINAL, etc.)
* @throws Exception if the dataset couldn't be generated
* @see #process(Instances)
*/
protected Vector<Double> makeTestValueList(int seed, int numValues,
double minValue, double maxValue, int attrType) throws Exception {
// make value list and put into a VECTOR
double range = maxValue - minValue;
Vector<Double> values = new Vector<Double>(numValues);
Random random = new Random(seed);
if (attrType == Attribute.NOMINAL) {
for (int i = 0; i < numValues; i++) {
Double v = new Double(random.nextInt((int) range)
+ (int) minValue);
values.add(v);
}
}
if (attrType == Attribute.NUMERIC) {
for (int i = 0; i < numValues; i++) {
Double v = new Double(random.nextDouble() * range + minValue);
values.add(v);
}
}
return values;
}
/**
* Test with test values.
*
* @param est estimator to be tested
* @param test vector with test values
*
**/
protected Vector<Double> testWithTestValues(Estimator est,
Vector<Double> test) {
Vector<Double> results = new Vector<Double>();
for (int i = 0; i < test.size(); i++) {
double testValue = (test.elementAt(i)).doubleValue();
double prob = est.getProbability(testValue);
Double p = new Double(prob);
results.add(p);
}
return results;
}
/**
* Gets the minimum and maximum of the values of the given attribute of the
* given data set
*
* @param inst the instances
* @param attrIndex the index of the attribute to find min and max for
* @return the array with the minimum value on index 0 and the max on index 1
*/
protected double[] getMinimumMaximum(Instances inst, int attrIndex) {
double[] minMax = new double[2];
try {
getMinMax(inst, attrIndex, minMax);
} catch (Exception ex) {
ex.printStackTrace();
System.out.println(ex.getMessage());
}
return minMax;
// double minValue = minMax[0];
// double maxValue = minMax[1];
}
/**
* Find the minimum and the maximum of the attribute and return them in the
* last parameter.
*
* @param inst instances used to build the estimator
* @param attrIndex index of the attribute
* @param minMax the array to return minimum and maximum in
* @return number of not missing values
* @exception Exception if parameter minMax wasn't initialized properly
*/
public static int getMinMax(Instances inst, int attrIndex, double[] minMax)
throws Exception {
double min = Double.NaN;
double max = Double.NaN;
Instance instance = null;
int numNotMissing = 0;
if ((minMax == null) || (minMax.length < 2)) {
throw new Exception("Error in Program, privat method getMinMax");
}
Enumeration<Instance> enumInst = inst.enumerateInstances();
if (enumInst.hasMoreElements()) {
do {
instance = enumInst.nextElement();
} while (instance.isMissing(attrIndex) && (enumInst.hasMoreElements()));
// add values if not missing
if (!instance.isMissing(attrIndex)) {
numNotMissing++;
min = instance.value(attrIndex);
max = instance.value(attrIndex);
}
while (enumInst.hasMoreElements()) {
instance = enumInst.nextElement();
if (!instance.isMissing(attrIndex)) {
numNotMissing++;
if (instance.value(attrIndex) < min) {
min = (instance.value(attrIndex));
} else {
if (instance.value(attrIndex) > max) {
max = (instance.value(attrIndex));
}
}
}
}
}
minMax[0] = min;
minMax[1] = max;
return numNotMissing;
}
/**
* Print the probabilities after testing
*
* @param probs vector with probability values
* @return string with probability values printed
*/
private String probsToString(Vector<Double> probs) {
StringBuffer txt = new StringBuffer(" ");
for (int i = 0; i < probs.size(); i++) {
txt.append("" + (probs.elementAt(i)).doubleValue() + " ");
}
return txt.toString();
}
/**
* Provides a hook for derived classes to further modify the data.
*
* @param data the data to process
* @return the processed data
* @see #m_PostProcessor
*/
protected Instances process(Instances data) {
if (getPostProcessor() == null) {
return data;
} else {
return getPostProcessor().process(data);
}
}
/**
* Print out a short summary string for the dataset characteristics
*
* @param attrTypes the attribute types used (NUMERIC, NOMINAL, etc.)
* @param classType the class type (NUMERIC, NOMINAL, etc.)
*/
protected void printAttributeSummary(AttrTypes attrTypes, int classType) {
String str = "";
if (attrTypes.numeric) {
str += " numeric";
}
if (attrTypes.nominal) {
if (str.length() > 0) {
str += " &";
}
str += " nominal";
}
if (attrTypes.string) {
if (str.length() > 0) {
str += " &";
}
str += " string";
}
if (attrTypes.date) {
if (str.length() > 0) {
str += " &";
}
str += " date";
}
if (attrTypes.relational) {
if (str.length() > 0) {
str += " &";
}
str += " relational";
}
str += " attributes)";
switch (classType) {
case Attribute.NUMERIC:
str = " (numeric class," + str;
break;
case Attribute.NOMINAL:
str = " (nominal class," + str;
break;
case Attribute.STRING:
str = " (string class," + str;
break;
case Attribute.DATE:
str = " (date class," + str;
break;
case Attribute.RELATIONAL:
str = " (relational class," + str;
break;
}
print(str);
}
/**
* Print out a short summary string for the dataset characteristics
*
* @param attrType the attribute type (NUMERIC, NOMINAL, etc.)
* @param classType the class type (NUMERIC, NOMINAL, etc.)
*/
protected void printAttributeSummary(int attrType, int classType) {
String str = "";
switch (attrType) {
case Attribute.NUMERIC:
str = " numeric" + str;
break;
case Attribute.NOMINAL:
str = " nominal" + str;
break;
case Attribute.STRING:
str = " string" + str;
break;
case Attribute.DATE:
str = " date" + str;
break;
case Attribute.RELATIONAL:
str = " relational" + str;
break;
}
str += " attribute(s))";
switch (classType) {
case Attribute.NUMERIC:
str = " (numeric class," + str;
break;
case Attribute.NOMINAL:
str = " (nominal class," + str;
break;
case Attribute.STRING:
str = " (string class," + str;
break;
case Attribute.DATE:
str = " (date class," + str;
break;
case Attribute.RELATIONAL:
str = " (relational class," + str;
break;
}
print(str);
}
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11247 $");
}
/**
* Test method for this class
*
* @param args the commandline parameters
*/
public static void main(String[] args) {
try {
CheckEstimator check = new CheckEstimator();
try {
check.setOptions(args);
Utils.checkForRemainingOptions(args);
} catch (Exception ex) {
String result = ex.getMessage() + "\n\n"
+ check.getClass().getName().replaceAll(".*\\.", "")
+ " Options:\n\n";
Enumeration<Option> enu = check.listOptions();
while (enu.hasMoreElements()) {
Option option = enu.nextElement();
result += option.synopsis() + "\n" + option.description() + "\n";
}
throw new Exception(result);
}
check.doTests();
} catch (Exception ex) {
System.err.println(ex.getMessage());
}
}
}