weka.classifiers.CheckClassifier Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CheckClassifier.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.CheckScheme;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.MultiInstanceCapabilitiesHandler;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SerializationHelper;
import weka.core.TestInstances;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;

/**
 * Class for examining the capabilities and finding problems with classifiers.
 * If you implement a classifier using the WEKA libraries, you should run the
 * checks on it to ensure robustness and correct operation. Passing all the
 * tests of this object does not mean bugs in the classifier don't exist, but
 * this will help find some common ones.
 * 
 * 
 * Typical usage:
 * 

 * java weka.classifiers.CheckClassifier -W classifier_name
 * classifier_options 
 * 

 * 
 * CheckClassifier reports on the following:
 * 

 * Classifier abilities
 * 
 * Possible command line options to the classifier
 * Whether the classifier can predict nominal, numeric, string, date or
 * relational class attributes. Warnings will be displayed if performance is
 * worse than ZeroR
 * Whether the classifier can be trained incrementally
 * Whether the classifier can handle numeric predictor attributes
 * Whether the classifier can handle nominal predictor attributes
 * Whether the classifier can handle string predictor attributes
 * Whether the classifier can handle date predictor attributes
 * Whether the classifier can handle relational predictor attributes
 * Whether the classifier can handle multi-instance data
 * Whether the classifier can handle missing predictor values
 * Whether the classifier can handle missing class values
 * Whether a nominal classifier only handles 2 class problems
 * Whether the classifier can handle instance weights
 * 
 * 
 * Correct functioning
 * 
 * Correct initialisation during buildClassifier (i.e. no result changes
 * when buildClassifier called repeatedly)
 * Whether incremental training produces the same results as during
 * non-incremental training (which may or may not be OK)
 * Whether the classifier alters the data passed to it (number of instances,
 * instance order, instance weights, etc)
 * Whether the toString() method works correctly before the classifier has
 * been built.
 * 
 * 
 * Degenerate cases
 * 
 * building classifier with zero training instances
 * all but one predictor attribute values missing
 * all predictor attribute values missing
 * all but one class values missing
 * all class values missing
 * 
 * 
 * 
 * Running CheckClassifier with the debug option set will output the training
 * and test datasets for any failed tests.
 * 
 * 
 * The weka.classifiers.AbstractClassifierTest uses this class to
 * test all the classifiers. Any changes here, have to be checked in that
 * abstract test class, too.
 * 

 * 
 *  Valid options are:
 * 

 * 
 * 
 * -D
 *  Turn on debugging output.
 * 
 * 
 *  * -S
 *  Silent mode - prints nothing to stdout.
 * 
 * 
 *  * -N <num>
 *  The number of instances in the datasets (default 20).
 * 
 * 
 *  * -nominal <num>
 *  The number of nominal attributes (default 2).
 * 
 * 
 *  * -nominal-values <num>
 *  The number of values for nominal attributes (default 1).
 * 
 * 
 *  * -numeric <num>
 *  The number of numeric attributes (default 1).
 * 
 * 
 *  * -string <num>
 *  The number of string attributes (default 1).
 * 
 * 
 *  * -date <num>
 *  The number of date attributes (default 1).
 * 
 * 
 *  * -relational <num>
 *  The number of relational attributes (default 1).
 * 
 * 
 *  * -num-instances-relational <num>
 *  The number of instances in relational/bag attributes (default 10).
 * 
 * 
 *  * -words <comma-separated-list>
 *  The words to use in string attributes.
 * 
 * 
 *  * -word-separators <chars>
 *  The word separators to use in string attributes.
 * 
 * 
 *  * -W
 *  Full name of the classifier analysed.
 *  eg: weka.classifiers.bayes.NaiveBayes
 *  (default weka.classifiers.rules.ZeroR)
 * 
 * 
 *  * Options specific to classifier weka.classifiers.rules.ZeroR:
 * 
 * 
 *  * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * 
 * 
 * 
 * 
 * Options after -- are passed to the designated classifier.
 * 
 * 
 * @author Len Trigg ([email protected])
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 11253 $
 * @see TestInstances
 */
public class CheckClassifier extends CheckScheme {

  /*
   * Note about test methods:
   * - methods return an array of booleans
   * - first index: success or not
   * - second index: acceptable or not (e.g., an Exception is OK)
   * - in case the performance is worse than that of ZeroR, both indices are
   *   true
   * 
   * FracPete (fracpete at waikato dot ac dot nz)
   */

  /** The classifier to be examined; initialised to a ZeroR instance. */
  protected Classifier m_Classifier = new weka.classifiers.rules.ZeroR();

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration

   * 
   * 
   * -D
   *  Turn on debugging output.
   * 
   * 
   *    * -S
   *  Silent mode - prints nothing to stdout.
   * 
   * 
   *    * -N <num>
   *  The number of instances in the datasets (default 20).
   * 
   * 
   *    * -nominal <num>
   *  The number of nominal attributes (default 2).
   * 
   * 
   *    * -nominal-values <num>
   *  The number of values for nominal attributes (default 1).
   * 
   * 
   *    * -numeric <num>
   *  The number of numeric attributes (default 1).
   * 
   * 
   *    * -string <num>
   *  The number of string attributes (default 1).
   * 
   * 
   *    * -date <num>
   *  The number of date attributes (default 1).
   * 
   * 
   *    * -relational <num>
   *  The number of relational attributes (default 1).
   * 
   * 
   *    * -num-instances-relational <num>
   *  The number of instances in relational/bag attributes (default 10).
   * 
   * 
   *    * -words <comma-separated-list>
   *  The words to use in string attributes.
   * 
   * 
   *    * -word-separators <chars>
   *  The word separators to use in string attributes.
   * 
   * 
   *    * -W
   *  Full name of the classifier analysed.
   *  eg: weka.classifiers.bayes.NaiveBayes
   *  (default weka.classifiers.rules.ZeroR)
   * 
   * 
   *    * Options specific to classifier weka.classifiers.rules.ZeroR:
   * 
   * 
   *    * -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console
   * 
   * 
   * 
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    // let the superclass consume its own options first
    super.setOptions(options);

    // -W names the classifier to check; fall back to ZeroR if it is absent
    String classifierName = Utils.getOption('W', options);
    if (classifierName.isEmpty()) {
      classifierName = weka.classifiers.rules.ZeroR.class.getName();
    }

    // the options partitioned off by Utils.partitionOptions are handed to
    // the classifier being instantiated
    String[] classifierOptions = Utils.partitionOptions(options);
    Object scheme = forName("weka.classifiers", Classifier.class,
      classifierName, classifierOptions);
    setClassifier((Classifier) scheme);
  }

  /**
   * Gets the current settings of the CheckClassifier.
   * <p>
   * The result consists of the superclass options, followed by
   * <code>-W &lt;classname&gt;</code> if a classifier is set and, if that
   * classifier is an OptionHandler with at least one option, a
   * <code>--</code> separator followed by the classifier's own options.
   * 
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    // typed list: a raw collection would make toArray(..) return Object[],
    // which cannot be returned as String[]
    ArrayList<String> result = new ArrayList<String>();

    Collections.addAll(result, super.getOptions());

    if (getClassifier() != null) {
      result.add("-W");
      result.add(getClassifier().getClass().getName());
    }

    // instanceof is false for null, so no separate null check is needed
    if (m_Classifier instanceof OptionHandler) {
      String[] options = ((OptionHandler) m_Classifier).getOptions();
      if (options.length > 0) {
        result.add("--");
        Collections.addAll(result, options);
      }
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Runs the complete set of checks on the current classifier and reports the
   * results via print/println. Does nothing apart from printing a notice if no
   * classifier is set.
   */
  @Override
  public void doTests() {
    final Classifier scheme = getClassifier();
    if (scheme == null) {
      println("\n=== No classifier set ===");
      return;
    }

    println("\n=== Check on Classifier: " + scheme.getClass().getName()
      + " ===\n");

    // interface checks
    m_ClasspathProblems = false;
    println("--> Checking for interfaces");
    canTakeOptions();
    final boolean isUpdateable = updateableClassifier()[0];
    final boolean handlesWeights = weightedInstancesHandler()[0];
    final boolean handlesMultiInstance = multiInstanceHandler()[0];

    // general classifier checks
    println("--> Classifier tests");
    declaresSerialVersionUID();
    testToString();

    // one battery of tests per class attribute type, in this fixed order
    final int[] classTypes = { Attribute.NOMINAL, Attribute.NUMERIC,
      Attribute.DATE, Attribute.STRING, Attribute.RELATIONAL };
    for (int classType : classTypes) {
      testsPerClassType(classType, isUpdateable, handlesWeights,
        handlesMultiInstance);
    }
  }

  /**
   * Sets the classifier that is to be examined by the checks.
   * 
   * @param newClassifier the Classifier to examine
   */
  public void setClassifier(Classifier newClassifier) {
    this.m_Classifier = newClassifier;
  }

  /**
   * Returns the classifier that is currently being examined.
   * 
   * @return the classifier under examination
   */
  public Classifier getClassifier() {
    return this.m_Classifier;
  }

  /**
   * Runs a battery of tests for a given class attribute type.
   * <p>
   * First determines, per predictor attribute type, whether the classifier
   * can predict the given class type. The remaining tests are only run if at
   * least one predictor type is supported.
   * 
   * @param classType the type of the class attribute (one of the Attribute
   *          type constants, e.g. Attribute.NOMINAL)
   * @param updateable true if the classifier is updateable
   * @param weighted true if the classifier says it handles weights
   * @param multiInstance true if the classifier is a multi-instance classifier
   */
  protected void testsPerClassType(int classType, boolean updateable,
    boolean weighted, boolean multiInstance) {

    final boolean nominalOk = canPredict(true, false, false, false, false,
      multiInstance, classType)[0];
    final boolean numericOk = canPredict(false, true, false, false, false,
      multiInstance, classType)[0];
    final boolean stringOk = canPredict(false, false, true, false, false,
      multiInstance, classType)[0];
    final boolean dateOk = canPredict(false, false, false, true, false,
      multiInstance, classType)[0];
    // relational predictors are not probed for multi-instance classifiers;
    // the short-circuit skips the canPredict call in that case
    final boolean relationalOk = !multiInstance
      && canPredict(false, false, false, false, true, multiInstance,
        classType)[0];

    // nothing else to test if no predictor type is supported
    if (!(nominalOk || numericOk || stringOk || dateOk || relationalOk)) {
      return;
    }

    if (weighted) {
      instanceWeights(nominalOk, numericOk, stringOk, dateOk, relationalOk,
        multiInstance, classType);
    }

    canHandleOnlyClass(nominalOk, numericOk, stringOk, dateOk, relationalOk,
      classType);

    if (classType == Attribute.NOMINAL) {
      canHandleNClasses(nominalOk, numericOk, stringOk, dateOk, relationalOk,
        multiInstance, 4);
    }

    // class attribute position tests are skipped for multi-instance schemes
    if (!multiInstance) {
      for (int classIndex = 0; classIndex <= 1; classIndex++) {
        canHandleClassAsNthAttribute(nominalOk, numericOk, stringOk, dateOk,
          relationalOk, multiInstance, classType, classIndex);
      }
    }

    canHandleZeroTraining(nominalOk, numericOk, stringOk, dateOk,
      relationalOk, multiInstance, classType);

    // missing predictor values: the level-100 check only runs if the
    // level-20 check succeeded
    final boolean missingPredictorsOk = canHandleMissing(nominalOk, numericOk,
      stringOk, dateOk, relationalOk, multiInstance, classType, true, false,
      20)[0];
    if (missingPredictorsOk) {
      canHandleMissing(nominalOk, numericOk, stringOk, dateOk, relationalOk,
        multiInstance, classType, true, false, 100);
    }

    // missing class values: same two-step scheme
    final boolean missingClassOk = canHandleMissing(nominalOk, numericOk,
      stringOk, dateOk, relationalOk, multiInstance, classType, false, true,
      20)[0];
    if (missingClassOk) {
      canHandleMissing(nominalOk, numericOk, stringOk, dateOk, relationalOk,
        multiInstance, classType, false, true, 100);
    }

    correctBuildInitialisation(nominalOk, numericOk, stringOk, dateOk,
      relationalOk, multiInstance, classType);
    datasetIntegrity(nominalOk, numericOk, stringOk, dateOk, relationalOk,
      multiInstance, classType, missingPredictorsOk, missingClassOk);
    doesntUseTestClassVal(nominalOk, numericOk, stringOk, dateOk,
      relationalOk, multiInstance, classType);

    if (updateable) {
      updatingEquality(nominalOk, numericOk, stringOk, dateOk, relationalOk,
        multiInstance, classType);
    }
  }

  /**
   * Checks whether the scheme's toString() method works even though the
   * classifier hasn't been built yet.
   * <p>
   * A fresh instance of the classifier's class is created via its no-argument
   * constructor and toString() is called on it. Any exception (including a
   * failure to instantiate) counts as a failed test; with debugging turned on,
   * the stack trace is printed as well.
   * 
   * @return index 0 is true if the toString() method works fine
   */
  protected boolean[] testToString() {
    boolean[] result = new boolean[2];

    print("toString...");

    try {
      // Class.newInstance() is deprecated because it bypasses compile-time
      // exception checking; go through the no-arg constructor instead
      Classifier copy = m_Classifier.getClass().getDeclaredConstructor()
        .newInstance();
      copy.toString();
      result[0] = true;
      println("yes");
    } catch (Exception e) {
      result[0] = false;
      println("no");
      if (m_Debug) {
        println("\n=== Full report ===");
        e.printStackTrace();
        println("\n");
      }
    }

    return result;
  }

  /**
   * Tests whether the scheme declares a serialVersionUID and prints "yes" or
   * "no" accordingly. The test fails if SerializationHelper reports that the
   * classifier's class still needs a UID.
   * 
   * @return index 0 is true if the scheme declares a UID
   */
  protected boolean[] declaresSerialVersionUID() {
    final boolean[] outcome = new boolean[2];

    print("serialVersionUID...");

    final boolean uidMissing = SerializationHelper.needsUID(m_Classifier
      .getClass());
    outcome[0] = !uidMissing;

    println(outcome[0] ? "yes" : "no");

    return outcome;
  }

  /**
   * Checks whether the scheme can take command line options.
   * 
   * @return index 0 is true if the classifier can take options
   */
  protected boolean[] canTakeOptions() {

    boolean[] result = new boolean[2];

    print("options...");
    if (m_Classifier instanceof OptionHandler) {
      println("yes");
      if (m_Debug) {
        println("\n=== Full report ===");
        Enumeration