// weka.datagenerators.classifiers.classification.RDG1 Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * RDG1.java
 * Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators.classifiers.classification;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.datagenerators.ClassificationGenerator;
import weka.datagenerators.Test;

/**
 * A data generator that produces data randomly by producing a decision list.<br>
 * The decision list consists of rules.<br>
 * Instances are generated randomly one by one. If the decision list fails to
 * classify the current instance, a new rule according to this current instance
 * is generated and added to the decision list.<br>
 * <br>
 * The option -V switches on voting, which means that at the end of the
 * generation all instances are reclassified to the class value that is
 * supported by the most rules.<br>
 * <br>
 * This data generator can generate 'boolean' attributes (= nominal with the
 * values {true, false}) and numeric attributes. The rules can be 'A' or 'NOT A'
 * for boolean values and 'B &lt; random_value' or 'B &gt;= random_value' for
 * numeric values.
 * 
 * 
 * 
 * Valid options are:
 * <p>
 *
 * <pre>
 * -h
 *  Prints this help.
 * </pre>
 *
 * <pre>
 * -o &lt;file&gt;
 *  The name of the output file, otherwise the generated data is
 *  printed to stdout.
 * </pre>
 *
 * <pre>
 * -r &lt;name&gt;
 *  The name of the relation.
 * </pre>
 *
 * <pre>
 * -d
 *  Whether to print debug information.
 * </pre>
 *
 * <pre>
 * -S
 *  The seed for random function (default 1)
 * </pre>
 *
 * <pre>
 * -n &lt;num&gt;
 *  The number of examples to generate (default 100)
 * </pre>
 *
 * <pre>
 * -a &lt;num&gt;
 *  The number of attributes (default 10).
 * </pre>
 *
 * <pre>
 * -c &lt;num&gt;
 *  The number of classes (default 2)
 * </pre>
 *
 * <pre>
 * -R &lt;num&gt;
 *  maximum size for rules (default 10)
 * </pre>
 *
 * <pre>
 * -M &lt;num&gt;
 *  minimum size for rules (default 1)
 * </pre>
 *
 * <pre>
 * -I &lt;num&gt;
 *  number of irrelevant attributes (default 0)
 * </pre>
 *
 * <pre>
 * -N
 *  number of numeric attributes (default 0)
 * </pre>
 *
 * <pre>
 * -V
 *  switch on voting (default is no voting)
 * </pre>
 * 
 * 
 * The following is an example of a generated dataset:
 * <p>
 *
 * <pre>
 * %
 * % weka.datagenerators.RDG1 -r expl -a 2 -c 3 -n 4 -N 1 -I 0 -M 2 -R 10 -S 2
 * %
 * &#64;relation expl
 *
 * &#64;attribute a0 {false,true}
 * &#64;attribute a1 numeric
 * &#64;attribute class {c0,c1,c2}
 *
 * &#64;data
 *
 * true,0.496823,c0
 * false,0.743158,c1
 * false,0.408285,c1
 * false,0.993687,c2
 * %
 * % Number of attributes chosen as irrelevant = 0
 * %
 * % DECISIONLIST (number of rules = 3):
 * % RULE 0:   c0 := a1 &lt; 0.986, a0
 * % RULE 1:   c1 := a1 &lt; 0.95, not(a0)
 * % RULE 2:   c2 := not(a0), a1 &gt;= 0.562
 * </pre>
 * 
 * @author Gabi Schmidberger ([email protected])
 * @version $Revision: 10203 $
 */
public class RDG1 extends ClassificationGenerator {

  /** for serialization: explicit version identifier of this class's serialized form */
  static final long serialVersionUID = 7751005204635320414L;

  /**
   * Class to represent one rule of the decision list: an ordered list of
   * {@link Test}s together with the class value that is returned for an
   * instance that passes every one of them.
   */
  private class RuleList implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = 2830125413361938177L;

    /** the tests of this rule; stays null until the first test is added */
    private ArrayList<Test> m_RuleList = null;

    /** class value returned for instances that pass all tests */
    double m_ClassValue = 0.0;

    /**
     * returns the class value
     * 
     * @return the class value
     */
    public double getClassValue() {
      return m_ClassValue;
    }

    /**
     * sets the class value
     * 
     * @param newClassValue the new classvalue
     */
    public void setClassValue(double newClassValue) {
      m_ClassValue = newClassValue;
    }

    /**
     * adds the given test to the end of the list, creating the list on first
     * use
     * 
     * @param newTest the test to add
     */
    private void addTest(Test newTest) {
      if (m_RuleList == null) {
        m_RuleList = new ArrayList<Test>();
      }

      m_RuleList.add(newTest);
    }

    /**
     * classifies the given example: the tests are evaluated in list order and
     * evaluation stops at the first test the example does not pass. A rule
     * without any tests (list empty or never created) is passed by every
     * example.
     * 
     * @param example the instance to classify
     * @return the class value of this rule if the example passes all tests,
     *         otherwise -1.0
     * @throws Exception if evaluating one of the tests throws
     */
    private double classifyInstance(Instance example) throws Exception {
      // a list that was never created is treated like an empty one
      if (m_RuleList != null) {
        for (Test test : m_RuleList) {
          if (!test.passesTest(example)) {
            return -1.0;
          }
        }
      }
      return m_ClassValue;
    }

    /**
     * returns a string representation of the rule list in the form
     * "  c&lt;class&gt; := test1, test2, ...", using the prolog notation of
     * each test
     * 
     * @return the rule list as string
     */
    @Override
    public String toString() {
      StringBuilder str = new StringBuilder();
      str.append("  c").append((int) m_ClassValue).append(" := ");

      // a list that was never created is printed like an empty one
      if (m_RuleList != null) {
        boolean first = true;
        for (Test test : m_RuleList) {
          if (!first) {
            str.append(", ");
          }
          str.append(test.toPrologString());
          first = false;
        }
      }
      return str.toString();
    }

    /**
     * Returns the revision string.
     * 
     * @return the revision
     */
    @Override
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 10203 $");
    }
  } /* end class RuleList ***** */

  /** Number of attributes the dataset should have */
  protected int m_NumAttributes;

  /** Number of classes the dataset should have */
  protected int m_NumClasses;

  /** maximum rule size */
  private int m_MaxRuleSize;

  /** minimum rule size */
  private int m_MinRuleSize;

  /** number of irrelevant attributes */
  private int m_NumIrrelevant;

  /** number of numeric attributes */
  private int m_NumNumeric;

  /** flag that stores whether voting is wished; initially off */
  private boolean m_VoteFlag = false;

  /**
   * decision list; initially null.
   * NOTE(review): declared as a raw ArrayList -- the element type is
   * presumably RuleList; confirm against the code that fills it.
   */
  private ArrayList m_DecisionList = null;

  /**
   * array defines which attributes are irrelevant, with: true = attribute is
   * irrelevant; false = attribute is not irrelevant.
   * NOTE(review): presumably indexed by attribute position -- confirm against
   * the code that allocates it.
   */
  boolean[] m_AttList_Irr;

  /**
   * initializes the generator with default values: after the superclass
   * constructor has run, each generator-specific setting is assigned by
   * passing the result of its default method to the matching setter.
   */
  public RDG1() {
    super();

    // every setting is seeded from its own default method; the setters and
    // defaults are defined elsewhere, so the call order is kept as is
    setNumAttributes(defaultNumAttributes());
    setNumClasses(defaultNumClasses());
    setMaxRuleSize(defaultMaxRuleSize());
    setMinRuleSize(defaultMinRuleSize());
    setNumIrrelevant(defaultNumIrrelevant());
    setNumNumeric(defaultNumNumeric());
  }

  /**
   * Provides the descriptive text for this data generator.
   * 
   * @return the description of the data generator, suitable for displaying in
   *         the explorer/experimenter gui
   */
  public String globalInfo() {
    StringBuilder info = new StringBuilder();

    // how the data is produced
    info.append("A data generator that produces data randomly by producing a decision list.\n");
    info.append("The decision list consists of rules.\n");
    info.append("Instances are generated randomly one by one. If decision list fails ");
    info.append("to classify the current instance, a new rule according to this current ");
    info.append("instance is generated and added to the decision list.\n\n");

    // what the voting option does
    info.append("The option -V switches on voting, which means that at the end ");
    info.append("of the generation all instances are ");
    info.append("reclassified to the class value that is supported by the most rules.\n\n");

    // supported attribute types and rule forms
    info.append("This data generator can generate 'boolean' attributes (= nominal with ");
    info.append("the values {true, false}) and numeric attributes. The rules can be ");
    info.append("'A' or 'NOT A' for boolean values and 'B < random_value' or ");
    info.append("'B >= random_value' for numeric values.");

    return info.toString();
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options
   */
  @Override
  public Enumeration