// weka.datagenerators.clusterers.SubspaceCluster Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    SubspaceCluster.java
 *    Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Tag;
import weka.core.Utils;
import weka.datagenerators.ClusterDefinition;
import weka.datagenerators.ClusterGenerator;

/**
 * A data generator that produces data points in hyperrectangular subspace
 * clusters.
 * <p>
 * Valid options are:
 *
 * <pre>
 * -h
 *  Prints this help.
 * </pre>
 *
 * <pre>
 * -o &lt;file&gt;
 *  The name of the output file, otherwise the generated data is
 *  printed to stdout.
 * </pre>
 *
 * <pre>
 * -r &lt;name&gt;
 *  The name of the relation.
 * </pre>
 *
 * <pre>
 * -d
 *  Whether to print debug information.
 * </pre>
 *
 * <pre>
 * -S
 *  The seed for random function (default 1)
 * </pre>
 *
 * <pre>
 * -a &lt;num&gt;
 *  The number of attributes (default 1).
 * </pre>
 *
 * <pre>
 * -c
 *  Class Flag, if set, the cluster is listed in extra attribute.
 * </pre>
 *
 * <pre>
 * -b &lt;range&gt;
 *  The indices for boolean attributes.
 * </pre>
 *
 * <pre>
 * -m &lt;range&gt;
 *  The indices for nominal attributes.
 * </pre>
 *
 * <pre>
 * -P &lt;num&gt;
 *  The noise rate in percent (default 0.0).
 *  Can be between 0% and 30%. (Remark: The original
 *  algorithm only allows noise up to 10%.)
 * </pre>
 *
 * <pre>
 * -C &lt;cluster-definition&gt;
 *  A cluster definition of class 'SubspaceClusterDefinition'
 *  (definition needs to be quoted to be recognized as
 *  a single argument).
 * </pre>
 *
 * <pre>
 * Options specific to weka.datagenerators.clusterers.SubspaceClusterDefinition:
 * </pre>
 *
 * <pre>
 * -A &lt;range&gt;
 *  Generates randomly distributed instances in the cluster.
 * </pre>
 *
 * <pre>
 * -U &lt;range&gt;
 *  Generates uniformly distributed instances in the cluster.
 * </pre>
 *
 * <pre>
 * -G &lt;range&gt;
 *  Generates gaussian distributed instances in the cluster.
 * </pre>
 *
 * <pre>
 * -D &lt;num&gt;,&lt;num&gt;
 *  The attribute min/max (-A and -U) or mean/stddev (-G) for
 *  the cluster.
 * </pre>
 *
 * <pre>
 * -N &lt;num&gt;..&lt;num&gt;
 *  The range of number of instances per cluster (default 1..50).
 * </pre>
 *
 * <pre>
 * -I
 *  Uses integer instead of continuous values (default continuous).
 * </pre>
 *
 * @author Gabi Schmidberger ([email protected])
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 12478 $
 */
public class SubspaceCluster extends ClusterGenerator {

  /** Version identifier used for serialization of this class. */
  static final long serialVersionUID = -3454999858505621128L;

  /**
   * Noise rate in percent (option P, between 0 and 30). The constructor
   * initialises it by calling setNoiseRate(defaultNoiseRate()).
   */
  protected double m_NoiseRate;

  /** The list of cluster definitions held by this generator. */
  protected ClusterDefinition[] m_Clusters;

  /**
   * Number of values, stored for nominal attributes.
   * NOTE(review): presumably indexed by attribute position - confirm against
   * the code that fills this array.
   */
  protected int[] m_numValues;

  /** Cluster type id: uniform/random. */
  public static final int UNIFORM_RANDOM = 0;

  /** Cluster type id: total uniform. */
  public static final int TOTAL_UNIFORM = 1;

  /** Cluster type id: gaussian. */
  public static final int GAUSSIAN = 2;

  /** Tags for the cluster types, one per type id, each with its text label. */
  public static final Tag[] TAGS_CLUSTERTYPE = new Tag[] {
    new Tag(UNIFORM_RANDOM, "uniform/random"),
    new Tag(TOTAL_UNIFORM, "total uniform"),
    new Tag(GAUSSIAN, "gaussian")
  };

  /** Cluster subtype id: continuous values. */
  public static final int CONTINUOUS = 0;

  /** Cluster subtype id: integer values. */
  public static final int INTEGER = 1;

  /** Tags for the cluster subtypes, one per subtype id, each with its text label. */
  public static final Tag[] TAGS_CLUSTERSUBTYPE = new Tag[] {
    new Tag(CONTINUOUS, "continuous"),
    new Tag(INTEGER, "integer")
  };

  /**
   * initializes the generator, sets the number of clusters to 0, since user has
   * to specify them explicitly
   */
  public SubspaceCluster() {
    super();

    setNoiseRate(defaultNoiseRate());
  }

  /**
   * Provides a short description of this data generator.
   *
   * @return the description text, suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    final String description =
      "A data generator that produces data points in hyperrectangular subspace clusters.";
    return description;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options
   */
  @Override
  public Enumeration