// weka.datagenerators.clusterers.BIRCHCluster Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    BIRCHCluster.java
 *    Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators.clusterers;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.datagenerators.ClusterGenerator;

/**
 *  Cluster data generator designed for the BIRCH
 * System

 * 

 * Dataset is generated with instances in K clusters.

 * Instances are 2-d data points.

 * Each cluster is characterized by the number of data points in it, its radius
 * and its center. The location of the cluster centers is determined by the
 * pattern parameter. Three patterns are currently supported grid, sine and
 * random.

 * 

 * For more information refer to:

 * 

 * Tian Zhang, Raghu Ramakrishnan, Miron Livny: BIRCH: An Efficient Data
 * Clustering Method for Very Large Databases. In: ACM SIGMOD International
 * Conference on Management of Data, 103-114, 1996.
 * 
 * 
 * 
 *  BibTeX:
 * 
 * 
 * @inproceedings{Zhang1996,
 *    author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny},
 *    booktitle = {ACM SIGMOD International Conference on Management of Data},
 *    pages = {103-114},
 *    publisher = {ACM Press},
 *    title = {BIRCH: An Efficient Data Clustering Method for Very Large Databases},
 *    year = {1996}
 * }
 * 
 * 
 * 
 * 
 *  Valid options are:
 * 

 * 
 * 
 * -h
 *  Prints this help.
 * 
 * 
 *  * -o <file>
 *  The name of the output file, otherwise the generated data is
 *  printed to stdout.
 * 
 * 
 *  * -r <name>
 *  The name of the relation.
 * 
 * 
 *  * -d
 *  Whether to print debug information.
 * 
 * 
 *  * -S
 *  The seed for random function (default 1)
 * 
 * 
 *  * -a <num>
 *  The number of attributes (default 10).
 * 
 * 
 *  * -c
 *  Class Flag, if set, the cluster is listed in extra attribute.
 * 
 * 
 *  * -b <range>
 *  The indices for boolean attributes.
 * 
 * 
 *  * -m <range>
 *  The indices for nominal attributes.
 * 
 * 
 *  * -k <num>
 *  The number of clusters (default 4)
 * 
 * 
 *  * -G
 *  Set pattern to grid (default is random).
 *  This flag cannot be used at the same time as flag I.
 *  The pattern is random, if neither flag G nor flag I is set.
 * 
 * 
 *  * -I
 *  Set pattern to sine (default is random).
 *  This flag cannot be used at the same time as flag G.
 *  The pattern is random, if neither flag G nor flag I is set.
 * 
 * 
 *  * -N <num>..<num>
 *  The range of number of instances per cluster (default 1..50).
 *  Lower number must be between 0 and 2500,
 *  upper number must be between 50 and 2500.
 * 
 * 
 *  * -R <num>..<num>
 *  The range of radius per cluster (default 0.1..1.4142135623730951).
 *  Lower number must be between 0 and SQRT(2), 
 *  upper number must be between SQRT(2) and SQRT(32).
 * 
 * 
 *  * -M <num>
 *  The distance multiplier (default 4.0).
 * 
 * 
 *  * -C <num>
 *  The number of cycles (default 4).
 * 
 * 
 *  * -O
 *  Flag for input order is ORDERED. If flag is not set then 
 *  input order is RANDOMIZED. RANDOMIZED is currently not 
 *  implemented, therefore is the input order always ORDERED.
 * 
 * 
 *  * -P <num>
 *  The noise rate in percent (default 0.0).
 *  Can be between 0% and 30%. (Remark: The original 
 *  algorithm only allows noise up to 10%.)
 * 
 * 
 * 
 * 
 * @author Gabi Schmidberger ([email protected])
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 12471 $
 */
public class BIRCHCluster extends ClusterGenerator implements
  TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = -334820527230755027L;

  /** number of clusters the dataset should have (option k) */
  protected int m_NumClusters;

  /** minimal number of instances per cluster (lower bound of option N) */
  private int m_MinInstNum;

  /** maximal number of instances per cluster (upper bound of option N) */
  private int m_MaxInstNum;

  /** minimum radius of a cluster (lower bound of option R) */
  private double m_MinRadius;

  /** maximum radius of a cluster (upper bound of option R) */
  private double m_MaxRadius;

  /** Pattern constant: cluster centers are placed on a grid (option G) */
  public static final int GRID = 0;
  /** Pattern constant: cluster centers follow a sine pattern (option I) */
  public static final int SINE = 1;
  /** Pattern constant: cluster centers are placed randomly (default) */
  public static final int RANDOM = 2;
  /** the pattern tags, one per pattern constant */
  public static final Tag[] TAGS_PATTERN = { new Tag(GRID, "Grid"),
    new Tag(SINE, "Sine"), new Tag(RANDOM, "Random") };

  /** selected pattern, one of GRID, SINE or RANDOM (changed with options G or I) */
  private int m_Pattern;

  /** distance multiplier (option M) */
  private double m_DistMult;

  /** number of cycles (option C) */
  private int m_NumCycles;

  /** Input order constant: instances are generated in order (option O) */
  public static final int ORDERED = 0;
  /** Input order constant: instances are randomized (default) */
  public static final int RANDOMIZED = 1;
  /** the input order tags, one per input order constant */
  public static final Tag[] TAGS_INPUTORDER = { new Tag(ORDERED, "ordered"),
    new Tag(RANDOMIZED, "randomized") };

  /** selected input order, ORDERED or RANDOMIZED (changed with option O) */
  private int m_InputOrder;

  /** noise rate in percent (option P, between 0 and 30) */
  private double m_NoiseRate;

  /** list of the clusters; presumably holds Cluster objects - TODO confirm */
  private ArrayList m_ClusterList;

  // the following two fields are only used when the pattern is GRID
  /** grid size */
  private int m_GridSize;

  /** grid width */
  private double m_GridWidth;

  /**
   * Describes one cluster of the generated dataset: how many instances it
   * holds, its radius and where its center lies. Needs the enclosing generator
   * for the number of attributes and the number of clusters.
   */
  private class Cluster implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -8336901069823498140L;

    /** how many instances belong to this cluster */
    private final int m_InstNum;

    /**
     * radius of the cluster; the variance is radius ** 2 / 2
     */
    private final double m_Radius;

    /** coordinates of the cluster center, one value per attribute */
    private final double[] m_Center;

    /**
     * Creates a cluster with a randomly placed center (pattern = RANDOM).
     * Every coordinate is a random fraction of the number of clusters.
     * 
     * @param instNum the number of instances
     * @param radius radius of the cluster
     * @param random the random number generator to use
     */
    private Cluster(int instNum, double radius, Random random) {
      final double[] center = new double[getNumAttributes()];
      // one draw per dimension, in ascending dimension order
      for (int dim = 0; dim < center.length; dim++) {
        center[dim] = m_NumClusters * random.nextDouble();
      }

      m_InstNum = instNum;
      m_Radius = radius;
      m_Center = center;
    }

    /**
     * Creates a cluster whose center sits on a grid position (pattern = GRID).
     * Every coordinate is (grid index + 1) scaled by the grid width.
     * 
     * @param instNum the number of instances
     * @param radius radius of the cluster
     * @param gridVector vector for grid positions
     * @param gridWidth factor for grid position
     */
    private Cluster(int instNum, double radius, int[] gridVector,
      double gridWidth) {
      final double[] center = new double[getNumAttributes()];
      for (int dim = 0; dim < center.length; dim++) {
        center[dim] = gridWidth * (gridVector[dim] + 1.0);
      }

      m_InstNum = instNum;
      m_Radius = radius;
      m_Center = center;
    }

    /**
     * Gets the number of instances of this cluster.
     * 
     * @return the number of instances
     */
    private int getInstNum() {
      return m_InstNum;
    }

    /**
     * Gets the standard deviation, i.e. the radius divided by the square root
     * of two.
     * 
     * @return the standard deviation
     */
    private double getStdDev() {
      final double rootOfTwo = Math.pow(2.0, 0.5);
      return m_Radius / rootOfTwo;
    }

    /**
     * Gets the center coordinates (the internal array, not a copy).
     * 
     * @return the centers
     */
    private double[] getCenter() {
      return m_Center;
    }

    /**
     * Returns the revision string.
     * 
     * @return the revision
     */
    @Override
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 12471 $");
    }
  } // end class Cluster

  /**
   * Counter over grid positions: a fixed-length vector of digits in a given
   * base that can be incremented by one, used to place cluster centers in
   * space.
   */
  private class GridVector implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -1900309948991039522L;

    /** the digits of the vector, lowest position first */
    private final int[] m_GridVector;

    /**
     * one higher than the highest possible integer value in any of the
     * integers in the gridvector
     */
    private final int m_Base;

    /** number of digits in the vector */
    private final int m_Size;

    /**
     * Creates a vector with all digits set to zero.
     * 
     * @param numDim number of dimensions = number of attributes
     * @param base is one higher than the highest possible integer value in any
     *          of the integers in the gridvector
     */
    private GridVector(int numDim, int base) {
      m_Size = numDim;
      m_Base = base;
      // a freshly allocated int array already holds zeros in every slot
      m_GridVector = new int[numDim];
    }

    /**
     * Gets the digits (the internal array, not a copy).
     * 
     * @return the integer array
     */
    private int[] getGridVector() {
      return m_GridVector;
    }

    /**
     * Tells whether a digit has wrapped around, which shows as a zero.
     * 
     * @param digit the input integer
     * @return true if digit is 0
     */
    private boolean overflow(int digit) {
      return digit == 0;
    }

    /**
     * Increments a single digit, wrapping to zero once the base is reached.
     * 
     * @param digit the input integer
     * @return the incremented digit, or 0 if it reached m_Base
     */
    private int addOne(int digit) {
      final int next = digit + 1;
      return (next < m_Base) ? next : 0;
    }

    /**
     * Increments the whole vector by one, carrying into the next position
     * whenever the previous position wrapped to zero.
     */
    private void addOne() {
      m_GridVector[0] = addOne(m_GridVector[0]);
      for (int pos = 1; overflow(m_GridVector[pos - 1]) && pos < m_Size; pos++) {
        m_GridVector[pos] = addOne(m_GridVector[pos]);
      }
    }

    /**
     * Returns the revision string.
     * 
     * @return the revision
     */
    @Override
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 12471 $");
    }
  } // end class GridVector

  /**
   * Initializes the generator with default values: after the superclass
   * constructor has run, each option of this generator is set through its
   * setter to the value returned by the matching default*() method.
   */
  public BIRCHCluster() {
    super();

    // NOTE(review): the setters and default*() methods are defined outside
    // this section; keep the call order unless it is confirmed that no setter
    // validates against a value set by an earlier call.
    setNumClusters(defaultNumClusters());
    setMinInstNum(defaultMinInstNum());
    setMaxInstNum(defaultMaxInstNum());
    setMinRadius(defaultMinRadius());
    setMaxRadius(defaultMaxRadius());
    setPattern(defaultPattern());
    setDistMult(defaultDistMult());
    setNumCycles(defaultNumCycles());
    setInputOrder(defaultInputOrder());
    setNoiseRate(defaultNoiseRate());
  }

  /**
   * Returns a string describing this data generator. The text consists of a
   * short description of the generated data followed by the string form of
   * the technical information of this class.
   * 
   * @return a description of the data generator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // Every fragment that continues a sentence ends with an explicit
    // separator; without it the concatenation runs words together
    // ("in itits radius", "isdetermined", "currentlysupported").
    return "Cluster data generator designed for the BIRCH System\n\n"
      + "Dataset is generated with instances in K clusters.\n"
      + "Instances are 2-d data points.\n"
      + "Each cluster is characterized by the number of data points in it, "
      + "its radius and its center. The location of the cluster centers is "
      + "determined by the pattern parameter. Three patterns are currently "
      + "supported grid, sine and random.\n\n"
      + "For more information refer to:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Builds the bibliographic reference for the paper this generator is based
   * on (the BIRCH paper by Zhang, Ramakrishnan and Livny, 1996).
   * 
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    final TechnicalInformation paper =
      new TechnicalInformation(Type.INPROCEEDINGS);

    // who and what
    paper.setValue(Field.AUTHOR,
      "Tian Zhang and Raghu Ramakrishnan and Miron Livny");
    paper.setValue(Field.TITLE,
      "BIRCH: An Efficient Data Clustering Method for Very Large Databases");

    // where and when
    paper.setValue(Field.BOOKTITLE,
      "ACM SIGMOD International Conference on Management of Data");
    paper.setValue(Field.YEAR, "1996");
    paper.setValue(Field.PAGES, "103-114");
    paper.setValue(Field.PUBLISHER, "ACM Press");

    return paper;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options
   */
  @Override
  public Enumeration