All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.datagenerators.clusterers.BIRCHCluster Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    BIRCHCluster.java
 *    Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators.clusterers;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.datagenerators.ClusterGenerator;

/**
 *  Cluster data generator designed for the BIRCH
 * System
*
* Dataset is generated with instances in K clusters.
* Instances are 2-d data points.
* Each cluster is characterized by the number of data points in it,
* its radius and its center. The location of the cluster centers is
* determined by the pattern parameter. Three patterns are currently
* supported: grid, sine and random.
*
* For more information refer to:
*
* Tian Zhang, Raghu Ramakrishnan, Miron Livny: BIRCH: An Efficient Data
* Clustering Method for Very Large Databases. In: ACM SIGMOD International
* Conference on Management of Data, 103-114, 1996.
*

* * * BibTeX: * *

 * @inproceedings{Zhang1996,
 *    author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny},
 *    booktitle = {ACM SIGMOD International Conference on Management of Data},
 *    pages = {103-114},
 *    publisher = {ACM Press},
 *    title = {BIRCH: An Efficient Data Clustering Method for Very Large Databases},
 *    year = {1996}
 * }
 * 
*

* * * Valid options are: *

* *

 * -h
 *  Prints this help.
 * 
* *
 * -o <file>
 *  The name of the output file, otherwise the generated data is
 *  printed to stdout.
 * 
* *
 * -r <name>
 *  The name of the relation.
 * 
* *
 * -d
 *  Whether to print debug informations.
 * 
* *
 * -S
 *  The seed for random function (default 1)
 * 
* *
 * -a <num>
 *  The number of attributes (default 10).
 * 
* *
 * -c
 *  Class Flag, if set, the cluster is listed in extra attribute.
 * 
* *
 * -b <range>
 *  The indices for boolean attributes.
 * 
* *
 * -m <range>
 *  The indices for nominal attributes.
 * 
* *
 * -k <num>
 *  The number of clusters (default 4)
 * 
* *
 * -G
 *  Set pattern to grid (default is random).
 *  This flag cannot be used at the same time as flag I.
 *  The pattern is random, if neither flag G nor flag I is set.
 * 
* *
 * -I
 *  Set pattern to sine (default is random).
 *  This flag cannot be used at the same time as flag G.
 *  The pattern is random, if neither flag G nor flag I is set.
 * 
* *
 * -N <num>..<num>
 *  The range of number of instances per cluster (default 1..50).
 *  Lower number must be between 0 and 2500,
 *  upper number must be between 50 and 2500.
 * 
* *
 * -R <num>..<num>
 *  The range of radius per cluster (default 0.1..1.4142135623730951).
 *  Lower number must be between 0 and SQRT(2), 
 *  upper number must be between SQRT(2) and SQRT(32).
 * 
* *
 * -M <num>
 *  The distance multiplier (default 4.0).
 * 
* *
 * -C <num>
 *  The number of cycles (default 4).
 * 
* *
 * -O
 *  Flag for input order is ORDERED. If flag is not set then 
 *  input order is RANDOMIZED. RANDOMIZED is currently not 
 *  implemented, therefore the input order is always ORDERED.
 * 
* *
 * -P <num>
 *  The noise rate in percent (default 0.0).
 *  Can be between 0% and 30%. (Remark: The original 
 *  algorithm only allows noise up to 10%.)
 * 
* * * * @author Gabi Schmidberger ([email protected]) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 12471 $ */ public class BIRCHCluster extends ClusterGenerator implements TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -334820527230755027L; /** Number of Clusters the dataset should have */ protected int m_NumClusters; /** minimal number of instances per cluster (option N) */ private int m_MinInstNum; /** maximal number of instances per cluster (option N) */ private int m_MaxInstNum; /** minimum radius (option R) */ private double m_MinRadius; /** maximum radius (option R) */ private double m_MaxRadius; /** Constant set for choice of pattern. (option G) */ public static final int GRID = 0; /** Constant set for choice of pattern. (option I) */ public static final int SINE = 1; /** Constant set for choice of pattern. (default) */ public static final int RANDOM = 2; /** the pattern tags */ public static final Tag[] TAGS_PATTERN = { new Tag(GRID, "Grid"), new Tag(SINE, "Sine"), new Tag(RANDOM, "Random") }; /** pattern (changed with options G or S) */ private int m_Pattern; /** distance multiplier (option M) */ private double m_DistMult; /** number of cycles (option C) */ private int m_NumCycles; /** Constant set for input order (option O) */ public static final int ORDERED = 0; /** Constant set for input order (default) */ public static final int RANDOMIZED = 1; /** the input order tags */ public static final Tag[] TAGS_INPUTORDER = { new Tag(ORDERED, "ordered"), new Tag(RANDOMIZED, "randomized") }; /** input order (changed with option O) */ private int m_InputOrder; /** noise rate in percent (option P, between 0 and 30) */ private double m_NoiseRate; /** cluster list */ private ArrayList m_ClusterList; // following are used for pattern is GRID /** grid size */ private int m_GridSize; /** grid width */ private double m_GridWidth; /** * class to represent cluster */ private class Cluster 
implements Serializable, RevisionHandler { /** for serialization */ static final long serialVersionUID = -8336901069823498140L; /** number of instances for this cluster */ private final int m_InstNum; /** * radius of cluster variance is radius ** 2 / 2 */ private final double m_Radius; /** center of cluster = array of Double values */ private final double[] m_Center; /** * Constructor, used for pattern = RANDOM * * @param instNum the number of instances * @param radius radius of the cluster * @param random the random number generator to use */ private Cluster(int instNum, double radius, Random random) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[getNumAttributes()]; for (int i = 0; i < getNumAttributes(); i++) { m_Center[i] = random.nextDouble() * m_NumClusters; } } /** * Constructor, used for pattern = GRID * * @param instNum the number of instances * @param radius radius of the cluster * @param gridVector vector for grid positions * @param gridWidth factor for grid position */ // center is defined in the constructor of cluster private Cluster(int instNum, double radius, int[] gridVector, double gridWidth) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[getNumAttributes()]; for (int i = 0; i < getNumAttributes(); i++) { m_Center[i] = (gridVector[i] + 1.0) * gridWidth; } } /** * returns the number of instances * * @return the number of instances */ private int getInstNum() { return m_InstNum; } /** * returns the standard deviation * * @return the standard deviation */ private double getStdDev() { return (m_Radius / Math.pow(2.0, 0.5)); } /** * returns the centers * * @return the centers */ private double[] getCenter() { return m_Center; } /** * Returns the revision string. 
* * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 12471 $"); } } // end class Cluster /** * class to represent Vector for placement of the center in space */ private class GridVector implements Serializable, RevisionHandler { /** for serialization */ static final long serialVersionUID = -1900309948991039522L; /** array of integer */ private final int[] m_GridVector; /** * one higher then the highest possible integer value in any of the integers * in the gridvector */ private final int m_Base; /** size of vector */ private final int m_Size; /** * Constructor * * @param numDim number of dimensions = number of attributes * @param base is one higher then the highest possible integer value in any * of the integers in the gridvector */ private GridVector(int numDim, int base) { m_Size = numDim; m_Base = base; m_GridVector = new int[numDim]; for (int i = 0; i < numDim; i++) { m_GridVector[i] = 0; } } /** * returns the integer array * * @return the integer array */ private int[] getGridVector() { return m_GridVector; } /** * Overflow has occurred when integer is zero. * * @param digit the input integer * @return true if digit is 0 */ private boolean overflow(int digit) { return (digit == 0); } /** * Adds one to integer and sets to zero, if new value was equal m_Base. * * @param digit the input integer * @return new integer object */ private int addOne(int digit) { int value = digit + 1; if (value >= m_Base) { value = 0; } return value; } /** * add 1 to vector */ private void addOne() { m_GridVector[0] = addOne(m_GridVector[0]); int i = 1; while (overflow(m_GridVector[i - 1]) && i < m_Size) { m_GridVector[i] = addOne(m_GridVector[i]); i++; } } /** * Returns the revision string. 
* * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 12471 $"); } } // end class GridVector /** * initializes the generator with default values */ public BIRCHCluster() { super(); setNumClusters(defaultNumClusters()); setMinInstNum(defaultMinInstNum()); setMaxInstNum(defaultMaxInstNum()); setMinRadius(defaultMinRadius()); setMaxRadius(defaultMaxRadius()); setPattern(defaultPattern()); setDistMult(defaultDistMult()); setNumCycles(defaultNumCycles()); setInputOrder(defaultInputOrder()); setNoiseRate(defaultNoiseRate()); } /** * Returns a string describing this data generator. * * @return a description of the data generator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Cluster data generator designed for the BIRCH System\n\n" + "Dataset is generated with instances in K clusters.\n" + "Instances are 2-d data points.\n" + "Each cluster is characterized by the number of data points in it" + "its radius and its center. The location of the cluster centers is" + "determined by the pattern parameter. Three patterns are currently" + "supported grid, sine and random.\n\n" + "For more information refer to:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. 
* * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Tian Zhang and Raghu Ramakrishnan and Miron Livny"); result.setValue(Field.TITLE, "BIRCH: An Efficient Data Clustering Method for Very Large Databases"); result.setValue(Field.BOOKTITLE, "ACM SIGMOD International Conference on Management of Data"); result.setValue(Field.YEAR, "1996"); result.setValue(Field.PAGES, "103-114"); result.setValue(Field.PUBLISHER, "ACM Press"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ @Override public Enumeration




© 2015 - 2024 Weber Informatics LLC | Privacy Policy