/*
* Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
* You can buy this project and download/modify it how often you want.
* The Waikato Environment for Knowledge Analysis (WEKA), a machine
* learning workbench. This version represents the developer version, the
* "bleeding edge" of development, you could say. New functionality gets added
* to this version.
*/
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* BIRCHCluster.java
* Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.datagenerators.clusterers;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.datagenerators.ClusterGenerator;
/**
* Cluster data generator designed for the BIRCH
* System
*
* Dataset is generated with instances in K clusters.
* Instances are 2-d data points.
* Each cluster is characterized by the number of data points in it, its radius
* and its center. The location of the cluster centers is determined by the
* pattern parameter. Three patterns are currently supported: grid, sine and
* random.
*
* For more information refer to:
*
* Tian Zhang, Raghu Ramakrishnan, Miron Livny: BIRCH: An Efficient Data
* Clustering Method for Very Large Databases. In: ACM SIGMOD International
* Conference on Management of Data, 103-114, 1996.
*
*
*
* BibTeX:
*
*
* @inproceedings{Zhang1996,
* author = {Tian Zhang and Raghu Ramakrishnan and Miron Livny},
* booktitle = {ACM SIGMOD International Conference on Management of Data},
* pages = {103-114},
* publisher = {ACM Press},
* title = {BIRCH: An Efficient Data Clustering Method for Very Large Databases},
* year = {1996}
* }
*
*
*
*
* Valid options are:
*
*
*
* -h
* Prints this help.
*
*
*
* -o <file>
* The name of the output file, otherwise the generated data is
* printed to stdout.
*
*
*
* -r <name>
* The name of the relation.
*
*
*
* -d
* Whether to print debug informations.
*
*
*
* -S
* The seed for random function (default 1)
*
*
*
* -a <num>
* The number of attributes (default 10).
*
*
*
* -c
* Class Flag, if set, the cluster is listed in extra attribute.
*
*
*
* -b <range>
* The indices for boolean attributes.
*
*
*
* -m <range>
* The indices for nominal attributes.
*
*
*
* -k <num>
* The number of clusters (default 4)
*
*
*
* -G
* Set pattern to grid (default is random).
* This flag cannot be used at the same time as flag I.
* The pattern is random, if neither flag G nor flag I is set.
*
*
*
* -I
* Set pattern to sine (default is random).
* This flag cannot be used at the same time as flag G.
* The pattern is random, if neither flag G nor flag I is set.
*
*
*
* -N <num>..<num>
* The range of number of instances per cluster (default 1..50).
* Lower number must be between 0 and 2500,
* upper number must be between 50 and 2500.
*
*
*
* -R <num>..<num>
* The range of radius per cluster (default 0.1..1.4142135623730951).
* Lower number must be between 0 and SQRT(2),
* upper number must be between SQRT(2) and SQRT(32).
*
*
*
* -M <num>
* The distance multiplier (default 4.0).
*
*
*
* -C <num>
* The number of cycles (default 4).
*
*
*
* -O
* Flag for input order is ORDERED. If flag is not set then
* input order is RANDOMIZED. RANDOMIZED is currently not
* implemented, therefore is the input order always ORDERED.
*
*
*
* -P <num>
* The noise rate in percent (default 0.0).
* Can be between 0% and 30%. (Remark: The original
* algorithm only allows noise up to 10%.)
*
*
*
*
* @author Gabi Schmidberger ([email protected])
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 10203 $
*/
public class BIRCHCluster extends ClusterGenerator implements
TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = -334820527230755027L;
/** number of clusters the dataset should have (option k) */
protected int m_NumClusters;
/** minimal number of instances per cluster (lower bound of option N) */
private int m_MinInstNum;
/** maximal number of instances per cluster (upper bound of option N) */
private int m_MaxInstNum;
/** minimum radius of a cluster (lower bound of option R) */
private double m_MinRadius;
/** maximum radius of a cluster (upper bound of option R) */
private double m_MaxRadius;
/** Constant set for choice of pattern: grid (option G) */
public static final int GRID = 0;
/** Constant set for choice of pattern: sine (option I) */
public static final int SINE = 1;
/** Constant set for choice of pattern: random (default) */
public static final int RANDOM = 2;
/** the pattern tags, one per pattern constant (GRID, SINE, RANDOM) */
public static final Tag[] TAGS_PATTERN = { new Tag(GRID, "Grid"),
new Tag(SINE, "Sine"), new Tag(RANDOM, "Random") };
/** pattern, one of GRID, SINE or RANDOM (changed with options G or I) */
private int m_Pattern;
/** distance multiplier (option M) */
private double m_DistMult;
/** number of cycles (option C) */
private int m_NumCycles;
/** Constant set for input order: ordered (option O) */
public static final int ORDERED = 0;
/** Constant set for input order: randomized (default) */
public static final int RANDOMIZED = 1;
/** the input order tags, one per input order constant (ORDERED, RANDOMIZED) */
public static final Tag[] TAGS_INPUTORDER = { new Tag(ORDERED, "ordered"),
new Tag(RANDOMIZED, "randomized") };
/** input order, either ORDERED or RANDOMIZED (changed with option O) */
private int m_InputOrder;
/** noise rate in percent (option P, between 0 and 30) */
private double m_NoiseRate;
/**
* cluster list
* NOTE(review): declared as a raw ArrayList; presumably it holds Cluster
* objects - confirm against the code that fills it.
*/
private ArrayList m_ClusterList;
// the following fields are only used when the pattern is GRID
/** grid size */
private int m_GridSize;
/** grid width */
private double m_GridWidth;
/**
 * Describes one cluster of the generated dataset: the number of instances it
 * holds, its radius and the coordinates of its center.
 */
private class Cluster implements Serializable, RevisionHandler {

  /** for serialization */
  static final long serialVersionUID = -8336901069823498140L;

  /** number of instances for this cluster */
  private final int m_InstNum;

  /** radius of the cluster; the variance is radius ** 2 / 2 */
  private final double m_Radius;

  /** center of the cluster, one coordinate per attribute */
  private final double[] m_Center;

  /**
   * Constructor, used for pattern = RANDOM. Every coordinate of the center is
   * a random value in [0, number of clusters).
   *
   * @param instNum the number of instances
   * @param radius radius of the cluster
   * @param random the random number generator to use
   */
  private Cluster(int instNum, double radius, Random random) {
    final double[] center = new double[getNumAttributes()];
    // coordinates are drawn in attribute order, one random number each
    for (int dim = 0; dim < center.length; dim++) {
      center[dim] = random.nextDouble() * m_NumClusters;
    }
    m_Center = center;
    m_Radius = radius;
    m_InstNum = instNum;
  }

  /**
   * Constructor, used for pattern = GRID. The center is derived from the
   * given grid position: each coordinate is (grid index + 1) * grid width.
   *
   * @param instNum the number of instances
   * @param radius radius of the cluster
   * @param gridVector vector for grid positions
   * @param gridWidth factor for grid position
   */
  private Cluster(int instNum, double radius, int[] gridVector,
    double gridWidth) {
    final double[] center = new double[getNumAttributes()];
    for (int dim = 0; dim < center.length; dim++) {
      center[dim] = (gridVector[dim] + 1.0) * gridWidth;
    }
    m_Center = center;
    m_Radius = radius;
    m_InstNum = instNum;
  }

  /**
   * returns the number of instances
   *
   * @return the number of instances
   */
  private int getInstNum() {
    return m_InstNum;
  }

  /**
   * returns the standard deviation, i.e. the radius divided by the square
   * root of two
   *
   * @return the standard deviation
   */
  private double getStdDev() {
    final double rootOfTwo = Math.pow(2.0, 0.5);
    return m_Radius / rootOfTwo;
  }

  /**
   * returns the center of the cluster (the internal array, not a copy)
   *
   * @return the center coordinates
   */
  private double[] getCenter() {
    return m_Center;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 10203 $");
  }
} // end class Cluster
/**
 * Vector of grid indices used to place cluster centers in space. It behaves
 * like a counter with one digit per dimension, each digit running from 0 to
 * base - 1.
 */
private class GridVector implements Serializable, RevisionHandler {

  /** for serialization */
  static final long serialVersionUID = -1900309948991039522L;

  /** the digits of the vector, one per dimension */
  private final int[] m_GridVector;

  /**
   * one higher than the highest possible integer value in any of the
   * integers in the gridvector
   */
  private final int m_Base;

  /** size of vector */
  private final int m_Size;

  /**
   * Constructor; all digits start at zero.
   *
   * @param numDim number of dimensions = number of attributes
   * @param base is one higher than the highest possible integer value in any
   *          of the integers in the gridvector
   */
  private GridVector(int numDim, int base) {
    m_Size = numDim;
    m_Base = base;
    // a freshly allocated int array is already filled with zeros
    m_GridVector = new int[numDim];
  }

  /**
   * returns the integer array (the internal array, not a copy)
   *
   * @return the integer array
   */
  private int[] getGridVector() {
    return m_GridVector;
  }

  /**
   * Overflow has occurred when the digit wrapped around to zero.
   *
   * @param digit the input integer
   * @return true if digit is 0
   */
  private boolean overflow(int digit) {
    return digit == 0;
  }

  /**
   * Adds one to the digit, wrapping to zero once the new value reaches
   * m_Base.
   *
   * @param digit the input integer
   * @return the incremented (possibly wrapped) digit
   */
  private int addOne(int digit) {
    final int next = digit + 1;
    return (next >= m_Base) ? 0 : next;
  }

  /**
   * Adds 1 to the vector: increments the first digit and carries into the
   * following digits for as long as the digit just incremented wrapped to
   * zero.
   */
  private void addOne() {
    int pos = 0;
    do {
      m_GridVector[pos] = addOne(m_GridVector[pos]);
      pos++;
    } while (pos < m_Size && overflow(m_GridVector[pos - 1]));
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 10203 $");
  }
} // end class GridVector
/**
* Initializes the generator with default values: every option handled by
* this class is set, through its setter, to the value returned by the
* matching default*() method.
*/
public BIRCHCluster() {
super();
// number of clusters and the per-cluster instance-count and radius ranges
setNumClusters(defaultNumClusters());
setMinInstNum(defaultMinInstNum());
setMaxInstNum(defaultMaxInstNum());
setMinRadius(defaultMinRadius());
setMaxRadius(defaultMaxRadius());
// placement pattern of the cluster centers and its parameters
setPattern(defaultPattern());
setDistMult(defaultDistMult());
setNumCycles(defaultNumCycles());
// input order and noise rate
setInputOrder(defaultInputOrder());
setNoiseRate(defaultNoiseRate());
}
/**
 * Returns a string describing this data generator, followed by the
 * technical information (paper reference) this generator is based on.
 *
 * @return a description of the data generator suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  // Each fragment that continues a sentence ends with a separator, so the
  // concatenation no longer glues words together ("itits", "isdetermined",
  // "currentlysupported").
  return "Cluster data generator designed for the BIRCH System\n\n"
    + "Dataset is generated with instances in K clusters.\n"
    + "Instances are 2-d data points.\n"
    + "Each cluster is characterized by the number of data points in it, "
    + "its radius and its center. The location of the cluster centers is "
    + "determined by the pattern parameter. Three patterns are currently "
    + "supported: grid, sine and random.\n\n"
    + "For more information refer to:\n\n"
    + getTechnicalInformation().toString();
}
/**
 * Builds the bibliographic reference for the paper this generator is based
 * on: the BIRCH paper by Zhang, Ramakrishnan and Livny (1996), recorded as
 * an INPROCEEDINGS entry.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  final TechnicalInformation paper = new TechnicalInformation(
    Type.INPROCEEDINGS);

  paper.setValue(Field.AUTHOR,
    "Tian Zhang and Raghu Ramakrishnan and Miron Livny");
  paper.setValue(Field.TITLE,
    "BIRCH: An Efficient Data Clustering Method for Very Large Databases");
  paper.setValue(Field.BOOKTITLE,
    "ACM SIGMOD International Conference on Management of Data");
  paper.setValue(Field.YEAR, "1996");
  paper.setValue(Field.PAGES, "103-114");
  paper.setValue(Field.PUBLISHER, "ACM Press");

  return paper;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options
*/
@Override
public Enumeration