/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Cobweb.java
* Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.clusterers;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Drawable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.experiment.Stats;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;
/**
*
* Class implementing the Cobweb and Classit clustering algorithms.
*
* Note: the application of node operators (merging, splitting etc.) in terms of ordering and priority differs (and is somewhat ambiguous) between the original Cobweb and Classit papers. This algorithm always compares the best host, adding a new leaf, merging the two best hosts, and splitting the best host when considering where to place a new instance.
*
* For more information see:
*
* D. Fisher (1987). Knowledge acquisition via incremental conceptual clustering. Machine Learning. 2(2):139-172.
*
* J. H. Gennari, P. Langley, D. Fisher (1990). Models of incremental concept formation. Artificial Intelligence. 40:11-61.
*
* BibTeX:
*
* @article{Fisher1987,
* author = {D. Fisher},
* journal = {Machine Learning},
* number = {2},
* pages = {139-172},
* title = {Knowledge acquisition via incremental conceptual clustering},
* volume = {2},
* year = {1987}
* }
*
* @article{Gennari1990,
* author = {J. H. Gennari and P. Langley and D. Fisher},
* journal = {Artificial Intelligence},
* pages = {11-61},
* title = {Models of incremental concept formation},
* volume = {40},
* year = {1990}
* }
*
* Valid options are:
*
* -A <acuity>
* Acuity.
* (default=1.0)
*
* -C <cutoff>
* Cutoff.
* (default=0.002)
*
* -save-data
* Save instance data.
*
* -S <num>
* Random number seed.
* (default 42)
*
* -output-debug-info
* If set, clusterer is run in debug mode and
* may output additional info to the console
*
* -do-not-check-capabilities
* If set, clusterer capabilities are not checked before clusterer is built
* (use with caution).
*
*
*
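* A minimal usage sketch (not part of the original documentation; the ARFF file
* name is a placeholder and loading the data via
* weka.core.converters.ConverterUtils.DataSource is just one possible way to
* obtain the Instances):
*
*   Instances data = ConverterUtils.DataSource.read("some.arff");
*   Cobweb clusterer = new Cobweb();
*   clusterer.setSeed(42);                // -S option
*   clusterer.buildClusterer(data);       // batch training
*   System.out.println(clusterer.numberOfClusters());
*   int cluster = clusterer.clusterInstance(data.instance(0));
*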
* @author Mark Hall
* @version $Revision: 11556 $
* @see RandomizableClusterer
* @see Drawable
*/
public class Cobweb extends RandomizableClusterer implements Drawable,
TechnicalInformationHandler, UpdateableClusterer {
/** for serialization */
static final long serialVersionUID = 928406656495092318L;
/**
* Inner class handling node operations for Cobweb.
*
* @see Serializable
*/
public class CNode implements Serializable, RevisionHandler {
/** for serialization */
static final long serialVersionUID = 3452097436933325631L;
/**
* Within cluster attribute statistics
*/
private AttributeStats[] m_attStats;
/**
* Number of attributes
*/
private final int m_numAttributes;
/**
* Instances at this node
*/
protected Instances m_clusterInstances = null;
/**
* Children of this node
*/
private ArrayList<CNode> m_children = null;
/**
* Total instances at this node
*/
private double m_totalInstances = 0.0;
/**
* Cluster number of this node
*/
private int m_clusterNum = -1;
/**
* Creates an empty CNode instance.
*
* @param numAttributes the number of attributes in the data
*/
public CNode(int numAttributes) {
m_numAttributes = numAttributes;
}
/**
* Creates a new leaf CNode instance.
*
* @param numAttributes the number of attributes in the data
* @param leafInstance the instance to store at this leaf
*/
public CNode(int numAttributes, Instance leafInstance) {
this(numAttributes);
if (m_clusterInstances == null) {
m_clusterInstances = new Instances(leafInstance.dataset(), 1);
}
m_clusterInstances.add(leafInstance);
updateStats(leafInstance, false);
}
/**
* Adds an instance to this cluster.
*
* @param newInstance the instance to add
* @throws Exception if an error occurs
*/
protected void addInstance(Instance newInstance) throws Exception {
// Add the instance to this cluster
if (m_clusterInstances == null) {
m_clusterInstances = new Instances(newInstance.dataset(), 1);
m_clusterInstances.add(newInstance);
updateStats(newInstance, false);
return;
} else if (m_children == null) {
/*
* we are a leaf, so make our existing instance(s) into a child and then
* add the new instance as a child
*/
m_children = new ArrayList<CNode>();
CNode tempSubCluster = new CNode(m_numAttributes,
m_clusterInstances.instance(0));
// System.out.println("Dumping "+m_clusterInstances.numInstances());
for (int i = 1; i < m_clusterInstances.numInstances(); i++) {
tempSubCluster.m_clusterInstances.add(m_clusterInstances.instance(i));
tempSubCluster.updateStats(m_clusterInstances.instance(i), false);
}
m_children = new ArrayList<CNode>();
m_children.add(tempSubCluster);
m_children.add(new CNode(m_numAttributes, newInstance));
m_clusterInstances.add(newInstance);
updateStats(newInstance, false);
// here is where we check against cutoff (also check cutoff
// in findHost)
if (categoryUtility() < m_cutoff) {
// System.out.println("Cutting (leaf add) ");
m_children = null;
}
return;
}
// otherwise, find the best host for this instance
CNode bestHost = findHost(newInstance, false);
if (bestHost != null) {
// now add to the best host
bestHost.addInstance(newInstance);
}
}
/**
* Temporarily adds a new instance to each of this node's children in turn
* and computes the category utility.
*
* @param newInstance the new instance to evaluate
* @return an array of category utility values---the result of considering
* each child in turn as a host for the new instance
* @throws Exception if an error occurs
*/
private double[] cuScoresForChildren(Instance newInstance) throws Exception {
// look for a host in existing children
double[] categoryUtils = new double[m_children.size()];
// look for a home for this instance in the existing children
for (int i = 0; i < m_children.size(); i++) {
CNode temp = m_children.get(i);
// tentatively add the new instance to this child
temp.updateStats(newInstance, false);
categoryUtils[i] = categoryUtility();
// remove the new instance from this child
temp.updateStats(newInstance, true);
}
return categoryUtils;
}
/**
* Temporarily merges the two best hosts (with the new instance added to the
* statistics), computes the resulting category utility of this node, and then
* restores the original set of children.
*
* @param merged an empty node that will hold the two merged children
* @param a the best host
* @param b the second best host
* @param newInstance the new instance being placed
* @return the category utility of this node after the tentative merge
* @throws Exception if an error occurs
*/
private double cuScoreForBestTwoMerged(CNode merged, CNode a, CNode b,
Instance newInstance) throws Exception {
double mergedCU = -Double.MAX_VALUE;
// consider merging the best and second
// best.
merged.m_clusterInstances = new Instances(m_clusterInstances, 1);
merged.addChildNode(a);
merged.addChildNode(b);
merged.updateStats(newInstance, false); // add new instance to stats
// remove the best and second best nodes
m_children.remove(m_children.indexOf(a));
m_children.remove(m_children.indexOf(b));
m_children.add(merged);
mergedCU = categoryUtility();
// restore the status quo
merged.updateStats(newInstance, true);
m_children.remove(m_children.indexOf(merged));
m_children.add(a);
m_children.add(b);
return mergedCU;
}
/**
* Finds a host for the new instance in this node's children. Also considers
* merging the two best hosts and splitting the best host.
*
* @param newInstance the instance to find a host for
* @param structureFrozen true if the instance is not to be added to the
* tree and instead the best potential host is to be returned
* @return the best host
* @throws Exception if an error occurs
*/
private CNode findHost(Instance newInstance, boolean structureFrozen)
throws Exception {
if (!structureFrozen) {
updateStats(newInstance, false);
}
// look for a host in existing children and also consider as a new leaf
double[] categoryUtils = cuScoresForChildren(newInstance);
// make a temporary new leaf for this instance and get CU
CNode newLeaf = new CNode(m_numAttributes, newInstance);
m_children.add(newLeaf);
double bestHostCU = categoryUtility();
CNode finalBestHost = newLeaf;
// remove new leaf when searching for best and second best nodes to
// consider for merging and splitting
m_children.remove(m_children.size() - 1);
// now determine the best host (and the second best)
int best = 0;
int secondBest = 0;
for (int i = 0; i < categoryUtils.length; i++) {
if (categoryUtils[i] > categoryUtils[secondBest]) {
if (categoryUtils[i] > categoryUtils[best]) {
secondBest = best;
best = i;
} else {
secondBest = i;
}
}
}
CNode a = m_children.get(best);
CNode b = m_children.get(secondBest);
if (categoryUtils[best] > bestHostCU) {
bestHostCU = categoryUtils[best];
finalBestHost = a;
// System.out.println("Node is best");
}
if (structureFrozen) {
if (finalBestHost == newLeaf) {
return null; // *this* node is the best host
} else {
return finalBestHost;
}
}
double mergedCU = -Double.MAX_VALUE;
CNode merged = new CNode(m_numAttributes);
if (a != b) {
mergedCU = cuScoreForBestTwoMerged(merged, a, b, newInstance);
if (mergedCU > bestHostCU) {
bestHostCU = mergedCU;
finalBestHost = merged;
}
}
// Consider splitting the best
double splitCU = -Double.MAX_VALUE;
double splitBestChildCU = -Double.MAX_VALUE;
double splitPlusNewLeafCU = -Double.MAX_VALUE;
double splitPlusMergeBestTwoCU = -Double.MAX_VALUE;
if (a.m_children != null) {
ArrayList<CNode> tempChildren = new ArrayList<CNode>();
for (int i = 0; i < m_children.size(); i++) {
CNode existingChild = m_children.get(i);
if (existingChild != a) {
tempChildren.add(existingChild);
}
}
for (int i = 0; i < a.m_children.size(); i++) {
CNode promotedChild = a.m_children.get(i);
tempChildren.add(promotedChild);
}
// also add the new leaf
tempChildren.add(newLeaf);
ArrayList<CNode> saveStatusQuo = m_children;
m_children = tempChildren;
splitPlusNewLeafCU = categoryUtility(); // split + new leaf
// remove the new leaf
tempChildren.remove(tempChildren.size() - 1);
// now look for best and second best
categoryUtils = cuScoresForChildren(newInstance);
// now determine the best host (and the second best)
best = 0;
secondBest = 0;
for (int i = 0; i < categoryUtils.length; i++) {
if (categoryUtils[i] > categoryUtils[secondBest]) {
if (categoryUtils[i] > categoryUtils[best]) {
secondBest = best;
best = i;
} else {
secondBest = i;
}
}
}
CNode sa = m_children.get(best);
CNode sb = m_children.get(secondBest);
splitBestChildCU = categoryUtils[best];
// now merge best and second best
CNode mergedSplitChildren = new CNode(m_numAttributes);
if (sa != sb) {
splitPlusMergeBestTwoCU = cuScoreForBestTwoMerged(
mergedSplitChildren, sa, sb, newInstance);
}
splitCU = (splitBestChildCU > splitPlusNewLeafCU) ? splitBestChildCU
: splitPlusNewLeafCU;
splitCU = (splitCU > splitPlusMergeBestTwoCU) ? splitCU
: splitPlusMergeBestTwoCU;
if (splitCU > bestHostCU) {
bestHostCU = splitCU;
finalBestHost = this;
// tempChildren.remove(tempChildren.size()-1);
} else {
// restore the status quo
m_children = saveStatusQuo;
}
}
if (finalBestHost != this) {
// can commit the instance to the set of instances at this node
m_clusterInstances.add(newInstance);
} else {
m_numberSplits++;
}
if (finalBestHost == merged) {
m_numberMerges++;
m_children.remove(m_children.indexOf(a));
m_children.remove(m_children.indexOf(b));
m_children.add(merged);
}
if (finalBestHost == newLeaf) {
finalBestHost = new CNode(m_numAttributes);
m_children.add(finalBestHost);
}
if (bestHostCU < m_cutoff) {
if (finalBestHost == this) {
// splitting was the best, but since we are cutting all children
// recursion is aborted and we still need to add the instance
// to the set of instances at this node
m_clusterInstances.add(newInstance);
}
m_children = null;
finalBestHost = null;
}
if (finalBestHost == this) {
// splitting is still the best, so downdate the stats as
// we'll be recursively calling on this node
updateStats(newInstance, true);
}
return finalBestHost;
}
/**
* Adds the supplied node as a child of this node. All of the child's
* instances are added to this node's instances.
*
* @param child the child to add
*/
protected void addChildNode(CNode child) {
for (int i = 0; i < child.m_clusterInstances.numInstances(); i++) {
Instance temp = child.m_clusterInstances.instance(i);
m_clusterInstances.add(temp);
updateStats(temp, false);
}
if (m_children == null) {
m_children = new ArrayList<CNode>();
}
m_children.add(child);
}
/**
* Computes the utility of all children with respect to this node
*
* @return the category utility of the children with respect to this node.
* @throws Exception if there are no children
*/
protected double categoryUtility() throws Exception {
if (m_children == null) {
throw new Exception("categoryUtility: No children!");
}
double totalCU = 0;
for (int i = 0; i < m_children.size(); i++) {
CNode child = m_children.get(i);
totalCU += categoryUtilityChild(child);
}
totalCU /= m_children.size();
return totalCU;
}
/**
* Computes the utility of a single child with respect to this node
*
* @param child the child for which to compute the utility
* @return the utility of the child with respect to this node
* @throws Exception if something goes wrong
*/
protected double categoryUtilityChild(CNode child) throws Exception {
double sum = 0;
for (int i = 0; i < m_numAttributes; i++) {
if (m_clusterInstances.attribute(i).isNominal()) {
for (int j = 0; j < m_clusterInstances.attribute(i).numValues(); j++) {
double x = child.getProbability(i, j);
double y = getProbability(i, j);
sum += (x - y)*(x + y);
}
} else {
// numeric attribute
sum += ((m_normal / child.getStandardDev(i)) - (m_normal / getStandardDev(i)));
}
}
return (child.m_totalInstances / m_totalInstances) * sum;
}
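/*
 * Note (added for clarity, not in the original source): for a nominal attribute
 * the loop above accumulates sum_v [ P(A=v | child)^2 - P(A=v | parent)^2 ],
 * written as (x - y) * (x + y) = x^2 - y^2. For a numeric attribute the Classit
 * analogue 1 / (2 * sqrt(PI) * sigma) is used for each node, so the contribution
 * is m_normal * (1/sigma_child - 1/sigma_parent). The sum is finally weighted by
 * P(child) = child.m_totalInstances / m_totalInstances.
 */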
/**
* Returns the probability of a value of a nominal attribute in this node
*
* @param attIndex the index of the attribute
* @param valueIndex the index of the value of the attribute
* @return the probability
* @throws Exception if the requested attribute is not nominal
*/
protected double getProbability(int attIndex, int valueIndex)
throws Exception {
if (!m_clusterInstances.attribute(attIndex).isNominal()) {
throw new Exception("getProbability: attribute is not nominal");
}
if (m_attStats[attIndex].totalCount <= 0) {
return 0;
}
return (double) m_attStats[attIndex].nominalCounts[valueIndex]
/ (double) m_attStats[attIndex].totalCount;
}
/**
* Returns the standard deviation of a numeric attribute
*
* @param attIndex the index of the attribute
* @return the standard deviation
* @throws Exception if an error occurs
*/
protected double getStandardDev(int attIndex) throws Exception {
if (!m_clusterInstances.attribute(attIndex).isNumeric()) {
throw new Exception("getStandardDev: attribute is not numeric");
}
m_attStats[attIndex].numericStats.calculateDerived();
double stdDev = m_attStats[attIndex].numericStats.stdDev;
if (Double.isNaN(stdDev) || Double.isInfinite(stdDev)) {
return m_acuity;
}
return Math.max(m_acuity, stdDev);
}
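/*
 * Note (added for clarity, not in the original source): returning at least
 * m_acuity here keeps the 1/sigma terms in categoryUtilityChild() finite and
 * bounded for nodes that hold a single instance or identical values, which is
 * the role the acuity parameter plays in Classit.
 */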
/**
* Update attribute stats using the supplied instance.
*
* @param updateInstance the instance for updating
* @param delete true if the values of the supplied instance are to be
* removed from the statistics
*/
protected void updateStats(Instance updateInstance, boolean delete) {
if (m_attStats == null) {
m_attStats = new AttributeStats[m_numAttributes];
for (int i = 0; i < m_numAttributes; i++) {
m_attStats[i] = new AttributeStats();
if (m_clusterInstances.attribute(i).isNominal()) {
m_attStats[i].nominalCounts = new int[m_clusterInstances.attribute(
i).numValues()];
} else {
m_attStats[i].numericStats = new Stats();
}
}
}
for (int i = 0; i < m_numAttributes; i++) {
if (!updateInstance.isMissing(i)) {
double value = updateInstance.value(i);
if (m_clusterInstances.attribute(i).isNominal()) {
m_attStats[i].nominalCounts[(int) value] += (delete) ? (-1.0 * updateInstance
.weight()) : updateInstance.weight();
m_attStats[i].totalCount += (delete) ? (-1.0 * updateInstance
.weight()) : updateInstance.weight();
} else {
if (delete) {
m_attStats[i].numericStats.subtract(value,
updateInstance.weight());
} else {
m_attStats[i].numericStats.add(value, updateInstance.weight());
}
}
}
}
m_totalInstances += (delete) ? (-1.0 * updateInstance.weight())
: (updateInstance.weight());
}
/**
* Recursively assigns numbers to the nodes in the tree.
*
* @param cl_num an int[] holding the next cluster number to assign
* @throws Exception if an error occurs
*/
private void assignClusterNums(int[] cl_num) throws Exception {
if (m_children != null && m_children.size() < 2) {
throw new Exception("assignClusterNums: tree not built correctly!");
}
m_clusterNum = cl_num[0];
cl_num[0]++;
if (m_children != null) {
for (int i = 0; i < m_children.size(); i++) {
CNode child = m_children.get(i);
child.assignClusterNums(cl_num);
}
}
}
/**
* Recursively build a string representation of the Cobweb tree
*
* @param depth depth of this node in the tree
* @param text holds the string representation
*/
protected void dumpTree(int depth, StringBuffer text) {
if (depth == 0) {
determineNumberOfClusters();
}
if (m_children == null) {
text.append("\n");
for (int j = 0; j < depth; j++) {
text.append("| ");
}
text.append("leaf " + m_clusterNum + " ["
+ m_clusterInstances.numInstances() + "]");
} else {
for (int i = 0; i < m_children.size(); i++) {
text.append("\n");
for (int j = 0; j < depth; j++) {
text.append("| ");
}
text.append("node " + m_clusterNum + " ["
+ m_clusterInstances.numInstances() + "]");
m_children.get(i).dumpTree(depth + 1, text);
}
}
}
/**
* Returns the instances at this node as a string. Appends the cluster
* number of the child that each instance belongs to.
*
* @return a String value
* @throws Exception if an error occurs
*/
protected String dumpData() throws Exception {
if (m_children == null) {
return m_clusterInstances.toString();
}
// construct instances string with cluster numbers attached
CNode tempNode = new CNode(m_numAttributes);
tempNode.m_clusterInstances = new Instances(m_clusterInstances, 1);
for (int i = 0; i < m_children.size(); i++) {
tempNode.addChildNode(m_children.get(i));
}
Instances tempInst = tempNode.m_clusterInstances;
tempNode = null;
Add af = new Add();
af.setAttributeName("Cluster");
String labels = "";
for (int i = 0; i < m_children.size(); i++) {
CNode temp = m_children.get(i);
labels += ("C" + temp.m_clusterNum);
if (i < m_children.size() - 1) {
labels += ",";
}
}
af.setNominalLabels(labels);
af.setInputFormat(tempInst);
tempInst = Filter.useFilter(tempInst, af);
tempInst.setRelationName("Cluster " + m_clusterNum);
int z = 0;
for (int i = 0; i < m_children.size(); i++) {
CNode temp = m_children.get(i);
for (int j = 0; j < temp.m_clusterInstances.numInstances(); j++) {
tempInst.instance(z).setValue(m_numAttributes, i);
z++;
}
}
return tempInst.toString();
}
/**
* Recursively generate the graph string for the Cobweb tree.
*
* @param text holds the graph string
* @throws Exception if generation fails
*/
protected void graphTree(StringBuffer text) throws Exception {
text.append("N" + m_clusterNum + " [label=\""
+ ((m_children == null) ? "leaf " : "node ") + m_clusterNum + " "
+ " (" + m_clusterInstances.numInstances() + ")\" "
+ ((m_children == null) ? "shape=box style=filled " : "")
+ (m_saveInstances ? "data =\n" + dumpData() + "\n,\n" : "") + "]\n");
if (m_children != null) {
for (int i = 0; i < m_children.size(); i++) {
CNode temp = m_children.get(i);
text.append("N" + m_clusterNum + "->" + "N" + temp.m_clusterNum
+ "\n");
}
for (int i = 0; i < m_children.size(); i++) {
CNode temp = m_children.get(i);
temp.graphTree(text);
}
}
}
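/*
 * For illustration (not in the original source): graphTree() emits GraphViz DOT
 * node and edge statements, e.g.
 *
 *   N0 [label="node 0  (5)" ]
 *   N0->N1
 *   N1 [label="leaf 1  (3)" shape=box style=filled ]
 *
 * so the complete graph output can be rendered with GraphViz once wrapped in a
 * digraph block by the caller.
 */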
/**
* Returns the revision string.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11556 $");
}
}
/**
* Normal constant 1 / (2 * sqrt(PI)), the integral of the squared normal
* density with unit standard deviation; used in the Classit category utility
* for numeric attributes.
*/
protected static final double m_normal = 1.0 / (2 * Math.sqrt(Math.PI));
/**
* Acuity (minimum standard deviation).
*/
protected double m_acuity = 1.0;
/**
* Cutoff (minimum category utility).
*/
protected double m_cutoff = 0.01 * Cobweb.m_normal;
/**
* Holds the root of the Cobweb tree.
*/
protected CNode m_cobwebTree = null;
/**
* Number of clusters (nodes in the tree). Must never be queried directly,
* only via the method numberOfClusters(). Otherwise it's not guaranteed that
* it contains the correct value.
*
* @see #numberOfClusters()
* @see #m_numberOfClustersDetermined
*/
protected int m_numberOfClusters = -1;
/** whether the number of clusters was already determined */
protected boolean m_numberOfClustersDetermined = false;
/** the number of splits that happened */
protected int m_numberSplits;
/** the number of merges that happened */
protected int m_numberMerges;
/**
* Output instances in graph representation of Cobweb tree (Allows instances
* at nodes in the tree to be visualized in the Explorer).
*/
protected boolean m_saveInstances = false;
/**
* default constructor
*/
public Cobweb() {
super();
m_SeedDefault = 42;
setSeed(m_SeedDefault);
}
/**
* Returns a string describing this clusterer
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Class implementing the Cobweb and Classit clustering algorithms.\n\n"
+ "Note: the application of node operators (merging, splitting etc.) in "
+ "terms of ordering and priority differs (and is somewhat ambiguous) "
+ "between the original Cobweb and Classit papers. This algorithm always "
+ "compares the best host, adding a new leaf, merging the two best hosts, "
+ "and splitting the best host when considering where to place a new "
+ "instance.\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.ARTICLE);
result.setValue(Field.AUTHOR, "D. Fisher");
result.setValue(Field.YEAR, "1987");
result.setValue(Field.TITLE,
"Knowledge acquisition via incremental conceptual clustering");
result.setValue(Field.JOURNAL, "Machine Learning");
result.setValue(Field.VOLUME, "2");
result.setValue(Field.NUMBER, "2");
result.setValue(Field.PAGES, "139-172");
additional = result.add(Type.ARTICLE);
additional.setValue(Field.AUTHOR,
"J. H. Gennari and P. Langley and D. Fisher");
additional.setValue(Field.YEAR, "1990");
additional.setValue(Field.TITLE, "Models of incremental concept formation");
additional.setValue(Field.JOURNAL, "Artificial Intelligence");
additional.setValue(Field.VOLUME, "40");
additional.setValue(Field.PAGES, "11-61");
return result;
}
/**
* Returns default capabilities of the clusterer.
*
* @return the capabilities of this clusterer
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
// other
result.setMinimumNumberInstances(0);
return result;
}
/**
* Builds the clusterer.
*
* @param data the training instances.
* @throws Exception if something goes wrong.
*/
@Override
public void buildClusterer(Instances data) throws Exception {
m_numberOfClusters = -1;
m_cobwebTree = null;
m_numberSplits = 0;
m_numberMerges = 0;
// can clusterer handle the data?
getCapabilities().testWithFail(data);
// randomize the instances
data = new Instances(data);
if (getSeed() >= 0) {
data.randomize(new Random(getSeed()));
}
for (int i = 0; i < data.numInstances(); i++) {
updateClusterer(data.instance(i));
}
updateFinished();
}
/**
* Signals the end of updating.
*/
@Override
public void updateFinished() {
determineNumberOfClusters();
}
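/*
 * Incremental usage sketch (not part of the original documentation; data is a
 * weka.core.Instances object as in the class-level example). Because the tree
 * is grown one instance at a time, training can also be driven through the
 * UpdateableClusterer methods instead of buildClusterer():
 *
 *   Cobweb clusterer = new Cobweb();
 *   for (int i = 0; i < data.numInstances(); i++) {
 *     clusterer.updateClusterer(data.instance(i));
 *   }
 *   clusterer.updateFinished(); // assigns the cluster numbers
 */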
/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the number of the assigned cluster as an integer
* @throws Exception if instance could not be classified successfully
*/
@Override
public int clusterInstance(Instance instance) throws Exception {
CNode host = m_cobwebTree;
CNode temp = null;
determineNumberOfClusters();
do {
if (host.m_children == null) {
temp = null;
break;
}
// host.updateStats(instance, false);
temp = host.findHost(instance, true);
// host.updateStats(instance, true);
if (temp != null) {
host = temp;
}
} while (temp != null);
return host.m_clusterNum;
}
/**
* determines the number of clusters if necessary
*
* @see #m_numberOfClusters
* @see #m_numberOfClustersDetermined
*/
protected void determineNumberOfClusters() {
if (!m_numberOfClustersDetermined && (m_cobwebTree != null)) {
int[] numClusts = new int[1];
numClusts[0] = 0;
try {
m_cobwebTree.assignClusterNums(numClusts);
} catch (Exception e) {
e.printStackTrace();
numClusts[0] = 0;
}
m_numberOfClusters = numClusts[0];
m_numberOfClustersDetermined = true;
}
}
/**
* Returns the number of clusters.
*
* @return the number of clusters
*/
@Override
public int numberOfClusters() {
determineNumberOfClusters();
return m_numberOfClusters;
}
/**
* Get the root of the tree.
*
* @return the root of the tree.
*/
public CNode getTreeRoot() {
return m_cobwebTree;
}
/**
* Adds an instance to the clusterer.
*
* @param newInstance the instance to be added
* @throws Exception if something goes wrong
*/
@Override
public void updateClusterer(Instance newInstance) throws Exception {
m_numberOfClustersDetermined = false;
if (m_cobwebTree == null) {
m_cobwebTree = new CNode(newInstance.numAttributes(), newInstance);
} else {
m_cobwebTree.addInstance(newInstance);
}
}
/**
* Adds an instance to the Cobweb tree.
*
* @param newInstance the instance to be added
* @throws Exception if something goes wrong
* @deprecated updateClusterer(Instance) should be used instead
* @see #updateClusterer(Instance)
*/
@Deprecated
public void addInstance(Instance newInstance) throws Exception {
updateClusterer(newInstance);
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
**/
@Override
public Enumeration