weka.clusterers.HierarchicalClusterer
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* HierarchicalClusterer.java
* Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
*/
package weka.clusterers;
import java.io.Serializable;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Locale;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.PriorityQueue;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DistanceFunction;
import weka.core.Drawable;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
/**
* Hierarchical clustering class. Implements a number
* of classic hierarchical clustering methods.
*
* Valid options are:
*
* <pre>
* -N
*  Number of clusters.
*
* -L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]
*  Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted
*  complete, Neighbor Joining).
*
* -A
*  Distance function to use. (default: weka.core.EuclideanDistance)
*
* -P
*  Print hierarchy in Newick format, which can be used for display in other
*  programs.
*
* -D
*  If set, the clusterer is run in debug mode and may output additional info
*  to the console.
*
* -B
*  If set, distance is interpreted as branch length; otherwise it is node
*  height.
* </pre>
*
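* Example usage (a minimal sketch; "data.arff" is a placeholder file name):
*
* <pre>
* Instances data = new Instances(new java.io.BufferedReader(
*     new java.io.FileReader("data.arff")));
* HierarchicalClusterer hc = new HierarchicalClusterer();
* hc.setNumClusters(3);
* hc.setDistanceFunction(new EuclideanDistance());
* hc.buildClusterer(data);
* int cluster = hc.clusterInstance(data.instance(0));
* </pre>
*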
* @author Remco Bouckaert ([email protected], [email protected])
* @author Eibe Frank ([email protected])
* @version $Revision: 15519 $
*/
public class HierarchicalClusterer extends AbstractClusterer implements
OptionHandler, Drawable {
private static final long serialVersionUID = 1L;
/**
* Whether the distance represents node height (if false) or branch length (if
* true).
*/
protected boolean m_bDistanceIsBranchLength = false;
/** training data **/
Instances m_instances;
/** number of clusters desired in clustering **/
int m_nNumClusters = 2;
public void setNumClusters(int nClusters) {
m_nNumClusters = Math.max(1, nClusters);
}
public int getNumClusters() {
return m_nNumClusters;
}
/** distance function used for comparing members of a cluster **/
protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
public DistanceFunction getDistanceFunction() {
return m_DistanceFunction;
}
public void setDistanceFunction(DistanceFunction distanceFunction) {
m_DistanceFunction = distanceFunction;
}
/**
* tuple used by the priority queue for efficient retrieval of the next pair
* of clusters to merge
**/
class Tuple {
public Tuple(double d, int i, int j, int nSize1, int nSize2) {
m_fDist = d;
m_iCluster1 = i;
m_iCluster2 = j;
m_nClusterSize1 = nSize1;
m_nClusterSize2 = nSize2;
}
double m_fDist;
int m_iCluster1;
int m_iCluster2;
int m_nClusterSize1;
int m_nClusterSize2;
}
/** comparator used by the priority queue; orders tuples by increasing distance so the closest pair of clusters is retrieved first **/
class TupleComparator implements Comparator<Tuple> {
@Override
public int compare(Tuple o1, Tuple o2) {
if (o1.m_fDist < o2.m_fDist) {
return -1;
} else if (o1.m_fDist == o2.m_fDist) {
return 0;
}
return 1;
}
}
/** the various link types */
final static int SINGLE = 0;
final static int COMPLETE = 1;
final static int AVERAGE = 2;
final static int MEAN = 3;
final static int CENTROID = 4;
final static int WARD = 5;
final static int ADJCOMPLETE = 6;
final static int NEIGHBOR_JOINING = 7;
public static final Tag[] TAGS_LINK_TYPE = { new Tag(SINGLE, "SINGLE"),
new Tag(COMPLETE, "COMPLETE"), new Tag(AVERAGE, "AVERAGE"),
new Tag(MEAN, "MEAN"), new Tag(CENTROID, "CENTROID"),
new Tag(WARD, "WARD"), new Tag(ADJCOMPLETE, "ADJCOMPLETE"),
new Tag(NEIGHBOR_JOINING, "NEIGHBOR_JOINING") };
/**
* Holds the link type used to calculate the distance between clusters
*/
int m_nLinkType = SINGLE;
boolean m_bPrintNewick = true;
public boolean getPrintNewick() {
return m_bPrintNewick;
}
public void setPrintNewick(boolean bPrintNewick) {
m_bPrintNewick = bPrintNewick;
}
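/**
 * sets the link type used to measure the distance between clusters; the
 * selected tag must come from TAGS_LINK_TYPE, e.g.
 * setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE))
 **/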
public void setLinkType(SelectedTag newLinkType) {
if (newLinkType.getTags() == TAGS_LINK_TYPE) {
m_nLinkType = newLinkType.getSelectedTag().getID();
}
}
public SelectedTag getLinkType() {
return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
}
/** class representing node in cluster hierarchy **/
class Node implements Serializable {
/** ID added to avoid warning */
private static final long serialVersionUID = 7639483515789717908L;
Node m_left;
Node m_right;
Node m_parent;
int m_iLeftInstance;
int m_iRightInstance;
double m_fLeftLength = 0;
double m_fRightLength = 0;
double m_fHeight = 0;
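/**
 * writes the subtree rooted at this node in Newick format, using the given
 * (string) attribute as leaf label, e.g. "(A:0.1,(B:0.05,C:0.05):0.1)"
 **/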
public String toString(int attIndex) {
NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en","US"));
DecimalFormat myFormatter = (DecimalFormat)nf;
myFormatter.applyPattern("#.#####");
if (m_left == null) {
if (m_right == null) {
return "("
+ m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).stringValue(attIndex)
+ ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "("
+ m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).stringValue(attIndex)
+ ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
}
}
public String toString2(int attIndex) {
NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en","US"));
DecimalFormat myFormatter = (DecimalFormat)nf;
myFormatter.applyPattern("#.#####");
if (m_left == null) {
if (m_right == null) {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex)
+ ":" + myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).value(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex)
+ ":" + myFormatter.format(m_fLeftLength) + ","
+ m_right.toString2(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString2(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).value(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString2(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString2(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
}
}
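/**
 * sets the height of this node; the branch lengths to the children are
 * derived from the children's heights
 **/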
void setHeight(double fHeight1, double fHeight2) {
m_fHeight = fHeight1;
if (m_left == null) {
m_fLeftLength = fHeight1;
} else {
m_fLeftLength = fHeight1 - m_left.m_fHeight;
}
if (m_right == null) {
m_fRightLength = fHeight2;
} else {
m_fRightLength = fHeight2 - m_right.m_fHeight;
}
}
void setLength(double fLength1, double fLength2) {
m_fLeftLength = fLength1;
m_fRightLength = fLength2;
m_fHeight = fLength1;
if (m_left != null) {
m_fHeight += m_left.m_fHeight;
}
}
}
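/** roots of the cluster subtrees, one per final cluster **/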
protected Node[] m_clusters;
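/** cluster number assigned to each training instance **/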
int[] m_nClusterNr;
@Override
public void buildClusterer(Instances data) throws Exception {
// System.err.println("Method " + m_nLinkType);
m_instances = data;
int nInstances = m_instances.numInstances();
if (nInstances == 0) {
return;
}
m_DistanceFunction.setInstances(m_instances);
// use array of integer vectors to store cluster indices,
// starting with one cluster per instance
@SuppressWarnings("unchecked")
Vector<Integer>[] nClusterID = new Vector[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
nClusterID[i] = new Vector<Integer>();
nClusterID[i].add(i);
}
// calculate distance matrix
int nClusters = data.numInstances();
// used for keeping track of hierarchy
Node[] clusterNodes = new Node[nInstances];
if (m_nLinkType == NEIGHBOR_JOINING) {
neighborJoining(nClusters, nClusterID, clusterNodes);
} else {
doLinkClustering(nClusters, nClusterID, clusterNodes);
}
// move all cluster memberships into the m_nClusterNr array
// & collect hierarchy
int iCurrent = 0;
m_clusters = new Node[m_nNumClusters];
m_nClusterNr = new int[nInstances];
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size() > 0) {
for (int j = 0; j < nClusterID[i].size(); j++) {
m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
}
m_clusters[iCurrent] = clusterNodes[i];
iCurrent++;
}
}
} // buildClusterer
/**
* Uses the neighbor-joining algorithm for clustering. This is roughly based
* on the simple RapidNJ implementation and runs in O(n^3) time; more
* efficient implementations exist, see RapidNJ (or my GPU implementation :-)).
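* In each iteration the pair of active clusters (i, j) minimizing
* d(i,j) - S(i) - S(j) is merged, where S(i) = (sum over k of d(i,k)) / (n-2)
* is the separation of cluster i.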
*
* @param nClusters number of active clusters
* @param nClusterID array of cluster memberships, one vector of instance
*          indices per cluster
* @param clusterNodes array of nodes tracking the cluster hierarchy
*/
void neighborJoining(int nClusters, Vector<Integer>[] nClusterID,
Node[] clusterNodes) {
int n = m_instances.numInstances();
double[][] fDist = new double[nClusters][nClusters];
for (int i = 0; i < nClusters; i++) {
fDist[i][i] = 0;
for (int j = i + 1; j < nClusters; j++) {
fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDist[j][i] = fDist[i][j];
}
}
double[] fSeparationSums = new double[n];
double[] fSeparations = new double[n];
int[] nNextActive = new int[n];
// calculate initial separation rows
for (int i = 0; i < n; i++) {
double fSum = 0;
for (int j = 0; j < n; j++) {
fSum += fDist[i][j];
}
fSeparationSums[i] = fSum;
fSeparations[i] = fSum / (nClusters - 2);
nNextActive[i] = i + 1;
}
while (nClusters > 2) {
// find minimum
int iMin1 = -1;
int iMin2 = -1;
double fMin = Double.MAX_VALUE;
if (m_Debug) {
for (int i = 0; i < n; i++) {
if (nClusterID[i].size() > 0) {
double[] fRow = fDist[i];
double fSep1 = fSeparations[i];
for (int j = 0; j < n; j++) {
if (nClusterID[j].size() > 0 && i != j) {
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if (fVal < fMin) {
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
}
}
}
}
} else {
int i = 0;
while (i < n) {
double fSep1 = fSeparations[i];
double[] fRow = fDist[i];
int j = nNextActive[i];
while (j < n) {
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if (fVal < fMin) {
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
j = nNextActive[j];
}
i = nNextActive[i];
}
}
// record distance
double fMinDistance = fDist[iMin1][iMin2];
nClusters--;
double fSep1 = fSeparations[iMin1];
double fSep2 = fSeparations[iMin2];
double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
if (nClusters > 2) {
// update separations & distance
double fNewSeparationSum = 0;
double fMutualDistance = fDist[iMin1][iMin2];
double[] fRow1 = fDist[iMin1];
double[] fRow2 = fDist[iMin2];
for (int i = 0; i < n; i++) {
if (i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
fRow1[i] = 0;
} else {
double fVal1 = fRow1[i];
double fVal2 = fRow2[i];
double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
fNewSeparationSum += fDistance;
// update the separationsum of cluster i.
fSeparationSums[i] += (fDistance - fVal1 - fVal2);
fSeparations[i] = fSeparationSums[i] / (nClusters - 2);
fRow1[i] = fDistance;
fDist[i][iMin1] = fDistance;
}
}
fSeparationSums[iMin1] = fNewSeparationSum;
fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
fSeparationSums[iMin2] = 0;
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
int iPrev = iMin2;
// since iMin1 < iMin2, there is always an active row before iMin2, so the
// next loop is safe
while (nClusterID[iPrev].size() == 0) {
iPrev--;
}
nNextActive[iPrev] = nNextActive[iMin2];
} else {
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
break;
}
}
for (int i = 0; i < n; i++) {
if (nClusterID[i].size() > 0) {
for (int j = i + 1; j < n; j++) {
if (nClusterID[j].size() > 0) {
double fDist1 = fDist[i][j];
if (nClusterID[i].size() == 1) {
merge(i, j, fDist1, 0, nClusterID, clusterNodes);
} else if (nClusterID[j].size() == 1) {
merge(i, j, 0, fDist1, nClusterID, clusterNodes);
} else {
merge(i, j, fDist1 / 2.0, fDist1 / 2.0, nClusterID, clusterNodes);
}
break;
}
}
}
}
} // neighborJoining
/**
* Performs clustering using a link method. This implementation uses a
* priority queue, resulting in an O(n^2 log(n)) algorithm.
*
* @param nClusters number of clusters
* @param nClusterID array of cluster memberships, one vector of instance
*          indices per cluster
* @param clusterNodes array of nodes tracking the cluster hierarchy
*/
void doLinkClustering(int nClusters, Vector<Integer>[] nClusterID,
Node[] clusterNodes) {
int nInstances = m_instances.numInstances();
PriorityQueue<Tuple> queue = new PriorityQueue<Tuple>(nClusters * nClusters
/ 2, new TupleComparator());
double[][] fDistance0 = new double[nClusters][nClusters];
double[][] fClusterDistance = null;
if (m_Debug) {
fClusterDistance = new double[nClusters][nClusters];
}
for (int i = 0; i < nClusters; i++) {
fDistance0[i][i] = 0;
for (int j = i + 1; j < nClusters; j++) {
fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDistance0[j][i] = fDistance0[i][j];
queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
if (m_Debug) {
fClusterDistance[i][j] = fDistance0[i][j];
fClusterDistance[j][i] = fDistance0[i][j];
}
}
}
while (nClusters > m_nNumClusters) {
int iMin1 = -1;
int iMin2 = -1;
// find closest two clusters
if (m_Debug) {
/* simple but inefficient implementation */
double fMinDistance = Double.MAX_VALUE;
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size() > 0) {
for (int j = i + 1; j < nInstances; j++) {
if (nClusterID[j].size() > 0) {
double fDist = fClusterDistance[i][j];
if (fDist < fMinDistance) {
fMinDistance = fDist;
iMin1 = i;
iMin2 = j;
}
}
}
}
}
merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID,
clusterNodes);
} else {
// use priority queue to find next best pair to cluster
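// stale queue entries are skipped lazily: a tuple is accepted only if the
// recorded sizes of both of its clusters still match the current sizes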
Tuple t;
do {
t = queue.poll();
} while (t != null
&& (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1 || nClusterID[t.m_iCluster2]
.size() != t.m_nClusterSize2));
iMin1 = t.m_iCluster1;
iMin2 = t.m_iCluster2;
merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
}
// merge clusters
// update distances & queue
for (int i = 0; i < nInstances; i++) {
if (i != iMin1 && nClusterID[i].size() != 0) {
int i1 = Math.min(iMin1, i);
int i2 = Math.max(iMin1, i);
double fDistance = getDistance(fDistance0, nClusterID[i1],
nClusterID[i2]);
if (m_Debug) {
fClusterDistance[i1][i2] = fDistance;
fClusterDistance[i2][i1] = fDistance;
}
queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(),
nClusterID[i2].size()));
}
}
nClusters--;
}
} // doLinkClustering
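/**
 * merges clusters iMin1 and iMin2 (with branch lengths fDist1 and fDist2),
 * moves all instances of cluster iMin2 into cluster iMin1, and records the
 * merge in the cluster hierarchy
 */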
void merge(int iMin1, int iMin2, double fDist1, double fDist2,
Vector<Integer>[] nClusterID, Node[] clusterNodes) {
if (m_Debug) {
System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " "
+ fDist2);
}
if (iMin1 > iMin2) {
int h = iMin1;
iMin1 = iMin2;
iMin2 = h;
double f = fDist1;
fDist1 = fDist2;
fDist2 = f;
}
nClusterID[iMin1].addAll(nClusterID[iMin2]);
nClusterID[iMin2].removeAllElements();
// track hierarchy
Node node = new Node();
if (clusterNodes[iMin1] == null) {
node.m_iLeftInstance = iMin1;
} else {
node.m_left = clusterNodes[iMin1];
clusterNodes[iMin1].m_parent = node;
}
if (clusterNodes[iMin2] == null) {
node.m_iRightInstance = iMin2;
} else {
node.m_right = clusterNodes[iMin2];
clusterNodes[iMin2].m_parent = node;
}
if (m_bDistanceIsBranchLength) {
node.setLength(fDist1, fDist2);
} else {
node.setHeight(fDist1, fDist2);
}
clusterNodes[iMin1] = node;
} // merge
/** calculates the distance between two clusters the first time, when setting up the distance matrix **/
double getDistance0(Vector<Integer> cluster1, Vector<Integer> cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
case NEIGHBOR_JOINING:
case CENTROID:
case COMPLETE:
case ADJCOMPLETE:
case AVERAGE:
case MEAN:
// set up two instances for distance function
Instance instance1 = (Instance) m_instances.instance(
cluster1.elementAt(0)).copy();
Instance instance2 = (Instance) m_instances.instance(
cluster2.elementAt(0)).copy();
fBestDist = m_DistanceFunction.distance(instance1, instance2);
break;
case WARD: {
// finds the distance corresponding to the increase in the error sum of
// squares (ESS) caused by merging the two clusters; the ESS of a cluster
// is calculated over the distances between its members and its centroid:
// d = |C1 u C2| * ESS(C1 u C2) - |C1| * ESS(C1) - |C2| * ESS(C2)
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2
* cluster2.size();
}
break;
}
return fBestDist;
} // getDistance0
/**
* calculate the distance between two clusters
*
* @param fDistance matrix of pairwise distances between instances
* @param cluster1 list of indices of instances in the first cluster
* @param cluster2 ditto for the second cluster
* @return distance between clusters based on link type
*/
double getDistance(double[][] fDistance, Vector<Integer> cluster1,
Vector<Integer> cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
// find single link distance aka minimum link, which is the closest
// distance between
// any item in cluster1 and any item in cluster2
fBestDist = Double.MAX_VALUE;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist > fDist) {
fBestDist = fDist;
}
}
}
break;
case COMPLETE:
case ADJCOMPLETE:
// find complete link distance aka maximum link, which is the largest
// distance between
// any item in cluster1 and any item in cluster2
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist < fDist) {
fBestDist = fDist;
}
}
}
if (m_nLinkType == COMPLETE) {
break;
}
// calculate adjustment, which is the largest within cluster distance
double fMaxDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = i + 1; j < cluster1.size(); j++) {
int i2 = cluster1.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
for (int i = 0; i < cluster2.size(); i++) {
int i1 = cluster2.elementAt(i);
for (int j = i + 1; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
fBestDist -= fMaxDist;
break;
case AVERAGE:
// finds average distance between the elements of the two clusters
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
fBestDist /= (cluster1.size() * cluster2.size());
break;
case MEAN: {
// calculates the mean distance within the merged cluster (aka group-average
// agglomerative clustering)
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
fBestDist = 0;
for (int i = 0; i < merged.size(); i++) {
int i1 = merged.elementAt(i);
for (int j = i + 1; j < merged.size(); j++) {
int i2 = merged.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
int n = merged.size();
fBestDist /= (n * (n - 1.0) / 2.0);
}
break;
case CENTROID:
// finds the distance of the centroids of the clusters
double[] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster1.size(); i++) {
Instance instance = m_instances.instance(cluster1.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
double[] fValues2 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster2.size(); i++) {
Instance instance = m_instances.instance(cluster2.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues2[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster1.size();
fValues2[j] /= cluster2.size();
}
fBestDist = m_DistanceFunction.distance(m_instances.instance(0).copy(fValues1),
m_instances.instance(0).copy(fValues2));
break;
case WARD: {
// finds the distance corresponding to the increase in the error sum of
// squares (ESS) caused by merging the two clusters; the ESS of a cluster
// is calculated over the distances between its members and its centroid:
// d = |C1 u C2| * ESS(C1 u C2) - |C1| * ESS(C1) - |C2| * ESS(C2)
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2
* cluster2.size();
}
break;
}
return fBestDist;
} // getDistance
/** calculates the error sum-of-squares for the instances in a cluster wrt the cluster centroid **/
double calcESS(Vector<Integer> cluster) {
double[] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster.size();
}
// set up instance for distance function
Instance centroid = m_instances.instance(cluster.elementAt(0)).copy(fValues1);
double fESS = 0;
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
fESS += m_DistanceFunction.distance(centroid, instance);
}
return fESS / cluster.size();
} // calcESS
/**
 * Instances are assigned a cluster by finding the instance in the training
 * data with the closest distance to the instance to be clustered. The
 * cluster index of that training data point is taken as the cluster index.
 */
@Override
public int clusterInstance(Instance instance) throws Exception {
if (m_instances.numInstances() == 0) {
return 0;
}
double fBestDist = Double.MAX_VALUE;
int iBestInstance = -1;
for (int i = 0; i < m_instances.numInstances(); i++) {
double fDist = m_DistanceFunction.distance(instance,
m_instances.instance(i));
if (fDist < fBestDist) {
fBestDist = fDist;
iBestInstance = i;
}
}
return m_nClusterNr[iBestInstance];
}
/**
 * Creates a distribution in which all clusters have zero probability, except
 * the cluster the instance is assigned to.
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
if (numberOfClusters() == 0) {
double[] p = new double[1];
p[0] = 1;
return p;
}
double[] p = new double[numberOfClusters()];
p[clusterInstance(instance)] = 1.0;
return p;
}
@Override
public Capabilities getCapabilities() {
Capabilities result = new Capabilities(this);
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
result.enable(Capability.STRING_ATTRIBUTES);
// other
result.setMinimumNumberInstances(0);
return result;
}
@Override
public int numberOfClusters() throws Exception {
return Math.min(m_nNumClusters, m_instances.numInstances());
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration<Option> listOptions()