weka.clusterers.SimpleKMeans (weka-stable)
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the stable version. Apart from bugfixes, this version
does not receive any other updates.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* SimpleKMeans.java
* Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.clusterers;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.ManhattanDistance;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
/**
* Cluster data using the k means algorithm. Can use
* either the Euclidean distance (default) or the Manhattan distance. If the
* Manhattan distance is used, then centroids are computed as the component-wise
* median rather than mean. For more information see:
*
* D. Arthur, S. Vassilvitskii: k-means++: the advantages of careful seeding.
* In: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete
* algorithms, 1027-1035, 2007.
*
* BibTeX:
*
* @inproceedings{Arthur2007,
* author = {D. Arthur and S. Vassilvitskii},
* booktitle = {Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms},
* pages = {1027-1035},
* title = {k-means++: the advantages of careful seeding},
* year = {2007}
* }
*
* Valid options are:
*
* -N <num>
* Number of clusters.
* (default 2).
*
* -init
* Initialization method to use.
* 0 = random, 1 = k-means++, 2 = canopy, 3 = farthest first.
* (default = 0)
*
* -C
* Use canopies to reduce the number of distance calculations.
*
* -max-candidates <num>
* Maximum number of candidate canopies to retain in memory
* at any one time when using canopy clustering.
* The T2 distance plus data characteristics will determine
* how many candidate canopies are formed before periodic and
* final pruning are performed, which might result in excess
* memory consumption. This setting avoids large numbers of
* candidate canopies consuming memory. (default = 100)
*
* -periodic-pruning <num>
* How often to prune low-density canopies when using canopy clustering.
* (default = every 10,000 training instances)
*
* -min-density
* Minimum canopy density, when using canopy clustering, below which
* a canopy will be pruned during periodic pruning. (default = 2 instances)
*
* -t2
* The T2 distance to use when using canopy clustering. Values < 0 indicate
* that a heuristic based on attribute std. deviation should be used to set
* this. (default = -1.0)
*
* -t1
* The T1 distance to use when using canopy clustering. A value < 0 is taken
* as a positive multiplier for T2. (default = -1.5)
*
* -V
* Display std. deviations for centroids.
*
* -M
* Don't replace missing values with mean/mode.
*
* -A <classname and options>
* Distance function to use.
* (default: weka.core.EuclideanDistance)
*
* -I <num>
* Maximum number of iterations.
*
* -O
* Preserve order of instances.
*
* -fast
* Enables faster distance calculations, using cut-off values.
* Disables the calculation/output of squared errors/distances.
*
* -num-slots <num>
* Number of execution slots.
* (default 1 - i.e. no parallelism)
*
* -S <num>
* Random number seed.
* (default 10)
*
* -output-debug-info
* If set, clusterer is run in debug mode and
* may output additional info to the console.
*
* -do-not-check-capabilities
* If set, clusterer capabilities are not checked before clusterer is built
* (use with caution).
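*
* A minimal usage sketch (illustrative; the file name "data.arff" and the
* number of clusters are assumptions, not part of this class):
*
*   Instances data = new Instances(
*     new java.io.BufferedReader(new java.io.FileReader("data.arff")));
*   SimpleKMeans km = new SimpleKMeans();
*   km.setNumClusters(3);    // -N 3
*   km.buildClusterer(data); // the data must not have a class index set
*   int cluster = km.clusterInstance(data.instance(0));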
*
* @author Mark Hall ([email protected])
* @author Eibe Frank ([email protected])
* @version $Revision: 11444 $
* @see RandomizableClusterer
*/
public class SimpleKMeans extends RandomizableClusterer implements
NumberOfClustersRequestable, WeightedInstancesHandler,
TechnicalInformationHandler {
/** for serialization. */
static final long serialVersionUID = -3235809600124455376L;
/**
* replace missing values in training instances.
*/
protected ReplaceMissingValues m_ReplaceMissingFilter;
/**
* number of clusters to generate.
*/
protected int m_NumClusters = 2;
/**
* Holds the initial start points, as supplied by the initialization method
* used
*/
protected Instances m_initialStartPoints;
/**
* holds the cluster centroids.
*/
protected Instances m_ClusterCentroids;
/**
* Holds the standard deviations of the numeric attributes in each cluster.
*/
protected Instances m_ClusterStdDevs;
/**
* For each cluster, holds the frequency counts for the values of each nominal
* attribute.
*/
protected double[][][] m_ClusterNominalCounts;
protected double[][] m_ClusterMissingCounts;
/**
* Stats on the full data set for comparison purposes. If the attribute is
* numeric, the value is the mean (when the Euclidean distance is used) or
* the median (when the Manhattan distance is used); if the attribute is
* nominal, its mode is saved.
*/
protected double[] m_FullMeansOrMediansOrModes;
protected double[] m_FullStdDevs;
protected double[][] m_FullNominalCounts;
protected double[] m_FullMissingCounts;
/**
* Display standard deviations for numeric atts.
*/
protected boolean m_displayStdDevs;
/**
* Replace missing values globally?
*/
protected boolean m_dontReplaceMissing = false;
/**
* The number of instances in each cluster.
*/
protected double[] m_ClusterSizes;
/**
* Maximum number of iterations to be executed.
*/
protected int m_MaxIterations = 500;
/**
* Keep track of the number of iterations completed before convergence.
*/
protected int m_Iterations = 0;
/**
* Holds the squared errors for all clusters.
*/
protected double[] m_squaredErrors;
/** the distance function used. */
protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
/**
* Preserve order of instances.
*/
protected boolean m_PreserveOrder = false;
/**
* Assignments obtained.
*/
protected int[] m_Assignments = null;
/** whether to use fast calculation of distances (using a cut-off). */
protected boolean m_FastDistanceCalc = false;
public static final int RANDOM = 0;
public static final int KMEANS_PLUS_PLUS = 1;
public static final int CANOPY = 2;
public static final int FARTHEST_FIRST = 3;
/** Initialization methods */
public static final Tag[] TAGS_SELECTION = { new Tag(RANDOM, "Random"),
new Tag(KMEANS_PLUS_PLUS, "k-means++"), new Tag(CANOPY, "Canopy"),
new Tag(FARTHEST_FIRST, "Farthest first") };
/** The initialization method to use */
protected int m_initializationMethod = RANDOM;
/**
* Whether to reduce the number of distance calcs done by k-means with
* canopies
*/
protected boolean m_speedUpDistanceCompWithCanopies = false;
/** Canopies that each centroid falls into (determined by T1 radius) */
protected List<long[]> m_centroidCanopyAssignments;
/** Canopies that each training instance falls into (determined by T1 radius) */
protected List<long[]> m_dataPointCanopyAssignments;
/** The canopy clusterer (if being used) */
protected Canopy m_canopyClusters;
/**
* The maximum number of candidate canopies to hold in memory at any one time
* (if using canopy clustering)
*/
protected int m_maxCanopyCandidates = 100;
/**
* Prune low-density candidate canopies after every x instances have been seen
* (if using canopy clustering)
*/
protected int m_periodicPruningRate = 10000;
/**
* The minimum cluster density (according to T2 distance) allowed. Used when
* periodically pruning candidate canopies (if using canopy clustering)
*/
protected double m_minClusterDensity = 2;
/** The t2 radius to pass through to Canopy */
protected double m_t2 = Canopy.DEFAULT_T2;
/** The t1 radius to pass through to Canopy */
protected double m_t1 = Canopy.DEFAULT_T1;
/** Number of threads to run */
protected int m_executionSlots = 1;
/** For parallel execution mode */
protected transient ExecutorService m_executorPool;
/**
* the default constructor.
*/
public SimpleKMeans() {
super();
m_SeedDefault = 10;
setSeed(m_SeedDefault);
}
/**
* Start the pool of execution threads
*/
protected void startExecutorPool() {
if (m_executorPool != null) {
m_executorPool.shutdownNow();
}
m_executorPool = Executors.newFixedThreadPool(m_executionSlots);
}
protected int m_completed;
protected int m_failed;
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
result = new TechnicalInformation(Type.INPROCEEDINGS);
result.setValue(Field.AUTHOR, "D. Arthur and S. Vassilvitskii");
result.setValue(Field.TITLE,
"k-means++: the advantages of carefull seeding");
result.setValue(Field.BOOKTITLE, "Proceedings of the eighteenth annual "
+ "ACM-SIAM symposium on Discrete algorithms");
result.setValue(Field.YEAR, "2007");
result.setValue(Field.PAGES, "1027-1035");
return result;
}
/**
* Returns a string describing this clusterer.
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Cluster data using the k means algorithm. Can use either "
+ "the Euclidean distance (default) or the Manhattan distance."
+ " If the Manhattan distance is used, then centroids are computed "
+ "as the component-wise median rather than mean."
+ " For more information see:\n\n" + getTechnicalInformation().toString();
}
/**
* Returns default capabilities of the clusterer.
*
* @return the capabilities of this clusterer
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
return result;
}
private class KMeansComputeCentroidTask implements Callable<double[]> {
protected Instances m_cluster;
protected int m_centroidIndex;
public KMeansComputeCentroidTask(int centroidIndex, Instances cluster) {
m_cluster = cluster;
m_centroidIndex = centroidIndex;
}
@Override
public double[] call() {
return moveCentroid(m_centroidIndex, m_cluster, true, false);
}
}
/**
* Launch the move centroids tasks
*
* @param clusters the cluster centroids
* @return the number of empty clusters
*/
protected int launchMoveCentroids(Instances[] clusters) {
int emptyClusterCount = 0;
List<Future<double[]>> results = new ArrayList<Future<double[]>>();
for (int i = 0; i < m_NumClusters; i++) {
if (clusters[i].numInstances() == 0) {
emptyClusterCount++;
} else {
Future<double[]> futureCentroid =
m_executorPool.submit(new KMeansComputeCentroidTask(i, clusters[i]));
results.add(futureCentroid);
}
}
try {
for (Future<double[]> d : results) {
m_ClusterCentroids.add(new DenseInstance(1.0, d.get()));
}
} catch (Exception ex) {
ex.printStackTrace();
}
return emptyClusterCount;
}
private class KMeansClusterTask implements Callable<Boolean> {
protected int m_start;
protected int m_end;
protected Instances m_inst;
protected int[] m_clusterAssignments;
public KMeansClusterTask(Instances inst, int start, int end,
int[] clusterAssignments) {
m_start = start;
m_end = end;
m_inst = inst;
m_clusterAssignments = clusterAssignments;
}
@Override
public Boolean call() {
boolean converged = true;
for (int i = m_start; i < m_end; i++) {
Instance toCluster = m_inst.instance(i);
long[] instanceCanopies =
m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments
.get(i) : null;
int newC = clusterInstance(toCluster, instanceCanopies);
if (newC != m_clusterAssignments[i]) {
converged = false;
}
m_clusterAssignments[i] = newC;
}
return converged;
}
protected int clusterInstance(Instance inst, long[] instanceCanopies) {
double minDist = Integer.MAX_VALUE;
int bestCluster = 0;
for (int i = 0; i < m_NumClusters; i++) {
double dist;
if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null
&& instanceCanopies.length > 0) {
try {
if (!Canopy.nonEmptyCanopySetIntersection(
m_centroidCanopyAssignments.get(i), instanceCanopies)) {
// System.err.println("Skipping distance calc... "
// + Canopy.printSingleAssignment(instanceCanopies));
continue;
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
dist =
m_DistanceFunction.distance(inst, m_ClusterCentroids.instance(i),
minDist);
if (dist < minDist) {
minDist = dist;
bestCluster = i;
}
}
return bestCluster;
}
}
/**
* Launch the tasks that assign instances to clusters
*
* @param insts the instances to be clustered
* @param clusterAssignments the array of cluster assignments
* @return true if k means has converged
* @throws Exception if a problem occurs
*/
protected boolean launchAssignToClusters(Instances insts,
int[] clusterAssignments) throws Exception {
int numPerTask = insts.numInstances() / m_executionSlots;
List<Future<Boolean>> results = new ArrayList<Future<Boolean>>();
for (int i = 0; i < m_executionSlots; i++) {
int start = i * numPerTask;
int end = start + numPerTask;
if (i == m_executionSlots - 1) {
end = insts.numInstances();
}
Future<Boolean> futureKM =
m_executorPool.submit(new KMeansClusterTask(insts, start, end,
clusterAssignments));
results.add(futureKM);
}
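// Worked example (illustrative): with 10 instances and 3 execution slots,
// numPerTask = 10 / 3 = 3, so the submitted tasks cover the index ranges
// [0, 3), [3, 6) and [6, 10); the final slot absorbs the remainder.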
boolean converged = true;
for (Future<Boolean> f : results) {
if (!f.get()) {
converged = false;
}
}
return converged;
}
/**
* Generates a clusterer. Has to initialize all fields of the clusterer that
* are not being set via options.
*
* @param data set of instances serving as training data
* @throws Exception if the clusterer has not been generated successfully
*/
@Override
public void buildClusterer(Instances data) throws Exception {
m_canopyClusters = null;
// can clusterer handle the data?
getCapabilities().testWithFail(data);
m_Iterations = 0;
m_ReplaceMissingFilter = new ReplaceMissingValues();
Instances instances = new Instances(data);
instances.setClassIndex(-1);
if (!m_dontReplaceMissing) {
m_ReplaceMissingFilter.setInputFormat(instances);
instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
}
m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][];
m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
if (m_displayStdDevs) {
m_FullStdDevs = instances.variances();
}
m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);
m_FullMissingCounts = m_ClusterMissingCounts[0];
m_FullNominalCounts = m_ClusterNominalCounts[0];
double sumOfWeights = instances.sumOfWeights();
for (int i = 0; i < instances.numAttributes(); i++) {
if (instances.attribute(i).isNumeric()) {
if (m_displayStdDevs) {
m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
}
if (m_FullMissingCounts[i] == sumOfWeights) {
m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
}
} else {
if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils
.maxIndex(m_FullNominalCounts[i])]) {
m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
// value
}
}
}
m_ClusterCentroids = new Instances(instances, m_NumClusters);
int[] clusterAssignments = new int[instances.numInstances()];
if (m_PreserveOrder) {
m_Assignments = clusterAssignments;
}
m_DistanceFunction.setInstances(instances);
Random RandomO = new Random(getSeed());
int instIndex;
HashMap<DecisionTableHashKey, Integer> initC =
new HashMap<DecisionTableHashKey, Integer>();
DecisionTableHashKey hk = null;
Instances initInstances = null;
if (m_PreserveOrder) {
initInstances = new Instances(instances);
} else {
initInstances = instances;
}
if (m_speedUpDistanceCompWithCanopies) {
m_canopyClusters = new Canopy();
m_canopyClusters.setNumClusters(m_NumClusters);
m_canopyClusters.setSeed(getSeed());
m_canopyClusters.setT2(getCanopyT2());
m_canopyClusters.setT1(getCanopyT1());
m_canopyClusters
.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
m_canopyClusters.setDebug(getDebug());
m_canopyClusters.buildClusterer(initInstances);
// System.err.println(m_canopyClusters);
m_centroidCanopyAssignments = new ArrayList<long[]>();
m_dataPointCanopyAssignments = new ArrayList<long[]>();
}
if (m_initializationMethod == KMEANS_PLUS_PLUS) {
kMeansPlusPlusInit(initInstances);
m_initialStartPoints = new Instances(m_ClusterCentroids);
} else if (m_initializationMethod == CANOPY) {
canopyInit(initInstances);
m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
} else if (m_initializationMethod == FARTHEST_FIRST) {
farthestFirstInit(initInstances);
m_initialStartPoints = new Instances(m_ClusterCentroids);
} else {
// random
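// Partial Fisher-Yates shuffle: pick a random index in [0, j], add that
// instance as a centroid if its attribute values haven't been used yet
// (duplicates detected via DecisionTableHashKey), then swap it to
// position j so it can't be drawn again.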
for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
instIndex = RandomO.nextInt(j + 1);
hk =
new DecisionTableHashKey(initInstances.instance(instIndex),
initInstances.numAttributes(), true);
if (!initC.containsKey(hk)) {
m_ClusterCentroids.add(initInstances.instance(instIndex));
initC.put(hk, null);
}
initInstances.swap(j, instIndex);
if (m_ClusterCentroids.numInstances() == m_NumClusters) {
break;
}
}
m_initialStartPoints = new Instances(m_ClusterCentroids);
}
if (m_speedUpDistanceCompWithCanopies) {
// assign canopies to training data
for (int i = 0; i < instances.numInstances(); i++) {
m_dataPointCanopyAssignments.add(m_canopyClusters
.assignCanopies(instances.instance(i)));
}
}
m_NumClusters = m_ClusterCentroids.numInstances();
// removing reference
initInstances = null;
int i;
boolean converged = false;
int emptyClusterCount;
Instances[] tempI = new Instances[m_NumClusters];
m_squaredErrors = new double[m_NumClusters];
m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
startExecutorPool();
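// Main k-means loop: assign every instance to its closest centroid, then
// recompute each centroid from its members; iterate until assignments no
// longer change or m_MaxIterations is reached.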
while (!converged) {
if (m_speedUpDistanceCompWithCanopies) {
// re-assign canopies to the current cluster centers
m_centroidCanopyAssignments.clear();
for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
m_centroidCanopyAssignments.add(m_canopyClusters
.assignCanopies(m_ClusterCentroids.instance(kk)));
}
}
emptyClusterCount = 0;
m_Iterations++;
converged = true;
if (m_executionSlots <= 1
|| instances.numInstances() < 2 * m_executionSlots) {
for (i = 0; i < instances.numInstances(); i++) {
Instance toCluster = instances.instance(i);
int newC =
clusterProcessedInstance(
toCluster,
false,
true,
m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments
.get(i) : null);
if (newC != clusterAssignments[i]) {
converged = false;
}
clusterAssignments[i] = newC;
}
} else {
converged = launchAssignToClusters(instances, clusterAssignments);
}
// update centroids
m_ClusterCentroids = new Instances(instances, m_NumClusters);
for (i = 0; i < m_NumClusters; i++) {
tempI[i] = new Instances(instances, 0);
}
for (i = 0; i < instances.numInstances(); i++) {
tempI[clusterAssignments[i]].add(instances.instance(i));
}
if (m_executionSlots <= 1
|| instances.numInstances() < 2 * m_executionSlots) {
for (i = 0; i < m_NumClusters; i++) {
if (tempI[i].numInstances() == 0) {
// empty cluster
emptyClusterCount++;
} else {
moveCentroid(i, tempI[i], true, true);
}
}
} else {
emptyClusterCount = launchMoveCentroids(tempI);
}
if (m_Iterations == m_MaxIterations) {
converged = true;
}
if (emptyClusterCount > 0) {
m_NumClusters -= emptyClusterCount;
if (converged) {
Instances[] t = new Instances[m_NumClusters];
int index = 0;
for (int k = 0; k < tempI.length; k++) {
if (tempI[k].numInstances() > 0) {
t[index] = tempI[k];
for (i = 0; i < tempI[k].numAttributes(); i++) {
m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
}
index++;
}
}
tempI = t;
} else {
tempI = new Instances[m_NumClusters];
}
}
if (!converged) {
m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
}
}
// calculate errors
if (!m_FastDistanceCalc) {
for (i = 0; i < instances.numInstances(); i++) {
clusterProcessedInstance(instances.instance(i), true, false, null);
}
}
if (m_displayStdDevs) {
m_ClusterStdDevs = new Instances(instances, m_NumClusters);
}
m_ClusterSizes = new double[m_NumClusters];
for (i = 0; i < m_NumClusters; i++) {
if (m_displayStdDevs) {
double[] vals2 = tempI[i].variances();
for (int j = 0; j < instances.numAttributes(); j++) {
if (instances.attribute(j).isNumeric()) {
vals2[j] = Math.sqrt(vals2[j]);
} else {
vals2[j] = Utils.missingValue();
}
}
m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
}
m_ClusterSizes[i] = tempI[i].sumOfWeights();
}
m_executorPool.shutdown();
// save memory!
m_DistanceFunction.clean();
}
/**
* Initialize with the canopy centers of the Canopy clustering method
*
* @param data the training data
* @throws Exception if a problem occurs
*/
protected void canopyInit(Instances data) throws Exception {
if (m_canopyClusters == null) {
m_canopyClusters = new Canopy();
m_canopyClusters.setNumClusters(m_NumClusters);
m_canopyClusters.setSeed(getSeed());
m_canopyClusters.setT2(getCanopyT2());
m_canopyClusters.setT1(getCanopyT1());
m_canopyClusters
.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
m_canopyClusters.setDebug(getDebug());
m_canopyClusters.buildClusterer(data);
}
m_ClusterCentroids = m_canopyClusters.getCanopies();
}
/**
* Initialize with the farthest-first centers
*
* @param data the training data
* @throws Exception if a problem occurs
*/
protected void farthestFirstInit(Instances data) throws Exception {
FarthestFirst ff = new FarthestFirst();
ff.setNumClusters(m_NumClusters);
ff.buildClusterer(data);
m_ClusterCentroids = ff.getClusterCentroids();
}
/**
* Initialize using the k-means++ method
*
* @param data the training data
* @throws Exception if a problem occurs
*/
protected void kMeansPlusPlusInit(Instances data) throws Exception {
Random randomO = new Random(getSeed());
HashMap<DecisionTableHashKey, Integer> initC =
new HashMap<DecisionTableHashKey, Integer>();
// choose initial center uniformly at random
int index = randomO.nextInt(data.numInstances());
m_ClusterCentroids.add(data.instance(index));
DecisionTableHashKey hk =
new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
initC.put(hk, null);
int iteration = 0;
int remainingInstances = data.numInstances() - 1;
if (m_NumClusters > 1) {
// proceed with selecting the rest
// distances to the initial randomly chosen center
double[] distances = new double[data.numInstances()];
double[] cumProbs = new double[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
distances[i] =
m_DistanceFunction.distance(data.instance(i),
m_ClusterCentroids.instance(iteration));
}
// now choose the remaining cluster centers
for (int i = 1; i < m_NumClusters; i++) {
// distances converted to probabilities
double[] weights = new double[data.numInstances()];
System.arraycopy(distances, 0, weights, 0, distances.length);
Utils.normalize(weights);
double sumOfProbs = 0;
for (int k = 0; k < data.numInstances(); k++) {
sumOfProbs += weights[k];
cumProbs[k] = sumOfProbs;
}
cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no
// rounding issues
// choose a random instance
double prob = randomO.nextDouble();
for (int k = 0; k < cumProbs.length; k++) {
if (prob < cumProbs[k]) {
Instance candidateCenter = data.instance(k);
hk =
new DecisionTableHashKey(candidateCenter, data.numAttributes(),
true);
if (!initC.containsKey(hk)) {
initC.put(hk, null);
m_ClusterCentroids.add(candidateCenter);
} else {
// we shouldn't get here because any instance that is a duplicate
// of
// an already chosen cluster center should have zero distance (and
// hence
// zero probability of getting chosen) to that center.
System.err.println("We shouldn't get here....");
}
remainingInstances--;
break;
}
}
iteration++;
if (remainingInstances == 0) {
break;
}
// prepare to choose the next cluster center.
// check distances against the new cluster center to see if it is closer
for (int k = 0; k < data.numInstances(); k++) {
if (distances[k] > 0) {
double newDist =
m_DistanceFunction.distance(data.instance(k),
m_ClusterCentroids.instance(iteration));
if (newDist < distances[k]) {
distances[k] = newDist;
}
}
}
}
}
}
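// Worked example for the sampling step above (illustrative): if the current
// distances to the nearest chosen center are {0, 2, 6}, normalizing gives
// weights {0, 0.25, 0.75} and cumulative probabilities {0, 0.25, 1.0}; a
// uniform draw of 0.4 falls in the (0.25, 1.0] interval and so selects the
// third instance. Instances already chosen have distance 0 and can never be
// re-selected.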
/**
* Move the centroid to its new coordinates. Generate the centroid
* coordinates based on its members (objects assigned to the cluster of the
* centroid) and the distance function being used.
*
* @param centroidIndex index of the centroid which the coordinates will be
* computed
* @param members the objects that are assigned to the cluster of this
* centroid
* @param updateClusterInfo if the method is supposed to update the m_Cluster
* arrays
* @param addToCentroidInstances true if the method is to add the computed
* coordinates to the Instances holding the centroids
* @return the centroid coordinates
*/
protected double[] moveCentroid(int centroidIndex, Instances members,
boolean updateClusterInfo, boolean addToCentroidInstances) {
double[] vals = new double[members.numAttributes()];
double[][] nominalDists = new double[members.numAttributes()][];
double[] weightMissing = new double[members.numAttributes()];
double[] weightNonMissing = new double[members.numAttributes()];
// Quickly calculate some relevant statistics
for (int j = 0; j < members.numAttributes(); j++) {
if (members.attribute(j).isNominal()) {
nominalDists[j] = new double[members.attribute(j).numValues()];
}
}
for (Instance inst : members) {
for (int j = 0; j < members.numAttributes(); j++) {
if (inst.isMissing(j)) {
weightMissing[j] += inst.weight();
} else {
weightNonMissing[j] += inst.weight();
if (members.attribute(j).isNumeric()) {
vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case
} else {
nominalDists[j][(int)inst.value(j)] += inst.weight();
}
}
}
}
for (int j = 0; j < members.numAttributes(); j++) {
if (members.attribute(j).isNumeric()) {
if (weightNonMissing[j] > 0) {
vals[j] /= weightNonMissing[j];
} else {
vals[j] = Utils.missingValue();
}
} else {
double max = -Double.MAX_VALUE;
double maxIndex = -1;
for (int i = 0; i < nominalDists[j].length; i++) {
if (nominalDists[j][i] > max) {
max = nominalDists[j][i];
maxIndex = i;
}
if (max < weightMissing[j]) {
vals[j] = Utils.missingValue();
} else {
vals[j] = maxIndex;
}
}
}
}
if (m_DistanceFunction instanceof ManhattanDistance) {
// Need to replace means by medians
Instances sortedMembers = null;
int middle = (members.numInstances() - 1) / 2;
boolean dataIsEven = ((members.numInstances() % 2) == 0);
if (m_PreserveOrder) {
sortedMembers = members;
} else {
sortedMembers = new Instances(members);
}
for (int j = 0; j < members.numAttributes(); j++) {
if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) {
// singleton special case
if (members.numInstances() == 1) {
vals[j] = members.instance(0).value(j);
} else {
vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
if (dataIsEven) {
vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
}
}
}
}
}
if (updateClusterInfo) {
for (int j = 0; j < members.numAttributes(); j++) {
m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j];
m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j];
}
}
if (addToCentroidInstances) {
m_ClusterCentroids.add(new DenseInstance(1.0, vals));
}
return vals;
}
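// Worked example (illustrative): for a numeric attribute whose cluster
// members have values {1, 2, 9}, the Euclidean case stores the mean 4.0,
// while the Manhattan case stores the median 2.0; with the even-sized set
// {1, 2, 9, 12} the Manhattan case stores (2 + 9) / 2 = 5.5.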
/**
* clusters an instance that has been through the filters.
*
* @param instance the instance to assign a cluster to
* @param updateErrors if true, update the within clusters sum of errors
* @param useFastDistCalc whether to use the fast distance calculation or not
* @param instanceCanopies the canopies covering the instance to be clustered,
* or null if not using the option to reduce the number of distance
* computations via canopies
* @return a cluster number
*/
private int clusterProcessedInstance(Instance instance, boolean updateErrors,
boolean useFastDistCalc, long[] instanceCanopies) {
double minDist = Integer.MAX_VALUE;
int bestCluster = 0;
for (int i = 0; i < m_NumClusters; i++) {
double dist;
if (useFastDistCalc) {
if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null
&& instanceCanopies.length > 0) {
try {
if (!Canopy.nonEmptyCanopySetIntersection(
m_centroidCanopyAssignments.get(i), instanceCanopies)) {
continue;
}
} catch (Exception ex) {
ex.printStackTrace();
}
dist =
m_DistanceFunction.distance(instance,
m_ClusterCentroids.instance(i), minDist);
} else {
dist =
m_DistanceFunction.distance(instance,
m_ClusterCentroids.instance(i), minDist);
}
} else {
dist =
m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i));
}
if (dist < minDist) {
minDist = dist;
bestCluster = i;
}
}
if (updateErrors) {
if (m_DistanceFunction instanceof EuclideanDistance) {
// Euclidean distance to Squared Euclidean distance
minDist *= minDist * instance.weight();
}
m_squaredErrors[bestCluster] += minDist;
}
return bestCluster;
}
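// Note on the -fast option: the three-argument form
// m_DistanceFunction.distance(inst, centroid, minDist) lets the distance
// function stop accumulating as soon as the partial distance exceeds the
// current best, at the cost of not producing exact distances for the
// squared-error statistics.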
/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the number of the assigned cluster as an integer if the class is
* enumerated, otherwise the predicted value
* @throws Exception if instance could not be classified successfully
*/
@Override
public int clusterInstance(Instance instance) throws Exception {
Instance inst = null;
if (!m_dontReplaceMissing) {
m_ReplaceMissingFilter.input(instance);
m_ReplaceMissingFilter.batchFinished();
inst = m_ReplaceMissingFilter.output();
} else {
inst = instance;
}
return clusterProcessedInstance(inst, false, true, null);
}
/**
* Returns the number of clusters.
*
* @return the number of clusters generated for a training dataset.
* @throws Exception if number of clusters could not be returned successfully
*/
@Override
public int numberOfClusters() throws Exception {
return m_NumClusters;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration<Option>