All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.clusterers.SimpleKMeans Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    SimpleKMeans.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */
package weka.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.ManhattanDistance;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 *  Cluster data using the k means algorithm. Can use
 * either the Euclidean distance (default) or the Manhattan distance. If the
 * Manhattan distance is used, then centroids are computed as the component-wise
 * median rather than mean. For more information see:
*
* D. Arthur, S. Vassilvitskii: k-means++: the advantages of carefull seeding. * In: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete * algorithms, 1027-1035, 2007. *

* * * BibTeX: * *

 * @inproceedings{Arthur2007,
 *    author = {D. Arthur and S. Vassilvitskii},
 *    booktitle = {Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms},
 *    pages = {1027-1035},
 *    title = {k-means++: the advantages of carefull seeding},
 *    year = {2007}
 * }
 * 
*

* * * Valid options are: *

* *

 * -N <num>
 *  Number of clusters.
 *  (default 2).
 * 
* *
 * -init
 *  Initialization method to use.
 *  0 = random, 1 = k-means++, 2 = canopy, 3 = farthest first.
 *  (default = 0)
 * 
* *
 * -C
 *  Use canopies to reduce the number of distance calculations.
 * 
* *
 * -max-candidates <num>
 *  Maximum number of candidate canopies to retain in memory
 *  at any one time when using canopy clustering.
 *  T2 distance plus, data characteristics,
 *  will determine how many candidate canopies are formed before
 *  periodic and final pruning are performed, which might result
 *  in exceess memory consumption. This setting avoids large numbers
 *  of candidate canopies consuming memory. (default = 100)
 * 
* *
 * -periodic-pruning <num>
 *  How often to prune low density canopies when using canopy clustering. 
 *  (default = every 10,000 training instances)
 * 
* *
 * -min-density
 *  Minimum canopy density, when using canopy clustering, below which
 *   a canopy will be pruned during periodic pruning. (default = 2 instances)
 * 
* *
 * -t2
 *  The T2 distance to use when using canopy clustering. Values < 0 indicate that
 *  a heuristic based on attribute std. deviation should be used to set this.
 *  (default = -1.0)
 * 
* *
 * -t1
 *  The T1 distance to use when using canopy clustering. A value < 0 is taken as a
 *  positive multiplier for T2. (default = -1.5)
 * 
* *
 * -V
 *  Display std. deviations for centroids.
 * 
* *
 * -M
 *  Don't replace missing values with mean/mode.
 * 
* *
 * -A <classname and options>
 *  Distance function to use.
 *  (default: weka.core.EuclideanDistance)
 * 
* *
 * -I <num>
 *  Maximum number of iterations.
 * 
* *
 * -O
 *  Preserve order of instances.
 * 
* *
 * -fast
 *  Enables faster distance calculations, using cut-off values.
 *  Disables the calculation/output of squared errors/distances.
 * 
* *
 * -num-slots <num>
 *  Number of execution slots.
 *  (default 1 - i.e. no parallelism)
 * 
* *
 * -S <num>
 *  Random number seed.
 *  (default 10)
 * 
* *
 * -output-debug-info
 *  If set, clusterer is run in debug mode and
 *  may output additional info to the console
 * 
* *
 * -do-not-check-capabilities
 *  If set, clusterer capabilities are not checked before clusterer is built
 *  (use with caution).
 * 
* * * * @author Mark Hall ([email protected]) * @author Eibe Frank ([email protected]) * @version $Revision: 11444 $ * @see RandomizableClusterer */ public class SimpleKMeans extends RandomizableClusterer implements NumberOfClustersRequestable, WeightedInstancesHandler, TechnicalInformationHandler { /** for serialization. */ static final long serialVersionUID = -3235809600124455376L; /** * replace missing values in training instances. */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** * number of clusters to generate. */ protected int m_NumClusters = 2; /** * Holds the initial start points, as supplied by the initialization method * used */ protected Instances m_initialStartPoints; /** * holds the cluster centroids. */ protected Instances m_ClusterCentroids; /** * Holds the standard deviations of the numeric attributes in each cluster. */ protected Instances m_ClusterStdDevs; /** * For each cluster, holds the frequency counts for the values of each nominal * attribute. */ protected double[][][] m_ClusterNominalCounts; protected double[][] m_ClusterMissingCounts; /** * Stats on the full data set for comparison purposes. In case the attribute * is numeric the value is the mean if is being used the Euclidian distance or * the median if Manhattan distance and if the attribute is nominal then it's * mode is saved. */ protected double[] m_FullMeansOrMediansOrModes; protected double[] m_FullStdDevs; protected double[][] m_FullNominalCounts; protected double[] m_FullMissingCounts; /** * Display standard deviations for numeric atts. */ protected boolean m_displayStdDevs; /** * Replace missing values globally? */ protected boolean m_dontReplaceMissing = false; /** * The number of instances in each cluster. */ protected double[] m_ClusterSizes; /** * Maximum number of iterations to be executed. */ protected int m_MaxIterations = 500; /** * Keep track of the number of iterations completed before convergence. */ protected int m_Iterations = 0; /** * Holds the squared errors for all clusters. */ protected double[] m_squaredErrors; /** the distance function used. */ protected DistanceFunction m_DistanceFunction = new EuclideanDistance(); /** * Preserve order of instances. */ protected boolean m_PreserveOrder = false; /** * Assignments obtained. */ protected int[] m_Assignments = null; /** whether to use fast calculation of distances (using a cut-off). */ protected boolean m_FastDistanceCalc = false; public static final int RANDOM = 0; public static final int KMEANS_PLUS_PLUS = 1; public static final int CANOPY = 2; public static final int FARTHEST_FIRST = 3; /** Initialization methods */ public static final Tag[] TAGS_SELECTION = { new Tag(RANDOM, "Random"), new Tag(KMEANS_PLUS_PLUS, "k-means++"), new Tag(CANOPY, "Canopy"), new Tag(FARTHEST_FIRST, "Farthest first") }; /** The initialization method to use */ protected int m_initializationMethod = RANDOM; /** * Whether to reducet the number of distance calcs done by k-means with * canopies */ protected boolean m_speedUpDistanceCompWithCanopies = false; /** Canopies that each centroid falls into (determined by T1 radius) */ protected List m_centroidCanopyAssignments; /** Canopies that each training instance falls into (determined by T1 radius) */ protected List m_dataPointCanopyAssignments; /** The canopy clusterer (if being used) */ protected Canopy m_canopyClusters; /** * The maximum number of candidate canopies to hold in memory at any one time * (if using canopy clustering) */ protected int m_maxCanopyCandidates = 100; /** * Prune low-density candidate canopies after every x instances have been seen * (if using canopy clustering) */ protected int m_periodicPruningRate = 10000; /** * The minimum cluster density (according to T2 distance) allowed. Used when * periodically pruning candidate canopies (if using canopy clustering) */ protected double m_minClusterDensity = 2; /** The t2 radius to pass through to Canopy */ protected double m_t2 = Canopy.DEFAULT_T2; /** The t1 radius to pass through to Canopy */ protected double m_t1 = Canopy.DEFAULT_T1; /** Number of threads to run */ protected int m_executionSlots = 1; /** For parallel execution mode */ protected transient ExecutorService m_executorPool; /** * the default constructor. */ public SimpleKMeans() { super(); m_SeedDefault = 10; setSeed(m_SeedDefault); } /** * Start the pool of execution threads */ protected void startExecutorPool() { if (m_executorPool != null) { m_executorPool.shutdownNow(); } m_executorPool = Executors.newFixedThreadPool(m_executionSlots); } protected int m_completed; protected int m_failed; @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "D. Arthur and S. Vassilvitskii"); result.setValue(Field.TITLE, "k-means++: the advantages of carefull seeding"); result.setValue(Field.BOOKTITLE, "Proceedings of the eighteenth annual " + "ACM-SIAM symposium on Discrete algorithms"); result.setValue(Field.YEAR, "2007"); result.setValue(Field.PAGES, "1027-1035"); return result; } /** * Returns a string describing this clusterer. * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the k means algorithm. Can use either " + "the Euclidean distance (default) or the Manhattan distance." + " If the Manhattan distance is used, then centroids are computed " + "as the component-wise median rather than mean." + " For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); result.enable(Capability.NO_CLASS); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } private class KMeansComputeCentroidTask implements Callable { protected Instances m_cluster; protected int m_centroidIndex; public KMeansComputeCentroidTask(int centroidIndex, Instances cluster) { m_cluster = cluster; m_centroidIndex = centroidIndex; } @Override public double[] call() { return moveCentroid(m_centroidIndex, m_cluster, true, false); } } /** * Launch the move centroids tasks * * @param clusters the cluster centroids * @return the number of empty clusters */ protected int launchMoveCentroids(Instances[] clusters) { int emptyClusterCount = 0; List> results = new ArrayList>(); for (int i = 0; i < m_NumClusters; i++) { if (clusters[i].numInstances() == 0) { emptyClusterCount++; } else { Future futureCentroid = m_executorPool.submit(new KMeansComputeCentroidTask(i, clusters[i])); results.add(futureCentroid); } } try { for (Future d : results) { m_ClusterCentroids.add(new DenseInstance(1.0, d.get())); } } catch (Exception ex) { ex.printStackTrace(); } return emptyClusterCount; } private class KMeansClusterTask implements Callable { protected int m_start; protected int m_end; protected Instances m_inst; protected int[] m_clusterAssignments; public KMeansClusterTask(Instances inst, int start, int end, int[] clusterAssignments) { m_start = start; m_end = end; m_inst = inst; m_clusterAssignments = clusterAssignments; } @Override public Boolean call() { boolean converged = true; for (int i = m_start; i < m_end; i++) { Instance toCluster = m_inst.instance(i); long[] instanceCanopies = m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments .get(i) : null; int newC = clusterInstance(toCluster, instanceCanopies); if (newC != m_clusterAssignments[i]) { converged = false; } m_clusterAssignments[i] = newC; } return converged; } protected int clusterInstance(Instance inst, long[] instanceCanopies) { double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { double dist; if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null && instanceCanopies.length > 0) { try { if (!Canopy.nonEmptyCanopySetIntersection( m_centroidCanopyAssignments.get(i), instanceCanopies)) { // System.err.println("Skipping distance calc... " // + Canopy.printSingleAssignment(instanceCanopies)); continue; } } catch (Exception ex) { ex.printStackTrace(); } } dist = m_DistanceFunction.distance(inst, m_ClusterCentroids.instance(i), minDist); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } } /** * Launch the tasks that assign instances to clusters * * @param insts the instances to be clustered * @param clusterAssignments the array of cluster assignments * @return true if k means has converged * @throws Exception if a problem occurs */ protected boolean launchAssignToClusters(Instances insts, int[] clusterAssignments) throws Exception { int numPerTask = insts.numInstances() / m_executionSlots; List> results = new ArrayList>(); for (int i = 0; i < m_executionSlots; i++) { int start = i * numPerTask; int end = start + numPerTask; if (i == m_executionSlots - 1) { end = insts.numInstances(); } Future futureKM = m_executorPool.submit(new KMeansClusterTask(insts, start, end, clusterAssignments)); results.add(futureKM); } boolean converged = true; for (Future f : results) { if (!f.get()) { converged = false; } } return converged; } /** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { m_canopyClusters = null; // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][]; m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = instances.variances(); } m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false); m_FullMissingCounts = m_ClusterMissingCounts[0]; m_FullNominalCounts = m_ClusterNominalCounts[0]; double sumOfWeights = instances.sumOfWeights(); for (int i = 0; i < instances.numAttributes(); i++) { if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]); } if (m_FullMissingCounts[i] == sumOfWeights) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils .maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } if (m_speedUpDistanceCompWithCanopies) { m_canopyClusters = new Canopy(); m_canopyClusters.setNumClusters(m_NumClusters); m_canopyClusters.setSeed(getSeed()); m_canopyClusters.setT2(getCanopyT2()); m_canopyClusters.setT1(getCanopyT1()); m_canopyClusters .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory()); m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate()); m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity()); m_canopyClusters.setDebug(getDebug()); m_canopyClusters.buildClusterer(initInstances); // System.err.println(m_canopyClusters); m_centroidCanopyAssignments = new ArrayList(); m_dataPointCanopyAssignments = new ArrayList(); } if (m_initializationMethod == KMEANS_PLUS_PLUS) { kMeansPlusPlusInit(initInstances); m_initialStartPoints = new Instances(m_ClusterCentroids); } else if (m_initializationMethod == CANOPY) { canopyInit(initInstances); m_initialStartPoints = new Instances(m_canopyClusters.getCanopies()); } else if (m_initializationMethod == FARTHEST_FIRST) { farthestFirstInit(initInstances); m_initialStartPoints = new Instances(m_ClusterCentroids); } else { // random for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_initialStartPoints = new Instances(m_ClusterCentroids); } if (m_speedUpDistanceCompWithCanopies) { // assign canopies to training data for (int i = 0; i < instances.numInstances(); i++) { m_dataPointCanopyAssignments.add(m_canopyClusters .assignCanopies(instances.instance(i))); } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()]; startExecutorPool(); while (!converged) { if (m_speedUpDistanceCompWithCanopies) { // re-assign canopies to the current cluster centers m_centroidCanopyAssignments.clear(); for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) { m_centroidCanopyAssignments.add(m_canopyClusters .assignCanopies(m_ClusterCentroids.instance(kk))); } } emptyClusterCount = 0; m_Iterations++; converged = true; if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance( toCluster, false, true, m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments .get(i) : null); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } } else { converged = launchAssignToClusters(instances, clusterAssignments); } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true, true); } } } else { emptyClusterCount = launchMoveCentroids(tempI); } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0]; } } // calculate errors if (!m_FastDistanceCalc) { for (i = 0; i < instances.numInstances(); i++) { clusterProcessedInstance(instances.instance(i), true, false, null); } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new double[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = tempI[i].variances(); for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(vals2[j]); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].sumOfWeights(); } m_executorPool.shutdown(); // save memory! m_DistanceFunction.clean(); } /** * Initialize with the canopy centers of the Canopy clustering method * * @param data the training data * @throws Exception if a problem occurs */ protected void canopyInit(Instances data) throws Exception { if (m_canopyClusters == null) { m_canopyClusters = new Canopy(); m_canopyClusters.setNumClusters(m_NumClusters); m_canopyClusters.setSeed(getSeed()); m_canopyClusters.setT2(getCanopyT2()); m_canopyClusters.setT1(getCanopyT1()); m_canopyClusters .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory()); m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate()); m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity()); m_canopyClusters.setDebug(getDebug()); m_canopyClusters.buildClusterer(data); } m_ClusterCentroids = m_canopyClusters.getCanopies(); } /** * Initialize with the fartherst first centers * * @param data the training data * @throws Exception if a problem occurs */ protected void farthestFirstInit(Instances data) throws Exception { FarthestFirst ff = new FarthestFirst(); ff.setNumClusters(m_NumClusters); ff.buildClusterer(data); m_ClusterCentroids = ff.getClusterCentroids(); } /** * Initialize using the k-means++ method * * @param data the training data * @throws Exception if a problem occurs */ protected void kMeansPlusPlusInit(Instances data) throws Exception { Random randomO = new Random(getSeed()); HashMap initC = new HashMap(); // choose initial center uniformly at random int index = randomO.nextInt(data.numInstances()); m_ClusterCentroids.add(data.instance(index)); DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true); initC.put(hk, null); int iteration = 0; int remainingInstances = data.numInstances() - 1; if (m_NumClusters > 1) { // proceed with selecting the rest // distances to the initial randomly chose center double[] distances = new double[data.numInstances()]; double[] cumProbs = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration)); } // now choose the remaining cluster centers for (int i = 1; i < m_NumClusters; i++) { // distances converted to probabilities double[] weights = new double[data.numInstances()]; System.arraycopy(distances, 0, weights, 0, distances.length); Utils.normalize(weights); double sumOfProbs = 0; for (int k = 0; k < data.numInstances(); k++) { sumOfProbs += weights[k]; cumProbs[k] = sumOfProbs; } cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no // rounding issues // choose a random instance double prob = randomO.nextDouble(); for (int k = 0; k < cumProbs.length; k++) { if (prob < cumProbs[k]) { Instance candidateCenter = data.instance(k); hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true); if (!initC.containsKey(hk)) { initC.put(hk, null); m_ClusterCentroids.add(candidateCenter); } else { // we shouldn't get here because any instance that is a duplicate // of // an already chosen cluster center should have zero distance (and // hence // zero probability of getting chosen) to that center. System.err.println("We shouldn't get here...."); } remainingInstances--; break; } } iteration++; if (remainingInstances == 0) { break; } // prepare to choose the next cluster center. // check distances against the new cluster center to see if it is closer for (int k = 0; k < data.numInstances(); k++) { if (distances[k] > 0) { double newDist = m_DistanceFunction.distance(data.instance(k), m_ClusterCentroids.instance(iteration)); if (newDist < distances[k]) { distances[k] = newDist; } } } } } } /** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @param addToCentroidInstances true if the method is to add the computed * coordinates to the Instances holding the centroids * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo, boolean addToCentroidInstances) { double[] vals = new double[members.numAttributes()]; double[][] nominalDists = new double[members.numAttributes()][]; double[] weightMissing = new double[members.numAttributes()]; double[] weightNonMissing = new double[members.numAttributes()]; // Quickly calculate some relevant statistics for (int j = 0; j < members.numAttributes(); j++) { if (members.attribute(j).isNominal()) { nominalDists[j] = new double[members.attribute(j).numValues()]; } } for (Instance inst : members) { for (int j = 0; j < members.numAttributes(); j++) { if (inst.isMissing(j)) { weightMissing[j] += inst.weight(); } else { weightNonMissing[j] += inst.weight(); if (members.attribute(j).isNumeric()) { vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case } else { nominalDists[j][(int)inst.value(j)] += inst.weight(); } } } } for (int j = 0; j < members.numAttributes(); j++) { if (members.attribute(j).isNumeric()) { if (weightNonMissing[j] > 0) { vals[j] /= weightNonMissing[j]; } else { vals[j] = Utils.missingValue(); } } else { double max = -Double.MAX_VALUE; double maxIndex = -1; for (int i = 0; i < nominalDists[j].length; i++) { if (nominalDists[j][i] > max) { max = nominalDists[j][i]; maxIndex = i; } if (max < weightMissing[j]) { vals[j] = Utils.missingValue(); } else { vals[j] = maxIndex; } } } } if (m_DistanceFunction instanceof ManhattanDistance) { // Need to replace means by medians Instances sortedMembers = null; int middle = (members.numInstances() - 1) / 2; boolean dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } for (int j = 0; j < members.numAttributes(); j++) { if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } } } if (updateClusterInfo) { for (int j = 0; j < members.numAttributes(); j++) { m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j]; m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j]; } } if (addToCentroidInstances) { m_ClusterCentroids.add(new DenseInstance(1.0, vals)); } return vals; } /** * clusters an instance that has been through the filters. * * @param instance the instance to assign a cluster to * @param updateErrors if true, update the within clusters sum of errors * @param useFastDistCalc whether to use the fast distance calculation or not * @param instanceCanopies the canopies covering the instance to be clustered, * or null if not using the option to reduce the number of distance * computations via canopies * @return a cluster number */ private int clusterProcessedInstance(Instance instance, boolean updateErrors, boolean useFastDistCalc, long[] instanceCanopies) { double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { double dist; if (useFastDistCalc) { if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null && instanceCanopies.length > 0) { try { if (!Canopy.nonEmptyCanopySetIntersection( m_centroidCanopyAssignments.get(i), instanceCanopies)) { continue; } } catch (Exception ex) { ex.printStackTrace(); } dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i), minDist); } else { dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i), minDist); } } else { dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i)); } if (dist < minDist) { minDist = dist; bestCluster = i; } } if (updateErrors) { if (m_DistanceFunction instanceof EuclideanDistance) { // Euclidean distance to Squared Euclidean distance minDist *= minDist * instance.weight(); } m_squaredErrors[bestCluster] += minDist; } return bestCluster; } /** * Classifies a given instance. * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an interger if the class is * enumerated, otherwise the predicted value * @throws Exception if instance could not be classified successfully */ @Override public int clusterInstance(Instance instance) throws Exception { Instance inst = null; if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.input(instance); m_ReplaceMissingFilter.batchFinished(); inst = m_ReplaceMissingFilter.output(); } else { inst = instance; } return clusterProcessedInstance(inst, false, true, null); } /** * Returns the number of clusters. * * @return the number of clusters generated for a training dataset. * @throws Exception if number of clusters could not be returned successfully */ @Override public int numberOfClusters() throws Exception { return m_NumClusters; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration




© 2015 - 2025 Weber Informatics LLC | Privacy Policy