weka.clusterers.SimpleKMeans
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the developer version, the "bleeding edge" of development: new functionality gets added to this version.

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    SimpleKMeans.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */
package weka.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.ManhattanDistance;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * <!-- globalinfo-start --> Cluster data using the k means algorithm. Can use
 * either the Euclidean distance (default) or the Manhattan distance. If the
 * Manhattan distance is used, then centroids are computed as the
 * component-wise median rather than mean. For more information see:<br/>
 * <br/>
 * D. Arthur, S. Vassilvitskii: k-means++: the advantages of careful seeding.
 * In: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete
 * algorithms, 1027-1035, 2007.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * <!-- technical-bibtex-start --> BibTeX:
 *
 * <pre>
 * &#64;inproceedings{Arthur2007,
 *    author = {D. Arthur and S. Vassilvitskii},
 *    booktitle = {Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms},
 *    pages = {1027-1035},
 *    title = {k-means++: the advantages of careful seeding},
 *    year = {2007}
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 *
 * <pre>
 * -N &lt;num&gt;
 *  Number of clusters.
 *  (default 2).
 * </pre>
 *
 * <pre>
 * -init
 *  Initialization method to use.
 *  0 = random, 1 = k-means++, 2 = canopy, 3 = farthest first.
 *  (default = 0)
 * </pre>
 *
 * <pre>
 * -C
 *  Use canopies to reduce the number of distance calculations.
 * </pre>
 *
 * <pre>
 * -max-candidates &lt;num&gt;
 *  Maximum number of candidate canopies to retain in memory
 *  at any one time when using canopy clustering.
 *  The T2 distance plus the characteristics of the data will
 *  determine how many candidate canopies are formed before
 *  periodic and final pruning are performed, which might result
 *  in excess memory consumption. This setting avoids large numbers
 *  of candidate canopies consuming memory. (default = 100)
 * </pre>
 *
 * <pre>
 * -periodic-pruning &lt;num&gt;
 *  How often to prune low density canopies when using canopy clustering.
 *  (default = every 10,000 training instances)
 * </pre>
 *
 * <pre>
 * -min-density
 *  Minimum canopy density, when using canopy clustering, below which
 *  a canopy will be pruned during periodic pruning. (default = 2 instances)
 * </pre>
 *
 * <pre>
 * -t2
 *  The T2 distance to use when using canopy clustering. Values &lt; 0 indicate that
 *  a heuristic based on attribute std. deviation should be used to set this.
 *  (default = -1.0)
 * </pre>
 *
 * <pre>
 * -t1
 *  The T1 distance to use when using canopy clustering. A value &lt; 0 is taken as a
 *  positive multiplier for T2. (default = -1.5)
 * </pre>
 *
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 *
 * <pre>
 * -M
 *  Don't replace missing values with mean/mode.
 * </pre>
 *
 * <pre>
 * -A &lt;classname and options&gt;
 *  Distance function to use.
 *  (default: weka.core.EuclideanDistance)
 * </pre>
 *
 * <pre>
 * -I &lt;num&gt;
 *  Maximum number of iterations.
 * </pre>
 *
 * <pre>
 * -O
 *  Preserve order of instances.
 * </pre>
 *
 * <pre>
 * -fast
 *  Enables faster distance calculations, using cut-off values.
 *  Disables the calculation/output of squared errors/distances.
 * </pre>
 *
 * <pre>
 * -num-slots &lt;num&gt;
 *  Number of execution slots.
 *  (default 1 - i.e. no parallelism)
 * </pre>
 *
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 *
 * <pre>
 * -output-debug-info
 *  If set, clusterer is run in debug mode and
 *  may output additional info to the console
 * </pre>
 *
 * <pre>
 * -do-not-check-capabilities
 *  If set, clusterer capabilities are not checked before clusterer is built
 *  (use with caution).
 * </pre>
 * <!-- options-end -->
 *
 * @author Mark Hall ([email protected])
 * @author Eibe Frank ([email protected])
 * @version $Revision: 11444 $
 * @see RandomizableClusterer
 */
public class SimpleKMeans extends RandomizableClusterer implements
  NumberOfClustersRequestable, WeightedInstancesHandler,
  TechnicalInformationHandler {

  /** for serialization. */
  static final long serialVersionUID = -3235809600124455376L;

  /**
   * replace missing values in training instances.
   */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate.
   */
  protected int m_NumClusters = 2;

  /**
   * Holds the initial start points, as supplied by the initialization method
   * used.
   */
  protected Instances m_initialStartPoints;

  /**
   * holds the cluster centroids.
   */
  protected Instances m_ClusterCentroids;

  /**
   * Holds the standard deviations of the numeric attributes in each cluster.
   */
  protected Instances m_ClusterStdDevs;

  /**
   * For each cluster, holds the frequency counts for the values of each
   * nominal attribute.
   */
  protected double[][][] m_ClusterNominalCounts;

  protected double[][] m_ClusterMissingCounts;

  /**
   * Stats on the full data set for comparison purposes. For a numeric
   * attribute the value is the mean if the Euclidean distance is being used,
   * or the median if the Manhattan distance is being used; for a nominal
   * attribute it is the mode.
   */
  protected double[] m_FullMeansOrMediansOrModes;

  protected double[] m_FullStdDevs;

  protected double[][] m_FullNominalCounts;

  protected double[] m_FullMissingCounts;

  /**
   * Display standard deviations for numeric atts.
   */
  protected boolean m_displayStdDevs;

  /**
   * Replace missing values globally?
   */
  protected boolean m_dontReplaceMissing = false;

  /**
   * The number of instances in each cluster.
   */
  protected double[] m_ClusterSizes;

  /**
   * Maximum number of iterations to be executed.
   */
  protected int m_MaxIterations = 500;

  /**
   * Keep track of the number of iterations completed before convergence.
   */
  protected int m_Iterations = 0;

  /**
   * Holds the squared errors for all clusters.
   */
  protected double[] m_squaredErrors;

  /** the distance function used. */
  protected DistanceFunction m_DistanceFunction = new EuclideanDistance();

  /**
   * Preserve order of instances.
   */
  protected boolean m_PreserveOrder = false;

  /**
   * Assignments obtained.
   */
  protected int[] m_Assignments = null;

  /** whether to use fast calculation of distances (using a cut-off). */
  protected boolean m_FastDistanceCalc = false;
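
  /*
   * Minimal usage sketch (illustrative only; the data file path is a
   * placeholder). setNumClusters, setMaxIterations, setPreserveInstancesOrder,
   * buildClusterer and clusterInstance are the public API calls exercised:
   *
   *   Instances data = new Instances(new java.io.BufferedReader(
   *     new java.io.FileReader("/path/to/data.arff")));
   *   SimpleKMeans km = new SimpleKMeans();
   *   km.setNumClusters(3);               // equivalent to -N 3
   *   km.setMaxIterations(100);           // equivalent to -I 100
   *   km.setPreserveInstancesOrder(true); // equivalent to -O
   *   km.buildClusterer(data);
   *   int cluster = km.clusterInstance(data.instance(0));
   */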
  public static final int RANDOM = 0;
  public static final int KMEANS_PLUS_PLUS = 1;
  public static final int CANOPY = 2;
  public static final int FARTHEST_FIRST = 3;

  /** Initialization methods. */
  public static final Tag[] TAGS_SELECTION = { new Tag(RANDOM, "Random"),
    new Tag(KMEANS_PLUS_PLUS, "k-means++"), new Tag(CANOPY, "Canopy"),
    new Tag(FARTHEST_FIRST, "Farthest first") };

  /** The initialization method to use. */
  protected int m_initializationMethod = RANDOM;

  /**
   * Whether to reduce the number of distance calcs done by k-means with
   * canopies.
   */
  protected boolean m_speedUpDistanceCompWithCanopies = false;

  /** Canopies that each centroid falls into (determined by T1 radius). */
  protected List<long[]> m_centroidCanopyAssignments;

  /**
   * Canopies that each training instance falls into (determined by T1 radius).
   */
  protected List<long[]> m_dataPointCanopyAssignments;

  /** The canopy clusterer (if being used). */
  protected Canopy m_canopyClusters;

  /**
   * The maximum number of candidate canopies to hold in memory at any one time
   * (if using canopy clustering).
   */
  protected int m_maxCanopyCandidates = 100;

  /**
   * Prune low-density candidate canopies after every x instances have been
   * seen (if using canopy clustering).
   */
  protected int m_periodicPruningRate = 10000;

  /**
   * The minimum cluster density (according to T2 distance) allowed. Used when
   * periodically pruning candidate canopies (if using canopy clustering).
   */
  protected double m_minClusterDensity = 2;

  /** The t2 radius to pass through to Canopy. */
  protected double m_t2 = Canopy.DEFAULT_T2;

  /** The t1 radius to pass through to Canopy. */
  protected double m_t1 = Canopy.DEFAULT_T1;

  /** Number of threads to run. */
  protected int m_executionSlots = 1;

  /** For parallel execution mode. */
  protected transient ExecutorService m_executorPool;

  /**
   * the default constructor.
   */
  public SimpleKMeans() {
    super();

    m_SeedDefault = 10;
    setSeed(m_SeedDefault);
  }

  /**
   * Start the pool of execution threads.
   */
  protected void startExecutorPool() {
    if (m_executorPool != null) {
      m_executorPool.shutdownNow();
    }

    m_executorPool = Executors.newFixedThreadPool(m_executionSlots);
  }

  protected int m_completed;
  protected int m_failed;

  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "D. Arthur and S. Vassilvitskii");
    result.setValue(Field.TITLE,
      "k-means++: the advantages of careful seeding");
    result.setValue(Field.BOOKTITLE, "Proceedings of the eighteenth annual "
      + "ACM-SIAM symposium on Discrete algorithms");
    result.setValue(Field.YEAR, "2007");
    result.setValue(Field.PAGES, "1027-1035");

    return result;
  }

  /**
   * Returns a string describing this clusterer.
   *
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the k means algorithm. Can use either "
      + "the Euclidean distance (default) or the Manhattan distance."
      + " If the Manhattan distance is used, then centroids are computed "
      + "as the component-wise median rather than mean."
      + " For more information see:\n\n" + getTechnicalInformation().toString();
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return the capabilities of this clusterer
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();
    result.enable(Capability.NO_CLASS);

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  private class KMeansComputeCentroidTask implements Callable<double[]> {

    protected Instances m_cluster;
    protected int m_centroidIndex;

    public KMeansComputeCentroidTask(int centroidIndex, Instances cluster) {
      m_cluster = cluster;
      m_centroidIndex = centroidIndex;
    }

    @Override
    public double[] call() {
      return moveCentroid(m_centroidIndex, m_cluster, true, false);
    }
  }

  /**
   * Launch the move centroids tasks.
   *
   * @param clusters the cluster centroids
   * @return the number of empty clusters
   */
  protected int launchMoveCentroids(Instances[] clusters) {
    int emptyClusterCount = 0;
    List<Future<double[]>> results = new ArrayList<Future<double[]>>();

    for (int i = 0; i < m_NumClusters; i++) {
      if (clusters[i].numInstances() == 0) {
        emptyClusterCount++;
      } else {
        Future<double[]> futureCentroid = m_executorPool
          .submit(new KMeansComputeCentroidTask(i, clusters[i]));
        results.add(futureCentroid);
      }
    }

    try {
      for (Future<double[]> d : results) {
        m_ClusterCentroids.add(new DenseInstance(1.0, d.get()));
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    return emptyClusterCount;
  }

  private class KMeansClusterTask implements Callable<Boolean> {

    protected int m_start;
    protected int m_end;
    protected Instances m_inst;
    protected int[] m_clusterAssignments;

    public KMeansClusterTask(Instances inst, int start, int end,
      int[] clusterAssignments) {
      m_start = start;
      m_end = end;
      m_inst = inst;
      m_clusterAssignments = clusterAssignments;
    }

    @Override
    public Boolean call() {
      boolean converged = true;
      for (int i = m_start; i < m_end; i++) {
        Instance toCluster = m_inst.instance(i);
        long[] instanceCanopies = m_speedUpDistanceCompWithCanopies
          ? m_dataPointCanopyAssignments.get(i) : null;
        int newC = clusterInstance(toCluster, instanceCanopies);
        if (newC != m_clusterAssignments[i]) {
          converged = false;
        }
        m_clusterAssignments[i] = newC;
      }

      return converged;
    }

    protected int clusterInstance(Instance inst, long[] instanceCanopies) {
      double minDist = Integer.MAX_VALUE;
      int bestCluster = 0;
      for (int i = 0; i < m_NumClusters; i++) {
        double dist;
        if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null
          && instanceCanopies.length > 0) {
          try {
            if (!Canopy.nonEmptyCanopySetIntersection(
              m_centroidCanopyAssignments.get(i), instanceCanopies)) {
              // System.err.println("Skipping distance calc... "
" // + Canopy.printSingleAssignment(instanceCanopies)); continue; } } catch (Exception ex) { ex.printStackTrace(); } } dist = m_DistanceFunction.distance(inst, m_ClusterCentroids.instance(i), minDist); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } } /** * Launch the tasks that assign instances to clusters * * @param insts the instances to be clustered * @param clusterAssignments the array of cluster assignments * @return true if k means has converged * @throws Exception if a problem occurs */ protected boolean launchAssignToClusters(Instances insts, int[] clusterAssignments) throws Exception { int numPerTask = insts.numInstances() / m_executionSlots; List> results = new ArrayList>(); for (int i = 0; i < m_executionSlots; i++) { int start = i * numPerTask; int end = start + numPerTask; if (i == m_executionSlots - 1) { end = insts.numInstances(); } Future futureKM = m_executorPool.submit(new KMeansClusterTask(insts, start, end, clusterAssignments)); results.add(futureKM); } boolean converged = true; for (Future f : results) { if (!f.get()) { converged = false; } } return converged; } /** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { m_canopyClusters = null; // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][]; m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = instances.variances(); } m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false); m_FullMissingCounts = m_ClusterMissingCounts[0]; m_FullNominalCounts = m_ClusterNominalCounts[0]; double sumOfWeights = instances.sumOfWeights(); for (int i = 0; i < instances.numAttributes(); i++) { if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]); } if (m_FullMissingCounts[i] == sumOfWeights) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils .maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } if (m_speedUpDistanceCompWithCanopies) { m_canopyClusters = new Canopy(); m_canopyClusters.setNumClusters(m_NumClusters); m_canopyClusters.setSeed(getSeed()); m_canopyClusters.setT2(getCanopyT2()); m_canopyClusters.setT1(getCanopyT1()); m_canopyClusters 
  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer that
   * are not being set via options.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the clusterer has not been generated successfully
   */
  @Override
  public void buildClusterer(Instances data) throws Exception {
    m_canopyClusters = null;

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.setInputFormat(instances);
      instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_ClusterNominalCounts =
      new double[m_NumClusters][instances.numAttributes()][];
    m_ClusterMissingCounts =
      new double[m_NumClusters][instances.numAttributes()];
    if (m_displayStdDevs) {
      m_FullStdDevs = instances.variances();
    }

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);

    m_FullMissingCounts = m_ClusterMissingCounts[0];
    m_FullNominalCounts = m_ClusterNominalCounts[0];
    double sumOfWeights = instances.sumOfWeights();
    for (int i = 0; i < instances.numAttributes(); i++) {
      if (instances.attribute(i).isNumeric()) {
        if (m_displayStdDevs) {
          m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
        }
        if (m_FullMissingCounts[i] == sumOfWeights) {
          m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
        }
      } else {
        if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils
          .maxIndex(m_FullNominalCounts[i])]) {
          m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                               // value
        }
      }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
      m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap<DecisionTableHashKey, Integer> initC =
      new HashMap<DecisionTableHashKey, Integer>();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder) {
      initInstances = new Instances(instances);
    } else {
      initInstances = instances;
    }

    if (m_speedUpDistanceCompWithCanopies) {
      m_canopyClusters = new Canopy();
      m_canopyClusters.setNumClusters(m_NumClusters);
      m_canopyClusters.setSeed(getSeed());
      m_canopyClusters.setT2(getCanopyT2());
      m_canopyClusters.setT1(getCanopyT1());
      m_canopyClusters
        .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
      m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
      m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
      m_canopyClusters.setDebug(getDebug());
      m_canopyClusters.buildClusterer(initInstances);
      // System.err.println(m_canopyClusters);
      m_centroidCanopyAssignments = new ArrayList<long[]>();
      m_dataPointCanopyAssignments = new ArrayList<long[]>();
    }

    if (m_initializationMethod == KMEANS_PLUS_PLUS) {
      kMeansPlusPlusInit(initInstances);
      m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else if (m_initializationMethod == CANOPY) {
      canopyInit(initInstances);
      m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
    } else if (m_initializationMethod == FARTHEST_FIRST) {
      farthestFirstInit(initInstances);
      m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else {
      // random
      for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex),
          initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
          m_ClusterCentroids.add(initInstances.instance(instIndex));
          initC.put(hk, null);
        }
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
          break;
        }
      }

      m_initialStartPoints = new Instances(m_ClusterCentroids);
    }

    if (m_speedUpDistanceCompWithCanopies) {
      // assign canopies to training data
      for (int i = 0; i < instances.numInstances(); i++) {
        m_dataPointCanopyAssignments.add(m_canopyClusters
          .assignCanopies(instances.instance(i)));
      }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts =
      new double[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts =
      new double[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    while (!converged) {
      if (m_speedUpDistanceCompWithCanopies) {
        // re-assign canopies to the current cluster centers
        m_centroidCanopyAssignments.clear();
        for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
          m_centroidCanopyAssignments.add(m_canopyClusters
            .assignCanopies(m_ClusterCentroids.instance(kk)));
        }
      }

      emptyClusterCount = 0;
      m_Iterations++;
      converged = true;

      if (m_executionSlots <= 1
        || instances.numInstances() < 2 * m_executionSlots) {
        for (i = 0; i < instances.numInstances(); i++) {
          Instance toCluster = instances.instance(i);
          int newC = clusterProcessedInstance(toCluster, false, true,
            m_speedUpDistanceCompWithCanopies
              ? m_dataPointCanopyAssignments.get(i) : null);
          if (newC != clusterAssignments[i]) {
            converged = false;
          }
          clusterAssignments[i] = newC;
        }
      } else {
        converged = launchAssignToClusters(instances, clusterAssignments);
      }

      // update centroids
      m_ClusterCentroids = new Instances(instances, m_NumClusters);
      for (i = 0; i < m_NumClusters; i++) {
        tempI[i] = new Instances(instances, 0);
      }
      for (i = 0; i < instances.numInstances(); i++) {
        tempI[clusterAssignments[i]].add(instances.instance(i));
      }

      if (m_executionSlots <= 1
        || instances.numInstances() < 2 * m_executionSlots) {
        for (i = 0; i < m_NumClusters; i++) {
          if (tempI[i].numInstances() == 0) {
            // empty cluster
            emptyClusterCount++;
          } else {
            moveCentroid(i, tempI[i], true, true);
          }
        }
      } else {
        emptyClusterCount = launchMoveCentroids(tempI);
      }

      if (m_Iterations == m_MaxIterations) {
        converged = true;
      }

      if (emptyClusterCount > 0) {
        m_NumClusters -= emptyClusterCount;
        if (converged) {
          Instances[] t = new Instances[m_NumClusters];
          int index = 0;
          for (int k = 0; k < tempI.length; k++) {
            if (tempI[k].numInstances() > 0) {
              t[index] = tempI[k];
              for (i = 0; i < tempI[k].numAttributes(); i++) {
                m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
              }
              index++;
            }
          }
          tempI = t;
        } else {
          tempI = new Instances[m_NumClusters];
        }
      }

      if (!converged) {
        m_ClusterNominalCounts =
          new double[m_NumClusters][instances.numAttributes()][0];
      }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
      for (i = 0; i < instances.numInstances(); i++) {
        clusterProcessedInstance(instances.instance(i), true, false, null);
      }
    }

    if (m_displayStdDevs) {
      m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new double[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
      if (m_displayStdDevs) {
        double[] vals2 = tempI[i].variances();
        for (int j = 0; j < instances.numAttributes(); j++) {
          if (instances.attribute(j).isNumeric()) {
            vals2[j] = Math.sqrt(vals2[j]);
          } else {
            vals2[j] = Utils.missingValue();
          }
        }
        m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
      }
      m_ClusterSizes[i] = tempI[i].sumOfWeights();
    }

    m_executorPool.shutdown();

    // save memory!
    m_DistanceFunction.clean();
  }

  /**
   * Initialize with the canopy centers of the Canopy clustering method.
   *
   * @param data the training data
   * @throws Exception if a problem occurs
   */
  protected void canopyInit(Instances data) throws Exception {
    if (m_canopyClusters == null) {
      m_canopyClusters = new Canopy();
      m_canopyClusters.setNumClusters(m_NumClusters);
      m_canopyClusters.setSeed(getSeed());
      m_canopyClusters.setT2(getCanopyT2());
      m_canopyClusters.setT1(getCanopyT1());
      m_canopyClusters
        .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
      m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
      m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
      m_canopyClusters.setDebug(getDebug());
      m_canopyClusters.buildClusterer(data);
    }
    m_ClusterCentroids = m_canopyClusters.getCanopies();
  }

  /**
   * Initialize with the farthest first centers.
   *
   * @param data the training data
   * @throws Exception if a problem occurs
   */
  protected void farthestFirstInit(Instances data) throws Exception {
    FarthestFirst ff = new FarthestFirst();
    ff.setNumClusters(m_NumClusters);
    ff.buildClusterer(data);

    m_ClusterCentroids = ff.getClusterCentroids();
  }
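
  /*
   * Background for kMeansPlusPlusInit below: in the k-means++ scheme of
   * Arthur and Vassilvitskii (2007), the first center is chosen uniformly at
   * random and each further center is sampled with probability proportional
   * to a point's distance from its nearest already-chosen center (the paper
   * weights by the squared distance). The method implements this by
   * normalizing the distances[] array into a cumulative distribution
   * (cumProbs), drawing a uniform random number to pick the next center, and
   * then refreshing distances[] so that every point records the distance to
   * its nearest chosen center. Duplicates of chosen centers have zero
   * distance, hence zero probability of being selected again.
   */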
  /**
   * Initialize using the k-means++ method.
   *
   * @param data the training data
   * @throws Exception if a problem occurs
   */
  protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, Integer> initC =
      new HashMap<DecisionTableHashKey, Integer>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index),
      data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
      // proceed with selecting the rest

      // distances to the initial randomly chosen center
      double[] distances = new double[data.numInstances()];
      double[] cumProbs = new double[data.numInstances()];
      for (int i = 0; i < data.numInstances(); i++) {
        distances[i] = m_DistanceFunction.distance(data.instance(i),
          m_ClusterCentroids.instance(iteration));
      }

      // now choose the remaining cluster centers
      for (int i = 1; i < m_NumClusters; i++) {
        // distances converted to probabilities
        double[] weights = new double[data.numInstances()];
        System.arraycopy(distances, 0, weights, 0, distances.length);
        Utils.normalize(weights);

        double sumOfProbs = 0;
        for (int k = 0; k < data.numInstances(); k++) {
          sumOfProbs += weights[k];
          cumProbs[k] = sumOfProbs;
        }

        cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no
                                                 // rounding issues

        // choose a random instance
        double prob = randomO.nextDouble();
        for (int k = 0; k < cumProbs.length; k++) {
          if (prob < cumProbs[k]) {
            Instance candidateCenter = data.instance(k);
            hk = new DecisionTableHashKey(candidateCenter,
              data.numAttributes(), true);
            if (!initC.containsKey(hk)) {
              initC.put(hk, null);
              m_ClusterCentroids.add(candidateCenter);
            } else {
              // we shouldn't get here because any instance that is a duplicate
              // of an already chosen cluster center should have zero distance
              // (and hence zero probability of getting chosen) to that center.
              System.err.println("We shouldn't get here....");
            }
            remainingInstances--;
            break;
          }
        }
        iteration++;

        if (remainingInstances == 0) {
          break;
        }

        // prepare to choose the next cluster center:
        // check distances against the new cluster center to see if it is
        // closer
        for (int k = 0; k < data.numInstances(); k++) {
          if (distances[k] > 0) {
            double newDist = m_DistanceFunction.distance(data.instance(k),
              m_ClusterCentroids.instance(iteration));
            if (newDist < distances[k]) {
              distances[k] = newDist;
            }
          }
        }
      }
    }
  }

  /**
   * Move the centroid to its new coordinates. Generate the centroid
   * coordinates based on its members (objects assigned to the cluster of the
   * centroid) and the distance function being used.
   *
   * @param centroidIndex index of the centroid for which the coordinates will
   *          be computed
   * @param members the objects that are assigned to the cluster of this
   *          centroid
   * @param updateClusterInfo if the method is supposed to update the m_Cluster
   *          arrays
   * @param addToCentroidInstances true if the method is to add the computed
   *          coordinates to the Instances holding the centroids
   * @return the centroid coordinates
   */
  protected double[] moveCentroid(int centroidIndex, Instances members,
    boolean updateClusterInfo, boolean addToCentroidInstances) {

    double[] vals = new double[members.numAttributes()];
    double[][] nominalDists = new double[members.numAttributes()][];
    double[] weightMissing = new double[members.numAttributes()];
    double[] weightNonMissing = new double[members.numAttributes()];

    // Quickly calculate some relevant statistics
    for (int j = 0; j < members.numAttributes(); j++) {
      if (members.attribute(j).isNominal()) {
        nominalDists[j] = new double[members.attribute(j).numValues()];
      }
    }
    for (Instance inst : members) {
      for (int j = 0; j < members.numAttributes(); j++) {
        if (inst.isMissing(j)) {
          weightMissing[j] += inst.weight();
        } else {
          weightNonMissing[j] += inst.weight();
          if (members.attribute(j).isNumeric()) {
            vals[j] += inst.weight() * inst.value(j); // Will be overwritten in
                                                      // Manhattan case
          } else {
            nominalDists[j][(int) inst.value(j)] += inst.weight();
          }
        }
      }
    }
    for (int j = 0; j < members.numAttributes(); j++) {
      if (members.attribute(j).isNumeric()) {
        if (weightNonMissing[j] > 0) {
          vals[j] /= weightNonMissing[j];
        } else {
          vals[j] = Utils.missingValue();
        }
      } else {
        double max = -Double.MAX_VALUE;
        double maxIndex = -1;
        for (int i = 0; i < nominalDists[j].length; i++) {
          if (nominalDists[j][i] > max) {
            max = nominalDists[j][i];
            maxIndex = i;
          }
          if (max < weightMissing[j]) {
            vals[j] = Utils.missingValue();
          } else {
            vals[j] = maxIndex;
          }
        }
      }
    }

    if (m_DistanceFunction instanceof ManhattanDistance) {
      // Need to replace means by medians
      Instances sortedMembers = null;
      int middle = (members.numInstances() - 1) / 2;
      boolean dataIsEven = ((members.numInstances() % 2) == 0);
      if (m_PreserveOrder) {
        sortedMembers = members;
      } else {
        sortedMembers = new Instances(members);
      }
      for (int j = 0; j < members.numAttributes(); j++) {
        if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) {
          // singleton special case
          if (members.numInstances() == 1) {
            vals[j] = members.instance(0).value(j);
          } else {
            vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
            if (dataIsEven) {
              vals[j] =
                (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
            }
          }
        }
      }
    }

    if (updateClusterInfo) {
      for (int j = 0; j < members.numAttributes(); j++) {
        m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j];
        m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j];
      }
    }

    if (addToCentroidInstances) {
      m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }

    return vals;
  }
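
  /*
   * Why moveCentroid switches from means to medians for ManhattanDistance:
   * the component-wise median minimizes the sum of absolute (L1) deviations,
   * just as the mean minimizes the sum of squared (L2) deviations. A quick
   * check with the values {1, 3, 10}: the median 3 gives an L1 sum of
   * |1-3| + |3-3| + |10-3| = 9, while the mean 14/3 gives the larger sum of
   * roughly 10.67.
   */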
  /**
   * Clusters an instance that has been through the filters.
   *
   * @param instance the instance to assign a cluster to
   * @param updateErrors if true, update the within clusters sum of errors
   * @param useFastDistCalc whether to use the fast distance calculation or not
   * @param instanceCanopies the canopies covering the instance to be
   *          clustered, or null if not using the option to reduce the number
   *          of distance computations via canopies
   * @return a cluster number
   */
  private int clusterProcessedInstance(Instance instance, boolean updateErrors,
    boolean useFastDistCalc, long[] instanceCanopies) {
    double minDist = Integer.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist;
      if (useFastDistCalc) {
        if (m_speedUpDistanceCompWithCanopies && instanceCanopies != null
          && instanceCanopies.length > 0) {
          try {
            if (!Canopy.nonEmptyCanopySetIntersection(
              m_centroidCanopyAssignments.get(i), instanceCanopies)) {
              continue;
            }
          } catch (Exception ex) {
            ex.printStackTrace();
          }
          dist = m_DistanceFunction.distance(instance,
            m_ClusterCentroids.instance(i), minDist);
        } else {
          dist = m_DistanceFunction.distance(instance,
            m_ClusterCentroids.instance(i), minDist);
        }
      } else {
        dist = m_DistanceFunction.distance(instance,
          m_ClusterCentroids.instance(i));
      }
      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }

    if (updateErrors) {
      if (m_DistanceFunction instanceof EuclideanDistance) {
        // Euclidean distance to Squared Euclidean distance
        minDist *= minDist * instance.weight();
      }
      m_squaredErrors[bestCluster] += minDist;
    }

    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer if the class is
   *         enumerated, otherwise the predicted value
   * @throws Exception if instance could not be classified successfully
   */
  @Override
  public int clusterInstance(Instance instance) throws Exception {
    Instance inst = null;
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.input(instance);
      m_ReplaceMissingFilter.batchFinished();
      inst = m_ReplaceMissingFilter.output();
    } else {
      inst = instance;
    }

    return clusterProcessedInstance(inst, false, true, null);
  }

  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset.
   * @throws Exception if number of clusters could not be returned successfully
   */
  @Override
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration



