All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.clusterers.ClusterUtils Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ClusterUtils.java
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.ArrayList;
import java.util.List;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SelectedTag;
import weka.core.Utils;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NominalStats;
import weka.core.stats.NumericStats;
import weka.distributed.CSVToARFFHeaderMapTask;
import weka.distributed.CSVToARFFHeaderReduceTask;
import weka.distributed.DistributedWekaException;
import weka.distributed.KMeansReduceTask;

/**
 * Some static utils methods for clustering
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: $
 */
public class ClusterUtils {

  /**
   * Private constructor to prevent instantiation of this utility class.
   */
  private ClusterUtils() {
    // prevent instantiation of this utility class
  }

  /**
   * Utility method to perform the last phase of the k-means|| initialization -
   * i.e. takes the final centroid sketches and the clustering stats that show
   * how many training points are assigned to each instance (candidate center)
   * in each sketch, and then weights the sketch centroids and runs a local
   * KMeans on the weighted data to get the final requested number of starting
   * points.
   * 
   * @param numRuns number of runs of k-means being done
   * @param numClusters the requested number of clusters (and thus starting
   *          points for k-means)
   * @param finalSketches the final centroid sketches produced by the first
   *          phase of k-means|| (one per run)
   * @param statsForSketches the clustering statistics for the final sketches
   *          (one per run)
   * @param debug true if debugging info is to be output
   * @return a final list of starting points (one set of instances for each run
   *         of k-means to be performed)
   * @throws DistributedWekaException if a problem occurs
   */
  public static List<Instances> weightSketchesAndClusterToFinalStartPoints(
    int numRuns, int numClusters, CentroidSketch[] finalSketches,
    KMeansReduceTask[] statsForSketches, boolean debug)
    throws DistributedWekaException {

    List<Instances> finalStartPointsForRuns = new ArrayList<Instances>();

    for (int i = 0; i < numRuns; i++) {
      List<Instances> centroidSummaries =
        statsForSketches[i].getAggregatedCentroidSummaries();
      Instances sketchForRun = finalSketches[i].getCurrentSketch();

      // empty clusters shouldn't be a problem - in
      // one iteration each sketch member should at minimum
      // have itself assigned (i.e. count >= 1). NOTE: The only exception
      // could occur if the sketch contains duplicate instances. However,
      // this shouldn't happen within a single WeightedReservoirSampling
      // as candidate instances with weight 0 (i.e. distance 0 to the sketch
      // in this case) are never added to the sketch.
      if (centroidSummaries.size() != sketchForRun.numInstances()) {
        throw new DistributedWekaException(
          "Was expecting as many summary headers as "
            + "there are center candidates in the sketch for run " + i);
      }

      for (int j = 0; j < sketchForRun.numInstances(); j++) {
        Instance centerCandidate = sketchForRun.instance(j);
        Instances centerStats = centroidSummaries.get(j);

        // the number of training points assigned to this candidate; computed
        // from the summary stats of the first numeric or nominal attribute
        // (count + missing gives the total number of assigned rows for any
        // such attribute)
        double weightForCandidate = -1.0;
        for (int k = 0; k < sketchForRun.numAttributes(); k++) {

          if (sketchForRun.attribute(k).isNumeric()) {
            Attribute statsAtt =
              centerStats
                .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                  + sketchForRun.attribute(k).name());
            weightForCandidate =
              ArffSummaryNumericMetric.COUNT.valueFromAttribute(statsAtt) +
                ArffSummaryNumericMetric.MISSING.valueFromAttribute(statsAtt);
            break;
          } else if (sketchForRun.attribute(k).isNominal()) {
            Attribute statsAtt =
              centerStats
                .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                  + sketchForRun.attribute(k).name());
            NominalStats ns = NominalStats.attributeToStats(statsAtt);
            weightForCandidate = 0;
            for (String s : ns.getLabels()) {
              weightForCandidate += ns.getCount(s);
            }
            weightForCandidate += ns.getNumMissing();
            // stop at the first usable attribute, just like the numeric
            // branch - scanning further would only recompute the same total
            break;
          }
        }

        if (weightForCandidate < 0) {
          throw new DistributedWekaException(
            "Unable to compute the number of training instances "
              + "assigned to sketch member " + j + " in run " + i);
        }

        // finally - set the weight
        centerCandidate.setWeight(weightForCandidate);
      }

      if (debug) {
        System.err
          .println("********** Final weighted sketch prior to local KMeans:\n"
            + sketchForRun);
      }

      // now run standard k-means on the weighted sketch to
      // (hopefully) get the requested number of start points
      SimpleKMeans localKMeans = new SimpleKMeans();
      try {
        localKMeans.setNumClusters(numClusters);
        localKMeans.setInitializationMethod(new SelectedTag(
          SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
        localKMeans.buildClusterer(sketchForRun);
        finalStartPointsForRuns.add(localKMeans.getClusterCentroids());
      } catch (Exception ex) {
        // preserve the original exception as the cause
        throw new DistributedWekaException(ex);
      }
    }

    return finalStartPointsForRuns;
  }

  /**
   * Compute priming data for a distance function (i.e. data that can be used
   * to initialize the mins and maxes of numeric variables for normalization
   * purposes).
   * 
   * @param headerWithSummary header with summary attributes
   * @return a two-instance data set containing mins (first instance) and maxes
   *         (second instance) for all numeric attributes. Nominal attributes
   *         will have missing values
   * @throws DistributedWekaException if a problem occurs
   */
  public static Instances getPrimingDataForDistanceFunction(
    Instances headerWithSummary) throws DistributedWekaException {

    Instances headerNoSummary =
      CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);

    ArrayList<Attribute> atts = new ArrayList<Attribute>();
    double[] mins = new double[headerNoSummary.numAttributes()];
    double[] maxes = new double[headerNoSummary.numAttributes()];

    for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
      Attribute orig = headerNoSummary.attribute(i);
      Attribute summary =
        headerWithSummary
          .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
            + orig.name());

      atts.add((Attribute) orig.copy());

      if (orig.isNumeric()) {

        // number of non-missing values
        double count =
          NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.COUNT
            .ordinal()];
        // NOTE(review): numeric attributes with count <= 2 fall through and
        // keep the array default of 0 for min/max - presumably deliberate
        // (too few values to establish a meaningful range); confirm
        if (count > 2) {
          mins[i] =
            NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MIN
              .ordinal()];
          maxes[i] =
            NumericStats.attributeToStats(summary).getStats()[ArffSummaryNumericMetric.MAX
              .ordinal()];
        }
      } else if (orig.isNominal()) {
        // min/max don't apply to nominal attributes
        mins[i] = Utils.missingValue();
        maxes[i] = Utils.missingValue();
      }
    }

    Instances dummy = new Instances("Dummy", atts, 0);
    dummy.add(new DenseInstance(1.0, mins));
    dummy.add(new DenseInstance(1.0, maxes));

    return dummy;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy