weka.clusterers.MakeDensityBasedClusterer Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MakeDensityBasedClusterer.java
 *    Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.estimators.DiscreteEstimator;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * Class for wrapping a Clusterer to make it return a distribution and
 * density. Fits normal distributions and discrete distributions within each
 * cluster produced by the wrapped clusterer. Supports the
 * NumberOfClustersRequestable interface only if the wrapped Clusterer does.
 * <p>
 * Valid options are:
 *
 * <pre>
 * -M &lt;num&gt;
 *  minimum allowable standard deviation for normal density computation
 *  (default 1e-6)
 * </pre>
 *
 * <pre>
 * -W &lt;clusterer name&gt;
 *  Clusterer to wrap.
 *  (default weka.clusterers.SimpleKMeans)
 * </pre>
 *
 * <pre>
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 *
 * <pre>
 * -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).
 * </pre>
 *
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 *
 * <pre>
 * -M
 *  Replace missing values with mean/mode.
 * </pre>
 *
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 *
 * <p>
 * Options after "--" are passed on to the base clusterer.
 *
 * @author Richard Kirkby ([email protected])
 * @author Mark Hall ([email protected])
 * @author Eibe Frank ([email protected])
 * @version $Revision: 10203 $
 */
public class MakeDensityBasedClusterer extends AbstractDensityBasedClusterer
  implements NumberOfClustersRequestable, OptionHandler,
  WeightedInstancesHandler {

  /** for serialization */
  static final long serialVersionUID = -5643302427972186631L;

  /**
   * holds training instances header information; created in buildClusterer
   * with new Instances(data, 0) after missing values have been replaced, and
   * read by toString() for attribute names and types
   */
  private Instances m_theInstances;
  /**
   * prior probabilities for the fitted clusters: per-cluster sums of instance
   * weights, each started at 1.0 (laplace correction) and then passed through
   * Utils.normalize
   */
  private double[] m_priors;
  /**
   * normal distributions fitted to each numeric attribute in each cluster,
   * indexed [cluster][attribute][k] where k = 0 holds the weighted mean and
   * k = 1 the standard deviation
   */
  private double[][][] m_modelNormal;
  /**
   * discrete distributions fitted to each discrete attribute in each cluster,
   * indexed [cluster][attribute]; only entries for nominal attributes are
   * created, all others remain null
   */
  private DiscreteEstimator[][] m_model;
  /**
   * default minimum standard deviation; fitted standard deviations at or
   * below this value are replaced in buildClusterer
   */
  private double m_minStdDev = 1e-6;
  /** The clusterer being wrapped (a SimpleKMeans instance unless replaced) */
  private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
  /**
   * globally replace missing values; the filter is set up on the training
   * data in buildClusterer and applied to every instance handed to
   * logDensityPerClusterForInstance
   */
  private ReplaceMissingValues m_replaceMissing;

  /**
   * Default constructor.
   * 
   */
  public MakeDensityBasedClusterer() {
    super();
  }

  /**
   * Constructs a MakeDensityBasedClusterer wrapping a given Clusterer.
   * 
   * @param toWrap the clusterer to wrap around; replaces the default
   *          SimpleKMeans instance
   */
  public MakeDensityBasedClusterer(Clusterer toWrap) {
    super();
    this.setClusterer(toWrap);
  }

  /**
   * Returns a string describing this clusterer.
   * 
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {
    // One constant per sentence; the pieces are compile-time constants, so the
    // concatenation below is still folded into a single literal.
    final String purpose =
      "Class for wrapping a Clusterer to make it return a distribution and density.";
    final String method =
      "Fits normal distributions and discrete distributions within each cluster "
        + "produced by the wrapped clusterer.";
    final String caveat =
      "Supports the NumberOfClustersRequestable interface only if the wrapped "
        + "Clusterer does.";

    return purpose + " " + method + " " + caveat;
  }

  /**
   * Names the clusterer that is wrapped by default.
   * 
   * @return the fully qualified class name of SimpleKMeans
   */
  protected String defaultClustererString() {
    final Class<?> defaultClusterer = SimpleKMeans.class;
    return defaultClusterer.getName();
  }

  /**
   * Forwards the requested number of clusters to the wrapped clusterer.
   * 
   * @param n the number of clusters to generate
   * @throws Exception if the wrapped clusterer has not been set, or if the
   *           wrapped clusterer does not implement
   *           NumberOfClustersRequestable.
   */
  @Override
  public void setNumClusters(int n) throws Exception {
    final String failurePrefix =
      "Can't set the number of clusters to generate - ";
    final Clusterer target = m_wrappedClusterer;

    if (target == null) {
      throw new Exception(failurePrefix + "no clusterer has been set yet.");
    }

    if (target instanceof NumberOfClustersRequestable) {
      ((NumberOfClustersRequestable) target).setNumClusters(n);
      return;
    }

    throw new Exception(failurePrefix
      + "wrapped clusterer does not support this facility.");
  }

  /**
   * Returns the capabilities of this clusterer, which are those of the wrapped
   * clusterer. Without a wrapped clusterer only NO_CLASS is enabled.
   * 
   * @return the capabilities of this clusterer
   */
  @Override
  public Capabilities getCapabilities() {
    if (m_wrappedClusterer == null) {
      // nothing to delegate to: start from the inherited set and keep only
      // the "no class attribute" capability
      Capabilities fallback = super.getCapabilities();
      fallback.disableAll();
      fallback.enable(Capability.NO_CLASS);
      return fallback;
    }

    return m_wrappedClusterer.getCapabilities();
  }

  /**
   * Builds a clusterer for a set of instances.
   * <p>
   * Missing values are replaced, the wrapped clusterer is trained on the
   * result and every training instance is assigned to one of its clusters.
   * Per cluster, a discrete estimator is fitted to each nominal attribute and
   * a weighted mean and standard deviation to each numeric attribute. The
   * cluster priors are the laplace-corrected sums of instance weights, passed
   * through Utils.normalize.
   * 
   * @param data the instances to train the clusterer with
   * @throws Exception if the clusterer hasn't been set or something goes wrong
   */
  @Override
  public void buildClusterer(Instances data) throws Exception {
    // Fail fast: without a wrapped clusterer none of the work below is useful.
    if (m_wrappedClusterer == null) {
      throw new Exception("No clusterer has been set");
    }

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_replaceMissing = new ReplaceMissingValues();
    m_replaceMissing.setInputFormat(data);
    data = weka.filters.Filter.useFilter(data, m_replaceMissing);

    m_theInstances = new Instances(data, 0);
    m_wrappedClusterer.buildClusterer(data);

    // These do not change while the model is fitted, so query them once.
    final int numClusters = m_wrappedClusterer.numberOfClusters();
    final int numAtts = data.numAttributes();
    final int numInsts = data.numInstances();

    m_model = new DiscreteEstimator[numClusters][numAtts];
    m_modelNormal = new double[numClusters][numAtts][2];
    // weights[c][a]: summed weight of the non-missing values of attribute a
    // seen in cluster c (only maintained for non-nominal attributes)
    double[][] weights = new double[numClusters][numAtts];
    m_priors = new double[numClusters];

    for (int i = 0; i < numClusters; i++) {
      m_priors[i] = 1.0; // laplace correction
      for (int j = 0; j < numAtts; j++) {
        if (data.attribute(j).isNominal()) {
          m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),
            true);
        }
      }
    }

    // Pass 1: assign instances to clusters, update priors and discrete
    // estimators, and accumulate the weighted sums needed for the means.
    int[] clusterIndex = new int[numInsts];
    for (int i = 0; i < numInsts; i++) {
      Instance inst = data.instance(i);
      int cluster = m_wrappedClusterer.clusterInstance(inst);
      m_priors[cluster] += inst.weight();
      for (int j = 0; j < numAtts; j++) {
        if (!inst.isMissing(j)) {
          if (data.attribute(j).isNominal()) {
            m_model[cluster][j].addValue(inst.value(j), inst.weight());
          } else {
            m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);
            weights[cluster][j] += inst.weight();
          }
        }
      }
      clusterIndex[i] = cluster;
    }

    // Turn the weighted sums into weighted means.
    for (int j = 0; j < numAtts; j++) {
      if (data.attribute(j).isNumeric()) {
        for (int i = 0; i < numClusters; i++) {
          if (weights[i][j] > 0) {
            m_modelNormal[i][j][0] /= weights[i][j];
          }
        }
      }
    }

    // Pass 2: accumulate weighted squared deviations from the cluster means.
    for (int i = 0; i < numInsts; i++) {
      Instance inst = data.instance(i);
      for (int j = 0; j < numAtts; j++) {
        if (!inst.isMissing(j)) {
          if (data.attribute(j).isNumeric()) {
            double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);
            m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;
          }
        }
      }
    }

    // Convert to standard deviations and apply the minimum.
    for (int j = 0; j < numAtts; j++) {
      if (!data.attribute(j).isNumeric()) {
        continue;
      }

      // The data-wide standard deviation depends only on the attribute, not
      // on the cluster, so it is looked up at most once per attribute and
      // only if some cluster actually needs it.
      boolean overallStdDevKnown = false;
      double overallStdDev = 0;

      for (int i = 0; i < numClusters; i++) {
        if (weights[i][j] > 0) {
          m_modelNormal[i][j][1] = Math.sqrt(m_modelNormal[i][j][1]
            / weights[i][j]);
        } else if (weights[i][j] <= 0) {
          // no observed values for this attribute in this cluster
          m_modelNormal[i][j][1] = Double.MAX_VALUE;
        }

        if (m_modelNormal[i][j][1] <= m_minStdDev) {
          if (!overallStdDevKnown) {
            overallStdDev = data.attributeStats(j).numericStats.stdDev;
            overallStdDevKnown = true;
          }
          // fall back to the data-wide value, then to the configured minimum
          m_modelNormal[i][j][1] = overallStdDev;
          if (m_modelNormal[i][j][1] <= m_minStdDev) {
            m_modelNormal[i][j][1] = m_minStdDev;
          }
        }
      }
    }

    Utils.normalize(m_priors);
  }

  /**
   * Returns the cluster priors.
   * 
   * @return a fresh copy of the cluster priors, so that callers cannot modify
   *         the stored values
   */
  @Override
  public double[] clusterPriors() {
    return m_priors.clone();
  }

  /**
   * Computes the log of the conditional density (per cluster) for a given
   * instance. The instance is first passed through the missing-value filter;
   * for each cluster the log-probabilities (nominal attributes) and normal
   * log-densities (all other attributes) of its non-missing values are summed.
   * 
   * @param inst the instance to compute the density for
   * @return an array holding, for each cluster, the log of the estimated
   *         density of the instance
   * @throws Exception if the density could not be computed successfully
   */
  @Override
  public double[] logDensityPerClusterForInstance(Instance inst)
    throws Exception {

    // Loop-invariant: query the cluster count once instead of per iteration.
    final int numClusters = m_wrappedClusterer.numberOfClusters();
    final double[] logDensities = new double[numClusters];

    m_replaceMissing.input(inst);
    inst = m_replaceMissing.output();

    final int numAtts = inst.numAttributes();
    for (int cluster = 0; cluster < numClusters; cluster++) {
      double logprob = 0;
      for (int att = 0; att < numAtts; att++) {
        if (inst.isMissing(att)) {
          continue;
        }
        if (inst.attribute(att).isNominal()) {
          logprob += Math.log(m_model[cluster][att].getProbability(inst
            .value(att)));
        } else { // numeric attribute
          logprob += logNormalDens(inst.value(att),
            m_modelNormal[cluster][att][0], m_modelNormal[cluster][att][1]);
        }
      }
      logDensities[cluster] = logprob;
    }
    return logDensities;
  }

  /**
   * Constant term 0.5 * ln(2 * pi) of the normal log-density. Declared final
   * so the value shared by all instances cannot be reassigned.
   */
  private static final double m_normConst = 0.5 * Math.log(2 * Math.PI);

  /**
   * Log of the density function of the normal distribution.
   * 
   * @param x input value
   * @param mean mean of distribution
   * @param stdDev standard deviation of distribution
   * @return the natural logarithm of the density at x
   */
  private double logNormalDens(double x, double mean, double stdDev) {
    final double deviation = x - mean;
    // squared deviation scaled by twice the variance
    final double exponent = deviation * deviation / (2 * stdDev * stdDev);

    return -exponent - m_normConst - Math.log(stdDev);
  }

  /**
   * Returns the number of clusters, as reported by the wrapped clusterer.
   * 
   * @return the number of clusters generated for a training dataset.
   * @throws Exception if number of clusters could not be returned successfully
   */
  @Override
  public int numberOfClusters() throws Exception {
    final int clusterCount = m_wrappedClusterer.numberOfClusters();
    return clusterCount;
  }

  /**
   * Returns a description of the clusterer: the wrapped clusterer's own
   * description followed, per cluster, by the prior and the estimator fitted
   * to each attribute.
   * 
   * @return a string containing a description of the clusterer
   */
  @Override
  public String toString() {
    if (m_priors == null) {
      return "No clusterer built yet!";
    }

    StringBuilder out = new StringBuilder();
    out.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: ")
      .append(m_wrappedClusterer.toString());
    out.append("\nFitted estimators (with ML estimates of variance):\n");

    for (int cluster = 0; cluster < m_priors.length; cluster++) {
      out.append("\nCluster: ").append(cluster)
        .append(" Prior probability: ")
        .append(Utils.doubleToString(m_priors[cluster], 4)).append("\n\n");

      for (int att = 0; att < m_model[0].length; att++) {
        out.append("Attribute: ")
          .append(m_theInstances.attribute(att).name()).append("\n");

        if (!m_theInstances.attribute(att).isNominal()) {
          // mean and standard deviation of the fitted normal
          out.append("Normal Distribution. Mean = ")
            .append(Utils.doubleToString(m_modelNormal[cluster][att][0], 4))
            .append(" StdDev = ")
            .append(Utils.doubleToString(m_modelNormal[cluster][att][1], 4))
            .append("\n");
        } else if (m_model[cluster][att] != null) {
          out.append(m_model[cluster][att].toString());
        }
      }
    }

    return out.toString();
  }

  /**
   * Returns the tip text for the clusterer property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String clustererTipText() {
    final String tip = "the clusterer to wrap";
    return tip;
  }

  /**
   * Sets the clusterer to wrap. The value is stored as given; no check
   * against null is made here.
   * 
   * @param toWrap the clusterer
   */
  public void setClusterer(Clusterer toWrap) {
    this.m_wrappedClusterer = toWrap;
  }

  /**
   * Gets the clusterer being wrapped.
   * 
   * @return the wrapped clusterer (the stored reference, not a copy)
   */
  public Clusterer getClusterer() {
    return this.m_wrappedClusterer;
  }

  /**
   * Returns the tip text for the minStdDev property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String minStdDevTipText() {
    final String tip = "set minimum allowable standard deviation";
    return tip;
  }

  /**
   * Set the minimum value for standard deviation when calculating normal
   * density. Fitted standard deviations at or below this value are replaced
   * when the clusterer is built, so raising it bounds how large a normal
   * density can become; this can help prevent arithmetic overflow resulting
   * from combining large densities (arising from small standard deviations)
   * when there are many singleton or near singleton values.
   * 
   * @param m minimum value for standard deviation
   */
  public void setMinStdDev(double m) {
    this.m_minStdDev = m;
  }

  /**
   * Get the minimum allowable standard deviation.
   * 
   * @return the minimum allowable standard deviation
   */
  public double getMinStdDev() {
    return this.m_minStdDev;
  }

  /**
   * Returns an enumeration describing the available options..
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration