All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.clusterers.MakeDensityBasedClusterer Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MakeDensityBasedClusterer.java
 *    Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.estimators.DiscreteEstimator;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * Class for wrapping a Clusterer to make it return a distribution and density.
 * Fits normal distributions and discrete distributions within each cluster
 * produced by the wrapped clusterer. Supports the NumberOfClustersRequestable
 * interface only if the wrapped Clusterer does.
 * <p>
 * Valid options are:
 *
 * <pre>
 * -M &lt;num&gt;
 *  minimum allowable standard deviation for normal density computation
 *  (default 1e-6)
 * </pre>
 *
 * <pre>
 * -W &lt;clusterer name&gt;
 *  Clusterer to wrap.
 *  (default weka.clusterers.SimpleKMeans)
 * </pre>
 *
 * <pre>
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 *
 * <pre>
 * -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).
 * </pre>
 *
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 *
 * <pre>
 * -M
 *  Replace missing values with mean/mode.
 * </pre>
 *
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 *
 * Options after "--" are passed on to the base clusterer.
 *
 * @author Richard Kirkby
 * @author Mark Hall
 * @author Eibe Frank
 * @version $Revision: 10203 $
 */
public class MakeDensityBasedClusterer
  extends AbstractDensityBasedClusterer
  implements NumberOfClustersRequestable, OptionHandler,
  WeightedInstancesHandler {

  /** for serialization */
  static final long serialVersionUID = -5643302427972186631L;

  /** header (structure only, no rows) of the training instances */
  private Instances m_theInstances;

  /** prior probability of each fitted cluster */
  private double[] m_priors;

  /**
   * normal distributions fitted to each numeric attribute in each cluster,
   * indexed [cluster][attribute][0 = mean, 1 = standard deviation]
   */
  private double[][][] m_modelNormal;

  /**
   * discrete distributions fitted to each nominal attribute in each cluster,
   * indexed [cluster][attribute]
   */
  private DiscreteEstimator[][] m_model;

  /** minimum allowable standard deviation (default 1e-6) */
  private double m_minStdDev = 1.0e-6;

  /** the clusterer being wrapped */
  private Clusterer m_wrappedClusterer = new SimpleKMeans();

  /** filter used to globally replace missing values */
  private ReplaceMissingValues m_replaceMissing;

  /**
   * Default constructor; wraps the default clusterer.
   */
  public MakeDensityBasedClusterer() {
  }

  /**
   * Constructs a MakeDensityBasedClusterer wrapping a given Clusterer.
   *
   * @param toWrap the clusterer to wrap around
   */
  public MakeDensityBasedClusterer(Clusterer toWrap) {
    super();
    setClusterer(toWrap);
  }

  /**
   * Returns a string describing this clusterer.
   *
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {
    final String description =
      "Class for wrapping a Clusterer to make it return a distribution "
        + "and density. Fits normal distributions and discrete distributions "
        + "within each cluster produced by the wrapped clusterer. Supports the "
        + "NumberOfClustersRequestable interface only if the wrapped Clusterer "
        + "does.";
    return description;
  }

  /**
   * String describing default clusterer.
* * @return the default clusterer classname */ protected String defaultClustererString() { return SimpleKMeans.class.getName(); } /** * Set the number of clusters to generate. * * @param n the number of clusters to generate * @throws Exception if the wrapped clusterer has not been set, or if the * wrapped clusterer does not implement this facility. */ @Override public void setNumClusters(int n) throws Exception { if (m_wrappedClusterer == null) { throw new Exception("Can't set the number of clusters to generate - " + "no clusterer has been set yet."); } if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) { throw new Exception("Can't set the number of clusters to generate - " + "wrapped clusterer does not support this facility."); } ((NumberOfClustersRequestable) m_wrappedClusterer).setNumClusters(n); } /** * Returns default capabilities of the clusterer (i.e., of the wrapper * clusterer). * * @return the capabilities of this clusterer */ @Override public Capabilities getCapabilities() { if (m_wrappedClusterer != null) { return m_wrappedClusterer.getCapabilities(); } Capabilities result = super.getCapabilities(); result.disableAll(); result.enable(Capability.NO_CLASS); return result; } /** * Builds a clusterer for a set of instances. * * @param data the instances to train the clusterer with * @throws Exception if the clusterer hasn't been set or something goes wrong */ @Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? 
getCapabilities().testWithFail(data); m_replaceMissing = new ReplaceMissingValues(); m_replaceMissing.setInputFormat(data); data = weka.filters.Filter.useFilter(data, m_replaceMissing); m_theInstances = new Instances(data, 0); if (m_wrappedClusterer == null) { throw new Exception("No clusterer has been set"); } m_wrappedClusterer.buildClusterer(data); m_model = new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data .numAttributes()]; m_modelNormal = new double[m_wrappedClusterer.numberOfClusters()][data .numAttributes()][2]; double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data .numAttributes()]; m_priors = new double[m_wrappedClusterer.numberOfClusters()]; for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) { m_priors[i] = 1.0; // laplace correction for (int j = 0; j < data.numAttributes(); j++) { if (data.attribute(j).isNominal()) { m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(), true); } } } Instance inst = null; // Compute mean, etc. 
int[] clusterIndex = new int[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { inst = data.instance(i); int cluster = m_wrappedClusterer.clusterInstance(inst); m_priors[cluster] += inst.weight(); for (int j = 0; j < data.numAttributes(); j++) { if (!inst.isMissing(j)) { if (data.attribute(j).isNominal()) { m_model[cluster][j].addValue(inst.value(j), inst.weight()); } else { m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j); weights[cluster][j] += inst.weight(); } } } clusterIndex[i] = cluster; } for (int j = 0; j < data.numAttributes(); j++) { if (data.attribute(j).isNumeric()) { for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) { if (weights[i][j] > 0) { m_modelNormal[i][j][0] /= weights[i][j]; } } } } // Compute standard deviations for (int i = 0; i < data.numInstances(); i++) { inst = data.instance(i); for (int j = 0; j < data.numAttributes(); j++) { if (!inst.isMissing(j)) { if (data.attribute(j).isNumeric()) { double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j); m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff; } } } } for (int j = 0; j < data.numAttributes(); j++) { if (data.attribute(j).isNumeric()) { for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) { if (weights[i][j] > 0) { m_modelNormal[i][j][1] = Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]); } else if (weights[i][j] <= 0) { m_modelNormal[i][j][1] = Double.MAX_VALUE; } if (m_modelNormal[i][j][1] <= m_minStdDev) { m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev; if (m_modelNormal[i][j][1] <= m_minStdDev) { m_modelNormal[i][j][1] = m_minStdDev; } } } } } Utils.normalize(m_priors); } /** * Returns the cluster priors. * * @return the cluster priors */ @Override public double[] clusterPriors() { double[] n = new double[m_priors.length]; System.arraycopy(m_priors, 0, n, 0, n.length); return n; } /** * Computes the log of the conditional density (per cluster) for a given * instance. 
* * @param inst the instance to compute the density for * @return an array containing the estimated densities * @throws Exception if the density could not be computed successfully */ @Override public double[] logDensityPerClusterForInstance(Instance inst) throws Exception { int i, j; double logprob; double[] wghts = new double[m_wrappedClusterer.numberOfClusters()]; m_replaceMissing.input(inst); inst = m_replaceMissing.output(); for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) { logprob = 0; for (j = 0; j < inst.numAttributes(); j++) { if (!inst.isMissing(j)) { if (inst.attribute(j).isNominal()) { logprob += Math.log(m_model[i][j].getProbability(inst.value(j))); } else { // numeric attribute logprob += logNormalDens(inst.value(j), m_modelNormal[i][j][0], m_modelNormal[i][j][1]); } } } wghts[i] = logprob; } return wghts; } /** Constant for normal distribution. */ private static double m_normConst = 0.5 * Math.log(2 * Math.PI); /** * Density function of normal distribution. * * @param x input value * @param mean mean of distribution * @param stdDev standard deviation of distribution * @return the density */ private double logNormalDens(double x, double mean, double stdDev) { double diff = x - mean; return -(diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev); } /** * Returns the number of clusters. * * @return the number of clusters generated for a training dataset. * @throws Exception if number of clusters could not be returned successfully */ @Override public int numberOfClusters() throws Exception { return m_wrappedClusterer.numberOfClusters(); } /** * Returns a description of the clusterer. 
* * @return a string containing a description of the clusterer */ @Override public String toString() { if (m_priors == null) { return "No clusterer built yet!"; } StringBuffer text = new StringBuffer(); text.append("MakeDensityBasedClusterer: \n\nWrapped clusterer: " + m_wrappedClusterer.toString()); text.append("\nFitted estimators (with ML estimates of variance):\n"); for (int j = 0; j < m_priors.length; j++) { text.append("\nCluster: " + j + " Prior probability: " + Utils.doubleToString(m_priors[j], 4) + "\n\n"); for (int i = 0; i < m_model[0].length; i++) { text.append("Attribute: " + m_theInstances.attribute(i).name() + "\n"); if (m_theInstances.attribute(i).isNominal()) { if (m_model[j][i] != null) { text.append(m_model[j][i].toString()); } } else { text.append("Normal Distribution. Mean = " + Utils.doubleToString(m_modelNormal[j][i][0], 4) + " StdDev = " + Utils.doubleToString(m_modelNormal[j][i][1], 4) + "\n"); } } } return text.toString(); } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String clustererTipText() { return "the clusterer to wrap"; } /** * Sets the clusterer to wrap. * * @param toWrap the clusterer */ public void setClusterer(Clusterer toWrap) { m_wrappedClusterer = toWrap; } /** * Gets the clusterer being wrapped. * * @return the clusterer */ public Clusterer getClusterer() { return m_wrappedClusterer; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minStdDevTipText() { return "set minimum allowable standard deviation"; } /** * Set the minimum value for standard deviation when calculating normal * density. Reducing this value can help prevent arithmetic overflow resulting * from multiplying large densities (arising from small standard deviations) * when there are many singleton or near singleton values. 
*
   * @param m minimum value for standard deviation
   */
  public void setMinStdDev(double m) {
    this.m_minStdDev = m;
  }

  /**
   * Get the minimum allowable standard deviation.
   *
   * @return the minimum allowable standard deviation
   */
  public double getMinStdDev() {
    return this.m_minStdDev;
  }

  /**
   * Returns an enumeration describing the available options..
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration




© 2015 - 2025 Weber Informatics LLC | Privacy Policy