All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.DENCLUE Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;

import smile.math.Math;
import smile.util.MulticoreExecutor;

/**
 * DENsity CLUstering. The DENCLUE algorithm employs a cluster model based on
 * kernel density estimation. A cluster is defined by a local maximum of the
 * estimated density function. Data points going to the same local maximum
 * are put into the same cluster.
 * 

* Clearly, DENCLUE doesn't work on data with uniform distribution. In high * dimensional space, the data always look like uniformly distributed because * of the curse of dimensionality. Therefore, DENCLUDE doesn't work well on * high-dimensional data in general. * *

References

*
    *
  1. A. Hinneburg and D. A. Keim. A general approach to clustering in large databases with noise. Knowledge and Information Systems, 5(4):387-415, 2003.
  2. *
  3. Alexander Hinneburg and Hans-Henning Gabriel. DENCLUE 2.0: Fast Clustering based on Kernel Density Estimation. IDA, 2007.
  4. *
* * @author Haifeng Li */ public class DENCLUE extends PartitionClustering { /** * The epsilon of finishing density attractor searching. */ private double eps = 1E-7; /** * The smooth parameter in the Gaussian kernel. The user can * choose sigma such that number of density attractors is constant for a * long interval of sigma. */ private double sigma; /** * The smooth parameter in the Gaussian kernel. It is -0.5 / (sigma * sigma). */ private double gamma; /** * The density attractor of each cluster. */ private double[][] attractors; /** * The radius of density attractor. */ private double[] radius; /** * The samples decided by NeuralGas used in the iterations of hill climbing. */ private double[][] samples; /** * Constructor. Clustering data. * @param data the dataset for clustering. * @param sigma the smooth parameter in the Gaussian kernel. The user can * choose sigma such that number of density attractors is constant for a * long interval of sigma. * @param m the number of selected samples used in the iteration. * This number should be much smaller than the number of data points * to speed up the algorithm. It should also be large enough to capture * the sufficient information of underlying distribution. */ public DENCLUE(double[][] data, double sigma, int m) { if (sigma <= 0.0) { throw new IllegalArgumentException("Invalid standard deviation of Gaussian kernel: " + sigma); } if (m <= 0) { throw new IllegalArgumentException("Invlaid number of selected samples: " + m); } if (m < 10) { throw new IllegalArgumentException("The number of selected samples is too small: " + m); } this.sigma = sigma; this.gamma = -0.5 / (sigma * sigma); KMeans kmeans = new KMeans(data, m); samples = kmeans.centroids(); int n = data.length; int d = data[0].length; attractors = new double[n][]; for (int i = 0; i < n; i++) { attractors[i] = data[i].clone(); } double[] attractor = new double[d]; double[] prob = new double[n]; radius = new double[n]; int np = MulticoreExecutor.getThreadPoolSize(); List tasks = null; if (n >= 1000 && np >= 2) { tasks = new ArrayList(np + 1); int step = n / np; if (step < 100) { step = 100; } int start = 0; int end = step; for (int i = 0; i < np-1; i++) { tasks.add(new DENCLUEThread(prob, start, end)); start += step; end += step; } tasks.add(new DENCLUEThread(prob, start, n)); try { MulticoreExecutor.run(tasks); } catch (Exception ex) { System.err.println(ex); for (DENCLUEThread task : tasks) { task.call(); } } } else { for (int i = 0; i < n; i++) { double diff = 1.0; while (diff > eps) { double weight = 0.0; for (int j = 0; j < m; j++) { double w = Math.exp(gamma * Math.squaredDistance(attractors[i], samples[j])); weight += w; for (int l = 0; l < d; l++) { attractor[l] += w * samples[j][l]; } } for (int l = 0; l < d; l++) { attractor[l] /= weight; } weight /= m; diff = weight - prob[i]; prob[i] = weight; if (diff > 1E-5) { radius[i] = 2 * Math.distance(attractors[i], attractor); } System.arraycopy(attractor, 0, attractors[i], 0, d); Arrays.fill(attractor, 0.0); } } } y = new int[n]; ArrayList cluster = new ArrayList(); ArrayList probability = new ArrayList(); ArrayList step = new ArrayList(); y[0] = 0; cluster.add(attractors[0]); probability.add(prob[0]); step.add(radius[0]); boolean newcluster = true; for (int i = 1; i < n; i++) { newcluster = true; for (int j = 0; j < cluster.size(); j++) { if (Math.distance(attractors[i], cluster.get(j)) < radius[i] + step.get(j)) { y[i] = j; newcluster = false; if (prob[i] > probability.get(j)) { cluster.set(j, attractors[i]); probability.set(j, prob[i]); step.set(j, radius[i]); } break; } } if (newcluster) { y[i] = cluster.size(); cluster.add(attractors[i]); probability.add(prob[i]); step.add(radius[i]); } } size = new int[cluster.size()]; for (int i = 0; i < n; i++) { size[y[i]]++; } k = cluster.size(); attractors = new double[k][]; for (int i = 0; i < k; i++) { attractors[i] = cluster.get(i); } } /** * Returns the smooth (standard deviation) parameter in the Gaussian kernel. * @return the smooth (standard deviation) parameter in the Gaussian kernel. */ public double getSigma() { return sigma; } /** * Adapter for running DENCLUE algorithm in thread pool. */ class DENCLUEThread implements Callable { /** * The start index of data portion for this task. */ final int start; /** * The end index of data portion for this task. */ final int end; double[] attractor; double[] prob; DENCLUEThread(double[] prob, int start, int end) { this.prob = prob; this.start = start; this.end = end; attractor = new double[samples[0].length]; } @Override public DENCLUEThread call() { int m = samples.length; int d = samples[0].length; for (int i = start; i < end; i++) { double diff = 1.0; while (diff > eps) { double weight = 0.0; for (int j = 0; j < m; j++) { double w = Math.exp(gamma * Math.squaredDistance(attractors[i], samples[j])); weight += w; for (int l = 0; l < d; l++) { attractor[l] += w * samples[j][l]; } } for (int l = 0; l < d; l++) { attractor[l] /= weight; } weight /= m; diff = weight - prob[i]; prob[i] = weight; if (diff > 1E-5) { radius[i] = 2 * Math.distance(attractors[i], attractor); } System.arraycopy(attractor, 0, attractors[i], 0, d); Arrays.fill(attractor, 0.0); } } return this; } } /** * Returns the density attractors of cluster. */ public double[][] getDensityAttractors() { return attractors; } @Override public int predict(double[] x) { int p = attractors[0].length; if (x.length != p) { throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, p)); } double prob = 0.0; double diff = 1.0; double step = 0.0; double[] z = x.clone(); double[] attractor = new double[p]; while (diff > eps) { double weight = 0.0; for (int i = 0; i < samples.length; i++) { double w = Math.exp(gamma * Math.squaredDistance(samples[i], z)); weight += w; for (int j = 0; j < p; j++) { attractor[j] += w * samples[i][j]; } } for (int j = 0; j < p; j++) { attractor[j] /= weight; } weight /= k; diff = weight - prob; prob = weight; if (diff > 1E-5) { step = 2 * Math.distance(attractor, z); } for (int j = 0; j < p; j++) { z[j] = attractor[j]; attractor[j] = 0; } } for (int i = 0; i < k; i++) { if (Math.distance(attractors[i], z) < radius[i] + step) { return i; } } return OUTLIER; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("DENCLUE clusters of %d data points:\n", y.length)); for (int i = 0; i < k; i++) { int r = (int) Math.round(1000.0 * size[i] / y.length); sb.append(String.format("%3d\t%5d (%2d.%1d%%)\n", i, size[i], r / 10, r % 10)); } return sb.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy