All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.CLARANS Maven / Gradle / Ivy

There is a newer version: 2024.12.1
Show newest version
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import smile.math.Math;
import smile.math.distance.Distance;
import smile.util.MulticoreExecutor;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;

/**
 * Clustering Large Applications based upon RANdomized Search. CLARANS is an
 * efficient medoid-based clustering algorithm. The k-medoids algorithm is an
 * adaptation of the k-means algorithm. Rather than calculate the mean of the
 * items in each cluster, a representative item, or medoid, is chosen for each
 * cluster at each iteration. In CLARANS, the process of finding k medoids from
 * n objects is viewed abstractly as searching through a certain graph. In the
 * graph, a node is represented by a set of k objects as selected medoids. Two
 * nodes are neighbors if their sets differ by only one object. In each iteration,
 * CLARANS considers a set of randomly chosen neighbor nodes as candidate
 * of new medoids. We will move to the neighbor node if the neighbor
 * is a better choice for medoids. Otherwise, a local optima is discovered. The
 * entire process is repeated multiple time to find better.
 * 

* CLARANS has two parameters: the maximum number of neighbors examined * (maxNeighbor) and the number of local minima obtained (numLocal). The * higher the value of maxNeighbor, the closer is CLARANS to PAM, and the * longer is each search of a local minima. But the quality of such a local * minima is higher and fewer local minima needs to be obtained. * *

References

*
    *
  1. R. Ng and J. Han. CLARANS: A Method for Clustering Objects for Spatial Data Mining. IEEE TRANS. KNOWLEDGE AND DATA ENGINEERING, 2002.
  2. *
* * @param the type of input object. * * @author Haifeng Li */ public class CLARANS extends PartitionClustering { private static final long serialVersionUID = 1L; /** * The total distortion. */ double distortion; /** * The distance measure for calculation of distortion. */ private Distance distance; /** * The number of local minima to search for. */ private int numLocal; /** * The maximum number of neighbors examined during a search of local minima. */ private int maxNeighbor; /** * The medoids of each cluster. */ T[] medoids; /** * Constructor. Clustering data into k clusters. The maximum number of * random search is set to 0.02 * k * (n - k), where n is the number of * data and k is the number clusters. The number of local searches is * max(8, numProcessors). * * @param data the dataset for clustering. * @param distance the distance/dissimilarity measure. * @param k the number of clusters. */ public CLARANS(T[] data, Distance distance, int k) { this(data, distance, k, (int) Math.round(0.0125 * k * (data.length - k))); } /** * Constructor. Clustering data into k clusters. * @param data the dataset for clustering. * @param distance the distance/dissimilarity measure. * @param k the number of clusters. * @param maxNeighbor the maximum number of neighbors examined during a random search of local minima. */ public CLARANS(T[] data, Distance distance, int k, int maxNeighbor) { this(data, distance, k, maxNeighbor, Math.max(2, MulticoreExecutor.getThreadPoolSize())); } /** * Constructor. Clustering data into k clusters. * @param data the dataset for clustering. * @param distance the distance/dissimilarity measure. * @param k the number of clusters. * @param maxNeighbor the maximum number of neighbors examined during a random search of local minima. * @param numLocal the number of local minima to search for. */ public CLARANS(T[] data, Distance distance, int k, int maxNeighbor, int numLocal) { if (maxNeighbor <= 0) { throw new IllegalArgumentException("Invalid maxNeighbor: " + maxNeighbor); } if (numLocal <= 0) { throw new IllegalArgumentException("Invalid numLocal: " + numLocal); } int n = data.length; if (k >= n) { throw new IllegalArgumentException("Too large k: " + k); } if (maxNeighbor > n) { throw new IllegalArgumentException("Too large maxNeighbor: " + maxNeighbor); } int minmax = 100; if (k * (n - k) < minmax) { minmax = k * (n - k); } if (maxNeighbor < minmax) { maxNeighbor = minmax; } this.k = k; this.distance = distance; this.numLocal = numLocal; this.maxNeighbor = maxNeighbor; List tasks = new ArrayList<>(); for (int i = 0; i < numLocal; i++) { tasks.add(new CLARANSTask(data)); } try { MulticoreExecutor.run(tasks); } catch (Exception e) { System.out.println("Failed to run CLARANS on multi-core:"+e); for (CLARANSTask task : tasks) { task.call(); } } distortion = Double.POSITIVE_INFINITY; for (CLARANSTask task : tasks) { if (task.distortion < distortion) { distortion = task.distortion; medoids = task.medoids; y = task.y; } } size = new int[k]; for (int i = 0; i < n; i++) { size[y[i]]++; } } /** * Adapter for running one local of CLARANS in thread pool. */ class CLARANSTask implements Callable { final T[] data; double distortion; T[] medoids; int[] y; CLARANSTask(T[] data) { this.data = data; } @Override @SuppressWarnings("unchecked") public CLARANSTask call() { int n = data.length; medoids = (T[]) java.lang.reflect.Array.newInstance(data.getClass().getComponentType(), k); T[] newMedoids = medoids.clone(); y = new int[n]; int[] newY = new int[n]; double[] d = new double[n]; double[] newD = new double[n]; distortion = seed(distance, data, medoids, y, d); System.arraycopy(medoids, 0, newMedoids, 0, k); System.arraycopy(y, 0, newY, 0, n); System.arraycopy(d, 0, newD, 0, n); for (int neighborCount = 1; neighborCount <= maxNeighbor; neighborCount++) { double randomNeighborDistortion = getRandomNeighbor(data, newMedoids, newY, newD); if (randomNeighborDistortion < distortion) { neighborCount = 0; distortion = randomNeighborDistortion; System.arraycopy(newMedoids, 0, medoids, 0, k); System.arraycopy(newY, 0, y, 0, n); System.arraycopy(newD, 0, d, 0, n); } else { System.arraycopy(medoids, 0, newMedoids, 0, k); System.arraycopy(y, 0, newY, 0, n); System.arraycopy(d, 0, newD, 0, n); } } return this; } } /** * Generate a random neighbor which differs in only one medoid with current clusters. */ private double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d) { int n = data.length; int index = Math.randomInt(k); T medoid = null; boolean dup; do { dup = false; medoid = data[Math.randomInt(n)]; for (int i = 0; i < k; i++) { if (medoid == medoids[i]) { dup = true; break; } } } while (dup); medoids[index] = medoid; for (int i = 0; i < n; i++) { double dist = distance.d(data[i], medoid); if (d[i] > dist) { y[i] = index; d[i] = dist; } else if (y[i] == index) { d[i] = dist; y[i] = index; for (int j = 0; j < k; j++) { if (j != index) { dist = distance.d(data[i], medoids[j]); if (d[i] > dist) { y[i] = j; d[i] = dist; } } } } } return Math.sum(d); } /** * Returns the number of local minima to search for. */ public int getNumLocalMinima() { return numLocal; } /** * Returns the maximum number of neighbors examined during a search of local minima. */ public int getMaxNeighbor() { return maxNeighbor; } /** * Returns the distortion. */ public double distortion() { return distortion; } /** * Returns the medoids. */ public T[] medoids() { return medoids; } /** * Cluster a new instance. * @param x a new instance. * @return the cluster label, which is the index of nearest medoid. */ @Override public int predict(T x) { double minDist = Double.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < k; i++) { double dist = distance.d(x, medoids[i]); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("CLARANS distortion: %.5f%n", distortion)); sb.append(String.format("Clusters of %d data points:%n", y.length)); for (int i = 0; i < k; i++) { int r = (int) Math.round(1000.0 * size[i] / y.length); sb.append(String.format("%3d\t%5d (%2d.%1d%%)%n", i, size[i], r / 10, r % 10)); } return sb.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy