smile.clustering.CLARANS Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
There is a newer version: 2024.11.2
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import smile.math.Math;
import smile.math.distance.Distance;
import smile.util.MulticoreExecutor;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;

/**
 * Clustering Large Applications based upon RANdomized Search. CLARANS is an
 * efficient medoid-based clustering algorithm. The k-medoids algorithm is an
 * adaptation of the k-means algorithm. Rather than calculate the mean of the
 * items in each cluster, a representative item, or medoid, is chosen for each
 * cluster at each iteration. In CLARANS, the process of finding k medoids from
 * n objects is viewed abstractly as searching through a certain graph. In the
 * graph, a node is represented by a set of k objects as selected medoids. Two
 * nodes are neighbors if their sets differ by only one object. In each iteration,
 * CLARANS considers a set of randomly chosen neighbor nodes as candidate
 * of new medoids. We will move to the neighbor node if the neighbor
 * is a better choice for medoids. Otherwise, a local optima is discovered. The
 * entire process is repeated multiple time to find better.
 * 
 * CLARANS has two parameters: the maximum number of neighbors examined
 * (maxNeighbor) and the number of local minima obtained (numLocal). The
 * higher the value of maxNeighbor, the closer is CLARANS to PAM, and the
 * longer is each search of a local minima. But the quality of such a local
 * minima is higher and fewer local minima needs to be obtained.
 *
 * 
References
 * 
 * R. Ng and J. Han. CLARANS: A Method for Clustering Objects for Spatial Data Mining. IEEE TRANS. KNOWLEDGE AND DATA ENGINEERING, 2002.
 * 
 * 
 * @param  the type of input object.
 * 
 * @author Haifeng Li
 */
public class CLARANS  extends PartitionClustering {
    private static final long serialVersionUID = 1L;

    /**
     * The total distortion.
     */
    double distortion;
    /**
     * The distance measure for calculation of distortion.
     */
    private Distance distance;
    /**
     * The number of local minima to search for.
     */
    private int numLocal;
    /**
     * The maximum number of neighbors examined during a search of local minima.
     */
    private int maxNeighbor;
    /**
     * The medoids of each cluster.
     */
    T[] medoids;

    /**
     * Constructor. Clustering data into k clusters. The maximum number of
     * random search is set to 0.02 * k * (n - k), where n is the number of
     * data and k is the number clusters. The number of local searches is 
     * max(8, numProcessors).
     * 
     * @param data the dataset for clustering.
     * @param distance the distance/dissimilarity measure.
     * @param k the number of clusters.
     */
    public CLARANS(T[] data, Distance distance, int k) {
        this(data, distance, k, (int) Math.round(0.0125 * k * (data.length - k)));
    }
    
    /**
     * Constructor. Clustering data into k clusters.
     * @param data the dataset for clustering.
     * @param distance the distance/dissimilarity measure.
     * @param k the number of clusters.
     * @param maxNeighbor the maximum number of neighbors examined during a random search of local minima.
     */
    public CLARANS(T[] data, Distance distance, int k, int maxNeighbor) {
        this(data, distance, k, maxNeighbor, Math.max(2, MulticoreExecutor.getThreadPoolSize()));        
    }
    
    /**
     * Constructor. Clustering data into k clusters.
     * @param data the dataset for clustering.
     * @param distance the distance/dissimilarity measure.
     * @param k the number of clusters.
     * @param maxNeighbor the maximum number of neighbors examined during a random search of local minima.
     * @param numLocal the number of local minima to search for.
     */
    public CLARANS(T[] data, Distance distance, int k, int maxNeighbor, int numLocal) {
        if (maxNeighbor <= 0) {
            throw new IllegalArgumentException("Invalid maxNeighbor: " + maxNeighbor);
        }
        
        if (numLocal <= 0) {
            throw new IllegalArgumentException("Invalid numLocal: " + numLocal);            
        }
        
        int n = data.length;

        if (k >= n) {
            throw new IllegalArgumentException("Too large k: " + k);
        }

        if (maxNeighbor > n) {
            throw new IllegalArgumentException("Too large maxNeighbor: " + maxNeighbor);
        }

        int minmax = 100;
        if (k * (n - k) < minmax) {
            minmax = k * (n - k);
        }
        
        if (maxNeighbor < minmax) {
            maxNeighbor = minmax;
        }
        
        this.k = k;
        this.distance = distance;
        this.numLocal = numLocal;
        this.maxNeighbor = maxNeighbor;
        
        List tasks = new ArrayList<>();
        for (int i = 0; i < numLocal; i++) {
            tasks.add(new CLARANSTask(data));
        }

        try {
            MulticoreExecutor.run(tasks);
        } catch (Exception e) {
            System.out.println("Failed to run CLARANS on multi-core:"+e);

            for (CLARANSTask task : tasks) {
                task.call();
            }
        }
        
        distortion = Double.POSITIVE_INFINITY;
        for (CLARANSTask task : tasks) {
            if (task.distortion < distortion) {
                distortion = task.distortion;
                medoids = task.medoids;
                y = task.y;
            }
        }
        
        size = new int[k];
        for (int i = 0; i < n; i++) {
            size[y[i]]++;
        }
    }

    /**
     * Adapter for running one local of CLARANS in thread pool.
     */
    class CLARANSTask implements Callable {
        final T[] data;
        
        double distortion;
        T[] medoids;
        int[] y;

        CLARANSTask(T[] data) {
            this.data = data;
        }

        @Override
        @SuppressWarnings("unchecked")
        public CLARANSTask call() {
            int n = data.length;
            medoids = (T[]) java.lang.reflect.Array.newInstance(data.getClass().getComponentType(), k);
            T[] newMedoids = medoids.clone();
            y = new int[n];
            int[] newY = new int[n];
            double[] d = new double[n];
            double[] newD = new double[n];
            
            distortion = seed(distance, data, medoids, y, d);

            System.arraycopy(medoids, 0, newMedoids, 0, k);
            System.arraycopy(y, 0, newY, 0, n);
            System.arraycopy(d, 0, newD, 0, n);
            
            for (int neighborCount = 1; neighborCount <= maxNeighbor; neighborCount++) {
                double randomNeighborDistortion = getRandomNeighbor(data, newMedoids, newY, newD);
                if (randomNeighborDistortion < distortion) {
                    neighborCount = 0;
                    distortion = randomNeighborDistortion;
                    System.arraycopy(newMedoids, 0, medoids, 0, k);
                    System.arraycopy(newY, 0, y, 0, n);
                    System.arraycopy(newD, 0, d, 0, n);
                } else {
                    System.arraycopy(medoids, 0, newMedoids, 0, k);
                    System.arraycopy(y, 0, newY, 0, n);
                    System.arraycopy(d, 0, newD, 0, n);
                }
            }

            return this;
        }
    }
    
    /**
     * Generate a random neighbor which differs in only one medoid with current clusters.
     */
    private double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d) {
        int n = data.length;

        int index = Math.randomInt(k);
        T medoid = null;
        boolean dup;
        do {
            dup = false;
            medoid = data[Math.randomInt(n)];
            for (int i = 0; i < k; i++) {
                if (medoid == medoids[i]) {
                    dup = true;
                    break;
                }
            }
        } while (dup);

        medoids[index] = medoid;

        for (int i = 0; i < n; i++) {
            double dist = distance.d(data[i], medoid);
            if (d[i] > dist) {
                y[i] = index;
                d[i] = dist;
            } else if (y[i] == index) {
                d[i] = dist;
                y[i] = index;
                for (int j = 0; j < k; j++) {
                    if (j != index) {
                        dist = distance.d(data[i], medoids[j]);
                        if (d[i] > dist) {
                            y[i] = j;
                            d[i] = dist;
                        }
                    }
                }
            }
        }

        return Math.sum(d);
    }
    
    /**
     * Returns the number of local minima to search for.
     */
    public int getNumLocalMinima() {
        return numLocal;
    }

    /**
     * Returns the maximum number of neighbors examined during a search of local minima.
     */
    public int getMaxNeighbor() {
        return maxNeighbor;
    }

    /**
     * Returns the distortion.
     */
    public double distortion() {
        return distortion;
    }

    /**
     * Returns the medoids.
     */
    public T[] medoids() {
        return medoids;
    }

    /**
     * Cluster a new instance.
     * @param x a new instance.
     * @return the cluster label, which is the index of nearest medoid.
     */
    @Override
    public int predict(T x) {
        double minDist = Double.MAX_VALUE;
        int bestCluster = 0;

        for (int i = 0; i < k; i++) {
            double dist = distance.d(x, medoids[i]);
            if (dist < minDist) {
                minDist = dist;
                bestCluster = i;
            }
        }

        return bestCluster;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        
        sb.append(String.format("CLARANS distortion: %.5f%n", distortion));
        sb.append(String.format("Clusters of %d data points:%n", y.length));
        for (int i = 0; i < k; i++) {
            int r = (int) Math.round(1000.0 * size[i] / y.length);
            sb.append(String.format("%3d\t%5d (%2d.%1d%%)%n", i, size[i], r / 10, r % 10));
        }
        
        return sb.toString();
    }
}