All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.CLARANS Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.clustering;

import java.util.function.ToDoubleBiFunction;
import java.util.stream.IntStream;
import smile.math.MathEx;
import smile.math.distance.Distance;

/**
 * Clustering Large Applications based upon RANdomized Search. CLARANS is an
 * efficient medoid-based clustering algorithm. The k-medoids algorithm is an
 * adaptation of the k-means algorithm. Rather than calculate the mean of the
 * items in each cluster, a representative item, or medoid, is chosen for each
 * cluster at each iteration. In CLARANS, the process of finding k medoids from
 * n objects is viewed abstractly as searching through a certain graph. In the
 * graph, a node is represented by a set of k objects as selected medoids. Two
 * nodes are neighbors if their sets differ by only one object. In each iteration,
 * CLARANS considers a set of randomly chosen neighbor nodes as candidate
 * of new medoids. We will move to the neighbor node if the neighbor
 * is a better choice for medoids. Otherwise, a local optima is discovered. The
 * entire process is repeated multiple time to find better.
 * 

* CLARANS has two parameters: the maximum number of neighbors examined * (maxNeighbor) and the number of local minima obtained (numLocal). The * higher the value of maxNeighbor, the closer is CLARANS to PAM, and the * longer is each search of a local minima. But the quality of such a local * minima is higher and fewer local minima needs to be obtained. * *

References

*
    *
  1. R. Ng and J. Han. CLARANS: A Method for Clustering Objects for Spatial Data Mining. IEEE TRANS. KNOWLEDGE AND DATA ENGINEERING, 2002.
  2. *
* * @param the type of input object. * * @author Haifeng Li */ public class CLARANS extends CentroidClustering { private static final long serialVersionUID = 2L; private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CLARANS.class); /** * The lambda of distance measure. */ private final Distance distance; /** * Constructor. * * @param distortion the total distortion. * @param medoids the medoids of each cluster. * @param y the cluster labels. * @param distance the lambda of distance measure. */ public CLARANS(double distortion, T[] medoids, int[] y, Distance distance) { super(distortion, medoids, y); this.distance = distance; } @Override protected double distance(T x, T y) { return distance.d(x, y); } /** * Clustering data into k clusters. The maximum number of * random search is set to 1.25% * k * (n - k), where n is the number of * data and k is the number clusters. * * @param data the observations. * @param k the number of clusters. * @param distance the lambda of distance measure. * @param the data type. * @return the model. */ public static CLARANS fit(T[] data, Distance distance, int k) { return fit(data, distance, k, (int) Math.round(0.0125 * k * (data.length - k))); } /** * Constructor. Clustering data into k clusters. * * @param data the observations. * @param k the number of clusters. * @param maxNeighbor the maximum number of neighbors examined during * the random search of local minima. * @param distance the lambda of distance measure. * @param the data type. * @return the model. 
*/ public static CLARANS fit(T[] data, Distance distance, int k, int maxNeighbor) { if (maxNeighbor <= 0) { throw new IllegalArgumentException("Invalid maxNeighbors: " + maxNeighbor); } int n = data.length; if (k >= n) { throw new IllegalArgumentException("Too large k: " + k); } if (maxNeighbor > n) { throw new IllegalArgumentException("Too large maxNeighbor: " + maxNeighbor); } int minmax = 100; if (k * (n - k) < minmax) { minmax = k * (n - k); } if (maxNeighbor < minmax) { maxNeighbor = minmax; } @SuppressWarnings("unchecked") T[] medoids = (T[]) java.lang.reflect.Array.newInstance(data.getClass().getComponentType(), k); T[] newMedoids = medoids.clone(); int[] y = new int[n]; int[] newY = new int[n]; double[] newD = new double[n]; double[] d = seed(data, medoids, y, distance); double distortion = MathEx.sum(d); System.arraycopy(medoids, 0, newMedoids, 0, k); System.arraycopy(y, 0, newY, 0, n); System.arraycopy(d, 0, newD, 0, n); for (int neighborCount = 1; neighborCount <= maxNeighbor; neighborCount++) { double randomNeighborDistortion = getRandomNeighbor(data, newMedoids, newY, newD, distance); if (randomNeighborDistortion < distortion) { logger.info(String.format("Distortion reduces to %.4f after %3d random neighbors", distortion, neighborCount)); neighborCount = 0; distortion = randomNeighborDistortion; System.arraycopy(newMedoids, 0, medoids, 0, k); System.arraycopy(newY, 0, y, 0, n); System.arraycopy(newD, 0, d, 0, n); } else { System.arraycopy(medoids, 0, newMedoids, 0, k); System.arraycopy(y, 0, newY, 0, n); System.arraycopy(d, 0, newD, 0, n); } } logger.info(String.format("Final distortion: %.4f", distortion)); return new CLARANS<>(distortion, medoids, y, distance); } /** * Picks a random neighbor which differs in only one medoid with current clusters. 
*/ private static double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d, ToDoubleBiFunction distance) { int n = data.length; int k = medoids.length; int cluster = MathEx.randomInt(k); T medoid = getRandomMedoid(data, medoids); medoids[cluster] = medoid; IntStream.range(0, n).parallel().forEach(i -> { double dist = distance.applyAsDouble(data[i], medoid); if (d[i] > dist) { y[i] = cluster; d[i] = dist; } else if (y[i] == cluster) { d[i] = dist; for (int j = 0; j < k; j++) { if (j != cluster) { dist = distance.applyAsDouble(data[i], medoids[j]); if (d[i] > dist) { d[i] = dist; y[i] = j; } } } } }); return MathEx.sum(d); } /** * Picks a random observation as new medoid. */ private static T getRandomMedoid(T[] data, T[] medoids) { int n = data.length; T medoid = data[MathEx.randomInt(n)]; while (contains(medoids, medoid)) { medoid = data[MathEx.randomInt(n)]; } return medoid; } /** * Returns true if the array contains the object. */ private static boolean contains(T[] medoids, T medoid) { for (T m : medoids) { if (m == medoid) return true; } return false; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy