// smile.clustering.CLARANS Maven / Gradle / Ivy
/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.clustering;
import java.util.function.ToDoubleBiFunction;
import java.util.stream.IntStream;
import smile.math.MathEx;
import smile.math.distance.Distance;
/**
* Clustering Large Applications based upon RANdomized Search. CLARANS is an
* efficient medoid-based clustering algorithm. The k-medoids algorithm is an
* adaptation of the k-means algorithm. Rather than calculate the mean of the
* items in each cluster, a representative item, or medoid, is chosen for each
* cluster at each iteration. In CLARANS, the process of finding k medoids from
* n objects is viewed abstractly as searching through a certain graph. In the
* graph, a node is represented by a set of k objects as selected medoids. Two
* nodes are neighbors if their sets differ by only one object. In each iteration,
* CLARANS considers a set of randomly chosen neighbor nodes as candidates
* for new medoids. We will move to the neighbor node if the neighbor
* is a better choice for medoids. Otherwise, a local optimum is discovered. The
* entire process is repeated multiple times to find a better solution.
* <p>
* CLARANS has two parameters: the maximum number of neighbors examined
* (maxNeighbor) and the number of local minima obtained (numLocal). The
* higher the value of maxNeighbor, the closer CLARANS is to PAM, and the
* longer each search for a local minimum takes. But the quality of such a local
* minimum is higher and fewer local minima need to be obtained.
*
* <h2>References</h2>
* <ol>
* <li> R. Ng and J. Han. CLARANS: A Method for Clustering Objects for Spatial Data Mining. IEEE TRANS. KNOWLEDGE AND DATA ENGINEERING, 2002.</li>
* </ol>
*
* @param <T> the type of input object.
*
* @author Haifeng Li
*/
public class CLARANS<T> extends CentroidClustering<T, T> {
    private static final long serialVersionUID = 2L;
    private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CLARANS.class);

    /**
     * The lambda of distance measure.
     */
    private final Distance<T> distance;

    /**
     * Constructor.
     *
     * @param distortion the total distortion.
     * @param medoids the medoids of each cluster.
     * @param y the cluster labels.
     * @param distance the lambda of distance measure.
     */
    public CLARANS(double distortion, T[] medoids, int[] y, Distance<T> distance) {
        super(distortion, medoids, y);
        this.distance = distance;
    }

    /**
     * Returns the distance between two objects, as computed by the distance
     * measure supplied at construction.
     */
    @Override
    protected double distance(T x, T y) {
        return distance.d(x, y);
    }

    /**
     * Clusters data into k clusters. The maximum number of random
     * searches is derived from 1.25% * k * (n - k), where n is the number
     * of data and k is the number of clusters. The derived value is clamped
     * to the range [1, n], because the four-argument overload rejects
     * values outside that range.
     *
     * @param data the observations.
     * @param distance the lambda of distance measure.
     * @param k the number of clusters.
     * @param <T> the data type.
     * @return the model.
     * @throws IllegalArgumentException if k is not positive or is not
     *         smaller than the number of observations.
     */
    public static <T> CLARANS<T> fit(T[] data, Distance<T> distance, int k) {
        int n = data.length;
        // The raw heuristic rounds to 0 for small inputs and can exceed n for
        // large k; either would make the delegate throw on otherwise valid data.
        long suggested = Math.round(0.0125 * k * (n - k));
        int maxNeighbor = (int) Math.max(1L, Math.min((long) n, suggested));
        return fit(data, distance, k, maxNeighbor);
    }

    /**
     * Clusters data into k clusters.
     *
     * @param data the observations.
     * @param distance the lambda of distance measure.
     * @param k the number of clusters.
     * @param maxNeighbor the maximum number of neighbors examined during
     *                    the random search of local minima. Values below
     *                    min(100, k * (n - k)) are raised to that floor.
     * @param <T> the data type.
     * @return the model.
     * @throws IllegalArgumentException if maxNeighbor is not in [1, n], or
     *         if k is not in [1, n), where n is the number of observations.
     */
    public static <T> CLARANS<T> fit(T[] data, Distance<T> distance, int k, int maxNeighbor) {
        if (maxNeighbor <= 0) {
            throw new IllegalArgumentException("Invalid maxNeighbors: " + maxNeighbor);
        }

        int n = data.length;
        if (k <= 0) {
            throw new IllegalArgumentException("Invalid number of clusters: " + k);
        }

        if (k >= n) {
            throw new IllegalArgumentException("Too large k: " + k);
        }

        if (maxNeighbor > n) {
            throw new IllegalArgumentException("Too large maxNeighbor: " + maxNeighbor);
        }

        // Floor on the number of neighbors examined. The product is computed
        // in long arithmetic so that it cannot overflow for large n.
        long neighborhood = (long) k * (n - k);
        int minmax = (int) Math.min(100L, neighborhood);
        if (maxNeighbor < minmax) {
            maxNeighbor = minmax;
        }

        @SuppressWarnings("unchecked")
        T[] medoids = (T[]) java.lang.reflect.Array.newInstance(data.getClass().getComponentType(), k);
        int[] y = new int[n];
        double[] d = seed(data, medoids, y, distance);
        double distortion = MathEx.sum(d);

        // Scratch copies that getRandomNeighbor mutates in place; the accepted
        // state (medoids, y, d) is only overwritten when a candidate improves.
        T[] newMedoids = medoids.clone();
        int[] newY = y.clone();
        double[] newD = d.clone();

        for (int neighborCount = 1; neighborCount <= maxNeighbor; neighborCount++) {
            double randomNeighborDistortion = getRandomNeighbor(data, newMedoids, newY, newD, distance);
            if (randomNeighborDistortion < distortion) {
                // Report the distortion that was just reached, not the one being replaced.
                logger.info(String.format("Distortion reduces to %.4f after %3d random neighbors", randomNeighborDistortion, neighborCount));
                // Restart the count from the new node; the loop increment makes it 1.
                neighborCount = 0;
                distortion = randomNeighborDistortion;
                System.arraycopy(newMedoids, 0, medoids, 0, k);
                System.arraycopy(newY, 0, y, 0, n);
                System.arraycopy(newD, 0, d, 0, n);
            } else {
                // Candidate rejected: roll the scratch state back to the accepted one.
                System.arraycopy(medoids, 0, newMedoids, 0, k);
                System.arraycopy(y, 0, newY, 0, n);
                System.arraycopy(d, 0, newD, 0, n);
            }
        }

        logger.info(String.format("Final distortion: %.4f", distortion));
        return new CLARANS<>(distortion, medoids, y, distance);
    }

    /**
     * Picks a random neighbor which differs in only one medoid from the
     * current clusters. The arrays medoids, y and d are updated in place
     * to describe the neighbor.
     *
     * @param data the observations.
     * @param medoids the current medoids; one randomly chosen entry is replaced.
     * @param y the cluster labels of the observations.
     * @param d the distance of each observation to its assigned medoid.
     * @param distance the lambda of distance measure.
     * @return the sum of d after the update.
     */
    private static <T> double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d, ToDoubleBiFunction<T, T> distance) {
        int n = data.length;
        int k = medoids.length;

        int cluster = MathEx.randomInt(k);
        T medoid = getRandomMedoid(data, medoids);
        medoids[cluster] = medoid;

        // Iteration i reads medoids and writes only y[i] and d[i].
        IntStream.range(0, n).parallel().forEach(i -> {
            double dist = distance.applyAsDouble(data[i], medoid);
            if (d[i] > dist) {
                // The new medoid is closer than the current assignment.
                y[i] = cluster;
                d[i] = dist;
            } else if (y[i] == cluster) {
                // The observation belonged to the replaced medoid: start from
                // the new medoid and search the others for a closer one.
                d[i] = dist;
                for (int j = 0; j < k; j++) {
                    if (j != cluster) {
                        dist = distance.applyAsDouble(data[i], medoids[j]);
                        if (d[i] > dist) {
                            d[i] = dist;
                            y[i] = j;
                        }
                    }
                }
            }
        });

        return MathEx.sum(d);
    }

    /**
     * Picks a random observation as new medoid. Draws are repeated until the
     * drawn observation is not (by reference) one of the current medoids.
     *
     * @param data the observations.
     * @param medoids the current medoids.
     * @return an observation that is not among the current medoids.
     */
    private static <T> T getRandomMedoid(T[] data, T[] medoids) {
        int n = data.length;
        T medoid = data[MathEx.randomInt(n)];
        while (contains(medoids, medoid)) {
            medoid = data[MathEx.randomInt(n)];
        }
        return medoid;
    }

    /**
     * Returns true if the array contains the object. Elements are compared
     * by reference identity, not by equals.
     */
    private static <T> boolean contains(T[] medoids, T medoid) {
        for (T m : medoids) {
            if (m == medoid) {
                return true;
            }
        }
        return false;
    }
}