/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.clustering;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;
import smile.math.MathEx;
import smile.math.distance.Distance;
import smile.math.distance.EuclideanDistance;
import smile.neighbor.LinearSearch;
import smile.neighbor.Neighbor;
import smile.neighbor.RNNSearch;
/**
* Non-parametric Minimum Conditional Entropy Clustering. This method performs
* very well especially when the exact number of clusters is unknown.
* The method can also correctly reveal the structure of data and effectively
* identify outliers simultaneously.
*
* The clustering criterion is based on the conditional entropy H(C | x), where
* C is the cluster label and x is an observation. According to Fano's
* inequality, we can estimate C with a low probability of error only if the
* conditional entropy H(C | X) is small. MEC also generalizes the criterion
* by replacing Shannon's entropy with Havrda-Charvat's structural
* α-entropy. Interestingly, the minimum entropy criterion based
* on structural α-entropy is equal to the probability error of the
* nearest neighbor method when α = 2. To estimate p(C | x), MEC employs
* Parzen density estimation, a nonparametric approach.
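*
* In terms of what this implementation computes (a restatement of the code
* below, not a formula quoted from the paper), the Shannon-entropy criterion
* is estimated over the n observations as
* H(C | X) ≈ Σ_i p(x_i) · [-Σ_c (n_ic / n_i) log2(n_ic / n_i)],
* where n_i is the size of the radius neighborhood of x_i, n_ic is the number
* of those neighbors currently assigned to cluster c, and p(x_i) = n_i / n is
* the Parzen window density estimate.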
*
* MEC is an iterative algorithm starting with an initial partition given by
* any other clustering method, e.g. k-means, CLARANS, hierarchical clustering,
* etc. Note that a random initialization is NOT appropriate.
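*
* A minimal usage sketch (the toy data, cluster count, and radius below are
* illustrative assumptions, not values from the paper):
* <pre>{@code
* double[][] data = {{0, 0}, {0, 1}, {1, 0}, {9, 9}, {9, 10}, {10, 9}};
* MEC<double[]> model = MEC.fit(data, new EuclideanDistance(), 2, 2.0);
* int[] labels = model.y;                              // label of each observation
* int cluster = model.predict(new double[]{0.5, 0.5}); // may return OUTLIER
* }</pre>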
*
* <h2>References</h2>
* <ol>
* <li> Haifeng Li, Keshu Zhang, and Tao Jiang. Minimum Entropy Clustering and Applications to Gene Expression Analysis. CSB, 2004.</li>
* </ol>
*
* @author Haifeng Li
*/
public class MEC<T> extends PartitionClustering implements Comparable<MEC<T>> {
private static final long serialVersionUID = 2L;
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MEC.class);
/**
* The conditional entropy as the objective function.
*/
public final double entropy;
/**
* The range of neighborhood.
*/
public final double radius;
/**
* The neighborhood search data structure.
*/
private final RNNSearch<T, T> nns;
/**
* Constructor.
* @param entropy the conditional entropy of clusters.
* @param radius the neighborhood radius.
* @param nns the data structure for neighborhood search.
* @param k the number of clusters.
* @param y the cluster labels.
*/
public MEC(double entropy, double radius, RNNSearch<T, T> nns, int k, int[] y) {
super(k, y);
this.entropy = entropy;
this.radius = radius;
this.nns = nns;
}
@Override
public int compareTo(MEC<T> o) {
return Double.compare(entropy, o.entropy);
}
/**
* Clustering the data.
* @param data the observations.
* @param distance the distance function.
* @param k the number of clusters. Note that this is just a hint. The final
* number of clusters may be less.
* @param radius the neighborhood radius.
* @param <T> the data type.
* @return the model.
*/
public static <T> MEC<T> fit(T[] data, Distance<T> distance, int k, double radius) {
if (k < 2) {
throw new IllegalArgumentException("Invalid k: " + k);
}
if (radius <= 0.0) {
throw new IllegalArgumentException("Invalid radius: " + radius);
}
// Initialize clusters with KMeans/CLARANS.
int[] y;
if (data instanceof double[][] && distance instanceof EuclideanDistance) {
KMeans kmeans = KMeans.fit((double[][]) data, k);
y = kmeans.y;
} else {
CLARANS clarans = CLARANS.fit(data, distance, k);
y = clarans.y;
}
return fit(data, LinearSearch.of(data, distance), k, radius, y, 1E-4);
}
/**
* Clustering the data.
* @param data the observations.
* @param nns the neighborhood search data structure.
* @param k the number of clusters. Note that this is just a hint. The final
* number of clusters may be less.
* @param radius the neighborhood radius.
* @param y the initial clustering labels, which could be produced by any
* other clustering methods.
* @param tol the tolerance of convergence test.
* @param <T> the data type.
* @return the model.
*/
public static <T> MEC<T> fit(T[] data, RNNSearch<T, T> nns, int k, double radius, int[] y, double tol) {
if (k < 2) {
throw new IllegalArgumentException("Invalid k: " + k);
}
if (radius <= 0.0) {
throw new IllegalArgumentException("Invalid radius: " + radius);
}
int n = data.length;
// The density of each observation.
double[] px = new double[n];
// Neighbors of each observation.
int[][] neighbors = new int[n][];
logger.info("Estimating the probabilities ...");
IntStream stream = IntStream.range(0, n);
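// Parallelize the outer loop only when the search structure is not a
// LinearSearch, presumably because a linear scan already saturates the
// cores on its own and nesting parallel streams would not help.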
if (!(nns instanceof LinearSearch)) {
stream = stream.parallel();
}
stream.forEach(i -> {
ArrayList<Neighbor<T, T>> list = new ArrayList<>();
// Add the point itself to the neighborhood
// This is important to estimate posterior probability
// and also avoid empty neighborhood.
list.add(Neighbor.of(data[i], i, 0.0));
nns.search(data[i], radius, list);
int[] neighborhood = new int[list.size()];
neighbors[i] = neighborhood;
for (int j = 0; j < list.size(); j++) {
neighborhood[j] = list.get(j).index;
}
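// Parzen window estimate of the density at x_i: the fraction of
// observations within the radius, the point itself included.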
px[i] = (double) list.size() / n;
});
// Initialize the posterior probabilities.
// The number of observations in each cluster in the neighborhood.
int[][] size = new int[n][k];
// The most significant cluster in the neighborhood.
int[] dominantCluster = new int[n];
IntStream.range(0, n).parallel().forEach(i -> {
for (int j : neighbors[i]) {
size[i][y[j]]++;
}
});
IntStream.range(0, n).parallel().forEach(i -> {
int max = 0;
for (int j = 0; j < k; j++) {
if (size[i][j] > max) {
dominantCluster[i] = j;
max = size[i][j];
}
}
});
double entropy = entropy(k, neighbors, size, px);
logger.info(String.format("Entropy after initialization: %.4f", entropy));
double diff = Double.MAX_VALUE;
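// Iteratively move each observation to the dominant cluster of its
// neighborhood whenever the move lowers the entropy contribution of the
// affected neighborhoods, until the improvement drops below the tolerance.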
for (int iter = 1; diff > tol; iter++) {
for (int i = 0; i < n; i++) {
if (dominantCluster[i] != y[i]) {
double oldMutual = 0.0;
double newMutual = 0.0;
for (int neighbor : neighbors[i]) {
double nk = neighbors[neighbor].length;
double r1 = (double) size[neighbor][y[i]] / nk;
double r2 = (double) size[neighbor][dominantCluster[i]] / nk;
if (r1 > 0) {
oldMutual -= r1 * MathEx.log2(r1) * px[neighbor];
}
if (r2 > 0) {
oldMutual -= r2 * MathEx.log2(r2) * px[neighbor];
}
r1 = (size[neighbor][y[i]] - 1.0) / nk;
r2 = (size[neighbor][dominantCluster[i]] + 1.0) / nk;
if (r1 > 0) {
newMutual -= r1 * MathEx.log2(r1) * px[neighbor];
}
if (r2 > 0) {
newMutual -= r2 * MathEx.log2(r2) * px[neighbor];
}
}
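// Accept the reassignment only if it strictly lowers the weighted
// entropy of the neighborhoods that contain x_i.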
if (newMutual < oldMutual) {
for (int neighbor : neighbors[i]) {
--size[neighbor][y[i]];
++size[neighbor][dominantCluster[i]];
int mi = dominantCluster[i];
int mk = dominantCluster[neighbor];
if (size[neighbor][mi] > size[neighbor][mk]) {
dominantCluster[neighbor] = dominantCluster[i];
}
}
y[i] = dominantCluster[i];
}
}
}
double prevObj = entropy;
entropy = entropy(k, neighbors, size, px);
diff = prevObj - entropy;
logger.info(String.format("Entropy after %3d iterations: %.4f", iter, entropy));
}
// Collapse clusters by removing clusters with no samples.
int[] clusterSize = new int[k];
for (int i = 0; i < n; i++) {
clusterSize[y[i]]++;
}
// Reuse clusterSize as the index of new cluster id.
int K = 0;
for (int i = 0, j = 0; i < k; i++) {
if (clusterSize[i] > 0) {
K++;
clusterSize[i] = j++;
}
}
for (int i = 0; i < n; i++) {
y[i] = clusterSize[y[i]];
}
return new MEC<>(entropy, radius, nns, K, y);
}
/**
* Cluster a new instance.
* @param x a new instance.
* @return the cluster label. Note that it may be {@link #OUTLIER}.
*/
public int predict(T x) {
List<Neighbor<T, T>> neighbors = new ArrayList<>();
nns.search(x, radius, neighbors);
if (neighbors.isEmpty()) {
return OUTLIER;
}
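// Majority vote over the cluster labels of the neighbors within the radius.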
int[] label = new int[k];
for (Neighbor<T, T> neighbor : neighbors) {
int yi = y[neighbor.index];
label[yi]++;
}
return MathEx.whichMax(label);
}
@Override
public String toString() {
return String.format("Cluster entropy: %.5f%n", entropy) + super.toString();
}
/** Calculates the entropy. */
private static double entropy(int k, int[][] neighbors, int[][] size, double[] px) {
return IntStream.range(0, neighbors.length).parallel().mapToDouble(i -> {
double conditionalEntropy = 0.0;
int ni = neighbors[i].length;
int[] ci = size[i];
for (int j = 0; j < k; j++) {
if (ci[j] > 0) {
double r = ((double) ci[j]) / ni;
conditionalEntropy -= r * MathEx.log2(r);
}
}
conditionalEntropy *= px[i];
return conditionalEntropy;
}).sum();
}
}