All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.MEC Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see .
 */

package smile.clustering;

import java.io.Serial;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;

import smile.math.MathEx;
import smile.math.distance.Distance;
import smile.math.distance.EuclideanDistance;
import smile.neighbor.LinearSearch;
import smile.neighbor.Neighbor;
import smile.neighbor.RNNSearch;

/**
 * Non-parametric Minimum Conditional Entropy Clustering. This method performs
 * very well especially when the exact number of clusters is unknown.
 * The method can also correctly reveal the structure of data and effectively
 * identify outliers simultaneously.
 * 

* The clustering criterion is based on the conditional entropy H(C | x), where * C is the cluster label and x is an observation. According to Fano's * inequality, we can estimate C with a low probability of error only if the * conditional entropy H(C | X) is small. MEC also generalizes the criterion * by replacing Shannon's entropy with Havrda-Charvat's structural * α-entropy. Interestingly, the minimum entropy criterion based * on structural α-entropy is equal to the probability error of the * nearest neighbor method when α= 2. To estimate p(C | x), MEC employs * Parzen density estimation, a nonparametric approach. *

* MEC is an iterative algorithm starting with an initial partition given by * any other clustering methods, e.g. k-means, CLARNAS, hierarchical clustering, * etc. Note that a random initialization is NOT appropriate. * *

References

*
    *
  1. Haifeng Li. All rights reserved., Keshu Zhang, and Tao Jiang. Minimum Entropy Clustering and Applications to Gene Expression Analysis. CSB, 2004.
  2. *
* * @param the data type of model input objects. * * @author Haifeng Li */ public class MEC extends PartitionClustering implements Comparable> { @Serial private static final long serialVersionUID = 2L; private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MEC.class); /** * The conditional entropy as the objective function. */ public final double entropy; /** * The range of neighborhood. */ public final double radius; /** * The neighborhood search data structure. */ private final RNNSearch nns; /** * Constructor. * @param entropy the conditional entropy of clusters. * @param radius the neighborhood radius. * @param nns the data structure for neighborhood search. * @param k the number of clusters. * @param y the cluster labels. */ public MEC(double entropy, double radius, RNNSearch nns, int k, int[] y) { super(k, y); this.entropy = entropy; this.radius = radius; this.nns = nns; } @Override public int compareTo(MEC o) { return Double.compare(entropy, o.entropy); } /** * Clustering the data. * @param data the observations. * @param distance the distance function. * @param k the number of clusters. Note that this is just a hint. The final * number of clusters may be less. * @param radius the neighborhood radius. * @param the data type. * @return the model. */ public static MEC fit(T[] data, Distance distance, int k, double radius) { if (k < 2) { throw new IllegalArgumentException("Invalid k: " + k); } if (radius <= 0.0) { throw new IllegalArgumentException("Invalid radius: " + radius); } // Initialize clusters with KMeans/CLARANS. int[] y; if (data instanceof double[][] matrix && distance instanceof EuclideanDistance) { KMeans kmeans = KMeans.fit(matrix, k); y = kmeans.y; } else { CLARANS clarans = CLARANS.fit(data, distance, k); y = clarans.y; } return fit(data, LinearSearch.of(data, distance), k, radius, y, 1E-4); } /** * Clustering the data. * @param data the observations. * @param nns the neighborhood search data structure. * @param k the number of clusters. Note that this is just a hint. The final * number of clusters may be less. * @param radius the neighborhood radius. * @param y the initial clustering labels, which could be produced by any * other clustering methods. * @param tol the tolerance of convergence test. * @param the data type. * @return the model. */ public static MEC fit(T[] data, RNNSearch nns, int k, double radius, int[] y, double tol) { if (k < 2) { throw new IllegalArgumentException("Invalid k: " + k); } if (radius <= 0.0) { throw new IllegalArgumentException("Invalid radius: " + radius); } int n = data.length; // The density of each observation. double[] px = new double[n]; // Neighbors of each observation. int[][] neighbors = new int[n][]; logger.info("Estimating the probabilities ..."); IntStream stream = IntStream.range(0, n); if (!(nns instanceof LinearSearch)) { stream = stream.parallel(); } stream.forEach(i -> { ArrayList> list = new ArrayList<>(); // Add the point itself to the neighborhood // This is important to estimate posterior probability // and also avoid empty neighborhood. list.add(Neighbor.of(data[i], i, 0.0)); nns.search(data[i], radius, list); int[] neighborhood = new int[list.size()]; neighbors[i] = neighborhood; for (int j = 0; j < list.size(); j++) { neighborhood[j] = list.get(j).index; } px[i] = (double) list.size() / n; }); // Initialize a posterior probabilities. // The number of observations in each cluster in the neighborhood. int[][] size = new int[n][k]; // The most significant cluster in the neighborhood. int[] dominantCluster = new int[n]; IntStream.range(0, n).parallel().forEach(i -> { for (int j : neighbors[i]) { size[i][y[j]]++; } }); IntStream.range(0, n).parallel().forEach(i -> { int max = 0; for (int j = 0; j < k; j++) { if (size[i][j] > max) { dominantCluster[i] = j; max = size[i][j]; } } }); double entropy = entropy(k, neighbors, size, px); logger.info("Entropy after initialization: {}", entropy); double diff = Double.MAX_VALUE; for (int iter = 1; diff > tol; iter++) { for (int i = 0; i < n; i++) { if (dominantCluster[i] != y[i]) { double oldMutual = 0.0; double newMutual = 0.0; for (int neighbor : neighbors[i]) { double nk = neighbors[neighbor].length; double r1 = (double) size[neighbor][y[i]] / nk; double r2 = (double) size[neighbor][dominantCluster[i]] / nk; if (r1 > 0) { oldMutual -= r1 * MathEx.log2(r1) * px[neighbor]; } if (r2 > 0) { oldMutual -= r2 * MathEx.log2(r2) * px[neighbor]; } r1 = (size[neighbor][y[i]] - 1.0) / nk; r2 = (size[neighbor][dominantCluster[i]] + 1.0) / nk; if (r1 > 0) { newMutual -= r1 * MathEx.log2(r1) * px[neighbor]; } if (r2 > 0) { newMutual -= r2 * MathEx.log2(r2) * px[neighbor]; } } if (newMutual < oldMutual) { for (int neighbor : neighbors[i]) { --size[neighbor][y[i]]; ++size[neighbor][dominantCluster[i]]; int mi = dominantCluster[i]; int mk = dominantCluster[neighbor]; if (size[neighbor][mi] > size[neighbor][mk]) { dominantCluster[neighbor] = dominantCluster[i]; } } y[i] = dominantCluster[i]; } } } double prevObj = entropy; entropy = entropy(k, neighbors, size, px); diff = prevObj - entropy; logger.info("Entropy after {} iterations: {}", iter, entropy); } // Collapse clusters by removing clusters with no samples. int[] clusterSize = new int[k]; for (int i = 0; i < n; i++) { clusterSize[y[i]]++; } // Reuse clusterSize as the index of new cluster id. int K = 0; for (int i = 0, j = 0; i < k; i++) { if (clusterSize[i] > 0) { K++; clusterSize[i] = j++; } } for (int i = 0; i < n; i++) { y[i] = clusterSize[y[i]]; } return new MEC<>(entropy, radius, nns, K, y); } /** * Cluster a new instance. * @param x a new instance. * @return the cluster label. Note that it may be {@link #OUTLIER}. */ public int predict(T x) { List> neighbors = new ArrayList<>(); nns.search(x, radius, neighbors); if (neighbors.isEmpty()) { return OUTLIER; } int[] label = new int[k]; for (Neighbor neighbor : neighbors) { int yi = y[neighbor.index]; label[yi]++; } return MathEx.whichMax(label); } @Override public String toString() { return String.format("Cluster entropy: %.5f%n", entropy) + super.toString(); } /** Calculates the entropy. */ private static double entropy(int k, int[][] neighbors, int[][] size, double[] px) { return IntStream.range(0, neighbors.length).parallel().mapToDouble(i -> { double conditionalEntropy = 0.0; int ni = neighbors[i].length; int[] ci = size[i]; for (int j = 0; j < k; j++) { if (ci[j] > 0) { double r = ((double) ci[j]) / ni; conditionalEntropy -= r * MathEx.log2(r); } } conditionalEntropy *= px[i]; return conditionalEntropy; }).sum(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy