/*
 * Copyright (c) 2010-2025 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */
package smile.clustering;

import java.io.Serial;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Queue;
import smile.clustering.linkage.Linkage;
import smile.clustering.linkage.UPGMCLinkage;
import smile.clustering.linkage.WPGMCLinkage;
import smile.clustering.linkage.WardLinkage;
import smile.sort.IntHeapSelect;

/**
 * Agglomerative Hierarchical Clustering. Hierarchical agglomerative clustering
 * seeks to build a hierarchy of clusters in a bottom up approach: each
 * observation starts in its own cluster, and pairs of clusters are merged as
 * one moves up the hierarchy. The results of hierarchical clustering are
 * usually presented in a dendrogram.
 * <p>
 * In general, the merges are determined in a greedy manner. In order to decide
 * which clusters should be combined, a measure of dissimilarity between sets
 * of observations is required. In most methods of hierarchical clustering,
 * this is achieved by use of an appropriate metric, and a linkage criterion
 * which specifies the dissimilarity of sets as a function of the pairwise
 * distances of observations in the sets.
 * <p>
 * Hierarchical clustering has the distinct advantage that any valid measure
 * of distance can be used. In fact, the observations themselves are not
 * required: all that is used is a matrix of distances.
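 * <p>
 * A minimal usage sketch (assuming the {@code WardLinkage.of(double[][])}
 * factory to build the proximity matrix; the data values are illustrative only):
 * <pre>{@code
 * double[][] x = {
 *     {1.0, 1.1}, {1.2, 0.9}, {0.9, 1.0},
 *     {8.0, 8.2}, {8.1, 7.9}, {7.9, 8.1}
 * };
 * HierarchicalClustering model = HierarchicalClustering.fit(WardLinkage.of(x));
 * int[] labels = model.partition(2);   // cut the dendrogram into two clusters
 * }</pre>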

 * <h2>References</h2>
 * <ol>
 * <li> David Eppstein. Fast hierarchical clustering and other applications of dynamic closest pairs. SODA 1998.</li>
 * </ol>
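 * <p>
 * To make the encoding of the components below concrete, consider a
 * hypothetical run on n = 4 observations that produces
 * <pre>{@code
 * tree   = {{0, 1}, {2, 3}, {4, 5}}
 * height = {0.5, 0.7, 6.0}
 * }</pre>
 * Step 0 merges observations 0 and 1, step 1 merges observations 2 and 3,
 * and step 2 merges the clusters formed at the earlier steps 4 - 4 = 0 and
 * 5 - 4 = 1, each merge occurring at the corresponding entry of height.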
 *
 * @see Linkage
 *
 * @param tree the hierarchical cluster tree. An n-1 by 2 matrix of which
 *             row i describes the merging of clusters at step i of the
 *             clustering. If an element j in the row is less than n,
 *             then observation j was merged at this stage.
 *             If {@code j >= n} then the merge was with the cluster formed
 *             at the (earlier) stage j-n of the algorithm.
 * @param height the clustering height. A set of n-1 non-decreasing real values,
 *               which are the value of the criterion associated with the
 *               clustering method for the particular agglomeration.
 * @author Haifeng Li
 */
public record HierarchicalClustering(int[][] tree, double[] height) implements Serializable {
    @Serial
    private static final long serialVersionUID = 3L;

    /**
     * Fits the Agglomerative Hierarchical Clustering with the given linkage
     * method, which includes the proximity matrix.
     * @param linkage a linkage method to merge clusters. The linkage object
     *                includes the proximity matrix of data.
     * @return the model.
     */
    public static HierarchicalClustering fit(Linkage linkage) {
        int n = linkage.size();
        int[][] merge = new int[n - 1][2];
        int[] id = new int[n];
        double[] height = new double[n - 1];

        int[] points = new int[n];
        for (int i = 0; i < n; i++) {
            points[i] = i;
            id[i] = i;
        }

        FastPair fp = new FastPair(points, linkage);
        for (int i = 0; i < n - 1; i++) {
            height[i] = fp.getNearestPair(merge[i]);
            linkage.merge(merge[i][0], merge[i][1]); // merge clusters into one
            fp.remove(merge[i][1]);                  // drop b
            fp.updatePoint(merge[i][0]);             // and tell the closest pairs about merger

            int p = merge[i][0];
            int q = merge[i][1];
            merge[i][0] = Math.min(id[p], id[q]);
            merge[i][1] = Math.max(id[p], id[q]);
            id[p] = n + i;
        }

        // These linkages are computed on squared distances internally,
        // so take the square root to report heights on the distance scale.
        if (linkage instanceof UPGMCLinkage || linkage instanceof WPGMCLinkage || linkage instanceof WardLinkage) {
            for (int i = 0; i < height.length; i++) {
                height[i] = Math.sqrt(height[i]);
            }
        }

        return new HierarchicalClustering(merge, height);
    }

    /**
     * Cuts a tree into several groups by specifying the desired number.
     * @param k the number of clusters.
     * @return the cluster label of each sample.
     */
    public int[] partition(int k) {
        int n = tree.length + 1;
        int[] membership = new int[n];

        IntHeapSelect heap = new IntHeapSelect(k);
        for (int i = 2; i <= k; i++) {
            heap.add(tree[n - i][0]);
            heap.add(tree[n - i][1]);
        }

        for (int i = 0; i < k; i++) {
            bfs(membership, heap.get(i), i);
        }

        return membership;
    }

    /**
     * Cuts a tree into several groups by specifying the cut height.
     * @param h the cut height.
     * @return the cluster label of each sample.
     */
    public int[] partition(double h) {
        for (int i = 0; i < height.length - 1; i++) {
            if (height[i] > height[i + 1]) {
                throw new IllegalStateException("Non-monotonic cluster tree -- the linkage is probably not appropriate!");
            }
        }

        int n = tree.length + 1;
        int k = 2;
        for (; k <= n; k++) {
            if (height[n - k] < h) {
                break;
            }
        }

        if (k <= 2) {
            throw new IllegalArgumentException("The parameter h is too large.");
        }

        return partition(k - 1);
    }
    /**
     * BFS the merge tree and identify cluster membership.
     * @param membership the cluster membership array.
     * @param cluster the last merge point of cluster.
     * @param id the cluster ID.
     */
    private void bfs(int[] membership, int cluster, int id) {
        int n = tree.length + 1;

        Queue<Integer> queue = new LinkedList<>();
        queue.offer(cluster);

        for (Integer i = queue.poll(); i != null; i = queue.poll()) {
            if (i < n) {
                membership[i] = id;
                continue;
            }

            i -= n;

            int m1 = tree[i][0];
            if (m1 >= n) {
                queue.offer(m1);
            } else {
                membership[m1] = id;
            }

            int m2 = tree[i][1];
            if (m2 >= n) {
                queue.offer(m2);
            } else {
                membership[m2] = id;
            }
        }
    }
}



