com.aliasi.cluster.SingleLinkClusterer Maven / Gradle / Ivy
Show all versions of aliasi-lingpipe Show documentation
/* * LingPipe v. 4.1.0 * Copyright (C) 2003-2011 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */ package com.aliasi.cluster; import com.aliasi.util.Distance; import com.aliasi.util.ScoredObject; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; /** * A
 * <code>SingleLinkClusterer</code> implements standard single-link
 * agglomerative clustering.  Single link clustering is a greedy
 * algorithm in which the two closest clusters are always merged
 * up to a specified distance threshold.  Distance between clusters for
 * single link clustering is defined to be the minimum of the distances
 * between the members of the clusters.  See {@link CompleteLinkClusterer}
 * for a clusterer that takes the maximum rather than the minimum in
 * making clustering decisions.
 *
 * <p>For example, consider the following proximity matrix
 * representing distances between pairs of elements (the same example
 * is used in {@link CompleteLinkClusterer}):
 *
 * <blockquote><pre>
 *       A   B   C   D   E
 *   A   0   1   2   7   5
 *   B       0   3   8   6
 *   C           0   5   9
 *   D               0   4
 *   E                   0
 * </pre></blockquote>
 *
 * The result of single-link clustering is the following dendrogram (pardon
 * the ASCII art):
 *
 * <blockquote><pre>
 *        A   B   C   D   E
 *        |   |   |   |   |
 *   1    -----   |   |   |
 *   2      -------   |   |
 *   3         |      |   |
 *   4         |      -----
 *   5         ----------
 * </pre></blockquote>
 *
 * <p>First, the objects <code>A</code> and <code>B</code> are merged
 * at distance one.  Then object <code>C</code> is merged into the
 * cluster at distance 2, because that is its distance from
 * <code>A</code>.  Objects <code>D</code> and <code>E</code> are
 * merged next at distance 4, and finally, the two big clusters are
 * merged at distance 5, the distance between <code>A</code> and
 * <code>E</code>.
 *
 * <p>The various clusters at each proximity bound threshold are:
 *
 * <blockquote><table border="1" cellpadding="5">
 * <tr><th>Threshold Range</th><th>Clusters</th></tr>
 * <tr><td>[Double.NEGATIVE_INFINITY,1)</td><td>{A}, {B}, {C}, {D}, {E}</td></tr>
 * <tr><td>[1,2)</td><td>{A,B}, {C}, {D}, {E}</td></tr>
 * <tr><td>[2,4)</td><td>{A,B,C}, {D}, {E}</td></tr>
 * <tr><td>[4,5)</td><td>{A,B,C}, {D,E}</td></tr>
 * <tr><td>[5,Double.POSITIVE_INFINITY]</td><td>{A,B,C,D,E}</td></tr>
 * </table></blockquote>
 *
 * The intervals show the clusters returned for thresholds within
 * the specified interval.  As usual, square brackets denote inclusive
 * range bounds and round brackets exclusive bounds.  For instance,
 * if the distance threshold is 1.5, four clusters are returned,
 * whereas if the threshold is 4.0, two clusters are returned.
 *
 * <p>Note that this example has the same clusters as that in
 * {@link CompleteLinkClusterer}.  A simple example where the results are
 * different is the result of clustering four points on the x-axis,
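 * <code>x<sub>1</sub>=1</code>, <code>x<sub>2</sub>=3</code>,
 * <code>x<sub>3</sub>=6</code>, <code>x<sub>4</sub>=10</code>.
 * The distance is computed by subtraction, e.g.
 * <code>d(x<sub>2</sub>, x<sub>4</sub>) = x<sub>4</sub> - x<sub>2</sub></code>.
 * This leads to the following distance matrix:
 *
 * <blockquote><pre>
 *        x1  x2  x3  x4
 *   x1   0   2   5   9
 *   x2       0   3   7
 *   x3           0   4
 *   x4               0
 * </pre></blockquote>
 *
 * For this example, the complete link clustering is
 *
 * <blockquote><pre>
 * {{x1,x2},{x3,x4}}
 * </pre></blockquote>
 *
 * whereas the single-link clustering is:
 *
 * <blockquote><pre>
 * {{{x1,x2},x3},x4}
 * </pre></blockquote>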
 *
 * <p><i>Implementation Note:</i> This clusterer is implemented using
 * the minimum-spanning-tree-based algorithm described in:
 *
 * <ul><li>
 * Jain, Anil K. and Richard C. Dubes.  1988.  <i>Algorithms for Clustering
 * Data</i>.  Prentice-Hall.
 * </li></ul>
 *
 * The minimum spanning tree is constructed using Kruskal's algorithm,
 * which is described in:
 *
 * <ul><li>
 * Cormen, Thomas H., Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.
 * 2001.
 * <i>Introduction to Algorithms</i>.  MIT Press.
 * </li></ul>
 *
 * This algorithm requires an efficient lookup of whether two
 * nodes are already in the same connected component, which
 * is implemented by the standard disjoint set algorithm with
 * path compression (also described in Cormen et al.).
 *
 * <p>In brief, the algorithm sorts all pairs of objects in order of
 * increasing distance (with <code>n</code> objects that involves
 * sorting <code>n<sup>2</sup></code> pairs of objects),
 * leading to an initial step with worst case time complexity bound
 * <code>O(n<sup>2</sup> log n)</code>
. The pairs are then
 * merged in order of increasing distance.  The sets involved in
 * Kruskal's algorithm translate directly into dendrograms with their
 * dereferencing.
 *
 * @author  Bob Carpenter
 * @version 4.1.0
 * @since   LingPipe2.0
 * @param <E> the type of object being clustered
 */
public class SingleLinkClusterer<E> extends AbstractHierarchicalClusterer<E> {

    /**
     * Construct a single-link clusterer with the specified distance
     * bound.
     *
     * @param maxDistance Maximum distance for clusters.
     * @param distance Distance measure between objects to cluster.
     * @throws IllegalArgumentException If the specified bound is not
     * a non-negative number.
     */
    public SingleLinkClusterer(double maxDistance,
                               Distance<? super E> distance) {
        super(maxDistance,distance);
    }

    /**
     * Construct a single-link clusterer with no distance bound.
     * The distance bound is set to {@link Double#POSITIVE_INFINITY},
     * which effectively removes the distance bound.
     *
     * @param distance Distance measure between objects to cluster.
     */
    public SingleLinkClusterer(Distance<? super E> distance) {
        this(Double.POSITIVE_INFINITY,distance);
    }

    /**
     * Return the dendrogram derived from the specified elements by
     * performing single-link clustering up to this clusterer's
     * maximum distance bound.  Pairs of elements are linked in order
     * of increasing distance until a pair exceeds the bound; any
     * clusters still unlinked at that point are joined to one another
     * at distance {@link Double#POSITIVE_INFINITY}, so that a single
     * dendrogram covering every element is always returned.  A
     * one-element set yields a leaf dendrogram.
     *
     * @param elementSet Set of elements to cluster.
     * @return Dendrogram over all of the specified elements.
     * @throws IllegalArgumentException If the set of elements is empty.
     */
    @Override
    public Dendrogram<E> hierarchicalCluster(Set<? extends E> elementSet) {
        if (elementSet.isEmpty()) {
            String msg = "Require non-empty set to form dendrogram."
                + " Found elementSet.size()=" + elementSet.size();
            throw new IllegalArgumentException(msg);
        }
        if (elementSet.size() == 1) {
            return new LeafDendrogram<E>(elementSet.iterator().next());
        }

        Object[] elements = toElements(elementSet);

        // need array so identity is shared among leaf dendros
        // require suppression for generic array creation
        @SuppressWarnings({"unchecked","rawtypes"})
        LeafDendrogram<E>[] leafs
            = (LeafDendrogram<E>[]) new LeafDendrogram[elements.length];
        for (int i = 0; i < leafs.length; ++i) {
            // cast required because elements come from an Object[]
            @SuppressWarnings("unchecked")
            E elt = (E) elements[i];
            leafs[i] = new LeafDendrogram<E>(elt);
        }

        // current set of top-level clusters; starts as one leaf per element
        Set<Dendrogram<E>> clusters
            = new HashSet<Dendrogram<E>>(elements.length);
        for (Dendrogram<E> dendrogram : leafs) {
            clusters.add(dendrogram);
        }

        // compute ordered list of pairs of elements to merge
        ArrayList<PairScore<E>> pairScoreList
            = new ArrayList<PairScore<E>>();
        int len = elements.length;
        double maxDistance = getMaxDistance();
        for (int i = 0; i < len; ++i) {
            // cast required because elements come from an Object[]
            @SuppressWarnings("unchecked")
            E eI = (E) elements[i];
            Dendrogram<E> dendroI = leafs[i];
            for (int j = i + 1; j < len; ++j) {
                // cast required because elements come from an Object[]
                @SuppressWarnings("unchecked")
                E eJ = (E) elements[j];
                double distanceIJ = distance().distance(eI,eJ);
                // NOTE: pairs farther apart than maxDistance are deliberately
                // not skipped here; the original author recorded that doing so
                // "speeds up, but strands elements".
                Dendrogram<E> dendroJ = leafs[j];
                pairScoreList.add(new PairScore<E>(dendroI,dendroJ,distanceIJ));
            }
        }

        // require suppression for generic array creation
        @SuppressWarnings({"unchecked","rawtypes"})
        PairScore<E>[] pairScores
            = (PairScore<E>[]) new PairScore[pairScoreList.size()];
        pairScoreList.toArray(pairScores);
        Arrays.sort(pairScores,ScoredObject.comparator());  // increasing order of distance

        // Kruskal-style pass: take pairs closest-first, stopping once
        // everything is in one cluster or the distance bound is exceeded
        for (PairScore<E> ps : pairScores) {
            if (clusters.size() <= 1) {
                break;
            }
            if (ps.score() > maxDistance) {
                break;
            }
            // dereference() is relied on to give the current top-level
            // dendrogram containing each leaf (the disjoint-set lookup)
            Dendrogram<E> d1 = ps.mDendrogram1.dereference();
            Dendrogram<E> d2 = ps.mDendrogram2.dereference();
            if (d1.equals(d2)) {
                continue; // already linked
            }
            clusters.remove(d1);
            clusters.remove(d2);
            LinkDendrogram<E> dLink
                = new LinkDendrogram<E>(d1,d2,ps.mScore);
            clusters.add(dLink);
        }

        // link up remaining unlinked dendros at +infinity distance
        Iterator<Dendrogram<E>> it = clusters.iterator();
        Dendrogram<E> dendro = it.next(); // first cluster seeds the chain
        while (it.hasNext()) {
            dendro = new LinkDendrogram<E>(dendro,it.next(),
                                           Double.POSITIVE_INFINITY);
        }
        return dendro;
    }

}