
// Source listing: com.aliasi.cluster.CompleteLinkClusterer (Maven / Gradle / Ivy)
// Artifact: aliasi-lingpipe
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.cluster;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Distance;
import com.aliasi.util.ObjectToSet;
import com.aliasi.util.ScoredObject;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
* A <code>CompleteLinkClusterer</code> implements complete link
* agglomerative clustering. Complete link clustering is a greedy
* algorithm in which the two closest clusters are always merged up to
* a specified distance threshold. Distance between clusters for
* complete link clustering is defined to be the maximum of the distances
* between the members of the clusters. See {@link
* SingleLinkClusterer} for a clusterer that takes the minimum rather
* than the maximum in making clustering decisions.
*
* For example, consider the following distance matrix (which is
* also analyzed by way of example in {@link SingleLinkClusterer}):
*
*
*
* <pre>
*      A  B  C  D  E
*   A  0  1  2  7  5
*   B     0  3  8  6
*   C        0  5  9
*   D           0  4
*   E              0
* </pre>
*
*
*
* The result of complete-link clustering is the following dendrogram:
*
*
* <pre>
*      A   B   C   D   E
*      |   |   |   |   |
* 1    -----   |   |   |
* 2      |     |   |   |
* 3      -------   |   |
* 4         |      -----
* 5         |        |
* 6         |        |
* 7         |        |
* 8         |        |
* 9         ----------
* </pre>
*
*
* <p>First, the objects <code>A</code> and <code>B</code> are merged at
* distance one. At distance 3, the object <code>C</code> is merged
* with the cluster <code>{A,B}</code> because
* <code>max(distance(C,A),distance(C,B))=3</code>, and that is
* smaller than any other pair of distances. Next, <code>D</code>
* and <code>E</code> are merged at distance 4, because that is the
* distance between them, and they are both further than 4 away from
* the cluster <code>{A,B,C}</code>. This continues with one more
* step, which happens at distance 9, the distance between <code>C</code>
* and <code>E</code>, which is the maximum distance between pairs drawn from
* <code>{A,B,C}</code> and <code>{D,E}</code>.
*
*
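* <p>The various clusters at each distance bound threshold are:
*
* <blockquote><table border='1' cellpadding='5'>
* <tr><th>Threshold Range</th><th>Clusters</th></tr>
* <tr><td><code>[Double.NEGATIVE_INFINITY,1)</code></td><td><code>{A}, {B}, {C}, {D}, {E}</code></td></tr>
* <tr><td><code>[1,3)</code></td><td><code>{A,B}, {C}, {D}, {E}</code></td></tr>
* <tr><td><code>[3,4)</code></td><td><code>{A,B,C}, {D}, {E}</code></td></tr>
* <tr><td><code>[4,9)</code></td><td><code>{A,B,C}, {D,E}</code></td></tr>
* <tr><td><code>[9,Double.POSITIVE_INFINITY]</code></td><td><code>{A,B,C,D,E}</code></td></tr>
* </table></blockquote>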
*
*
*
* The intervals show the clusters returned for thresholds within
* the specified interval. As usual, square brackets denote inclusive
* range bounds and round brackets exclusive bounds. Although this example
* has the same single-link clustering, see the class documentation in
* {@link SingleLinkClusterer} for an example that does not.
*
* Note that results may not be well-defined in the case of ties
* between proximities. If there are ties, there is no guarantee
* as to how this implementation will break the tie.
*
*
* <p><i>Implementation Note:</i> This algorithm requires worst-case
* <code>O(n<sup>3</sup>)</code> running time to cluster
* <code>n</code> elements. The initialization loop considers all pairs
* of elements, thus requiring <code>n<sup>2</sup></code>
* running time. The main loop then walks over these pairs, requiring
* up to <code>n</code> steps each to compute new distances. If the
* initial array of pairs is heavily pruned, the outer loop is smaller
* and the updates are actually smaller, too.
*
* @author Bob Carpenter
* @version 3.8.3
* @since LingPipe2.0
* @param <E> the type of objects being clustered
*/
public class CompleteLinkClusterer
extends AbstractHierarchicalClusterer {
/**
* Construct a complete link clusterer with the specified distance
* bound.
*
* @param maxDistance Maximum distance between pairs of clusters
* to allow clustering.
* @param distance Distance measure to use among instances.
*/
public CompleteLinkClusterer(double maxDistance,
Distance super E> distance) {
super(maxDistance,distance);
}
/**
* Construct a complete link clusterer with no distance bound.
* This constructor uses {@link Double#POSITIVE_INFINITY} as the
* bound value, which effectively makes clustering unbounded.
*/
public CompleteLinkClusterer(Distance super E> distance) {
this(Double.POSITIVE_INFINITY,distance);
}
@Override
public Dendrogram hierarchicalCluster(Set extends E> elementSet) {
if (elementSet.size() == 0) {
String msg = "Require non-empty set to form dendrogram."
+ " Found elementSet.size()=" + elementSet.size();
throw new IllegalArgumentException(msg);
}
if (elementSet.size() == 1)
return new LeafDendrogram(elementSet.iterator().next());
// create queue (reverse because lower is better for distances)
BoundedPriorityQueue> queue
= new BoundedPriorityQueue>(ScoredObject.reverseComparator(),
Integer.MAX_VALUE);
ObjectToSet,PairScore> index
= new ObjectToSet,PairScore>();
E[] elements = toElements(elementSet);
// required for array
@SuppressWarnings({"unchecked","rawtypes"})
LeafDendrogram[] leafs
= (LeafDendrogram[]) new LeafDendrogram[elements.length];
for (int i = 0; i < leafs.length; ++i)
leafs[i] = new LeafDendrogram(elements[i]);
double maxDistance = getMaxDistance();
for (int i = 0; i < elements.length; ++i) {
E eI = elements[i];
LeafDendrogram dI = leafs[i];
for (int j = i + 1; j < elements.length; ++j) {
E eJ = elements[j];
double score = distance().distance(eI,eJ);
// used to continue here if too large; patched in 4.0.2, but slow
LeafDendrogram dJ = leafs[j];
PairScore psIJ = new PairScore(dI,dJ,score);
queue.offer(psIJ);
index.addMember(dI,psIJ);
index.addMember(dJ,psIJ);
}
}
while (queue.size() > 0) {
PairScore next = queue.poll();
Dendrogram dendro1 = next.mDendrogram1.dereference();
Dendrogram dendro2 = next.mDendrogram2.dereference();
double dist12 = next.score();
LinkDendrogram dendro12
= new LinkDendrogram(dendro1,dendro2,dist12);
// remove & store distances to dendro1
Map,Double> distanceBuf
= new HashMap,Double>();
Set> ps3Set = index.remove(dendro1);
queue.removeAll(ps3Set);
for (PairScore ps3 : ps3Set) {
Dendrogram dendro3
= ps3.mDendrogram1 == dendro1
? ps3.mDendrogram2
: ps3.mDendrogram1;
index.get(dendro3).remove(ps3);
double dist1_3 = ps3.score();
distanceBuf.put(dendro3,Double.valueOf(dist1_3));
}
// remove & iterate over distances to dendro2
ps3Set = index.remove(dendro2);
queue.removeAll(ps3Set);
for (PairScore ps3 : ps3Set) {
Dendrogram dendro3
= ps3.mDendrogram1 == dendro2
? ps3.mDendrogram2
: ps3.mDendrogram1;
index.get(dendro3).remove(ps3);
Double dist1_3D = distanceBuf.get(dendro3);
if (dist1_3D == null) continue; // dist(dendro2,dendro3) too large
double dist1_3 = dist1_3D.doubleValue();
double dist2_3 = ps3.score();
double dist12_3 = Math.max(dist1_3,dist2_3);
PairScore ps = new PairScore(dendro12,dendro3,dist12_3);
queue.offer(ps);
index.addMember(dendro12,ps);
index.addMember(dendro3,ps);
}
// dendro3 must be linked above threshold
// by both dendro1 and dendro2
if (queue.isEmpty()) return dendro12;
}
// share following code with Single Link
Iterator> it = index.keySet().iterator();
Dendrogram dendro = it.next(); // skip first element -- self
while (it.hasNext())
dendro = new LinkDendrogram(dendro,it.next(),
Double.POSITIVE_INFINITY);
return dendro;
}
}