All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.cluster.CompleteLinkClusterer Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.cluster;

import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Distance;
import com.aliasi.util.ObjectToSet;
import com.aliasi.util.ScoredObject;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * A CompleteLinkClusterer implements complete link
 * agglomerative clustering.  Complete link clustering is a greedy
 * algorithm in which the two closest clusters are always merged up to
 * a specified distance threshold.  Distance between clusters for
 * complete link clustering is defined be the maximum of the distances
 * between the members of the clusters.  See {@link
 * SingleLinkClusterer} for a clusterer that takes the minimum rather
 * than the maximum in making clustering decisions.
 *
 * 

For example, consider the following distance matrix (which is * also analyzed by way of example in {@link SingleLinkClusterer}): * *

* * * * * * * *
 ABCDE
A01275
B 0386
C  059
D   04
E    0
*
* * The result of complete-link clustering is the following dendrogram: * *
 *         A   B   C   D   E
 *         |   |   |   |   |
 *      1  -----   |   |   |
 *      2    |     |   |   |
 *      3    -------   |   |
 *      4       |      -----
 *      5       |        |
 *      6       |        |
 *      7       |        |
 *      8       |        |
 *      9       ----------
 * 
* *

First, the objects A and B are merged at * distance one. At distance 3, the object C is merged * with the cluster {A,B} because * max(distance(C,A),distance(C,B))=3, and that is * smaller than any other pair of distances. Next, D * and E are merged at distance 4, because that is the * distance between them, and they are both further than 4 away from * the cluster {A,B,C}. This continues with one more * step, which happens at distance 9, the distance between C * and E, which is the maximum distance between pairs drawn from * {A,B,C} and {D,E}. * *

The various clusters at each distance bound threshold are: * *

* * * * * * *
Threshold RangeClusters
[Double.NEGATIVE_INFINITY,1){A}, {B}, {C}, {D}, {E} *
[1,3) {A,B}, {C}, {D}, {E}
[3,4){A,B,C}, {D}, {E}
[4,9){A,B,C}, {D,E}
[9,Double.POSITIVE_INFINITY]{A,B,C,D,E}
*
* * The intervals show the clusters returned for thresholds within * the specified interval. As usual, square brackets denote inclusive * range bounds and round brackets exclusive bounds. Although this example * has the same single-link clustering, see the class documentation in * {@link SingleLinkClusterer} for an example that does not. * *

Note that results may not be well-defined in the case of ties * between proximities. If there are ties, there is no guarantee * as to how this implementation will break the tie. * *

Implementation Note: This algorithm requires worst-case * O(n3) running time to cluster * n elements. The initialization loop considers all pairs * of elements, thus requiring n2 * running time. The main loop then walks over these pairs, requiring * up to n steps each to compute new distances. If the * initial array of pairs is heavily pruned, the outer loop is smaller * and the updates are actually smaller, too. * * @author Bob Carpenter * @version 3.8.3 * @since LingPipe2.0 * @param the type of objects being clustered */ public class CompleteLinkClusterer extends AbstractHierarchicalClusterer { /** * Construct a complete link clusterer with the specified distance * bound. * * @param maxDistance Maximum distance between pairs of clusters * to allow clustering. * @param distance Distance measure to use among instances. */ public CompleteLinkClusterer(double maxDistance, Distance distance) { super(maxDistance,distance); } /** * Construct a complete link clusterer with no distance bound. * This constructor uses {@link Double#POSITIVE_INFINITY} as the * bound value, which effectively makes clustering unbounded. */ public CompleteLinkClusterer(Distance distance) { this(Double.POSITIVE_INFINITY,distance); } @Override public Dendrogram hierarchicalCluster(Set elementSet) { if (elementSet.size() == 0) { String msg = "Require non-empty set to form dendrogram." 
+ " Found elementSet.size()=" + elementSet.size(); throw new IllegalArgumentException(msg); } if (elementSet.size() == 1) return new LeafDendrogram(elementSet.iterator().next()); // create queue (reverse because lower is better for distances) BoundedPriorityQueue> queue = new BoundedPriorityQueue>(ScoredObject.reverseComparator(), Integer.MAX_VALUE); ObjectToSet,PairScore> index = new ObjectToSet,PairScore>(); E[] elements = toElements(elementSet); // required for array @SuppressWarnings({"unchecked","rawtypes"}) LeafDendrogram[] leafs = (LeafDendrogram[]) new LeafDendrogram[elements.length]; for (int i = 0; i < leafs.length; ++i) leafs[i] = new LeafDendrogram(elements[i]); double maxDistance = getMaxDistance(); for (int i = 0; i < elements.length; ++i) { E eI = elements[i]; LeafDendrogram dI = leafs[i]; for (int j = i + 1; j < elements.length; ++j) { E eJ = elements[j]; double score = distance().distance(eI,eJ); // used to continue here if too large; patched in 4.0.2, but slow LeafDendrogram dJ = leafs[j]; PairScore psIJ = new PairScore(dI,dJ,score); queue.offer(psIJ); index.addMember(dI,psIJ); index.addMember(dJ,psIJ); } } while (queue.size() > 0) { PairScore next = queue.poll(); Dendrogram dendro1 = next.mDendrogram1.dereference(); Dendrogram dendro2 = next.mDendrogram2.dereference(); double dist12 = next.score(); LinkDendrogram dendro12 = new LinkDendrogram(dendro1,dendro2,dist12); // remove & store distances to dendro1 Map,Double> distanceBuf = new HashMap,Double>(); Set> ps3Set = index.remove(dendro1); queue.removeAll(ps3Set); for (PairScore ps3 : ps3Set) { Dendrogram dendro3 = ps3.mDendrogram1 == dendro1 ? ps3.mDendrogram2 : ps3.mDendrogram1; index.get(dendro3).remove(ps3); double dist1_3 = ps3.score(); distanceBuf.put(dendro3,Double.valueOf(dist1_3)); } // remove & iterate over distances to dendro2 ps3Set = index.remove(dendro2); queue.removeAll(ps3Set); for (PairScore ps3 : ps3Set) { Dendrogram dendro3 = ps3.mDendrogram1 == dendro2 ? 
ps3.mDendrogram2 : ps3.mDendrogram1; index.get(dendro3).remove(ps3); Double dist1_3D = distanceBuf.get(dendro3); if (dist1_3D == null) continue; // dist(dendro2,dendro3) too large double dist1_3 = dist1_3D.doubleValue(); double dist2_3 = ps3.score(); double dist12_3 = Math.max(dist1_3,dist2_3); PairScore ps = new PairScore(dendro12,dendro3,dist12_3); queue.offer(ps); index.addMember(dendro12,ps); index.addMember(dendro3,ps); } // dendro3 must be linked above threshold // by both dendro1 and dendro2 if (queue.isEmpty()) return dendro12; } // share following code with Single Link Iterator> it = index.keySet().iterator(); Dendrogram dendro = it.next(); // skip first element -- self while (it.hasNext()) dendro = new LinkDendrogram(dendro,it.next(), Double.POSITIVE_INFINITY); return dendro; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy