com.aliasi.cluster.CompleteLinkClusterer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.cluster;

import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Distance;
import com.aliasi.util.ObjectToSet;
import com.aliasi.util.ScoredObject;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * A CompleteLinkClusterer implements complete link
 * agglomerative clustering.  Complete link clustering is a greedy
 * algorithm in which the two closest clusters are always merged up to
 * a specified distance threshold.  Distance between clusters for
 * complete link clustering is defined to be the maximum of the distances
 * between the members of the clusters.  See {@link
 * SingleLinkClusterer} for a clusterer that takes the minimum rather
 * than the maximum in making clustering decisions.
 *
 * For example, consider the following distance matrix (which is
 * also analyzed by way of example in {@link SingleLinkClusterer}):
 *
 * <blockquote><pre>
 *     A  B  C  D  E
 * A   0  1  2  7  5
 * B      0  3  8  6
 * C         0  5  9
 * D            0  4
 * E               0
 * </pre></blockquote>
 *
 * The result of complete-link clustering is the following dendrogram:
 *
 * <blockquote><pre>
 *         A   B   C   D   E
 *         |   |   |   |   |
 *      1  -----   |   |   |
 *      2    |     |   |   |
 *      3    -------   |   |
 *      4       |      -----
 *      5       |        |
 *      6       |        |
 *      7       |        |
 *      8       |        |
 *      9       ----------
 * </pre></blockquote>
 *
 * First, the objects A and B are merged at
 * distance one.  At distance 3, the object C is merged
 * with the cluster {A,B} because
 * max(distance(C,A),distance(C,B))=3, and that is
 * smaller than any other pair of distances.  Next, D
 * and E are merged at distance 4, because that is the
 * distance between them, and they are both further than 4 away from
 * the cluster {A,B,C}.  This continues with one more
 * step, which happens at distance 9, the distance between C
 * and E, which is the maximum distance between pairs drawn from
 * {A,B,C} and {D,E}.
 *
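 * <p>The various clusters at each distance bound threshold are:
 *
 * <blockquote><table border="1" cellpadding="5">
 * <tr><th>Threshold Range</th><th>Clusters</th></tr>
 * <tr><td><code>[Double.NEGATIVE_INFINITY,1)</code></td>
 *     <td><code>{A}, {B}, {C}, {D}, {E}</code></td></tr>
 * <tr><td><code>[1,3)</code></td>
 *     <td><code>{A,B}, {C}, {D}, {E}</code></td></tr>
 * <tr><td><code>[3,4)</code></td>
 *     <td><code>{A,B,C}, {D}, {E}</code></td></tr>
 * <tr><td><code>[4,9)</code></td>
 *     <td><code>{A,B,C}, {D,E}</code></td></tr>
 * <tr><td><code>[9,Double.POSITIVE_INFINITY]</code></td>
 *     <td><code>{A,B,C,D,E}</code></td></tr>
 * </table></blockquote>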
 *
 * The intervals show the clusters returned for thresholds within
 * the specified interval.  As usual, square brackets denote inclusive
 * range bounds and round brackets exclusive bounds.  Although this example
 * has the same single-link clustering, see the class documentation in
 * {@link SingleLinkClusterer} for an example that does not.
 *
 * Note that results may not be well-defined in the case of ties
 * between proximities.  If there are ties, there is no guarantee
 * as to how this implementation will break the tie.
 *
 * <p><i>Implementation Note:</i> This algorithm requires worst-case
 * O(n<sup>3</sup>) running time to cluster
 * n elements.  The initialization loop considers all pairs
 * of elements, thus requiring O(n<sup>2</sup>)
 * running time.  The main loop then walks over these pairs, requiring
 * up to n steps each to compute new distances.  If the
 * initial array of pairs is heavily pruned, the outer loop is smaller
 * and the updates are actually smaller, too.
 *
 * @author Bob Carpenter
 * @version 3.8.3
 * @since   LingPipe2.0
 * @param <E> the type of objects being clustered
 */
public class CompleteLinkClusterer
    extends AbstractHierarchicalClusterer {

    /**
     * Construct a complete link clusterer with the specified distance
     * bound.
     *
     * @param maxDistance Maximum distance between pairs of clusters
     * to allow clustering.
     * @param distance Distance measure to use among instances.
     */
    public CompleteLinkClusterer(double maxDistance,
                                 Distance distance) {
        super(maxDistance,distance);
    }

    /**
     * Construct a complete link clusterer with no distance bound.
     * This constructor uses {@link Double#POSITIVE_INFINITY} as the
     * bound value, which effectively makes clustering unbounded.
     */
    public CompleteLinkClusterer(Distance distance) {
        this(Double.POSITIVE_INFINITY,distance);
    }


    @Override
    public Dendrogram hierarchicalCluster(Set elementSet) {
        if (elementSet.size() == 0) {
            String msg = "Require non-empty set to form dendrogram."
                + " Found elementSet.size()=" + elementSet.size();
            throw new IllegalArgumentException(msg);
        }
        if (elementSet.size() == 1)
            return new LeafDendrogram(elementSet.iterator().next());

        // create queue (reverse because lower is better for distances)
        BoundedPriorityQueue> queue
            = new BoundedPriorityQueue>(ScoredObject.reverseComparator(),
                                                     Integer.MAX_VALUE);
        ObjectToSet,PairScore> index
            = new ObjectToSet,PairScore>();
        E[] elements = toElements(elementSet);

        // required for array
        @SuppressWarnings({"unchecked","rawtypes"})
        LeafDendrogram[] leafs
            = (LeafDendrogram[]) new LeafDendrogram[elements.length];
        for (int i = 0; i < leafs.length; ++i)
            leafs[i] = new LeafDendrogram(elements[i]);
        double maxDistance = getMaxDistance();
        for (int i = 0; i < elements.length; ++i) {
            E eI = elements[i];
            LeafDendrogram dI = leafs[i];
            for (int j = i + 1; j < elements.length; ++j) {
                E eJ = elements[j];
                double score = distance().distance(eI,eJ);
                // used to continue here if too large; patched in 4.0.2, but slow
                LeafDendrogram dJ = leafs[j];
                PairScore psIJ = new PairScore(dI,dJ,score);
                queue.offer(psIJ);
                index.addMember(dI,psIJ);
                index.addMember(dJ,psIJ);
            }
        }

        while (queue.size() > 0) {
            PairScore next = queue.poll();
            Dendrogram dendro1 = next.mDendrogram1.dereference();
            Dendrogram dendro2 = next.mDendrogram2.dereference();
            double dist12 = next.score();
            LinkDendrogram dendro12
                = new LinkDendrogram(dendro1,dendro2,dist12);

            // remove & store distances to dendro1
            Map,Double> distanceBuf
                = new HashMap,Double>();
            Set> ps3Set = index.remove(dendro1);
            queue.removeAll(ps3Set);
            for (PairScore ps3 : ps3Set) {
                Dendrogram dendro3
                    = ps3.mDendrogram1 == dendro1
                    ? ps3.mDendrogram2
                    : ps3.mDendrogram1;
                index.get(dendro3).remove(ps3);
                double dist1_3 = ps3.score();
                distanceBuf.put(dendro3,Double.valueOf(dist1_3));
            }

            // remove & iterate over distances to dendro2
            ps3Set = index.remove(dendro2);
            queue.removeAll(ps3Set);
            for (PairScore ps3 : ps3Set) {
                Dendrogram dendro3
                    = ps3.mDendrogram1 == dendro2
                    ? ps3.mDendrogram2
                    : ps3.mDendrogram1;
                index.get(dendro3).remove(ps3);
                Double dist1_3D = distanceBuf.get(dendro3);
                if (dist1_3D == null) continue; // dist(dendro2,dendro3) too large
                double dist1_3 = dist1_3D.doubleValue();
                double dist2_3 = ps3.score();
                double dist12_3 = Math.max(dist1_3,dist2_3);
                PairScore ps = new PairScore(dendro12,dendro3,dist12_3);
                queue.offer(ps);
                index.addMember(dendro12,ps);
                index.addMember(dendro3,ps);
            }
            // dendro3 must be linked above threshold
            // by both dendro1 and dendro2
            if (queue.isEmpty()) return dendro12;
        }

        // share following code with Single Link
        Iterator> it = index.keySet().iterator();
        Dendrogram dendro = it.next(); // skip first element -- self
        while (it.hasNext())
            dendro = new LinkDendrogram(dendro,it.next(),
                                           Double.POSITIVE_INFINITY);
        return dendro;
    }

}
Threshold Range	Clusters
[Double.NEGATIVE_INFINITY,1)	{A}, {B}, {C}, {D}, {E}
[1,3)	{A,B}, {C}, {D}, {E}
[3,4)	{A,B,C}, {D}, {E}
[4,9)	{A,B,C}, {D,E}
[9,Double.POSITIVE_INFINITY]	{A,B,C,D,E}