com.aliasi.cluster.SingleLinkClusterer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.cluster;

import com.aliasi.util.Distance;
import com.aliasi.util.ScoredObject;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

/**
 * A SingleLinkClusterer implements standard single-link
 * agglomerative clustering.  Single link clustering is a greedy
 * algorithm in which the two closest clusters are always merged
 * up to a specified distance threshold.  Distance between clusters for
 * single link clustering is defined to be the minimum of the distances
 * between the members of the clusters.  See {@link CompleteLinkClusterer}
 * for a clusterer that takes the maximum rather than the minimum in
 * making clustering decisions.
 *
 * For example, consider the following proximity matrix
 * representing distances between pairs of elements (the same example
 * is used in {@link CompleteLinkClusterer}):
 *
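 * <blockquote>
 * <table border="1" cellpadding="5">
 * <tr><td>&nbsp;</td><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
 * <tr><th>A</th><td>0</td><td>1</td><td>2</td><td>7</td><td>5</td></tr>
 * <tr><th>B</th><td>&nbsp;</td><td>0</td><td>3</td><td>8</td><td>6</td></tr>
 * <tr><th>C</th><td>&nbsp;</td><td>&nbsp;</td><td>0</td><td>5</td><td>9</td></tr>
 * <tr><th>D</th><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>0</td><td>4</td></tr>
 * <tr><th>E</th><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>0</td></tr>
 * </table>
 * </blockquote>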
 *
 * The result of single-link clustering is the following dendrogram (pardon
 * the ASCII art):
 *
 * <pre>
 *         A   B   C   D   E
 *         |   |   |   |   |
 *      1  -----   |   |   |
 *      2    -------   |   |
 *      3       |      |   |
 *      4       |      -----
 *      5       ----------
 * </pre>
 *
 * First, the objects A and B are merged
 * at distance one.  Then object C is merged into the
 * cluster at distance 2, because that is its distance from
 * A.  Objects D and E are
 * merged next at distance 4, and finally, the two big clusters are
 * merged at distance 5, the distance between A and
 * E.
 *
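 * <p>The various clusters at each proximity bound threshold are:
 *
 * <blockquote>
 * <table border="1" cellpadding="5">
 * <tr><th>Threshold Range</th><th>Clusters</th></tr>
 * <tr><td>[Double.NEGATIVE_INFINITY,1)</td><td>{A}, {B}, {C}, {D}, {E}</td></tr>
 * <tr><td>[1,2)</td><td>{A,B}, {C}, {D}, {E}</td></tr>
 * <tr><td>[2,4)</td><td>{A,B,C}, {D}, {E}</td></tr>
 * <tr><td>[4,5)</td><td>{A,B,C}, {D,E}</td></tr>
 * <tr><td>[5,Double.POSITIVE_INFINITY]</td><td>{A,B,C,D,E}</td></tr>
 * </table>
 * </blockquote>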
 *
 * The intervals show the clusters returned for thresholds within
 * the specified interval.  As usual, square brackets denote inclusive
 * range bounds and round brackets exclusive bounds.  For instance,
 * if the distance threshold is 1.5, four clusters are returned,
 * whereas if the threshold is 4.0, two clusters are returned.
 *
 * Note that this example has the same clusters as that in {@link
 * CompleteLinkClusterer}.  A simple example where the results are
 * different is the result of clustering four points on the x-axis,
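 * <code>x<sub>1</sub>=1</code>, <code>x<sub>2</sub>=3</code>,
 * <code>x<sub>3</sub>=6</code>, <code>x<sub>4</sub>=10</code>.
 * The distance is computed by subtraction, e.g. <code>d(x<sub>2</sub>,
 * x<sub>4</sub>) = x<sub>4</sub> - x<sub>2</sub></code>.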
 * This leads to the following distance matrix:
 *
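 * <blockquote>
 * <table border="1" cellpadding="5">
 * <tr><td>&nbsp;</td><th>x<sub>1</sub></th><th>x<sub>2</sub></th><th>x<sub>3</sub></th><th>x<sub>4</sub></th></tr>
 * <tr><th>x<sub>1</sub></th><td>0</td><td>2</td><td>5</td><td>9</td></tr>
 * <tr><th>x<sub>2</sub></th><td>&nbsp;</td><td>0</td><td>3</td><td>7</td></tr>
 * <tr><th>x<sub>3</sub></th><td>&nbsp;</td><td>&nbsp;</td><td>0</td><td>4</td></tr>
 * <tr><th>x<sub>4</sub></th><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>0</td></tr>
 * </table>
 * </blockquote>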
 *
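 * For this example, the complete link clustering is
 *
 * <blockquote><code>{{x<sub>1</sub>,x<sub>2</sub>},{x<sub>3</sub>,x<sub>4</sub>}}</code></blockquote>
 *
 * whereas the single-link clustering is:
 *
 * <blockquote><code>{{{x<sub>1</sub>,x<sub>2</sub>},x<sub>3</sub>},x<sub>4</sub>}</code></blockquote>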
 *
 * Implementation Note: This clusterer is implemented using
 * the minimum-spanning-tree-based algorithm described in:
 *
 * 

 * Jain, Anil K. and Richard C. Dubes. 1988.  Algorithms for Clustering
 * Data.  Prentice-Hall.
 * 
 *
 * The minimum spanning tree is constructed using Kruskal's algorithm,
 * which is described in:
 *
 * 
 * Cormen, Thomas H., Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.
 * 2001.
 * Introduction to Algorithms. MIT Press.
 * 
 *
 * This algorithm requires an efficient lookup of whether two
 * nodes are already in the same connected component, which
 * is implemented by the standard disjoint set algorithm with
 * path compression (also described in Cormen et al.).
 *
 * In brief, the algorithm sorts all pairs of objects in order of
 * increasing distance.  With <code>n</code> objects that involves
 * sorting <code>O(n<sup>2</sup>)</code> pairs of objects,
 * leading to an initial step with worst case time complexity bound
 * <code>O(n<sup>2</sup> log n)</code>.  The pairs are then
 * merged in order of increasing distance.  The sets involved in
 * Kruskal's algorithm translate directly into dendrograms with their
 * dereferencing.
 *
 * @author  Bob Carpenter
 * @version 4.1.0
 * @since   LingPipe2.0
 * @param <E> the type of object being clustered
 */
public class SingleLinkClusterer
    extends AbstractHierarchicalClusterer {

    /**
     * Construct a single-link clusterer with the specified distance
     * bound.
     *
     * @param maxDistance Maximum distance for clusters.
     * @param distance Distance measure between objects to cluster.
     * @throws IllegalArgumentException If the specified bound is not
     * a non-negative number.
     */
    public SingleLinkClusterer(double maxDistance, Distance distance) {
        super(maxDistance,distance);
    }

    /**
     * Construct a single-link clusterer with no distance bound.
     * The distance bound is set to {@link Double#POSITIVE_INFINITY},
     * which effectively removes the distance bound.
     *
     * @param distance Distance measure between objects to cluster.
     */
    public SingleLinkClusterer(Distance distance) {
        this(Double.POSITIVE_INFINITY,distance);
    }

    /**
     * Return the array of clusters derived from the specified
     * distance matrix by performing single-link clustering up to the
     * specified maximum distance bound.  Every pair of elements in
     * the matrix that are within the distance bound will fall in the
     * same dendrogram in the result.  Setting the maximum distance
     * to {@link Double#POSITIVE_INFINITY} results in a complete
     * clustering.
     *
     * @param elementSet Set of elements to cluster.
     * @return Clustering in the form of a set of sets of elements.
     * @throws IllegalArgumentException If the set of elements is empty.
     */
    @Override
    public Dendrogram hierarchicalCluster(Set elementSet) {
        if (elementSet.size() == 0) {
            String msg = "Require non-empty set to form dendrogram."
                + " Found elementSet.size()=" + elementSet.size();
            throw new IllegalArgumentException(msg);
        }
        if (elementSet.size() == 1)
            return new LeafDendrogram(elementSet.iterator().next());
        Object[] elements = toElements(elementSet);
        // need array so identity is shared among leaf dendros
        // require supression for array
        @SuppressWarnings({"unchecked","rawtypes"})
        LeafDendrogram[] leafs
            = (LeafDendrogram[]) new LeafDendrogram[elements.length];
        for (int i = 0; i < leafs.length; ++i) {
            // required for obj array
            @SuppressWarnings("unchecked")
            E elt = (E) elements[i];
            leafs[i] = new LeafDendrogram(elt);
        }
        Set> clusters
            = new HashSet>(elements.length);
        for (Dendrogram dendrogram : leafs)
            clusters.add(dendrogram);

        // compute ordered list of pairs of elements to merge
        ArrayList> pairScoreList = new ArrayList>();
        int len = elements.length;
        double maxDistance = getMaxDistance();
        for (int i = 0; i < len; ++i) {
            // required for obj array
            @SuppressWarnings("unchecked")
            E eI = (E) elements[i];
            Dendrogram dendroI = leafs[i];
            for (int j = i + 1; j < len; ++j) {
                // required for obj array
                @SuppressWarnings("unchecked")
                E eJ = (E) elements[j];
                double distanceIJ = distance().distance(eI,eJ);
                // if (distanceIJ > maxDistance) continue;  // speeds up, but strands elements
                Dendrogram dendroJ = leafs[j];
                pairScoreList.add(new PairScore(dendroI,dendroJ,distanceIJ));
            }
        }
        // required for array
        @SuppressWarnings({"unchecked","rawtypes"})
        PairScore[] pairScores
            = (PairScore[]) new PairScore[pairScoreList.size()];
        pairScoreList.toArray(pairScores);
        Arrays.sort(pairScores,ScoredObject.comparator());  // increasing order of distance

        for (int i = 0;  i < pairScores.length && clusters.size() > 1;  ++i) {
            PairScore ps = pairScores[i];
            if (ps.score() > getMaxDistance()) break;
            Dendrogram d1 = ps.mDendrogram1.dereference();
            Dendrogram d2 = ps.mDendrogram2.dereference();
            if (d1.equals(d2)) {
                continue; // already linked
            }
            clusters.remove(d1);
            clusters.remove(d2);
            LinkDendrogram dLink
                = new LinkDendrogram(d1,d2,pairScores[i].mScore);
            clusters.add(dLink);
        }
        // link up remaining unlinked dendros at +infinity distance
        Iterator> it = clusters.iterator();
        Dendrogram dendro = it.next(); // skip first - self
        while (it.hasNext())
            dendro = new LinkDendrogram(dendro,it.next(),
                                           Double.POSITIVE_INFINITY);
        return dendro;
    }


}
Threshold Range	Clusters
[Double.NEGATIVE_INFINITY,1)	{A}, {B}, {C}, {D}, {E} *
[1,2)	{A,B}, {C}, {D}, {E}
[2,4)	{A,B,C}, {D}, {E}
[4,5)	{A,B,C}, {D,E}
[5,Double.POSITIVE_INFINITY]	{A,B,C,D,E}