edu.ucla.sspace.clustering.LinkClustering Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace Show documentation
The S-Space Package is a Natural Language Processing library for distributional semantics representations. Distributional semantics representations model the meaning of words, phrases, and sentences as high dimensional vectors or probability distributions. The library includes common algorithms such as Latent Semantic Analysis, Random Indexing, and Latent Dirichlet Allocation. The S-Space package also includes software libraries for matrices, vectors, graphs, and numerous clustering algorithms.
The newest version!
/*
 * Copyright 2011 David Jurgens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.common.Similarity;

import edu.ucla.sspace.clustering.HierarchicalAgglomerativeClustering.ClusterLinkage;

import edu.ucla.sspace.matrix.AbstractMatrix;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseHashMatrix;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.SparseSymmetricMatrix;
import edu.ucla.sspace.matrix.YaleSparseMatrix;

import edu.ucla.sspace.util.HashMultiMap;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.WorkQueue;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;

import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An implementation of the link clustering described in Ahn, Bagrow, and
 * Lehmann (2010).  This algorithm is a multi-class clustering algorithm that,
 * instead of clustering the nodes in a graph according to their similarity
 * with each other, clusters the links connecting the nodes to reveal
 * communities that connect the nodes.  For full information on the algorithm
 * see,
 *
 *    Yong-Yeol Ahn, James P. Bagrow and Sune Lehmann.  Link communities
 *   reveal multiscale complexity in networks.  Nature 466, 761–764 (05 August
 *   2010).  Available online here.
 * 
 * 
 *
 * This algorithm automatically determines the number of clusters based on a
 * partition density function.  Accordingly, the clustering methods take no
 * parameters.  Calling the {@code cluster} method with a fixed number of
 * elements will still cluster the rows, but will ignore the requested number of
 * clusters.
 *
 *  Note that this class is not thread-safe.  Each call to clustering
 * will cache local information about the clustering result to facilitate the
 * {@link #getSolution(int)} and {@link #getSolutionDensity(int)} functions.
 *
 * This class provides one configurable property:
 *
 * <dl>
 *
 * <dt><i>Property:</i> {@value #KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY}<br>
 *     <i>Default:</i> {@code true}</dt>
 *
 * <dd>If {@code true}, this property specifies the
 *      edge similarity matrix used by {@link
 *      HierarchicalAgglomerativeClustering} should be computed once and then
 *      kept in memory, which is the default behavior.  If {@code false}, this
 *      causes the similarity of two edges to be recomputed on-the-fly whenever
 *      it is requested.  By computing these values on-the-fly, the performance
 *      will be slowed down, depending on the complexity of the edge similarity
 *      function.  However, this on-the-fly setting allows for clustering large
 *      graphs whose edge similarity matrix would not regularly fit into memory.
 *      It is advised that users not tune this parameter unless it is known that
 *      the similarity matrix will not fit in memory.</dd>
 *
 * </dl>
 *
 *
 * @author David Jurgens 
 */
public class LinkClustering implements Clustering, java.io.Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * A prefix for specifying properties.
     */
    public static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.clustering.LinkClustering";

    /**
     * The property to specify if the edge similarity matrix should be kept in
     * memory during clustering, or if its values should be computed on the fly.
     */
    public static final String KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY =
        PROPERTY_PREFIX + ".keepSimilarityMatrixInMemory";
    
    /**
     * The logger to which clustering status updates will be written.
     */
    private static final Logger LOGGER =
        Logger.getLogger(LinkClustering.class.getName());

    /**
     * The work queue on which this instance schedules its multi-threaded
     * operations.
     */
    private final WorkQueue workQueue;
    
    /**
     * The merges for the prior run of this clustering algorithm, or {@code
     * null} if no clustering has been performed yet.
     */
    private List<Merge> mergeOrder;

    /**
     * The list of edges that were last clustered, or {@code null} if no
     * clustering has been performed yet.  This list is maintained in the same
     * order as the initial cluster ordering, i.e., the edge at index {@code i}
     * is the sole member of initial cluster {@code i}.
     */
    private List<Edge> edgeList;

    /**
     * The number of rows in the input matrix that was last clustered.
     */
    private int numRows;

    /**
     * Creates a {@code LinkClustering} instance that has no cached clustering
     * state and that schedules its work on the queue returned by {@code
     * WorkQueue.getWorkQueue()}.
     */
    public LinkClustering() {
        this.workQueue = WorkQueue.getWorkQueue();
        // No clustering has been run yet, so there is nothing cached
        this.numRows = 0;
        this.edgeList = null;
        this.mergeOrder = null;
    }

    /**
     * Clusters the matrix according to the partition density, disregarding the
     * requested number of clusters.  A warning is logged to note that the
     * requested count is not used.
     *
     * @param matrix the square, sparse matrix whose non-zero values denote
     *        edges between rows
     * @param numClusters this parameter is ignored.
     * @param props the properties passed through to {@link
     *        #cluster(Matrix,Properties)}
     *
     * @return the result of {@link #cluster(Matrix,Properties)}
     *
     * @throws IllegalArgumentException if {@code matrix} is not square, or is
     *         not an instance of {@link SparseMatrix}
     */
    public Assignments cluster(Matrix matrix, 
                               int numClusters,
                               Properties props) {
        final String notice =
            "Link clustering does not take a specified number of " +
            "clusters.  Clustering the matrix anyway.";
        LOGGER.warning(notice);

        // Delegate to the variant that picks the number of clusters itself
        Assignments result = cluster(matrix, props);
        return result;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IllegalArgumentException if {@code matrix} is not square, or is
     *         not an instance of {@link SparseMatrix}
     */
    public Assignments cluster(Matrix matrix, Properties props) { 
        if (matrix.rows() != matrix.columns()) 
            throw new IllegalArgumentException("Input matrix is not square. " +
                "Matrix is expected to be a square matrix whose values (i,j) " +
                "denote an edge from row i to row j");
        if (!(matrix instanceof SparseMatrix)) {
            throw new IllegalArgumentException("Input matrix must be a " +
                "sparse matrix.");
        }
        SparseMatrix sm = (SparseMatrix)matrix;

        String inMemProp =
            props.getProperty(KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY);
        boolean keepSimMatrixInMem = (inMemProp != null) 
            ? Boolean.parseBoolean(inMemProp) : true;

        // IMPLEMENTATION NOTE: Ahn et al. used single-linkage HAC, which can be
        // efficiently implemented in O(n^2) time as a special case of HAC.
        // However, we currently don't optimize for this special case and
        // instead use our HAC class.  Because of the complexity of the edge
        // similarity function, we build our own similarity matrix and then pass
        // it in, rather than passing in the edge matrix directly.

        final int rows = sm.rows();
        numRows = rows;
        LOGGER.fine("Generating link similarity matrix for " + rows + " nodes");

        //  Rather than create an O(row^3) matrix for representing the edges,
        // compress the edge matrix by getting a mapping for each edge to a row
        // in the new matrix.
        final List edgeList = new ArrayList();
        this.edgeList = edgeList;

        for (int r = 0; r < rows; ++r) {
            SparseDoubleVector row = sm.getRowVector(r);
            int[] edges = row.getNonZeroIndices();
            for (int col : edges) {
                // Always add edges from the upper triangular
                if (r > col)
                    edgeList.add(new Edge(r, col));
                // Otherwise, we only add the edge from the lower triangular if
                // it wasn't present in the upper.  This avoids counting
                // duplicate edges.
                else if (r < col && sm.get(col, r) == 0)
                    edgeList.add(new Edge(r, col));
            }
        }

        final int numEdges = edgeList.size();
        LOGGER.fine("Number of edges to cluster: " + numEdges);
        
        Matrix edgeSimMatrix = 
            getEdgeSimMatrix(edgeList, sm, keepSimMatrixInMem);
        
        LOGGER.fine("Computing single linkage link clustering");

        final List mergeOrder = 
            new HierarchicalAgglomerativeClustering().
                buildDendrogram(edgeSimMatrix, ClusterLinkage.SINGLE_LINKAGE);
        this.mergeOrder = mergeOrder;

        LOGGER.fine("Calculating partition densitities");

        // Set up a concurrent map that each thread will update once it has
        // calculated the densitites of each of its partitions.  This map is
        // only written to once per thread.
        final ConcurrentNavigableMap partitionDensities 
            = new ConcurrentSkipListMap();
        
        // Register a task group for calculating all of the partition
        // densitities
        Object key = workQueue.registerTaskGroup(mergeOrder.size());
        for (int p = 0; p < mergeOrder.size(); ++p) {
            final int part = p;
            workQueue.add(key, new Runnable() {
                    public void run() {
                        // Get the merges for this particular partitioning of
                        // the links
                        List mergeSteps = mergeOrder.subList(0, part);
                        
                        // Convert the merges to a specific cluster labeling
                        MultiMap clusterToElements = 
                            convertMergesToAssignments(mergeSteps, numEdges);

                        // Based on the link partitioning, calculate the
                        // partition density for each cluster
                        double partitionDensitySum = 0d;
                        for (Integer cluster : clusterToElements.keySet()) {
                            Set linkPartition = 
                                clusterToElements.get(cluster);
                            int numLinks = linkPartition.size();
                            BitSet nodesInPartition = new BitSet(rows);
                            for (Integer linkIndex : linkPartition) {
                                Edge link = edgeList.get(linkIndex);
                                nodesInPartition.set(link.from);
                                nodesInPartition.set(link.to);
                            }
                            int numNodes = nodesInPartition.cardinality();
                            // This reflects the density of this particular
                            // cluster
                            double partitionDensity =
                                (numLinks - (numNodes - 1d))
                                / (((numNodes * (numNodes - 1d)) / 2d)
                                   - (numLinks - 1));                
                            partitionDensitySum += partitionDensity;
                        }
                        // Compute the density for the total partitioning
                        // solution
                        double partitionDensity = 
                            (2d / numEdges) * partitionDensitySum;
                        LOGGER.log(Level.FINER, "Partition solution {0} had "
                                   + "density {1}",
                                   new Object[] { part, partitionDensity });
                        
                        // Update the thread-shared partition density map with
                        // this task's calculation
                        partitionDensities.put(partitionDensity, part);
                        
                    }
                });
        }

        // Wait for all the partition densities to be calculated
        workQueue.await(key);

        Map.Entry densest = partitionDensities.lastEntry();
        LOGGER.fine("Partition " + densest.getValue() + 
                    " had the highest density: " + densest.getKey());
        int partitionWithMaxDensity = densest.getValue();

        // Select the solution with the highest partition density and assign
        // nodes accordingly
        MultiMap bestEdgeAssignment =
            convertMergesToAssignments(
                mergeOrder.subList(0, partitionWithMaxDensity), numEdges);

        List> nodeClusters = new ArrayList>(rows);
        for (int i = 0; i < rows; ++i) 
            nodeClusters.add(new HashSet());
        
        // Ignore the original partition labeling, and use our own cluster
        // labeling to ensure that the IDs are contiguous.
        int clusterId = 0;

        // For each of the partitions, add the partion's cluster ID to all the
        // nodes that are connected by one of the partition's edges
        for (Integer cluster : bestEdgeAssignment.keySet()) {
            Set edgePartition = bestEdgeAssignment.get(cluster);
            for (Integer edgeId : edgePartition) {
                Edge e = edgeList.get(edgeId);
                nodeClusters.get(e.from).add(clusterId);
                nodeClusters.get(e.to).add(clusterId);
            }
            // Update the cluster id
            clusterId++;
        }

        int numClusters = 0;
        Assignment[] nodeAssignments = new Assignment[rows];
        for (int i = 0; i < nodeAssignments.length; ++i) {
            nodeAssignments[i] = 
                new SoftAssignment(nodeClusters.get(i));
        }
        return new Assignments(numClusters, nodeAssignments, matrix);
    }

    /**
     * Returns the edge similarity matrix for the edges in the provided sparse
     * matrix.
     *
     * @param edgeList the list of all edges to be compared
     * @param sm a square matrix whose values denote edges between the rows
     * @param keepSimilarityMatrixInMemory {@code true} if all similarities
     *        should be computed up front and stored, {@code false} if each
     *        similarity should be recomputed whenever it is read
     *
     * @return a matrix whose value (i,j) is the similarity of edges i and j
     */
    private Matrix getEdgeSimMatrix(List<Edge> edgeList, SparseMatrix sm,
                                    boolean keepSimilarityMatrixInMemory) {
        if (keepSimilarityMatrixInMemory)
            return calculateEdgeSimMatrix(edgeList, sm);
        return new LazySimilarityMatrix(edgeList, sm);
    }

    /**
     * Calculates the similarity matrix for the edges.  The similarity matrix is
     * symmetric.  One task per edge is submitted to the work queue, and this
     * method returns once all tasks have completed.
     *
     * @param edgeList the list of all edges known to the system
     * @param sm a square matrix whose values denote edges between the rows.
     *
     * @return the similarity matrix
     */
    private Matrix calculateEdgeSimMatrix(
            final List<Edge> edgeList, final SparseMatrix sm) {

        final int numEdges = edgeList.size();
        final Matrix edgeSimMatrix = 
            new SparseSymmetricMatrix(
                new SparseHashMatrix(numEdges, numEdges));

        Object key = workQueue.registerTaskGroup(numEdges);
        for (int i = 0; i < numEdges; ++i) {
            final int row = i;
            workQueue.add(key, new Runnable() {
                    public void run() {
                        // The row's edge is the same for every comparison made
                        // by this task, so look it up only once
                        final Edge e1 = edgeList.get(row);
                        // Only the cells at or above the diagonal are computed
                        for (int j = row; j < numEdges; ++j) {
                            Edge e2 = edgeList.get(j);
                            double sim = getEdgeSimilarity(sm, e1, e2);
                
                            // Zero similarities are left unset to keep the
                            // matrix sparse
                            if (sim > 0) {
                                // The symmetric matrix handles the (j,i) case
                                edgeSimMatrix.set(row, j, sim);
                            }
                        }
                    }
                });            
        }
        workQueue.await(key);
        return edgeSimMatrix;
    }

    /**
     * Converts a series of merges to cluster assignments.  Cluster assignments
     * are assumed to start at 0: element {@code i} initially forms cluster
     * {@code i} on its own, and each merge then moves all elements of the
     * merged cluster into the remaining cluster.
     *
     * @param merges the merge steps, in order
     * @param numOriginalClusters how many clusters are present prior to
     *        merging.  This is typically the number of rows in the matrix being
     *        clustered
     *
     * @return a mapping from a cluster to all the elements contained within it.
     */
    private static MultiMap<Integer,Integer> convertMergesToAssignments(
            List<Merge> merges, int numOriginalClusters) {

        MultiMap<Integer,Integer> clusterToElements = 
            new HashMultiMap<Integer,Integer>();
        for (int i = 0; i < numOriginalClusters; ++i)
            clusterToElements.put(i, i);

        for (Merge m : merges) {
            // The merged cluster disappears and its elements are added to the
            // cluster that remains
            clusterToElements.putMany(m.remainingCluster(), 
                clusterToElements.remove(m.mergedCluster()));
        }           

        return clusterToElements;
    }

    /**
     * Computes the similarity of the two edges as the Jaccard index of the
     * neighbors of two impost nodes.  The impost nodes are the two nodes the
     * edges do not have in common.  Edges that share no node have a similarity
     * of {@code 0}.  Subclasses may override this method to define a new
     * method for computing edge similarity.
     *
     * <p>Implementation Note: Subclasses that wish to override this
     * behavior should be aware that this method is likely to be called by
     * multiple threads and therefore should make provisions to be thread safe.
     * In addition, this method may be called more than once per edge pair if
     * the similarity matrix is being computed on-the-fly.
     *
     * @param sm a matrix containing the connections between edges.  A non-zero
     *        value in location (i,j) indicates a node <i>i</i> is connected to
     *        node <i>j</i> by an edge.
     * @param e1 an edge to be compared with {@code e2}
     * @param e2 an edge to be compared with {@code e1}
     *
     * @return the similarity of the edges.
     */
    protected double getEdgeSimilarity(SparseMatrix sm, Edge e1, Edge e2) {
        // Determine the two impost (unshared) nodes by finding which end of
        // each edge is the keystone (shared) node.  All four ways in which the
        // edges can share a node are checked.
        int impost1;
        int impost2;
        if (e1.from == e2.from) {
            impost1 = e1.to;
            impost2 = e2.to;
        }
        else if (e1.from == e2.to) {
            impost1 = e1.to;
            impost2 = e2.from;
        }
        else if (e1.to == e2.from) {
            impost1 = e1.from;
            impost2 = e2.to;
        }
        else if (e1.to == e2.to) {
            impost1 = e1.from;
            impost2 = e2.from;
        }
        else
            return 0d;

        // Determine the overlap between the neighbors of the impost nodes
        int[] impost1edges = getImpostNeighbors(sm, impost1);
        int[] impost2edges = getImpostNeighbors(sm, impost2);
        return Similarity.jaccardIndex(impost1edges, impost2edges);
    }

    /**
     * Returns the indices of the non-zero entries in the given row of {@code
     * sm}, followed by the row index itself as the final element.
     *
     * @param sm the matrix whose row is read
     * @param rowIndex the row of the impost node
     *
     * @return a new array one element longer than the row's non-zero indices
     */
    private static int[] getImpostNeighbors(SparseMatrix sm, int rowIndex) {
        int[] adjacent = sm.getRowVector(rowIndex).getNonZeroIndices();
        int[] withSelf = new int[adjacent.length + 1];
        System.arraycopy(adjacent, 0, withSelf, 0, adjacent.length);
        // The node itself occupies the extra, last slot
        withSelf[adjacent.length] = rowIndex;
        return withSelf;
    }

    /**
     * Returns the partition density of the clustering solution obtained after
     * the specified number of merge steps of the prior clustering run.
     *
     * @param solutionNum the number of merge steps to take prior to computing
     *        the density
     *
     * @return the partition density of that solution
     *
     * @throws IllegalArgumentException if {@code solutionNum} is less than 0 or
     *         is greater than or equal to {@link #numberOfSolutions()}.
     * @throws IllegalStateException if this instance has not yet finished a
     *         clustering solution.
     */
    public double getSolutionDensity(int solutionNum) {
        // The state check must come first, since validating the solution
        // number reads the cached merge list
        if (mergeOrder == null || edgeList == null) {
            throw new IllegalStateException(
                "initial clustering solution is not valid yet");
        }
        if (solutionNum < 0 || solutionNum >= mergeOrder.size()) {
            throw new IllegalArgumentException(
                "not a valid solution: " + solutionNum);
        }      

        // Typed views of the state cached by the prior clustering run
        final List<Edge> edges = edgeList;
        final List<Merge> merges = mergeOrder;
        
        int numEdges = edges.size();

        // Get the merges for this particular partitioning of the links
        List<Merge> mergeSteps = merges.subList(0, solutionNum);
        
        // Convert the merges to a specific cluster labeling
        MultiMap<Integer,Integer> clusterToElements = 
            convertMergesToAssignments(mergeSteps, numEdges);
        
        // Based on the link partitioning, calculate the node partition density
        double partitionDensitySum = 0d;
        for (Integer cluster : clusterToElements.keySet()) {
            Set<Integer> linkPartition = clusterToElements.get(cluster);
            int numLinks = linkPartition.size();
            // Collect the distinct nodes touched by this cluster's links
            BitSet nodesInPartition = new BitSet(numRows);
            for (Integer linkIndex : linkPartition) {
                Edge link = edges.get(linkIndex);
                nodesInPartition.set(link.from);
                nodesInPartition.set(link.to);
            }
            int numNodes = nodesInPartition.cardinality();
            // This reflects the density of this particular cluster within the
            // total partitioning
            double partitionDensity = (numLinks - (numNodes - 1d))
                / (((numNodes * (numNodes - 1d)) / 2d) - (numLinks - 1));                
            partitionDensitySum += partitionDensity;
        }
        // Compute the density for the total partitioning solution
        return (2d / numEdges) * partitionDensitySum;
    }

    /**
     * Returns the clustering solution after the specified number of merge
     * steps.
     *
     * @param solutionNum the number of merge steps to take prior to returning
     *        the clustering solution.
     *
     * @throws IllegalArgumentException if {@code solutionNum} is less than 0 or
     *         is greater than or equal to {@link #numberOfSolutions()}.
     * @throws IllegalStateException if this instance has not yet finished a
     *         clustering solution.
     */
    public Assignments getSolution(int solutionNum) {
        if (solutionNum < 0 || solutionNum >= mergeOrder.size()) {
            throw new IllegalArgumentException(
                "not a valid solution: " + solutionNum);
        }      
        if (mergeOrder == null || edgeList == null) {
            throw new IllegalStateException(
                "initial clustering solution is not valid yet");
        }

        int numEdges = edgeList.size();

        // Select the solution and all merges necessary to solve it
        MultiMap bestEdgeAssignment =
            convertMergesToAssignments(
                mergeOrder.subList(0, solutionNum), numEdges);

        List> nodeClusters = new ArrayList>(numRows);
        for (int i = 0; i < numRows; ++i) 
            nodeClusters.add(new HashSet());
        
        // Ignore the original partition labeling, and use our own cluster
        // labeling to ensure that the IDs are contiguous.
        int clusterId = 0;

        // For each of the partitions, add the partion's cluster ID to all the
        // nodes that are connected by one of the partition's edges
        for (Integer cluster : bestEdgeAssignment.keySet()) {
            Set edgePartition = bestEdgeAssignment.get(cluster);
            for (Integer edgeId : edgePartition) {
                Edge e = edgeList.get(edgeId);
                nodeClusters.get(e.from).add(clusterId);
                nodeClusters.get(e.to).add(clusterId);
            }
            // Update the cluster id
            clusterId++;
        }

        Assignment[] nodeAssignments = new Assignment[numRows];
        for (int i = 0; i < nodeAssignments.length; ++i)
            nodeAssignments[i] = new SoftAssignment(nodeClusters.get(i));
        return new Assignments(clusterId, nodeAssignments);
    }

    /**
     * Reports how many clustering solutions the prior clustering run of this
     * instance produced, which is the number of recorded merges.
     *
     * @return the number of solutions, or {@code 0} if no solutions are
     *         available.
     */
    public int numberOfSolutions() {
        // No merge list means no clustering has been run
        if (mergeOrder == null)
            return 0;
        return mergeOrder.size();
    }
    
    /**
     * An immutable pair of node indices describing a directed edge from one
     * ordinally labeled node to another.  Two edges are equal only when both
     * their {@code from} and {@code to} nodes match.
     */
    protected static class Edge {

        /**
         * The index of the node at which this edge starts.
         */
        public final int from;

        /**
         * The index of the node at which this edge ends.
         */
        public final int to;

        /**
         * Creates an edge between the two nodes.
         *
         * @param from the index of the starting node
         * @param to the index of the ending node
         */
        public Edge(int from, int to) {
            this.from = from;
            this.to = to;
        }

        /**
         * Returns {@code true} if {@code o} is an {@code Edge} with the same
         * {@code from} and {@code to} nodes.
         */
        @Override public boolean equals(Object o) {
            if (!(o instanceof Edge))
                return false;
            Edge other = (Edge)o;
            return from == other.from && to == other.to;
        }

        /**
         * Returns the bitwise exclusive-or of the two node indices.
         */
        @Override public int hashCode() {
            return from ^ to;
        }

        /**
         * Returns this edge in the form {@code (from->to)}.
         */
        @Override public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("(").append(from).append("->").append(to).append(")");
            return sb.toString();
        }
    }

    /**
     * A utility class that represents the edge similarity matrix, where the
     * similarity values are lazily computed on demand, rather than stored
     * internally.  While computationally more expensive, this class provides an
     * enormous benefit for clustering a graph where the similarity matrix
     * cannot fit into memory.
     *
     * <p>Every read calls the enclosing instance's {@link
     * #getEdgeSimilarity(SparseMatrix,Edge,Edge)}, so the same cell is
     * recomputed each time it is requested.  The matrix is read-only.
     */
    private class LazySimilarityMatrix extends AbstractMatrix {

        /**
         * The edges being compared; edge {@code i} corresponds to both row
         * {@code i} and column {@code i} of this matrix.
         */
        private final List<Edge> edgeList;

        /**
         * The matrix whose non-zero values denote the edges between nodes.
         */
        private final SparseMatrix sm;

        /**
         * Creates a lazily computed similarity matrix over the edges.
         *
         * @param edgeList the edges to be compared
         * @param sm the matrix passed to the edge similarity function
         */
        public LazySimilarityMatrix(List<Edge> edgeList, SparseMatrix sm) {
            this.edgeList = edgeList;
            this.sm = sm;
        }
        
        /**
         * Returns the number of edges.
         */
        public int columns() {
            return edgeList.size();
        }

        /**
         * Computes and returns the similarity of the edges at the two indices.
         */
        public double get(int row, int column) {
            Edge e1 = edgeList.get(row);
            Edge e2 = edgeList.get(column);
            return getEdgeSimilarity(sm, e1, e2);
        }
        
        /**
         * Returns a new dense vector holding the similarity of the edge at
         * {@code row} to every edge.
         */
        public DoubleVector getRowVector(int row) {
            int cols = columns();
            DoubleVector vec = new DenseVector(cols);
            for (int c = 0; c < cols; ++c) {
                vec.set(c, get(row, c));
            }
            return vec;
        }

        /**
         * Returns the number of edges.
         */
        public int rows() {
            return edgeList.size();
        }

        /**
         * Always throws, since the values of this matrix are computed rather
         * than stored.
         *
         * @throws UnsupportedOperationException always
         */
        public void set(int row, int columns, double val) {
            throw new UnsupportedOperationException();
        }
    }
}