All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.clustering.ClusteringByCommittee Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.common.Similarity.SimType;

import edu.ucla.sspace.clustering.HierarchicalAgglomerativeClustering.ClusterLinkage;

import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;

import edu.ucla.sspace.util.BoundedSortedMultiMap;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.Duple;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * An implementation of the Clustering by Committee (CBC) algorithm.  This
 * implementation is based on the Pantel's thesis:
 * 
    * *
  • Patrick * Pantel. 2003. Clustering by Committee. Ph.D. Dissertation. Department of * Computing Science, University of Alberta, Canada. available here. *
* *

This class offers five parameters for configuring how the clustering * occurs * *

* *
Property: {@value #AVERGAGE_LINK_MERGE_THRESHOLD_PROPERTY} *
* Default: {@value #DEFAULT_AVERGAGE_LINK_MERGE_THRESHOLD} * *
The property to specify during the Phase II.1 * when to stop the agglomerative clustering of the nearest neighbors. * This property specifies a {@code double} threshold where clusters whose * the average-link similarity falls below the value will not be merged * (i.e. stay two clusters).

* *

Property: {@value #COMMITTEE_SIMILARITY_THRESHOLD_PROPERTY} *
* Default: {@value #DEFAULT_COMMITTEE_SIMILARITY_THRESHOLD} * *
The property to specify during Phase II.3 what * is the maximum similarity between two committees above which a new * committee will not be included. This property corresponds to θ-1 in the * CBC papers. This property specifies a {@code double}.

* *

Property: {@value #RESIDUE_SIMILARITY_THRESHOLD_PROPERTY} *
* Default: {@value #DEFAULT_RESIDUE_SIMILARITY_THRESHOLD} * *
The property for specifying the similarity * threshold in Phase II.5 where if an element has a similarity less than * this threshold to all existing committees, the element is marked as * "residue" and recursively clustered. This property corresponds to the * θ-2 parameter in the original papers. The property is specified as a * {@code double}.

* *

Property: {@value #HARD_CLUSTERING_PROPERTY} *
* Default: {@code true} * *
Specifies whether CBC should use a hard * (single class) or soft (multi-class) cluster labeling. The default * behavior is to use hard clustering.

* *

Property: {@value #SOFT_CLUSTERING_SIMILARITY_THRESHOLD_PROPERTY} *
* Default: {@value #DEFAULT_SOFT_CLUSTERING_SIMILARITY_THRESHOLD} * *
If soft clustering is enabled, specifies a * {@code double} the threshold used during soft clustering where a point * will not be labeled with the committees who are more similar than this * value. If hard clustering is enabled the value of this property has no * effect. See Phrase III of the CBC algorithm for more details.

* *

* * This class is thread-safe. * * @author David Jurgens */ public class ClusteringByCommittee implements Clustering { private static final String PROPERTY_PREFIX = "edu.ucla.sspace.clustering.ClusteringByCommittee"; /** * The property to specify during the Phase II.1 when to stop the * agglomerative clustering of the nearest neighbors. This property * specifies a {@code double} threshold where clusters whose the * average-link similarity falls below the value will not be merged * (i.e. stay two clusters). */ public static final String AVERGAGE_LINK_MERGE_THRESHOLD_PROPERTY = PROPERTY_PREFIX + ".averageLinkMergeThreshold"; /** * The default value of the {@value #AVERGAGE_LINK_MERGE_THRESHOLD_PROPERTY} * property. */ public static final String DEFAULT_AVERGAGE_LINK_MERGE_THRESHOLD = ".25"; /** * The property to specify during Phase II.3 what is the maximum similarity * between two committees above which a new committee will not be included. * This property corresponds to θ-1 in the CBC papers. This property * specifies a {@code double}. */ public static final String COMMITTEE_SIMILARITY_THRESHOLD_PROPERTY = PROPERTY_PREFIX + ".maxCommitteeSimilarity"; /** * The default value of the {@value * #COMMITTEE_SIMILARITY_THRESHOLD_PROPERTY} property. */ public static final String DEFAULT_COMMITTEE_SIMILARITY_THRESHOLD = ".35"; /** * The property for specifying the similarity threshold in Phase II.5 where * if an element has a similarity less than this threshold to all existing * committees, the element is marked as "residue" and recursively clustered. * This property corresponds to the θ-2 parameter in the original papers. * The property is specified as a {@code double}. */ public static final String RESIDUE_SIMILARITY_THRESHOLD_PROPERTY = PROPERTY_PREFIX + ".residueSimilarityThreshold"; /** * The default value of the {@value #RESIDUE_SIMILARITY_THRESHOLD_PROPERTY} * property. 
*/ public static final String DEFAULT_RESIDUE_SIMILARITY_THRESHOLD = ".25"; /** * The property for specifying a {@code double} the threshold used during * soft clustering where a point will not be labeled with the committees who * are more similar than this value. See Phrase III of the CBC algorithm * for more details. */ public static final String SOFT_CLUSTERING_SIMILARITY_THRESHOLD_PROPERTY = PROPERTY_PREFIX + ".softClusteringThreshold"; /** * The default value of the {@value * #SOFT_CLUSTERING_SIMILARITY_THRESHOLD_PROPERTY} property. */ public static final String DEFAULT_SOFT_CLUSTERING_SIMILARITY_THRESHOLD = ".25"; /** * Specifies whether CBC should use a hard (single class) or soft * (multi-class) cluster labeling. The default is to use hard clustering. */ public static final String HARD_CLUSTERING_PROPERTY = PROPERTY_PREFIX + ".useHardClustering"; /** * The logger used by this class. */ private static final Logger LOGGER = Logger.getLogger(ClusteringByCommittee.class.getName()); /** * During Phase II.1, the k-nearest neighbors are used to create candidate * committees. This constant is what was used in the Pantel and Lin (2002) * paper. */ private static final int K_MOST_SIMILAR_NEIGHBORS = 10; /** * Creates a new {@code ClusteringByCommittee} instance */ public ClusteringByCommittee() { } /** * Ignores the provided number of clusters and clusters the rows of * the provided matrix using the CBC algorithm. This method is equivalent * to calling {@link #cluster(Matrix,Properties)} without specifying the * number of clusters. * * @throws IllegalArgumentException if {@code m} is not an instance of * {@link SparseMatrix}. */ public Assignments cluster(Matrix m, int numClusters, Properties props) { LOGGER.warning("CBC does not take in a specified number of clusters. 
" + "Ignoring specification and clustering anyway."); return cluster(m, props); } /** * Clusters the rows of {@code m} according to the CBC algorithm, using * {@code props} to specify the configurable parameters of the algorithm. * * @throws IllegalArgumentException if {@code m} is not an instance of * {@link SparseMatrix}. */ public Assignments cluster(Matrix m, Properties props) { // Set up the parameters for clustering double avgLinkMergeThresh = Double.parseDouble(props.getProperty( AVERGAGE_LINK_MERGE_THRESHOLD_PROPERTY, DEFAULT_AVERGAGE_LINK_MERGE_THRESHOLD)); double maxCommitteeSimThresh = Double.parseDouble(props.getProperty( COMMITTEE_SIMILARITY_THRESHOLD_PROPERTY, DEFAULT_COMMITTEE_SIMILARITY_THRESHOLD)); double residueSimThresh = Double.parseDouble(props.getProperty( RESIDUE_SIMILARITY_THRESHOLD_PROPERTY, DEFAULT_RESIDUE_SIMILARITY_THRESHOLD)); double softClusteringThresh = Double.parseDouble(props.getProperty( SOFT_CLUSTERING_SIMILARITY_THRESHOLD_PROPERTY, DEFAULT_SOFT_CLUSTERING_SIMILARITY_THRESHOLD)); boolean useHardClustering = Boolean.parseBoolean( props.getProperty(HARD_CLUSTERING_PROPERTY, "true")); LOGGER.info("Starting Clustering By Committee"); // Check that the input is a sparse matrix if (!(m instanceof SparseMatrix)) throw new IllegalArgumentException("CBC only accepts sparse matrices"); SparseMatrix sm = (SparseMatrix)m; // Create a bit set with the number of bits equal to the number of rows. // This serves as input to phase 2 where we indicate that all rows // should be considered for clustering at first. 
BitSet allRows = new BitSet(sm.rows()); allRows.set(0, sm.rows()); LOGGER.info("CBC begining Phase 2"); List committees = phase2( sm, allRows, avgLinkMergeThresh, maxCommitteeSimThresh, residueSimThresh); LOGGER.info("CBC begining Phase 3"); // PHASE 3: Assign elements to clusters Assignments assignments = new Assignments( committees.size(), m.rows(), m); for (int r = 0; r < m.rows(); ++r) { LOGGER.fine("Computing Phase 3 for row " + r); SparseDoubleVector row = sm.getRowVector(r); // Determine to which committees the row belongs List committeeIds = phase3( row, committees, useHardClustering, softClusteringThresh); int[] clusters = new int[committeeIds.size()]; for (int i = 0; i < committeeIds.size(); ++i) clusters[i] = committeeIds.get(i); assignments.setAll(r, clusters); } return assignments; } /** * Starts Phase II of the CBC algorithm, returning a list {@link Committee} * instances that may cover the elements (rows) of {@code sm}. * * @param sm the matrix whose rows are to be clustered * @param rowsToConsider a bit set where the {@code true} values indicate * which rows of {@code sm} should be evaluated. This parameter is * important for recursive calls where not all the neighbors will be * considered * @param avgLinkMergeThresh specifies when to stop the agglomerative * clustering of the nearest neighbors for clusters whose the * average-link similarity falls below the value will not be merged * (i.e. stay two clusters). * @param maxCommitteeSimThresh specifies during what is the maximum * similarity between two committees above which a new committee will * not be included. This parameter corresponds to θ-1 in the CBC * papers. * @param residueSimThresh specifies the similarity threshold in Phase II.5 * where if an element has a similarity less than this threshold to * all existing committees, the element is marked as "residue" and * recursively clustered. This property corresponds to the θ-2 * parameter in the original papers. 
* * @return a list of identified committies in arbitrary order. */ private static List phase2(SparseMatrix sm, BitSet rowsToConsider, double avgLinkMergeThresh, double maxCommitteeSimThresh, double residueSimThresh) { if (LOGGER.isLoggable(Level.FINE)) { LOGGER.fine("CBC computing Phase 2 for " + rowsToConsider.cardinality() + " rows"); } List candidateCommittees = new ArrayList(); // STEP 1 // For each element e in E (for each row in m) for (int r = rowsToConsider.nextSetBit(0); r >= 0; r = rowsToConsider.nextSetBit(r + 1)) { // 1.1) Cluster the top similar elements of e from S using // average-link clustering. MultiMap mostSimilarElements = new BoundedSortedMultiMap(K_MOST_SIMILAR_NEIGHBORS); for (int r2 = rowsToConsider.nextSetBit(0); r2 >= 0; r2 = rowsToConsider.nextSetBit(r2 + 1)) { if (r == r2) continue ; double sim = Similarity.cosineSimilarity(sm.getRowVector(r), sm.getRowVector(r2)); mostSimilarElements.put(sim, r2); } // If there were no similar elements to the current row, skip it. if (mostSimilarElements.size() == 0) continue; // 1.2) For each cluster discovered c compute the following score: // |c| × avgsim(c), where |c| is the number of elements in c // and avgsim(c) is the average pairwise similarity between // elements in c. List commsForRow = buildCommitteesForRow( mostSimilarElements.values(), sm, avgLinkMergeThresh); Collections.sort(commsForRow); // 1.3) Store the highest-scoring cluster in a list L. candidateCommittees.add(commsForRow.get(0)); } // STEP 2 // Sort the clusters in L in descending order of their scores. Collections.sort(candidateCommittees); // STEP 3 // Let C be a list of committees, initially empty. List committees = new ArrayList(); // For each cluster c in L in sorted order for (CandidateCommittee cc : candidateCommittees) { // 3.1) Compute the centroid of c by averaging the frequency vectors // of its elements and computing the mutual information vector // of the centroid in the same way as we did for individual // elements. 
// 3.2) If c's similarity to the centroid of each committee // previously added to C is below a threshold, add c to C. boolean isDissimilar = true; for (Committee c : committees) { if (Similarity.cosineSimilarity(cc.centroid(), c.centroid()) >= maxCommitteeSimThresh) { isDissimilar = false; } } if (isDissimilar) { committees.add(new Committee(cc)); } } LOGGER.log(Level.FINE, "Found {0} committees.", new Object[] {committees.size()}); // STEP 4 // If C is empty, we are done and return C. if (committees.isEmpty()) return committees; Set residues = new HashSet(); // STEP 5 // For each element e in E for (int r = rowsToConsider.nextSetBit(0); r >= 0; r = rowsToConsider.nextSetBit(r + 1)) { // 5.1) If e's similarity to every committee in C is below // threshold2, add e to a list of residues R. boolean isResidue = true; SparseDoubleVector row = sm.getRowVector(r); for (Committee c : committees) { if (Similarity.cosineSimilarity(c.centroid(), row) >= residueSimThresh) { isResidue = false; } } if (isResidue) residues.add(r); } if (LOGGER.isLoggable(Level.FINER) && !residues.isEmpty()) { LOGGER.finer("Found residual elements: " + residues); } // STEP 6 // 6.1) If R is empty, we are done and return C. if (residues.isEmpty()) { return committees; } // Edge case: if only a single row is passed in to evaluate, the return // the existing set of committees, since we can't form any new // committees from a word with zero neighbors. else if (residues.size() == 1) return committees; BitSet b = new BitSet(sm.rows()); for (Integer i : residues) b.set(i); // 6.2) Otherwise, return the union of C and the output of a recursive // call to Phase II using the same input except replacing E with R. committees.addAll(phase2(sm, b, avgLinkMergeThresh, maxCommitteeSimThresh, residueSimThresh)); // Return: a list of committees. 
return committees; } /** * Computes Phase 3 of the CBC algorithm for the provided row * * @param row the row for which the final committee (clustering) assignment * should be performed * @param committees the final list of committees to use in labeling the row * @param useHardClustering {@code true} if the row should be assigned one * committee, {@code false} if the row could be assigned more than * one * @param softClusteringThresh if soft-clustering is enabled, specifies the * threshold used during soft clustering where a point will not be * labeled with the committees who are more similar than this value. * * @return a list of committee assignments for the given row */ private static List phase3(SparseDoubleVector row, List committees, boolean useHardClustering, double softClusteringThresh) { if (useHardClustering) { int mostSimilarCommittee = -1; double highestSim = -1d; for (int i = 0; i < committees.size(); ++i) { Committee c = committees.get(i); double sim = Similarity.cosineSimilarity(row, c.centroid()); if (sim > highestSim) { highestSim = sim; mostSimilarCommittee = i; } } return Collections.singletonList(mostSimilarCommittee); } else { // Make a copy of the row because will be changing the vector as we // assign it to more committees SparseDoubleVector copy = new CompactSparseVector(row); // let C be a list of clusters initially empty List assignedClusters = new ArrayList(); // let S be the top-200 similar clusters to e MultiMap> mostSimilarCommittees = new BoundedSortedMultiMap>(200); // for (Committee c : committees) for (int i = 0; i < committees.size(); ++i) { Committee c = committees.get(i); mostSimilarCommittees.put( Similarity.cosineSimilarity(row, c.centroid()), new Duple(c, i)); } // System.out.println("Most similar committees: " + // mostSimilarCommittees); // while S is not empty { // let c be the most similar cluster to e for (Duple p : mostSimilarCommittees.values()) { Committee c = p.x; Integer comId = p.y; SparseDoubleVector centroid = 
c.centroid(); // if the similarity(e, c) < SIGMA, exit the loop if (Similarity.cosineSimilarity(copy, centroid) < 0) { // NOTE: we intentionally don't exit the loop continue; } // if c is not similar to any cluster in C { boolean isSimilar = false; for (Integer committeeId : assignedClusters) { Committee c2 = committees.get(committeeId); if (Similarity.cosineSimilarity(c2.centroid(), centroid) >= softClusteringThresh) { isSimilar = true; break; } } if (!isSimilar) { // assign e to c assignedClusters.add(comId); // remove from e its features that overlap with the features of // c; remove c from S for (int i : centroid.getNonZeroIndices()) { copy.set(i, 0); } } } return assignedClusters; } } /** * Builds a set of candidate committees from the clusters formed by the * average-link clustering of the provided rows. * * @param avgLinkMergThresh the parameter used by HAC to determine when to * stop merging clusters on the basis of their dissimilarity */ public static List buildCommitteesForRow( Collection rows, SparseMatrix sm, double avgLinkMergeThresh) { // If there are no candidate rows, just return early. if (rows.size() == 0) return new ArrayList(); double AVG_LINK_MERGE_THRESHOLD = .25; // ???? // Convert the nearest neighbors to a matrix and cluster them using HAC // wth the mean link critera. 
List v = new ArrayList(); for (Integer neighbor : rows) v.add(sm.getRowVector(neighbor)); int[] assignments = HierarchicalAgglomerativeClustering.clusterRows( Matrices.asSparseMatrix(v), AVG_LINK_MERGE_THRESHOLD, ClusterLinkage.MEAN_LINKAGE, SimType.COSINE); // Form clusters for all the rows Map> clusters = new HashMap>(); int i = 0; for (Integer row : rows) { int clusterId = assignments[i]; Set cluster = clusters.get(clusterId); if (cluster == null) { cluster = new HashSet(); clusters.put(clusterId, cluster); } cluster.add(row); i++; } // Create the set of candidate committees from the clusters List candidates = new ArrayList(); for (Set cluster : clusters.values()) candidates.add(new CandidateCommittee(cluster, sm)); return candidates; } /** * A decorator for indicating that a {@link CandidateCommittee} has been * reified as an actual committee and will be used for the final cluster * assignment. */ private static class Committee { private final CandidateCommittee cc; public Committee(CandidateCommittee cc) { this.cc = cc; } /** * Returns the centroid. */ public SparseDoubleVector centroid() { return cc.centroid(); } public boolean equals(Object o) { return (o instanceof Committee) && ((Committee)o).cc.equals(cc); } public int hashCode() { return cc.hashCode(); } public String toString() { return cc.toString(); } } /** * A simple struct for representing a proposed committee that has not been * finalized for clustering. This class implements {@link Comparable} so * that an ordered, descending list of Committees can be made based on the * score (highest score first). */ private static class CandidateCommittee implements Comparable { /** * The set of rows that forms this cluster */ private final Set rows; /** * The centroid for this committee. */ private final SparseDoubleVector centroid; /** * The score for this committee. */ private final double score; /** * Constructs a new {@link CandidateCommittee}. 
*/ public CandidateCommittee(Set rows, SparseMatrix sm) { this.rows = rows; // Compute the centroid centroid = new CompactSparseVector(sm.columns()); double simSum = 0d; for (int r : rows) { SparseDoubleVector row = sm.getRowVector(r); VectorMath.add(centroid, row); for (int r2 : rows) { if (r == r2) continue; simSum += Similarity. cosineSimilarity(row, sm.getRowVector(r2)); } } double denom = 1d / rows.size(); for (int nz : centroid.getNonZeroIndices()) { centroid.set(nz, centroid.get(nz) / denom); } // From Phase 2 // // 1.2) For each cluster discovered c compute the following score: // |c| × avgsim(c), where |c| is the number of elements in c // and avgsim(c) is the average pairwise similarity between // elements in c. double avgSim = (rows.size() == 1) ? 0 : simSum / ((rows.size() * rows.size()) - rows.size()); score = rows.size() * avgSim; } /** * Returns the centroid. */ public SparseDoubleVector centroid() { return centroid; } /** * Returns the difference between another Committee's score and this * Committee's score. */ public int compareTo(CandidateCommittee c) { return -Double.compare(score, c.score); } public boolean equals(Object o) { if (o instanceof CandidateCommittee) { CandidateCommittee cc = (CandidateCommittee)o; return cc.rows.equals(rows); } return false; } public int hashCode() { return rows.hashCode(); } /** * Returns the score for this candidate committee according to the * criteria defined in Phase 2, Part 1. */ public double score() { return score; } public String toString() { return "Committee {rows=" + rows + ", score=" + score + ", centroid=" + centroid + "}"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy