
edu.ucla.sspace.clustering.BisectingKMeans Maven / Gradle / Ivy


The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. The package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.Matrices;

import edu.ucla.sspace.vector.DoubleVector;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import java.util.logging.Logger;


/**
 * An implementation of the Bisecting K-Means algorithm, also known as Repeated
 * Bisections.  This implementation is based on the following paper:
 *
 * <ul>
 *   <li>Michael Steinbach, George Karypis, Vipin Kumar. "A comparison of
 *       document clustering techniques," in KDD Workshop on Text Mining,
 *       2000.</li>
 * </ul>
 *
 * This clustering algorithm improves upon the standard K-Means algorithm by
 * taking a data set and repeatedly splitting the data points into two regions.
 * Initially all data points are separated into two clusters.  Then, until the
 * desired number of clusters is created, the largest cluster is divided using
 * K-Means with K equal to 2.  This implementation relies on the {@link
 * KMeansClustering} implementation.  Any properties passed to this clustering
 * method are passed on to the {@link KMeansClustering} algorithm, allowing the
 * user to set the desired seeding method.
 *
 * @see KMeansClustering
 *
 * @author Keith Stevens
 */
public class BisectingKMeans implements Clustering {

    /**
     * Not implemented.
     */
    public Assignments cluster(Matrix dataPoints, Properties props) {
        throw new UnsupportedOperationException(
                "KMeansClustering requires that the " +
                "number of clusters be specified");
    }

    /**
     * {@inheritDoc}
     */
    public Assignments cluster(Matrix dataPoints,
                               int numClusters,
                               Properties props) {
        // Handle a simple base case.
        if (numClusters <= 1)
            return new Assignments(numClusters, dataPoints.rows(), dataPoints);

        // Create a count of cluster assignments.
        int[] numAssignments = new int[numClusters];

        // Create a list of lists.  The inner list represents the vectors
        // assigned to a particular cluster.  We use this method so that we can
        // easily transform the cluster to a Matrix.
        List<List<DoubleVector>> clusters =
            new ArrayList<List<DoubleVector>>(numClusters);
        for (int c = 0; c < numClusters; ++c)
            clusters.add(new ArrayList<DoubleVector>());

        Clustering clustering = new DirectClustering();

        // Make the first bisection.
        Assignments subAssignments = clustering.cluster(dataPoints, 2, props);

        // Count the number of assignments made to each cluster and move the
        // vectors into the corresponding list.
        for (int i = 0; i < subAssignments.size(); ++i) {
            int id = subAssignments.get(i);
            numAssignments[id]++;
            clusters.get(id).add(dataPoints.getRowVector(i));
        }

        // Generate the numClusters - 2 clusters by finding the largest cluster
        // and bisecting it.  Of the 2 resulting clusters, one will maintain the
        // same cluster index and the other will be given a new cluster index,
        // namely k, the current cluster number.
        Assignments assignments =
            new Assignments(numClusters, dataPoints.rows(), dataPoints);

        // Record the results of the first bisection so that later iterations
        // can identify which data points belong to the cluster being split.
        for (int i = 0; i < subAssignments.size(); ++i)
            assignments.set(i, subAssignments.get(i));

        for (int k = 2; k < numClusters; k++) {
            // Find the largest cluster.
            int largestSize = 0;
            int largestIndex = 0;
            for (int c = 0; c < numClusters; ++c) {
                if (numAssignments[c] > largestSize) {
                    largestSize = numAssignments[c];
                    largestIndex = c;
                }
            }

            // Get the list of vectors representing the cluster being split and
            // the cluster that will hold the vectors split off from this
            // cluster.
            List<DoubleVector> originalCluster = clusters.get(largestIndex);
            List<DoubleVector> newCluster = clusters.get(k);

            // Split the largest cluster.
            Matrix clusterToSplit = Matrices.asMatrix(originalCluster);
            Assignments newAssignments =
                clustering.cluster(clusterToSplit, 2, props);

            // Clear the lists for the cluster being split and the new cluster.
            // Also clear the number of assignments.
            originalCluster.clear();
            newCluster.clear();
            numAssignments[largestIndex] = 0;
            numAssignments[k] = 0;

            // Reassign data points in the largest cluster.  Data points
            // assigned to the 0 cluster maintain their cluster number in the
            // real assignment list.  Data points assigned to cluster 1 get the
            // new cluster number, k.
            for (int i = 0, j = 0; i < dataPoints.rows(); ++i) {
                if (assignments.get(i) == largestIndex) {
                    // Make the assignment for vectors that keep their
                    // assignment.
                    if (newAssignments.get(j) == 0) {
                        originalCluster.add(dataPoints.getRowVector(i));
                        numAssignments[largestIndex]++;
                    }
                    // Make the assignment for vectors that have changed their
                    // assignment.
                    else {
                        newCluster.add(dataPoints.getRowVector(i));
                        assignments.set(i, k);
                        numAssignments[k]++;
                    }
                    j++;
                }
            }
        }

        return assignments;
    }

    public String toString() {
        return "BisectingKMeans";
    }
}
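
For reference, below is a minimal usage sketch of the class above. It is an illustration of the call pattern, not canonical S-Space usage: the DenseVector(double[]) constructor from edu.ucla.sspace.vector is assumed here, while Matrices.asMatrix, the cluster(Matrix, int, Properties) method, and the integer cluster ids returned by Assignments.get are taken from the listing. Per the class Javadoc, any key/value pairs placed in the Properties object are forwarded to the underlying k-means calls, which is how a seeding method can be selected.

import edu.ucla.sspace.clustering.Assignments;
import edu.ucla.sspace.clustering.BisectingKMeans;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class BisectingKMeansExample {

    public static void main(String[] args) {
        // Build a small data set: six 2-dimensional points in three loose
        // groups.  DenseVector(double[]) is assumed here; any DoubleVector
        // implementation accepted by Matrices.asMatrix would work.
        double[][] points = {
            {0.0, 0.1}, {0.2, 0.0},
            {5.0, 5.1}, {5.2, 4.9},
            {9.8, 0.0}, {10.1, 0.2}
        };
        List<DoubleVector> rows = new ArrayList<DoubleVector>();
        for (double[] p : points)
            rows.add(new DenseVector(p));
        Matrix dataPoints = Matrices.asMatrix(rows);

        // Properties set here are passed on to the underlying k-means calls,
        // e.g. to choose a seeding strategy; empty uses the defaults.
        Properties props = new Properties();

        // Repeatedly bisect the largest cluster until 3 clusters exist.
        Assignments assignments =
            new BisectingKMeans().cluster(dataPoints, 3, props);

        // Print the cluster id assigned to each row of the input matrix.
        for (int i = 0; i < dataPoints.rows(); ++i)
            System.out.println("row " + i + " -> cluster " + assignments.get(i));
    }
}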



