edu.ucla.sspace.clustering.seeding.OrssSeed Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.
The newest version!
/*
 * Copyright 2011 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.clustering.seeding;

import edu.ucla.sspace.common.Similarity;

import edu.ucla.sspace.matrix.Matrix;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.VectorMath;


/**
 * Selects seeds using a modification of the ORSS algorithm.  Seeds are selected
 * from the data such that they are well scattered.  This implementation has a
 * runtime of O(nkd) and is based on the following paper:
 *
 * <ul>
 *   <li>Rafail Ostrovsky, Yuval Rabani, Leonard J. Schulman, Chaitanya Swamy,
 *   "The effectiveness of Lloyd-type methods for the k-means problem," in
 *   <i>47th IEEE Symposium on the Foundations of Computer Science</i>,
 *   2006</li>
 * </ul>
 *
 * This involves 4 steps:
 *
 * <ol>
 *   <li>First, the cost of an optimal solution to the K-Means problem is
 *       computed for the 1 cluster case (called {@code optimalSingleCost}, the
 *       sum of squared euclidean distances to the mean) along with the
 *       centroid for this case (called {@code c}).</li>
 *   <li>Then, an initial seed {@code c_1} is selected from the set of data
 *       points (X) with probability
 *       {@code (optimalSingleCost + n*euclideanDist(x,c)^2) /
 *       (2*n*optimalSingleCost)}.</li>
 *   <li>The second seed is selected from X with probability
 *       {@code euclideanDist(x, c_1)^2 /
 *       (optimalSingleCost + n*euclideanDist(c, c_1)^2)}.</li>
 *   <li>The remaining k-2 seeds are selected from X with probability
 *       proportional to {@code euclideanDist(x, c_nearest)^2}, where
 *       {@code c_nearest} is the already selected seed closest to
 *       {@code x}.</li>
 * </ol>
 *
 * @author Keith Stevens
 */
public class OrssSeed implements KMeansSeed {

    /**
     * {@inheritDoc}
     *
     * </p>If {@code numCentroids} is zero an empty array is returned.  If
     * {@code dataPoints} has no rows there is nothing to select from and the
     * returned array is left unfilled.
     */
    public DoubleVector[] chooseSeeds(int numCentroids, Matrix dataPoints) {
        int numDataPoints = dataPoints.rows();
        DoubleVector[] centroids = new DoubleVector[numCentroids];

        // Nothing to select, or nothing to select from.
        if (numCentroids == 0 || numDataPoints == 0)
            return centroids;

        // Compute the centroid assuming that there is only one cluster to be
        // found.  This is required for computing the optimal cost of a single
        // cluster solution.
        DoubleVector singleCentroid = new DenseVector(dataPoints.columns());
        for (int i = 0; i < numDataPoints; ++i)
            VectorMath.add(singleCentroid, dataPoints.getRowVector(i));
        singleCentroid = new ScaledDoubleVector(
                singleCentroid, 1/((double)numDataPoints));

        // Compute the optimal cost of the single cluster case, i.e. the sum of
        // the squared distances between every data point and the mean.
        double optimalSingleCost = 0;
        double[] weights = new double[numDataPoints];
        for (int i = 0; i < numDataPoints; ++i) {
            double distance = Similarity.euclideanDistance(
                    singleCentroid, dataPoints.getRowVector(i));
            weights[i] = distance * distance;
            optimalSingleCost += weights[i];
        }

        // Select the first centroid.  Each data point x is weighted by
        //   optimalSingleCost + n*euclideanDist(x, c)^2
        // These weights sum to 2*n*optimalSingleCost, which is exactly the
        // normalizing factor required by the algorithm, so sampling
        // proportionally to them yields the desired distribution.
        for (int i = 0; i < numDataPoints; ++i)
            weights[i] = optimalSingleCost + numDataPoints * weights[i];
        centroids[0] = dataPoints.getRowVector(sampleIndex(weights));

        // Select the remaining centroids.  A data point is picked with
        // probability proportional to its squared distance to the nearest
        // centroid selected so far.  For the second centroid this is the
        // distribution
        //   euclideanDist(x, c_1)^2 /
        //       (optimalSingleCost + n*euclideanDist(c, c_1)^2)
        // because, with c being the mean of the data, the squared distances
        // to c_1 sum to exactly that denominator.
        //
        // The squared distance each data point has to its nearest centroid is
        // stored so that each round only needs to compare against the most
        // recently selected centroid.
        double[] minSquaredDistances = new double[numDataPoints];
        for (int i = 0; i < numDataPoints; ++i)
            minSquaredDistances[i] = Double.POSITIVE_INFINITY;

        for (int c = 1; c < numCentroids; ++c) {
            // Update the nearest centroid distances for every data point
            // before sampling so that the weights can be normalized.
            for (int i = 0; i < numDataPoints; ++i) {
                double distance = Similarity.euclideanDistance(
                        centroids[c-1], dataPoints.getRowVector(i));
                double squaredDistance = distance * distance;
                if (squaredDistance < minSquaredDistances[i])
                    minSquaredDistances[i] = squaredDistance;
            }
            centroids[c] = dataPoints.getRowVector(
                    sampleIndex(minSquaredDistances));
        }

        return centroids;
    }

    /**
     * Returns an index drawn at random with probability proportional to the
     * non-negative value stored at that index in {@code weights}.  When the
     * weights do not sum to a positive finite value (for example when every
     * data point is identical) an index is drawn uniformly at random, so a
     * valid index is always returned.
     *
     * @param weights the non-empty array of unnormalized selection weights
     * @return an index in the range {@code [0, weights.length)}
     */
    private static int sampleIndex(double[] weights) {
        double total = 0;
        for (double weight : weights)
            total += weight;

        // Degenerate weights (all zero, NaN or overflowed): fall back to a
        // uniform choice rather than leaving the seed unselected.
        if (!(total > 0) || Double.isInfinite(total))
            return (int) (Math.random() * weights.length);

        double remaining = Math.random() * total;
        int lastPositive = -1;
        for (int i = 0; i < weights.length; ++i) {
            // Zero weight entries can never be selected.
            if (!(weights[i] > 0))
                continue;
            lastPositive = i;
            remaining -= weights[i];
            if (remaining < 0)
                return i;
        }

        // Rounding error can leave a tiny positive remainder after the final
        // subtraction; the last selectable entry owns that sliver.
        return lastPositive;
    }
}