gov.sandia.cognition.learning.algorithm.clustering.initializer.DistanceSamplingClusterInitializer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cognitive-foundry Show documentation
Show all versions of cognitive-foundry Show documentation
A single jar with all the Cognitive Foundry components.
/*
* File: DistanceSamplingClusterInitializer.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry Learning Core
*
* Copyright February 21, 2011, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
*/
package gov.sandia.cognition.learning.algorithm.clustering.initializer;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.Cluster;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator;
import gov.sandia.cognition.math.DivergenceFunction;
import gov.sandia.cognition.statistics.DiscreteSamplingUtil;
import java.util.Random;
/**
* Implements {@code FixedClusterInitializer} that initializes clusters by
* first selecting a random point for the first cluster and then randomly
* sampling each successive cluster based on the squared minimum distance from
* the point to the existing selected clusters. This is also known as the
* K-means++ initialization algorithm.
*
* @param
* Type of {@code Cluster} used in theaceous {@code learn()}
* method.
* @param
* The algorithm operates on a {@code Collection}, so
* {@code DataType} will be something like Vector or String.
* @author Justin Basilico
* @since 3.1
*/
@PublicationReference(
author={"David Arthur", "Sergei Vassilvitskii"},
title="k-means++: the advantages of careful seeding",
year=2007,
type=PublicationType.Conference,
publication="Proceedings of the eighteenth annual ACM-SIAM Symposium on Discrete algorithms (SODA)",
url="http://portal.acm.org/citation.cfm?id=1283383.1283494")
public class DistanceSamplingClusterInitializer, DataType>
extends AbstractMinDistanceFixedClusterInitializer
{
/**
* Creates a new, empty instance of {@code MinDistanceSamplingClusterInitializer}.
*/
public DistanceSamplingClusterInitializer()
{
this(null, null, new Random());
}
/**
* Creates a new instance of {@code MinDistanceSamplingClusterInitializer}.
*
* @param divergenceFunction
* The divergence function to use.
* @param creator
* The cluster creator to use.
* @param random
* The random number generator to use.
*/
public DistanceSamplingClusterInitializer(
final DivergenceFunction super DataType, ? super DataType> divergenceFunction,
final ClusterCreator creator,
final Random random)
{
super(divergenceFunction, creator, random);
}
@SuppressWarnings("unchecked")
@Override
public DistanceSamplingClusterInitializer clone()
{
return (DistanceSamplingClusterInitializer)
super.clone();
}
@Override
protected int selectNextClusterIndex(
final double[] minDistances,
final boolean[] selected)
{
final int elementCount = minDistances.length;
// Build up the cumulative distribution of weights from which we will
// sample. We build up the cumulative distribution to make it easy
// to sample from by using a binary search.
final double[] cumulativeDistribution = new double[elementCount];
double sum = 0.0;
for (int i = 0; i < elementCount; i++)
{
// The weight is based on the minmum distance. Note that clusters
// that have already been selected will have a minimum distance of
// zero, which means it will have no weight.
final double minDistance = minDistances[i];
final double weight = minDistance * minDistance;
sum += weight;
cumulativeDistribution[i] = sum;
}
if (sum <= 0.0)
{
// All of the data points are distance 0.0 from a cluster center,
// Cannot sample anything.
return -1;
}
else
{
// Randomly select an index from cumulative proportions.
return DiscreteSamplingUtil.sampleIndexFromCumulativeProportions(
this.getRandom(), cumulativeDistribution);
}
}
}