org.openimaj.ml.clustering.rac.IntRAC Maven / Gradle / Ivy

Various clustering algorithm implementations for all primitive types including random, random forest, K-Means (Exact, Hierarchical and Approximate), ...

/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.ml.clustering.rac;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.math.FunctionEvaluationException;
import org.apache.commons.math.MaxIterationsExceededException;
import org.apache.commons.math.analysis.UnivariateRealFunction;
import org.apache.commons.math.analysis.solvers.BisectionSolver;
import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.data.DataSource;
import org.openimaj.data.RandomData;
import org.openimaj.ml.clustering.CentroidsProvider;
import org.openimaj.ml.clustering.IndexClusters;
import org.openimaj.ml.clustering.SpatialClusterer;
import org.openimaj.ml.clustering.SpatialClusters;
import org.openimaj.ml.clustering.assignment.HardAssigner;
import org.openimaj.util.pair.IntFloatPair;

/**
 * An implementation of the RAC algorithm proposed by Ramanan and Niranjan.
 * <p>
 * During training, data points are selected at random. The first data point is
 * chosen as a centroid. Every following data point is set as a new centroid if
 * it is outside the threshold of all current centroids. In this way it is
 * difficult to guarantee the number of clusters, so a minimisation function is
 * provided to allow a close estimate of the required threshold for a given K.
 * <p>
 * This implementation supports int[] cluster centroids.
 * <p>
 * In terms of implementation, this class is both a clusterer, an assigner and
 * the result of the clustering. This is because the RAC algorithm never ends;
 * that is to say that if a new point is being assigned through the
 * {@link HardAssigner} interface, and that point is more than the threshold
 * distance from any other centroid, then a new centroid will be created for the
 * point. If this behaviour is undesirable, the results of clustering can be
 * "frozen" by manually constructing an assigner that takes a
 * {@link CentroidsProvider} (or the centroids provided by calling
 * {@link #getCentroids()}) as an argument.
 *
 * @author Sina Samangooei ([email protected])
 * @author Jonathon Hare ([email protected])
 */
@Reference(
		type = ReferenceType.Inproceedings,
		author = { "Amirthalingam Ramanan", "Mahesan Niranjan" },
		title = "Resource-Allocating Codebook for Patch-based Face Recognition",
		year = "2009",
		booktitle = "IIS",
		url = "http://eprints.ecs.soton.ac.uk/21401/")
public class IntRAC
		implements SpatialClusters<int[]>, SpatialClusterer<IntRAC, int[]>, CentroidsProvider<int[]>,
		HardAssigner<int[], float[], IntFloatPair>
{
	private static class ClusterMinimisationFunction implements UnivariateRealFunction {
		private int[][] distances;
		private int[][] samples;
		private int nClusters;

		public ClusterMinimisationFunction(int[][] samples, int[][] distances, int nClusters) {
			this.distances = distances;
			this.samples = samples;
			this.nClusters = nClusters;
		}

		@Override
		public double value(double radius) throws FunctionEvaluationException {
			final IntRAC r = new IntRAC(radius);
			r.train(samples, distances);
			final int diff = this.nClusters - r.numClusters();
			return diff;
		}
	}

	private static final String HEADER = SpatialClusters.CLUSTER_HEADER + "RAIC";

	protected ArrayList<int[]> codebook;
	protected double threshold;
	protected int nDims;
	protected static int[][] distances;
	protected long totalSamples;

	/**
	 * Sets the threshold to 128
	 */
	public IntRAC() {
		codebook = new ArrayList<int[]>();
		this.threshold = 128;
		this.nDims = -1;
		this.totalSamples = 0;
	}

	/**
	 * Define the threshold at which point a new cluster will be made.
	 *
	 * @param radiusSquared
	 */
	public IntRAC(double radiusSquared) {
		this();
		this.threshold = radiusSquared;
	}

	/**
	 * Iteratively select subSamples from bKeys and try to choose a threshold
	 * which results in nClusters. This is provided to estimate the threshold, as
	 * it is a very data-dependent value.
	 * The threshold is found using a
	 * BisectionSolver with a complete distance matrix (so make sure subSamples
	 * is reasonable).
	 *
	 * @param bKeys
	 *            All keys to be trained against
	 * @param subSamples
	 *            number of subsamples to select from bKeys each iteration
	 * @param nClusters
	 *            number of clusters to aim for
	 */
	public IntRAC(int[][] bKeys, int subSamples, int nClusters) {
		this();
		distances = new int[subSamples][subSamples];
		int j = 0;
		this.threshold = 0;
		final int thresholdIteration = 5;
		while (j++ < thresholdIteration) {
			final int[][] randomList = new int[subSamples][];
			final int[] randomListIndex = RandomData.getUniqueRandomInts(subSamples, 0, bKeys.length);
			int ri = 0;
			for (int k = 0; k < randomListIndex.length; k++)
				randomList[ri++] = bKeys[randomListIndex[k]];
			try {
				this.threshold += calculateThreshold(randomList, nClusters);
			} catch (final Exception e) {
				this.threshold += 200000;
			}
			System.out.println("Current threshold: " + this.threshold / j);
		}
		this.threshold /= thresholdIteration;
	}

	@SuppressWarnings("deprecation")
	protected static double calculateThreshold(int[][] samples, int nClusters)
			throws MaxIterationsExceededException, FunctionEvaluationException
	{
		int maxDistance = 0;
		for (int i = 0; i < samples.length; i++) {
			for (int j = i + 1; j < samples.length; j++) {
				distances[i][j] = distanceEuclidianSquared(samples[i], samples[j]);
				distances[j][i] = distances[i][j];
				if (distances[i][j] > maxDistance)
					maxDistance = distances[i][j];
			}
		}
		System.out.println("Distance matrix calculated");
		final BisectionSolver b = new BisectionSolver();
		b.setAbsoluteAccuracy(100.0);
		return b.solve(100, new ClusterMinimisationFunction(samples, distances, nClusters), 0, maxDistance);
	}

	int train(int[][] samples, int[][] distances) {
		int foundLength = -1;
		final List<Integer> codebookIndex = new ArrayList<Integer>();
		for (int i = 0; i < samples.length; i++) {
			final int[] entry = samples[i];
			if (foundLength == -1)
				foundLength = entry.length;
			// all the data entries must be the same length otherwise this
			// doesn't make sense
			if (foundLength != entry.length) {
				this.codebook = new ArrayList<int[]>();
				return -1;
			}
			boolean found = false;
			for (final int j : codebookIndex) {
				if (distances[i][j] < threshold) {
					found = true;
					break;
				}
			}
			if (!found) {
				this.codebook.add(entry);
				codebookIndex.add(i);
			}
		}
		this.nDims = foundLength;
		return 0;
	}

	@Override
	public IntRAC cluster(int[][] data) {
		int foundLength = -1;
		for (final int[] entry : data) {
			if (foundLength == -1)
				foundLength = entry.length;
			// all the data entries must be the same length otherwise this
			// doesn't make sense
			if (foundLength != entry.length) {
				this.codebook = new ArrayList<int[]>();
				throw new RuntimeException();
			}
			boolean found = false;
			for (final int[] existing : this.codebook) {
				if (distanceEuclidianSquared(entry, existing) < threshold) {
					found = true;
					break;
				}
			}
			if (!found) {
				this.codebook.add(entry);
				if (this.codebook.size() % 1000 == 0) {
					System.out.println("Codebook increased to size " + this.codebook.size());
				}
			}
		}
		return this;
	}

	@Override
	public IntRAC cluster(DataSource<int[]> data) {
		final int[][] dataArr = new int[data.numRows()][data.numDimensions()];

		return cluster(dataArr);
	}

	static int distanceEuclidianSquared(int[] a, int[] b) {
		int sum = 0;
		for (int i = 0; i < a.length; i++) {
			final int diff = a[i] - b[i];
			sum += diff * diff;
		}
		return sum;
	}

	static int distanceEuclidianSquared(int[] a, int[] b, int threshold2) {
		int sum = 0;
		for (int i = 0; i < a.length; i++) {
			final int diff = a[i] - b[i];
			sum += diff * diff;
			if (sum > threshold2)
				return threshold2;
		}
		return sum;
	}

	@Override
	public int numClusters() {
		return this.codebook.size();
	}

	@Override
	public int numDimensions() {
		return this.nDims;
	}

	@Override
	public int[] assign(int[][] data) {
		final int[] centroids = new int[data.length];
		for (int i = 0; i < data.length; i++) {
			final int[] entry = data[i];
			centroids[i] = this.assign(entry);
		}
		return centroids;
	}

	@Override
	public int assign(int[] data) {
		int mindiff = -1;
		int centroid = -1;
		for (int i = 0; i < this.numClusters(); i++) {
			final int[] centroids = this.codebook.get(i);
			int sum = 0;
			boolean set = true;
			for (int j = 0; j < centroids.length; j++) {
				final int diff = centroids[j] - data[j];
				sum += diff * diff;
				if (mindiff != -1 && mindiff < sum) {
					set = false;
					break; // Stop checking the distance if you
				}
			}
			if (set) {
				mindiff = sum;
				centroid = i;
				// if(mindiff < this.threshold){
				// return centroid;
				// }
			}
		}
		return centroid;
	}

	@Override
	public String asciiHeader() {
		return "ASCII" + HEADER;
	}

	@Override
	public byte[] binaryHeader() {
		return HEADER.getBytes();
	}

	@Override
	public void readASCII(Scanner in) throws IOException {
		throw new UnsupportedOperationException("Not done!");
	}

	@Override
	public void readBinary(DataInput dis) throws IOException {
		threshold = dis.readDouble();
		nDims = dis.readInt();
		final int nClusters = dis.readInt();
		assert (threshold > 0);
		codebook = new ArrayList<int[]>();

		for (int i = 0; i < nClusters; i++) {
			final byte[] wang = new byte[nDims];
			dis.readFully(wang, 0, nDims);
			final int[] cluster = new int[nDims];
			for (int j = 0; j < nDims; j++)
				cluster[j] = wang[j] & 0xFF;
			codebook.add(cluster);
		}
	}

	@Override
	public void writeASCII(PrintWriter writer) throws IOException {
		writer.format("%d\n", this.threshold);
		writer.format("%d\n", this.nDims);
		writer.format("%d\n", this.numClusters());
		for (final int[] a : this.codebook) {
			writer.format("%d,", a);
		}
	}

	@Override
	public void writeBinary(DataOutput dos) throws IOException {
		dos.writeDouble(this.threshold);
		dos.writeInt(this.nDims);
		dos.writeInt(this.numClusters());
		for (final int[] arr : this.codebook) {
			for (final int a : arr) {
				dos.write(a);
			}
		}
	}

	@Override
	public int[][] getCentroids() {
		return this.codebook.toArray(new int[0][]);
	}

	@Override
	public void assignDistance(int[][] data, int[] indices, float[] distances) {
		throw new UnsupportedOperationException("Not implemented");
	}

	@Override
	public IntFloatPair assignDistance(int[] data) {
		throw new UnsupportedOperationException("Not implemented");
	}

	@Override
	public HardAssigner<int[], float[], IntFloatPair> defaultHardAssigner() {
		return this;
	}

	/**
	 * The number of centroids; this potentially grows as assignments are made.
	 *
	 * @see org.openimaj.ml.clustering.assignment.HardAssigner#size()
	 */
	@Override
	public int size() {
		return this.nDims;
	}

	@Override
	public int[][] performClustering(int[][] data) {
		return new IndexClusters(this.cluster(data).defaultHardAssigner().assign(data)).clusters();
	}
}
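
What follows is a minimal usage sketch, not part of the OpenIMAJ source above. It assumes the jar containing IntRAC (and its commons-math dependency) is on the classpath; the random data, the squared-distance threshold of 1000000 and the class name IntRACUsageSketch are illustrative assumptions rather than values taken from the library.

import java.util.Random;

import org.openimaj.ml.clustering.rac.IntRAC;

public class IntRACUsageSketch {
	public static void main(String[] args) {
		// Random 128-dimensional int vectors standing in for real feature data
		// (an illustrative assumption, not a recommended input).
		final Random rng = new Random(42);
		final int[][] data = new int[1000][128];
		for (final int[] row : data)
			for (int i = 0; i < row.length; i++)
				row[i] = rng.nextInt(256);

		// The constructor argument is a squared Euclidean distance, chosen
		// arbitrarily here: a data point becomes a new centroid only if it is at
		// least this far from every existing centroid.
		final IntRAC rac = new IntRAC(1000000);
		rac.cluster(data);
		System.out.println("Centroids after clustering: " + rac.numClusters());

		// assign(...) returns the index of the nearest centroid for a query.
		final int[] query = new int[128];
		System.out.println("Query assigned to cluster " + rac.assign(query));

		// Clustering further data can keep growing the codebook; to freeze the
		// current result, snapshot the centroids and build an assigner over the
		// copy instead of over the live IntRAC.
		final int[][] frozen = rac.getCentroids();
		System.out.println("Frozen codebook size: " + frozen.length);
	}
}

Because IntRAC is simultaneously the clusterer, the assigner and the cluster result, the same object can be passed wherever a HardAssigner or CentroidsProvider is expected; the snapshot at the end is only needed when the codebook must stop growing.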
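
A second sketch, also an assumption rather than example code shipped with the library, shows the threshold-estimating constructor described in the Javadoc above. The sub-sample size of 200 and the target of 100 clusters are arbitrary; since the constructor builds a full subSamples-by-subSamples distance matrix on each of its five iterations, the sub-sample size should stay modest.

import java.util.Random;

import org.openimaj.ml.clustering.rac.IntRAC;

public class IntRACThresholdSketch {
	public static void main(String[] args) {
		final Random rng = new Random(7);
		final int[][] features = new int[5000][128];
		for (final int[] row : features)
			for (int i = 0; i < row.length; i++)
				row[i] = rng.nextInt(256);

		// Bisect for a squared-distance threshold that yields roughly 100
		// centroids on random sub-samples of 200 features, then cluster the
		// full set with the averaged threshold.
		final IntRAC rac = new IntRAC(features, 200, 100);
		rac.cluster(features);
		System.out.println("Clusters found with estimated threshold: " + rac.numClusters());
	}
}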




