org.openimaj.knn.pq.DoubleProductQuantiser Maven / Gradle / Ivy

Go to download
/*
	AUTOMATICALLY GENERATED BY jTemp FROM
	/Users/jsh2/Work/openimaj/target/checkout/machine-learning/nearest-neighbour/src/main/jtemp/org/openimaj/knn/pq/#T#ProductQuantiser.jtemp
*/
/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
  
package org.openimaj.knn.pq;

import java.util.Arrays;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.knn.DoubleNearestNeighboursExact;
import org.openimaj.knn.NearestNeighbours;

/**
 * Implementation of a Product Quantiser for vectors/arrays of doubles. Product
 * Quantisers quantise data into a very large number of clusters (large enough
 * that the centroids could not possibly fit into memory - i.e. 2^64 centroids).
 * The Product Quantiser can be used to create compressed representations of
 * high-dimensional vectors, and also as a means to perform efficient
 * nearest-neighbour search over large collections of vectors (which have been
 * effectively compressed using the product quantiser).
 * 
 * This is achieved by breaking down the input vectors into non-overlapping
 * sub-vectors, and applying quantisation to these sub-vectors individually. The
 * number of bins (cluster centroids) for the sub-vectors is small (up to 256 in
 * this implementation), but when combined over all sub-vectors, the number of
 * bins is much larger as it accounts for all combinations of bins across
 * sub-vectors. As only a small set of centroids needs to be held for the
 * sub-vectors, the memory requirements are quite modest. The output of the
 * quantisation action in this implementation is an array of bytes corresponding
 * to the index of the matching centroid for each sub-vector (index numbers are
 * offset by -128 so that 256 centroids indexes can fit in a single byte). The
 * bit-pattern of this byte array could be interpreted as a numeric value of
 * global cluster index, however in practice this is not useful.
 * 

 * Typically the product quantiser is "trained" so that it adapts to the data
 * that it is being applied to. The standard approach to this is to use
 * K-Means, however, this is not required. Insofar as this implementation is
 * concerned, any set of compatible {@link NearestNeighbours} implementations
 * can be provided to the constructor. Each of the {@link NearestNeighbours}
 * could even potentially have a different number of dimensions (corresponding
 * to the sub-vector lengths).
 * 
 * In the standard case, where you just want to use K-Means to train the Product
 * Quantiser, a set of utility methods can be found in the
 * org.openimaj.knn.pq.DoubleProductQuantiserUtilities class which can be found in
 * the clustering sub-project (due to the dependence on the K-Means algorithm).
 * 
 * @author Jonathon Hare ([email protected])
 * 
 */
 @Reference(
 		type = ReferenceType.Article,
 		author = { "Jegou, Herve", "Douze, Matthijs", "Schmid, Cordelia" },
 		title = "Product Quantization for Nearest Neighbor Search",
 		year = "2011",
 		journal = "IEEE Trans. Pattern Anal. Mach. Intell.",
 		pages = { "117", "", "128" },
 		url = "http://dx.doi.org/10.1109/TPAMI.2010.57",
 		month = "January",
 		number = "1",
 		publisher = "IEEE Computer Society",
 		volume = "33",
 		customData = {
 				"issn", "0162-8828",
 				"numpages", "12",
 				"doi", "10.1109/TPAMI.2010.57",
 				"acmid", "1916695",
 				"address", "Washington, DC, USA",
 				"keywords", "High-dimensional indexing, High-dimensional indexing, image indexing, very large databases, approximate search., approximate search., image indexing, very large databases"
 		})
public class DoubleProductQuantiser {
	/**
	 * The nearest-neighbour assigners; the i-th assigner handles the i-th
	 * sub-vector.
	 */
	protected DoubleNearestNeighboursExact[] assigners;

	/**
	 * Total number of dimensions covered by the assigners, i.e. the sum of
	 * {@code numDimensions()} over all of them.
	 */
	protected int ndims;

	/**
	 * Construct a {@link DoubleProductQuantiser} with the given
	 * nearest-neighbour assigners. The number of dimensions of the assigners
	 * determines how long each sub-vector is. There is a one-to-one mapping
	 * between the order of the assigners and the order of the sub-vectors.
	 * <p>
	 * Centroid indexes are stored in a single byte per sub-vector (see
	 * {@link #quantise(double[])}), so an assigner holding more than 256
	 * centroids cannot be represented faithfully.
	 * 
	 * @param assigners
	 *            the nearest-neighbour assigners.
	 */
	public DoubleProductQuantiser(DoubleNearestNeighboursExact[] assigners) {
		this.assigners = assigners;

		for (final DoubleNearestNeighboursExact nn : assigners) {
			ndims += nn.numDimensions();
		}
	}

	/**
	 * Quantise the given data using this Product Quantiser. The output is an
	 * array of bytes corresponding to the index of the matching centroid for
	 * each sub-vector (index numbers are offset by -128 so that 256 centroids
	 * indexes can fit in a single byte).
	 * <p>
	 * The input must provide at least as many dimensions as the assigners
	 * cover in total; any components beyond that are ignored.
	 * 
	 * @param data
	 *            the data to quantise
	 * @return the quantised data.
	 * @throws IllegalArgumentException
	 *             if {@code data} has fewer dimensions than the assigners
	 *             cover in total
	 */
	public byte[] quantise(double[] data) {
		// Arrays.copyOfRange zero-pads when the requested range runs past the
		// end of the source array, so without this guard a too-short vector
		// would be silently quantised as if its missing components were 0.
		if (data.length < ndims) {
			throw new IllegalArgumentException("Expected a vector with at least " + ndims
					+ " dimensions, but got " + data.length);
		}

		final byte[] quantised = new byte[assigners.length];

		// Single-element buffers reused for every query. dst is required by
		// searchNN but its contents are not used here.
		final int[] idx = { 0 };
		final double[] dst = { 0 };
		final double[][] qus = new double[1][];

		int from = 0;
		for (int i = 0; i < assigners.length; i++) {
			final int len = assigners[i].numDimensions();

			qus[0] = Arrays.copyOfRange(data, from, from + len);
			assigners[i].searchNN(qus, idx, dst);

			// shift the index range [0, 255] onto the signed byte range
			quantised[i] = (byte) (idx[0] - 128);

			from += len;
		}

		return quantised;
	}

	/**
	 * Decompress the quantised data by replacing each encoded index with the
	 * actual centroid sub-vector. This reverses the -128 offset applied by
	 * {@link #quantise(double[])} and concatenates the selected centroids in
	 * assigner order.
	 *
	 * @param qdata
	 *            the quantised data; must contain at least one byte per
	 *            assigner
	 *
	 * @return the (approximate) decompressed feature
	 */
	public double[] decompress(byte[] qdata) {
		final double[] data = new double[ndims];

		int from = 0;
		for (int i = 0; i < assigners.length; i++) {
			final int len = assigners[i].numDimensions();

			// undo the -128 offset to recover the centroid index
			final int index = qdata[i] + 128;

			System.arraycopy(assigners[i].getPoints()[index], 0, data, from, len);

			from += len;
		}

		return data;
	}
}