/*
 * cc.mallet.util.VectorStats Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of mallet Show documentation
 * MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
 * The newest version!
 */
package cc.mallet.util;

/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
 This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
 http://www.cs.umass.edu/~mccallum/mallet
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */

/**
 * Class of static methods for calculating statistics of a SparseVector sample
 * packaged in an InstanceList.
 *
 * @author Jerod Weinman
 */

import java.util.Arrays;
import java.util.Iterator;

import cc.mallet.types.*;
import gnu.trove.TIntHashSet;

/**
 * Static helpers that compute per-index statistics (mean, variance and
 * standard deviation) over the SparseVector data carried by the instances
 * of an InstanceList.
 *
 * Every method returns null when the instance list is null or empty.
 */
public class VectorStats {

	/**
	 * Returns a SparseVector whose entries (taken from the union of
	 * those in the instances) are the expected values of those in the
	 * InstanceList. This implies the returned vector will not have
	 * binary values.
	 *
	 * The result is dense when at least one instance holds a dense vector
	 * that reaches at least as far as the largest sparse index seen;
	 * otherwise it is sparse over the union of all indices.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @return the mean vector, or null when instances is null or empty.
	 */
	public static SparseVector mean(InstanceList instances) {

		if (instances == null || instances.size() == 0) {
			return null;
		}

		int maxSparseIndex = -1;
		int maxDenseIndex = -1;

		// First, we find the union of all the indices used in the instances
		TIntHashSet unionIndices = new TIntHashSet(instances.getDataAlphabet().size());

		for (Instance instance : instances) {
			SparseVector v = (SparseVector) instance.getData();
			int[] indices = v.getIndices();

			if (indices == null) {
				// dense: locations 0 .. numLocations()-1 are all in use
				maxDenseIndex = Math.max(maxDenseIndex, v.numLocations() - 1);
			} else if (indices.length > 0) {
				unionIndices.addAll(indices);

				// NOTE(review): relies on the index array being sorted
				// ascending so that the last entry is the largest -- confirm.
				maxSparseIndex = Math.max(maxSparseIndex, indices[indices.length - 1]);
			}
			// A sparse vector with no entries contributes no indices; it
			// must be skipped because reading its "last" index would throw
			// ArrayIndexOutOfBoundsException.
		}

		if (maxDenseIndex > -1) { // dense vectors were present
			if (maxSparseIndex <= maxDenseIndex) {
				// sparse indices may have been present, but they never
				// exceeded those of the dense vectors, so the result can
				// simply be dense
				return mean(instances, maxDenseIndex + 1);
			}

			// sparse vectors reached beyond the dense ones: stay sparse,
			// but make sure every dense index is represented
			for (int i = 0; i <= maxDenseIndex; i++) {
				unionIndices.add(i);
			}
		}

		// reaching this statement implies we can create a sparse vector
		return mean(instances, unionIndices.toArray());
	}

	/**
	 * Returns a SparseVector whose entries (dense with the given
	 * number of indices) are the expected values of those in the
	 * InstanceList. This implies the returned vector will not have
	 * binary values.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param numIndices
	 *          Length of the dense result vector.
	 * @return the mean vector, or null when instances is null or empty.
	 */
	public static SparseVector mean(InstanceList instances, int numIndices) {
		// The freshly allocated zero array is handed over without copying.
		SparseVector accumulator = new SparseVector(new double[numIndices], false);

		return mean(instances, accumulator);
	}

	/**
	 * Returns a SparseVector whose entries (the given indices) are
	 * the expected values of those in the InstanceList. This implies
	 * the returned vector will not have binary values.
	 *
	 * Note that the given array is sorted in place and then passed to
	 * the result vector without copying, so the caller's array is
	 * modified. No check for duplicate indices is made.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param indices
	 *          Indices the result should have; sorted in place.
	 * @return the mean vector, or null when instances is null or empty.
	 */
	public static SparseVector mean(InstanceList instances, int[] indices) {

		// Sorting up front is faster than asking the SparseVector
		// constructor to check and sort the indices itself.
		Arrays.sort(indices);

		// All-zero values; flags: no copy, no sort check, no duplicate removal.
		SparseVector accumulator = new SparseVector(indices,
				new double[indices.length], false, false, false);

		return mean(instances, accumulator);
	}

	/**
	 * Accumulates each instance's vector, scaled by 1/N, into the given
	 * vector and returns that same vector.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param meanVector
	 *          Accumulator, expected to start at zero; modified in place.
	 * @return meanVector, or null when instances is null or empty.
	 */
	private static SparseVector mean(InstanceList instances,
			SparseVector meanVector) {
		if (instances == null || instances.size() == 0) {
			return null;
		}

		final double factor = 1.0 / (double) instances.size();

		for (Instance instance : instances) {
			SparseVector v = (SparseVector) instance.getData();

			meanVector.plusEqualsSparse(v, factor);
		}

		return meanVector;
	}

	/**
	 * Returns a SparseVector whose entries (taken from the union of
	 * those in the instances) are the variance of those in the
	 * InstanceList. This implies the returned vector will not have
	 * binary values.
	 * 
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param unbiased
	 *          Normalizes by N-1 when true, and by N otherwise.
	 * @return the variance vector, or null when instances is null or empty.
	 */
	public static SparseVector variance(InstanceList instances, boolean unbiased) {
		return variance(instances, mean(instances), unbiased);
	}

	/**
	 * Returns a SparseVector whose entries (taken from the mean
	 * argument) are the variance of those in the InstanceList. This
	 * implies the returned vector will not have binary values.
	 *
	 * The value is computed in one pass as (sum(x^2) - N*mu^2) / D, with
	 * D = N-1 or N. With a single instance and unbiased set to true, D is
	 * zero, so the division yields non-finite entries.
	 * 
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param mean
	 *          Mean of the given instances; it is cloned, not modified.
	 * @param unbiased
	 *          Normalizes by N-1 when true, and by N otherwise.
	 * @return the variance vector, or null when instances is null or empty.
	 */
	public static SparseVector variance(InstanceList instances,
			SparseVector mean, boolean unbiased) {

		if (instances == null || instances.size() == 0) {
			return null;
		}

		final int n = instances.size();
		final double factor = 1.0 / (double) (n - (unbiased ? 1.0 : 0.0));

		// var = (x^2 - n*mu^2)/(n-1)

		// Start from -n*mu^2 * factor: multiplying the clone by itself
		// squares each entry, and the scalar applies sign and normalizer.
		SparseVector result = (SparseVector) mean.cloneMatrix();
		result.timesEqualsSparse(result, -((double) n) * factor);

		for (Instance instance : instances) {
			// Square a copy so the instance's own data stays untouched.
			SparseVector squared =
					(SparseVector) ((SparseVector) instance.getData()).cloneMatrix();
			squared.timesEqualsSparse(squared);

			result.plusEqualsSparse(squared, factor);
		}

		return result;
	}

	/**
	 * Returns unbiased variance.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @return the variance vector, or null when instances is null or empty.
	 */
	public static SparseVector variance(InstanceList instances) {
		return variance(instances, true);
	}

	/**
	 * Returns unbiased variance of instances having the given mean.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param mean
	 *          Mean of the given instances.
	 * @return the variance vector, or null when instances is null or empty.
	 */
	public static SparseVector variance(InstanceList instances, SparseVector mean) {
		return variance(instances, mean, true);
	}

	/**
	 * Square root of variance.
	 * 
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param mean
	 *          Mean of the given instances.
	 * @param unbiased
	 *          Normalizes variance by N-1 when true, and by N otherwise.
	 * @return the standard deviation vector, or null when instances is
	 *         null or empty.
	 * @see #variance(InstanceList, SparseVector, boolean)
	 */
	public static SparseVector stddev(InstanceList instances, SparseVector mean,
			boolean unbiased) {

		// variance() already returns null for a null or empty list, so
		// checking its result handles both cases without dereferencing
		// a null instance list here.
		SparseVector sv = variance(instances, mean, unbiased);
		if (sv == null) {
			return null;
		}

		final int dim = sv.numLocations();

		for (int loc = 0; loc < dim; loc++) {
			// The one-pass variance can come out marginally below zero
			// through rounding; clamp so the square root is 0 rather than NaN.
			double val = Math.max(0.0, sv.valueAtLocation(loc));

			sv.setValueAtLocation(loc, Math.sqrt(val));
		}

		return sv;
	}

	/**
	 * Square root of unbiased variance.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @return the standard deviation vector, or null when instances is
	 *         null or empty.
	 */
	public static SparseVector stddev(InstanceList instances) {
		return stddev(instances, true);
	}

	/**
	 * Square root of variance.
	 * 
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param unbiased
	 *          Normalizes variance by N-1 when true, and by N otherwise.
	 * @return the standard deviation vector, or null when instances is
	 *         null or empty.
	 * @see #variance(InstanceList, boolean)
	 */
	public static SparseVector stddev(InstanceList instances, boolean unbiased) {
		return stddev(instances, mean(instances), unbiased);
	}

	/**
	 * Square root of unbiased variance of instances having the given mean.
	 *
	 * @param instances
	 *          Instances whose data are cast to SparseVector.
	 * @param mean
	 *          Mean of the given instances.
	 * @return the standard deviation vector, or null when instances is
	 *         null or empty.
	 */
	public static SparseVector stddev(InstanceList instances, SparseVector mean) {
		return stddev(instances, mean, true);
	}

}