
cc.mallet.util.VectorStats Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
package cc.mallet.util;
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
* Class of static methods for calculating statistics of a SparseVector sample
* packaged in an InstanceList.
*
* @author Jerod Weinman
*/
import java.util.Arrays;
import java.util.Iterator;
import cc.mallet.types.*;
import gnu.trove.TIntHashSet;
/**
 * Static utility methods for computing per-feature statistics (mean, variance,
 * standard deviation) over the {@code SparseVector} data carried by the
 * instances of an {@code InstanceList}.
 *
 * <p>Every method casts each instance's data to {@code SparseVector}; a list
 * holding any other kind of data will fail with a {@code ClassCastException}.
 * All public methods return {@code null} for a {@code null} or empty list.
 */
public class VectorStats {

    /**
     * Returns a {@code SparseVector} whose entries (taken from the union of
     * those in the instances) are the expected values of those in the
     * {@code InstanceList}. This implies the returned vector will not have
     * binary values.
     *
     * <p>If dense vectors are present and no sparse vector uses an index
     * beyond the dense range, the result is dense; otherwise it is sparse over
     * the union of all indices seen.
     *
     * @param instances the sample; may be {@code null} or empty
     * @return the mean vector, or {@code null} if there are no instances
     */
    public static SparseVector mean(InstanceList instances) {
        if (instances == null || instances.size() == 0) {
            return null;
        }

        int maxSparseIndex = -1;
        int maxDenseIndex = -1;

        // First, find the union of all the indices used in the instances.
        TIntHashSet unionIndices = new TIntHashSet(instances.getDataAlphabet().size());
        for (Instance instance : instances) {
            SparseVector v = (SparseVector) instance.getData();
            int[] indices = v.getIndices();
            if (indices == null) {
                // Dense vector: its locations are the indices 0..numLocations-1.
                maxDenseIndex = Math.max(maxDenseIndex, v.numLocations() - 1);
            } else if (indices.length > 0) {
                // An empty sparse vector contributes nothing; skipping it also
                // avoids reading indices[-1].
                unionIndices.addAll(indices);
                // Takes the last index as the largest, i.e. this relies on the
                // vector keeping its index array in ascending order.
                maxSparseIndex = Math.max(maxSparseIndex, indices[indices.length - 1]);
            }
        }

        if (maxDenseIndex > -1) {
            // Dense vectors were present.
            if (maxSparseIndex <= maxDenseIndex) {
                // Sparse indices (if any) never exceed the dense range, so a
                // dense result covers everything.
                return mean(instances, maxDenseIndex + 1);
            }
            // Some sparse index lies beyond the dense range: build a sparse
            // result and make sure every dense index is part of it.
            for (int i = 0; i <= maxDenseIndex; i++) {
                unionIndices.add(i);
            }
        }

        // Reaching this statement implies we can create a sparse vector.
        return mean(instances, unionIndices.toArray());
    }

    /**
     * Returns a {@code SparseVector} whose entries (dense with the given
     * number of indices) are the expected values of those in the
     * {@code InstanceList}. This implies the returned vector will not have
     * binary values.
     *
     * @param instances  the sample; may be {@code null} or empty
     * @param numIndices number of entries in the dense result
     * @return the mean vector, or {@code null} if there are no instances
     */
    public static SparseVector mean(InstanceList instances, int numIndices) {
        SparseVector accumulator = new SparseVector(new double[numIndices], false);
        return mean(instances, accumulator);
    }

    /**
     * Returns a {@code SparseVector} whose entries (the given indices) are
     * the expected values of those in the {@code InstanceList}. This implies
     * the returned vector will not have binary values.
     *
     * <p><b>Note:</b> {@code indices} is sorted in place and handed to the
     * result vector without copying, so the caller's array is modified.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param indices   feature indices of the result; must not be {@code null}
     * @return the mean vector, or {@code null} if there are no instances
     */
    public static SparseVector mean(InstanceList instances, int[] indices) {
        // Sorting up front is faster than asking the vector to do it, so the
        // mean vector is created with all zeros and with nothing copied,
        // sorted, or checked for duplicates by the constructor.
        Arrays.sort(indices);
        SparseVector accumulator =
                new SparseVector(indices, new double[indices.length], false, false, false);
        return mean(instances, accumulator);
    }

    /**
     * Accumulates each instance's vector, scaled by {@code 1/N}, into
     * {@code meanVector} and returns it.
     *
     * @param instances  the sample; may be {@code null} or empty
     * @param meanVector zero-initialised vector that defines the result's
     *                   indices; updated in place
     * @return {@code meanVector}, or {@code null} if there are no instances
     */
    private static SparseVector mean(InstanceList instances, SparseVector meanVector) {
        if (instances == null || instances.size() == 0) {
            return null;
        }
        double factor = 1.0 / instances.size();
        for (Instance instance : instances) {
            SparseVector v = (SparseVector) instance.getData();
            meanVector.plusEqualsSparse(v, factor);
        }
        return meanVector;
    }

    /**
     * Returns a {@code SparseVector} whose entries (taken from the union of
     * those in the instances) are the variance of those in the
     * {@code InstanceList}. This implies the returned vector will not have
     * binary values.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param unbiased  Normalizes by N-1 when true, and by N otherwise.
     * @return the variance vector, or {@code null} if there are no instances
     */
    public static SparseVector variance(InstanceList instances, boolean unbiased) {
        return variance(instances, mean(instances), unbiased);
    }

    /**
     * Returns a {@code SparseVector} whose entries (taken from the mean
     * argument) are the variance of those in the {@code InstanceList}. This
     * implies the returned vector will not have binary values.
     *
     * <p>Computed as {@code (sum(x^2) - N*mu^2) / D}, where {@code D} is
     * {@code N-1} when {@code unbiased} is true and {@code N} otherwise. With
     * a single instance and {@code unbiased == true} the normalizer is zero,
     * so the returned entries are not finite.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param mean      mean of the given instances; not modified (a clone is
     *                  used). Must not be {@code null} when the list is
     *                  non-empty.
     * @param unbiased  Normalizes by N-1 when true, and by N otherwise.
     * @return the variance vector, or {@code null} if there are no instances
     */
    public static SparseVector variance(InstanceList instances,
                                        SparseVector mean, boolean unbiased) {
        if (instances == null || instances.size() == 0) {
            return null;
        }
        int n = instances.size();
        double factor = 1.0 / (unbiased ? n - 1 : n);

        // Start from -N * mu^2 * factor ...
        SparseVector result = (SparseVector) mean.cloneMatrix();
        result.timesEqualsSparse(result, -n * factor);

        // ... then add factor * x^2 for every instance. Each vector is cloned
        // before squaring so the instance data itself is left untouched.
        for (Instance instance : instances) {
            SparseVector squared =
                    (SparseVector) ((SparseVector) instance.getData()).cloneMatrix();
            squared.timesEqualsSparse(squared);
            result.plusEqualsSparse(squared, factor);
        }
        return result;
    }

    /**
     * Returns unbiased variance.
     *
     * @param instances the sample; may be {@code null} or empty
     * @return the variance vector, or {@code null} if there are no instances
     */
    public static SparseVector variance(InstanceList instances) {
        return variance(instances, true);
    }

    /**
     * Returns unbiased variance of instances having the given mean.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param mean      mean of the given instances
     * @return the variance vector, or {@code null} if there are no instances
     */
    public static SparseVector variance(InstanceList instances, SparseVector mean) {
        return variance(instances, mean, true);
    }

    /**
     * Square root of variance.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param mean      Mean of the given instances.
     * @param unbiased  Normalizes variance by N-1 when true, and by N otherwise.
     * @return the standard-deviation vector, or {@code null} if there are no
     *         instances
     * @see #variance(InstanceList, SparseVector, boolean)
     */
    public static SparseVector stddev(InstanceList instances, SparseVector mean,
                                      boolean unbiased) {
        // variance() returns null for a null or empty list, which keeps the
        // null handling here consistent with the other entry points.
        SparseVector sv = variance(instances, mean, unbiased);
        if (sv == null) {
            return null;
        }
        for (int loc = 0, dim = sv.numLocations(); loc < dim; loc++) {
            sv.setValueAtLocation(loc, Math.sqrt(sv.valueAtLocation(loc)));
        }
        return sv;
    }

    /**
     * Square root of unbiased variance.
     *
     * @param instances the sample; may be {@code null} or empty
     * @return the standard-deviation vector, or {@code null} if there are no
     *         instances
     */
    public static SparseVector stddev(InstanceList instances) {
        return stddev(instances, true);
    }

    /**
     * Square root of variance.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param unbiased  Normalizes variance by N-1 when true, and by N otherwise.
     * @return the standard-deviation vector, or {@code null} if there are no
     *         instances
     * @see #variance(InstanceList, boolean)
     */
    public static SparseVector stddev(InstanceList instances, boolean unbiased) {
        return stddev(instances, mean(instances), unbiased);
    }

    /**
     * Square root of unbiased variance of instances having the given mean.
     *
     * @param instances the sample; may be {@code null} or empty
     * @param mean      mean of the given instances
     * @return the standard-deviation vector, or {@code null} if there are no
     *         instances
     */
    public static SparseVector stddev(InstanceList instances, SparseVector mean) {
        return stddev(instances, mean, true);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy