edu.ucla.sspace.index.RandomOrthogonalVectorGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high-dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics, and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.index;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.VectorMath;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
/**
 * A class that generates {@link DoubleVector}s that are orthogonal to each
 * other using the Gram-Schmidt orthogonalization process.  Every new vector is
 * first filled with gaussian random values and then has its projection onto
 * each previously generated vector subtracted from it.
 *
 * <p>This class declares three property names:
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>{@value #VECTOR_MEAN_PROPERTY}</b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_MEAN}
 *
 * <dd style="padding-top: .5em">This variable sets the gaussian mean for
 *      generating values in random vectors.<p>
 *
 * <dt> <i>Property:</i>
 *      <code><b>{@value #VECTOR_STANDARD_DEVIATION_PROPERTY}</b></code> <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_STANDARD_DEVIATION}
 *
 * <dd style="padding-top: .5em">This variable sets the standard deviation used
 *      when generating gaussian values for random vectors.<p>
 *
 * <dt> <i>Property:</i> <code><b>{@value #VECTOR_LENGTH_PROPERTY}</b></code>
 *      <br>
 *      <i>Default:</i> {@value #DEFAULT_VECTOR_LENGTH}
 *
 * <dd style="padding-top: .5em">This property name is declared for configuring
 *      the length of the vectors to create.  Note that the constructors of
 *      this class do not read it: the vector length is always taken from the
 *      {@code vectorLength} constructor argument.<p>
 *
 * </dl>
 *
 * <p>Instances keep every generated vector in an internal list that is
 * modified by {@link #generate()} without any synchronization.
 */
// NOTE(review): DoubleVectorGenerator is implemented here as a raw/non-generic
// type; if the interface is generic its type argument should be added --
// confirm against the interface declaration.
public class RandomOrthogonalVectorGenerator
        implements DoubleVectorGenerator, Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * A random number generator that can be accessed by other classes which
     * will rely on the same source of random values.
     */
    public static final Random RANDOM = new Random();

    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.index.RandomOrthogonalVectorGenerator";

    /**
     * The property to specify the gaussian mean of the random values placed in
     * each generated vector.
     */
    public static final String VECTOR_MEAN_PROPERTY =
        PROPERTY_PREFIX + ".mean";

    /**
     * The property to specify the gaussian standard deviation of the random
     * values placed in each generated vector.
     */
    public static final String VECTOR_STANDARD_DEVIATION_PROPERTY =
        PROPERTY_PREFIX + ".std";

    /**
     * The property name reserved for the length of generated vectors.  It is
     * not read by this class; the length is supplied to the constructors.
     */
    public static final String VECTOR_LENGTH_PROPERTY =
        PROPERTY_PREFIX + ".length";

    /**
     * The default gaussian mean of the random values.
     */
    public static final int DEFAULT_VECTOR_MEAN = 0;

    /**
     * The default number of dimensions associated with
     * {@link #VECTOR_LENGTH_PROPERTY}.
     */
    public static final int DEFAULT_VECTOR_LENGTH = 1000;

    /**
     * The default gaussian standard deviation of the random values.
     */
    public static final int DEFAULT_VECTOR_STANDARD_DEVIATION = 1;

    /**
     * The mean of random values to generate.
     */
    private double mean;

    /**
     * The standard deviation of random values to generate.
     */
    private double std;

    /**
     * The length for each vector to generate.  This also limits the number of
     * vectors that can be generated.
     */
    private int vectorLength;

    /**
     * The list of orthogonal vectors already generated by this generator,
     * including the initial vector.
     */
    private final List<DoubleVector> generatedVectors;

    /**
     * Constructs this instance using the system properties and no initial
     * vector; a random initial vector is created instead.
     *
     * @param vectorLength the length of every vector to generate
     */
    public RandomOrthogonalVectorGenerator(int vectorLength) {
        this(vectorLength, System.getProperties(), null);
    }

    /**
     * Constructs this instance using the system properties and the provided
     * initial vector.
     *
     * @param vectorLength the length of every vector to generate
     * @param originalVector the first vector that all generated vectors will
     *        be orthogonal to, or {@code null} to start from a random vector
     */
    public RandomOrthogonalVectorGenerator(int vectorLength,
                                           DoubleVector originalVector) {
        this(vectorLength, System.getProperties(), originalVector);
    }

    /**
     * Constructs this instance using the provided properties and an initial
     * vector.  The mean and standard deviation are read from
     * {@link #VECTOR_MEAN_PROPERTY} and
     * {@link #VECTOR_STANDARD_DEVIATION_PROPERTY}, falling back to
     * {@link #DEFAULT_VECTOR_MEAN} and
     * {@link #DEFAULT_VECTOR_STANDARD_DEVIATION} when a property is absent.
     *
     * @param vectorLength the length of every vector to generate
     * @param properties the properties from which the mean and standard
     *        deviation are read
     * @param originalVector the first vector that all generated vectors will
     *        be orthogonal to, or {@code null} to start from a random vector.
     *        Its length is not checked here and is assumed to equal
     *        {@code vectorLength}.
     *
     * @throws NumberFormatException if a configured property value cannot be
     *         parsed as a {@code double}
     */
    public RandomOrthogonalVectorGenerator(int vectorLength,
                                           Properties properties,
                                           DoubleVector originalVector) {
        String meanProp = properties.getProperty(VECTOR_MEAN_PROPERTY);
        mean = (meanProp != null)
            ? Double.parseDouble(meanProp)
            : DEFAULT_VECTOR_MEAN;

        String stdProp =
            properties.getProperty(VECTOR_STANDARD_DEVIATION_PROPERTY);
        std = (stdProp != null)
            ? Double.parseDouble(stdProp)
            : DEFAULT_VECTOR_STANDARD_DEVIATION;

        this.vectorLength = vectorLength;

        generatedVectors = new ArrayList<DoubleVector>();
        if (originalVector == null) {
            originalVector = generateInitialVector(vectorLength, mean, std);
        }
        generatedVectors.add(originalVector);
    }

    /**
     * Generates a dense vector whose values are drawn from a gaussian
     * distribution with the given mean and standard deviation.
     *
     * @param length the number of dimensions in the vector
     * @param mean the gaussian mean of each value
     * @param std the gaussian standard deviation of each value
     *
     * @return a new random vector
     */
    private static DoubleVector generateInitialVector(int length,
                                                      double mean,
                                                      double std) {
        DoubleVector vector = new DenseVector(length);
        for (int i = 0; i < length; ++i) {
            // Shift and scale a standard normal sample.
            vector.set(i, std * RANDOM.nextGaussian() + mean);
        }
        return vector;
    }

    /**
     * Computes the dot product between two vectors over the first
     * {@code u.length()} dimensions.
     *
     * @param u the first vector, whose length bounds the iteration
     * @param v the second vector
     *
     * @return the sum of the pairwise products of the vectors' values
     */
    private static double dotProduct(DoubleVector u,
                                     DoubleVector v) {
        double dot = 0;
        for (int i = 0; i < u.length(); ++i) {
            dot += u.get(i) * v.get(i);
        }
        return dot;
    }

    /**
     * Generates a new random vector and removes from it the projection onto
     * every vector previously produced by this generator (including the
     * initial vector), then records and returns it.
     *
     * @return a random vector that is orthogonal to all previously created
     *         vectors
     *
     * @throws IllegalArgumentException if the number of stored vectors has
     *         already reached the vector length, in which case no further
     *         orthogonal vector exists
     */
    public DoubleVector generate() {
        // ">=" rather than "==" so that a non-positive vector length cannot
        // slip past the limit (the list always holds the initial vector).
        if (generatedVectors.size() >= vectorLength) {
            throw new IllegalArgumentException(
                "Too many vectors have been generated");
        }

        DoubleVector vector = generateInitialVector(vectorLength, mean, std);
        for (DoubleVector otherVector : generatedVectors) {
            double uDotU = dotProduct(otherVector, otherVector);
            // The projection onto a zero vector is zero; dividing by a zero
            // norm would instead fill the result with NaN.
            if (uDotU == 0d) {
                continue;
            }

            // The dot product is taken against the partially orthogonalized
            // vector, so each step works on the latest residual.
            double scale = dotProduct(otherVector, vector) / uDotU;
            for (int i = 0; i < vectorLength; ++i) {
                vector.set(i, vector.get(i) - otherVector.get(i) * scale);
            }
        }

        generatedVectors.add(vector);
        return vector;
    }
}