gov.sandia.cognition.text.topic.LatentSemanticAnalysis Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cognitive-foundry Show documentation
Show all versions of cognitive-foundry Show documentation
A single jar with all the Cognitive Foundry components.
/*
* File: LatentSemanticAnalysis.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright March 03, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.topic;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationReferences;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.collection.CollectionUtil;
import gov.sandia.cognition.evaluator.Evaluator;
import gov.sandia.cognition.learning.algorithm.BatchLearner;
import gov.sandia.cognition.math.matrix.Matrix;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorInputEvaluator;
import gov.sandia.cognition.math.matrix.VectorOutputEvaluator;
import gov.sandia.cognition.math.matrix.Vectorizable;
import gov.sandia.cognition.math.matrix.decomposition.SingularValueDecomposition;
import gov.sandia.cognition.math.matrix.mtj.DenseMatrix;
import gov.sandia.cognition.math.matrix.mtj.DenseMatrixFactoryMTJ;
import gov.sandia.cognition.math.matrix.mtj.decomposition.SingularValueDecompositionMTJ;
import gov.sandia.cognition.text.topic.LatentSemanticAnalysis.Transform;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
/**
* Implements the Latent Semantic Analysis (LSA) algorithm using Singular Value
* Decomposition (SVD).
*
* @author Justin Basilico
* @since 3.0
*/
@PublicationReferences(
references={
@PublicationReference(
author={"Scott Deerwester", "Susan T. Dumais", "George W. Furnas", "Thomas K. Landauer", "Richard Harshman"},
title="Indexing by Latent Semantic Analysis",
year=1990,
type=PublicationType.Journal,
publication="Journal of the American Society for Information Science",
pages={391, 407},
url="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.108.8490"
),
@PublicationReference(
author={"Thomas K. Landauer", "Peter W. Foltz", "Darrell Laham"},
title="An Introduction to Latent Semantic Analysis",
year=1998,
type=PublicationType.Journal,
publication="Discourse Processes",
pages={259, 284},
url="http://lsa.colorado.edu/papers/dp1.LSAintro.pdf"
),
@PublicationReference(
author="Wikipedia",
title="Latent semantic analysis",
year=2009,
type=PublicationType.WebPage,
url="http://en.wikipedia.org/wiki/Latent_semantic_analysis"
)
}
)
public class LatentSemanticAnalysis
extends AbstractCloneableSerializable
implements BatchLearner, Transform>
{
// TODO: Implement an iterative LSA that allows documents to be added and removed.
// TODO: Implement a sparse LSA.
/** The default requested rank is {@value}. */
public static final int DEFAULT_REQUESTED_RANK = 10;
/** The rank requested for the result LSA. The results may have smaller rank
* if the requested rank is greater than the number of documents. Must
* be positive.
*/
protected int requestedRank;
/**
* Creates a new {@code LatentSemanticAnalysis} with default parameters.
*/
public LatentSemanticAnalysis()
{
this(DEFAULT_REQUESTED_RANK);
}
/**
* Creates a new {@code LatentSemanticAnalysis} with the given parameters.
*
* @param requestedRank
* The requested rank to create results of.
*/
public LatentSemanticAnalysis(
final int requestedRank)
{
super();
this.setRequestedRank(requestedRank);
}
public Transform learn(
final Collection extends Vectorizable> documents)
{
// Get the dimensionality of the documents. This is also the number of
// terms in the documents.
final int dimensionality =
CollectionUtil.getFirst(documents).convertToVector().getDimensionality();
// Create the input matrix for SVD by stacking the documents as column
// vectors.
final DenseMatrix inputMatrix =
DenseMatrixFactoryMTJ.INSTANCE.copyColumnVectors(documents);
// Perform SVD on the matrix.
// TODO: Do a thin SVD to only take up to the requested rank number of values.
final SingularValueDecomposition svd =
SingularValueDecompositionMTJ.create(inputMatrix);
// Get the singular values and term basis from the SVD.
// The singular values are a diagonal matrix that have the different
// singular values on the diagonal.
// The term basis is the matrix of orthogonal term columns
Matrix singularValues = svd.getS();
Matrix termBasis = svd.getU();
// Figure out the rank of the result. The SVD may have zero entries
// so we may not be able to get the full requested rank.
final int rank = Math.min(this.getRequestedRank(),
svd.effectiveRank(0.0));
// Figure out if we need to downselect the number of rows and columns.
// This happens if
final boolean filterRows = rank < singularValues.getNumRows();
final boolean filterColumns = rank < singularValues.getNumColumns();
if (filterRows || filterColumns)
{
// Change the diagonal to be the proper size.
final int newRows = Math.min(singularValues.getNumRows(), rank);
final int newCols = Math.min(singularValues.getNumColumns(), rank);
singularValues = singularValues.getSubMatrix(
0, newRows - 1,
0, newCols - 1);
}
if (filterRows)
{
// Change the term basis to only include the proper rank of values.
termBasis = termBasis.getSubMatrix(
0, dimensionality - 1,
0, rank - 1);
}
// Create learned result.
return new Transform(termBasis, singularValues);
}
/**
* Gets the requested rank for the analysis.
*
* @return
* The requested rank for the analysis.
*/
public int getRequestedRank()
{
return this.requestedRank;
}
/**
* Sets the requested rank of the analysis. The analysis will attempt to
* find the requested number of latent topics. If the number of documents
* is less than the requested rank, the actual rank of the analysis will
* be reduced to the number of documents.
*
* @param requestedRank
* The requested rank of the analysis. Must be positive.
*/
public void setRequestedRank(
final int requestedRank)
{
if (requestedRank <= 0)
{
throw new IllegalArgumentException("requestedRank must be positive.");
}
this.requestedRank = requestedRank;
}
/**
* The result from doing latent semantic analysis (LSA). It is a transform
* that can be applied as a dimensionality reduction.
*/
public static class Transform
extends AbstractCloneableSerializable
implements Evaluator,
VectorInputEvaluator,
VectorOutputEvaluator
{
/** The matrix of orthogonal term column vectors. */
protected Matrix termBasis;
/** The diagonal matrix of singular values. */
protected Matrix singularValues;
/** The cached transform matrix. It is the term basis times the
* singular values. */
protected Matrix transform;
/**
* Create a new {@code Transform}
*
* @param termBasis
* The matrix of orthogonal term column vectors.
* @param singularValues
* The diagonal matrix of singular values.
*/
public Transform(
final Matrix termBasis,
final Matrix singularValues)
{
super();
this.termBasis = termBasis;
this.singularValues = singularValues;
this.setTransform(termBasis.times(singularValues));
}
public Vector evaluate(
final Vectorizable input)
{
// Apply the transform to the input vector.
return input.convertToVector().times(this.transform);
}
public int getInputDimensionality()
{
return this.transform.getNumRows();
}
public int getOutputDimensionality()
{
return this.transform.getNumColumns();
}
/**
* Gets the rank of the LSA. This is equivalent to the output
* dimensionality of the transform.
*
* @return
* The rank of the LSA.
*/
public int getRank()
{
return this.getOutputDimensionality();
}
/**
* Gets the i-th orthogonal term vector that makes up the basis for
* the transform.
*
* @param i
* An index. Must be between 0 (inclusive) and rank (exclusive).
* @return
* The i-th orthogonal term vector.
*/
public Vector getTermVector(
final int i)
{
return this.termBasis.getColumn(i);
}
/**
* Gets the matrix of orthogonal term column vectors.
*
* @return
* The matrix of orthogonal term column vectors.
*/
public Matrix getTermBasis()
{
return this.termBasis;
}
/**
* Sets the matrix of orthogonal term column vectors.
*
* @param termBasis
* The matrix of orthogonal term column vectors.
*/
protected void setTermBasis(
final Matrix termBasis)
{
this.termBasis = termBasis;
}
/**
* Gets the diagonal matrix of singular values.
*
* @return
* The diagonal matrix of singular values.
*/
public Matrix getSingularValues()
{
return this.singularValues;
}
/**
* Sets the diagonal matrix of singular values.
*
* @param singularValues
* The diagonal matrix of singular values.
*/
protected void setSingularValues(
final Matrix singularValues)
{
this.singularValues = singularValues;
}
/**
* Gets the cached transform matrix. It is the term basis times the
* singular values.
*
* @return
* The cached transform matrix.
*/
public Matrix getTransform()
{
return this.transform;
}
/**
* Gets the cached transform matrix. It is the term basis times the
* singular values.
*
* @param transform
* The cached transform matrix.
*/
protected void setTransform(
final Matrix transform)
{
this.transform = transform;
}
}
}