edu.ucla.sspace.coals.Coals Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.coals;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.matrix.CellMaskedSparseMatrix;
import edu.ucla.sspace.matrix.ArrayMatrix;
import edu.ucla.sspace.matrix.AtomicGrowingSparseMatrix;
import edu.ucla.sspace.matrix.MatlabSparseMatrixBuilder;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFactorization;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.Normalize;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;
import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.text.IteratorFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
/**
* An implementation of the COALS Semantic Space model. This implementation is
* based on:
*
* Rohde, D. L. T.,
* Gonnerman, L. M., Plaut, D. C. (2005). An Improved Model of Semantic
* Similarity Based on Lexical Co-Occurrence. Cognitive Science
* (submitted). Available here
*
* COALS first computes a term-by-term co-occurrence matrix using a ramped
* 4-word window. Once all documents have been processed, the co-occurrence
* matrix will be re-ordered such that only the {@code N} most frequent terms
* have their semantic vectors retained and only the {@code M} most frequent
* terms are used as co-occurrence features. These values can be set by the
* {@value #MAX_WORDS_PROPERTY} and {@value #MAX_DIMENSIONS_PROPERTY}
* properties, respectively. After re-ordering the semantic vectors and
* features, {@link CorrelationTransform} is used to rerank all co-occurrence
* scores. As part of this transform, all negative correlations are dropped and
* replaced with a 0. Finally, and optionally, the {@link SVD} is used to
* reduce the semantic space. To set the number of retained dimensions via
* {@link SVD}, set the {@value #REDUCE_DIMENSION_PROPERTY} property.
*
* @author Keith Stevens
*/
public class Coals implements SemanticSpace {
/**
* The property prefix for other settings.
*/
public static final String PROPERTY_PREFIX =
"edu.ucla.sspace.coals.Coals";
/**
* Specifies whether or not the co-occurance matrix should be reduced.
*/
public static final String REDUCE_MATRIX_PROPERTY =
PROPERTY_PREFIX + ".reduce";
/**
* Specifies the number of dimensions the co-occurance matrix should be
* reduced to.
*/
public static final String REDUCE_DIMENSION_PROPERTY =
PROPERTY_PREFIX + ".dimension";
/**
* Specifies the number of dimensions in the raw co-occurrance matrix to
* maintain.
*/
public static final String MAX_DIMENSIONS_PROPERTY =
PROPERTY_PREFIX + ".maxDimensions";
/**
* Specifies the number of words to build semantics for.
*/
public static final String MAX_WORDS_PROPERTY =
PROPERTY_PREFIX + ".maxWords";
/**
* Specifies if Coals should not normalize the co-occurance matrix.
*/
public static final String DO_NOT_NORMALIZE_PROPERTY =
PROPERTY_PREFIX + ".doNotNormalize";
/**
* The default number of dimensions to reduce to.
*/
private static final int DEFAULT_REDUCE_DIMENSIONS = 800;
/**
* The default number of dimensions to save in the co-occurrance matrix.
*/
private static final int DEFAULT_MAX_DIMENSIONS = 14000;
/**
* The default number of rows to save in the co-occurrance matrix.
*/
private static final int DEFAULT_MAX_WORDS = 15000;
/**
* The name of this {@code SemanticSpace}
*/
public static final String COALS_SSPACE_NAME =
"coals-semantic-space";
/**
* The logger used to record all output
*/
private static final Logger COALS_LOGGER =
Logger.getLogger(Coals.class.getName());
/**
* A mapping from each word to the vector the represents its semantics
*/
private Map wordToSemantics;
/**
* A mapping from word to index number.
*/
private Map termToIndex;
/**
* A map containg the total frequency counts of each word.
*/
private ConcurrentMap totalWordFreq;
/**
* The final reduced matrix.
*/
private Matrix finalCorrelation;
/**
* Specifies the number of reduced dimensions if the matrix is reduced by
* SVD.
*/
private final int reducedDimensions;
/**
* The maximum number of words that will be retained by {@link Coals}.
*/
private final int maxWords;
/**
* The maximum number of co-occurring words that will be retained by {@link
* Coals}.
*/
private final int maxDimensions;
/**
* A counter for keeping track of the index values of words.
*/
private int wordIndexCounter;
/**
* The {@link MatrixFactorization} algorithm that will decompose the word by
* document feature space into two smaller feature spaces: a word by class
* feature space and a class by feature space.
*/
private final MatrixFactorization reducer;
/**
* The {@link Transform} applied to the co-ocucrrence counts, if not {@code
* null}.
*/
private final Transform transform;
public Coals(Transform transform, MatrixFactorization reducer) {
this(transform, reducer, DEFAULT_REDUCE_DIMENSIONS,
DEFAULT_MAX_WORDS, DEFAULT_MAX_DIMENSIONS);
}
/**
* Creats a {@link Coals} instance.
*/
public Coals(Transform transform,
MatrixFactorization reducer,
int reducedDimensions,
int maxWords,
int maxDimensions) {
termToIndex = new HashMap();
totalWordFreq = new ConcurrentHashMap();
wordToSemantics = new HashMap(1024, 4f);
finalCorrelation = null;
this.transform = transform;
this.reducer = reducer;
this.reducedDimensions = (reducedDimensions == 0)
? DEFAULT_REDUCE_DIMENSIONS
: reducedDimensions;
this.maxWords = (maxWords == 0)
? DEFAULT_MAX_WORDS
: maxWords;
this.maxDimensions = (maxDimensions == 0)
? DEFAULT_MAX_DIMENSIONS
: maxDimensions;
}
/**
* {@inheritDoc}
*/
public Set getWords() {
return termToIndex.keySet();
}
/**
* {@inheritDoc}
*/
public Vector getVector(String term) {
Integer index = termToIndex.get(term);
if (index == null)
return null;
return Vectors.immutable(
finalCorrelation.getRowVector(index.intValue()));
}
public String getSpaceName() {
String ret = COALS_SSPACE_NAME;
if (reducer != null)
ret += "-svd-" + reducedDimensions;
return ret;
}
public int getVectorLength() {
return finalCorrelation.columns();
}
/**
* {@inheritDoc}
*/
public void processDocument(BufferedReader document) throws IOException {
Map wordFreq = new HashMap();
Map wordDocSemantics =
new HashMap();
// Setup queues to track the set of previous and next words in a
// context.
Queue prevWords = new ArrayDeque();
Queue nextWords = new ArrayDeque();
Iterator it = IteratorFactory.tokenizeOrdered(document);
for (int i = 0; i < 4 && it.hasNext(); ++i)
nextWords.offer(it.next());
// Compute the co-occurrance statistics of each focus word in the
// document.
while (!nextWords.isEmpty()) {
// Slide over the context by one word.
if (it.hasNext())
nextWords.offer(it.next());
// Get the focus word
String focusWord = nextWords.remove();
if (!focusWord.equals(IteratorFactory.EMPTY_TOKEN)) {
getIndexFor(focusWord);
// Update the frequency count of the focus word.
Integer focusFreq = wordFreq.get(focusWord);
wordFreq.put(focusWord, (focusFreq == null)
? 1
: 1 + focusFreq.intValue());
// Get the temprorary semantics for the focus word, create a new
// vector for them if needed.
SparseDoubleVector focusSemantics = wordDocSemantics.get(
focusWord);
if (focusSemantics == null) {
focusSemantics = new SparseHashDoubleVector(
Integer.MAX_VALUE);
wordDocSemantics.put(focusWord, focusSemantics);
}
// Process the previous words.
int offset = 4 - prevWords.size();
for (String word : prevWords) {
offset++;
if (word.equals(IteratorFactory.EMPTY_TOKEN))
continue;
int index = getIndexFor(word);
focusSemantics.add(index, offset);
}
// Process the next words.
offset = 5;
for (String word : nextWords) {
offset--;
if (word.equals(IteratorFactory.EMPTY_TOKEN))
continue;
int index = getIndexFor(word);
focusSemantics.add(index, offset);
}
}
prevWords.offer(focusWord);
if (prevWords.size() > 4)
prevWords.remove();
}
// Add the temporary vectors for each word in this document to the
// actual semantic fectors.
for (Map.Entry e :
wordDocSemantics.entrySet()) {
SparseDoubleVector focusSemantics = getSemanticVector(
e.getKey());
// Get the non zero indices before hand so that they are cached
// during the synchronized section.
focusSemantics.getNonZeroIndices();
synchronized (focusSemantics) {
VectorMath.add(focusSemantics, e.getValue());
}
}
// Store the total frequency counts of the words seen in this document
// so far.
for (Map.Entry entry : wordFreq.entrySet()) {
int count = entry.getValue().intValue();
AtomicInteger freq = totalWordFreq.putIfAbsent(
entry.getKey(), new AtomicInteger(count));
if (freq != null)
freq.addAndGet(count);
}
}
/**
* Returns the current semantic vector for the provided word, or if the word
* is not currently in the semantic space, a vector is added for it and
* returned.
*
* @param word a word
*
* @return the {@code SemanticVector} for the provide word.
*/
private SparseDoubleVector getSemanticVector(String word) {
SparseDoubleVector v = wordToSemantics.get(word);
if (v == null) {
// lock on the word in case multiple threads attempt to add it at
// once
synchronized(this) {
// recheck in case another thread added it while we were waiting
// for the lock
v = wordToSemantics.get(word);
if (v == null) {
v = new CompactSparseVector();
wordToSemantics.put(word, v);
}
}
}
return v;
}
/**
* Returns the index in the co-occurence matrix for this word. If the word
* was not previously assigned an index, this method adds one for it and
* returns that index.
*/
private int getIndexFor(String word) {
Integer index = termToIndex.get(word);
if (index == null) {
synchronized(this) {
// recheck to see if the term was added while blocking
index = termToIndex.get(word);
// if another thread has not already added this word while the
// current thread was blocking waiting on the lock, then add it.
if (index == null) {
int i = wordIndexCounter++;
termToIndex.put(word, i);
return i; // avoid the auto-boxing to assign i to index
}
}
}
return index;
}
/**
* {@inheritDoc}
*/
public void processSpace(Properties props) {
COALS_LOGGER.info("Droppring dimensions from co-occurrance matrix.");
// Read in the matrix from a file with dimensions dropped.
finalCorrelation = buildMatrix(maxWords, maxDimensions);
COALS_LOGGER.info("Done dropping dimensions.");
if (transform != null) {
COALS_LOGGER.info("Normalizing co-occurrance matrix.");
// Normalize the matrix using correlation.
int wordCount = finalCorrelation.rows();
finalCorrelation = transform.transform(finalCorrelation);
COALS_LOGGER.info("Done normalizing co-occurrance matrix.");
}
if (reducer != null) {
if (reducedDimensions > finalCorrelation.columns())
throw new IllegalArgumentException(
"Cannot reduce to more dimensions than exist");
COALS_LOGGER.info("Reducing using SVD.");
try {
File coalsMatrixFile =
File.createTempFile("coals-term-doc-matrix", "dat");
coalsMatrixFile.deleteOnExit();
MatrixIO.writeMatrix(finalCorrelation,
coalsMatrixFile,
Format.SVDLIBC_SPARSE_BINARY);
MatrixFile processedSpace = new MatrixFile(
coalsMatrixFile, Format.SVDLIBC_SPARSE_BINARY);
reducer.factorize(processedSpace, reducedDimensions);
finalCorrelation = reducer.dataClasses();
} catch (IOException ioe) {
throw new IOError(ioe);
}
COALS_LOGGER.info("Done reducing using SVD.");
}
}
/**
* Returns a {@link Matrix} that contains {@code maxWords} and {@code
* maxDimensions} columns. If {@code maxWords} is 0, then all words will be
* returned in the semantic {@link Matrix}. If {@code maxDimensions} is
* larger than the number of observed features, then all observed features
* will be maintained. The resulting rows and columns are both ordred based
* on the frequency of each term, in descending order, {@code termToIndex}
* is modified to account for these changed.
*/
private Matrix buildMatrix(int maxWords, int maxDimensions) {
// Convert the vectors in the semantic map to a matrix.
SparseDoubleVector[] vectorList =
new SparseDoubleVector[wordToSemantics.size()];
for (Map.Entry e :
wordToSemantics.entrySet())
vectorList[getIndexFor(e.getKey())] = e.getValue();
SparseMatrix matrix = Matrices.asSparseMatrix(
Arrays.asList(vectorList));
// If maxwords was set to 0, save all words.
if (maxWords == 0 || maxWords > wordToSemantics.size())
maxWords = wordToSemantics.size();
COALS_LOGGER.info("Forming the inverse mapping from terms to indices.");
// Calculate an inverse mapping from index to word since the binary file
// stores things by index number.
String[] indexToTerm = new String[termToIndex.size()];
for (Map.Entry entry : termToIndex.entrySet())
indexToTerm[entry.getValue()] = entry.getKey();
COALS_LOGGER.info("Sorting the terms based on frequency.");
// Calculate the new indices for each word that will be kept based on
// the frequency count, where the most frequent word will be first.
ArrayList> wordCountList =
new ArrayList>(
totalWordFreq.entrySet());
Collections.sort(wordCountList, new EntryComp());
// Calculate the new term to index mapping based on the order of the
// word frequencies.
COALS_LOGGER.info("Generating the index masks.");
// Compute the number of dimensions to maintain.
int wordCount = (wordCountList.size() > maxDimensions)
? maxDimensions
: wordCountList.size();
int[] rowMask = new int[maxWords];
int[] colMask = new int[wordCount];
// Create a new vector list to store the word semantics that will be
// retained. When this method exits, it will throw away all the other
// vectors.
SparseDoubleVector[] newVectorList = new SparseDoubleVector[maxWords];
// For each of the terms that we have a mapping, add row and column
// maskings for the indices of the first maxWords terms. For all other
// terms, remove the term to index mapping.
int termCount = 0;
for (Map.Entry entry : wordCountList) {
Integer oldIndex = termToIndex.get(entry.getKey());
// Skip any non mapped terms.
if (oldIndex == null)
continue;
// Add a row and/or column mask from the index of this word to it's
// index in the original matrix.
if (termCount < maxWords) {
if (termCount < wordCount)
colMask[termCount] = oldIndex;
// Add the vector for this reserved word to the new vector list.
newVectorList[termCount] = vectorList[oldIndex];
// Record the new dimension for this term.
rowMask[termCount] = termCount;
termToIndex.put(entry.getKey(), termCount);
termCount++;
}
// Drop all other mappings.
else
termToIndex.remove(entry.getKey());
}
wordToSemantics = null;
matrix = Matrices.asSparseMatrix(Arrays.asList(newVectorList));
// Return a masked version of the original matrix.
return new CellMaskedSparseMatrix(matrix, rowMask, colMask);
}
private class EntryComp
implements Comparator> {
public int compare(Map.Entry o1,
Map.Entry o2) {
int diff = o2.getValue().get() - o1.getValue().get();
return (diff != 0) ? diff : o2.getKey().compareTo(o1.getKey());
}
}
}