edu.ucla.sspace.purandare.PurandareFirstOrder Maven / Gradle / Ivy
The S-Space Package is a collection of algorithms for building Semantic
Spaces as well as a highly scalable library for designing new
distributional semantics algorithms. Distributional algorithms process
text corpora and represent the semantics of words as high-dimensional
feature vectors. The package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics, and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
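For orientation, here is a minimal usage sketch of this class through the
SemanticSpace life cycle (the corpus file name and the wrapper class are
hypothetical; a real run would feed in many documents before calling
processSpace):

    import edu.ucla.sspace.purandare.PurandareFirstOrder;
    import edu.ucla.sspace.vector.DoubleVector;
    import java.io.BufferedReader;
    import java.io.FileReader;

    public class PurandareExample {
        public static void main(String[] args) throws Exception {
            PurandareFirstOrder wsi = new PurandareFirstOrder();
            // Each call to processDocument consumes one document of text.
            wsi.processDocument(
                new BufferedReader(new FileReader("corpus.txt")));
            // Cluster each term's contexts and induce per-sense vectors.
            wsi.processSpace(System.getProperties());
            // The first sense keeps the raw token; further senses appear
            // as "token-1", "token-2", and so on.
            for (String word : wsi.getWords()) {
                DoubleVector v = wsi.getVector(word);
                System.out.println(word + " has " + v.length() + " features");
            }
        }
    }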
/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.purandare;
import edu.ucla.sspace.clustering.Assignments;
import edu.ucla.sspace.clustering.ClutoClustering;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.Similarity.SimType;
import edu.ucla.sspace.common.Statistics;
import edu.ucla.sspace.matrix.AtomicGrowingMatrix;
import edu.ucla.sspace.matrix.AtomicMatrix;
import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.SparseRowMaskedMatrix;
import edu.ucla.sspace.matrix.YaleSparseMatrix;
import edu.ucla.sspace.matrix.SparseOnDiskMatrix;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.util.SparseHashArray;
import edu.ucla.sspace.util.WorkerThread;
import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.vector.Vectors;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicInteger;
/**
* An implementation of the word sense induction algorithm described by
* Purandare and Pedersen. This implementation is based on the following
* paper:
*
* <ul>
*   <li> Amruta Purandare and Ted Pedersen. (2004) "Word Sense
*        Discrimination by Clustering Contexts in Vector and Similarity
*        Spaces." <i>Proceedings of the Conference on Computational Natural
*        Language Learning (CoNLL)</i>, pp. 41-48, May 6-7, 2004, Boston,
*        MA.
* </ul>
*
* <p>This class offers one configurable parameter:
*
* <ul>
*   <li> <i>Property:</i> <code><b>{@value #MAX_CONTEXTS_PER_WORD}</b></code>
*        <br><i>Default:</i> {@link Integer#MAX_VALUE}
*
*        <p>This property sets the upper bound on the number of contexts to
*        be clustered for a single word. If there are fewer contexts than
*        this value, all of the contexts will be used. Users should consider
*        setting this value if a large corpus is to be used, or if the
*        corpus contains many frequently occurring words after filtering.
* </ul>
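*
* <p>A minimal configuration sketch (the bound shown here is an arbitrary
* example value, not a recommendation):
*
* <pre>{@code
* Properties props = new Properties(System.getProperties());
* props.setProperty(PurandareFirstOrder.MAX_CONTEXTS_PER_WORD, "10000");
* PurandareFirstOrder wsi = new PurandareFirstOrder(props);
* }</pre>
*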
* @author David Jurgens
*/
public class PurandareFirstOrder implements SemanticSpace {
private static final Logger LOGGER =
Logger.getLogger(PurandareFirstOrder.class.getName());
private static final String PROPERTY_PREFIX =
"edu.ucla.sspace.purandare.PurandareFirstOrder";
/**
* The property to set the upper bound on the number of contexts to be
* clustered for a single word.
*/
public static final String MAX_CONTEXTS_PER_WORD =
PROPERTY_PREFIX + ".maxContexts";
/**
* Map that pairs the word with its position in the original term-document
* matrix.
*/
private final Map<String,Integer> termToIndex;
/**
* A mapping from a term to its sense-induced semantic vectors. The first
* sense of the term will be the token itself, while additional senses will
* be denoted with a "-" and a sense number appended to the token.
*/
private final Map<String,DoubleVector> termToVector;
/**
* The window size for identifying co-occurring words that have the
* potential to be features.
*/
private final int windowSize;
/**
* The window size used for generating feature vectors.
*/
private final int contextWindowSize;
/**
* The matrix used for storing weighted co-occurrence statistics of those
* words that occur both before and after the focus word.
*/
private final AtomicMatrix cooccurrenceMatrix;
/**
* A count for how many times each term appears in the corpus.
*/
private final List<AtomicInteger> termCounts;
/**
* A compressed version of the corpus that is built as the text version is
* being processed. For each document, the file contains an integer count
* of the tokens in that document, an integer count of the tokens that
* passed filtering, and then the indices of the tokens in the order that
* they appeared, with -1 marking filtered-out tokens.
*
* @see #processIntDocument(int,int[],Matrix,int,BitSet)
*/
private File compressedDocuments;
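/*
* Layout sketch of one document in the compressed stream, using
* hypothetical term indices (see processDocument for the writing side):
*
*   writeInt(5)   // total tokens in the document, including filtered ones
*   writeInt(4)   // tokens that passed the token filter
*   writeInt(12) writeInt(-1) writeInt(7) writeInt(12) writeInt(3)
*                 // per-token term indices; -1 marks a filtered token
*/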
/**
* The output stream used to the write the {@link #compressedDocuments} file
* as the text documents are being processed.
*/
private DataOutputStream compressedDocumentsWriter;
/**
* A counter for the number of documents seen in the corpus.
*/
private final AtomicInteger documentCounter;
/**
* The maximum number of contexts allowable for any word.
*/
private final int maxContextsPerWord;
/**
* A counter that tracks the next available word index.
*/
private int wordIndexCounter;
/**
* Creates a new instance of {@code PurandareFirstOrder} using the system
* properties for configuration
*/
public PurandareFirstOrder() {
this(System.getProperties());
}
/**
* Creates a new instance of {@code PurandareFirstOrder} using the provided
* properties for configuration
*/
public PurandareFirstOrder(Properties props) {
cooccurrenceMatrix = new AtomicGrowingMatrix();
termToIndex = new ConcurrentHashMap<String,Integer>();
termToVector = new ConcurrentHashMap<String,DoubleVector>();
termCounts = new CopyOnWriteArrayList<AtomicInteger>();
windowSize = 5;
contextWindowSize = 20;
documentCounter = new AtomicInteger(0);
String maxContextsProp = props.getProperty(MAX_CONTEXTS_PER_WORD);
if (maxContextsProp == null)
maxContextsPerWord = Integer.MAX_VALUE;
else {
int i = Integer.parseInt(maxContextsProp);
if (i <= 0)
throw new IllegalArgumentException(
"The number of contexts must be a positive number");
maxContextsPerWord = i;
}
try {
compressedDocuments =
File.createTempFile("petersen-documents",".dat");
compressedDocumentsWriter = new DataOutputStream(
new BufferedOutputStream(
new FileOutputStream(compressedDocuments)));
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* {@inheritDoc}
*/
public void processDocument(BufferedReader document) throws IOException {
documentCounter.getAndIncrement();
Queue<String> nextWords = new ArrayDeque<String>();
Queue<String> prevWords = new ArrayDeque<String>();
Iterator<String> documentTokens =
IteratorFactory.tokenizeOrdered(document);
String focus = null;
ByteArrayOutputStream compressedDocument =
new ByteArrayOutputStream(4096);
DataOutputStream dos = new DataOutputStream(compressedDocument);
int tokens = 0; // count how many are in this document
int unfilteredTokens = 0;
//Load the first windowSize words into the Queue
for(int i = 0; i < windowSize && documentTokens.hasNext(); i++)
nextWords.offer(documentTokens.next());
while(!nextWords.isEmpty()) {
tokens++;
// Load the top of the nextWords Queue into the focus word
focus = nextWords.remove();
// Add the next word to nextWords queue (if possible)
if (documentTokens.hasNext()) {
String windowEdge = documentTokens.next();
nextWords.offer(windowEdge);
}
// If the filter does not accept this word, skip the semantic
// processing, continue with the next word
if (focus.equals(IteratorFactory.EMPTY_TOKEN)) {
// Mark the token as empty using a negative term index in the
// compressed form of the document
dos.writeInt(-1);
// shift the window
prevWords.offer(focus);
if (prevWords.size() > windowSize)
prevWords.remove();
continue;
}
int focusIndex = getIndexFor(focus);
// write the term index into the compressed form of the document for
// later corpus reprocessing
dos.writeInt(focusIndex);
// Update the occurrences of this token
termCounts.get(focusIndex).incrementAndGet();
unfilteredTokens++;
// Iterate through the words occurring after and add values
for (String after : nextWords) {
// skip adding co-occurrence values for words that are not
// accepted by the filter
if (!after.equals(IteratorFactory.EMPTY_TOKEN)) {
int index = getIndexFor(after);
cooccurrenceMatrix.addAndGet(focusIndex, index, 1);
}
}
for (String before : prevWords) {
// skip adding co-occurrence values for words that are not
// accepted by the filter
if (!before.equals(IteratorFactory.EMPTY_TOKEN)) {
int index = getIndexFor(before);
cooccurrenceMatrix.addAndGet(focusIndex, index, 1);
}
}
// last, put this focus word in the prev words and shift off the
// front if it is larger than the window
prevWords.offer(focus);
if (prevWords.size() > windowSize)
prevWords.remove();
}
dos.close();
byte[] docAsBytes = compressedDocument.toByteArray();
// Once the document is finished, write the compressed contents to the
// corpus stream
synchronized(compressedDocumentsWriter) {
// Write how many tokens were in this document
compressedDocumentsWriter.writeInt(tokens);
compressedDocumentsWriter.writeInt(unfilteredTokens);
compressedDocumentsWriter.write(docAsBytes, 0, docAsBytes.length);
}
}
/**
* Returns the index in the co-occurrence matrix for this word. If the word
* was not previously assigned an index, this method adds one for it and
* returns that index.
*/
private final int getIndexFor(String word) {
Integer index = termToIndex.get(word);
if (index == null) {
synchronized(this) {
// recheck to see if the term was added while blocking
index = termToIndex.get(word);
// if another thread has not already added this word while the
// current thread was blocking waiting on the lock, then add it.
if (index == null) {
int i = wordIndexCounter++;
// Add a new counter for this term. Because wordIndexCounter
// starts at zero, the new counter's position in the termCounts
// list matches the term's index.
termCounts.add(new AtomicInteger(0));
termToIndex.put(word, i);
return i; // avoid the auto-boxing to assign i to index
}
}
}
return index;
}
/**
* {@inheritDoc}
*/
public Set<String> getWords() {
// If no documents have been processed, it will be empty
return Collections.unmodifiableSet(termToVector.keySet());
}
/**
* {@inheritDoc}
*/
public DoubleVector getVector(String word) {
return termToVector.get(word);
}
/**
* {@inheritDoc}
*/
public void processSpace(Properties properties) {
try {
// Wrap the call to avoid having all the code in a try/catch. This
// is for improved readability purposes only.
processSpace();
} catch (IOException ioe) {
throw new IOError(ioe);
}
}
/**
* Calculates the first-order co-occurrence statistics to determine the
* feature set for each term, then clusters the feature vectors for each
* term's contexts, and finally induces the sense-specific vectors for each
* term.
*/
@SuppressWarnings("unchecked")
private void processSpace() throws IOException {
compressedDocumentsWriter.close();
// Generate the reverse index-to-term mapping. We will need this for
// assigning specific senses to each term
String[] indexToTerm = new String[termToIndex.size()];
for (Map.Entry<String,Integer> e : termToIndex.entrySet())
indexToTerm[e.getValue()] = e.getKey();
// Compute how many terms were in the corpus. We will need this for
// determining the log-likelihood for all co-occurrences.
int corpusSize = 0;
for (AtomicInteger i : termCounts)
corpusSize += i.get();
final int uniqueTerms = cooccurrenceMatrix.rows();
LOGGER.info("calculating term features");
// Create a set for each term that contains the term indices that are
// determined to be features for the term, i.e. not all co-occurrences
// will count as features. The feature set for each term is determined
// by computing the log-likelihood for each co-occurrence and only
// keeping those terms whose log-likelihood value is above a certain
// threshold.
final BitSet[] termFeatures = new BitSet[wordIndexCounter];
for (int termIndex = 0; termIndex < uniqueTerms; ++termIndex) {
String term = indexToTerm[termIndex];
termFeatures[termIndex] = calculateTermFeatures(term, corpusSize);
}
LOGGER.info("reprocessing corpus to generate feature vectors");
// Set up the concurrent data structures so we can process the documents
// concurrently
final BlockingQueue<Runnable> workQueue =
new LinkedBlockingQueue<Runnable>();
for (int i = 0; i < Runtime.getRuntime().availableProcessors(); ++i) {
Thread t = new WorkerThread(workQueue);
t.start();
}
final Semaphore termsProcessed = new Semaphore(0);
for (int termIndex = 0; termIndex < uniqueTerms; ++termIndex) {
final String term = indexToTerm[termIndex];
final int i = termIndex;
workQueue.offer(new Runnable() {
public void run() {
try {
LOGGER.fine(String.format(
"processing term %6d/%d: %s", i, uniqueTerms,term));
Matrix contexts = getTermContexts(i, termFeatures[i]);
senseInduce(term, contexts);
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
termsProcessed.release();
}
}
});
}
// Wait until all the documents have been processed
try {
termsProcessed.acquire(uniqueTerms);
} catch (InterruptedException ie) {
throw new Error("interrupted while waiting for terms to " +
"finish reprocessing", ie);
}
LOGGER.info("finished reprocessing all terms");
}
private BitSet calculateTermFeatures(String term, int corpusSize) {
int termIndex = termToIndex.get(term);
LOGGER.fine(String.format("Calculating feature set for %6d/%d: %s",
termIndex, cooccurrenceMatrix.rows(), term));
DoubleVector cooccurrences = cooccurrenceMatrix.getRowVector(termIndex);
int termCount = termCounts.get(termIndex).get();
BitSet validFeatures = new BitSet(wordIndexCounter);
// For each of the co-occurring terms, calculate the log-likelihood
// value for that term's occurrences. Only terms whose value is above
// 3.841, the chi-squared critical value for one degree of freedom at
// p = .05, will be counted as features
for (int co = 0; co < cooccurrences.length(); ++co) {
// Form the contingency table:
// a b
// c d
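// Worked example with hypothetical counts: if the focus term occurs
// 1,000 times, the co-occurring term occurs 500 times, the two
// co-occur 100 times, and the corpus has 1,000,000 tokens, then
// a = 100, b = 400, c = 900, and d = 998,600, which yields a
// log-likelihood far above the 3.841 threshold, so the co-occurring
// term would be kept as a feature.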
double count = cooccurrences.get(co);
// Don't include words that never co-occur as features
if (count == 0)
continue;
// a = the number of times they both co-occur
double a = count;
// b = the number of times co occurs without term
double b = termCounts.get(co).get() - count;
// c = the number of times term occurs without co
double c = termCount - count;
// d = the number of times neither term occurs
double d = corpusSize - (a + b + c);
double logLikelihood = logLikelihood(a, b, c, d);
if (logLikelihood > 3.841)
validFeatures.set(co);
}
if (LOGGER.isLoggable(Level.FINE))
LOGGER.fine(term + " had " + validFeatures.cardinality()
+ " features");
return validFeatures;
}
/**
* For the specified term, reprocesses the entire corpus using the term's
* features to construct a matrix of all the contexts in which the term
* appears. If the term occurs in more contexts than is allowed by
* {@link #maxContextsPerWord}, a random subset of the contexts is
* returned.
*
* @param termIndex the index of the term for which the context matrix
* should be generated
* @param termFeatures the set of term indices that are valid features when
* in the context of {@code termIndex}.
*
* @return a {@code Matrix} where each row is a different context for the
* term in the corpus
*/
private Matrix getTermContexts(int termIndex, BitSet termFeatures)
throws IOException {
// Reprocess the corpus in binary format to generate the set of
// contexts with the appropriate feature vectors
DataInputStream corpusReader = new DataInputStream(
new BufferedInputStream(new FileInputStream(compressedDocuments)));
int documents = documentCounter.get();
// Use the number of times the term occurred in the corpus to determine
// how many rows (contexts) the matrix needs.
SparseMatrix contextsForCurTerm = new YaleSparseMatrix(
termCounts.get(termIndex).get(), termToIndex.size());
int contextsSeen = 0;
for (int d = 0; d < documents; ++d) {
final int docId = d;
int tokensInDoc = corpusReader.readInt();
int unfilteredTokens = corpusReader.readInt();
// Read in the document
int[] doc = new int[tokensInDoc];
for (int i = 0; i < tokensInDoc; ++i)
doc[i] = corpusReader.readInt();
int contextsInDoc =
processIntDocument(termIndex, doc, contextsForCurTerm,
contextsSeen, termFeatures);
contextsSeen += contextsInDoc;
}
corpusReader.close();
// If the term is to be processed using fewer than all of its contexts,
// then randomly select the maximum allowable contexts from the matrix
if (maxContextsPerWord < Integer.MAX_VALUE &&
contextsForCurTerm.rows() > maxContextsPerWord) {
BitSet randomContexts = Statistics.randomDistribution(
maxContextsPerWord, contextsForCurTerm.rows());
contextsForCurTerm =
new SparseRowMaskedMatrix(contextsForCurTerm, randomContexts);
}
return contextsForCurTerm;
}
/**
* Given a matrix for the term where each row is a different context,
* clusters the rows to identify how many senses the word has.
*
* @param term the term for which the senses will be discovered
* @param contexts a matrix containing all the contexts in which {@code
* term} appears, with one context per row
*/
private void senseInduce(String term, Matrix contexts) throws IOException {
LOGGER.fine("Clustering " + contexts.rows() + " contexts for " + term);
// For terms with fewer than seven contexts, set the number of potential
// clusters lower
int numClusters = Math.min(7, contexts.rows());
// Cluster each of the rows into seven groups. Note that if the term is
// not purely alphabetic (i.e. contains numbers or other symbols), don't
// bother clustering it. This is done to reduce the computation time,
// and to avoid clustering non-meaningful terms such as '.' or '''
if (!(term.matches("[a-zA-Z]+") && numClusters > 6)) { // special case
SparseDoubleVector meanSenseVector =
new CompactSparseVector(termToIndex.size());
int rows = contexts.rows();
for (int row = 0; row < rows; ++row)
VectorMath.add(meanSenseVector, contexts.getRowVector(row));
termToVector.put(term, meanSenseVector);
return;
}
Assignments clusterAssignment =
new ClutoClustering().cluster(contexts, numClusters,
ClutoClustering.Method.AGGLOMERATIVE,
ClutoClustering.Criterion.UPGMA);
LOGGER.fine("Generative sense vectors for " + term);
// For each of the clusters, compute the mean sense vector
int[] clusterSize = new int[numClusters];
// Use CompactSparseVector to conserve memory given the potentially
// large number of sense vectors
SparseDoubleVector[] meanSenseVectors =
new CompactSparseVector[numClusters];
for (int i = 0; i < meanSenseVectors.length; ++i)
meanSenseVectors[i] = new CompactSparseVector(termToIndex.size());
// For each of the contexts, determine which cluster it was in and sum
// its value with the other contexts in that cluster
for (int row = 0; row < clusterAssignment.size(); ++row) {
int assignment = clusterAssignment.get(row);
if (assignment < 0)
continue;
clusterSize[assignment]++;
DoubleVector contextVector = contexts.getRowVector(row);
VectorMath.add(meanSenseVectors[assignment], contextVector);
}
// For each of the clusters with more than 2% of the contexts, generate
// an averaged sense vector. Clusters with fewer contexts than that are
// discarded.
int senseCounter = 0;
for (int i = 0; i < numClusters; ++i) {
int size = clusterSize[i];
if (size / (double)(contexts.rows()) > 0.02) {
String termWithSense = (senseCounter == 0)
? term : term + "-" + senseCounter;
senseCounter++;
termToVector.put(termWithSense,meanSenseVectors[i]);
}
}
LOGGER.fine("Discovered " + senseCounter + " senses for " + term);
}
/**
* Processes the compressed version of a document where each integer
* indicates that token's index and identifies all the contexts for the
* target word, adding them as new rows to the context matrix.
*
* @param termIndex the term whose contexts should be extracted
* @param document the document to be processed where each {@code int} is a
* term index
* @param contextMatrix the matrix that will contain all the contexts for
* the term with {@code termIndex} when this method returns
* @param rowStart the next row index in the matrix where a new context can
* be added
* @param featuresForTerm the set of term indices that are valid features
* for the term with {@code termIndex}
*
* @return the number of contexts present in this document
*/
private int processIntDocument(int termIndex, int[] document,
Matrix contextMatrix,
int rowStart,
BitSet featuresForTerm) {
int contexts = 0;
for (int i = 0; i < document.length; ++i) {
int curToken = document[i];
// Skip processing tokens that are not the current focus
if (curToken != termIndex)
continue;
// Buffer the count of how many times each feature appeared in the
// context.
SparseArray<Integer> contextCounts = new SparseHashArray<Integer>();
// Process all the tokens to the left of (prior to) the current token
for (int left = Math.max(i - contextWindowSize, 0);
left < i; ++left) {
// NOTE: this token value could be -1 if the token's original
// text was filtered out from the corpus, i.e. was EMPTY_TOKEN
int token = document[left];
// Only count co-occurrences that are valid features for the
// current token
if (token >= 0 && featuresForTerm.get(token)) {
Integer count = contextCounts.get(token);
contextCounts.set(token, (count == null) ? 1 : count + 1);
}
}
// Process all the tokens to the right of (after) the current token
int end = Math.min(i + contextWindowSize, document.length);
for (int right = i + 1; right < end; ++right) {
int token = document[right];
// Only count co-occurrences that are valid features for the
// current token
if (token >= 0 && featuresForTerm.get(token)) {
Integer count = contextCounts.get(token);
contextCounts.set(token, (count == null) ? 1 : count + 1);
}
}
// Each occurrence of the term represents a new context, so the
// specific context instance can be determined from the current
// position and the number of previously processed occurrences
int curContext = rowStart + contexts;
for (int feat : contextCounts.getElementIndices()) {
contextMatrix.set(curContext, feat, contextCounts.get(feat));
}
// If the current token wasn't skipped, indicate that another
// context was seen
contexts++;
}
return contexts;
}
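/*
* A worked sketch of processIntDocument with hypothetical indices: for
* document [4, 9, -1, 4] with termIndex = 4 and featuresForTerm = {9},
* two contexts are produced. The occurrence at position 0 sees token 9
* once in its right window, so row rowStart gets column 9 set to 1; the
* occurrence at position 3 sees token 9 once in its left window, so row
* rowStart + 1 also gets column 9 set to 1. The -1 (filtered) token and
* the non-feature token 4 are never counted.
*/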
/**
* {@inheritDoc}
*/
public int getVectorLength() {
// The vector length is dependent upon the total number of features seen
return termToIndex.size();
}
/**
* {@inheritDoc}
*/
public String getSpaceName() {
return "purandare-petersen";
}
/**
* Returns the log-likelihood ratio statistic, G-squared = 2 * sum of
* O * ln(O / E), for the 2x2 contingency table made up of the four
* observed values, where each expected value E is derived from the
* table's marginal sums.
*/
private static double logLikelihood(double a, double b,
double c, double d) {
// Table set up as:
// a b
// c d
double col1sum = a + c;
double col2sum = b + d;
double row1sum = a + b;
double row2sum = c + d;
double sum = row1sum + row2sum;
// Calculate the expected values for a, b, c, d
double aExp = (row1sum / sum) * col1sum;
double bExp = (row1sum / sum) * col2sum;
double cExp = (row2sum / sum) * col1sum;
double dExp = (row2sum / sum) * col2sum;
// log(0) = -Infinity, which messes up the calculation. Therefore,
// check whether the value is zero before calculating its contribution.
double aVal = (a == 0) ? 0 : a * Math.log(a / aExp);
double bVal = (b == 0) ? 0 : b * Math.log(b / bExp);
double cVal = (c == 0) ? 0 : c * Math.log(c / cExp);
double dVal = (d == 0) ? 0 : d * Math.log(d / dExp);
return 2 * (aVal + bVal + cVal + dVal);
}
}