edu.ucla.sspace.wordsi.WaitingWordsi
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high-dimensional feature
vectors. The package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics, and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
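As a concrete illustration of the Distributional Hypothesis, here is a
self-contained sketch (not part of the S-Space API; the class and all names
in it are illustrative only) that represents words by their sentence-level
co-occurrence counts and compares them with cosine similarity:

import java.util.HashMap;
import java.util.Map;

/**
 * A minimal sketch of the Distributional Hypothesis: words are represented
 * by co-occurrence count vectors and compared by cosine similarity.
 */
public class DistributionalSketch {
    public static void main(String[] args) {
        String[] corpus = {
            "the cat sat on the mat",
            "the dog sat on the rug",
            "stocks fell on the market"
        };
        // For every token, count how often each other token appears in
        // the same sentence (its "context").
        Map<String, Map<String, Integer>> cooc = new HashMap<>();
        for (String sentence : corpus) {
            String[] tokens = sentence.split(" ");
            for (int i = 0; i < tokens.length; ++i)
                for (int j = 0; j < tokens.length; ++j)
                    if (i != j)
                        cooc.computeIfAbsent(tokens[i], k -> new HashMap<>())
                            .merge(tokens[j], 1, Integer::sum);
        }
        // "cat" and "dog" share most of their contexts, while "cat" and
        // "stocks" share far fewer, so the first pair scores higher.
        System.out.printf("sim(cat, dog)    = %.3f%n",
                cosine(cooc.get("cat"), cooc.get("dog")));
        System.out.printf("sim(cat, stocks) = %.3f%n",
                cosine(cooc.get("cat"), cooc.get("stocks")));
    }

    /** Cosine similarity of two sparse count vectors. */
    static double cosine(Map<String, Integer> a, Map<String, Integer> b) {
        double dot = 0, normA = 0, normB = 0;
        for (Map.Entry<String, Integer> e : a.entrySet()) {
            Integer other = b.get(e.getKey());
            if (other != null)
                dot += e.getValue() * (double) other;
            normA += e.getValue() * (double) e.getValue();
        }
        for (int v : b.values())
            normB += v * (double) v;
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }
}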
/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.wordsi;
import edu.ucla.sspace.clustering.Assignments;
import edu.ucla.sspace.clustering.Clustering;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.util.WorkQueue;
import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.Vectors;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;
/**
* A {@link Wordsi} implementation that performs batch clustering. Each context
* vector is stored and later clustered using a {@link Clustering} algorithm.
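*
* <p>A possible usage sketch (the {@code extractor} and {@code clustering}
* values stand in for any {@link ContextExtractor} and {@link Clustering}
* implementation; they are assumptions for illustration, not provided by
* this class):
*
* <pre>{@code
* WaitingWordsi wordsi = new WaitingWordsi(
*     acceptedWords, extractor, clustering, null, 2);
* // ... process a corpus so that context vectors are accumulated ...
* wordsi.processSpace(System.getProperties());
* SparseDoubleVector firstSense  = wordsi.getVector("bank");
* SparseDoubleVector secondSense = wordsi.getVector("bank-1");
* }</pre>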
*
* @author Keith Stevens
*/
public class WaitingWordsi extends BaseWordsi {
/**
* A logger for recording the progress of {@link Wordsi} processing.
*/
private static final Logger LOG = Logger.getLogger(
WaitingWordsi.class.getName());
/**
* The {@link Clustering} implementation to use when all data points have
* been observed.
*/
private final Clustering clustering;
/**
* A mapping from each token to the list of context vectors associated with
* that token.
*/
private final Map<String, List<SparseDoubleVector>> dataVectors;
/**
* The final word space, which maps from strings to the semantic
* representation.
*/
private final Map<String, SparseDoubleVector> wordSpace;
/**
* The number of clusters. This may act as an upper bound rather than a
* strict number of clusters.
*/
private final int numClusters;
/**
* The {@link AssignmentReporter} to use for reporting clustering
* assignments.
*/
private final AssignmentReporter reporter;
/**
* Creates a new {@link WaitingWordsi}. The number of clusters is left
* unset, which requires that the {@link Clustering} algorithm be able to
* decide on an appropriate number of clusters.
*
* @param acceptedWords The set of words that {@link Wordsi} should
* represent. This may be {@code null} or empty.
* @param extractor The {@link ContextExtractor} used to parse documents.
* @param clustering The {@link Clustering} algorithm to use on each data
* set.
* @param reporter The {@link AssignmentReporter} responsible for generating
* a report that details the cluster assignments. This may be
* {@code null}, in which case no assignment report is generated.
*/
public WaitingWordsi(Set<String> acceptedWords,
ContextExtractor extractor,
Clustering clustering,
AssignmentReporter reporter) {
this(acceptedWords, extractor, clustering, reporter, 0);
}
/**
* Creates a new {@link WaitingWordsi}.
*
* @param acceptedWords The set of words that {@link Wordsi} should
* represent. This may be {@code null} or empty.
* @param extractor The {@link ContextExtractor} used to parse documents.
* @param clustering The {@link Clustering} algorithm to use on each data
* set.
* @param reporter The {@link AssignmentReporter} responsible for generating
* a report that details the cluster assignments. This may be
* {@code null}, in which case no assignment report is generated.
* @param numClusters Specifies the number of clusters to generate for each
* term.
*/
public WaitingWordsi(Set<String> acceptedWords,
ContextExtractor extractor,
Clustering clustering,
AssignmentReporter reporter,
int numClusters) {
super(acceptedWords, extractor);
this.clustering = clustering;
this.reporter = reporter;
this.numClusters = numClusters;
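// wordSpace is written concurrently by the clustering worker threads
// in processSpace, so it must be a ConcurrentHashMap; dataVectors is
// instead guarded explicitly in handleContextVector.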
dataVectors = new HashMap<String, List<SparseDoubleVector>>();
wordSpace = new ConcurrentHashMap<String, SparseDoubleVector>();
}
/**
* {@inheritDoc}
*/
public Set<String> getWords() {
return wordSpace.keySet();
}
/**
* {@inheritDoc}
*/
public SparseDoubleVector getVector(String term) {
return wordSpace.get(term);
}
/**
* Adds the context vector to the end of the list of context vectors
* associated with {@code focusKey}.
*/
public void handleContextVector(String focusKey,
String secondaryKey,
SparseDoubleVector context) {
// Get the list of context vectors for the focus key.
List<SparseDoubleVector> termContexts = dataVectors.get(focusKey);
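// Double-checked locking: synchronize only when the list may need to
// be created, and re-check inside the lock so that two threads do not
// both insert a list for the same key.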
if (termContexts == null) {
synchronized (this) {
termContexts = dataVectors.get(focusKey);
if (termContexts == null) {
termContexts = new ArrayList<SparseDoubleVector>();
dataVectors.put(focusKey, termContexts);
}
}
}
// Add the new context vector.
int contextId = 0;
synchronized (termContexts) {
contextId = termContexts.size();
termContexts.add(context);
}
// Record the association.
if (reporter != null)
reporter.assignContextToKey(focusKey, secondaryKey, contextId);
}
/**
* {@inheritDoc}
*/
public void processSpace(final Properties props) {
WorkQueue workQueue = WorkQueue.getWorkQueue();
Object key = workQueue.registerTaskGroup(dataVectors.size());
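// Each term is clustered as an independent task in the group; await(key)
// below blocks until all of the registered tasks have completed.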
// Process each word's context set in a worker thread.
for (Map.Entry<String, List<SparseDoubleVector>> entry :
dataVectors.entrySet()) {
// Get the root word being discriminated and list of observed
// contexts.
final String senseName = entry.getKey();
List<SparseDoubleVector> contextsWithNoLength = entry.getValue();
final List<SparseDoubleVector> contextSet =
new ArrayList<SparseDoubleVector>(contextsWithNoLength.size());
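// The stored context vectors were accumulated without a fixed length,
// so view each one as a vector of the final context-vector length
// before clustering.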
for (SparseDoubleVector v : contextsWithNoLength)
contextSet.add(Vectors.subview(v, 0, getVectorLength()));
workQueue.add(key, new Runnable() {
public void run() {
clusterTerm(senseName, contextSet, props);
}
});
}
workQueue.await(key);
LOG.info("Finished processing all terms");
}
/**
* Clusters the context vectors associated with {@code senseName}.
*/
private void clusterTerm(String senseName,
List<SparseDoubleVector> contextSet,
Properties props) {
// Convert the data points to a sparse matrix.
SparseMatrix contexts = Matrices.asSparseMatrix(contextSet);
// Cluster the context set.
LOG.info("Clustering term: " + senseName);
Assignments assignments = (numClusters > 0)
? clustering.cluster(contexts, numClusters, props)
: clustering.cluster(contexts, props);
LOG.info("Finished clustering term: " + senseName);
SparseDoubleVector[] centroids = assignments.getSparseCentroids();
// Add each centroid to the word space as a separate sense of the term.
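// The first centroid keeps the bare term name; later centroids are
// suffixed with their cluster index, e.g. "bank", "bank-1", "bank-2".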
for (int index = 0; index < centroids.length; ++index) {
String sense = (index > 0)
? senseName + "-" + index
: senseName;
wordSpace.put(sense, centroids[index]);
}
LOG.info("Finished creating centroids for term: " + senseName);
// Empty out the stored contexts to free up memory for later processes.
contextSet.clear();
// If the reporter is null, avoid making any report.
if (reporter == null)
return;
// Generate the secondary context labels for each data point.
String[] contextLabels = reporter.contextLabels(senseName);
if (contextLabels.length == 0)
return;
LOG.info("Making assignment report: " + senseName);
// Report the assignments for each clustered data point. Note that some
// data points might not have been clustered (Cluto-based clustering
// does this on occasion), so we must check the number of assignments
// first.
for (int i = 0; i < assignments.size(); ++i)
reporter.updateAssignment(
senseName, contextLabels[i], assignments.get(i));
LOG.info("Finished making assignment report: " + senseName);
}
}