edu.ucla.sspace.tools.DependencyBasisMaker

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. The package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.
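
As a toy illustration of that hypothesis (not part of the S-Space API), the sketch below builds a context-count vector for each word using a simple sliding window; the class in this file does the same kind of counting, but over dependency paths rather than adjacent words.

import java.util.HashMap;
import java.util.Map;

public class ContextCountSketch {
    public static void main(String[] args) {
        String[] tokens = "the cat sat on the mat".split(" ");
        int window = 2;
        // Each word's "meaning" is a map from co-occurring words to counts.
        Map<String, Map<String, Integer>> vectors = new HashMap<>();
        for (int i = 0; i < tokens.length; i++) {
            Map<String, Integer> vec =
                vectors.computeIfAbsent(tokens[i], k -> new HashMap<>());
            int start = Math.max(0, i - window);
            int end = Math.min(tokens.length - 1, i + window);
            for (int j = start; j <= end; j++)
                if (j != i)
                    vec.merge(tokens[j], 1, Integer::sum);
        }
        // Prints the contexts observed around "cat": the, sat, and on,
        // each with a count of 1.
        System.out.println(vectors.get("cat"));
    }
}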

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.tools;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractorManager;
import edu.ucla.sspace.dependency.DependencyIterator;
import edu.ucla.sspace.dependency.DependencyPath;
import edu.ucla.sspace.dependency.DependencyPathAcceptor;
import edu.ucla.sspace.dependency.DependencyPathWeight;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.FilteredDependencyIterator;
import edu.ucla.sspace.dependency.FlatPathWeight;
import edu.ucla.sspace.dependency.UniversalPathAcceptor;

import edu.ucla.sspace.mains.DependencyGenericMain;

import edu.ucla.sspace.matrix.AtomicGrowingSparseHashMatrix;
import edu.ucla.sspace.matrix.NoTransform;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.Transform;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.BoundedSortedMap;
import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectOutputStream;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;


/**
 * This main creates a {@link BasisMapping} based on the unique terms found in a
 * document set and serializes it to disk.
 *
 * @author Keith Stevens
 */
public class DependencyBasisMaker extends DependencyGenericMain {

    /**
     * {@inheritDoc}
     */
    public void addExtraOptions(ArgOptions options) { 
        options.addOption('b', "basisSize",
                          "Specifies the total desired size of the basis " +
                          "(Default: 10000)",
                          true, "INT", "Optional");
        options.addOption('a', "pathAcceptor",
                          "Specifies the dependency path acceptor to use. " +
                          "(Default: UniversalPathAcceptor)",
                          true, "CLASSNAME", "Optional");
        options.addOption('w', "pathWeighter",
                          "Specifies the dependency path weighter to use. " +
                          "(Default: FlatPathWeight)",
                          true, "CLASSNAME", "Optional");
        options.addOption('l', "pathLength",
                          "Specifies the maximum dependency path length. " +
                          "(Default: 5)",
                          true, "INT", "Optional");
    }

    /**
     * {@inheritDoc}
     */
    protected SemanticSpace getSpace() {
        setupDependencyExtractor();

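        // Read the configurable options, falling back to the listed defaults
        // when a flag was not provided on the command line.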
        int bound = argOptions.getIntOption('b', 10000);
        Transform transform = argOptions.getObjectOption(
                'T', new NoTransform());
        DependencyPathAcceptor acceptor = argOptions.getObjectOption(
                'a', new UniversalPathAcceptor());
        DependencyPathWeight weighter = argOptions.getObjectOption(
                'w', new FlatPathWeight());
        int pathLength = argOptions.getIntOption('l', 5);
        return new OccurrenceCounter(
                transform, bound, acceptor, weighter, pathLength);
    }

    /**
     * Saves the {@link BasisMapping} created from the {@link
     * OccurrenceCounter}.
     */
    protected void saveSSpace(SemanticSpace sspace, File outputFile)
            throws IOException {
        BasisMapping<String, String> savedTerms = new StringBasisMapping();
        for (String term : sspace.getWords())
            savedTerms.getDimension(term);

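        // Write the rebuilt mapping with standard Java serialization; it can
        // be read back later with an ObjectInputStream.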
        ObjectOutputStream ouStream = new ObjectOutputStream(
                new FileOutputStream(outputFile));
        ouStream.writeObject(savedTerms);
        ouStream.close();
    }

    /**
     * A simple term {@link SemanticSpace} implementation that counts word
     * co-occurrences, applies a transform, and then scores each recorded basis
     * dimension by summing the transformed values in that word's row.
     */
    public class OccurrenceCounter implements SemanticSpace {

        /**
         * The matrix storing weighted co-occurrence statistics between each
         * focus word and the words reachable from it along accepted
         * dependency paths.
         */
        private final AtomicGrowingSparseHashMatrix cooccurrenceMatrix;

        /**
         * The {@link BasisMapping} used to record dimensions.
         */
        private final BasisMapping<String, String> basis;

        /**
         * The final scores for each word in the {@code basis}.
         */
        private final Map<String, Double> wordScores;

        /**
         * The {@link Transform} class used to rescore each word.
         */
        private final Transform transform;

        /**
         * The {@link DependencyPathAcceptor} used to accept or reject
         * dependency paths.
         */
        private final DependencyPathAcceptor acceptor;

        /**
         * The {@link DependencyPathWeight} used to score dependency paths.
         */
        private final DependencyPathWeight weighter;

        /**
         * The maximum path length that is acceptable.
         */
        private final int pathLength;

        /**
         * The {@link DependencyExtractor} used to extract parse trees from each
         * document.
         */
        private final DependencyExtractor extractor;

        /**
         * Creates a new {@link OccurrenceCounter}.
         */
        public OccurrenceCounter(Transform transform,
                                 int bound, 
                                 DependencyPathAcceptor acceptor,
                                 DependencyPathWeight weighter,
                                 int pathLength) {
            cooccurrenceMatrix = new AtomicGrowingSparseHashMatrix();
            basis = new StringBasisMapping();
            wordScores = new BoundedSortedMap<String, Double>(bound);
            extractor = DependencyExtractorManager.getDefaultExtractor();

            this.transform = transform;
            this.acceptor = acceptor;
            this.weighter = weighter;
            this.pathLength = pathLength;
        }

        /**
         * {@inheritDoc}
         */
        public void processDocument(BufferedReader document)
                throws IOException {
            // Rather than updating the matrix every time an occurrence is
            // seen, keep a thread-local count of what needs to be modified
            // in the matrix and update after the document has been
            // processed.  This saves potential contention from concurrent
            // writes.
            Map<Pair<Integer>,Double> matrixEntryToCount =
                    new HashMap<Pair<Integer>,Double>();

            // Iterate over all of the parseable dependency parsed sentences in
            // the document.
            for (DependencyTreeNode[] nodes = null; 
                    (nodes = extractor.readNextTree(document)) != null; ) {

                // Skip empty sentences.
                if (nodes.length == 0)
                    continue;

                // Examine the paths for each word in the sentence.
                for (int wordIndex = 0; wordIndex < nodes.length; ++wordIndex) {
                    String focusWord = nodes[wordIndex].word();                            
                    int focusIndex = basis.getDimension(focusWord);

                    // Get all the valid paths starting from this word.  The
                    // acceptor will filter out any paths that don't contain
                    // the semantic connections we're looking for.
                    Iterator<DependencyPath> paths =
                        new FilteredDependencyIterator(
                                nodes[wordIndex], acceptor, pathLength);
                            
                    // For each of the paths rooted at the focus word, update
                    // the co-occurrences of the focus word in the dimension
                    // that the BasisFunction states.
                    while (paths.hasNext()) {
                        DependencyPath path = paths.next();

                        String occurrence = path.last().word();
                        int featureIndex = basis.getDimension(occurrence);

                        double score = weighter.scorePath(path);
                        Pair<Integer> cell =
                            new Pair<Integer>(focusIndex, featureIndex);
                        Double curScore = matrixEntryToCount.get(cell);
                        matrixEntryToCount.put(cell, (curScore == null)
                                ? score : curScore + score);
                    }
                }
            }

            // Once the document has been processed, update the co-occurrence
            // matrix accordingly.
            for (Map.Entry<Pair<Integer>,Double> e :
                    matrixEntryToCount.entrySet()) {
                Pair<Integer> p = e.getKey();
                cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
            }
        }

        /**
         * {@inheritDoc}
         */
        public Set<String> getWords() {
            return Collections.unmodifiableSet(wordScores.keySet());
        }

        /**
         * {@inheritDoc}
         */
        public DoubleVector getVector(String word) {
            Double score = wordScores.get(word);
            return (score == null)
                ? new DenseVector(new double[] {0})
                : new DenseVector(new double[] {score});
        }

        /**
         * {@inheritDoc}
         */
        public int getVectorLength() {
            return 1;
        }

        /**
         * {@inheritDoc}
         */
        public void processSpace(Properties properties) {
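            // Apply the configured transform to the raw co-occurrence counts,
            // then score each recorded term by summing its transformed row.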
            SparseMatrix cleanedMatrix = (SparseMatrix) transform.transform(
                    cooccurrenceMatrix);
            for (String term : basis.keySet()) {
                int index = basis.getDimension(term);
                SparseDoubleVector sdv = cleanedMatrix.getRowVector(index);

                double score = 0;
                for (int i : sdv.getNonZeroIndices())
                    score += sdv.get(i);

                wordScores.put(term, score);
            }
        }

        /**
         * {@inheritDoc}
         */
        public String getSpaceName() {
            return "BasisMaker";
        }
    }
}
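
As a usage illustration: the file written by saveSSpace is a plain Java-serialized StringBasisMapping, so it can be read back with an ObjectInputStream. The sketch below is not part of the S-Space package, and the file name "basis.bin" is only a placeholder for whatever output file the tool was given.

import edu.ucla.sspace.basis.StringBasisMapping;

import java.io.FileInputStream;
import java.io.ObjectInputStream;

public class LoadSavedBasis {
    public static void main(String[] args) throws Exception {
        // Read back the basis mapping serialized by DependencyBasisMaker;
        // "basis.bin" is a placeholder for the actual output file name.
        ObjectInputStream in = new ObjectInputStream(
                new FileInputStream("basis.bin"));
        StringBasisMapping basis = (StringBasisMapping) in.readObject();
        in.close();

        // Each unique term in the processed documents was assigned a
        // dimension index; report how many dimensions were recorded.
        System.out.println("Number of dimensions: " + basis.keySet().size());
    }
}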



