edu.ucla.sspace.grefenstette.Grefenstette Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly-scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.
The newest version!
/*
 * Copyright 2009 Grace Park
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.grefenstette;

import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.Matrix;

import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.Vector;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.File;
import java.io.IOException;
import java.io.IOError;
import java.io.PrintWriter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Properties;

import java.util.concurrent.atomic.AtomicInteger;

import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An implementation of a semantic space built from syntactic co-occurrence, as
 * described by Grefenstette.  See the following references for full details.
 * 
 *
 *   G. Grefenstette,
 *   Explorations in Automatic Thesaurus Discovery. Indiana University
 *   Press, 1994.
 *
 * 
 *
 *
 * @author Grace Park 
 */
public class Grefenstette implements SemanticSpace {

    /**
     * The logger for reporting all debugging information
     */
    private static final Logger LOGGER = 
	Logger.getLogger(Grefenstette.class.getName());

    /**
     * The temporary file used to record syntactic word relations while the
     * documents are parsed.  Relations are written to a file to save memory.
     */
    private final File wordRelations;

    /**
     * The writer to the {@code wordRelations} file.
     */
    private final PrintWriter wordRelationsWriter;

    /**
     * A mapping from a string token to the integer that represents that token's
     * row in the {@code syntacticCooccurrence} matrix.
     */
    private final Map objectTable;

    /**
     * A mapping from a token in a specific syntactic position to the integer
     * that represents that token configuration's column.
     */
    private final Map attributeTable;

    /**
     * A matrix where rows correspond to tokens and columns correspond to the
     * syntactic co-occurrence of a specific token in a specific syntactic
     * position.
     */
    private final Matrix syntacticCooccurrence;

    /**
     * An incremental counter used for assigning tokens to matrix row indices
     */
    private final AtomicInteger objectCounter;

    /**
     * An incremental counter used for assigning token syntax positions to
     * matrix column indices
     */
    private final AtomicInteger attributeCounter;

    /**
     * Constructs an instance using the system properties for any required
     * configuration
     *
     * @throws IOError if unable to create the backing file to hold data while
     *         processing
     */
    public Grefenstette() {
	try {
	    wordRelations = File.createTempFile("word-relation-list","txt");
	    wordRelationsWriter = new PrintWriter(wordRelations);
	  
	    objectTable = new HashMap();
	    attributeTable = new HashMap();
	  
	    syntacticCooccurrence = new GrowingSparseMatrix();
	  
	    objectCounter = new AtomicInteger(0);
	    attributeCounter = new AtomicInteger(0);
	} catch (IOException ioe) {
	    throw new IOError(ioe);
	}
    }

    /**
     * {@inheritDoc}
     */
    public void processDocument (BufferedReader document) {
	ArrayList> wordsInPhrase = new ArrayList>();

	String nounPhrase = "";
	String lastNoun = "";
	String lastVerb = "";
	String secondPrevPhrase = "";
	String prevPhrase = "";

	try {
	    nounPhrase = document.readLine();
	} catch (IOException e) {
	    e.printStackTrace();
	}

	for( String tag = getNextTag(nounPhrase); 
	     tag != null; tag = getNextTag(nounPhrase) ) {
	    String word;

	    int startOfTag = nounPhrase.indexOf(tag);

	    nounPhrase = nounPhrase.substring(startOfTag);
	    wordsInPhrase.clear();

	    if( tag.equals("NP") ) {
		while( nounPhrase.charAt(0) != ')' ) {

		    // extract tag of word in noun phrase
		    tag = getNextTag(nounPhrase); 

		    if( isPhraseOrClause(tag) || isPreposition(tag) ) {
			nounPhrase = nounPhrase.
			    substring(nounPhrase.indexOf(tag) + tag.length());
			// stop processing NP
			break;
		    } else if( inStartSet(tag) || inReceiveSet(tag) ) {
			// note to self: find out why this broke
			try {
			    word = nounPhrase.
				substring(nounPhrase.indexOf(" ", 
					      nounPhrase.indexOf(tag)) + 1, 
					  nounPhrase.indexOf(")"));

			    wordsInPhrase.add(new Pair(tag,word));

			    nounPhrase = nounPhrase.
				substring(nounPhrase.indexOf(")",
					      nounPhrase.indexOf(word))+1);
			} catch (StringIndexOutOfBoundsException e) {
			    nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")"));
			}
			// else it's not a tag I care about
		    } else {
			nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")")+1);
		    }
		}

		// note to self: is this if statement represent the same thing
		// as the next if statement??
		if( !wordsInPhrase.isEmpty() ) {
		    // set head noun to last word in noun phrase
		    String headNoun = wordsInPhrase.get(wordsInPhrase.size()-1).y;

		    // create the relations from pass two
		    if( prevPhrase.equals("PP") && secondPrevPhrase.equals("NP") 
			&& lastNoun.length() != 0 ) {
			wordRelationsWriter.println(lastNoun + " " + headNoun);

			addRelation(lastNoun, headNoun);
		    }

		    // create relations from pass four
		    if( prevPhrase.equals("PP") && secondPrevPhrase.equals("VP")
			&& lastVerb.length() != 0 ) {

			wordRelationsWriter.println(lastVerb + " " + headNoun);

			addRelation(lastVerb, headNoun);
		    } else if( prevPhrase.equals("VP") ) {

			wordRelationsWriter.println(lastVerb + " " + headNoun);

			addRelation(lastVerb, headNoun);
		    }

		    lastNoun = headNoun;
		}

		// reached end of noun phrase
		if( nounPhrase.charAt(0) == ')' ) {
		    // create relations between words in noun phrase
		    // relations from pass one
		    processWordsInNP(wordsInPhrase);

		    if( !"NP".equals(prevPhrase) ) {
			secondPrevPhrase = prevPhrase;
			prevPhrase = "NP";
		    }
		}
	    } //end processing NP
	    else if( tag.equals("VP") ) {
		while( tag != null && tag.startsWith("V") ) {
		    // nonphrase verb
		    if( tag.startsWith("VB") ) {
			word = nounPhrase.substring( nounPhrase.indexOf(" ", 
									nounPhrase.indexOf(tag))+1, nounPhrase.indexOf(")"));
			lastVerb = word;
		    }

		    nounPhrase = nounPhrase.substring(nounPhrase.indexOf(tag)+1);
		    tag = getNextTag(nounPhrase);
		}

		// relations from pass three
		if( prevPhrase.equals("NP") && lastNoun.length() != 0 ) {

		    wordRelationsWriter.println(lastNoun + " " + lastVerb);

		    addRelation(lastNoun, lastVerb);
		}

		if( !prevPhrase.equals("VP") ) {
		    secondPrevPhrase = prevPhrase;
		    prevPhrase = "VP";
		}
	    }
	    else if( isPhraseOrClause(tag) || isPreposition(tag) ) {
		nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag)
						   + tag.length());
		if( !tag.equals(prevPhrase) ) {
		    secondPrevPhrase = prevPhrase;
		    prevPhrase = tag;
		}
	    }
	    else {
		nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag)
						   + tag.length());
	    }
	}
    }


    /**
     * Adds a relation pair to the matrix
     */
    private void addRelation(String object, String attribute) {
	double val;
	int row, col;

        object = object.toLowerCase();
        attribute = attribute.toLowerCase();

	// get row in matrix
	if( objectTable.containsKey(object) ) {
            // if the object already exists in matrix, find its index
	    row = objectTable.get(object);
	} else {
            // otherwise give the object a new index number
	    row = Integer.valueOf(objectCounter.getAndIncrement());
            // insert new object/index pair into lookup table
	    objectTable.put( object, row );
            System.out.println(object + " " + row);
	}

	// get column in matrix
	if( attributeTable.containsKey(attribute) ) {
	    col = attributeTable.get(attribute);
	} else {
	    col = Integer.valueOf(attributeCounter.getAndIncrement());
	    attributeTable.put( attribute, col );
	}

	// update entry in matrix which records how many times the
        // object/attribute pair has been seen
	if( row < syntacticCooccurrence.rows() && 
	    col < syntacticCooccurrence.columns()) {
            // if there's already an entry for the object and attribute, get the
            // current value for the pair of words
	    val = syntacticCooccurrence.get(row, col);
            // increment the current value by one and store in matrix
	    syntacticCooccurrence.set(row, col, val+1);
	} else {
            // otherwise set the row, col value to 1
	    syntacticCooccurrence.set(row, col, 1.0);
	}
    }

    /**
     * Creates relations between words in a noun phrase
     */
    private void processWordsInNP(ArrayList> wordsInPhrase) {
	if( wordsInPhrase.size() > 1 ) {
	    // this is from Grefenstette's pseudo code
	    for (int i = 0; i < wordsInPhrase.size()-1; i++) {

		if (inStartSet(wordsInPhrase.get(i).x) ) {
		
		    for (int j = i+1; j < wordsInPhrase.size(); j++ ) {
			
			if (inReceiveSet( wordsInPhrase.get(j).x ) ) {

			    wordRelationsWriter.
				println(wordsInPhrase.get(j).y + " "
					+ wordsInPhrase.get(i).y);
			    
// 			    System.out.println(wordsInPhrase.get(j).y + " "
// 					       + wordsInPhrase.get(i).y);
			    
			    addRelation(wordsInPhrase.get(j).y, 
					wordsInPhrase.get(i).y);
			}
		    }
		}
	    }
	}
    }

  
    /**
     * Checks to see if the tag can modify another word
     *
     * @param tag A tag from the parsed corpus to be checked
     */
    private boolean inStartSet(String tag) {
	return
	    // noun
	    tag.startsWith("NN") ||
	    // adjective
	    tag.startsWith("JJ") ||
	    // adverb
	    tag.startsWith("RB") ||
	    // cardinal number
	    tag.startsWith("CD");
    }


    /**
     * Checks to see if tag can be modified by a word in StartSet
     */
    private boolean inReceiveSet(String tag) {
	return
	    tag.startsWith("NN") ||
	    tag.startsWith("VB");
    }


    /**
     * Checks to see if tag is a preposition
     */
    private boolean isPreposition(String tag) {
	return tag.startsWith("PP");
    }


    /**
     * Checks to see if tag marks a phrase or clause
     */
    private boolean isPhraseOrClause(String tag) {
	// find out why adding more reduced the number of relations
	return
	    (!tag.equals("SYM") &&
	     tag.startsWith("S")) ||
	    tag.equals("ADJP") ||
	    tag.equals("ADVP") ||
	    tag.equals("CONJP") ||
	    tag.equals("FRAG") ||
	    tag.equals("INTJ") ||
	    tag.equals("LST") ||
	    tag.equals("NAC") ||
	    tag.equals("NP") ||
	    tag.equals("NX") ||
	    tag.equals("PP") ||
	    tag.equals("PRN") ||
	    /* removing prt adds 1% more relations */
	    tag.equals("PRT") ||
	    tag.equals("QP") ||
	    tag.equals("RRC") ||
	    tag.equals("UCP") ||
	    tag.equals("VP") ||
	    tag.startsWith("WH") ||
	    tag.equals("X");
    }

    /**
     * Returns the next tag in the sentence or null if there are no more tags
     * @param str The sentence that the tag is extracted from
     */
    private String getNextTag(String str) {
	String tag;
	int endIndex;
	int tagIndex = str.indexOf("(");

	if( tagIndex < 0 ) {
	    return null;
	}

	// in case there's nothing in the sentence
	endIndex = str.indexOf(" ", tagIndex);

	if( endIndex < 0 ) {
	    return null;
	}

	tag = str.substring( tagIndex+1, endIndex ); 

	if( tag.length() > 0 ) {
	    return tag;
	} else {
	    str = str.substring( tagIndex+1 );
	    return getNextTag(str);
	}
    }

    /**
     * {@inheritDoc}
     */
    public Set getWords() {
	return Collections.unmodifiableSet(objectTable.keySet());
    }

    /**
     * {@inheritDoc}
     */
    public Vector getVector(String word) {
        word = word.toLowerCase();
        if(objectTable.containsKey(word)) {
            int wordIndex = objectTable.get(word);
            if(wordIndex < syntacticCooccurrence.rows()) {
                return syntacticCooccurrence.getRowVector(wordIndex);
            }
            // At this section, several exception handlers were removed.  These
            // may have been superfluous, or the code may have relied on them
            // being caught.
        }
        return null;
    }
    
    /**
     * Does nothing.
     */
    public void processSpace(Properties properties) {
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
	return "grefenstette-syntatic-analysis";
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
      return syntacticCooccurrence.columns();
    }
}