All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucla.sspace.grefenstette.Grefenstette Maven / Gradle / Ivy

Go to download

The S-Space Package is a collection of algorithms for building Semantic Spaces as well as a highly scalable library for designing new distributional semantics algorithms. Distributional algorithms process text corpora and represent the semantics of words as high-dimensional feature vectors. This package also includes matrices, vectors, and numerous clustering algorithms. These approaches are known by many names, such as word spaces, semantic spaces, or distributed semantics, and rest upon the Distributional Hypothesis: words that appear in similar contexts have similar meanings.

The newest version!
/*
 * Copyright 2009 Grace Park
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.grefenstette;

import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.Matrix;

import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.Vector;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.File;
import java.io.IOException;
import java.io.IOError;
import java.io.PrintWriter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Properties;

import java.util.concurrent.atomic.AtomicInteger;

import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An implementation of a semantic space built from syntactic co-occurrence, as
 * described by Grefenstette.  See the following references for full details.
 * 
    * *
  • G. Grefenstette, * Explorations in Automatic Thesaurus Discovery. Indiana University * Press, 1994.
  • * *
* * * @author Grace Park */ public class Grefenstette implements SemanticSpace { /** * The logger for reporting all debugging information */ private static final Logger LOGGER = Logger.getLogger(Grefenstette.class.getName()); /** * The temporary file used to record syntactic word relations while the * documents are parsed. Relations are written to a file to save memory. */ private final File wordRelations; /** * The writer to the {@code wordRelations} file. */ private final PrintWriter wordRelationsWriter; /** * A mapping from a string token to the integer that represents that token's * row in the {@code syntacticCooccurrence} matrix. */ private final Map objectTable; /** * A mapping from a token in a specific syntactic position to the integer * that represents that token configuration's column. */ private final Map attributeTable; /** * A matrix where rows correspond to tokens and columns correspond to the * syntactic co-occurrence of a specific token in a specific syntactic * position. 
*/ private final Matrix syntacticCooccurrence; /** * An incremental counter used for assigning tokens to matrix row indices */ private final AtomicInteger objectCounter; /** * An incremental counter used for assigning token syntax positions to * matrix column indices */ private final AtomicInteger attributeCounter; /** * Constructs an instance using the system properties for any required * configuration * * @throws IOError if unable to create the backing file to hold data while * processing */ public Grefenstette() { try { wordRelations = File.createTempFile("word-relation-list","txt"); wordRelationsWriter = new PrintWriter(wordRelations); objectTable = new HashMap(); attributeTable = new HashMap(); syntacticCooccurrence = new GrowingSparseMatrix(); objectCounter = new AtomicInteger(0); attributeCounter = new AtomicInteger(0); } catch (IOException ioe) { throw new IOError(ioe); } } /** * {@inheritDoc} */ public void processDocument (BufferedReader document) { ArrayList> wordsInPhrase = new ArrayList>(); String nounPhrase = ""; String lastNoun = ""; String lastVerb = ""; String secondPrevPhrase = ""; String prevPhrase = ""; try { nounPhrase = document.readLine(); } catch (IOException e) { e.printStackTrace(); } for( String tag = getNextTag(nounPhrase); tag != null; tag = getNextTag(nounPhrase) ) { String word; int startOfTag = nounPhrase.indexOf(tag); nounPhrase = nounPhrase.substring(startOfTag); wordsInPhrase.clear(); if( tag.equals("NP") ) { while( nounPhrase.charAt(0) != ')' ) { // extract tag of word in noun phrase tag = getNextTag(nounPhrase); if( isPhraseOrClause(tag) || isPreposition(tag) ) { nounPhrase = nounPhrase. substring(nounPhrase.indexOf(tag) + tag.length()); // stop processing NP break; } else if( inStartSet(tag) || inReceiveSet(tag) ) { // note to self: find out why this broke try { word = nounPhrase. 
substring(nounPhrase.indexOf(" ", nounPhrase.indexOf(tag)) + 1, nounPhrase.indexOf(")")); wordsInPhrase.add(new Pair(tag,word)); nounPhrase = nounPhrase. substring(nounPhrase.indexOf(")", nounPhrase.indexOf(word))+1); } catch (StringIndexOutOfBoundsException e) { nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")")); } // else it's not a tag I care about } else { nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")")+1); } } // note to self: is this if statement represent the same thing // as the next if statement?? if( !wordsInPhrase.isEmpty() ) { // set head noun to last word in noun phrase String headNoun = wordsInPhrase.get(wordsInPhrase.size()-1).y; // create the relations from pass two if( prevPhrase.equals("PP") && secondPrevPhrase.equals("NP") && lastNoun.length() != 0 ) { wordRelationsWriter.println(lastNoun + " " + headNoun); addRelation(lastNoun, headNoun); } // create relations from pass four if( prevPhrase.equals("PP") && secondPrevPhrase.equals("VP") && lastVerb.length() != 0 ) { wordRelationsWriter.println(lastVerb + " " + headNoun); addRelation(lastVerb, headNoun); } else if( prevPhrase.equals("VP") ) { wordRelationsWriter.println(lastVerb + " " + headNoun); addRelation(lastVerb, headNoun); } lastNoun = headNoun; } // reached end of noun phrase if( nounPhrase.charAt(0) == ')' ) { // create relations between words in noun phrase // relations from pass one processWordsInNP(wordsInPhrase); if( !"NP".equals(prevPhrase) ) { secondPrevPhrase = prevPhrase; prevPhrase = "NP"; } } } //end processing NP else if( tag.equals("VP") ) { while( tag != null && tag.startsWith("V") ) { // nonphrase verb if( tag.startsWith("VB") ) { word = nounPhrase.substring( nounPhrase.indexOf(" ", nounPhrase.indexOf(tag))+1, nounPhrase.indexOf(")")); lastVerb = word; } nounPhrase = nounPhrase.substring(nounPhrase.indexOf(tag)+1); tag = getNextTag(nounPhrase); } // relations from pass three if( prevPhrase.equals("NP") && lastNoun.length() != 0 ) { 
wordRelationsWriter.println(lastNoun + " " + lastVerb); addRelation(lastNoun, lastVerb); } if( !prevPhrase.equals("VP") ) { secondPrevPhrase = prevPhrase; prevPhrase = "VP"; } } else if( isPhraseOrClause(tag) || isPreposition(tag) ) { nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag) + tag.length()); if( !tag.equals(prevPhrase) ) { secondPrevPhrase = prevPhrase; prevPhrase = tag; } } else { nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag) + tag.length()); } } } /** * Adds a relation pair to the matrix */ private void addRelation(String object, String attribute) { double val; int row, col; object = object.toLowerCase(); attribute = attribute.toLowerCase(); // get row in matrix if( objectTable.containsKey(object) ) { // if the object already exists in matrix, find its index row = objectTable.get(object); } else { // otherwise give the object a new index number row = Integer.valueOf(objectCounter.getAndIncrement()); // insert new object/index pair into lookup table objectTable.put( object, row ); System.out.println(object + " " + row); } // get column in matrix if( attributeTable.containsKey(attribute) ) { col = attributeTable.get(attribute); } else { col = Integer.valueOf(attributeCounter.getAndIncrement()); attributeTable.put( attribute, col ); } // update entry in matrix which records how many times the // object/attribute pair has been seen if( row < syntacticCooccurrence.rows() && col < syntacticCooccurrence.columns()) { // if there's already an entry for the object and attribute, get the // current value for the pair of words val = syntacticCooccurrence.get(row, col); // increment the current value by one and store in matrix syntacticCooccurrence.set(row, col, val+1); } else { // otherwise set the row, col value to 1 syntacticCooccurrence.set(row, col, 1.0); } } /** * Creates relations between words in a noun phrase */ private void processWordsInNP(ArrayList> wordsInPhrase) { if( wordsInPhrase.size() > 1 ) { // this is from Grefenstette's 
pseudo code for (int i = 0; i < wordsInPhrase.size()-1; i++) { if (inStartSet(wordsInPhrase.get(i).x) ) { for (int j = i+1; j < wordsInPhrase.size(); j++ ) { if (inReceiveSet( wordsInPhrase.get(j).x ) ) { wordRelationsWriter. println(wordsInPhrase.get(j).y + " " + wordsInPhrase.get(i).y); // System.out.println(wordsInPhrase.get(j).y + " " // + wordsInPhrase.get(i).y); addRelation(wordsInPhrase.get(j).y, wordsInPhrase.get(i).y); } } } } } } /** * Checks to see if the tag can modify another word * * @param tag A tag from the parsed corpus to be checked */ private boolean inStartSet(String tag) { return // noun tag.startsWith("NN") || // adjective tag.startsWith("JJ") || // adverb tag.startsWith("RB") || // cardinal number tag.startsWith("CD"); } /** * Checks to see if tag can be modified by a word in StartSet */ private boolean inReceiveSet(String tag) { return tag.startsWith("NN") || tag.startsWith("VB"); } /** * Checks to see if tag is a preposition */ private boolean isPreposition(String tag) { return tag.startsWith("PP"); } /** * Checks to see if tag marks a phrase or clause */ private boolean isPhraseOrClause(String tag) { // find out why adding more reduced the number of relations return (!tag.equals("SYM") && tag.startsWith("S")) || tag.equals("ADJP") || tag.equals("ADVP") || tag.equals("CONJP") || tag.equals("FRAG") || tag.equals("INTJ") || tag.equals("LST") || tag.equals("NAC") || tag.equals("NP") || tag.equals("NX") || tag.equals("PP") || tag.equals("PRN") || /* removing prt adds 1% more relations */ tag.equals("PRT") || tag.equals("QP") || tag.equals("RRC") || tag.equals("UCP") || tag.equals("VP") || tag.startsWith("WH") || tag.equals("X"); } /** * Returns the next tag in the sentence or null if there are no more tags * @param str The sentence that the tag is extracted from */ private String getNextTag(String str) { String tag; int endIndex; int tagIndex = str.indexOf("("); if( tagIndex < 0 ) { return null; } // in case there's nothing in the sentence 
endIndex = str.indexOf(" ", tagIndex); if( endIndex < 0 ) { return null; } tag = str.substring( tagIndex+1, endIndex ); if( tag.length() > 0 ) { return tag; } else { str = str.substring( tagIndex+1 ); return getNextTag(str); } } /** * {@inheritDoc} */ public Set getWords() { return Collections.unmodifiableSet(objectTable.keySet()); } /** * {@inheritDoc} */ public Vector getVector(String word) { word = word.toLowerCase(); if(objectTable.containsKey(word)) { int wordIndex = objectTable.get(word); if(wordIndex < syntacticCooccurrence.rows()) { return syntacticCooccurrence.getRowVector(wordIndex); } // At this section, several exception handlers were removed. These // may have been superfluous, or the code may have relied on them // being caught. } return null; } /** * Does nothing. */ public void processSpace(Properties properties) { } /** * {@inheritDoc} */ public String getSpaceName() { return "grefenstette-syntatic-analysis"; } /** * {@inheritDoc} */ public int getVectorLength() { return syntacticCooccurrence.columns(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy