edu.ucla.sspace.evaluation.RubensteinGoodenoughWordSimilarityEvaluation Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high-dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.evaluation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import edu.ucla.sspace.common.SemanticSpace;
/**
* A collection of human similarity judgements of word pairs gathered by
* Rubenstein and Goodenough. See the following paper for full details.
*
* <ul>
*
* <li> Rubenstein, H. and Goodenough, J. B. Contextual Correlates of Synonymy.
* <i>Communications of the ACM</i>, 1965, 8, 627-633.</li>
*
* </ul>
*/
public class RubensteinGoodenoughWordSimilarityEvaluation
        implements WordSimilarityEvaluation {

    /**
     * The human judgements on word relatedness, in the order in which they
     * were read from the data file.
     */
    private final Collection pairs;

    /**
     * The name of the data file for this test; used only by
     * {@link #toString()}.
     */
    private final String dataFileName;

    /**
     * Constructs this word similarity evaluation test using the Rubenstein
     * and Goodenough data file referred to by the provided name.
     *
     * @param rbSimFileName the path of the data file to load
     *
     * @throws IOError if the file cannot be read
     * @throws Error if a data line does not consist of exactly three
     *         whitespace-separated fields
     */
    public RubensteinGoodenoughWordSimilarityEvaluation(String rbSimFileName) {
        this(new File(rbSimFileName));
    }

    /**
     * Constructs this word similarity evaluation test using the provided
     * Rubenstein and Goodenough data file.  The file is read in full during
     * construction.
     *
     * @param rbSimFile the data file to load
     *
     * @throws IOError if the file cannot be read
     * @throws Error if a data line does not consist of exactly three
     *         whitespace-separated fields
     */
    public RubensteinGoodenoughWordSimilarityEvaluation(File rbSimFile) {
        pairs = parse(rbSimFile);
        dataFileName = rbSimFile.getName();
    }

    /**
     * Parses the data file and returns the judgements it contains.
     *
     * <p>The first line of the file is always skipped.  Of the remaining
     * lines, those that are blank or that start with {@code #} (after
     * surrounding whitespace is removed) are ignored.  Every other line must
     * have the form {@code word1 word2 score}, with the fields separated by
     * whitespace.
     *
     * @param dataFile the file to read
     *
     * @return the judgements, in file order
     *
     * @throws IOError if reading or closing the file fails
     * @throws Error if a data line does not have exactly three fields
     * @throws NumberFormatException if the third field of a data line is not
     *         parseable as a {@code double}
     */
    private Collection parse(File dataFile) {
        Collection parsed = new LinkedList();
        try {
            BufferedReader br = new BufferedReader(new FileReader(dataFile));
            // Close the reader on every exit path; previously the file
            // handle was leaked, both on success and on a parse failure.
            try {
                // skip the first line
                br.readLine();
                for (String rawLine = null;
                         (rawLine = br.readLine()) != null; ) {
                    // Trim first so that indented or whitespace-only lines
                    // are not mistaken for malformed data: split("\\s+")
                    // would yield a leading empty token for the former and
                    // an empty array for the latter.
                    String line = rawLine.trim();

                    // skip comments and blank lines
                    if (line.length() == 0 || line.startsWith("#")) {
                        continue;
                    }

                    String[] wordsAndNum = line.split("\\s+");
                    if (wordsAndNum.length != 3) {
                        throw new Error(
                            "Unexpected line formatting: " + rawLine);
                    }
                    parsed.add(new SimpleWordSimilarity(
                                   wordsAndNum[0], wordsAndNum[1],
                                   Double.parseDouble(wordsAndNum[2])));
                }
            } finally {
                br.close();
            }
        } catch (IOException ioe) {
            // rethrow, as an I/O failure is fatal to the evaluation
            throw new IOError(ioe);
        }
        return parsed;
    }

    /**
     * {@inheritDoc}
     */
    public Collection getPairs() {
        return pairs;
    }

    /**
     * {@inheritDoc}
     */
    public double getMostSimilarValue() {
        // NOTE(review): the 1965 Rubenstein & Goodenough study rated pairs on
        // a 0-4 scale; confirm that the expected data file is rescaled to
        // 0-10 before relying on this bound.
        return 10d;
    }

    /**
     * {@inheritDoc}
     */
    public double getLeastSimilarValue() {
        return 0d;
    }

    /**
     * Returns the name of this test together with the name of the data file
     * it was loaded from.
     */
    @Override
    public String toString() {
        return "Rubenstein & Goodenough Word Similarity Test ["
            + dataFileName + "]";
    }
}