edu.ucla.sspace.evaluation.FinkelsteinEtAl353WordSimilarityEvaluation Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high-dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.evaluation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import edu.ucla.sspace.common.SemanticSpace;
/**
 * A collection of human similarity judgements of word pairs gathered by
 * Finkelstein et al. (referred to in this class as the WS353 or
 * WordSimilarity353 data).  See the authors' website for access to the test
 * data.  See the following reference for full details on how the data was
 * gathered.
 *
 * <ul>
 *
 *   <li> Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin,
 *     Zach Solan, Gadi Wolfman, and Eytan Ruppin, "Placing Search in
 *     Context: The Concept Revisited", ACM Transactions on Information
 *     Systems, 20(1):116-131, January 2002.
 *
 * </ul>
 *
 * <p>The data file is read exactly once, when an instance is constructed;
 * the parsed judgements are then held in memory for the lifetime of the
 * instance.
 */
public class FinkelsteinEtAl353WordSimilarityEvaluation
        implements WordSimilarityEvaluation {

    /**
     * A collection of human judgements on word relatedness.  Each element is
     * a {@code SimpleWordSimilarity} created by {@link #parse(File)}.
     */
    private final Collection pairs;

    /**
     * The name of the data file for this test (file name only, without the
     * directory), used by {@link #toString()}.
     */
    private final String dataFileName;

    /**
     * Constructs this word similarity evaluation test using the WS353 data
     * file referred to by the provided name.
     *
     * @param word353fileName the path of the WS353 data file
     *
     * @throws IOError if the file cannot be read
     * @throws Error if a non-blank data line does not have exactly three
     *         fields
     */
    public FinkelsteinEtAl353WordSimilarityEvaluation(String word353fileName) {
        this(new File(word353fileName));
    }

    /**
     * Constructs this word similarity evaluation test using the provided
     * WS353 data file.
     *
     * @param word353file the WS353 data file
     *
     * @throws IOError if the file cannot be read
     * @throws Error if a non-blank data line does not have exactly three
     *         fields
     */
    public FinkelsteinEtAl353WordSimilarityEvaluation(File word353file) {
        pairs = parse(word353file);
        dataFileName = word353file.getName();
    }

    /**
     * Parses the WordSimilarity353 file and returns the set of judgements.
     *
     * <p>The first line of the file is treated as a header and discarded.
     * Every following non-blank line must contain exactly three fields: the
     * two words and their numeric similarity score.
     *
     * @param word353file the data file to read
     *
     * @return the judgements in the order they appear in the file
     *
     * @throws IOError if reading the file fails
     * @throws Error if a non-blank data line does not have exactly three
     *         fields
     * @throws NumberFormatException if the third field of a line is not a
     *         parsable number
     */
    private Collection parse(File word353file) {
        // The ws353 data set comes in two formats, a comma-separated format
        // and a tab-separated format.  Support both by checking the file
        // name suffix.
        String delimiter = (word353file.getName().endsWith(".csv"))
            ? "," : "\\s";

        Collection parsed = new LinkedList();
        try {
            BufferedReader br =
                new BufferedReader(new FileReader(word353file));
            // Always release the file handle, whether parsing finishes
            // normally or is aborted by an exception or error.
            try {
                // skip the first (header) line
                br.readLine();
                for (String line = null; (line = br.readLine()) != null; ) {
                    // Tolerate blank lines (e.g. a trailing empty line)
                    // rather than rejecting the whole file.
                    if (line.trim().length() == 0) {
                        continue;
                    }
                    String[] wordsAndNum = line.split(delimiter);
                    if (wordsAndNum.length != 3) {
                        throw new Error("Unexpected line formatting: " + line);
                    }
                    parsed.add(new SimpleWordSimilarity(
                                   wordsAndNum[0], wordsAndNum[1],
                                   Double.parseDouble(wordsAndNum[2])));
                }
            } finally {
                br.close();
            }
        } catch (IOException ioe) {
            // rethrow, as an IOException is fatal to the evaluation
            throw new IOError(ioe);
        }
        return parsed;
    }

    /**
     * {@inheritDoc}
     */
    public Collection getPairs() {
        return pairs;
    }

    /**
     * {@inheritDoc}
     *
     * <p>This implementation always returns {@code 10}.
     */
    public double getMostSimilarValue() {
        return 10d;
    }

    /**
     * {@inheritDoc}
     *
     * <p>This implementation always returns {@code 0}.
     */
    public double getLeastSimilarValue() {
        return 0d;
    }

    /**
     * Returns a description of this test that includes the name of the data
     * file it was loaded from.
     */
    @Override
    public String toString() {
        return "Finkelstein et al. Word Similarity Test [" + dataFileName + "]";
    }
}