edu.stanford.nlp.process.DistSimClassifier Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.process;
import java.io.Serializable;
import java.util.Map;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Timing;
/**
 * Maps a String to its distributional similarity class.
 * <p>
 * The word-to-class lexicon is read once, in the constructor, from a text
 * file with one entry per line. Words are normalized (optionally lowercased
 * and optionally mapped through the {@code WORDSHAPEDIGITS} word shape) both
 * when the lexicon is loaded and when it is queried, so that lookups use
 * the same key form that was stored.
 *
 * @author Christopher Manning
 */
public class DistSimClassifier implements Serializable {

  private static final long serialVersionUID = 3L;

  /** Normalized word -&gt; distributional similarity class. */
  private final Map<String, String> lexicon;
  /** If false, words are lowercased before being stored or looked up. */
  private final boolean cased;
  /** If true, words are passed through the WORDSHAPEDIGITS word shape before being stored or looked up. */
  private final boolean numberEquivalence;
  /** Class returned for words that are not in the lexicon. */
  private final String unknownWordClass;

  /**
   * Loads an "alexClark" format, utf-8 encoded lexicon, using {@code "NULL"}
   * as the class of unknown words.
   *
   * @param filename path of the lexicon file
   * @param cased whether word case is significant
   * @param numberEquivalence whether to map words through the digit word shape
   */
  public DistSimClassifier(String filename, boolean cased, boolean numberEquivalence) {
    this(filename, "alexClark", "utf-8", -1, cased, numberEquivalence, "NULL");
  }

  /**
   * Loads an "alexClark" format, utf-8 encoded lexicon.
   *
   * @param filename path of the lexicon file
   * @param cased whether word case is significant
   * @param numberEquivalence whether to map words through the digit word shape
   * @param unknownWordClass class returned for words not in the lexicon
   */
  public DistSimClassifier(String filename, boolean cased,
                           boolean numberEquivalence, String unknownWordClass) {
    this(filename, "alexClark", "utf-8", -1, cased, numberEquivalence, unknownWordClass);
  }

  /**
   * Loads a lexicon from a file.
   * <p>
   * If {@code format} is {@code "terryKoo"}, each line is tab-separated with
   * the class in the first field and the word in the second, and the class is
   * truncated to {@code distSimMaxBits} characters when that value is positive.
   * Any other format value is treated as "alexClark": each line is
   * whitespace-separated with the word first and the class second.
   * Blank lines are skipped.
   *
   * @param filename path of the lexicon file
   * @param format {@code "terryKoo"} or anything else for "alexClark"
   * @param encoding character encoding of the file
   * @param distSimMaxBits maximum class length for "terryKoo" format; values
   *        &lt;= 0 mean no truncation
   * @param cased whether word case is significant
   * @param numberEquivalence whether to map words through the digit word shape
   * @param unknownWordClass class returned for words not in the lexicon
   * @throws IllegalArgumentException if a non-blank line has fewer than two fields
   */
  public DistSimClassifier(String filename, String format, String encoding,
                           int distSimMaxBits,
                           boolean cased, boolean numberEquivalence,
                           String unknownWordClass) {
    this.cased = cased;
    this.numberEquivalence = numberEquivalence;
    this.unknownWordClass = unknownWordClass;
    Timing.startDoing("Loading distsim lexicon from " + filename);
    lexicon = Generics.newHashMap(1 << 15);  // make a reasonable starting size
    boolean terryKoo = "terryKoo".equals(format);
    for (String line : ObjectBank.getLineIterator(filename, encoding)) {
      if (line.trim().isEmpty()) {
        // e.g. a trailing empty line; nothing to load
        continue;
      }
      String[] bits = terryKoo ? line.split("\\t") : line.split("\\s+");
      if (bits.length < 2) {
        throw new IllegalArgumentException("Malformed distsim lexicon line in " +
            filename + " (expected at least 2 fields): \"" + line + '"');
      }
      String word;
      String wordClass;
      if (terryKoo) {
        // class (a bit string) comes first, then the word
        word = bits[1];
        wordClass = bits[0];
        if (distSimMaxBits > 0 && wordClass.length() > distSimMaxBits) {
          wordClass = wordClass.substring(0, distSimMaxBits);
        }
      } else {
        // "alexClark": word first, then the class
        word = bits[0];
        wordClass = bits[1];
      }
      lexicon.put(normalize(word), wordClass);
    }
    Timing.endDoing();
  }

  /**
   * Converts a word to the key form used by the lexicon: lowercased unless
   * this classifier is cased, then mapped through the digit word shape if
   * number equivalence is enabled.
   */
  private String normalize(String word) {
    if ( ! cased) {
      word = word.toLowerCase();
    }
    if (numberEquivalence) {
      word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
    }
    return word;
  }

  /**
   * Returns the distributional similarity class of a word.
   *
   * @param word the word to look up; it is normalized in the same way as
   *        the lexicon entries were when loaded
   * @return the class stored for the word, or the unknown word class if the
   *         word is not in the lexicon
   */
  public String distSimClass(String word) {
    String distSim = lexicon.get(normalize(word));
    if (distSim == null) {
      distSim = unknownWordClass;
    }
    return distSim;
  }

}