// cmu.arktweetnlp.impl.Vocabulary (Maven / Gradle / Ivy)
// The newest version!
package cmu.arktweetnlp.impl;
import java.util.ArrayList;
import java.util.HashMap;
import edu.berkeley.nlp.util.StringUtils;
/** Could scrap this and use ark-regression's version -- that one has CheapStrings **/
public class Vocabulary {
private HashMap name2num;
private ArrayList num2name;
private boolean isLocked = false;
Vocabulary() {
name2num = new HashMap();
num2name = new ArrayList();
}
public void lock() {
isLocked = true;
}
public boolean isLocked() { return isLocked; }
public int size() {
assert name2num.size() == num2name.size();
return name2num.size();
}
/**
* If not locked, an unknown name is added to the vocabulary.
* If locked, return -1 on OOV.
* @param featname
* @return
*/
public int num(String featname) {
if (! name2num.containsKey(featname)) {
if (isLocked) return -1;
int n = name2num.size();
name2num.put(featname, n);
num2name.add(featname);
return n;
} else {
return name2num.get(featname);
}
}
public String name(int num) {
if (num2name.size() <= num) {
throw new RuntimeException("Unknown number for vocab: " + num);
} else {
return num2name.get(num);
}
}
public boolean contains(String name) {
return name2num.containsKey(name);
}
public String toString() {
return "[" + StringUtils.join(num2name) + "]";
}
/** Throw an error if OOV **/
public int numStrict(String string) {
assert isLocked;
int n = num(string);
if (n == -1) throw new RuntimeException("OOV happened");
return n;
}
}