// edu.stanford.nlp.parser.lexparser.BinaryGrammar Maven / Gradle / Ivy
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import java.io.*;
import java.util.*;
/**
 * Maintains efficient indexing of binary grammar rules.
 * <p>
 * Every rule that is added is recorded in a master list and is additionally
 * filed under its parent, left-child and right-child state numbers, so the
 * rules touching a given state can be fetched without scanning the grammar.
 * Only the state index and the master rule list are serialized; all per-state
 * lookup structures are transient and are rebuilt on deserialization.
 *
 * @author Dan Klein
 * @author Christopher Manning (generified and optimized storage)
 */
public class BinaryGrammar implements Serializable, Iterable<BinaryRule> {

  /** Maps state numbers to state names; its size at {@link #init()} time fixes the per-state array lengths. */
  private final Index<String> index;

  /** Every rule in the grammar, in the order in which it was added. */
  private final List<BinaryRule> allRules;

  /** Rules filed by the state number of their parent. */
  private transient List<BinaryRule>[] rulesWithParent;
  /** Rules filed by the state number of their left child. */
  private transient List<BinaryRule>[] rulesWithLC;
  /** Rules filed by the state number of their right child. */
  private transient List<BinaryRule>[] rulesWithRC;
  /*
   * Set-valued counterparts of rulesWithLC / rulesWithRC. Within this class
   * they are only written to (by addRule); nothing here reads them, and
   * readObject does not refill them after deserialization.
   */
  private transient Set<BinaryRule>[] ruleSetWithLC;
  private transient Set<BinaryRule>[] ruleSetWithRC;
  /** Filtered array views built by {@link #splitRules()}, indexed by left-child state. */
  private transient BinaryRule[][] splitRulesWithLC;
  /** Filtered array views built by {@link #splitRules()}, indexed by right-child state. */
  private transient BinaryRule[][] splitRulesWithRC;
  /** Maps each rule to the stored instance equal to it, for score lookup. */
  private transient Map<BinaryRule, BinaryRule> ruleMap;
  // for super speed! (maybe)
  /** {@code synthetic[s]} caches whether state {@code s} is synthetic; filled in by {@link #splitRules()}. */
  private transient boolean[] synthetic;

  /**
   * Returns the number of rules in this grammar.
   *
   * @return the size of the master rule list
   */
  public int numRules() {
    return allRules.size();
  }

  /**
   * Returns all rules of this grammar.
   *
   * @return the internal master rule list itself (not a copy)
   */
  public List<BinaryRule> rules() {
    return allRules;
  }

  /**
   * Tells whether a state is synthetic, i.e. has no name in the index or a
   * name starting with {@code '@'}. Only valid after {@link #splitRules()}
   * has run, because that method fills the lookup array.
   *
   * @param state state number
   * @return {@code true} if the state is synthetic
   */
  public boolean isSynthetic(int state) {
    return synthetic[state];
  }

  /**
   * Populates the "splitRules" accessor lists using the existing rule lists.
   * If the state is synthetic, these lists contain all rules for the state.
   * If the state is NOT synthetic, these lists contain only the rules in
   * which both children are not synthetic.
   * <p>
   * This method must be called before the grammar is
   * used, either after training or deserializing grammar.
   */
  public void splitRules() {
    // first initialize the synthetic array
    int numStates = index.size();
    synthetic = new boolean[numStates];
    for (int s = 0; s < numStates; s++) {
      String stateName = index.get(s);
      // A state without a name counts as synthetic. An explicit null check
      // replaces catching NullPointerException, and startsWith also copes
      // with an empty name (which is simply not synthetic).
      synthetic[s] = (stateName == null || stateName.startsWith("@"));
    }

    splitRulesWithLC = new BinaryRule[numStates][];
    splitRulesWithRC = new BinaryRule[numStates][];
    // rules accessed by their "synthetic" child or left child if none
    for (int state = 0; state < numStates; state++) {
      if (isSynthetic(state)) {
        // cdm 2012: I thought sorting the rules might help with speed
        // (memory locality) but didn't seem to
        splitRulesWithLC[state] = toRuleArray(rulesWithLC[state]);
        splitRulesWithRC[state] = toRuleArray(rulesWithRC[state]);
      } else {
        // if state is not synthetic, we add rule to splitRules only if both
        // children are not synthetic
        List<BinaryRule> ruleList = new ArrayList<>();
        // do left: this state is the left child, so filter on the right child
        for (BinaryRule br : rulesWithLC[state]) {
          if (!isSynthetic(br.rightChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithLC[state] = toRuleArray(ruleList);
        // do right: this state is the right child, so filter on the left child
        ruleList.clear();
        for (BinaryRule br : rulesWithRC[state]) {
          if (!isSynthetic(br.leftChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithRC[state] = toRuleArray(ruleList);
      }
    }
  }

  /** Copies a rule list into a freshly allocated array of exactly matching length. */
  private static BinaryRule[] toRuleArray(List<BinaryRule> rules) {
    return rules.toArray(new BinaryRule[rules.size()]);
  }

  /**
   * Returns the split rules having the given state as left child, as built
   * by {@link #splitRules()}.
   *
   * @param state state number; not bounds-checked
   * @return the internal array for that state (not a copy)
   */
  public BinaryRule[] splitRulesWithLC(int state) {
    return splitRulesWithLC[state];
  }

  /**
   * Returns the split rules having the given state as right child, as built
   * by {@link #splitRules()}.
   *
   * @param state state number; not bounds-checked
   * @return the internal array for that state (not a copy)
   */
  public BinaryRule[] splitRulesWithRC(int state) {
    return splitRulesWithRC[state];
  }

  /**
   * Looks up the score of a rule.
   *
   * @param br rule used as lookup key
   * @return the score of the stored rule equal to {@code br}, or
   *         {@link Double#NEGATIVE_INFINITY} if the grammar has no such rule
   */
  public double scoreRule(BinaryRule br) {
    BinaryRule rule = ruleMap.get(br);
    return (rule != null ? rule.score : Double.NEGATIVE_INFINITY);
  }

  /**
   * Adds a rule to the master list and to every per-state lookup structure.
   * The split-rule arrays are not updated; call {@link #splitRules()} once
   * all rules have been added.
   *
   * @param br rule to add; its parent and child state numbers must lie
   *           within the per-state arrays
   */
  public void addRule(BinaryRule br) {
    rulesWithParent[br.parent].add(br);
    rulesWithLC[br.leftChild].add(br);
    rulesWithRC[br.rightChild].add(br);
    ruleSetWithLC[br.leftChild].add(br);
    ruleSetWithRC[br.rightChild].add(br);
    allRules.add(br);
    ruleMap.put(br, br);
  }

  /**
   * Iterates over all rules of the grammar in the order they were added.
   *
   * @return an iterator over the master rule list
   */
  @Override
  public Iterator<BinaryRule> iterator() {
    return allRules.iterator();
  }

  /**
   * Iterates over the rules whose parent is the given state.
   *
   * @param state state number
   * @return an iterator over the matching rules; empty if {@code state} lies
   *         beyond the per-state arrays
   */
  public Iterator<BinaryRule> ruleIteratorByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithParent[state].iterator();
  }

  /**
   * Iterates over the rules whose right child is the given state.
   *
   * @param state state number
   * @return an iterator over the matching rules; empty if {@code state} lies
   *         beyond the per-state arrays
   */
  public Iterator<BinaryRule> ruleIteratorByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithRC[state].iterator();
  }

  /**
   * Iterates over the rules whose left child is the given state.
   *
   * @param state state number
   * @return an iterator over the matching rules; empty if {@code state} lies
   *         beyond the per-state arrays
   */
  public Iterator<BinaryRule> ruleIteratorByLeftChild(int state) {
    if (state >= rulesWithLC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithLC[state].iterator();
  }

  /**
   * Returns the rules whose parent is the given state.
   *
   * @param state state number
   * @return the internal list for that state, or an empty list if
   *         {@code state} lies beyond the per-state arrays
   */
  public List<BinaryRule> ruleListByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.emptyList();
    }
    return rulesWithParent[state];
  }

  /**
   * Returns the rules whose right child is the given state.
   *
   * @param state state number
   * @return the internal list for that state, or an empty list if
   *         {@code state} lies beyond the per-state arrays
   */
  public List<BinaryRule> ruleListByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.emptyList();
    }
    return rulesWithRC[state];
  }

  /**
   * Returns the rules whose left child is the given state.
   *
   * @param state state number
   * @return the internal list for that state, or an empty list if
   *         {@code state} lies beyond the per-state arrays
   */
  public List<BinaryRule> ruleListByLeftChild(int state) {
    // Guard with the array that is actually indexed below (this used to
    // test rulesWithRC.length).
    if (state >= rulesWithLC.length) {
      return Collections.emptyList();
    }
    return rulesWithLC[state];
  }

  /**
   * Custom deserialization hook: restores the serialized fields, then
   * rebuilds the transient per-state lists, the rule map and the split-rule
   * arrays from the master rule list.
   * <p>
   * NOTE: unlike {@link #addRule(BinaryRule)}, this does not refill
   * {@code ruleSetWithLC} / {@code ruleSetWithRC}; they stay empty after
   * deserialization.
   */
  private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
    stream.defaultReadObject();
    init();
    for (BinaryRule br : allRules) {
      rulesWithParent[br.parent].add(br);
      rulesWithLC[br.leftChild].add(br);
      rulesWithRC[br.rightChild].add(br);
      ruleMap.put(br, br);
    }
    splitRules();
  }

  /**
   * Allocates the empty rule map and one empty list/set per state for each
   * per-state structure, sized by the current size of the state index.
   */
  @SuppressWarnings("unchecked") // generic array creation; each slot is filled with a correctly typed collection below
  private void init() {
    ruleMap = Generics.newHashMap();
    int numStates = index.size();
    rulesWithParent = new List[numStates];
    rulesWithLC = new List[numStates];
    rulesWithRC = new List[numStates];
    ruleSetWithLC = new Set[numStates];
    ruleSetWithRC = new Set[numStates];
    for (int s = 0; s < numStates; s++) {
      rulesWithParent[s] = new ArrayList<>();
      rulesWithLC[s] = new ArrayList<>();
      rulesWithRC[s] = new ArrayList<>();
      ruleSetWithLC[s] = Generics.newHashSet();
      ruleSetWithRC[s] = Generics.newHashSet();
    }
  }

  /**
   * Creates an empty grammar over the given states.
   *
   * @param stateIndex index of the grammar's states; its size at this moment
   *                   determines how many states the lookup arrays cover
   */
  public BinaryGrammar(Index<String> stateIndex) {
    this.index = stateIndex;
    allRules = new ArrayList<>();
    init();
  }

  /**
   * Populates data in this BinaryGrammar from the character stream
   * given by the reader {@code in}. One rule is read per line, stopping at
   * the first empty line or at end of input; {@link #splitRules()} is then
   * called.
   *
   * @param in Where input is read from
   * @throws IOException If format is bung; the underlying failure is
   *                     attached as the cause
   */
  public void readData(BufferedReader in) throws IOException {
    String line;
    int lineNum = 1;
    line = in.readLine();
    while (line != null && line.length() > 0) {
      try {
        addRule(new BinaryRule(line, index));
      } catch (Exception e) {
        // keep the original failure as the cause so the real problem is not lost
        throw new IOException("Error on line " + lineNum, e);
      }
      lineNum++;
      line = in.readLine();
    }
    splitRules();
  }

  /**
   * Writes out data from this Object to the Writer w, one rule per line.
   * The writer is flushed but not closed.
   *
   * @param w Where output is written
   * @throws IOException If data can't be written
   */
  public void writeData(Writer w) throws IOException {
    PrintWriter out = new PrintWriter(w);
    for (BinaryRule br : this) {
      out.println(br.toString(index));
    }
    // PrintWriter never throws on write failure; checkError() flushes the
    // stream and reports whether any write went wrong.
    if (out.checkError()) {
      throw new IOException("Error writing binary grammar data");
    }
  }

  private static final long serialVersionUID = 1L;

} // end class BinaryGrammar