edu.stanford.nlp.parser.lexparser.BinaryGrammar Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;

import java.io.*;
import java.util.*;

/**
 * Maintains efficient indexing of binary grammar rules.
 *
 * @author Dan Klein
 * @author Christopher Manning (generified and optimized storage)
 */
public class BinaryGrammar implements Serializable, Iterable<BinaryRule> {

  // private static final BinaryRule[] EMPTY_BINARY_RULE_ARRAY = new BinaryRule[0];

  /** Index of grammar states; its size fixes the length of every per-state array below. */
  private final Index<String> index;

  /** Every rule in the grammar, in insertion order. This is the only rule storage that is serialized. */
  private final List<BinaryRule> allRules;

  // The per-state lookup tables below are transient: they are rebuilt from
  // allRules by init() / readObject() / splitRules().
  private transient List<BinaryRule>[] rulesWithParent;
  private transient List<BinaryRule>[] rulesWithLC;
  private transient List<BinaryRule>[] rulesWithRC;
  private transient Set<BinaryRule>[] ruleSetWithLC;
  private transient Set<BinaryRule>[] ruleSetWithRC;
  private transient BinaryRule[][] splitRulesWithLC;
  private transient BinaryRule[][] splitRulesWithRC;
  //  private transient BinaryRule[][] splitRulesWithParent = null;
  private transient Map<BinaryRule, BinaryRule> ruleMap;
  // for super speed! (maybe)
  private transient boolean[] synthetic;


  /**
   * Returns the number of rules in this grammar.
   *
   * @return The size of the list of all rules
   */
  public int numRules() {
    return allRules.size();
  }

  /**
   * Returns all rules of this grammar. The returned list is the internal
   * list itself, not a copy, so callers should not modify it.
   *
   * @return The list of all rules
   */
  public List<BinaryRule> rules() {
    return allRules;
  }

  /**
   * Says whether a state is synthetic. Only valid after
   * {@link #splitRules()} has been called, since that is where the
   * synthetic array is built.
   *
   * @param state The state number
   * @return Whether the state was marked synthetic by {@link #splitRules()}
   */
  public boolean isSynthetic(int state) {
    return synthetic[state];
  }

  /**
   * Populates the "splitRules" accessor lists using the existing rule lists.
   * If the state is synthetic, these lists contain all rules for the state.
   * If the state is NOT synthetic, these lists contain only the rules in
   * which both children are not synthetic.
   * <p>
   * A state is considered synthetic if its name in the index starts with
   * '@' or if the index has no name (null) for it. An empty name is treated
   * as not synthetic.
   * <p>
   * <i>This method must be called before the grammar is
   * used, either after training or deserializing grammar.</i>
   */
  public void splitRules() {
    // first initialize the synthetic array
    int numStates = index.size();
    synthetic = new boolean[numStates];
    for (int s = 0; s < numStates; s++) {
      String stateName = index.get(s);
      // A missing (null) name counts as synthetic; this used to be handled
      // by catching the NullPointerException from charAt().
      synthetic[s] = (stateName == null || stateName.startsWith("@"));
    }

    splitRulesWithLC = new BinaryRule[numStates][];
    splitRulesWithRC = new BinaryRule[numStates][];
    //    splitRulesWithParent = new BinaryRule[numStates][];
    // rules accessed by their "synthetic" child or left child if none
    for (int state = 0; state < numStates; state++) {
      //      System.out.println("Splitting rules for state: " + index.get(state));
      // check synthetic
      if (isSynthetic(state)) {
        splitRulesWithLC[state] = rulesWithLC[state].toArray(new BinaryRule[rulesWithLC[state].size()]);
        // cdm 2012: I thought sorting the rules might help with speed (memory locality) but didn't seem to
        // Arrays.sort(splitRulesWithLC[state]);
        splitRulesWithRC[state] = rulesWithRC[state].toArray(new BinaryRule[rulesWithRC[state].size()]);
        // Arrays.sort(splitRulesWithRC[state]);
      } else {
        // if state is not synthetic, we add rule to splitRules only if both children are not synthetic
        // do left
        List<BinaryRule> ruleList = new ArrayList<BinaryRule>();
        for (BinaryRule br : rulesWithLC[state]) {
          if ( ! isSynthetic(br.rightChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithLC[state] = ruleList.toArray(new BinaryRule[ruleList.size()]);
        // Arrays.sort(splitRulesWithLC[state]);
        // do right (the scratch list is reused; toArray above already copied it)
        ruleList.clear();
        for (BinaryRule br : rulesWithRC[state]) {
          if ( ! isSynthetic(br.leftChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithRC[state] = ruleList.toArray(new BinaryRule[ruleList.size()]);
        // Arrays.sort(splitRulesWithRC[state]);
      }
      // parent accessor
      //      splitRulesWithParent[state] = toBRArray(rulesWithParent[state]);
    }
  }

  /**
   * Returns the split rules that have the given state as left child.
   * Requires {@link #splitRules()} to have been called. The internal array
   * is returned without copying and the state is not range-checked.
   *
   * @param state The left child state number
   * @return The array built for this state by {@link #splitRules()}
   */
  public BinaryRule[] splitRulesWithLC(int state) {
    // if (state >= splitRulesWithLC.length) {
    //   return EMPTY_BINARY_RULE_ARRAY;
    // }
    return splitRulesWithLC[state];
  }

  /**
   * Returns the split rules that have the given state as right child.
   * Requires {@link #splitRules()} to have been called. The internal array
   * is returned without copying and the state is not range-checked.
   *
   * @param state The right child state number
   * @return The array built for this state by {@link #splitRules()}
   */
  public BinaryRule[] splitRulesWithRC(int state) {
    // if (state >= splitRulesWithRC.length) {
    //   return EMPTY_BINARY_RULE_ARRAY;
    // }
    return splitRulesWithRC[state];
  }

  //  public BinaryRule[] splitRulesWithParent(int state) {
  //    return splitRulesWithParent[state];
  //  }

  // the sensible version

  /**
   * Looks up the score of a rule in this grammar.
   *
   * @param br The rule to look up (used as a key into the rule map, so the
   *     match presumably depends on BinaryRule's equals/hashCode)
   * @return The score of the stored rule, or
   *     {@code Double.NEGATIVE_INFINITY} if the rule is not in the grammar
   */
  public double scoreRule(BinaryRule br) {
    BinaryRule rule = ruleMap.get(br);
    return (rule != null ? rule.score : Double.NEGATIVE_INFINITY);
  }

  /**
   * Adds a rule to the grammar and to all of the per-state lookup tables.
   * The split-rule arrays are not updated; call {@link #splitRules()}
   * after the last rule has been added.
   *
   * @param br The rule to add
   */
  public void addRule(BinaryRule br) {
    //    System.out.println("BG adding rule " + br);
    rulesWithParent[br.parent].add(br);
    rulesWithLC[br.leftChild].add(br);
    rulesWithRC[br.rightChild].add(br);
    ruleSetWithLC[br.leftChild].add(br);
    ruleSetWithRC[br.rightChild].add(br);
    allRules.add(br);
    ruleMap.put(br, br);
  }


  /**
   * Iterates over all rules of the grammar.
   *
   * @return An iterator over the list of all rules
   */
  @Override
  public Iterator<BinaryRule> iterator() {
    return allRules.iterator();
  }

  /**
   * Iterates over the rules with the given parent state.
   *
   * @param state The parent state number
   * @return An iterator over the matching rules; empty if the state number
   *     is beyond the states known when the tables were built
   */
  public Iterator<BinaryRule> ruleIteratorByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithParent[state].iterator();
  }

  /**
   * Iterates over the rules with the given right child state.
   *
   * @param state The right child state number
   * @return An iterator over the matching rules; empty if the state number
   *     is beyond the states known when the tables were built
   */
  public Iterator<BinaryRule> ruleIteratorByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithRC[state].iterator();
  }

  /**
   * Iterates over the rules with the given left child state.
   *
   * @param state The left child state number
   * @return An iterator over the matching rules; empty if the state number
   *     is beyond the states known when the tables were built
   */
  public Iterator<BinaryRule> ruleIteratorByLeftChild(int state) {
    if (state >= rulesWithLC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithLC[state].iterator();
  }

  /**
   * Returns the rules with the given parent state. For a known state this
   * is the internal list itself, not a copy.
   *
   * @param state The parent state number
   * @return The matching rules; an empty list if the state number is beyond
   *     the states known when the tables were built
   */
  public List<BinaryRule> ruleListByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.emptyList();
    }
    return rulesWithParent[state];
  }

  /**
   * Returns the rules with the given right child state. For a known state
   * this is the internal list itself, not a copy.
   *
   * @param state The right child state number
   * @return The matching rules; an empty list if the state number is beyond
   *     the states known when the tables were built
   */
  public List<BinaryRule> ruleListByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.emptyList();
    }
    return rulesWithRC[state];
  }

  /**
   * Returns the rules with the given left child state. For a known state
   * this is the internal list itself, not a copy.
   *
   * @param state The left child state number
   * @return The matching rules; an empty list if the state number is beyond
   *     the states known when the tables were built
   */
  public List<BinaryRule> ruleListByLeftChild(int state) {
    // Bounds-check the array that is actually indexed below (this used to
    // test rulesWithRC.length; both arrays are allocated with equal length).
    if (state >= rulesWithLC.length) {
      return Collections.emptyList();
    }
    return rulesWithLC[state];
  }

  /* ----
  public Set<BinaryRule> ruleSetByRightChild(int state) {
    if (state >= ruleSetWithRC.length) {
      return Collections.<BinaryRule>emptySet();
    }
    return ruleSetWithRC[state];
  }

  public Set<BinaryRule> ruleSetByLeftChild(int state) {
    if (state >= ruleSetWithLC.length) {
      return Collections.<BinaryRule>emptySet();
    }
    return ruleSetWithLC[state];
  }
  --- */


  /**
   * Restores the serialized fields and then rebuilds the transient
   * per-state tables, the rule map and the split-rule arrays from the
   * list of all rules.
   *
   * @param stream The stream the object is read from
   * @throws IOException If reading from the stream fails
   * @throws ClassNotFoundException If a serialized class cannot be found
   */
  private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
    stream.defaultReadObject();
    init();
    // NOTE(review): unlike addRule(), this does not refill ruleSetWithLC /
    // ruleSetWithRC; they are left empty after deserialization. Nothing in
    // this class currently reads them - confirm before relying on them.
    for (BinaryRule br : allRules) {
      rulesWithParent[br.parent].add(br);
      rulesWithLC[br.leftChild].add(br);
      rulesWithRC[br.rightChild].add(br);
      ruleMap.put(br, br);
    }
    splitRules();
  }

  /**
   * Allocates empty per-state tables and an empty rule map, sized by the
   * current number of states in the index.
   */
  @SuppressWarnings("unchecked") // generic array creation; the arrays never escape this class
  private void init() {
    ruleMap = Generics.newHashMap();
    int numStates = index.size();
    rulesWithParent = new List[numStates];
    rulesWithLC = new List[numStates];
    rulesWithRC = new List[numStates];
    ruleSetWithLC = new Set[numStates];
    ruleSetWithRC = new Set[numStates];
    for (int s = 0; s < numStates; s++) {
      rulesWithParent[s] = new ArrayList<BinaryRule>();
      rulesWithLC[s] = new ArrayList<BinaryRule>();
      rulesWithRC[s] = new ArrayList<BinaryRule>();
      ruleSetWithLC[s] = Generics.newHashSet();
      ruleSetWithRC[s] = Generics.newHashSet();
    }
  }

  /**
   * Creates an empty grammar over the given states. The per-state tables
   * are sized from the index at the time of this call.
   *
   * @param stateIndex The index of grammar states
   */
  public BinaryGrammar(Index<String> stateIndex) {
    this.index = stateIndex;
    allRules = new ArrayList<BinaryRule>();
    init();
  }

  /**
   * Populates data in this BinaryGrammar from the character stream
   * given by the reader {@code in}. One rule is read per line, and reading
   * stops at the end of the stream or at the first empty line. Afterwards
   * {@link #splitRules()} is called.
   *
   * @param in Where input is read from
   * @throws IOException If format is bung; the underlying exception is
   *     attached as the cause
   */
  public void readData(BufferedReader in) throws IOException {
    //if (Test.verbose) System.err.println(">> readData");
    String line;
    int lineNum = 1;
    line = in.readLine();
    while (line != null && line.length() > 0) {
      try {
        addRule(new BinaryRule(line, index));
      } catch (Exception e) {
        // keep the original failure as the cause so it can be diagnosed
        throw new IOException("Error on line " + lineNum, e);
      }
      lineNum++;
      line = in.readLine();
    }
    splitRules();
  }

  /**
   * Writes out data from this Object to the Writer w, one rule per line.
   * The writer is flushed but not closed.
   *
   * @param w Where output is written
   * @throws IOException If data can't be written
   */
  public void writeData(Writer w) throws IOException {
    PrintWriter out = new PrintWriter(w);
    for (BinaryRule br : this) {
      out.println(br.toString(index));
    }
    out.flush();
    // PrintWriter never throws on write failure; it only records the error,
    // so surface it here to honour the declared IOException.
    if (out.checkError()) {
      throw new IOException("Error writing binary grammar");
    }
  }

  private static final long serialVersionUID = 1L;

} // end class BinaryGrammar