edu.stanford.nlp.parser.lexparser.BinaryGrammar Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;

import java.io.*;
import java.util.*;

/**
 * Maintains efficient indexing of binary grammar rules.
 * <p>
 * Every rule is stored once in a master list and is additionally indexed by
 * its parent state, its left-child state and its right-child state. Only the
 * state index and the master list are serialized; all per-state indexes are
 * transient and are rebuilt from the master list on deserialization.
 * <p>
 * {@link #splitRules()} must have been called before
 * {@link #isSynthetic(int)}, {@link #splitRulesWithLC(int)} or
 * {@link #splitRulesWithRC(int)} are used. {@link #readData(BufferedReader)}
 * and deserialization call it automatically; after adding rules with
 * {@link #addRule(BinaryRule)} the caller has to call it.
 *
 * @author Dan Klein
 * @author Christopher Manning (generified and optimized storage)
 */
public class BinaryGrammar implements Serializable, Iterable<BinaryRule> {

  private static final long serialVersionUID = 1L;

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(BinaryGrammar.class);

  /** Maps state numbers to state names; its size fixes the length of every per-state array. */
  private final Index<String> index;

  /** Master list of all rules, in insertion order. This is what gets serialized. */
  private final List<BinaryRule> allRules;

  // Per-state indexes, each of length index.size() as of the last init().
  private transient List<BinaryRule>[] rulesWithParent;
  private transient List<BinaryRule>[] rulesWithLC;
  private transient List<BinaryRule>[] rulesWithRC;
  // Filled by addRule() only. No live method of this class reads these sets
  // (their accessors are commented out below).
  private transient Set<BinaryRule>[] ruleSetWithLC;
  private transient Set<BinaryRule>[] ruleSetWithRC;
  // Array views built by splitRules(); null until it has been called.
  private transient BinaryRule[][] splitRulesWithLC;
  private transient BinaryRule[][] splitRulesWithRC;
  /** Maps a rule to the stored rule that equals it, so that its score can be looked up. */
  private transient Map<BinaryRule,BinaryRule> ruleMap;
  // for super speed! (maybe)
  /** Per-state synthetic flag, computed by splitRules(); null until it has been called. */
  private transient boolean[] synthetic;


  /**
   * Creates an empty grammar over the given state index.
   *
   * @param stateIndex The index of states; the per-state rule indexes are
   *                   sized from its size at the time of this call
   */
  public BinaryGrammar(Index<String> stateIndex) {
    this.index = stateIndex;
    allRules = new ArrayList<>();
    init();
  }

  /**
   * Allocates the rule map and an empty list/set per state for each of the
   * per-state indexes.
   */
  @SuppressWarnings("unchecked")
  private void init() {
    ruleMap = Generics.newHashMap();
    int numStates = index.size();
    // Generic array creation is not allowed, hence the raw arrays (and the
    // unchecked suppression). Only BinaryRule collections are ever stored.
    rulesWithParent = new List[numStates];
    rulesWithLC = new List[numStates];
    rulesWithRC = new List[numStates];
    ruleSetWithLC = new Set[numStates];
    ruleSetWithRC = new Set[numStates];
    for (int s = 0; s < numStates; s++) {
      rulesWithParent[s] = new ArrayList<>();
      rulesWithLC[s] = new ArrayList<>();
      rulesWithRC[s] = new ArrayList<>();
      ruleSetWithLC[s] = Generics.newHashSet();
      ruleSetWithRC[s] = Generics.newHashSet();
    }
  }

  /**
   * Rebuilds the transient indexes after the serialized fields have been read.
   */
  private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
    stream.defaultReadObject();
    init();
    // NOTE(review): unlike addRule(), this does not fill ruleSetWithLC /
    // ruleSetWithRC. Nothing in this class reads them, so they are left empty
    // here rather than paying for the extra sets.
    for (BinaryRule br : allRules) {
      indexRule(br);
    }
    splitRules();
  }

  /** Enters a rule into the per-state rule lists and the rule map. */
  private void indexRule(BinaryRule br) {
    rulesWithParent[br.parent].add(br);
    rulesWithLC[br.leftChild].add(br);
    rulesWithRC[br.rightChild].add(br);
    ruleMap.put(br, br);
  }


  /**
   * @return The number of rules in this grammar
   */
  public int numRules() {
    return allRules.size();
  }

  /**
   * @return The list of all rules. This is the grammar's own backing list,
   *         not a copy.
   */
  public List<BinaryRule> rules() {
    return allRules;
  }

  /**
   * Says whether a state is synthetic, as determined by the last call to
   * {@link #splitRules()}.
   *
   * @param state The state number
   * @return true if the state's name begins with '@' or the state has no name
   */
  public boolean isSynthetic(int state) {
    return synthetic[state];
  }

  /**
   * Decides from a state name whether the state is synthetic: the name is
   * missing altogether or begins with '@'. An empty name is not synthetic.
   */
  private static boolean isSyntheticName(String name) {
    if (name == null) {
      return true;
    }
    return !name.isEmpty() && name.charAt(0) == '@';
  }

  /**
   * Populates the "splitRules" accessor lists using the existing rule lists.
   * If the state is synthetic, these lists contain all rules for the state.
   * If the state is NOT synthetic, these lists contain only the rules in
   * which both children are not synthetic.
   * <p>
   * This method must be called before the grammar is
   * used, either after training or deserializing grammar.
   */
  public void splitRules() {
    int numStates = index.size();

    // first initialize the synthetic array
    synthetic = new boolean[numStates];
    for (int s = 0; s < numStates; s++) {
      synthetic[s] = isSyntheticName(index.get(s));
    }

    splitRulesWithLC = new BinaryRule[numStates][];
    splitRulesWithRC = new BinaryRule[numStates][];
    // rules accessed by their "synthetic" child or left child if none
    for (int state = 0; state < numStates; state++) {
      if (isSynthetic(state)) {
        // cdm 2012: I thought sorting the rules might help with speed (memory locality) but didn't seem to
        splitRulesWithLC[state] = rulesWithLC[state].toArray(new BinaryRule[rulesWithLC[state].size()]);
        splitRulesWithRC[state] = rulesWithRC[state].toArray(new BinaryRule[rulesWithRC[state].size()]);
      } else {
        // if state is not synthetic, we add rule to splitRules only if both children are not synthetic
        // do left: this state is the left child, so only the right child needs checking
        List<BinaryRule> ruleList = new ArrayList<>();
        for (BinaryRule br : rulesWithLC[state]) {
          if ( ! isSynthetic(br.rightChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithLC[state] = ruleList.toArray(new BinaryRule[ruleList.size()]);
        // do right: this state is the right child, so only the left child needs checking
        ruleList.clear();
        for (BinaryRule br : rulesWithRC[state]) {
          if ( ! isSynthetic(br.leftChild)) {
            ruleList.add(br);
          }
        }
        splitRulesWithRC[state] = ruleList.toArray(new BinaryRule[ruleList.size()]);
      }
    }
  }

  /**
   * Returns the array built by {@link #splitRules()} for rules having the
   * given state as left child. No bounds check is done on the state.
   *
   * @param state The left child state
   * @return The internal array of rules (not a copy)
   */
  public BinaryRule[] splitRulesWithLC(int state) {
    return splitRulesWithLC[state];
  }

  /**
   * Returns the array built by {@link #splitRules()} for rules having the
   * given state as right child. No bounds check is done on the state.
   *
   * @param state The right child state
   * @return The internal array of rules (not a copy)
   */
  public BinaryRule[] splitRulesWithRC(int state) {
    return splitRulesWithRC[state];
  }

  // the sensible version

  /**
   * Looks up the score of the stored rule that equals the given rule.
   *
   * @param br The rule to look up
   * @return The score of the matching rule in this grammar, or
   *         {@code Double.NEGATIVE_INFINITY} if there is no such rule
   */
  public double scoreRule(BinaryRule br) {
    BinaryRule rule = ruleMap.get(br);
    return (rule != null ? rule.score : Double.NEGATIVE_INFINITY);
  }

  /**
   * Adds a rule to the grammar and to all of its indexes. The split rule
   * arrays are not updated; call {@link #splitRules()} once all rules have
   * been added.
   *
   * @param br The rule to add; its parent and child states must be smaller
   *           than the size the state index had when this grammar's indexes
   *           were allocated
   */
  public void addRule(BinaryRule br) {
    indexRule(br);
    ruleSetWithLC[br.leftChild].add(br);
    ruleSetWithRC[br.rightChild].add(br);
    allRules.add(br);
  }


  /**
   * @return An iterator over all rules, in the order of the master list
   */
  @Override
  public Iterator<BinaryRule> iterator() {
    return allRules.iterator();
  }

  /**
   * @param state The parent state
   * @return An iterator over the rules with this parent; an empty iterator
   *         if the state is at or beyond the size the indexes were allocated with
   */
  public Iterator<BinaryRule> ruleIteratorByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithParent[state].iterator();
  }

  /**
   * @param state The right child state
   * @return An iterator over the rules with this right child; an empty iterator
   *         if the state is at or beyond the size the indexes were allocated with
   */
  public Iterator<BinaryRule> ruleIteratorByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithRC[state].iterator();
  }

  /**
   * @param state The left child state
   * @return An iterator over the rules with this left child; an empty iterator
   *         if the state is at or beyond the size the indexes were allocated with
   */
  public Iterator<BinaryRule> ruleIteratorByLeftChild(int state) {
    if (state >= rulesWithLC.length) {
      return Collections.<BinaryRule>emptyList().iterator();
    }
    return rulesWithLC[state].iterator();
  }

  /**
   * @param state The parent state
   * @return The internal list of rules with this parent (not a copy); an empty
   *         list if the state is at or beyond the size the indexes were allocated with
   */
  public List<BinaryRule> ruleListByParent(int state) {
    if (state >= rulesWithParent.length) {
      return Collections.emptyList();
    }
    return rulesWithParent[state];
  }

  /**
   * @param state The right child state
   * @return The internal list of rules with this right child (not a copy); an empty
   *         list if the state is at or beyond the size the indexes were allocated with
   */
  public List<BinaryRule> ruleListByRightChild(int state) {
    if (state >= rulesWithRC.length) {
      return Collections.emptyList();
    }
    return rulesWithRC[state];
  }

  /**
   * @param state The left child state
   * @return The internal list of rules with this left child (not a copy); an empty
   *         list if the state is at or beyond the size the indexes were allocated with
   */
  public List<BinaryRule> ruleListByLeftChild(int state) {
    // Check the array that is actually indexed below.
    if (state >= rulesWithLC.length) {
      return Collections.emptyList();
    }
    return rulesWithLC[state];
  }

  /* ----
  public Set<BinaryRule> ruleSetByRightChild(int state) {
    if (state >= ruleSetWithRC.length) {
      return Collections.emptySet();
    }
    return ruleSetWithRC[state];
  }

  public Set<BinaryRule> ruleSetByLeftChild(int state) {
    if (state >= ruleSetWithLC.length) {
      return Collections.emptySet();
    }
    return ruleSetWithLC[state];
  }
  --- */


  /**
   * Populates data in this BinaryGrammar from the character stream
   * given by the reader {@code in}. One rule is read per line; reading stops
   * at the end of the stream or at the first empty line. Afterwards
   * {@link #splitRules()} is called.
   *
   * @param in Where input is read from
   * @throws IOException If format is bung (the message names the offending
   *         line and the underlying exception is attached as the cause) or
   *         if reading fails
   */
  public void readData(BufferedReader in) throws IOException {
    int lineNum = 1;
    for (String line = in.readLine(); line != null && line.length() > 0; line = in.readLine()) {
      try {
        addRule(new BinaryRule(line, index));
      } catch (Exception e) {
        // Keep the original exception as the cause so the real reason is not lost.
        throw new IOException("Error on line " + lineNum, e);
      }
      lineNum++;
    }
    splitRules();
  }

  /**
   * Writes out data from this Object to the Writer w, one rule per line.
   * The writer is flushed but not closed.
   *
   * @param w Where output is written
   * @throws IOException If data can't be written
   */
  public void writeData(Writer w) throws IOException {
    PrintWriter out = new PrintWriter(w);
    for (BinaryRule br : this) {
      out.println(br.toString(index));
    }
    // PrintWriter swallows write failures; checkError() flushes the stream
    // and reports whether any write (or the flush) has failed.
    if (out.checkError()) {
      throw new IOException("Error writing binary grammar rules");
    }
  }

} // end class BinaryGrammar