// edu.stanford.nlp.parser.shiftreduce.State Maven / Gradle / Ivy
//
// Go to download
package edu.stanford.nlp.parser.shiftreduce;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Scored;
import edu.stanford.nlp.util.TreeShapedStack;

/**
 * A class which encodes the current state of the parsing.  This can
 * be used either for direct search or for beam search.
 * <br>
 * Important information which needs to be encoded:
 * <ul>
 * <li>A stack.  This needs to be updatable in O(1) time to keep the
 * parser's run time linear.  This is done by using a linked-list type
 * stack in which new states are created by the <code>push</code>
 * operation.
 * <li>A queue.  This also needs to be updatable in O(1) time.  This
 * is accomplished by having all the states share the same list of
 * queued items, with different states only changing an index into the
 * queue.
 * <li>The score of the current state.  This is useful in beam searches.
 * <li>Whether or not the current state is "finalized".  If so, the
 * only thing that can be done from now on is to idle.
 * </ul>
 * All fields are <code>final</code>; a State never reassigns them
 * after construction.
 */
public class State implements Scored {
  /**
   * Expects a list of preterminals.  The preterminals should be built
   * with CoreLabels and have HeadWord and HeadTag annotations set.
   * <br>
   * The new state has an empty stack, an empty transition history,
   * token position 0, score 0.0 and is not finished.  The separator
   * map is computed from the sentence by {@link #findSeparators}.
   *
   * @param sentence the tagged words to parse, one preterminal per word
   */
  public State(List<Tree> sentence) {
    this(new TreeShapedStack<>(), new TreeShapedStack<>(), findSeparators(sentence), sentence, 0, 0.0, false);
  }

  /**
   * Builds a state from explicitly supplied components.  The arguments
   * are stored as given; nothing is copied.
   *
   * @param stack the Tree pieces assembled so far
   * @param transitions the transition sequence that led to this state
   * @param separators map from token index to the separator found there
   * @param sentence the preterminals being parsed
   * @param tokenPosition index of the next unshifted item in the sentence
   * @param score score of this state
   * @param finished whether processing has finished
   */
  // NOTE(review): transitions is left as a raw TreeShapedStack because
  // its element type is not declared anywhere in this file; it should
  // be parameterized with the package's transition type.
  State(TreeShapedStack<Tree> stack, TreeShapedStack transitions, TreeMap<Integer, String> separators,
        List<Tree> sentence, int tokenPosition, double score, boolean finished) {
    this.stack = stack;
    this.transitions = transitions;
    this.separators = separators;
    this.sentence = sentence;
    this.tokenPosition = tokenPosition;
    this.score = score;
    this.finished = finished;
  }

  /**
   * The stack of Tree pieces we have already assembled.
   */
  final TreeShapedStack<Tree> stack;

  /**
   * The transition sequence used to get to the current position.
   * NOTE(review): raw type, see the note on the constructor.
   */
  final TreeShapedStack transitions;

  /**
   * Used to describe the relative location of separators to the head of a subtree
   */
  public enum HeadPosition { NONE, LEFT, RIGHT, BOTH, HEAD }

  /**
   * A description of where the separators such as ,;:- are in a
   * subtree, relative to the head of the subtree.  Keys are indices
   * into the sentence; values are the text of the leaf at that index
   * (see {@link #findSeparators}).
   */
  final TreeMap<Integer, String> separators;

  /**
   * Returns the stack element <code>depth</code> positions below the
   * top (0 is the top), or null if the stack has no such element.
   */
  Tree getStackNode(int depth) {
    if (depth >= stack.size()) {
      return null;
    }
    // Walk down the linked stack; pop() yields the stack below the
    // current top without modifying this state's own stack.
    TreeShapedStack<Tree> node = stack;
    for (int i = 0; i < depth; ++i) {
      node = node.pop();
    }
    return node.peek();
  }

  /**
   * Returns the queue element <code>depth</code> positions past the
   * current token position (0 is the next item to be shifted), or
   * null if that would run past the end of the sentence.
   */
  Tree getQueueNode(int depth) {
    int position = tokenPosition + depth;
    if (position >= sentence.size()) {
      return null;
    }
    return sentence.get(position);
  }

  /**
   * Returns the first separator between two nodes or returns null if
   * such a thing does not exist.  The nodes are given as stack
   * depths, as understood by {@link #getStackNode}.
   *
   * @throws AssertionError if <code>right &gt;= left</code>
   */
  String getSeparatorBetween(int right, int left) {
    if (right >= left) {
      throw new AssertionError("Expected right < left");
    }
    return getSeparatorBetween(getStackNode(right), getStackNode(left));
  }

  /**
   * Returns the first character of the first separator whose index is
   * at or after the head index of <code>left</code> and at or before
   * the head index of <code>right</code>.  Returns null if either node
   * is null or no separator falls in that range.
   * <br>
   * Both ends of the range are inclusive here, whereas
   * {@link #getSeparatorCount(Tree, Tree)} excludes both head
   * positions.  The difference is kept as is; see the TODO in
   * {@link #findSeparators} about existing models.
   */
  String getSeparatorBetween(Tree right, Tree left) {
    if (right == null || left == null) {
      return null;
    }
    int leftHead = ShiftReduceUtils.headIndex(left);
    int rightHead = ShiftReduceUtils.headIndex(right);
    // Least separator index >= leftHead, if any.
    Map.Entry<Integer, String> nextSeparator = separators.ceilingEntry(leftHead);
    if (nextSeparator == null || nextSeparator.getKey() > rightHead) {
      return null;
    }
    return nextSeparator.getValue().substring(0, 1);
  }

  /**
   * Returns the separator count between two nodes
   * (0 if any of the nodes don't exist).  The nodes are given as
   * stack depths, as understood by {@link #getStackNode}.
   *
   * @throws AssertionError if <code>right &gt;= left</code>
   */
  int getSeparatorCount(int right, int left) {
    if (right >= left) {
      throw new AssertionError("Expected right < left");
    }
    return getSeparatorCount(getStackNode(right), getStackNode(left));
  }

  /**
   * Counts the separators whose index lies strictly between the head
   * index of <code>left</code> and the head index of
   * <code>right</code>.  Returns 0 if either node is null.
   */
  int getSeparatorCount(Tree right, Tree left) {
    if (right == null || left == null) {
      return 0;
    }
    int leftHead = ShiftReduceUtils.headIndex(left);
    int rightHead = ShiftReduceUtils.headIndex(right);
    int count = 0;
    // higherKey is strict, so a separator sitting exactly on either
    // head is not counted.
    Integer nextSeparator = separators.higherKey(leftHead);
    while (nextSeparator != null && nextSeparator < rightHead) {
      ++count;
      nextSeparator = separators.higherKey(nextSeparator);
    }
    return count;
  }

  /**
   * Describes where separators sit inside the subtree found
   * <code>nodeNum</code> positions below the top of the stack.
   * <ul>
   * <li>null if the stack has no such element
   * <li>HEAD if the head index of the subtree is itself a separator
   * <li>BOTH, LEFT or RIGHT if a separator lies between the subtree's
   * left index and its head, between its head and its right index,
   * or both
   * <li>NONE otherwise
   * </ul>
   */
  HeadPosition getSeparator(int nodeNum) {
    if (nodeNum >= stack.size()) {
      return null;
    }
    TreeShapedStack<Tree> cursor = this.stack;
    for (int i = 0; i < nodeNum; ++i) {
      cursor = cursor.pop();
    }
    Tree node = cursor.peek();
    int head = ShiftReduceUtils.headIndex(node);
    if (separators.get(head) != null) {
      return HeadPosition.HEAD;
    }
    // From here on the head itself is known not to be a separator, so
    // floorKey / ceilingKey can only find strictly lower / higher keys.
    int left = ShiftReduceUtils.leftIndex(node);
    Integer nextLeft = separators.floorKey(head);
    boolean hasLeft = (nextLeft != null && nextLeft >= left);

    int right = ShiftReduceUtils.rightIndex(node);
    Integer nextRight = separators.ceilingKey(head);
    boolean hasRight = (nextRight != null && nextRight <= right);

    if (hasLeft && hasRight) {
      return HeadPosition.BOTH;
    } else if (hasLeft) {
      return HeadPosition.LEFT;
    } else if (hasRight) {
      return HeadPosition.RIGHT;
    } else {
      return HeadPosition.NONE;
    }
  }

  /**
   * Matches a token made up entirely of the characters , ; : -
   */
  static final Pattern separatorRegex = Pattern.compile("^[,;:-]+$");

  /**
   * Pairs of (variant, canonical) characters.  The variant is replaced
   * by the canonical form before a token is tested against
   * {@link #separatorRegex}.
   */
  static final char[][] equivalentSeparators = { { '，', ',' },
                                                 { '；', ';' },
                                                 { '：', ':' } };

  /**
   * Scans the sentence for separator tokens.  The word of each
   * preterminal is taken from its first child.  A word counts as a
   * separator if, after mapping the {@link #equivalentSeparators}
   * variants to their canonical characters, it matches
   * {@link #separatorRegex}.
   *
   * @param sentence the preterminals to scan
   * @return a map from sentence index to the original (not normalized)
   *         text of the separator found at that index
   */
  static TreeMap<Integer, String> findSeparators(List<Tree> sentence) {
    TreeMap<Integer, String> separators = Generics.newTreeMap();
    for (int index = 0; index < sentence.size(); ++index) {
      Tree leaf = sentence.get(index).children()[0];
      String value = leaf.value();
      for (char[] equivalentSeparator : equivalentSeparators) {
        value = value.replace(equivalentSeparator[0], equivalentSeparator[1]);
      }
      if (separatorRegex.matcher(value).matches()) {
        // TODO: put "value" instead?  Perhaps do this next time we rebuild all models
        separators.put(index, leaf.value());
      }
    }
    return separators;
  }

  /**
   * The words we are parsing.  They need to be tagged before we can
   * parse.  The words are stored as preterminal Trees whose only
   * nodes are the tag node and the word node.
   */
  final List<Tree> sentence;

  /**
   * Essentially, the position in the queue part of the state.
   * 0 represents that we are at the start of the queue and nothing
   * has been shifted yet.
   */
  final int tokenPosition;

  /**
   * The score of the current state based on the transitions that were
   * used to create it.
   */
  final double score;

  @Override
  public double score() { return score; }

  /**
   * Whether or not processing has finished.  Once that is true, only
   * idle transitions are allowed.
   */
  final boolean finished;

  public boolean isFinished() { return finished; }

  /**
   * Whether the token position has reached the end of the sentence,
   * i.e. nothing is left in the queue.
   */
  public boolean endOfQueue() {
    return tokenPosition == sentence.size();
  }

  /**
   * A multi-line, human-readable summary of the tokens, token
   * position, stack contents, transitions, score and finished flag.
   */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder();
    result.append("State summary\n");
    result.append("  Tokens: ").append(sentence).append("\n");
    result.append("  Token position: ").append(tokenPosition).append("\n");
    result.append("  Current stack contents: ").append(stack.toString("\n")).append("\n");
    result.append("  Component transitions: ").append(transitions).append("\n");
    result.append("  Score: ").append(score).append("\n");
    result.append("  ").append(finished ? "" : "not ").append("finished\n");
    return result.toString();
  }

  /**
   * Whether or not the transitions that built the two states are
   * equal.  Doesn't check anything else.  Useful for training using
   * an agenda, for example, when you know the underlying information
   * such as the words are the same and all you care about checking is
   * the transition sequence
   */
  public boolean areTransitionsEqual(State other) {
    return transitions.equals(other.transitions);
  }
}