edu.stanford.nlp.parser.shiftreduce.State Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.parser.shiftreduce;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Scored;
import edu.stanford.nlp.util.TreeShapedStack;
/**
 * A class which encodes the current state of the parsing.  This can
 * be used either for direct search or for beam search.
 * <br>
 * Important information which needs to be encoded:
 * <ul>
 * <li>A stack.  This needs to be updatable in O(1) time to keep the
 * parser's run time linear.  This is done by using a linked-list type
 * stack in which new states are created by the {@code push}
 * operation.
 * </li>
 * <li>A queue.  This also needs to be updatable in O(1) time.  This
 * is accomplished by having all the states share the same list of
 * queued items, with different states only changing an index into the
 * queue.
 * </li>
 * <li>The score of the current state.  This is useful in beam searches.
 * </li>
 * <li>Whether or not the current state is "finalized".  If so, the
 * only thing that can be done from now on is to idle.
 * </li>
 * </ul>
 */
public class State implements Scored {
  /**
   * Expects a list of preterminals.  The preterminals should be built
   * with CoreLabels and have HeadWord and HeadTag annotations set.
   * <br>
   * The new state starts with an empty stack, an empty transition
   * history, token position 0, score 0.0 and is not finished.
   *
   * @param sentence the tagged words to parse, one preterminal tree per word
   */
  public State(List<Tree> sentence) {
    // NOTE(review): the transition stack is left raw because its element type
    // is not visible in this file; parameterize it once the type is confirmed.
    this(new TreeShapedStack<>(), new TreeShapedStack(), findSeparators(sentence), sentence, 0, 0.0, false);
  }

  /**
   * Builds a state directly from its components.  No copies are made;
   * the arguments are stored as given.
   *
   * @param stack the tree pieces assembled so far
   * @param transitions the transition sequence which led to this state
   * @param separators token index to separator text, see {@link #findSeparators(List)}
   * @param sentence the preterminals being parsed
   * @param tokenPosition index of the next unshifted token in {@code sentence}
   * @param score the score accumulated so far
   * @param finished whether parsing has finished
   */
  State(TreeShapedStack<Tree> stack, TreeShapedStack transitions, TreeMap<Integer, String> separators,
        List<Tree> sentence, int tokenPosition, double score, boolean finished) {
    this.stack = stack;
    this.transitions = transitions;
    this.separators = separators;
    this.sentence = sentence;
    this.tokenPosition = tokenPosition;
    this.score = score;
    this.finished = finished;
  }

  /**
   * The stack of Tree pieces we have already assembled.
   */
  final TreeShapedStack<Tree> stack;

  /**
   * The transition sequence used to get to the current position.
   * NOTE(review): kept as a raw type; the element type is declared
   * outside this file.
   */
  final TreeShapedStack transitions;

  /**
   * Used to describe the relative location of separators to the head of a subtree
   */
  public enum HeadPosition { NONE, LEFT, RIGHT, BOTH, HEAD }

  /**
   * A description of where the separators such as ,;:- are in a
   * subtree, relative to the head of the subtree.  Keys are token
   * indices into the sentence, values are the separator text.
   */
  final TreeMap<Integer, String> separators;

  /**
   * Returns the tree {@code depth} items below the top of the stack
   * (0 is the top), or null if the stack does not have that many items.
   */
  Tree getStackNode(int depth) {
    if (depth >= stack.size()) {
      return null;
    }
    TreeShapedStack<Tree> node = stack;
    for (int i = 0; i < depth; ++i) {
      node = node.pop();
    }
    return node.peek();
  }

  /**
   * Returns the preterminal {@code depth} tokens past the current
   * queue position, or null if that runs off the end of the sentence.
   */
  Tree getQueueNode(int depth) {
    int position = tokenPosition + depth;
    if (position >= sentence.size()) {
      return null;
    }
    return sentence.get(position);
  }

  /**
   * Returns the first separator between two nodes or returns null if
   * such a thing does not exist.  The nodes are given as stack depths.
   *
   * @throws AssertionError if {@code right >= left}
   */
  String getSeparatorBetween(int right, int left) {
    if (right >= left) {
      throw new AssertionError("Expected right < left");
    }
    return getSeparatorBetween(getStackNode(right), getStackNode(left));
  }

  /**
   * Returns the first character of the first separator whose index is
   * at or after the head of {@code left} and at or before the head of
   * {@code right}.  Returns null if either node is null or there is
   * no such separator.
   * <br>
   * Note the bounds here are inclusive, whereas
   * {@link #getSeparatorCount(Tree, Tree)} uses exclusive bounds.
   */
  String getSeparatorBetween(Tree right, Tree left) {
    if (right == null || left == null) {
      return null;
    }
    int leftHead = ShiftReduceUtils.headIndex(left);
    int rightHead = ShiftReduceUtils.headIndex(right);
    Map.Entry<Integer, String> nextSeparator = separators.ceilingEntry(leftHead);
    if (nextSeparator == null || nextSeparator.getKey() > rightHead) {
      return null;
    }
    return nextSeparator.getValue().substring(0, 1);
  }

  /**
   * Returns the separator count between two nodes
   * (0 if any of the nodes don't exist).  The nodes are given as
   * stack depths.
   *
   * @throws AssertionError if {@code right >= left}
   */
  int getSeparatorCount(int right, int left) {
    if (right >= left) {
      throw new AssertionError("Expected right < left");
    }
    return getSeparatorCount(getStackNode(right), getStackNode(left));
  }

  /**
   * Counts the separators whose index lies strictly between the head
   * of {@code left} and the head of {@code right}.  Returns 0 if
   * either node is null.
   */
  int getSeparatorCount(Tree right, Tree left) {
    if (right == null || left == null) {
      return 0;
    }
    int leftHead = ShiftReduceUtils.headIndex(left);
    int rightHead = ShiftReduceUtils.headIndex(right);
    int count = 0;
    for (Integer key = separators.higherKey(leftHead);
         key != null && key < rightHead;
         key = separators.higherKey(key)) {
      ++count;
    }
    return count;
  }

  /**
   * Describes where separators occur inside the subtree found
   * {@code nodeNum} items below the top of the stack, relative to that
   * subtree's head.
   *
   * @return null if the stack does not have that many items;
   *   {@code HEAD} if the head itself is a separator; otherwise
   *   {@code LEFT}, {@code RIGHT}, {@code BOTH} or {@code NONE}
   *   depending on whether a separator lies between the left edge and
   *   the head and/or between the head and the right edge
   */
  HeadPosition getSeparator(int nodeNum) {
    if (nodeNum >= stack.size()) {
      return null;
    }
    Tree node = getStackNode(nodeNum);
    int head = ShiftReduceUtils.headIndex(node);
    if (separators.get(head) != null) {
      return HeadPosition.HEAD;
    }

    // head is known not to be a separator here, so floor/ceiling
    // find the nearest separator strictly to either side of it
    int left = ShiftReduceUtils.leftIndex(node);
    Integer nextLeft = separators.floorKey(head);
    boolean hasLeft = (nextLeft != null && nextLeft >= left);

    int right = ShiftReduceUtils.rightIndex(node);
    Integer nextRight = separators.ceilingKey(head);
    boolean hasRight = (nextRight != null && nextRight <= right);

    if (hasLeft && hasRight) {
      return HeadPosition.BOTH;
    } else if (hasLeft) {
      return HeadPosition.LEFT;
    } else if (hasRight) {
      return HeadPosition.RIGHT;
    } else {
      return HeadPosition.NONE;
    }
  }

  /** Matches a word made up entirely of the separator characters ,;:- */
  static final Pattern separatorRegex = Pattern.compile("^[,;:-]+$");

  /**
   * Pairs of { variant, canonical } characters.  Each variant is
   * replaced by its canonical form before a word is tested against
   * {@link #separatorRegex}.  The variants are the fullwidth comma
   * (U+FF0C), semicolon (U+FF1B) and colon (U+FF1A); an identity
   * mapping of the ASCII characters onto themselves would make the
   * normalization a no-op.
   */
  static final char[][] equivalentSeparators = { { '\uFF0C', ',' },
                                                 { '\uFF1B', ';' },
                                                 { '\uFF1A', ':' } };

  /**
   * Scans the sentence for separator words.  The word of each
   * preterminal is taken from its first child.
   *
   * @param sentence the preterminals to scan
   * @return a map from token index to the word at that index, for
   *   every word which matches {@link #separatorRegex} after
   *   normalization with {@link #equivalentSeparators}
   */
  static TreeMap<Integer, String> findSeparators(List<Tree> sentence) {
    TreeMap<Integer, String> separators = Generics.newTreeMap();
    for (int index = 0; index < sentence.size(); ++index) {
      Tree leaf = sentence.get(index).children()[0];
      String value = leaf.value();
      for (char[] pair : equivalentSeparators) {
        value = value.replace(pair[0], pair[1]);
      }
      if (separatorRegex.matcher(value).matches()) {
        // TODO: put "value" instead?  Perhaps do this next time we rebuild all models
        separators.put(index, leaf.value());
      }
    }
    return separators;
  }

  /**
   * The words we are parsing.  They need to be tagged before we can
   * parse.  The words are stored as preterminal Trees whose only
   * nodes are the tag node and the word node.
   */
  final List<Tree> sentence;

  /**
   * Essentially, the position in the queue part of the state.
   * 0 represents that we are at the start of the queue and nothing
   * has been shifted yet.
   */
  final int tokenPosition;

  /**
   * The score of the current state based on the transitions that were
   * used to create it.
   */
  final double score;

  /** Returns the score stored in this state. */
  @Override
  public double score() { return score; }

  /**
   * Whether or not processing has finished.  Once that is true, only
   * idle transitions are allowed.
   */
  final boolean finished;

  /** Returns whether processing has finished. */
  public boolean isFinished() { return finished; }

  /** Returns true when the token position has reached the end of the sentence. */
  public boolean endOfQueue() {
    return tokenPosition == sentence.size();
  }

  /**
   * Builds a multi-line, human readable summary of the tokens, queue
   * position, stack, transitions, score and finished flag.
   */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder();
    result.append("State summary\n");
    result.append(" Tokens: ").append(sentence).append("\n");
    result.append(" Token position: ").append(tokenPosition).append("\n");
    result.append(" Current stack contents: ").append(stack.toString("\n")).append("\n");
    result.append(" Component transitions: ").append(transitions).append("\n");
    result.append(" Score: ").append(score).append("\n");
    result.append(" ").append(finished ? "" : "not ").append("finished\n");
    return result.toString();
  }

  /**
   * Whether or not the transitions that built the two states are
   * equal.  Doesn't check anything else.  Useful for training using
   * an agenda, for example, when you know the underlying information
   * such as the words are the same and all you care about checking is
   * the transition sequence
   */
  public boolean areTransitionsEqual(State other) {
    return transitions.equals(other.transitions);
  }
}