edu.stanford.nlp.trees.tregex.DescriptionPattern Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.trees.tregex;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.function.Function;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.ArrayStringFilter;
import edu.stanford.nlp.util.Pair;
import java.util.function.Predicate;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DescriptionPattern extends TregexPattern {
/** A logger for this class. Never reassigned, so it is declared final. */
private static final Redwood.RedwoodChannels log = Redwood.channels(DescriptionPattern.class);
/**
 * The strategy used to compare a node label with the description of a
 * pattern.  The declaration order of the constants is kept as it was.
 */
enum DescriptionMode {
  /** The label is searched with a compiled regular expression. */
  PATTERN,
  /** The label is tested against a small set of strings, without a regex. */
  STRINGS,
  /** The label must be equal to one fixed string. */
  EXACT,
  /** Every non-null label is accepted. */
  ANYTHING
}
private final Relation rel;
private final boolean negDesc;
private final DescriptionMode descriptionMode;
private final String exactMatch;
private final Pattern descPattern;
private final Predicate stringFilter;
// what size string matchers to use before switching to regex for
// disjunction matches
private static final int MAX_STRING_MATCHER_SIZE = 8;
private final String stringDesc;
/** The name to give the matched node */
private final String name;
/** If this pattern is a link, this is the node linked to */
private final String linkedName;
private final boolean isLink;
// todo: conceptually final, but we'd need to rewrite TregexParser
// to make it so.
private TregexPattern child;
// also conceptually final, but it depends on the child
private final List> variableGroups; // specifies the groups in a regex that are captured as matcher-global string variables
private final Function basicCatFunction;
/**
 * Used to detect regex expressions which can be simplified to exact matches.
 * Exactly one of the three capture groups is non-null after a successful match.
 * <br>
 * The single-character form deliberately rejects regex metacharacters: an
 * expression such as /^.$/ means "any one character", not the literal ".",
 * so it must be left to the regular expression engine.
 */
private static final Pattern SINGLE_WORD_PATTERN = Pattern.compile(
    "/\\^([^.\\\\|^$*+?()\\[\\]{}])\\$/" + "|" + // for example, /^:$/
    "/\\^\\[(.)\\]\\$/" + "|" + // for example, /^[$]$/
    "/\\^([-a-zA-Z']+)\\$/"); // for example, /^-NONE-$/

/** Detects an anchored, non-capturing disjunction of plain words, for example /^(?:NP|VP)$/ */
private static final Pattern MULTI_WORD_PATTERN = Pattern.compile("/\\^\\(\\?\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

/** Detects an anchored, case-insensitive disjunction of plain words, for example /^(?i:np|vp)$/ */
private static final Pattern CASE_INSENSITIVE_PATTERN = Pattern.compile("/\\^\\(\\?i\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

/**
 * Used to detect regex expressions which can be simplified to prefix matches.
 * <br>
 * The ungrouped form accepts a single word only.  In /^JJ|NN/ the anchor
 * binds to the first alternative alone ("starts with JJ, or contains NN"),
 * which is not a prefix test, so a disjunction is only simplified when it
 * is wrapped in a non-capturing group.
 */
private static final Pattern PREFIX_PATTERN = Pattern.compile(
    "/\\^([-a-zA-Z]+)\\/" + "|" + // for example, /^JJ/
    "/\\^\\(\\?\\:([-a-zA-Z|]+)\\)\\/"); // for example, /^(?:JJ|NN)/
/**
 * Builds a description node and chooses the cheapest way to test labels
 * against {@code desc}: accept anything, exact string comparison, a small
 * string set, or a compiled regular expression.
 *
 * @param rel the relation used to find candidate nodes
 * @param negDesc whether the description is negated
 * @param desc the description text (a plain label, a {@code |}-separated
 *             disjunction, a slash-delimited regex or {@code __}), or null
 *             for a node that is only a backreference or link
 * @param name the name to give the matched node, may be null
 * @param useBasicCat whether {@code basicCatFunction} should be applied to labels
 * @param basicCatFunction the label transformation, ignored unless {@code useBasicCat}
 * @param variableGroups (regex group number, variable name) pairs
 * @param isLink whether this node is a link to {@code linkedName}
 * @param linkedName the name of the linked node, may be null
 * @throws AssertionError if {@code desc}, {@code name} and {@code linkedName} are all null
 */
public DescriptionPattern(Relation rel, boolean negDesc, String desc,
                          String name, boolean useBasicCat,
                          Function<String, String> basicCatFunction,
                          List<Pair<Integer,String>> variableGroups,
                          boolean isLink, String linkedName) {
  this.rel = rel;
  this.negDesc = negDesc;
  this.isLink = isLink;
  this.linkedName = linkedName;

  // Everything not selected below stays null, including the mode itself
  // when there is no description at all.
  DescriptionMode mode = null;
  Pattern regex = null;
  String exact = null;
  Predicate<String> filter = null;

  if (desc == null) {
    if (name == null && linkedName == null) {
      throw new AssertionError("Illegal description pattern. Does not describe a node or link/name a variable");
    }
    stringDesc = " ";
  } else {
    stringDesc = desc;

    // A disjunction is described by its words, the kind of string filter
    // to build from them, and the regex to fall back on if there are too
    // many words for a string filter to pay off.
    String[] words = null;
    ArrayStringFilter.Mode filterMode = null;
    String fallbackRegex = null;

    Matcher m;
    if (desc.equals("__") || desc.equals("/.*/") || desc.equals("/^.*$/")) {
      mode = DescriptionMode.ANYTHING;
    } else if ((m = SINGLE_WORD_PATTERN.matcher(desc)).matches()) {
      // Expressions are written like this to put special characters
      // in the tregex matcher, but a regular expression is less
      // efficient than a simple string match
      mode = DescriptionMode.EXACT;
      exact = firstMatchedGroup(m);
    } else if ((m = MULTI_WORD_PATTERN.matcher(desc)).matches()) {
      words = firstMatchedGroup(m).replaceAll("\\\\", "").split("[|]");
      filterMode = ArrayStringFilter.Mode.EXACT;
      fallbackRegex = desc.substring(1, desc.length() - 1);
    } else if ((m = CASE_INSENSITIVE_PATTERN.matcher(desc)).matches()) {
      words = firstMatchedGroup(m).replaceAll("\\\\", "").split("[|]");
      filterMode = ArrayStringFilter.Mode.CASE_INSENSITIVE;
      fallbackRegex = desc.substring(1, desc.length() - 1);
    } else if ((m = PREFIX_PATTERN.matcher(desc)).matches()) {
      words = firstMatchedGroup(m).split("[|]");
      filterMode = ArrayStringFilter.Mode.PREFIX;
      fallbackRegex = desc.substring(1, desc.length() - 1);
    } else if (desc.matches("/.*/")) {
      // any other slash-delimited description is an ordinary regex
      mode = DescriptionMode.PATTERN;
      regex = Pattern.compile(desc.substring(1, desc.length() - 1));
    } else if (desc.indexOf('|') >= 0) {
      // patterns which contain ORs are a special case; we either
      // promote those to regex match or make a string matcher out
      // of them.  for short enough disjunctions, a simple string
      // matcher can be more efficient than a regex.
      words = desc.split("[|]");
      filterMode = ArrayStringFilter.Mode.EXACT;
      fallbackRegex = "^(?:" + desc + ")$";
    } else { // raw description
      mode = DescriptionMode.EXACT;
      exact = desc;
    }

    if (words != null) {
      if (words.length <= MAX_STRING_MATCHER_SIZE) {
        mode = DescriptionMode.STRINGS;
        filter = new ArrayStringFilter(filterMode, words);
      } else {
        mode = DescriptionMode.PATTERN;
        regex = Pattern.compile(fallbackRegex);
      }
    }
  }

  this.descriptionMode = mode;
  this.descPattern = regex;
  this.exactMatch = exact;
  this.stringFilter = filter;
  this.name = name;
  setChild(null);
  this.basicCatFunction = (useBasicCat ? basicCatFunction : null);
  this.variableGroups = variableGroups;
}

/** Returns the first non-null capture group of an already matched matcher, or null if there is none. */
private static String firstMatchedGroup(Matcher matcher) {
  for (int i = 1; i <= matcher.groupCount(); ++i) {
    String group = matcher.group(i);
    if (group != null) {
      return group;
    }
  }
  return null;
}
/**
 * Creates a copy of {@code oldPattern} that differs only in its relation.
 * Every other field, including the child pattern, is taken over unchanged
 * (the child is shared, not copied).
 *
 * @param newRelation the relation of the new pattern
 * @param oldPattern the pattern whose description, names and child are reused
 */
public DescriptionPattern(Relation newRelation, DescriptionPattern oldPattern) {
  // the one thing that changes
  rel = newRelation;

  // how labels are tested
  descriptionMode = oldPattern.descriptionMode;
  exactMatch = oldPattern.exactMatch;
  descPattern = oldPattern.descPattern;
  stringFilter = oldPattern.stringFilter;
  stringDesc = oldPattern.stringDesc;
  negDesc = oldPattern.negDesc;
  basicCatFunction = oldPattern.basicCatFunction;

  // naming, linking and variables
  name = oldPattern.name;
  isLink = oldPattern.isLink;
  linkedName = oldPattern.linkedName;
  variableGroups = oldPattern.variableGroups;

  setChild(oldPattern.child);
}
/**
 * Renders this node alone, without its child: the relation, a space, an
 * optional {@code !} for a negated description, an optional {@code @} when
 * a label function is in use, the description, and {@code =name} if the
 * node is named.
 */
@Override
public String localString() {
  StringBuilder text = new StringBuilder();
  text.append(rel.toString()).append(' ');
  if (negDesc) {
    text.append("!");
  }
  if (basicCatFunction != null) {
    text.append("@");
  }
  text.append(stringDesc);
  if (name != null) {
    text.append('=').append(name);
  }
  return text.toString();
}
/**
 * Renders this node together with its child, if any.  The node and its
 * child are wrapped in parentheses when a child is present.
 */
@Override
public String toString() {
  final boolean hasChild = (child != null);
  final StringBuilder text = new StringBuilder();

  // markers that apply to the whole node
  text.append(isNegated() ? "!" : "");
  text.append(isOptional() ? "?" : "");
  text.append(rel.toString()).append(' ');
  text.append(hasChild ? "(" : "");

  // the description with its own markers
  text.append(negDesc ? "!" : "");
  text.append(basicCatFunction != null ? "@" : "");
  text.append(stringDesc);
  if (isLink) {
    text.append('~').append(linkedName);
  }
  if (name != null) {
    text.append('=').append(name);
  }
  text.append(' ');

  if (hasChild) {
    text.append(child.toString()).append(')');
  }
  return text.toString();
}
/**
 * Replaces the single child pattern of this node.
 *
 * @param n the new child, or null for no child
 */
public void setChild(TregexPattern n) {
  this.child = n;
}
/**
 * Returns the children of this node: an empty list when there is no child,
 * otherwise a one-element list holding the child.  Both lists are the
 * immutable ones provided by {@link Collections}.
 */
@Override
public List<TregexPattern> getChildren() {
  if (child != null) {
    return Collections.singletonList(child);
  }
  return Collections.emptyList();
}
/**
 * Creates a matcher that applies this pattern below {@code tree}.  All
 * arguments are handed to the new {@code DescriptionMatcher} unchanged.
 *
 * @param root the root of the tree being searched
 * @param tree the node the match is attempted from
 * @param nodesToParents map handed through to the matcher
 * @param namesToNodes the nodes named so far, keyed by name
 * @param variableStrings the string variables captured so far
 * @param headFinder head finder handed through to the matcher
 * @return a new matcher for this pattern
 */
@Override
public TregexMatcher matcher(Tree root, Tree tree,
                             IdentityHashMap<Tree, Tree> nodesToParents,
                             Map<String, Tree> namesToNodes,
                             VariableStrings variableStrings,
                             HeadFinder headFinder) {
  return new DescriptionMatcher(this, root, tree, nodesToParents,
                                namesToNodes, variableStrings, headFinder);
}
// TODO: Why is this a static class with a pointer to the containing
// class? There seems to be no reason for such a thing.
// cdm: agree: It seems like it should just be a non-static inner class. Try this and check it works....
private static class DescriptionMatcher extends TregexMatcher {
/** Supplies the candidate nodes; created lazily and set back to null by resetChildIter(). */
private Iterator<Tree> treeNodeMatchCandidateIterator;
/** The pattern node this matcher evaluates. */
private final DescriptionPattern myNode;
// a DescriptionMatcher only has a single child; if it is the left
// side of multiple relations, a CoordinationMatcher is used.
// childMatcher is null until the first time a matcher needs to check the child
// myNode.child == null OR resetChild has never been called
private TregexMatcher childMatcher;
private Tree nextTreeNodeMatchCandidate; // the Tree node that this DescriptionMatcher node is trying to match on.
private boolean finished = false; // when finished = true, it means I have exhausted my potential tree node match candidates.
/** For a node without a child: whether the current candidate has already been reported as a match. */
private boolean matchedOnce = false;
/** Whether this matcher currently has variable strings set that it must unset again. */
private boolean committedVariables = false;
/**
 * Creates a matcher for pattern node {@code n}.  Nothing is searched yet:
 * the candidate iterator and the child matcher are created on demand.
 *
 * @param n the pattern node to evaluate
 * @param root the root of the tree being searched
 * @param tree the node the match is attempted from
 * @param nodesToParents map passed on to the superclass
 * @param namesToNodes the nodes named so far, keyed by name
 * @param variableStrings the string variables captured so far
 * @param headFinder head finder passed on to the superclass
 */
public DescriptionMatcher(DescriptionPattern n, Tree root, Tree tree,
                          IdentityHashMap<Tree, Tree> nodesToParents,
                          Map<String, Tree> namesToNodes,
                          VariableStrings variableStrings,
                          HeadFinder headFinder) {
  super(root, tree, nodesToParents, namesToNodes, variableStrings, headFinder);
  myNode = n;
  // no need to reset anything - everything starts out as null or false.
  // lazy initialization of children to save time.
}
/**
 * Puts this matcher back into its initial, not-yet-searched state: releases
 * committed variables and the node name, forgets the current candidate and
 * the candidate iterator, and resets the child matcher if one exists.
 */
@Override
void resetChildIter() {
  decommitVariableGroups();
  removeNamedNodes();

  nextTreeNodeMatchCandidate = null;
  finished = false;
  // the iterator is rebuilt lazily, which saves quite a bit of time in
  // use cases where something other than matches() is called
  treeNodeMatchCandidateIterator = null;

  if (childMatcher == null) {
    return;
  }
  // the child has to clean up any preexisting data as well
  childMatcher.resetChildIter();
}
/**
 * Prepares the child for a new candidate node.  An existing child matcher
 * is re-rooted at the current candidate.  Without a child matcher, the
 * "already matched" flag is cleared, but only if the pattern has no child
 * at all (otherwise the child matcher simply has not been created yet).
 */
private void resetChild() {
  if (childMatcher != null) {
    childMatcher.resetChildIter(nextTreeNodeMatchCandidate);
    return;
  }
  if (myNode.child == null) {
    matchedOnce = false;
  }
}
/**
 * Goes to the next node in the tree that is a successful match to my
 * description pattern.  On return, {@code finished} is false if and only if
 * such a node was found; it is then stored in
 * {@code nextTreeNodeMatchCandidate}, the child has been reset for it, and
 * the node name and variable strings have been recorded.
 * <br>
 * This is the hotspot method in running tregex, but not clear how to make
 * it faster.
 */
private void goToNextTreeNodeMatch() {
  decommitVariableGroups(); // make sure variable groups are free.
  removeNamedNodes(); // if we named a node, it should now be unnamed
  finished = true; // only set back to false when a candidate is accepted
  Matcher m = null;
  String value = null;
  if (treeNodeMatchCandidateIterator == null) {
    treeNodeMatchCandidateIterator = myNode.rel.searchNodeIterator(tree, this);
  }
  while (treeNodeMatchCandidateIterator.hasNext()) {
    nextTreeNodeMatchCandidate = treeNodeMatchCandidateIterator.next();
    if (myNode.descriptionMode == null) {
      // this is a backreference or link
      if (myNode.isLink) {
        // a link matches a node whose (possibly transformed) label
        // equals that of the linked node
        Tree otherTree = namesToNodes.get(myNode.linkedName);
        if (otherTree != null) {
          String otherValue = myNode.basicCatFunction == null ? otherTree.value() : myNode.basicCatFunction.apply(otherTree.value());
          String myValue = myNode.basicCatFunction == null ? nextTreeNodeMatchCandidate.value() : myNode.basicCatFunction.apply(nextTreeNodeMatchCandidate.value());
          if (otherValue.equals(myValue)) {
            finished = false;
            break;
          }
        }
      } else if (namesToNodes.get(myNode.name) == nextTreeNodeMatchCandidate) {
        // a backreference matches only the very node that carries the name
        finished = false;
        break;
      }
    } else { // try to match the description pattern.
      // cdm: Nov 2006: Check for null label, just make found false
      boolean found;
      value = nextTreeNodeMatchCandidate.value();
      if (value == null) {
        found = false;
      } else {
        if (myNode.basicCatFunction != null) {
          value = myNode.basicCatFunction.apply(value);
        }
        switch (myNode.descriptionMode) {
          case EXACT:
            found = value.equals(myNode.exactMatch);
            break;
          case PATTERN:
            m = myNode.descPattern.matcher(value);
            found = m.find();
            break;
          case ANYTHING:
            found = true;
            break;
          case STRINGS:
            found = myNode.stringFilter.test(value);
            break;
          default:
            throw new IllegalArgumentException("Unexpected match mode");
        }
      }
      if (found) {
        // if variables have been captured from a regex, they must match any previous matchings
        for (Pair<Integer,String> varGroup : myNode.variableGroups) {
          String thisVariable = varGroup.second();
          String thisVarString = variableStrings.getString(thisVariable);
          if (m != null) {
            if (thisVarString != null &&
                !thisVarString.equals(m.group(varGroup.first()))) {
              // failed to match a variable
              found = false;
              break;
            }
          } else {
            if (thisVarString != null &&
                !thisVarString.equals(value)) {
              // here we treat any variable group # as a match
              found = false;
              break;
            }
          }
        }
      }
      // a negated description accepts exactly the nodes that were not found
      if (found != myNode.negDesc) {
        finished = false;
        break;
      }
    }
  }
  if (!finished) { // I successfully matched.
    resetChild(); // reset my unique TregexMatcher child based on the Tree node I successfully matched at.
    // cdm bugfix jul 2009: a pure backreference (no description and not a
    // link) refers to an already named node, and the map should _not_ be
    // updated for it
    if ((myNode.descriptionMode != null || myNode.isLink) && myNode.name != null) {
      // note: have to fill in the map as we go for backreferencing
      namesToNodes.put(myNode.name, nextTreeNodeMatchCandidate);
    }
    // NOTE(review): with a negated regex description, m can be a matcher
    // whose find() failed; committing from it would throw if variable
    // groups are present.  Presumably that combination cannot be built -
    // confirm against the parser.
    if (m != null) {
      // commit variable groups using a matcher, meaning
      // it extracts the expressions from that matcher
      commitVariableGroups(m);
    } else if (value != null) {
      // commit using a set string (all groups are treated as the string)
      commitVariableGroups(value);
    }
  }
  // finished is false exiting this if and only if nextChild exists
  // and has a label or backreference that matches
  // (also it will just have been reset)
}
/**
 * Commits all variable groups of this node, taking each variable's string
 * from the corresponding capture group of {@code m}.
 *
 * @param m a matcher positioned on a match of the description regex
 */
private void commitVariableGroups(Matcher m) {
  committedVariables = true; // commit all my variable groups.
  for (Pair<Integer,String> varGroup : myNode.variableGroups) {
    String thisVarString = m.group(varGroup.first());
    variableStrings.setVar(varGroup.second(), thisVarString);
  }
}
/**
 * Commits all variable groups of this node without a regex match: every
 * variable is set to {@code value}, whatever its group number.
 *
 * @param value the label text to store for every variable
 */
private void commitVariableGroups(String value) {
  committedVariables = true;
  for (Pair<Integer,String> varGroup : myNode.variableGroups) {
    variableStrings.setVar(varGroup.second(), value);
  }
}
/**
 * Unsets every variable of this node if, and only if, this matcher has
 * committed them; afterwards nothing is marked as committed.
 */
private void decommitVariableGroups() {
  if (committedVariables) {
    for (Pair<Integer,String> varGroup : myNode.variableGroups) {
      variableStrings.unsetVar(varGroup.second());
    }
  }
  committedVariables = false;
}
/**
 * Removes this node's name from the name map, provided the node has a name
 * and is the one that defines it.  A node without a description that is
 * not a link is a pure backreference, and its name is left alone.
 */
private void removeNamedNodes() {
  if (myNode.name == null) {
    return;
  }
  boolean pureBackreference = myNode.descriptionMode == null && !myNode.isLink;
  if (!pureBackreference) {
    namesToNodes.remove(myNode.name);
  }
}
/**
 * Tries to match the unique child of the DescriptionPattern node against
 * the current candidate node.
 * <br>
 * Called from matches() while finished is false, so either no candidate
 * has been selected yet, or the current candidate has a label or
 * backreference that matches.
 *
 * @return true if the child matched; for a node without a child, true the
 *         first time it is asked about a candidate and false afterwards
 */
private boolean matchChild() {
  // not initialized yet, so the child certainly can't be matched yet
  if (nextTreeNodeMatchCandidate == null) {
    return false;
  }

  // lazy initialization of the child matcher
  if (childMatcher == null && myNode.child != null) {
    childMatcher = myNode.child.matcher(root, nextTreeNodeMatchCandidate, nodesToParents, namesToNodes, variableStrings, headFinder);
  }

  if (childMatcher != null) {
    return childMatcher.matches();
  }

  // no child at all: the candidate itself is the match, reported once
  boolean firstRequest = !matchedOnce;
  matchedOnce = true;
  return firstRequest;
}
/**
 * Finds the next local match.
 *
 * @return true if another match was found, or, for a negated node, if no
 *         candidate matched at all; an optional node also returns true
 *         when its candidates are exhausted
 */
@Override
public boolean matches() {
  // this is necessary so that a negated/optional node matches only once
  if (finished) {
    return false;
  }

  while (!finished) {
    if (!matchChild()) {
      goToNextTreeNodeMatch();
      continue;
    }
    if (myNode.isNegated()) {
      // negated node only has to fail once
      finished = true;
      return false; // cannot be optional and negated
    }
    if (myNode.isOptional()) {
      finished = true;
    }
    return true;
  }

  // the candidates are exhausted: couldn't match my relation/pattern
  if (myNode.isNegated()) {
    return true; // ... which is a success for a negated node
  }
  decommitVariableGroups();
  removeNamedNodes();
  nextTreeNodeMatchCandidate = null;
  // didn't match, but return true anyway if optional
  return myNode.isOptional();
}
/**
 * Returns the tree node this matcher is currently positioned on, or null
 * if no candidate is selected.
 */
@Override
public Tree getMatch() {
  return this.nextTreeNodeMatchCandidate;
}
} // end class DescriptionMatcher
// Explicit serialVersionUID for Java serialization.
// NOTE(review): presumably a supertype is Serializable - it is not visible in this file; confirm.
private static final long serialVersionUID = 1179819056757295757L;
}