All downloads are free. The search and download functions use the official Maven repository.

edu.stanford.nlp.trees.tregex.DescriptionPattern Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees.tregex; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.function.Function;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.ArrayStringFilter;
import edu.stanford.nlp.util.Pair;
import java.util.function.Predicate;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DescriptionPattern extends TregexPattern  {

  /** A logger for this class. Assigned exactly once, hence final. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(DescriptionPattern.class);

  /**
   * Strategy used to compare a tree node's value with this pattern's description.
   * The constant order is kept stable so ordinals do not change.
   */
  enum DescriptionMode {
    /** Compare with a compiled regular expression. */
    PATTERN,
    /** Compare against a small set of strings through a string filter. */
    STRINGS,
    /** Compare by string equality with a single value. */
    EXACT,
    /** Accept every node whose value is not null. */
    ANYTHING
  }

  private final Relation rel;
  private final boolean negDesc;

  private final DescriptionMode descriptionMode;
  private final String exactMatch;
  private final Pattern descPattern;
  private final Predicate stringFilter;

  // what size string matchers to use before switching to regex for
  // disjunction matches
  private static final int MAX_STRING_MATCHER_SIZE = 8;

  private final String stringDesc;
  /** The name to give the matched node */
  private final String name;
  /** If this pattern is a link, this is the node linked to */
  private final String linkedName;
  private final boolean isLink;
  // todo: conceptually final, but we'd need to rewrite TregexParser
  // to make it so.
  private TregexPattern child;
  // also conceptually final, but it depends on the child
  private final List> variableGroups; // specifies the groups in a regex that are captured as matcher-global string variables

  private final Function basicCatFunction;

  /**
   * Used to detect regex expressions which can be simplified to exact matches:
   * a fully anchored regex whose body is one character, one bracketed character,
   * or a run of letters, hyphens and apostrophes.  Each alternative has its own
   * capturing group, so exactly one group is non-null after a successful match.
   */
  private static final Pattern SINGLE_WORD_PATTERN = Pattern.compile("/\\^(.)\\$/" + "|" + // for example, /^:$/
                                                                     "/\\^\\[(.)\\]\\$/" + "|" + // for example, /^[$]$/
                                                                     "/\\^([-a-zA-Z']+)\\$/"); // for example, /^-NONE-$/

  /**
   * Used to detect fully anchored non-capturing disjunctions, for example /^(?:NN|NNS)$/,
   * whose alternatives contain only letters, hyphens and backslash-escaped dollar signs.
   * Group 1 is the disjunction body, still containing the escaping backslashes.
   */
  private static final Pattern MULTI_WORD_PATTERN = Pattern.compile("/\\^\\(\\?\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

  /**
   * Same shape as {@code MULTI_WORD_PATTERN} but with the case-insensitive flag,
   * for example /^(?i:foo|bar)$/.  Group 1 is the disjunction body.
   */
  private static final Pattern CASE_INSENSITIVE_PATTERN = Pattern.compile("/\\^\\(\\?i\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

  /**
   * Used to detect regex expressions which can be simplified to prefix matches:
   * anchored at the start only, with a body of letters, hyphens and '|'.
   * Either group 1 or group 2 holds the body, depending on the alternative matched.
   */
  private static final Pattern PREFIX_PATTERN = Pattern.compile("/\\^([-a-zA-Z|]+)\\/" + "|" + // for example, /^JJ/
                                                                "/\\^\\(\\?\\:([-a-zA-Z|]+)\\)\\/"); // for example, /^(?:JJ|NN)/

  /**
   * Builds a description node and picks the cheapest way to test node values
   * against {@code desc}:
   * <ul>
   *   <li>{@code __}, {@code /.*&#47;} and {@code /^.*$/} accept anything;</li>
   *   <li>anchored single-word regexes become an exact string comparison;</li>
   *   <li>anchored, case-insensitive or prefix disjunctions of at most
   *       {@code MAX_STRING_MATCHER_SIZE} alternatives become a string filter,
   *       longer ones stay regexes;</li>
   *   <li>any other {@code /.../} description is compiled as a regex;</li>
   *   <li>a bare description containing {@code |} becomes a string filter or,
   *       when too long, an anchored regex;</li>
   *   <li>everything else is an exact string comparison.</li>
   * </ul>
   *
   * @param rel relation used to enumerate candidate nodes
   * @param negDesc whether the description is negated
   * @param desc the description text, or null for a pure backreference / link
   * @param name name to bind the matched node to, may be null
   * @param useBasicCat whether {@code basicCatFunction} should be applied to node values
   * @param basicCatFunction maps a node value to its basic category; ignored unless {@code useBasicCat}
   * @param variableGroups (regex group number, variable name) pairs captured as string variables
   * @param isLink whether this node links to the node named {@code linkedName}
   * @param linkedName name of the linked node, may be null
   * @throws AssertionError if {@code desc}, {@code name} and {@code linkedName} are all null
   * @throws java.util.regex.PatternSyntaxException if a regex description does not compile
   */
  public DescriptionPattern(Relation rel, boolean negDesc, String desc,
                            String name, boolean useBasicCat,
                            Function<String, String> basicCatFunction,
                            List<Pair<Integer,String>> variableGroups,
                            boolean isLink, String linkedName) {
    this.rel = rel;
    this.negDesc = negDesc;
    this.isLink = isLink;
    this.linkedName = linkedName;

    // Resolved by the branches below and copied into the final fields exactly
    // once afterwards, so each branch only sets what it actually uses.
    DescriptionMode mode = null;
    Pattern regex = null;
    String exact = null;
    Predicate<String> filter = null;

    if (desc == null) {
      if (name == null && linkedName == null) {
        throw new AssertionError("Illegal description pattern.  Does not describe a node or link/name a variable");
      }
      stringDesc = " ";
    } else {
      stringDesc = desc;
      if (desc.equals("__") || desc.equals("/.*/") || desc.equals("/^.*$/")) {
        mode = DescriptionMode.ANYTHING;
      } else if (SINGLE_WORD_PATTERN.matcher(desc).matches()) {
        // Expressions are written like this to put special characters
        // in the tregex matcher, but a regular expression is less
        // efficient than a simple string match
        mode = DescriptionMode.EXACT;
        exact = firstCapturedGroup(SINGLE_WORD_PATTERN, desc);
      } else if (MULTI_WORD_PATTERN.matcher(desc).matches()) {
        // strip the backslashes that escaped '$' inside the regex
        String[] words = firstCapturedGroup(MULTI_WORD_PATTERN, desc).replaceAll("\\\\", "").split("[|]");
        if (words.length > MAX_STRING_MATCHER_SIZE) {
          mode = DescriptionMode.PATTERN;
          regex = compileSlashDelimited(desc);
        } else {
          mode = DescriptionMode.STRINGS;
          filter = new ArrayStringFilter(ArrayStringFilter.Mode.EXACT, words);
        }
      } else if (CASE_INSENSITIVE_PATTERN.matcher(desc).matches()) {
        String[] words = firstCapturedGroup(CASE_INSENSITIVE_PATTERN, desc).replaceAll("\\\\", "").split("[|]");
        if (words.length > MAX_STRING_MATCHER_SIZE) {
          mode = DescriptionMode.PATTERN;
          regex = compileSlashDelimited(desc);
        } else {
          mode = DescriptionMode.STRINGS;
          filter = new ArrayStringFilter(ArrayStringFilter.Mode.CASE_INSENSITIVE, words);
        }
      } else if (PREFIX_PATTERN.matcher(desc).matches()) {
        String[] words = firstCapturedGroup(PREFIX_PATTERN, desc).split("[|]");
        if (words.length > MAX_STRING_MATCHER_SIZE) {
          mode = DescriptionMode.PATTERN;
          regex = compileSlashDelimited(desc);
        } else {
          mode = DescriptionMode.STRINGS;
          filter = new ArrayStringFilter(ArrayStringFilter.Mode.PREFIX, words);
        }
      } else if (desc.matches("/.*/")) {
        mode = DescriptionMode.PATTERN;
        regex = compileSlashDelimited(desc);
      } else if (desc.indexOf('|') >= 0) {
        // patterns which contain ORs are a special case; we either
        // promote those to regex match or make a string matcher out
        // of them.  for short enough disjunctions, a simple string
        // matcher can be more efficient than a regex.
        String[] words = desc.split("[|]");
        if (words.length <= MAX_STRING_MATCHER_SIZE) {
          mode = DescriptionMode.STRINGS;
          filter = new ArrayStringFilter(ArrayStringFilter.Mode.EXACT, words);
        } else {
          mode = DescriptionMode.PATTERN;
          regex = Pattern.compile("^(?:" + desc + ")$");
        }
      } else { // raw description
        mode = DescriptionMode.EXACT;
        exact = desc;
      }
    }

    this.descriptionMode = mode;
    this.descPattern = regex;
    this.exactMatch = exact;
    this.stringFilter = filter;
    this.name = name;
    setChild(null);
    this.basicCatFunction = (useBasicCat ? basicCatFunction : null);
    this.variableGroups = variableGroups;
  }

  /**
   * Returns the first non-null capturing group obtained by matching
   * {@code desc} in full against {@code pattern}, or null when the pattern
   * does not match or no group participated.
   */
  private static String firstCapturedGroup(Pattern pattern, String desc) {
    Matcher matcher = pattern.matcher(desc);
    if (!matcher.matches()) {
      return null;
    }
    for (int i = 1; i <= matcher.groupCount(); ++i) {
      String group = matcher.group(i);
      if (group != null) {
        return group;
      }
    }
    return null;
  }

  /** Compiles the regex found between the first and last character (the slashes) of {@code desc}. */
  private static Pattern compileSlashDelimited(String desc) {
    return Pattern.compile(desc.substring(1, desc.length() - 1));
  }

  /**
   * Creates a pattern equal to {@code oldPattern} in every respect except the
   * relation, which is replaced by {@code newRelation}.  The child pattern and
   * the variable-group list are shared with {@code oldPattern}, not copied.
   *
   * @param newRelation relation for the new pattern
   * @param oldPattern pattern whose description, names and child are reused
   */
  public DescriptionPattern(Relation newRelation, DescriptionPattern oldPattern) {
    // the one thing that differs from the source pattern
    rel = newRelation;

    // link / negation flags
    negDesc = oldPattern.negDesc;
    isLink = oldPattern.isLink;
    linkedName = oldPattern.linkedName;

    // description and the already-resolved way of matching it
    stringDesc = oldPattern.stringDesc;
    descriptionMode = oldPattern.descriptionMode;
    descPattern = oldPattern.descPattern;
    exactMatch = oldPattern.exactMatch;
    stringFilter = oldPattern.stringFilter;

    name = oldPattern.name;
    setChild(oldPattern.child);
    basicCatFunction = oldPattern.basicCatFunction;
    variableGroups = oldPattern.variableGroups;
  }

  /**
   * Renders this node alone, without its child: relation, a space, then the
   * optional {@code !} and {@code @} markers, the description and, when the
   * node is named, {@code =name}.
   */
  @Override
  public String localString() {
    StringBuilder text = new StringBuilder();
    text.append(rel.toString());
    text.append(' ');
    if (negDesc) {
      text.append("!");
    }
    if (basicCatFunction != null) {
      text.append("@");
    }
    text.append(stringDesc);
    if (name != null) {
      text.append('=').append(name);
    }
    return text.toString();
  }

  /**
   * Renders this node and its child.  Layout: optional {@code !} / {@code ?}
   * for a negated / optional node, the relation, a space, then the description
   * with its markers ({@code !}, {@code @}, {@code ~link}, {@code =name}) and a
   * trailing space.  When a child exists the description and the child are
   * wrapped in parentheses.
   */
  @Override
  public String toString() {
    StringBuilder out = new StringBuilder();
    if (isNegated()) {
      out.append('!');
    }
    if (isOptional()) {
      out.append('?');
    }
    out.append(rel.toString()).append(' ');

    final boolean hasChild = (child != null);
    if (hasChild) {
      out.append('(');
    }
    if (negDesc) {
      out.append('!');
    }
    if (basicCatFunction != null) {
      out.append('@');
    }
    out.append(stringDesc);
    if (isLink) {
      out.append('~').append(linkedName);
    }
    if (name != null) {
      out.append('=').append(name);
    }
    out.append(' ');
    if (hasChild) {
      out.append(child.toString()).append(')');
    }
    return out.toString();
  }

  /**
   * Replaces the single child pattern of this node.
   *
   * @param n the new child, or null for no child
   */
  public void setChild(TregexPattern n) {
    this.child = n;
  }

  /**
   * Returns the child pattern as an immutable list.
   *
   * @return an empty list when there is no child, otherwise a one-element list
   */
  @Override
  public List<TregexPattern> getChildren() {
    if (child == null) {
      return Collections.emptyList();
    }
    return Collections.singletonList(child);
  }

  /**
   * Creates a matcher for this pattern; all arguments are forwarded unchanged
   * to the {@code DescriptionMatcher} constructor.
   *
   * @param root root of the tree being searched
   * @param tree node handed to the relation when enumerating candidates
   * @param nodesToParents node-to-parent map, passed through to the matcher
   * @param namesToNodes bindings from node names to matched nodes, shared across matchers
   * @param variableStrings string variables captured so far, shared across matchers
   * @param headFinder head finder passed through to the matcher
   * @return a fresh matcher that has not attempted any match yet
   */
  @Override
  public TregexMatcher matcher(Tree root, Tree tree,
                               IdentityHashMap<Tree, Tree> nodesToParents,
                               Map<String, Tree> namesToNodes,
                               VariableStrings variableStrings,
                               HeadFinder headFinder) {
    return new DescriptionMatcher(this, root, tree, nodesToParents,
                                  namesToNodes, variableStrings, headFinder);
  }

  // TODO: Why is this a static class with a pointer to the containing
  // class?  There seems to be no reason for such a thing.
  // cdm: agree: It seems like it should just be a non-static inner class.  Try this and check it works....
  private static class DescriptionMatcher extends TregexMatcher {
    /** Candidate nodes produced by the relation; created lazily by goToNextTreeNodeMatch(). */
    private Iterator<Tree> treeNodeMatchCandidateIterator;
    /** The pattern node this matcher evaluates. */
    private final DescriptionPattern myNode;

    // a DescriptionMatcher only has a single child; if it is the left
    // side of multiple relations, a CoordinationMatcher is used.

    // childMatcher stays null when myNode.child == null, and otherwise
    // until matchChild() first needs to check the child.
    private TregexMatcher childMatcher;
    private Tree nextTreeNodeMatchCandidate; // the Tree node that this DescriptionMatcher node is trying to match on.
    private boolean finished = false; // when finished = true, it means I have exhausted my potential tree node match candidates.
    /** Only used when there is no child: whether the current candidate has already been reported once. */
    private boolean matchedOnce = false;
    /** Whether this matcher currently has variable groups set in variableStrings. */
    private boolean committedVariables = false;


    /**
     * Creates a matcher for pattern node {@code n}.  Nothing is searched yet:
     * the candidate iterator and the child matcher are created on first use.
     *
     * @param n the pattern node to evaluate
     * @param root root of the tree being searched
     * @param tree node handed to the relation when enumerating candidates
     * @param nodesToParents node-to-parent map, passed to the superclass
     * @param namesToNodes bindings from node names to matched nodes
     * @param variableStrings string variables captured so far
     * @param headFinder head finder, passed to the superclass
     */
    public DescriptionMatcher(DescriptionPattern n, Tree root, Tree tree,
                              IdentityHashMap<Tree, Tree> nodesToParents,
                              Map<String, Tree> namesToNodes,
                              VariableStrings variableStrings,
                              HeadFinder headFinder) {
      super(root, tree, nodesToParents, namesToNodes, variableStrings, headFinder);
      myNode = n;
      // no need to reset anything - everything starts out as null or false.
      // lazy initialization of children to save time.
    }

    /**
     * Puts this matcher back into its not-yet-searched state: releases the
     * variables and node name it had bound, forgets the candidate iterator and
     * the current candidate, and asks an existing child matcher to do the same.
     */
    @Override
    void resetChildIter() {
      // undo whatever the previous match published
      decommitVariableGroups();
      removeNamedNodes();

      // lazy initialization saves quite a bit of time in use cases
      // where we call something other than matches(), so the iterator
      // is only dropped here and rebuilt on demand
      nextTreeNodeMatchCandidate = null;
      treeNodeMatchCandidateIterator = null;
      finished = false;

      // the child must clean up any preexisting data as well
      if (childMatcher != null) {
        childMatcher.resetChildIter();
      }
    }

    /**
     * Prepares child matching for a freshly matched candidate node.  An
     * existing child matcher is reset onto the new candidate; when the pattern
     * has no child at all, the "already reported" flag is cleared instead.
     * If the pattern has a child whose matcher was never created, nothing
     * needs to be done.
     */
    private void resetChild() {
      if (childMatcher != null) {
        childMatcher.resetChildIter(nextTreeNodeMatchCandidate);
        return;
      }
      if (myNode.child == null) {
        matchedOnce = false;
      }
    }

    /* goes to the next node in the tree that is a successful match to my description pattern.
     * This is the hotspot method in running tregex, but not clear how to make it faster.
     *
     * On exit, finished == false means nextTreeNodeMatchCandidate is a node that satisfies
     * the description (or backreference / link); the child has been reset for it and this
     * node's name and variable groups have been published.  finished == true means the
     * candidate iterator is exhausted. */
    // when finished = false; break; is called, it means I successfully matched.
    private void goToNextTreeNodeMatch() {
      decommitVariableGroups(); // make sure variable groups are free.
      removeNamedNodes(); // if we named a node, it should now be unnamed
      finished = true;
      // NOTE(review): m and value are declared outside the loop, so they keep the state of the
      // most recent attempt (including a failed find() or a candidate with a null value) and are
      // what gets committed below.  With a negated description that could hand a non-matching
      // Matcher to commitVariableGroups(m) - confirm variable groups cannot occur on negated
      // descriptions.
      Matcher m = null;
      String value = null;
      if (treeNodeMatchCandidateIterator == null) {
        // created lazily on the first call after construction or reset
        treeNodeMatchCandidateIterator = myNode.rel.searchNodeIterator(tree, this);
      }
      while (treeNodeMatchCandidateIterator.hasNext()) {
        nextTreeNodeMatchCandidate = treeNodeMatchCandidateIterator.next();
        if (myNode.descriptionMode == null) {
          // this is a backreference or link
          if (myNode.isLink) {
            // a link matches when both nodes have equal (basic-category) values
            Tree otherTree = namesToNodes.get(myNode.linkedName);
            if (otherTree != null) {
              String otherValue = myNode.basicCatFunction == null ? otherTree.value() : myNode.basicCatFunction.apply(otherTree.value());
              String myValue = myNode.basicCatFunction == null ? nextTreeNodeMatchCandidate.value() : myNode.basicCatFunction.apply(nextTreeNodeMatchCandidate.value());
              if (otherValue.equals(myValue)) {
                finished = false;
                break;
              }
            }
          } else if (namesToNodes.get(myNode.name) == nextTreeNodeMatchCandidate) {
            // a backreference matches only the very same node object
            finished = false;
            break;
          }
        } else { // try to match the description pattern.
          // cdm: Nov 2006: Check for null label, just make found false
          boolean found;
          value = nextTreeNodeMatchCandidate.value();
          if (value == null) {
            found = false;
          } else {
            if (myNode.basicCatFunction != null) {
              value = myNode.basicCatFunction.apply(value);
            }
            switch(myNode.descriptionMode) {
            case EXACT:
              found = value.equals(myNode.exactMatch);
              break;
            case PATTERN:
              m = myNode.descPattern.matcher(value);
              found = m.find();
              break;
            case ANYTHING:
              found = true;
              break;
            case STRINGS:
              found = myNode.stringFilter.test(value);
              break;
            default:
              throw new IllegalArgumentException("Unexpected match mode");
            }
          }
          if (found) {
            for (Pair<Integer,String> varGroup : myNode.variableGroups) { // if variables have been captured from a regex, they must match any previous matchings
              String thisVariable = varGroup.second();
              String thisVarString = variableStrings.getString(thisVariable);
              if (m != null) {
                if (thisVarString != null &&
                    !thisVarString.equals(m.group(varGroup.first()))) {
                  // failed to match a variable
                  found = false;
                  break;
                }
              } else {
                if (thisVarString != null &&
                    !thisVarString.equals(value)) {
                  // here we treat any variable group # as a match
                  found = false;
                  break;
                }
              }
            }
          }
          // a negated description accepts exactly the nodes the description rejects
          if (found != myNode.negDesc) {
            finished = false;
            break;
          }
        }
      }
      if (!finished) { // I successfully matched.
        resetChild(); // reset my unique TregexMatcher child based on the Tree node I successfully matched at.
        // cdm bugfix jul 2009: on next line need to check for a description (descriptionMode not null) or a link; otherwise this is a backreference to an already named node, and the map should _not_ be updated
        if ((myNode.descriptionMode != null || myNode.isLink) && myNode.name != null) {
          // note: have to fill in the map as we go for backreferencing
          namesToNodes.put(myNode.name, nextTreeNodeMatchCandidate);
        }
        if (m != null) {
          // commit variable groups using a matcher, meaning
          // it extracts the expressions from that matcher
          commitVariableGroups(m);
        } else if (value != null) {
          // commit using a set string (all groups are treated as the string)
          commitVariableGroups(value);
        }
      }
      // finished is false exiting this if and only if nextChild exists
      // and has a label or backreference that matches
      // (also it will just have been reset)
    }

    /**
     * Publishes every variable group of this node, taking each value from the
     * corresponding capturing group of {@code m}.
     *
     * @param m the matcher whose groups are read via group(int)
     */
    private void commitVariableGroups(Matcher m) {
      committedVariables = true; // commit all my variable groups.
      for (Pair<Integer,String> varGroup : myNode.variableGroups) {
        String thisVarString = m.group(varGroup.first());
        variableStrings.setVar(varGroup.second(), thisVarString);
      }
    }

    /**
     * Publishes every variable group of this node with the same string; used
     * when no regex matcher is available, so the group numbers are ignored.
     *
     * @param value the string assigned to each variable
     */
    private void commitVariableGroups(String value) {
      committedVariables = true;
      for (Pair<Integer,String> varGroup : myNode.variableGroups) {
        variableStrings.setVar(varGroup.second(), value);
      }
    }

    /**
     * Withdraws the variables this matcher published, if any, and clears the
     * committed flag.  Does nothing to variableStrings when nothing was committed.
     */
    private void decommitVariableGroups() {
      if (committedVariables) {
        for (Pair<Integer,String> varGroup : myNode.variableGroups) {
          variableStrings.unsetVar(varGroup.second());
        }
      }
      committedVariables = false;
    }

    /**
     * Drops this node's name from namesToNodes.  The condition mirrors the one
     * used when the name is stored: only a node with a description or a link
     * stores a name, so a pure backreference never removes the entry it reads.
     */
    private void removeNamedNodes() {
      if (myNode.name == null) {
        return;
      }
      final boolean bindsItsName = myNode.descriptionMode != null || myNode.isLink;
      if (bindsItsName) {
        namesToNodes.remove(myNode.name);
      }
    }


    /* Tries to match the unique child of the DescriptionPattern node against the current
     * candidate.  Returns true on success.  Without a child pattern, each candidate
     * succeeds exactly once. */
    private boolean matchChild() {
      // entering here (given that it's called only once in matches())
      // we know finished is false, and either there is no candidate yet
      // (goToNextTreeNodeMatch has not been called) or the candidate exists
      // and has a label or backreference that matches
      if (nextTreeNodeMatchCandidate == null) {
        // not initialized yet, so the child certainly can't be matched yet
        return false;
      }

      // lazy initialization of the child matcher
      if (childMatcher == null && myNode.child != null) {
        childMatcher = myNode.child.matcher(root, nextTreeNodeMatchCandidate, nodesToParents, namesToNodes, variableStrings, headFinder);
      }

      if (childMatcher != null) {
        return childMatcher.matches();
      }

      // no child pattern: report the candidate once, then fail until the next candidate
      if (matchedOnce) {
        return false;
      }
      matchedOnce = true;
      return true;
    }

    /**
     * Finds the next local match.  Candidates are advanced until the child
     * matches or the candidates run out.
     *
     * @return for a plain node, true while another match exists; for a negated
     *         node, true only if no candidate matched; for an optional node,
     *         true once whether or not a candidate matched
     */
    @Override
    public boolean matches() {
      // this is necessary so that a negated/optional node matches only once
      if (finished) {
        return false;
      }

      while (!finished) {
        if (!matchChild()) {
          goToNextTreeNodeMatch();
          continue;
        }
        if (myNode.isNegated()) {
          // negated node only has to fail once
          finished = true;
          return false; // cannot be optional and negated
        }
        if (myNode.isOptional()) {
          finished = true;
        }
        return true;
      }

      // candidates exhausted without a match
      if (myNode.isNegated()) {
        // couldn't match my relation/pattern, so succeeded!
        return true;
      }
      // couldn't match my relation/pattern, so failed!
      decommitVariableGroups();
      removeNamedNodes();
      nextTreeNodeMatchCandidate = null;
      // didn't match, but return true anyway if optional
      return myNode.isOptional();
    }

    /**
     * Returns the tree node this matcher is currently positioned on.
     *
     * @return the current candidate, or null before the first search, after a
     *         reset, or after a non-negated node ran out of candidates
     */
    @Override
    public Tree getMatch() {
      return this.nextTreeNodeMatchCandidate;
    }

  } // end class DescriptionMatcher

  /** Serialization version identifier for this class. */
  private static final long serialVersionUID = 1179819056757295757L;

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy