All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.tregex.DescriptionPattern Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees.tregex;

import java.util.function.Function;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.ArrayStringFilter;
import edu.stanford.nlp.util.Pair;
import java.util.function.Predicate;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class DescriptionPattern extends TregexPattern {

  enum DescriptionMode {
    PATTERN, STRINGS, EXACT, ANYTHING
  }

  private final Relation rel;
  private final boolean negDesc;

  private final DescriptionMode descriptionMode;
  private final String exactMatch;
  private final Pattern descPattern;
  private final Predicate stringFilter;

  // what size string matchers to use before switching to regex for
  // disjunction matches
  private static final int MAX_STRING_MATCHER_SIZE = 8;

  private final String stringDesc;
  /** The name to give the matched node */
  private final String name;
  /** If this pattern is a link, this is the node linked to */
  private final String linkedName;
  private final boolean isLink;
  // todo: conceptually final, but we'd need to rewrite TregexParser
  // to make it so.
  private TregexPattern child;
  // also conceptually final, but it depends on the child
  private final List> variableGroups; // specifies the groups in a regex that are captured as matcher-global string variables

  private final Function basicCatFunction;

  /** Used to detect regex expressions which can be simplified to exact matches */
  private static final Pattern SINGLE_WORD_PATTERN = Pattern.compile("/\\^(.)\\$/" + "|" + // for example, /^:$/
                                                                     "/\\^\\[(.)\\]\\$/" + "|" + // for example, /^[$]$/
                                                                     "/\\^([-a-zA-Z']+)\\$/"); // for example, /^-NONE-$/

  private static final Pattern MULTI_WORD_PATTERN = Pattern.compile("/\\^\\(\\?\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

  private static final Pattern CASE_INSENSITIVE_PATTERN = Pattern.compile("/\\^\\(\\?i\\:((?:[-a-zA-Z|]|\\\\\\$)+)\\)\\$\\/");

  /** Used to detect regex expressions which can be simplified to exact matches */
  private static final Pattern PREFIX_PATTERN = Pattern.compile("/\\^([-a-zA-Z|]+)\\/" + "|" + // for example, /^JJ/
                                                                "/\\^\\(\\?\\:([-a-zA-Z|]+)\\)\\/");

  public DescriptionPattern(Relation rel, boolean negDesc, String desc,
                            String name, boolean useBasicCat,
                            Function basicCatFunction,
                            List> variableGroups,
                            boolean isLink, String linkedName) {
    this.rel = rel;
    this.negDesc = negDesc;
    this.isLink = isLink;
    this.linkedName = linkedName;
    if (desc != null) {
      stringDesc = desc;
      // TODO: factor out some of these blocks of code
      if (desc.equals("__") || desc.equals("/.*/") || desc.equals("/^.*$/")) {
        descriptionMode = DescriptionMode.ANYTHING;
        descPattern = null;
        exactMatch = null;
        stringFilter = null;
      } else if (SINGLE_WORD_PATTERN.matcher(desc).matches()) {
        // Expressions are written like this to put special characters
        // in the tregex matcher, but a regular expression is less
        // efficient than a simple string match
        descriptionMode = DescriptionMode.EXACT;
        descPattern = null;
        Matcher matcher = SINGLE_WORD_PATTERN.matcher(desc);
        matcher.matches();
        String matchedGroup = null;
        for (int i = 1; i <= matcher.groupCount(); ++i) {
          if (matcher.group(i) != null) {
            matchedGroup = matcher.group(i);
            break;
          }
        }
        exactMatch = matchedGroup;
        stringFilter = null;
        //System.err.println("DescriptionPattern: converting " + desc + " to " + exactMatch);
      } else if (MULTI_WORD_PATTERN.matcher(desc).matches()) {
        Matcher matcher = MULTI_WORD_PATTERN.matcher(desc);
        matcher.matches();
        String matchedGroup = null;
        for (int i = 1; i <= matcher.groupCount(); ++i) {
          if (matcher.group(i) != null) {
            matchedGroup = matcher.group(i);
            break;
          }
        }
        matchedGroup = matchedGroup.replaceAll("\\\\", "");
        if (matchedGroup.split("[|]").length > MAX_STRING_MATCHER_SIZE) {
          descriptionMode = DescriptionMode.PATTERN;
          descPattern = Pattern.compile(desc.substring(1, desc.length() - 1));
          exactMatch = null;
          stringFilter = null;
          //System.err.println("DescriptionPattern: not converting " + desc);
        } else {
          descriptionMode = DescriptionMode.STRINGS;
          descPattern = null;
          exactMatch = null;
          stringFilter = new ArrayStringFilter(ArrayStringFilter.Mode.EXACT, matchedGroup.split("[|]")); 
          //System.err.println("DescriptionPattern: converting " + desc + " to " + stringFilter);
        }
      } else if (CASE_INSENSITIVE_PATTERN.matcher(desc).matches()) {
        Matcher matcher = CASE_INSENSITIVE_PATTERN.matcher(desc);
        matcher.matches();
        String matchedGroup = null;
        for (int i = 1; i <= matcher.groupCount(); ++i) {
          if (matcher.group(i) != null) {
            matchedGroup = matcher.group(i);
            break;
          }
        }
        matchedGroup = matchedGroup.replaceAll("\\\\", "");
        if (matchedGroup.split("[|]").length > MAX_STRING_MATCHER_SIZE) {
          descriptionMode = DescriptionMode.PATTERN;
          descPattern = Pattern.compile(desc.substring(1, desc.length() - 1));
          exactMatch = null;
          stringFilter = null;
          //System.err.println("DescriptionPattern: not converting " + desc);
        } else {
          descriptionMode = DescriptionMode.STRINGS;
          descPattern = null;
          exactMatch = null;
          stringFilter = new ArrayStringFilter(ArrayStringFilter.Mode.CASE_INSENSITIVE, matchedGroup.split("[|]")); 
          //System.err.println("DescriptionPattern: converting " + desc + " to " + stringFilter);
        }
      } else if (PREFIX_PATTERN.matcher(desc).matches()) {
        Matcher matcher = PREFIX_PATTERN.matcher(desc);
        matcher.matches();
        String matchedGroup = null;
        for (int i = 1; i <= matcher.groupCount(); ++i) {
          if (matcher.group(i) != null) {
            matchedGroup = matcher.group(i);
            break;
          }
        }
        if (matchedGroup.split("\\|").length > MAX_STRING_MATCHER_SIZE) {
          descriptionMode = DescriptionMode.PATTERN;
          descPattern = Pattern.compile(desc.substring(1, desc.length() - 1));
          exactMatch = null;
          stringFilter = null;
          //System.err.println("DescriptionPattern: not converting " + desc);
        } else {
          descriptionMode = DescriptionMode.STRINGS;
          descPattern = null;
          exactMatch = null;
          stringFilter = new ArrayStringFilter(ArrayStringFilter.Mode.PREFIX, matchedGroup.split("[|]")); 
          //System.err.println("DescriptionPattern: converting " + desc + " to " + stringFilter);
        }
      } else if (desc.matches("/.*/")) {
        descriptionMode = DescriptionMode.PATTERN;
        descPattern = Pattern.compile(desc.substring(1, desc.length() - 1));
        exactMatch = null;
        stringFilter = null;
      } else if (desc.indexOf('|') >= 0) {
        // patterns which contain ORs are a special case; we either
        // promote those to regex match or make a string matcher out
        // of them.  for short enough disjunctions, a simple string
        // matcher can be more efficient than a regex.
        String[] words = desc.split("[|]");
        if (words.length <= MAX_STRING_MATCHER_SIZE) {
          descriptionMode = DescriptionMode.STRINGS;
          descPattern = null;
          exactMatch = null;
          stringFilter = new ArrayStringFilter(ArrayStringFilter.Mode.EXACT, words);
        } else {
          descriptionMode = DescriptionMode.PATTERN;
          descPattern = Pattern.compile("^(?:" + desc + ")$");
          exactMatch = null;
          stringFilter = null;
        }
      } else { // raw description
        descriptionMode = DescriptionMode.EXACT;
        descPattern = null;
        exactMatch = desc;
        stringFilter = null;
      }
    } else {
      if (name == null && linkedName == null) {
        throw new AssertionError("Illegal description pattern.  Does not describe a node or link/name a variable");
      }
      stringDesc = " ";
      descriptionMode = null;
      descPattern = null;
      exactMatch = null;
      stringFilter = null;
    }
    this.name = name;
    setChild(null);
    this.basicCatFunction = (useBasicCat ? basicCatFunction : null);
    //    System.out.println("Made " + (negDesc ? "negated " : "") + "DescNode with " + desc);
    this.variableGroups = variableGroups;
  }

  public DescriptionPattern(Relation newRelation, DescriptionPattern oldPattern) {
    this.rel = newRelation;
    this.negDesc = oldPattern.negDesc;
    this.isLink = oldPattern.isLink;
    this.linkedName = oldPattern.linkedName;
    this.stringDesc = oldPattern.stringDesc;
    this.descriptionMode = oldPattern.descriptionMode;
    this.descPattern = oldPattern.descPattern;
    this.exactMatch = oldPattern.exactMatch;
    this.stringFilter = oldPattern.stringFilter;
    this.name = oldPattern.name;
    this.setChild(oldPattern.child);
    this.basicCatFunction = oldPattern.basicCatFunction;
    this.variableGroups = oldPattern.variableGroups;
  }  

  @Override
  public String localString() {
    return rel.toString() + ' ' + (negDesc ? "!" : "") + (basicCatFunction != null ? "@" : "") + stringDesc + (name == null ? "" : '=' + name);
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    if (isNegated()) {
      sb.append('!');
    }
    if (isOptional()) {
      sb.append('?');
    }
    sb.append(rel.toString());
    sb.append(' ');
    if (child != null) {
      sb.append('(');
    }
    if (negDesc) {
      sb.append('!');
    }
    if (basicCatFunction != null) {
      sb.append('@');
    }
    sb.append(stringDesc);
    if (isLink) {
      sb.append('~');
      sb.append(linkedName);
    }
    if (name != null) {
      sb.append('=');
      sb.append(name);
    }
    sb.append(' ');
    if (child != null) {
      sb.append(child.toString());
      sb.append(')');
    }
    return sb.toString();
  }

  public void setChild(TregexPattern n) {
    child = n;
  }

  @Override
  public List getChildren() {
    if (child == null) {
      return Collections.emptyList();
    } else {
      return Collections.singletonList(child);
    }
  }

  @Override
  public TregexMatcher matcher(Tree root, Tree tree,
                               IdentityHashMap nodesToParents,
                               Map namesToNodes,
                               VariableStrings variableStrings,
                               HeadFinder headFinder) {
    return new DescriptionMatcher(this, root, tree, nodesToParents,
                                  namesToNodes, variableStrings, headFinder);
  }

  // TODO: Why is this a static class with a pointer to the containing
  // class?  There seems to be no reason for such a thing.
  // cdm: agree: It seems like it should just be a non-static inner class.  Try this and check it works....
  private static class DescriptionMatcher extends TregexMatcher {
    private Iterator treeNodeMatchCandidateIterator;
    private final DescriptionPattern myNode;

    // a DescriptionMatcher only has a single child; if it is the left
    // side of multiple relations, a CoordinationMatcher is used.

    // childMatcher is null until the first time a matcher needs to check the child 

    // myNode.child == null OR resetChild has never been called
    private TregexMatcher childMatcher;
    private Tree nextTreeNodeMatchCandidate; // the Tree node that this DescriptionMatcher node is trying to match on.
    private boolean finished = false; // when finished = true, it means I have exhausted my potential tree node match candidates.
    private boolean matchedOnce = false;
    private boolean committedVariables = false;


    public DescriptionMatcher(DescriptionPattern n, Tree root, Tree tree,
                              IdentityHashMap nodesToParents,
                              Map namesToNodes,
                              VariableStrings variableStrings,
                              HeadFinder headFinder) {
      super(root, tree, nodesToParents, namesToNodes, variableStrings, headFinder);
      myNode = n;
      // no need to reset anything - everything starts out as null or false.  
      // lazy initialization of children to save time.
      // resetChildIter();
    }

    @Override
    void resetChildIter() {
      decommitVariableGroups();
      removeNamedNodes();
      // lazy initialization saves quite a bit of time in use cases
      // where we call something other than matches()
      treeNodeMatchCandidateIterator = null;
      finished = false;
      nextTreeNodeMatchCandidate = null;
      if (childMatcher != null) {
        // need to tell the children to clean up any preexisting data
        childMatcher.resetChildIter();
      }
    }

    private void resetChild() {
      if (childMatcher == null) {
        if (myNode.child == null) {
          matchedOnce = false;
        }
      } else {
        childMatcher.resetChildIter(nextTreeNodeMatchCandidate);
      }
    }

    /* goes to the next node in the tree that is a successful match to my description pattern.
     * This is the hotspot method in running tregex, but not clear how to make it faster. */
    // when finished = false; break; is called, it means I successfully matched.
    private void goToNextTreeNodeMatch() {
      decommitVariableGroups(); // make sure variable groups are free.
      removeNamedNodes(); // if we named a node, it should now be unnamed
      finished = true;
      Matcher m = null;
      String value = null;
      if (treeNodeMatchCandidateIterator == null) {
        treeNodeMatchCandidateIterator = myNode.rel.searchNodeIterator(tree, this);
      }
      while (treeNodeMatchCandidateIterator.hasNext()) {
        nextTreeNodeMatchCandidate = treeNodeMatchCandidateIterator.next();
        if (myNode.descriptionMode == null) {
          // this is a backreference or link
          if (myNode.isLink) {
            Tree otherTree = namesToNodes.get(myNode.linkedName);
            if (otherTree != null) {
              String otherValue = myNode.basicCatFunction == null ? otherTree.value() : myNode.basicCatFunction.apply(otherTree.value());
              String myValue = myNode.basicCatFunction == null ? nextTreeNodeMatchCandidate.value() : myNode.basicCatFunction.apply(nextTreeNodeMatchCandidate.value());
              if (otherValue.equals(myValue)) {
                finished = false;
                break;
              }
            }
          } else if (namesToNodes.get(myNode.name) == nextTreeNodeMatchCandidate) {
            finished = false;
            break;
          }
        } else { // try to match the description pattern.
          // cdm: Nov 2006: Check for null label, just make found false
          // String value = (myNode.basicCatFunction == null ? nextTreeNodeMatchCandidate.value() : myNode.basicCatFunction.apply(nextTreeNodeMatchCandidate.value()));
          // m = myNode.descPattern.matcher(value);
          // boolean found = m.find();
          boolean found;
          value = nextTreeNodeMatchCandidate.value();
          if (value == null) {
            found = false;
          } else {
            if (myNode.basicCatFunction != null) {
              value = myNode.basicCatFunction.apply(value);
            }
            switch(myNode.descriptionMode) {
            case EXACT:
              found = value.equals(myNode.exactMatch);
              break;
            case PATTERN:
              m = myNode.descPattern.matcher(value);
              found = m.find();
              break;
            case ANYTHING:
              found = true;
              break;
            case STRINGS:
              found = myNode.stringFilter.test(value);
              break;
            default:
              throw new IllegalArgumentException("Unexpected match mode");
            }
          }
          if (found) {
            for (Pair varGroup : myNode.variableGroups) { // if variables have been captured from a regex, they must match any previous matchings
              String thisVariable = varGroup.second();
              String thisVarString = variableStrings.getString(thisVariable);
              if (m != null) {
                if (thisVarString != null &&
                    !thisVarString.equals(m.group(varGroup.first()))) {
                  // failed to match a variable
                  found = false;
                  break;
                }
              } else {
                if (thisVarString != null &&
                    !thisVarString.equals(value)) {
                  // here we treat any variable group # as a match
                  found = false;
                  break;
                }
              }
            }
          }
          if (found != myNode.negDesc) {
            finished = false;
            break;
          }
        }
      }
      if (!finished) { // I successfully matched.
        resetChild(); // reset my unique TregexMatcher child based on the Tree node I successfully matched at.
        // cdm bugfix jul 2009: on next line need to check for descPattern not null, or else this is a backreference or a link to an already named node, and the map should _not_ be updated
        if ((myNode.descriptionMode != null || myNode.isLink) && myNode.name != null) {
          // note: have to fill in the map as we go for backreferencing
          namesToNodes.put(myNode.name, nextTreeNodeMatchCandidate);
        }
        if (m != null) {
          // commit variable groups using a matcher, meaning
          // it extracts the expressions from that matcher
          commitVariableGroups(m);
        } else if (value != null) {
          // commit using a set string (all groups are treated as the string)
          commitVariableGroups(value);
        }
      }
      // finished is false exiting this if and only if nextChild exists
      // and has a label or backreference that matches
      // (also it will just have been reset)
    }

    private void commitVariableGroups(Matcher m) {
      committedVariables = true; // commit all my variable groups.
      for(Pair varGroup : myNode.variableGroups) {
        String thisVarString = m.group(varGroup.first());
        variableStrings.setVar(varGroup.second(),thisVarString);
      }
    }

    private void commitVariableGroups(String value) {
      committedVariables = true;
      for(Pair varGroup : myNode.variableGroups) {
        variableStrings.setVar(varGroup.second(), value);
      }
    }

    private void decommitVariableGroups() {
      if (committedVariables) {
        for(Pair varGroup : myNode.variableGroups) {
          variableStrings.unsetVar(varGroup.second());
        }
      }
      committedVariables = false;
    }

    private void removeNamedNodes() {
      if ((myNode.descriptionMode != null || myNode.isLink) &&
          myNode.name != null) {
        namesToNodes.remove(myNode.name);
      }
    }


    /* tries to match the unique child of the DescriptionPattern node to a Tree node.  Returns "true" if succeeds.*/
    private boolean matchChild() {
      // entering here (given that it's called only once in matches())
      // we know finished is false, and either nextChild == null
      // (meaning goToNextChild has not been called) or nextChild exists
      // and has a label or backreference that matches
      if (nextTreeNodeMatchCandidate == null) {  // I haven't been initialized yet, so my child certainly can't be matched yet.
        return false;
      }
      // lazy initialization of the child matcher
      if (childMatcher == null && myNode.child != null) {
        childMatcher = myNode.child.matcher(root, nextTreeNodeMatchCandidate, nodesToParents, namesToNodes, variableStrings, headFinder);
        //childMatcher.resetChildIter();
      }
      if (childMatcher == null) {
        if (!matchedOnce) {
          matchedOnce = true;
          return true;
        }
        return false;
      }
      return childMatcher.matches();
    }

    // find the next local match
    @Override
    public boolean matches() {
      // this is necessary so that a negated/optional node matches only once
      if (finished) {
        return false;
      }
      while (!finished) {
        if (matchChild()) {
          if (myNode.isNegated()) {
            // negated node only has to fail once
            finished = true;
            return false; // cannot be optional and negated
          } else {
            if (myNode.isOptional()) {
              finished = true;
            }
            return true;
          }
        } else {
          goToNextTreeNodeMatch();
        }
      }
      if (myNode.isNegated()) { // couldn't match my relation/pattern, so succeeded!
        return true;
      } else { // couldn't match my relation/pattern, so failed!
        decommitVariableGroups();
        removeNamedNodes();
        nextTreeNodeMatchCandidate = null;
        // didn't match, but return true anyway if optional
        return myNode.isOptional();
      }
    }

    @Override
    public Tree getMatch() {
      return nextTreeNodeMatchCandidate;
    }

  } // end class DescriptionMatcher

  private static final long serialVersionUID = 1179819056757295757L;

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy