/*
 * edu.stanford.nlp.ling.tokensregex.TokenSequencePattern Maven / Gradle / Ivy
 *
 * Show more of this group Show more artifacts with this name
 * Show all versions of stanford-parser Show documentation
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
 * There is a newer version: 3.9.2
 * Show newest version
 */
package edu.stanford.nlp.ling.tokensregex;

import java.util.*;

import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.util.*;

/**
 * Token Sequence Pattern for regular expressions over sequences of tokens (each represented as a CoreMap).
 * Sequences over tokens can be matched like strings.
 * 
 * <p>
 * To use:
 * </p>
 * <pre>{@code
 *   TokenSequencePattern p = TokenSequencePattern.compile("....");
 *   TokenSequenceMatcher m = p.getMatcher(tokens);
 *   while (m.find()) ....
 * }</pre>
 *
 * 
 * Supports the following:
 * 

 *  Concatenation: X Y
 *  Or: X | Y
 *  And: {@code X & Y}
 *  Groups:
 *     
 *     capturing: (X) (with numeric group id)
 *     capturing: (?$var X) (with group name "$var")
 *     noncapturing: (?:X)
 *     
 *  Capturing groups can be retrieved with group id or group variable, as matched string
 *     (m.group()) or list of tokens (m.groupNodes()).
 *  
 *     To retrieve group using id: m.group(id) or m.groupNodes(id)
 *     
 *     NOTE: Capturing groups are indexed from left to right, starting at one.  Group zero is the entire matched sequence.
 *     
 *     To retrieve group using bound variable name: m.group("$var") or m.groupNodes("$var")
 *     
 *  
 *  See {@link SequenceMatchResult} for more accessor functions to retrieve matches.
 * 
 * Greedy Quantifiers:  X+, X?, X*, X{n,m}, X{n}, X{n,}
 * Reluctant Quantifiers: X+?, X??, X*?, X{n,m}?, X{n}?, X{n,}?
 * Back references: \captureid 
 * Value binding for groups: {@code [pattern] => [value]}.
 *   Value for matched expression can be accessed using {@code m.groupValue()}
 *   
 *
 *   Example: {@code ( one => 1 | two => 2 | three => 3 | ...)}
 * 
 * 
 *
 * 
 * Individual tokens are marked by "[" TOKEN_EXPR "]" 
 * 
 * Possible TOKEN_EXPR:
 * 
 * 
 *  All specified token attributes match:
 * 
 *     For Strings:
 *      { lemma:/.../; tag:"NNP" }  = attributes that need to all match.
 *     If only one attribute, the {} can be dropped.
 * 
 *     See {@link edu.stanford.nlp.ling.AnnotationLookup AnnotationLookup} for a list of predefined token attribute names.
 * 
 *     Additional attributes can be bound using the environment (see below).
 * 
 *     NOTE: /.../ used for regular expressions,
 *            "..." for exact string matches
 * 
 *     For Numbers:
 *      { word>=2 }
 * 
 *     NOTE: Relation can be {@code ">=", "<=", ">", "<",} or {@code "=="}
 * 
 *     Others:
 *      { word::IS_NUM } , { word::IS_NIL }  or
 *      { word::NOT_EXISTS }, { word::NOT_NIL }  or  { word::EXISTS } 
 * 
 * Short hand for just word/text match:
 *      /.../   or  "..." 
 * 
 * 
 *  Negation:
 *      !{...} 
 * 
 * 
 *  Conjunction or Disjunction:
 *      {...} & {...}    or   {...} | {...} 
 * 
 * 
 *
 * 
 * Special tokens:
 *   Any token: []
 * 
 *
 * 
 * String pattern match across multiple tokens:
 *   (?m){min,max} /pattern/
 * 
 *
 * 
 * Special expressions: indicated by double braces: {{ expr }}
 *   
 *   See {@link edu.stanford.nlp.ling.tokensregex.types.Expressions} for syntax.
 * 
 *
 * 
 * Binding of variables for use in compiling patterns:
 * 
 * 
 *  Use  {@code Env env = TokenSequencePattern.getNewEnv()} to create a new environment for binding 
 *  Bind string to attribute key (Class) lookup:
 *    {@code env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class);}
 * 
 *  Bind patterns / strings for compiling patterns
 *    
 *    // Bind string for later compilation using: compile("/it/ /was/ $RELDAY");
 *    env.bind("$RELDAY", "/today|yesterday|tomorrow|tonight|tonite/");
 *    // Bind pre-compiled pattern for later compilation using: compile("/it/ /was/ $RELDAY");
 *    env.bind("$RELDAY", TokenSequencePattern.compile(env, "/today|yesterday|tomorrow|tonight|tonite/"));
 *    
 * 
 *  Bind custom node pattern functions (currently no arguments are supported)
 *    
 *    // Bind node pattern so we can do patterns like: compile("... temporal::IS_TIMEX_DATE ...");
 *    //   (TimexTypeMatchNodePattern is a NodePattern that implements some custom logic)
 *    env.bind("::IS_TIMEX_DATE", new TimexTypeMatchNodePattern(SUTime.TimexType.DATE));
 *   
 * 
 * 
 *
 * 
 * Actions (partially implemented)
 * 
 * 
 *  {@code pattern ==> action} 
 *  Supported action:
 *     {@code &annotate( { ner="DATE" } ) }
 *  Not applied automatically, associated with a pattern.
 *  To apply, call pattern.getAction().apply(match, groupid)
 * 
 *
 * @author Angel Chang
 * @see TokenSequenceMatcher
 */
public class TokenSequencePattern extends SequencePattern<CoreMap> {

  private static final long serialVersionUID = -4760710834202406916L;

  /**
   * Pattern compiled from {@code ANY_NODE_PATTERN_EXPR}, which is not declared in this
   * class (presumably inherited from {@link SequencePattern} - confirm).
   *
   * NOTE(review): this field is initialized before {@link #DEFAULT_ENV}. That is only
   * safe if the call resolves to {@link #compile(SequencePattern.PatternExpr)}, which
   * does not read {@code DEFAULT_ENV}; keep the declaration order in mind when editing.
   */
  public static final TokenSequencePattern ANY_NODE_PATTERN = TokenSequencePattern.compile(ANY_NODE_PATTERN_EXPR);

  /** Environment shared by the {@code compile} overloads that take no explicit {@link Env}. */
  private static final Env DEFAULT_ENV = getNewEnv();

  /**
   * Creates a pattern with no associated action.
   *
   * @param patternStr String form of the pattern (passed through to the superclass)
   * @param nodeSequencePattern Parsed pattern expression
   */
  public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) {
    super(patternStr, nodeSequencePattern);
  }

  /**
   * Creates a pattern with an associated action.
   *
   * @param patternStr String form of the pattern (passed through to the superclass)
   * @param nodeSequencePattern Parsed pattern expression
   * @param action Action associated with the pattern
   */
  public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern,
                              SequenceMatchAction<CoreMap> action) {
    super(patternStr, nodeSequencePattern, action);
  }

  /**
   * Creates a fresh environment backed by a new {@link TokenSequenceParser}
   * and with the default bindings initialized.
   *
   * @return A new environment
   */
  public static Env getNewEnv() {
    Env env = new Env(new TokenSequenceParser());
    env.initDefaultBindings();
    return env;
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the default environment.
   *
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if the expression cannot be parsed
   */
  public static TokenSequencePattern compile(String string) {
    return compile(DEFAULT_ENV, string);
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the specified environment. The expression is parsed together with
   * its optional action by the environment's parser.
   *
   * @param env Environment to use
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if parsing fails; the message contains the offending
   *         expression and the original exception is attached as the cause
   */
  public static TokenSequencePattern compile(Env env, String string) {
    try {
      // TODO: Check token sequence parser?
      Pair<SequencePattern.PatternExpr, SequenceMatchAction<CoreMap>> p =
          env.parser.parseSequenceWithAction(env, string);
      return new TokenSequencePattern(string, p.first(), p.second());
    } catch (Exception ex) {
      // Keep the original message text, but chain the cause so the parser's stack trace is not lost.
      throw new RuntimeException("When parsing " + string + "\t\t" + ex, ex);
    }
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the default environment.
   *
   * @param strings List of regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if any of the expressions cannot be parsed
   */
  public static TokenSequencePattern compile(String... strings) {
    return compile(DEFAULT_ENV, strings);
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the specified environment. Each string is parsed on its own and the
   * resulting expressions are combined, in order, into one sequence expression.
   *
   * @param env Environment to use
   * @param strings List of regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException wrapping any exception raised while parsing
   */
  public static TokenSequencePattern compile(Env env, String... strings) {
    try {
      List<SequencePattern.PatternExpr> patterns = new ArrayList<>();
      for (String string : strings) {
        // TODO: Check token sequence parser?
        SequencePattern.PatternExpr pattern = env.parser.parseSequence(env, string);
        patterns.add(pattern);
      }
      SequencePattern.PatternExpr nodeSequencePattern = new SequencePattern.SequencePatternExpr(patterns);
      return new TokenSequencePattern(StringUtils.join(strings), nodeSequencePattern);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Compiles a PatternExpr into a TokenSequencePattern. The resulting pattern
   * is created with a {@code null} pattern string.
   *
   * @param nodeSequencePattern A sequence pattern expression (before translation into a NFA)
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern) {
    return new TokenSequencePattern(null, nodeSequencePattern);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  @Override
  public TokenSequenceMatcher getMatcher(List<? extends CoreMap> tokens) {
    return new TokenSequenceMatcher(this, tokens);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens. Alias for {@link #getMatcher(List)}.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  public TokenSequenceMatcher matcher(List<? extends CoreMap> tokens) {
    return getMatcher(tokens);
  }

  /**
   * Returns a String representation of the TokenSequencePattern, as given by {@code pattern()}.
   *
   * NOTE(review): patterns created through {@link #compile(SequencePattern.PatternExpr)}
   * are constructed with a null pattern string, so this may return null - confirm
   * against {@code SequencePattern.pattern()}.
   *
   * @return A String representation of the TokenSequencePattern
   */
  @Override
  public String toString() {
    return this.pattern();
  }

  /**
   * Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
   * The matcher is given a {@code BasicSequencePatternTrigger} wrapping a
   * {@code CoreMapNodePatternTrigger} built over the same patterns.
   *
   * @param patterns Collection of input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(Collection<TokenSequencePattern> patterns) {
    return new MultiPatternMatcher<>(
            new MultiPatternMatcher.BasicSequencePatternTrigger<>(
                    new CoreMapNodePatternTrigger(patterns)
            ), patterns);
  }

  /**
   * Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
   * The matcher is given a {@code BasicSequencePatternTrigger} wrapping a
   * {@code CoreMapNodePatternTrigger} built over the same patterns.
   *
   * @param patterns input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(TokenSequencePattern... patterns) {
    return new MultiPatternMatcher<>(
            new MultiPatternMatcher.BasicSequencePatternTrigger<>(
                    new CoreMapNodePatternTrigger(patterns)
            ), patterns);
  }

}