All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.ling.tokensregex.TokenSequencePattern Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.ling.tokensregex;

import java.util.*;

import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.util.*;

/**
 * Token Sequence Pattern for regular expressions over sequences of tokens (each represented as a CoreMap).
 * Sequences over tokens can be matched like strings.
 * 

* To use: *

*

 *   TokenSequencePattern p = TokenSequencePattern.compile("....");
 *   TokenSequenceMatcher m = p.getMatcher(tokens);
 *   while (m.find()) ....
 * 
* *

* Supports the following: *

    *
  • Concatenation: X Y
  • *
  • Or: X | Y
  • *
  • And: {@code X & Y}
  • *
  • Groups: *
      *
    • capturing: (X) (with numeric group id)
    • *
    • capturing: (?$var X) (with group name "$var")
    • *
    • noncapturing: (?:X)
    • *
    * Capturing groups can be retrieved with group id or group variable, as matched string * (m.group()) or list of tokens (m.groupNodes()). *
      *
    • To retrieve group using id: m.group(id) or m.groupNodes(id) *
      NOTE: Capturing groups are indexed from left to right, starting at one. Group zero is the entire matched sequence. *
    • *
    • To retrieve group using bound variable name: m.group("$var") or m.groupNodes("$var") *
    • *
    * See {@link SequenceMatchResult} for more accessor functions to retrieve matches. *
  • *
  • Greedy Quantifiers: X+, X?, X*, X{n,m}, X{n}, X{n,}
  • *
  • Reluctant Quantifiers: X+?, X??, X*?, X{n,m}?, X{n}?, X{n,}?
  • *
  • Back references: \captureid
  • *
  • Value binding for groups: {@code [pattern] => [value]}. * Value for matched expression can be accessed using {@code m.groupValue()} *

    Example: {@code ( one => 1 | two => 2 | three => 3 | ...)} *
  • *
* *

* Individual tokens are marked by "[" TOKEN_EXPR "]" *
Possible TOKEN_EXPR: *

*
    *
  • All specified token attributes match: *
    For Strings: * { lemma:/.../; tag:"NNP" } = attributes that need to all match. * If only one attribute, the {} can be dropped. *
    See {@link edu.stanford.nlp.ling.AnnotationLookup AnnotationLookup} for a list of predefined token attribute names. *
    Additional attributes can be bound using the environment (see below). *
    NOTE: /.../ used for regular expressions, * "..." for exact string matches *
    For Numbers: * { word>=2 } *
    NOTE: Relation can be {@code ">=", "<=", ">", "<",} or {@code "=="} *
    Others: * { word::IS_NUM } , { word::IS_NIL } or * { word::NOT_EXISTS }, { word::NOT_NIL } or { word::EXISTS } *
  • *
  • Short hand for just word/text match: * /.../ or "..." *
  • *
  • * Negation: * !{...} *
  • *
  • * Conjunction or Disjunction: * {...} & {...} or {...} | {...} *
  • *
* *

* Special tokens: * Any token: [] *

* *

* String pattern match across multiple tokens: * (?m){min,max} /pattern/ *

* *

* Special expressions: indicated by double braces: {{ expr }} *
See {@link edu.stanford.nlp.ling.tokensregex.types.Expressions} for syntax. *

* *

* Binding of variables for use in compiling patterns: *

*
    *
  1. Use {@code Env env = TokenSequencePattern.getNewEnv()} to create a new environment for binding
  2. *
  3. Bind string to attribute key (Class) lookup: * {@code env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class);} *
  4. *
  5. Bind patterns / strings for compiling patterns *
    
     *    // Bind string for later compilation using: compile("/it/ /was/ $RELDAY");
     *    env.bind("$RELDAY", "/today|yesterday|tomorrow|tonight|tonite/");
     *    // Bind pre-compiled pattern for later compilation using: compile("/it/ /was/ $RELDAY");
     *    env.bind("$RELDAY", TokenSequencePattern.compile(env, "/today|yesterday|tomorrow|tonight|tonite/"));
     *    
    *
  6. *
  7. Bind custom node pattern functions (currently no arguments are supported) *
    
     *    // Bind node pattern so we can do patterns like: compile("... temporal::IS_TIMEX_DATE ...");
     *    //   (TimexTypeMatchNodePattern is a NodePattern that implements some custom logic)
     *    env.bind("::IS_TIMEX_DATE", new TimexTypeMatchNodePattern(SUTime.TimexType.DATE));
     *   
    *
  8. *
* *

* Actions (partially implemented) *

*
    *
  • {@code pattern ==> action}
  • *
  • Supported action: * &annotate( { ner="DATE" } )
  • *
  • Not applied automatically, associated with a pattern.
  • *
  • To apply, call pattern.getAction().apply(match, groupid)
  • *
*
* @author Angel Chang
* @see TokenSequenceMatcher
*/
public class TokenSequencePattern extends SequencePattern<CoreMap> {

  private static final long serialVersionUID = -4760710834202406916L;

  /**
   * Pattern compiled from {@code ANY_NODE_PATTERN_EXPR}.
   * Declared before {@code DEFAULT_ENV}; this is safe because
   * {@link #compile(SequencePattern.PatternExpr)} does not read the default environment.
   */
  public static final TokenSequencePattern ANY_NODE_PATTERN = TokenSequencePattern.compile(ANY_NODE_PATTERN_EXPR);

  /** Environment used by the {@code compile} overloads that take no explicit {@link Env}. */
  private static final Env DEFAULT_ENV = getNewEnv();

  /**
   * Creates a pattern from an already parsed pattern expression.
   *
   * @param patternStr String form of the pattern (may be {@code null}, see
   *                   {@link #compile(SequencePattern.PatternExpr)})
   * @param nodeSequencePattern Parsed pattern expression
   */
  public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) {
    super(patternStr, nodeSequencePattern);
  }

  /**
   * Creates a pattern from an already parsed pattern expression and an associated action.
   *
   * @param patternStr String form of the pattern
   * @param nodeSequencePattern Parsed pattern expression
   * @param action Action associated with the pattern
   */
  public TokenSequencePattern(String patternStr,
                              SequencePattern.PatternExpr nodeSequencePattern,
                              SequenceMatchAction<CoreMap> action) {
    super(patternStr, nodeSequencePattern, action);
  }

  /**
   * Creates a fresh environment backed by a new {@link TokenSequenceParser}
   * with the default bindings initialized.
   *
   * @return A new environment
   */
  public static Env getNewEnv() {
    Env env = new Env(new TokenSequenceParser());
    env.initDefaultBindings();
    return env;
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the default environment.
   *
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if the expression cannot be parsed
   */
  public static TokenSequencePattern compile(String string) {
    return compile(DEFAULT_ENV, string);
  }

  /**
   * Compiles a regular expression over tokens into a TokenSequencePattern
   * using the specified environment.
   *
   * @param env Environment to use
   * @param string Regular expression to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if parsing fails; the original exception is attached as the cause
   */
  public static TokenSequencePattern compile(Env env, String string) {
    try {
      // TODO: Check token sequence parser?
      Pair<SequencePattern.PatternExpr, SequenceMatchAction<CoreMap>> p =
          env.parser.parseSequenceWithAction(env, string);
      return new TokenSequencePattern(string, p.first(), p.second());
    } catch (Exception ex) {
      // Keep the historical message text, but chain the cause so the parser's stack trace is not lost.
      throw new RuntimeException("When parsing " + string + "\t\t" + ex, ex);
    }
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the default environment.
   *
   * @param strings List of regular expressions to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException if any expression cannot be parsed
   */
  public static TokenSequencePattern compile(String... strings) {
    return compile(DEFAULT_ENV, strings);
  }

  /**
   * Compiles a sequence of regular expressions into a TokenSequencePattern
   * using the specified environment. Each string is parsed on its own and the
   * resulting expressions are combined, in order, into one sequence expression.
   * The pattern string of the result is {@code StringUtils.join(strings)}.
   *
   * @param env Environment to use
   * @param strings List of regular expressions to be compiled
   * @return Compiled TokenSequencePattern
   * @throws RuntimeException wrapping any exception raised while parsing
   */
  public static TokenSequencePattern compile(Env env, String... strings) {
    try {
      List<SequencePattern.PatternExpr> patterns = new ArrayList<>();
      for (String string : strings) {
        // TODO: Check token sequence parser?
        SequencePattern.PatternExpr pattern = env.parser.parseSequence(env, string);
        patterns.add(pattern);
      }
      SequencePattern.PatternExpr nodeSequencePattern = new SequencePattern.SequencePatternExpr(patterns);
      return new TokenSequencePattern(StringUtils.join(strings), nodeSequencePattern);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Compiles a PatternExpr into a TokenSequencePattern.
   * The resulting pattern has a {@code null} pattern string.
   *
   * @param nodeSequencePattern A sequence pattern expression (before translation into a NFA)
   * @return Compiled TokenSequencePattern
   */
  public static TokenSequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern) {
    return new TokenSequencePattern(null, nodeSequencePattern);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  @Override
  public TokenSequenceMatcher getMatcher(List<? extends CoreMap> tokens) {
    return new TokenSequenceMatcher(this, tokens);
  }

  /**
   * Returns a TokenSequenceMatcher that can be used to match this pattern
   * against the specified list of tokens. Equivalent to {@link #getMatcher(List)}.
   *
   * @param tokens List of tokens to match against
   * @return TokenSequenceMatcher
   */
  public TokenSequenceMatcher matcher(List<? extends CoreMap> tokens) {
    return getMatcher(tokens);
  }

  /**
   * Returns a String representation of the TokenSequencePattern.
   *
   * @return The value of {@code pattern()} for this object
   */
  @Override
  public String toString() {
    return this.pattern();
  }

  /**
   * Creates a multi-pattern matcher for matching across multiple TokensRegex patterns.
   *
   * @param patterns Collection of input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(Collection<TokenSequencePattern> patterns) {
    return new MultiPatternMatcher<>(
        new MultiPatternMatcher.BasicSequencePatternTrigger<>(
            new CoreMapNodePatternTrigger(patterns)
        ), patterns);
  }

  /**
   * Creates a multi-pattern matcher for matching across multiple TokensRegex patterns.
   *
   * @param patterns input patterns
   * @return a MultiPatternMatcher
   */
  public static MultiPatternMatcher<CoreMap> getMultiPatternMatcher(TokenSequencePattern... patterns) {
    return new MultiPatternMatcher<>(
        new MultiPatternMatcher.BasicSequencePatternTrigger<>(
            new CoreMapNodePatternTrigger(patterns)
        ), patterns);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy