All Downloads are FREE. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.ling.tokensregex.TokenSequencePattern Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.ling.tokensregex;

import java.util.*;
import java.util.stream.Collectors;

import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.util.*;

/**
 * Token Sequence Pattern for regular expressions over sequences of tokens (each represented as a {@code CoreMap}).
 * Sequences over tokens can be matched like strings.
 * 

* To use: *

*
{@code
 *   TokenSequencePattern p = TokenSequencePattern.compile("....");
 *   TokenSequenceMatcher m = p.getMatcher(tokens);
 *   while (m.find()) ....
 * }
* *

* Supports the following: *

    *
  • Concatenation: {@code X Y}
  • *
  • Or: {@code X | Y}
  • *
  • And: {@code X & Y}
  • *
  • Groups: *
      *
    • capturing: {@code (X)} (with numeric group id)
    • *
    • capturing: {@code (?$var X)} (with group name "$var")
    • *
    • noncapturing: {@code (?:X)}
    • *
    * Capturing groups can be retrieved with group id or group variable, as matched string * ({@code m.group()}) or list of tokens ({@code m.groupNodes()}). *
      *
    • To retrieve group using id: {@code m.group(id)} or {@code m.groupNodes(id)} *
      NOTE: Capturing groups are indexed from left to right, starting at one. Group zero is the entire matched sequence. *
    • *
    • To retrieve group using bound variable name: {@code m.group("$var")} or {@code m.groupNodes("$var")} *
    • *
    * See {@link SequenceMatchResult} for more accessor functions to retrieve matches. *
  • *
  • Greedy Quantifiers: {@code X+, X?, X*, X{n,m}, X{n}, X{n,}}
  • *
  • Reluctant Quantifiers: {@code X+?, X??, X*?, X{n,m}?, X{n}?, X{n,}?}
  • *
  • Back references: {@code \captureid}
  • *
  • Value binding for groups: {@code [pattern] => [value]}. * Value for matched expression can be accessed using {@code m.groupValue()} *
    Example: {@code ( one => 1 | two => 2 | three => 3 | ...)} *
  • *
* *

* Individual tokens are marked by {@code "[" TOKEN_EXPR "]" } *
Possible {@code TOKEN_EXPR}: *

*
    *
  • All specified token attributes match: *
    For Strings: * {@code { lemma:/.../; tag:"NNP" } } = attributes that need to all match. * If only one attribute, the {} can be dropped. *
    See {@link edu.stanford.nlp.ling.AnnotationLookup AnnotationLookup} for a list of predefined token attribute names. *
    Additional attributes can be bound using the environment (see below). *
    NOTE: {@code /.../} used for regular expressions, * {@code "..."} for exact string matches *
    For Numbers: * {@code { word>=2 }} *
    NOTE: Relation can be {@code ">=", "<=", ">", "<",} or {@code "=="} *
    Others: * {@code { word::IS_NUM } , { word::IS_NIL } } or * {@code { word::NOT_EXISTS }, { word::NOT_NIL } } or {@code { word::EXISTS } } *
  • *
  • Short hand for just word/text match: * {@code /.../ } or {@code "..." } *
  • *
  • * Negation: * {@code !{...} } *
  • *
  • * Conjunction or Disjunction: * {@code {...} & {...} } or {@code {...} | {...} } *
  • *
* *

* Special tokens: * Any token: {@code []} *

* *

* String pattern match across multiple tokens: * {@code (?m){min,max} /pattern/} *

* *

* Special expressions: indicated by double braces: {@code {{ expr }}} *
See {@link edu.stanford.nlp.ling.tokensregex.types.Expressions} for syntax. *

* *

* Binding of variables for use in compiling patterns: *

*
    *
  1. Use {@code Env env = TokenSequencePattern.getNewEnv()} to create a new environment for binding
  2. *
  3. Bind string to attribute key (Class) lookup: * {@code env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class);} *
  4. *
  5. Bind patterns / strings for compiling patterns *
    {@code
     *    // Bind string for later compilation using: compile("/it/ /was/ $RELDAY");
     *    env.bind("$RELDAY", "/today|yesterday|tomorrow|tonight|tonite/");
     *    // Bind pre-compiled patter for later compilation using: compile("/it/ /was/ $RELDAY");
     *    env.bind("$RELDAY", TokenSequencePattern.compile(env, "/today|yesterday|tomorrow|tonight|tonite/"));
     *    }
    *
  6. *
  7. Bind custom node pattern functions (currently no arguments are supported) *
    {@code
     *    // Bind node pattern so we can do patterns like: compile("... temporal::IS_TIMEX_DATE ...");
     *    //   (TimexTypeMatchNodePattern is a NodePattern that implements some custom logic)
     *    env.bind("::IS_TIMEX_DATE", new TimexTypeMatchNodePattern(SUTime.TimexType.DATE));
     *   }
    *
  8. *
* *

* Actions (partially implemented) *

*
    *
  • {@code pattern ==> action}
  • *
  • Supported action: * {@code &annotate( { ner="DATE" } ) }
  • *
  • Not applied automatically, associated with a pattern.
  • *
  • To apply, call {@code pattern.getAction().apply(match, groupid)}
  • *
* * @author Angel Chang * @see TokenSequenceMatcher */ public class TokenSequencePattern extends SequencePattern { private static final long serialVersionUID = -4760710834202406916L; public static final TokenSequencePattern ANY_NODE_PATTERN = TokenSequencePattern.compile(ANY_NODE_PATTERN_EXPR); private static final Env DEFAULT_ENV = getNewEnv(); public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) { super(patternStr, nodeSequencePattern); } public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern, SequenceMatchAction action) { super(patternStr, nodeSequencePattern, action); } public static Env getNewEnv() { Env env = new Env(new TokenSequenceParser()); env.initDefaultBindings(); return env; } /** * Compiles a regular expression over tokens into a TokenSequencePattern * using the default environment. * * @param string Regular expression to be compiled * @return Compiled TokenSequencePattern */ public static TokenSequencePattern compile(String string) { return compile(DEFAULT_ENV, string); } /** * Compiles a regular expression over tokens into a TokenSequencePattern * using the specified environment. * * @param env Environment to use * @param string Regular expression to be compiled * @return Compiled TokenSequencePattern */ public static TokenSequencePattern compile(Env env, String string) { try { // SequencePattern.PatternExpr nodeSequencePattern = TokenSequenceParser.parseSequence(env, string); // return new TokenSequencePattern(string, nodeSequencePattern); // TODO: Check token sequence parser? Pair> p = env.parser.parseSequenceWithAction(env, string); return new TokenSequencePattern(string, p.first(), p.second()); } catch (Exception ex) { throw new RuntimeException("Error when parsing " + string, ex); } } /** * Compiles a sequence of regular expressions into a TokenSequencePattern * using the default environment. 
* * @param strings List of regular expression to be compiled * @return Compiled TokenSequencePattern */ public static TokenSequencePattern compile(String... strings) { return compile(DEFAULT_ENV, strings); } /** * Compiles a sequence of regular expressions into a TokenSequencePattern * using the specified environment. * * @param env Environment to use * @param strings List of regular expression to be compiled * @return Compiled TokenSequencePattern */ public static TokenSequencePattern compile(Env env, String... strings) { try { List patterns = new ArrayList<>(); for (String string:strings) { // TODO: Check token sequence parser? SequencePattern.PatternExpr pattern = env.parser.parseSequence(env, string); patterns.add(pattern); } SequencePattern.PatternExpr nodeSequencePattern = new SequencePattern.SequencePatternExpr(patterns); return new TokenSequencePattern(StringUtils.join(strings), nodeSequencePattern); } catch (Exception ex) { throw new RuntimeException(ex); } } /** * Compiles a PatternExpr into a TokenSequencePattern. * * @param nodeSequencePattern A sequence pattern expression (before translation into a NFA) * @return Compiled TokenSequencePattern */ public static TokenSequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern) { return new TokenSequencePattern(null, nodeSequencePattern); } /** * Returns a TokenSequenceMatcher that can be used to match this pattern * against the specified list of tokens. * * @param tokens List of tokens to match against * @return TokenSequenceMatcher */ @Override public TokenSequenceMatcher getMatcher(List tokens) { return new TokenSequenceMatcher(this, tokens); } /** * Returns a TokenSequenceMatcher that can be used to match this pattern * against the specified list of tokens. * * @param tokens List of tokens to match against * @return TokenSequenceMatcher */ public TokenSequenceMatcher matcher(List tokens) { return getMatcher(tokens); } /** Returns a String representation of the TokenSequencePattern. 
* * @return A String representation of the TokenSequencePattern */ @Override public String toString(){ return this.pattern(); } /** * Create a multi-pattern matcher for matching across multiple TokensRegex patterns. * * @param patterns Collection of input patterns * @return A MultiPatternMatcher */ public static MultiPatternMatcher getMultiPatternMatcher(Collection patterns) { return new MultiPatternMatcher<>( new MultiPatternMatcher.BasicSequencePatternTrigger<>(new CoreMapNodePatternTrigger(patterns)), patterns); } /** * Create a multi-pattern matcher for matching across multiple TokensRegex patterns. * * @param patterns Input patterns * @return A MultiPatternMatcher */ public static MultiPatternMatcher getMultiPatternMatcher(TokenSequencePattern... patterns) { return new MultiPatternMatcher<>( new MultiPatternMatcher.BasicSequencePatternTrigger<>(new CoreMapNodePatternTrigger(patterns)), patterns); } /** * Create a multi-pattern matcher for matching across multiple TokensRegex patterns from Strings. * * @param patterns Input patterns in String format * @return A MultiPatternMatcher */ public static MultiPatternMatcher getMultiPatternMatcher(String... patterns) { List tokenSequencePatterns = Arrays.stream(patterns).map(TokenSequencePattern::compile) .collect(Collectors.toList()); return TokenSequencePattern.getMultiPatternMatcher(tokenSequencePatterns); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy