// edu.stanford.nlp.ling.tokensregex.TokenSequencePattern (Maven / Gradle / Ivy)
package edu.stanford.nlp.ling.tokensregex;
import java.util.*;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.util.*;
/**
* Token Sequence Pattern for regular expressions over sequences of tokens (each represented as a {@code CoreMap}).
* Sequences over tokens can be matched like strings.
*
* To use:
* <pre>{@code
* TokenSequencePattern p = TokenSequencePattern.compile("....");
* TokenSequenceMatcher m = p.getMatcher(tokens);
* while (m.find()) ....
* }</pre>
*
*
* Supports the following:
*
* - Concatenation: {@code X Y}
* - Or: {@code X | Y}
* - And: {@code X & Y}
* - Groups:
*
* - capturing: {@code (X)} (with numeric group id)
* - capturing: {@code (?$var X)} (with group name "$var")
* - noncapturing: {@code (?:X)}
*
* Capturing groups can be retrieved with group id or group variable, as matched string
* ({@code m.group()}) or list of tokens ({@code m.groupNodes()}).
*
* - To retrieve group using id: {@code m.group(id)} or {@code m.groupNodes(id)}
* NOTE: Capturing groups are indexed from left to right, starting at one. Group zero is the entire matched sequence.
*
* - To retrieve group using bound variable name: {@code m.group("$var")} or {@code m.groupNodes("$var")}
*
*
* See {@link SequenceMatchResult} for more accessor functions to retrieve matches.
*
* - Greedy Quantifiers: {@code X+, X?, X*, X{n,m}, X{n}, X{n,}}
* - Reluctant Quantifiers: {@code X+?, X??, X*?, X{n,m}?, X{n}?, X{n,}?}
* - Back references: {@code \captureid}
* - Value binding for groups: {@code [pattern] => [value]}.
* Value for matched expression can be accessed using {@code m.groupValue()}
*
* Example: {@code ( one => 1 | two => 2 | three => 3 | ...)}
*
*
*
*
* Individual tokens are marked by "[" TOKEN_EXPR "]"
*
* Possible {@code TOKEN_EXPR}:
*
*
* - All specified token attributes match:
*
* For Strings:
* {@code { lemma:/.../; tag:"NNP" }} = attributes that need to all match.
* If only one attribute, the {} can be dropped.
*
* See {@link edu.stanford.nlp.ling.AnnotationLookup AnnotationLookup} for a list of predefined token attribute names.
*
* Additional attributes can be bound using the environment (see below).
*
* NOTE: {@code /.../} is used for regular expressions,
* {@code "..."} for exact string matches
*
* For Numbers:
* {@code { word>=2 }}
*
* NOTE: Relation can be {@code ">=", "<=", ">", "<",} or {@code "=="}
*
* Others:
* {@code { word::IS_NUM } , { word::IS_NIL }} or
* {@code { word::NOT_EXISTS }, { word::NOT_NIL }} or {@code { word::EXISTS }}
*
* - Short hand for just word/text match:
* {@code /.../} or {@code "..."}
*
* - Negation:
* {@code !{...}}
*
* - Conjunction or Disjunction:
* {@code {...} & {...}} or {@code {...} | {...}}
*
*
*
*
* Special tokens:
* Any token: []
*
*
*
* String pattern match across multiple tokens:
* (?m){min,max} /pattern/
*
*
*
* Special expressions: indicated by double braces: {@code {{ expr }}}
*
* See {@link edu.stanford.nlp.ling.tokensregex.types.Expressions} for syntax.
*
*
*
* Binding of variables for use in compiling patterns:
*
*
* - Use {@code Env env = TokenSequencePattern.getNewEnv()} to create a new environment for binding
* - Bind string to attribute key (Class) lookup:
* {@code env.bind("numtype", CoreAnnotations.NumericTypeAnnotation.class);}
*
* - Bind patterns / strings for compiling patterns
*
* // Bind string for later compilation using: compile("/it/ /was/ $RELDAY");
* env.bind("$RELDAY", "/today|yesterday|tomorrow|tonight|tonite/");
* // Bind pre-compiled pattern for later compilation using: compile("/it/ /was/ $RELDAY");
* env.bind("$RELDAY", TokenSequencePattern.compile(env, "/today|yesterday|tomorrow|tonight|tonite/"));
*
*
* - Bind custom node pattern functions (currently no arguments are supported)
*
* // Bind node pattern so we can do patterns like: compile("... temporal::IS_TIMEX_DATE ...");
* // (TimexTypeMatchNodePattern is a NodePattern that implements some custom logic)
* env.bind("::IS_TIMEX_DATE", new TimexTypeMatchNodePattern(SUTime.TimexType.DATE));
*
*
*
*
*
* Actions (partially implemented)
*
*
* - {@code pattern ==> action}
* - Supported action: {@code &annotate( { ner="DATE" } )}
* - Not applied automatically, associated with a pattern.
* - To apply, call {@code pattern.getAction().apply(match, groupid)}
*
*
* @author Angel Chang
* @see TokenSequenceMatcher
*/
public class TokenSequencePattern extends SequencePattern {
private static final long serialVersionUID = -4760710834202406916L;
public static final TokenSequencePattern ANY_NODE_PATTERN = TokenSequencePattern.compile(ANY_NODE_PATTERN_EXPR);
private static final Env DEFAULT_ENV = getNewEnv();
public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) {
super(patternStr, nodeSequencePattern);
}
public TokenSequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern,
SequenceMatchAction action) {
super(patternStr, nodeSequencePattern, action);
}
public static Env getNewEnv() {
Env env = new Env(new TokenSequenceParser());
env.initDefaultBindings();
return env;
}
/**
* Compiles a regular expressions over tokens into a TokenSequencePattern
* using the default environment.
*
* @param string Regular expression to be compiled
* @return Compiled TokenSequencePattern
*/
public static TokenSequencePattern compile(String string)
{
return compile(DEFAULT_ENV, string);
}
/**
* Compiles a regular expression over tokens into a TokenSequencePattern
* using the specified environment.
*
* @param env Environment to use
* @param string Regular expression to be compiled
* @return Compiled TokenSequencePattern
*/
public static TokenSequencePattern compile(Env env, String string)
{
try {
// SequencePattern.PatternExpr nodeSequencePattern = TokenSequenceParser.parseSequence(env, string);
// return new TokenSequencePattern(string, nodeSequencePattern);
// TODO: Check token sequence parser?
Pair> p = env.parser.parseSequenceWithAction(env, string);
return new TokenSequencePattern(string, p.first(), p.second());
} catch (Exception ex) {
throw new RuntimeException("When parsing " + string + "\t\t" + ex);
}
}
/**
* Compiles a sequence of regular expressions into a TokenSequencePattern
* using the default environment.
*
* @param strings List of regular expression to be compiled
* @return Compiled TokenSequencePattern
*/
public static TokenSequencePattern compile(String... strings)
{
return compile(DEFAULT_ENV, strings);
}
/**
* Compiles a sequence of regular expressions into a TokenSequencePattern
* using the specified environment.
*
* @param env Environment to use
* @param strings List of regular expression to be compiled
* @return Compiled TokenSequencePattern
*/
public static TokenSequencePattern compile(Env env, String... strings)
{
try {
List patterns = new ArrayList<>();
for (String string:strings) {
// TODO: Check token sequence parser?
SequencePattern.PatternExpr pattern = env.parser.parseSequence(env, string);
patterns.add(pattern);
}
SequencePattern.PatternExpr nodeSequencePattern = new SequencePattern.SequencePatternExpr(patterns);
return new TokenSequencePattern(StringUtils.join(strings), nodeSequencePattern);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
/**
* Compiles a PatternExpr into a TokenSequencePattern.
*
* @param nodeSequencePattern A sequence pattern expression (before translation into a NFA)
* @return Compiled TokenSequencePattern
*/
public static TokenSequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern)
{
return new TokenSequencePattern(null, nodeSequencePattern);
}
/**
* Returns a TokenSequenceMatcher that can be used to match this pattern
* against the specified list of tokens.
*
* @param tokens List of tokens to match against
* @return TokenSequenceMatcher
*/
@Override
public TokenSequenceMatcher getMatcher(List extends CoreMap> tokens) {
return new TokenSequenceMatcher(this, tokens);
}
/**
* Returns a TokenSequenceMatcher that can be used to match this pattern
* against the specified list of tokens.
*
* @param tokens List of tokens to match against
* @return TokenSequenceMatcher
*/
public TokenSequenceMatcher matcher(List extends CoreMap> tokens) {
return getMatcher(tokens);
}
/** Returns a String representation of the TokenSequencePattern.
*
* @return A String representation of the TokenSequencePattern
*/
@Override
public String toString(){
return this.pattern();
}
/**
* Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
* @param patterns Collection of input patterns
* @return a MultiPatternMatcher
*/
public static MultiPatternMatcher getMultiPatternMatcher(Collection patterns) {
return new MultiPatternMatcher<>(
new MultiPatternMatcher.BasicSequencePatternTrigger<>(
new CoreMapNodePatternTrigger(patterns)
), patterns);
}
/**
* Create a multi-pattern matcher for matching across multiple TokensRegex patterns.
* @param patterns input patterns
* @return a MultiPatternMatcher
*/
public static MultiPatternMatcher getMultiPatternMatcher(TokenSequencePattern... patterns) {
return new MultiPatternMatcher<>(
new MultiPatternMatcher.BasicSequencePatternTrigger<>(
new CoreMapNodePatternTrigger(patterns)
), patterns);
}
}