edu.stanford.nlp.ling.tokensregex.SequenceMatchRules Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.ling.tokensregex.types.AssignableExpression;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;
import java.io.Serializable;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Rules for matching sequences using regular expressions
*
* There are 2 types of rules:
*
* - Assignment rules which assign a value to a variable for later use.
*
* - Extraction rules which specify how regular expression patterns are to be matched against text,
* which matched text expressions are to be extracted, and what value to assign to the matched expression.
*
*
*
* NOTE: {@code #} or {@code //} can be used to indicate one-line comments.
*
* Assignment Rules are used to assign values to variables.
* The basic format is: variable = value
*
*
* Variable Names:
*
* - Variable names should follow the pattern [A-Za-z_][A-Za-z0-9_]*
* - Variable names for use in regular expressions (to be expanded later) must start with
$
*
*
*
* Value Types:
*
* Type Format Example Description
* BOOLEAN
TRUE | FALSE
TRUE
* STRING
"..."
"red"
* INTEGER
[+-]\d+
1500
* LONG
[+-]\d+L
1500000000000L
* DOUBLE
[+-]\d*\.\d+
6.98
* REGEX
/.../
/[Aa]pril/
* String regular expression {@link Pattern}
* TOKENS_REGEX
( [...] [...] ... )
( /up/ /to/ /4/ /months/ )
* Tokens regular expression {@link TokenSequencePattern}
* LIST
( [item1] , [item2], ... )
("red", "blue", "yellow" )
*
*
*
*
* Some typical uses and examples for assignment rules include:
*
* - Assignment of value to variables for use in later rules
* - Binding of text key to annotation key (as {@code Class}).
*
* tokens = { type: "CLASS", value: "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation" }
*
*
* - Defining regular expressions macros to be embedded in other regular expressions
*
* $SEASON = "/spring|summer|fall|autumn|winter/"
* $NUM = ( [ { numcomptype:NUMBER } ] )
*
*
* - Setting default environment variables.
* Rules are applied with respect to an environment ({@link Env}), which can be accessed using the variable
ENV
.
* Members of the Environment can be set as needed.
*
* # Set default parameters to be used when reading rules
* ENV.defaults["ruleType"] = "tokens"
* # Set default string pattern flags (to case-insensitive)
* ENV.defaultStringPatternFlags = 2
* # Specifies that the result should go into the tokens
key (as defined above).
* ENV.defaultResultAnnotationKey = tokens
*
*
* - Defining options
*
*
*
* Predefined values are:
*
* Variable Type Description
* ENV
{@link Env} The environment with respect to which the rules are applied.
* TRUE
BOOLEAN
The Boolean
value true
.
* FALSE
BOOLEAN
The Boolean
value false
.
* NIL
The null
value.
* tags
Class
The annotation key {@link edu.stanford.nlp.ling.tokensregex.types.Tags.TagsAnnotation}.
*
*
* Extraction Rules specify how regular expression patterns are to be matched against text.
* See {@link CoreMapExpressionExtractor} for more information on the types of the rules, and in what sequence the rules are applied.
* A basic rule can be specified using the following template:
*
{
* # Type of the rule
* ruleType: "tokens" | "text" | "composite" | "filter",
* # Pattern to match against
* pattern: ( <TokenSequencePattern> ) | /<TextPattern>/,
* # Resulting value to go into the resulting annotation
* result: ...
*
* # More fields following...
* }
*
* Example:
*
* {
* ruleType: "tokens",
* pattern: ( /one/ ),
* result: 1
* }
*
*
* Extraction rule fields (most fields are optional):
*
* Field Values Example Description
* ruleType
"tokens" | "text" | "composite" | "filter"
* tokens
Type of the rule (required).
* pattern
<Token Sequence Pattern> = (...) | <Text Pattern> = /.../
* ( /winter/ /of/ $YEAR )
Pattern to match against.
* See {@link TokenSequencePattern} and {@link Pattern} for
* how to specify patterns over tokens and strings (required).
* action
<Action List> = (...)
* ( Annotate($0, ner, "DATE") )
List of actions to apply when the pattern is triggered.
* Each action is a {@link Expressions TokensRegex Expression}
* result
<Expression>
*
Resulting value to go into the resulting annotation. See {@link Expressions} for how to specify the result.
* name
STRING
*
Name to identify the extraction rule.
* stage
INTEGER
*
Stage at which the rule is to be applied. Rules are grouped in stages, which are applied from lowest to highest.
* active
Boolean
*
Whether this rule is enabled (active) or not (default true).
* priority
DOUBLE
*
Priority of rule. Within a stage, matches from higher priority rules are preferred.
* weight
DOUBLE
*
Weight of rule (not currently used).
* over
CLASS
*
Annotation field to check pattern against.
* matchFindType
FIND_NONOVERLAPPING | FIND_ALL
*
Whether to find all matched expressions or just the non-overlapping ones (default {@code FIND_NONOVERLAPPING}).
* matchWithResults
Boolean
*
Whether results of the matches should be returned (default false).
* Set to true to access captured groups of embedded regular expressions.
* matchedExpressionGroup
Integer
*
What group should be treated as the matched expression group (default 0).
*
*
* @author Angel Chang
* @see CoreMapExpressionExtractor
* @see TokenSequencePattern
*/
public class SequenceMatchRules {
/**
 * A sequence match rule.
 * <p>
 * This is a marker interface: it declares no methods. Within this file it is
 * implemented by {@code AssignmentRule} and {@code AnnotationExtractRule}.
 */
public static interface Rule {
}
/**
 * Rule that specifies what value to assign to a variable.
 * <p>
 * The assignment is not performed at construction time; the constructor only
 * builds the assignment expression, which is run by {@link #evaluate(Env)}.
 */
public static class AssignmentRule implements Rule {
  /** Expression produced by {@code varExpr.assign(value)}; evaluating it performs the assignment. */
  Expression expr;

  /**
   * @param varExpr Expression denoting the assignment target
   * @param value Expression whose value is to be assigned
   */
  public AssignmentRule(AssignableExpression varExpr, Expression value) {
    this.expr = varExpr.assign(value);
  }

  /**
   * Evaluates the stored assignment expression against the given environment.
   *
   * @param env Environment to evaluate against
   */
  public void evaluate(Env env) {
    this.expr.evaluate(env);
  }
}
/**
* Rule that specifies how to extract sequence of MatchedExpression from an annotation (CoreMap).
* @param Output type (MatchedExpression)
*/
public static class AnnotationExtractRule implements Rule, ExtractRule, Predicate, Serializable {
/** Name of the rule */
public String name;
/** Stage in which this rule should be applied with respect to others */
public int stage = 1;
/** Priority in which this rule should be applied with respect to others */
public double priority;
/** Weight given to the rule (how likely is this rule to fire) */
public double weight;
/** Annotation field to apply rule over: text or tokens or numerizedtokens */
public Class annotationField;
public Class tokensAnnotationField;
/** Annotation field(s) on individual tokens to put new annotation */
public List tokensResultAnnotationField;
/** Annotation field(s) to put new annotation */
public List resultAnnotationField;
/** Annotation field for child/nested annotations */
public Class resultNestedAnnotationField;
public SequenceMatcher.FindType matchFindType;
public int matchedExpressionGroup;
public boolean matchWithResults;
// TODO: Combine ruleType and isComposite
/** Type of rule to apply: token string match, pattern string match */
public String ruleType;
public boolean isComposite;
public boolean includeNested = true; // TODO: Get parameter from somewhere....
public boolean active = true;
/** Actual rule performing the extraction (converting annotation to MatchedExpression) */
public ExtractRule extractRule;
public Predicate filterRule;
public void update(Env env, Map attributes) {
for (String key:attributes.keySet()) {
Object obj = attributes.get(key);
switch (key) {
case "name":
name = (String) Expressions.asObject(env, obj);
break;
case "priority":
priority = ((Number) Expressions.asObject(env, obj)).doubleValue();
break;
case "stage":
stage = ((Number) Expressions.asObject(env, obj)).intValue();
break;
case "weight":
weight = ((Number) Expressions.asObject(env, obj)).doubleValue();
break;
case "over":
Object annoKey = Expressions.asObject(env, obj);
if (annoKey instanceof Class) {
annotationField = (Class) annoKey;
} else if (annoKey instanceof String) {
annotationField = EnvLookup.lookupAnnotationKey(env, (String) annoKey);
} else if (annotationField == null) {
annotationField = CoreMap.class;
} else {
throw new IllegalArgumentException("Invalid annotation key " + annoKey);
}
break;
case "active":
active = (Boolean) Expressions.asObject(env, obj);
break;
case "ruleType":
ruleType = (String) Expressions.asObject(env, obj);
break;
case "matchFindType":
matchFindType = SequenceMatcher.FindType.valueOf((String) Expressions.asObject(env, obj));
break;
case "matchWithResults":
matchWithResults = ((Boolean) Expressions.asObject(env, obj)).booleanValue();
break;
case "matchedExpressionGroup":
matchedExpressionGroup = ((Number) Expressions.asObject(env, obj)).intValue();
break;
}
}
}
public boolean extract(S in, List out) {
return extractRule.extract(in, out);
}
public boolean test(T obj) {
return filterRule.test(obj);
}
}
/**
 * Creates an assignment rule and evaluates it right away against the given environment.
 *
 * @param env Environment in which the assignment is evaluated
 * @param var Assignment target
 * @param result Expression to assign
 * @return the (already evaluated) assignment rule
 */
public static AssignmentRule createAssignmentRule(Env env, AssignableExpression var, Expression result) {
  final AssignmentRule rule = new AssignmentRule(var, result);
  rule.evaluate(env);
  return rule;
}
/**
 * Creates an extraction rule from a composite value.
 * The composite value is simplified (without type conversion) and each of its attributes
 * is copied into an attribute map that is handed to {@code createExtractionRule}.
 *
 * @param env Environment used to simplify the value and to create the rule
 * @param cv Composite value describing the rule
 * @return the created rule
 * @throws IllegalArgumentException if the rule type is unknown
 */
public static Rule createRule(Env env, Expressions.CompositeValue cv) {
  Expressions.CompositeValue simplified = cv.simplifyNoTypeConversion(env);
  Map<String, Object> attributes = new HashMap<>();
  for (String attributeName : simplified.getAttributes()) {
    attributes.put(attributeName, simplified.getExpression(attributeName));
  }
  return createExtractionRule(env, attributes);
}
/**
 * Creates an extraction rule from a map of attributes.
 * The rule type is read from the "ruleType" attribute, falling back to the environment
 * default of the same name when the attribute is absent.
 *
 * @param env Environment (may be null, in which case no defaults are consulted)
 * @param attributes Attribute name to attribute value
 * @return the created rule
 * @throws IllegalArgumentException if no creator is known for the rule type
 */
protected static AnnotationExtractRule createExtractionRule(Env env, Map<String, Object> attributes) {
  String ruleType = (String) Expressions.asObject(env, attributes.get("ruleType"));
  if (ruleType == null && env != null) {
    ruleType = (String) env.getDefaults().get("ruleType");
  }
  AnnotationExtractRuleCreator ruleCreator = lookupExtractRuleCreator(env, ruleType);
  if (ruleCreator == null) {
    throw new IllegalArgumentException("Unknown rule type: " + ruleType);
  }
  return ruleCreator.create(env, attributes);
}
/**
 * Creates an extraction rule from an explicit rule type, pattern and result expression.
 * When {@code ruleType} is null the environment default "ruleType" is used, if an environment is given.
 *
 * @param env Environment (may be null, in which case no defaults are consulted)
 * @param ruleType Rule type name, or null to use the default
 * @param pattern Pattern to match; stored under the "pattern" attribute
 * @param result Result expression; stored under the "result" attribute
 * @return the created rule
 * @throws IllegalArgumentException if no creator is known for the rule type
 */
public static AnnotationExtractRule createExtractionRule(Env env, String ruleType, Object pattern, Expression result) {
  if (ruleType == null && env != null) {
    ruleType = (String) env.getDefaults().get("ruleType");
  }
  AnnotationExtractRuleCreator ruleCreator = lookupExtractRuleCreator(env, ruleType);
  if (ruleCreator == null) {
    throw new IllegalArgumentException("Unknown rule type: " + ruleType);
  }
  Map<String, Object> attributes = new HashMap<>();
  attributes.put("ruleType", ruleType);
  attributes.put("pattern", pattern);
  attributes.put("result", result);
  return ruleCreator.create(env, attributes);
}
/** Rule type name for composite rules. */
public static final String COMPOSITE_RULE_TYPE = "composite";
/** Rule type name for rules matching a pattern over tokens. */
public static final String TOKEN_PATTERN_RULE_TYPE = "tokens";
/** Rule type name for rules matching a pattern over text. */
public static final String TEXT_PATTERN_RULE_TYPE = "text";
/** Rule type name for filter rules. */
public static final String FILTER_RULE_TYPE = "filter";

public static final TokenPatternExtractRuleCreator TOKEN_PATTERN_EXTRACT_RULE_CREATOR = new TokenPatternExtractRuleCreator();
public static final CompositeExtractRuleCreator COMPOSITE_EXTRACT_RULE_CREATOR = new CompositeExtractRuleCreator();
public static final TextPatternExtractRuleCreator TEXT_PATTERN_EXTRACT_RULE_CREATOR = new TextPatternExtractRuleCreator();
/** Creator used when no rule type is specified. */
public static final AnnotationExtractRuleCreator DEFAULT_EXTRACT_RULE_CREATOR = TOKEN_PATTERN_EXTRACT_RULE_CREATOR;

/** Built-in mapping from rule type name to the creator that builds rules of that type. */
static final Map<String, AnnotationExtractRuleCreator> registeredRuleTypes = new HashMap<>();
static {
  registeredRuleTypes.put(TOKEN_PATTERN_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR);
  registeredRuleTypes.put(COMPOSITE_RULE_TYPE, COMPOSITE_EXTRACT_RULE_CREATOR);
  registeredRuleTypes.put(TEXT_PATTERN_RULE_TYPE, TEXT_PATTERN_EXTRACT_RULE_CREATOR);
  // Filter rules are built by the token pattern creator.
  registeredRuleTypes.put(FILTER_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR);
}
/**
 * Finds the creator responsible for the given rule type.
 * A creator bound in the environment under the rule type name takes precedence;
 * otherwise a null rule type yields the default creator and any other name is
 * looked up among the registered rule types.
 *
 * @param env Environment to consult first (may be null)
 * @param ruleType Rule type name (may be null)
 * @return the creator, or null if the rule type is not registered
 */
protected static AnnotationExtractRuleCreator lookupExtractRuleCreator(Env env, String ruleType) {
  if (env != null) {
    final Object bound = env.get(ruleType);
    // instanceof is false for null, so no separate null test is needed
    if (bound instanceof AnnotationExtractRuleCreator) {
      return (AnnotationExtractRuleCreator) bound;
    }
  }
  if (ruleType == null) {
    return DEFAULT_EXTRACT_RULE_CREATOR;
  }
  return registeredRuleTypes.get(ruleType);
}
/**
 * Creates a token pattern rule using the shared token pattern rule creator.
 *
 * @param env Environment
 * @param expr Pattern expression to match over tokens
 * @param result Result expression
 * @return the created rule
 */
public static AnnotationExtractRule createTokenPatternRule(Env env, SequencePattern.PatternExpr expr, Expression result) {
  final AnnotationExtractRule rule = TOKEN_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result);
  return rule;
}
/**
 * Creates a text pattern rule using the shared text pattern rule creator.
 *
 * @param env Environment
 * @param expr String regular expression to match over text
 * @param result Result expression
 * @return the created rule
 */
public static AnnotationExtractRule createTextPatternRule(Env env, String expr, Expression result) {
  final AnnotationExtractRule rule = TEXT_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result);
  return rule;
}
/**
 * Base creator for annotation extract rules.
 * It produces a rule populated with defaults only; the {@code extractRule} and
 * {@code filterRule} of the returned rule are not set here.
 */
public static class AnnotationExtractRuleCreator {
  /**
   * Creates a rule initialized from the defaults of the environment.
   *
   * @param env Environment to take defaults from (may be null)
   * @return a new rule with default annotation keys and, if {@code env} is non-null,
   *         the environment's default attributes applied
   */
  public AnnotationExtractRule create(Env env) {
    AnnotationExtractRule r = new AnnotationExtractRule();
    r.resultAnnotationField = EnvLookup.getDefaultResultAnnotationKey(env);
    r.resultNestedAnnotationField = EnvLookup.getDefaultNestedResultsAnnotationKey(env);
    r.tokensAnnotationField = EnvLookup.getDefaultTokensAnnotationKey(env);
    r.tokensResultAnnotationField = EnvLookup.getDefaultTokensResultAnnotationKey(env);
    if (env != null) {
      r.update(env, env.getDefaults());
    }
    return r;
  }

  /**
   * Creates a rule initialized from the environment defaults and then
   * overridden by the given attributes.
   *
   * @param env Environment to take defaults from (may be null)
   * @param attributes Attributes to apply on top of the defaults (may be null)
   * @return the new rule
   */
  public AnnotationExtractRule create(Env env, Map<String, Object> attributes) {
    // Get default annotation extract rule from env
    AnnotationExtractRule r = create(env);
    if (attributes != null) {
      r.update(env, attributes);
    }
    return r;
  }
}
/**
 * Builds a single-annotation extractor configured from a rule.
 * Name, annotation keys, priority, weight and the include-nested flag are copied from
 * the rule; the result annotation extractor and the token aggregators are taken from
 * the environment defaults via {@code EnvLookup}.
 *
 * @param env Environment supplying the defaults
 * @param r Rule to copy the configuration from
 * @return the configured extractor (its {@code valueExtractor} is left unset)
 */
public static MatchedExpression.SingleAnnotationExtractor createAnnotationExtractor(Env env, AnnotationExtractRule r) {
  final MatchedExpression.SingleAnnotationExtractor extractor = new MatchedExpression.SingleAnnotationExtractor();

  // Settings copied from the rule
  extractor.name = r.name;
  extractor.tokensAnnotationField = r.tokensAnnotationField;
  extractor.tokensResultAnnotationField = r.tokensResultAnnotationField;
  extractor.resultAnnotationField = r.resultAnnotationField;
  extractor.resultNestedAnnotationField = r.resultNestedAnnotationField;
  extractor.priority = r.priority;
  extractor.weight = r.weight;
  extractor.includeNested = r.includeNested;

  // Settings taken from the environment defaults
  extractor.resultAnnotationExtractor = EnvLookup.getDefaultResultAnnotationExtractor(env);
  extractor.tokensAggregators = EnvLookup.getDefaultTokensAggregators(env);

  return extractor;
}
public static class CompositeExtractRuleCreator extends AnnotationExtractRuleCreator {
protected void updateExtractRule(AnnotationExtractRule r,
Env env,
SequencePattern.PatternExpr expr,
Expression action,
Expression result)
{
TokenSequencePattern pattern = TokenSequencePattern.compile(expr);
updateExtractRule(r, env, pattern, action, result);
}
protected void updateExtractRule(AnnotationExtractRule r,
Env env,
TokenSequencePattern pattern,
Expression action,
Expression result)
{
MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r);
valueExtractor.valueExtractor =
new CoreMapFunctionApplier< List extends CoreMap>, Value>(
env, r.annotationField,
new SequencePatternExtractRule(
pattern,
new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults));
r.extractRule = new SequencePatternExtractRule(pattern,
new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults);
r.filterRule = new AnnotationMatchedFilter(valueExtractor);
}
protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, Expression result)
{
AnnotationExtractRule r = super.create(env, null);
r.isComposite = true;
if (r.annotationField == null) { r.annotationField = r.resultNestedAnnotationField; }
if (r.annotationField == null) { throw new IllegalArgumentException("Error creating composite rule: no annotation field"); }
r.ruleType = TOKEN_PATTERN_RULE_TYPE;
updateExtractRule(r, env, expr, null, result);
return r;
}
public AnnotationExtractRule create(Env env, Map attributes) {
AnnotationExtractRule r = super.create(env, attributes);
r.isComposite = true;
if (r.annotationField == null) { r.annotationField = r.resultNestedAnnotationField; }
if (r.annotationField == null) { throw new IllegalArgumentException("Error creating composite rule: no annotation field"); }
if (r.ruleType == null) { r.ruleType = TOKEN_PATTERN_RULE_TYPE; }
//SequencePattern.PatternExpr expr = (SequencePattern.PatternExpr) attributes.get("pattern");
TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, attributes.get("pattern"));
Expression action = Expressions.asExpression(env, attributes.get("action"));
Expression result = Expressions.asExpression(env, attributes.get("result"));
updateExtractRule(r, env, expr, action, result);
return r;
}
}
public static class TokenPatternExtractRuleCreator extends AnnotationExtractRuleCreator {
protected void updateExtractRule(AnnotationExtractRule r,
Env env,
SequencePattern.PatternExpr expr,
Expression action,
Expression result)
{
TokenSequencePattern pattern = TokenSequencePattern.compile(expr);
updateExtractRule(r, env, pattern, action, result);
}
protected void updateExtractRule(AnnotationExtractRule r,
Env env,
TokenSequencePattern pattern,
Expression action,
Expression result)
{
MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r);
if (r.annotationField != null && r.annotationField != CoreMap.class) {
valueExtractor.valueExtractor =
new CoreMapFunctionApplier< List extends CoreMap>, Value >(
env, r.annotationField,
new SequencePatternExtractRule(
pattern,
new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults));
r.extractRule = new CoreMapExtractRule< List extends CoreMap>, MatchedExpression >(
env, r.annotationField,
new SequencePatternExtractRule(pattern,
new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults));
} else {
valueExtractor.valueExtractor =
new CoreMapToListFunctionApplier< Value >(
env, new SequencePatternExtractRule(
pattern,
new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults));
r.extractRule = new CoreMapToListExtractRule< MatchedExpression >(
new SequencePatternExtractRule(pattern,
new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults));
}
r.filterRule = new AnnotationMatchedFilter(valueExtractor);
}
protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, Expression result)
{
AnnotationExtractRule r = super.create(env, null);
if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; }
r.ruleType = TOKEN_PATTERN_RULE_TYPE;
updateExtractRule(r, env, expr, null, result);
return r;
}
public AnnotationExtractRule create(Env env, Map attributes) {
AnnotationExtractRule r = super.create(env, attributes);
if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; }
if (r.ruleType == null) { r.ruleType = TOKEN_PATTERN_RULE_TYPE; }
//SequencePattern.PatternExpr expr = (SequencePattern.PatternExpr) attributes.get("pattern");
TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, attributes.get("pattern"));
Expression action = Expressions.asExpression(env, attributes.get("action"));
Expression result = Expressions.asExpression(env, attributes.get("result"));
updateExtractRule(r, env, expr, action, result);
return r;
}
}
public static class TextPatternExtractRuleCreator extends AnnotationExtractRuleCreator {
protected void updateExtractRule(AnnotationExtractRule r,
Env env,
String expr,
Expression action,
Expression result)
{
final MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r);
Pattern pattern = env.getStringPattern(expr);
valueExtractor.valueExtractor =
new CoreMapFunctionApplier< String, Value >(
env, r.annotationField,
new StringPatternExtractRule(
pattern,
new StringMatchResultExtractor(env, action, result)));
r.extractRule = new CoreMapExtractRule< String, MatchedExpression >(
env, r.annotationField,
new StringPatternExtractRule(pattern,
new StringMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup)));
r.filterRule = new AnnotationMatchedFilter(valueExtractor);
}
protected AnnotationExtractRule create(Env env, String expr, Expression result)
{
AnnotationExtractRule r = super.create(env, null);
if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); }
r.ruleType = TEXT_PATTERN_RULE_TYPE;
updateExtractRule(r, env, expr, null, result);
return r;
}
public AnnotationExtractRule create(Env env, Map attributes) {
AnnotationExtractRule r = super.create(env, attributes);
if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); }
if (r.ruleType == null) { r.ruleType = TEXT_PATTERN_RULE_TYPE; }
String expr = (String) Expressions.asObject(env, attributes.get("pattern"));
Expression action = Expressions.asExpression(env, attributes.get("action"));
Expression result = Expressions.asExpression(env, attributes.get("result"));
updateExtractRule(r, env, expr, action, result);
return r;
}
}
/**
 * Filter over matched expressions backed by a single-annotation extractor.
 * <p>
 * Note that {@link #test} has a side effect: when the extractor yields a non-null
 * inner value, the matched expression is annotated before {@code false} is returned.
 */
public static class AnnotationMatchedFilter implements Predicate<MatchedExpression>, Serializable {
  MatchedExpression.SingleAnnotationExtractor extractor;

  public AnnotationMatchedFilter(MatchedExpression.SingleAnnotationExtractor extractor) {
    this.extractor = extractor;
  }

  /**
   * Applies the extractor to the annotation of the matched expression.
   *
   * @param me Matched expression to test
   * @return {@code true} only if the extractor returns a value whose content is null;
   *         {@code false} if the extractor returns no value, or returns a value with
   *         non-null content (in which case {@code me} is annotated first)
   */
  @Override
  public boolean test(MatchedExpression me) {
    CoreMap cm = me.getAnnotation();
    Value v = extractor.apply(cm);
    if (v == null) {
      return false;
    }
    if (v.get() == null) {
      return true;
    }
    extractor.annotate(me);
    return false;
  }
}
/**
 * Function that turns a string {@link MatchResult} into a {@code Value} by first evaluating
 * an optional action expression and then an optional result expression against the match.
 */
public static class StringMatchResultExtractor implements Function<MatchResult, Value> {
  Env env;
  Expression action;
  Expression result;

  /**
   * @param env Environment the expressions are evaluated in
   * @param action Expression evaluated for its side effects (may be null)
   * @param result Expression whose value is returned (may be null)
   */
  public StringMatchResultExtractor(Env env, Expression action, Expression result) {
    this.env = env;
    this.action = action;
    this.result = result;
  }

  /** Creates an extractor without an action. */
  public StringMatchResultExtractor(Env env, Expression result) {
    this(env, null, result);
  }

  /**
   * @param matchResult Match to evaluate the expressions against
   * @return the value of the result expression, or null if there is no result expression
   */
  @Override
  public Value apply(MatchResult matchResult) {
    if (action != null) {
      action.evaluate(env, matchResult);
    }
    return (result != null) ? result.evaluate(env, matchResult) : null;
  }
}
public static class SequenceMatchResultExtractor implements Function,Value> {
Env env;
Expression action;
Expression result;
public SequenceMatchResultExtractor(Env env, Expression action, Expression result) {
this.env = env;
this.action = action;
this.result = result;
}
public SequenceMatchResultExtractor(Env env, Expression result) {
this.env = env;
this.result = result;
}
public Value apply(SequenceMatchResult matchResult) {
Value v = null;
if (action != null) {
action.evaluate(env, matchResult);
}
if (result != null) {
v = result.evaluate(env, matchResult);
}
return v;
}
}
/**
* Interface for a rule that extracts a list of matched items from a input
* @param
* @param
*/
public static interface ExtractRule {
public boolean extract(I in, List out);
}
/**
* Extraction rule that filters the input before passing it on to the next extractor
* @param
* @param
*/
public static class FilterExtractRule implements ExtractRule
{
Predicate filter;
ExtractRule rule;
public FilterExtractRule(Predicate filter, ExtractRule rule) {
this.filter = filter;
this.rule = rule;
}
public FilterExtractRule(Predicate filter, ExtractRule... rules) {
this.filter = filter;
this.rule = new ListExtractRule(rules);
}
public boolean extract(I in, List out) {
if (filter.test(in)) {
return rule.extract(in,out);
} else {
return false;
}
}
}
/**
* Extraction rule that applies a list of rules in sequence and aggregates
* all matches found
* @param
* @param
*/
public static class ListExtractRule implements ExtractRule
{
List> rules;
public ListExtractRule(Collection> rules)
{
this.rules = new ArrayList>(rules);
}
public ListExtractRule(ExtractRule... rules)
{
this.rules = new ArrayList>(rules.length);
for (ExtractRule rule:rules) {
this.rules.add(rule);
}
}
public boolean extract(I in, List out) {
boolean extracted = false;
for (ExtractRule rule:rules) {
if (rule.extract(in,out)) {
extracted = true;
}
}
return extracted;
}
public void addRules(ExtractRule... rules)
{
for (ExtractRule rule:rules) {
this.rules.add(rule);
}
}
public void addRules(Collection> rules)
{
this.rules.addAll(rules);
}
}
/**
* Extraction rule to apply a extraction rule on a particular CoreMap field
* @param
* @param
*/
public static class CoreMapExtractRule implements ExtractRule
{
Env env;
Class annotationField;
ExtractRule extractRule;
public CoreMapExtractRule(Env env, Class annotationField, ExtractRule extractRule) {
this.annotationField = annotationField;
this.extractRule = extractRule;
this.env = env;
}
public boolean extract(CoreMap cm, List out) {
env.push(Expressions.VAR_SELF, cm);
T field = (T) cm.get(annotationField);
boolean res = extractRule.extract(field, out);
env.pop(Expressions.VAR_SELF);
return res;
}
}
public static class CoreMapToListExtractRule implements ExtractRule
{
ExtractRule,O> extractRule;
public CoreMapToListExtractRule(ExtractRule,O> extractRule) {
this.extractRule = extractRule;
}
public boolean extract(CoreMap cm, List out) {
return extractRule.extract(Arrays.asList(cm), out);
}
}
public static class BasicSequenceExtractRule implements ExtractRule< List extends CoreMap>, MatchedExpression>
{
MatchedExpression.SingleAnnotationExtractor extractor;
public BasicSequenceExtractRule(MatchedExpression.SingleAnnotationExtractor extractor) {
this.extractor = extractor;
}
public boolean extract(List extends CoreMap> seq, List out) {
boolean extracted = false;
for (int i = 0; i < seq.size(); i++) {
CoreMap t = seq.get(i);
Value v = extractor.apply(t);
if (v != null) {
MatchedExpression te = extractor.createMatchedExpression(Interval.toInterval(i, i + 1, Interval.INTERVAL_OPEN_END), null);
out.add(te);
extracted = true;
}
}
return extracted;
}
}
public static class SequencePatternExtractRule implements ExtractRule< List extends T>, O>, Function, O>
{
SequencePattern pattern;
Function, O> extractor;
SequenceMatcher.FindType findType = null;
boolean matchWithResult = false;
public SequencePatternExtractRule(Env env, String regex, Function, O> extractor) {
this.extractor = extractor;
this.pattern = SequencePattern.compile(env, regex);
}
public SequencePatternExtractRule(SequencePattern p, Function, O> extractor) {
this.extractor = extractor;
this.pattern = p;
}
public SequencePatternExtractRule(SequencePattern p, Function, O> extractor,
SequenceMatcher.FindType findType, boolean matchWithResult) {
this.extractor = extractor;
this.pattern = p;
this.findType = findType;
this.matchWithResult = matchWithResult;
}
public boolean extract(List extends T> seq, List out) {
if (seq == null) return false;
boolean extracted = false;
SequenceMatcher m = pattern.getMatcher(seq);
if (findType != null) {
m.setFindType(findType);
}
m.setMatchWithResult(matchWithResult);
while (m.find()) {
out.add(extractor.apply(m));
extracted = true;
}
return extracted;
}
public O apply(List extends T> seq) {
if (seq == null) return null;
SequenceMatcher m = pattern.getMatcher(seq);
m.setMatchWithResult(matchWithResult);
if (m.matches()) {
return extractor.apply(m);
} else {
return null;
}
}
}
public static class StringPatternExtractRule implements ExtractRule, Function
{
Pattern pattern;
Function extractor;
public StringPatternExtractRule(Pattern pattern, Function extractor) {
this.pattern = pattern;
this.extractor = extractor;
}
public StringPatternExtractRule(Env env, String regex, Function extractor) {
this(env, regex, extractor, false);
}
public StringPatternExtractRule(String regex, Function extractor) {
this(null, regex, extractor, false);
}
public StringPatternExtractRule(Env env, String regex, Function extractor,
boolean addWordBoundaries) {
this.extractor = extractor;
if (addWordBoundaries) { regex = "\\b" + regex + "\\b"; }
if (env != null) {
pattern = env.getStringPattern(regex);
} else {
pattern = Pattern.compile(regex);
}
}
public boolean extract(String str, List out) {
if (str == null) return false;
boolean extracted = false;
Matcher m = pattern.matcher(str);
while (m.find()) {
out.add(extractor.apply( m ));
extracted = true;
}
return extracted;
}
public O apply(String str) {
if (str == null) return null;
Matcher m = pattern.matcher(str);
if (m.matches()) {
return extractor.apply(m);
} else {
return null;
}
}
}
/**
 * Function that creates a matched expression from a string match, using the
 * start and end offsets of a chosen capture group.
 */
public static class StringMatchedExpressionExtractor implements Function<MatchResult, MatchedExpression> {
  MatchedExpression.SingleAnnotationExtractor extractor;
  int group = 0;

  /**
   * @param extractor Extractor used to create the matched expression
   * @param group Capture group whose span becomes the span of the matched expression
   */
  public StringMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
    this.extractor = extractor;
    this.group = group;
  }

  /**
   * @param matched String match
   * @return matched expression created with the group's [start, end) interval as first argument
   */
  @Override
  public MatchedExpression apply(MatchResult matched) {
    return extractor.createMatchedExpression(
        Interval.toInterval(matched.start(group), matched.end(group), Interval.INTERVAL_OPEN_END), null);
  }
}
public static class SequenceMatchedExpressionExtractor implements Function, MatchedExpression>
{
MatchedExpression.SingleAnnotationExtractor extractor;
int group = 0;
public SequenceMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
this.extractor = extractor;
this.group = group;
}
public MatchedExpression apply(SequenceMatchResult matched) {
MatchedExpression te = extractor.createMatchedExpression(null, Interval.toInterval(matched.start(group), matched.end(group), Interval.INTERVAL_OPEN_END));
return te;
}
}
/**
 * Function that reads an annotation field from a CoreMap and applies another function to it.
 * While the wrapped function runs, the CoreMap is bound to {@code Expressions.VAR_SELF}
 * in the environment (if there is one).
 *
 * @param <T> type of the annotation field
 * @param <O> output type
 */
public static class CoreMapFunctionApplier<T, O> implements Function<CoreMap, O> {
  Env env;
  Class annotationField;
  Function<T, O> func;

  /**
   * @param env Environment (may be null)
   * @param annotationField Annotation key whose value is handed to {@code func}
   * @param func Function applied to the field value
   * @throws IllegalArgumentException if {@code annotationField} is null
   */
  public CoreMapFunctionApplier(Env env, Class annotationField, Function<T, O> func) {
    if (annotationField == null) {
      throw new IllegalArgumentException("Annotation field cannot be null");
    }
    this.annotationField = annotationField;
    this.func = func;
    this.env = env;
  }

  /**
   * @param cm CoreMap to read the annotation field from
   * @return the result of applying the wrapped function to the field value
   */
  @Override
  public O apply(CoreMap cm) {
    if (env != null) { env.push(Expressions.VAR_SELF, cm); }
    try {
      @SuppressWarnings("unchecked")
      T field = (T) cm.get(annotationField);
      return func.apply(field);
    } finally {
      // Always undo the binding, even if the wrapped function throws.
      if (env != null) { env.pop(Expressions.VAR_SELF); }
    }
  }
}
public static class CoreMapToListFunctionApplier implements Function
{
Env env;
Function,O> func;
public CoreMapToListFunctionApplier(Env env, Function,O> func) {
this.func = func;
this.env = env;
}
public O apply(CoreMap cm) {
if (env != null) { env.push(Expressions.VAR_SELF, cm); }
O res = func.apply(Arrays.asList(cm));
if (env != null) { env.pop(Expressions.VAR_SELF); }
return res;
}
}
}