All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.ling.tokensregex.SequenceMatchRules Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.ling.tokensregex.types.AssignableExpression;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;

import java.io.Serializable;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Rules for matching sequences using regular expressions.
 * 

 * <p>
 * There are 2 types of rules:
 * </p>
 * <ol>
 *   <li>Assignment rules, which assign a value to a variable for later use.</li>
 *   <li>Extraction rules, which specify how regular expression patterns are to be matched against text,
 *       which matched text expressions are to be extracted, and what value to assign to the matched expression.</li>
 * </ol>
 *
 * <p>
 * NOTE: {@code #} or {@code //} can be used to indicate one-line comments.
 * </p>

* Assignment Rules are used to assign values to variables. * The basic format is: {@code variable = value}. *

* Variable Names: *

    *
  • Variable names should follow the pattern [A-Za-z_][A-Za-z0-9_]*
  • *
  • Variable names for use in regular expressions (to be expanded later) must start with {@code $}
  • *
*

* Value Types: *

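 * <table>
 *   <tr><th>Type</th><th>Format</th><th>Example</th><th>Description</th></tr>
 *   <tr><td>{@code BOOLEAN}</td><td>{@code TRUE | FALSE}</td><td>{@code TRUE}</td><td></td></tr>
 *   <tr><td>{@code STRING}</td><td>{@code "..."}</td><td>{@code "red"}</td><td></td></tr>
 *   <tr><td>{@code INTEGER}</td><td>{@code [+-]\d+}</td><td>{@code 1500}</td><td></td></tr>
 *   <tr><td>{@code LONG}</td><td>{@code [+-]\d+L}</td><td>{@code 1500000000000L}</td><td></td></tr>
 *   <tr><td>{@code DOUBLE}</td><td>{@code [+-]\d*\.\d+}</td><td>{@code 6.98}</td><td></td></tr>
 *   <tr><td>{@code REGEX}</td><td>{@code /.../}</td><td>{@code /[Aa]pril/}</td><td>String regular expression {@link Pattern}</td></tr>
 *   <tr><td>{@code TOKENS_REGEX}</td><td>{@code ( [...] [...] ... ) }</td><td>{@code ( /up/ /to/ /4/ /months/ )}</td><td>Tokens regular expression {@link TokenSequencePattern}</td></tr>
 *   <tr><td>{@code LIST}</td><td>{@code ( [item1] , [item2], ... )}</td><td>{@code ("red", "blue", "yellow" )}</td><td></td></tr>
 * </table>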

* Some typical uses and examples for assignment rules include: *

    *
  1. Assignment of value to variables for use in later rules
  2. *
  3. Binding of text key to annotation key (as {@code Class}). *
     *      tokens = { type: "CLASS", value: "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation" }
     *    
    *
  4. *
  5. Defining regular expressions macros to be embedded in other regular expressions *
     *      $SEASON = "/spring|summer|fall|autumn|winter/"
     *      $NUM = ( [ { numcomptype:NUMBER } ] )
     *    
    *
  6. *
  7. Setting default environment variables. * Rules are applied with respect to an environment ({@link Env}), which can be accessed using the variable {@code ENV}. * Members of the Environment can be set as needed. *
     *      # Set default parameters to be used when reading rules
     *      ENV.defaults["ruleType"] = "tokens"
     *      # Set default string pattern flags (to case-insensitive)
     *      ENV.defaultStringPatternFlags = 2
     *      # Specifies that the result should go into the {@code tokens}  key (as defined above).
     *      ENV.defaultResultAnnotationKey = tokens
     *    
    *
  8. *
  9. Defining options
  10. *
*

* Predefined values are: *

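 * <table>
 *   <tr><th>Variable</th><th>Type</th><th>Description</th></tr>
 *   <tr><td>{@code ENV}</td><td>{@link Env}</td><td>The environment with respect to which the rules are applied.</td></tr>
 *   <tr><td>{@code TRUE}</td><td>{@code BOOLEAN}</td><td>The {@code Boolean} value {@code true}.</td></tr>
 *   <tr><td>{@code FALSE}</td><td>{@code BOOLEAN}</td><td>The {@code Boolean} value {@code false}.</td></tr>
 *   <tr><td>{@code NIL}</td><td></td><td>The {@code null} value.</td></tr>
 *   <tr><td>{@code tags}</td><td>{@code Class}</td><td>The annotation key {@link edu.stanford.nlp.ling.tokensregex.types.Tags.TagsAnnotation}.</td></tr>
 * </table>
 *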

 * <p>
 * Extraction rules specify how regular expression patterns are to be matched against text.
 * See {@link CoreMapExpressionExtractor} for more information on the types of the rules, and in what sequence the rules are applied.
 * A basic rule can be specified using the following template:
 * </p>

 *   {
 *     # Type of the rule
 *     ruleType: "tokens" | "text" | "composite" | "filter",
 *     # Pattern to match against
 *     pattern: ( <TokenSequencePattern> ) | /<TextPattern>/,
 *     # Resulting value to go into the resulting annotation
 *     result: ...
 *
 *     # More fields following...
 *   }
 * 
* Example: *
 *   {
 *     ruleType: "tokens",
 *     pattern: ( /one/ ),
 *     result: 1
 *   }
 * 
* *

* Extraction rule fields (most fields are optional): *

 * <table>
 *   <tr><th>Field</th><th>Values</th><th>Example</th><th>Description</th></tr>
 *   <tr><td>{@code ruleType}</td><td>{@code "tokens" | "text" | "composite" | "filter" }</td><td>{@code tokens}</td><td>Type of the rule (required).</td></tr>
 *   <tr><td>{@code pattern}</td><td>{@code = (...) | = /.../}</td><td>{@code ( /winter/ /of/ $YEAR )}</td><td>Pattern to match against.
 *       See {@link TokenSequencePattern} and {@link Pattern} for
 *       how to specify patterns over tokens and strings (required).</td></tr>
 *   <tr><td>{@code action}</td><td>{@code = (...)}</td><td>{@code ( Annotate($0, ner, "DATE") )}</td><td>List of actions to apply when the pattern is triggered.
 *       Each action is a {@link Expressions TokensRegex Expression}.</td></tr>
 *   <tr><td>{@code result}</td><td></td><td></td><td>Resulting value to go into the resulting annotation. See {@link Expressions} for how to specify the result.</td></tr>
 *   <tr><td>{@code name}</td><td>{@code STRING}</td><td></td><td>Name to identify the extraction rule.</td></tr>
 *   <tr><td>{@code stage}</td><td>{@code INTEGER}</td><td></td><td>Stage at which the rule is to be applied. Rules are grouped in stages, which are applied from lowest to highest.</td></tr>
 *   <tr><td>{@code active}</td><td>{@code Boolean}</td><td></td><td>Whether this rule is enabled (active) or not (default true).</td></tr>
 *   <tr><td>{@code priority}</td><td>{@code DOUBLE}</td><td></td><td>Priority of rule. Within a stage, matches from higher priority rules are preferred.</td></tr>
 *   <tr><td>{@code weight}</td><td>{@code DOUBLE}</td><td></td><td>Weight of rule (not currently used).</td></tr>
 *   <tr><td>{@code over}</td><td>{@code CLASS}</td><td></td><td>Annotation field to check pattern against.</td></tr>
 *   <tr><td>{@code matchFindType}</td><td>{@code FIND_NONOVERLAPPING | FIND_ALL}</td><td></td><td>Whether to find all matched expressions or just the nonoverlapping ones (default {@code FIND_NONOVERLAPPING}).</td></tr>
 *   <tr><td>{@code matchWithResults}</td><td>{@code Boolean}</td><td></td><td>Whether results of the matches should be returned (default false).
 *       Set to true to access captured groups of embedded regular expressions.</td></tr>
 *   <tr><td>{@code matchedExpressionGroup}</td><td>{@code Integer}</td><td>{@code 2}</td><td>What group should be treated as the matched expression group (default 0).</td></tr>
 * </table>
* * @author Angel Chang * @see CoreMapExpressionExtractor * @see TokenSequencePattern */ public class SequenceMatchRules { private SequenceMatchRules() { } // static class with inner classes /** A sequence match rule. */ public interface Rule { } /** * Rule that specifies what value to assign to a variable. */ public static class AssignmentRule implements Rule { final Expression expr; public AssignmentRule(AssignableExpression varExpr, Expression value) { expr = varExpr.assign(value); } public void evaluate(Env env) { expr.evaluate(env); } } /** * Rule that specifies how to extract sequence of MatchedExpression from an annotation (CoreMap). * * @param Output type (MatchedExpression) */ public static class AnnotationExtractRule implements Rule, ExtractRule, Predicate, Serializable { private static final long serialVersionUID = -2148125332223720424L; /** Name of the rule */ public String name; /** Stage in which this rule should be applied with respect to others */ public int stage = 1; /** Priority in which this rule should be applied with respect to others */ public double priority; /** Weight given to the rule (how likely is this rule to fire) */ public double weight; /** Annotation field to apply rule over: text or tokens or numerizedtokens */ public Class annotationField; public Class tokensAnnotationField; /** Annotation field(s) on individual tokens to put new annotation */ public List tokensResultAnnotationField; /** Annotation field(s) to put new annotation */ public List resultAnnotationField; /** Annotation field for child/nested annotations */ public Class resultNestedAnnotationField; public SequenceMatcher.FindType matchFindType; /** Which group to take as the matched expression - default is 0 */ public int matchedExpressionGroup; public boolean matchWithResults; // TODO: Combine ruleType and isComposite /** Type of rule to apply: token string match, pattern string match */ public String ruleType; public boolean isComposite; public boolean includeNested 
= true; // TODO: Get parameter from somewhere.... public boolean active = true; /** Actual rule performing the extraction (converting annotation to MatchedExpression) */ public ExtractRule extractRule; public Predicate filterRule; /** Pattern - the type of which is dependent on the rule type */ public Object pattern; public Expression result; public void update(Env env, Map attributes) { for (Map.Entry stringObjectEntry : attributes.entrySet()) { String key = stringObjectEntry.getKey(); Object obj = stringObjectEntry.getValue(); switch (key) { case "name": name = (String) Expressions.asObject(env, obj); break; case "priority": priority = ((Number) Expressions.asObject(env, obj)).doubleValue(); break; case "stage": stage = ((Number) Expressions.asObject(env, obj)).intValue(); break; case "weight": weight = ((Number) Expressions.asObject(env, obj)).doubleValue(); break; case "over": Object annoKey = Expressions.asObject(env, obj); if (annoKey instanceof Class) { annotationField = (Class) annoKey; } else if (annoKey instanceof String) { annotationField = EnvLookup.lookupAnnotationKeyWithClassname(env, (String) annoKey); } else if (annotationField == null) { annotationField = CoreMap.class; } else { throw new IllegalArgumentException("Invalid annotation key " + annoKey); } break; case "active": active = (Boolean) Expressions.asObject(env, obj); break; case "ruleType": ruleType = (String) Expressions.asObject(env, obj); break; case "matchFindType": matchFindType = SequenceMatcher.FindType.valueOf((String) Expressions.asObject(env, obj)); break; case "matchWithResults": matchWithResults = ((Boolean) Expressions.asObject(env, obj)).booleanValue(); break; case "matchedExpressionGroup": matchedExpressionGroup = ((Number) Expressions.asObject(env, obj)).intValue(); break; } } } @Override public boolean extract(S in, List out) { return extractRule.extract(in, out); } @Override public boolean test(T obj) { return filterRule.test(obj); } public boolean 
isMostlyCompatible(AnnotationExtractRule aer) { // TODO: Check tokensResultAnnotationField, resultAnnotationField, resultNestedAnnotationField? return (stage == aer.stage && Objects.equals(annotationField, aer.annotationField) && Objects.equals(tokensAnnotationField, aer.tokensAnnotationField) && matchedExpressionGroup == 0 && aer.matchedExpressionGroup == 0 && matchWithResults == aer.matchWithResults && Objects.equals(ruleType, aer.ruleType) && isComposite == aer.isComposite && active == aer.active && Objects.equals(result, aer.result)); } public boolean hasTokensRegexPattern() { return pattern != null && pattern instanceof TokenSequencePattern; } public String toString() { return getClass().getSimpleName() + '[' + pattern.toString() + ']'; } } // end static class AnnotationExtractRule public static AssignmentRule createAssignmentRule(Env env, AssignableExpression var, Expression result) { AssignmentRule ar = new AssignmentRule(var, result); ar.evaluate(env); return ar; } public static Rule createRule(Env env, Expressions.CompositeValue cv) { Map attributes; cv = cv.simplifyNoTypeConversion(env); attributes = new HashMap<>();//Generics.newHashMap(); for (String s:cv.getAttributes()) { attributes.put(s, cv.getExpression(s)); } return createExtractionRule(env, attributes); } protected static AnnotationExtractRule createExtractionRule(Env env, Map attributes) { String ruleType = (String) Expressions.asObject(env, attributes.get("ruleType")); if (ruleType == null && env != null) { ruleType = (String) env.getDefaults().get("ruleType"); } AnnotationExtractRuleCreator ruleCreator = lookupExtractRuleCreator(env, ruleType); if (ruleCreator != null) { return ruleCreator.create(env, attributes); } else { throw new IllegalArgumentException("Unknown rule type: " + ruleType); } } public static AnnotationExtractRule createExtractionRule(Env env, String ruleType, Object pattern, Expression result) { if (ruleType == null && env != null) { ruleType = (String) 
env.getDefaults().get("ruleType"); } AnnotationExtractRuleCreator ruleCreator = lookupExtractRuleCreator(env, ruleType); if (ruleCreator != null) { Map attributes = new HashMap<>();//Generics.newHashMap(); attributes.put("ruleType", ruleType); attributes.put("pattern", pattern); attributes.put("result", result); return ruleCreator.create(env, attributes); } else { throw new IllegalArgumentException("Unknown rule type: " + ruleType); } } public static final String COMPOSITE_RULE_TYPE = "composite"; public static final String TOKEN_PATTERN_RULE_TYPE = "tokens"; public static final String TEXT_PATTERN_RULE_TYPE = "text"; public static final String FILTER_RULE_TYPE = "filter"; public static final TokenPatternExtractRuleCreator TOKEN_PATTERN_EXTRACT_RULE_CREATOR = new TokenPatternExtractRuleCreator(); public static final CompositeExtractRuleCreator COMPOSITE_EXTRACT_RULE_CREATOR = new CompositeExtractRuleCreator(); public static final TextPatternExtractRuleCreator TEXT_PATTERN_EXTRACT_RULE_CREATOR = new TextPatternExtractRuleCreator(); public static final MultiTokenPatternExtractRuleCreator MULTI_TOKEN_PATTERN_EXTRACT_RULE_CREATOR = new MultiTokenPatternExtractRuleCreator(); public static final AnnotationExtractRuleCreator DEFAULT_EXTRACT_RULE_CREATOR = TOKEN_PATTERN_EXTRACT_RULE_CREATOR; private static final Map registeredRuleTypes = new HashMap<>();//Generics.newHashMap(); static { registeredRuleTypes.put(TOKEN_PATTERN_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR); registeredRuleTypes.put(COMPOSITE_RULE_TYPE, COMPOSITE_EXTRACT_RULE_CREATOR); registeredRuleTypes.put(TEXT_PATTERN_RULE_TYPE, TEXT_PATTERN_EXTRACT_RULE_CREATOR); registeredRuleTypes.put(FILTER_RULE_TYPE, TOKEN_PATTERN_EXTRACT_RULE_CREATOR); } private static AnnotationExtractRuleCreator lookupExtractRuleCreator(Env env, String ruleType) { if (env != null) { Object obj = env.get(ruleType); if (obj != null && obj instanceof AnnotationExtractRuleCreator) { return (AnnotationExtractRuleCreator) obj; } } if 
(ruleType == null) { return DEFAULT_EXTRACT_RULE_CREATOR; } else { return registeredRuleTypes.get(ruleType); } } public static AnnotationExtractRule createTokenPatternRule(Env env, SequencePattern.PatternExpr expr, Expression result) { return TOKEN_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result); } public static AnnotationExtractRule createTextPatternRule(Env env, String expr, Expression result) { return TEXT_PATTERN_EXTRACT_RULE_CREATOR.create(env, expr, result); } public static AnnotationExtractRule createMultiTokenPatternRule(Env env, AnnotationExtractRule template, List patterns) { return MULTI_TOKEN_PATTERN_EXTRACT_RULE_CREATOR.create(env, template, patterns); } public static class AnnotationExtractRuleCreator { public AnnotationExtractRule create(Env env) { AnnotationExtractRule r = new AnnotationExtractRule(); r.resultAnnotationField = EnvLookup.getDefaultResultAnnotationKey(env); r.resultNestedAnnotationField = EnvLookup.getDefaultNestedResultsAnnotationKey(env); r.tokensAnnotationField = EnvLookup.getDefaultTokensAnnotationKey(env); r.tokensResultAnnotationField = EnvLookup.getDefaultTokensResultAnnotationKey(env); if (env != null) { r.update(env, env.getDefaults()); } return r; } public AnnotationExtractRule create(Env env, Map attributes) { // Get default annotation extract rule from env AnnotationExtractRule r = create(env); if (attributes != null) { r.update(env, attributes); } return r; } } public static MatchedExpression.SingleAnnotationExtractor createAnnotationExtractor(Env env, AnnotationExtractRule r) { MatchedExpression.SingleAnnotationExtractor extractor = new MatchedExpression.SingleAnnotationExtractor(); extractor.name = r.name; extractor.tokensAnnotationField = r.tokensAnnotationField; extractor.tokensResultAnnotationField = r.tokensResultAnnotationField; extractor.resultAnnotationField = r.resultAnnotationField; extractor.resultNestedAnnotationField = r.resultNestedAnnotationField; extractor.priority = r.priority; extractor.weight = 
r.weight; extractor.includeNested = r.includeNested; extractor.resultAnnotationExtractor = EnvLookup.getDefaultResultAnnotationExtractor(env); extractor.tokensAggregator = EnvLookup.getDefaultTokensAggregator(env); return extractor; } public static class CompositeExtractRuleCreator extends AnnotationExtractRuleCreator { protected static void updateExtractRule(AnnotationExtractRule r, Env env, SequencePattern.PatternExpr expr, Expression action, Expression result) { TokenSequencePattern pattern = TokenSequencePattern.compile(expr); updateExtractRule(r, env, pattern, action, result); } protected static void updateExtractRule(AnnotationExtractRule r, Env env, TokenSequencePattern pattern, Expression action, Expression result) { MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); SequencePatternExtractRule valueExtractRule = new SequencePatternExtractRule<>(pattern, valueExtractor, r.matchFindType, r.matchWithResults); SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); SequencePatternExtractRule exprExtractRule = new SequencePatternExtractRule<>(pattern, exprExtractor, r.matchFindType, r.matchWithResults); annotationExtractor.expressionToValue = matched -> { if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { return valueExtractor.apply( (SequenceMatchResult) matched.context); } else return null; }; annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); r.extractRule = exprExtractRule; r.filterRule = new AnnotationMatchedFilter(annotationExtractor); r.pattern = pattern; r.result = result; pattern.weight = r.weight; pattern.priority = r.priority; } protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, 
Expression result) { AnnotationExtractRule r = super.create(env, null); r.isComposite = true; if (r.annotationField == null) { r.annotationField = r.resultNestedAnnotationField; } if (r.annotationField == null) { throw new IllegalArgumentException("Error creating composite rule: no annotation field"); } r.ruleType = TOKEN_PATTERN_RULE_TYPE; updateExtractRule(r, env, expr, null, result); return r; } @Override public AnnotationExtractRule create(Env env, Map attributes) { AnnotationExtractRule r = super.create(env, attributes); r.isComposite = true; if (r.annotationField == null) { r.annotationField = r.resultNestedAnnotationField; } if (r.annotationField == null) { throw new IllegalArgumentException("Error creating composite rule: no annotation field"); } if (r.ruleType == null) { r.ruleType = TOKEN_PATTERN_RULE_TYPE; } //SequencePattern.PatternExpr expr = (SequencePattern.PatternExpr) attributes.get("pattern"); TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, attributes.get("pattern")); Expression action = Expressions.asExpression(env, attributes.get("action")); Expression result = Expressions.asExpression(env, attributes.get("result")); updateExtractRule(r, env, expr, action, result); return r; } } public static class TokenPatternExtractRuleCreator extends AnnotationExtractRuleCreator { protected static void updateExtractRule(AnnotationExtractRule r, Env env, SequencePattern.PatternExpr expr, Expression action, Expression result) { TokenSequencePattern pattern = TokenSequencePattern.compile(expr); updateExtractRule(r, env, pattern, action, result); } protected static void updateExtractRule(AnnotationExtractRule r, Env env, TokenSequencePattern pattern, Expression action, Expression result) { MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); SequencePatternExtractRule valueExtractRule = 
new SequencePatternExtractRule<>(pattern, valueExtractor, r.matchFindType, r.matchWithResults); SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); SequencePatternExtractRule exprExtractRule = new SequencePatternExtractRule<>(pattern, exprExtractor, r.matchFindType, r.matchWithResults); annotationExtractor.expressionToValue = matched -> { if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { return valueExtractor.apply( (SequenceMatchResult) matched.context); } else return null; }; if (r.annotationField != null && r.annotationField != CoreMap.class) { annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); } else { annotationExtractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueExtractRule); r.extractRule = new CoreMapToListExtractRule<>(exprExtractRule); } r.filterRule = new AnnotationMatchedFilter(annotationExtractor); r.pattern = pattern; r.result = result; pattern.weight = r.weight; pattern.priority = r.priority; } protected AnnotationExtractRule create(Env env, SequencePattern.PatternExpr expr, Expression result) { AnnotationExtractRule r = super.create(env, null); if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; } r.ruleType = TOKEN_PATTERN_RULE_TYPE; updateExtractRule(r, env, expr, null, result); return r; } @Override public AnnotationExtractRule create(Env env, Map attributes) { AnnotationExtractRule r = super.create(env, attributes); if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; } if (r.ruleType == null) { r.ruleType = TOKEN_PATTERN_RULE_TYPE; } //SequencePattern.PatternExpr expr = (SequencePattern.PatternExpr) attributes.get("pattern"); TokenSequencePattern expr = (TokenSequencePattern) Expressions.asObject(env, 
attributes.get("pattern")); Expression action = Expressions.asExpression(env, attributes.get("action")); Expression result = Expressions.asExpression(env, attributes.get("result")); updateExtractRule(r, env, expr, action, result); return r; } } public static class MultiTokenPatternExtractRuleCreator extends AnnotationExtractRuleCreator { protected static void updateExtractRule(AnnotationExtractRule r, Env env, MultiPatternMatcher pattern, Expression action, Expression result) { MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); MultiSequencePatternExtractRule valueExtractRule = new MultiSequencePatternExtractRule<>(pattern, valueExtractor); SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); MultiSequencePatternExtractRule exprExtractRule = new MultiSequencePatternExtractRule<>(pattern, exprExtractor); annotationExtractor.expressionToValue = matched -> { if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { return valueExtractor.apply( (SequenceMatchResult) matched.context); } else return null; }; if (r.annotationField != null && r.annotationField != CoreMap.class) { annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); } else { annotationExtractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueExtractRule); r.extractRule = new CoreMapToListExtractRule<>(exprExtractRule); } r.filterRule = new AnnotationMatchedFilter(annotationExtractor); r.pattern = pattern; r.result = result; } protected static AnnotationExtractRule create(Env env, SequenceMatchRules.AnnotationExtractRule aerTemplate, List patterns) { AnnotationExtractRule r = new 
AnnotationExtractRule(); r.stage = aerTemplate.stage; r.active = aerTemplate.active; r.priority = Double.NaN; // Priority from patterns? r.weight = Double.NaN; // weight from patterns? r.annotationField = aerTemplate.annotationField; r.tokensAnnotationField = aerTemplate.tokensAnnotationField; r.tokensResultAnnotationField = aerTemplate.tokensResultAnnotationField; r.resultAnnotationField = aerTemplate.resultAnnotationField; r.resultNestedAnnotationField = aerTemplate.resultNestedAnnotationField; r.matchFindType = aerTemplate.matchFindType; r.matchedExpressionGroup = aerTemplate.matchedExpressionGroup; r.matchWithResults = aerTemplate.matchWithResults; r.ruleType = aerTemplate.ruleType; r.isComposite = aerTemplate.isComposite; r.includeNested = aerTemplate.includeNested; r.active = aerTemplate.active; r.result = aerTemplate.result; if (r.annotationField == null) { r.annotationField = r.tokensAnnotationField; } r.ruleType = TOKEN_PATTERN_RULE_TYPE; MultiPatternMatcher multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(patterns); multiPatternMatcher.setMatchWithResult(r.matchWithResults); updateExtractRule(r, env, multiPatternMatcher, null, r.result); return r; } @Override public AnnotationExtractRule create(Env env, Map attributes) { throw new UnsupportedOperationException(); } } public static class TextPatternExtractRuleCreator extends AnnotationExtractRuleCreator { protected static void updateExtractRule(AnnotationExtractRule r, Env env, String expr, Expression action, Expression result) { final MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); Pattern pattern = env.getStringPattern(expr); StringMatchResultExtractor valueExtractor = new StringMatchResultExtractor(env, action, result); StringPatternExtractRule valueExtractRule = new StringPatternExtractRule<>(pattern, valueExtractor); StringMatchedExpressionExtractor exprExtractor = new StringMatchedExpressionExtractor( annotationExtractor, 
r.matchedExpressionGroup ); StringPatternExtractRule exprExtractRule = new StringPatternExtractRule<>(pattern, exprExtractor); annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); r.filterRule = new AnnotationMatchedFilter(annotationExtractor); r.pattern = pattern; r.result = result; } protected AnnotationExtractRule create(Env env, String expr, Expression result) { AnnotationExtractRule r = super.create(env, null); if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); } r.ruleType = TEXT_PATTERN_RULE_TYPE; updateExtractRule(r, env, expr, null, result); return r; } @Override public AnnotationExtractRule create(Env env, Map attributes) { AnnotationExtractRule r = super.create(env, attributes); if (r.annotationField == null) { r.annotationField = EnvLookup.getDefaultTextAnnotationKey(env); } if (r.ruleType == null) { r.ruleType = TEXT_PATTERN_RULE_TYPE; } String expr = (String) Expressions.asObject(env, attributes.get("pattern")); Expression action = Expressions.asExpression(env, attributes.get("action")); Expression result = Expressions.asExpression(env, attributes.get("result")); updateExtractRule(r, env, expr, action, result); return r; } } public static class AnnotationMatchedFilter implements Predicate, Serializable { private static final long serialVersionUID = -2085736376364259354L; final MatchedExpression.SingleAnnotationExtractor extractor; public AnnotationMatchedFilter(MatchedExpression.SingleAnnotationExtractor extractor) { this.extractor = extractor; } @Override public boolean test(MatchedExpression me) { CoreMap cm = me.getAnnotation(); Value v = extractor.apply(cm); if (v != null) { if (v.get() == null) { return true; } else { extractor.annotate(me); return false; } //return v.get() == null; } else { return false; } } } public static class StringMatchResultExtractor 
implements Function<MatchResult, Value> {
    final Env env;
    final Expression action;
    final Expression result;

    public StringMatchResultExtractor(Env env, Expression action, Expression result) {
      this.env = env;
      this.action = action;
      this.result = result;
    }

    public StringMatchResultExtractor(Env env, Expression result) {
      this(env, null, result);
    }

    /**
     * Evaluates the optional action (its value is discarded) and then the optional
     * result expression against the match.
     *
     * @param matchResult the string match to evaluate against
     * @return the value of the result expression, or {@code null} if there is no result expression
     */
    @Override
    public Value apply(MatchResult matchResult) {
      Value v = null;
      if (action != null) {
        action.evaluate(env, matchResult);
      }
      if (result != null) {
        v = result.evaluate(env, matchResult);
      }
      return v;
    }
  }

  /**
   * Function that turns a sequence match into a {@link Value} by evaluating an optional
   * action expression followed by an optional result expression.
   *
   * @param <T> element type of the matched sequence
   */
  public static class SequenceMatchResultExtractor<T> implements Function<SequenceMatchResult<T>, Value> {
    final Env env;
    final Expression action;
    final Expression result;

    public SequenceMatchResultExtractor(Env env, Expression action, Expression result) {
      this.env = env;
      this.action = action;
      this.result = result;
    }

    public SequenceMatchResultExtractor(Env env, Expression result) {
      this(env, null, result);
    }

    /**
     * Evaluates the optional action (its value is discarded) and then the optional
     * result expression against the match.
     *
     * @param matchResult the sequence match to evaluate against
     * @return the value of the result expression, or {@code null} if there is no result expression
     */
    @Override
    public Value apply(SequenceMatchResult<T> matchResult) {
      Value v = null;
      if (action != null) {
        action.evaluate(env, matchResult);
      }
      if (result != null) {
        v = result.evaluate(env, matchResult);
      }
      return v;
    }
  }

  /**
   * Interface for a rule that extracts a list of matched items from an input.
   *
   * @param <I> input type
   * @param <O> output type
   */
  public interface ExtractRule<I, O> {
    /**
     * Runs the rule on {@code in}, appending whatever it extracts to {@code out}.
     *
     * @param in input to extract from
     * @param out list that extracted items are added to
     * @return {@code true} if something was extracted
     */
    boolean extract(I in, List<O> out);
  }

  /**
   * Extraction rule that filters the input before passing it on to the next extractor.
   *
   * @param <I> input type
   * @param <O> output type
   */
  public static class FilterExtractRule<I, O> implements ExtractRule<I, O> {
    final Predicate<I> filter;
    final ExtractRule<I, O> rule;

    public FilterExtractRule(Predicate<I> filter, ExtractRule<I, O> rule) {
      this.filter = filter;
      this.rule = rule;
    }

    /** Wraps the given rules in a {@link ListExtractRule} that is applied when the filter accepts the input. */
    @SafeVarargs
    public FilterExtractRule(Predicate<I> filter, ExtractRule<I, O>... rules) {
      this.filter = filter;
      this.rule = new ListExtractRule<>(rules);
    }

    /**
     * Delegates to the wrapped rule only when the filter accepts {@code in}.
     *
     * @return the wrapped rule's result, or {@code false} if the filter rejected the input
     */
    @Override
    public boolean extract(I in, List<O> out) {
      if (filter.test(in)) {
        return rule.extract(in, out);
      } else {
        return false;
      }
    }
  }

  /**
   * Extraction rule that applies a list of rules in sequence and aggregates
   * all matches found.
   *
   * @param <I> input type
   * @param <O> output type
   */
  public static class ListExtractRule<I, O> implements ExtractRule<I, O> {
    final List<ExtractRule<I, O>> rules;

    /** Creates a rule list holding a copy of {@code rules}. */
    public ListExtractRule(Collection<ExtractRule<I, O>> rules) {
      this.rules = new ArrayList<>(rules);
    }

    @SafeVarargs
    public ListExtractRule(ExtractRule<I, O>... rules) {
      this.rules = new ArrayList<>(rules.length);
      Collections.addAll(this.rules, rules);
    }

    /**
     * Applies every rule to {@code in}; there is no short-circuit, so all rules run
     * even after one has already extracted something.
     *
     * @return {@code true} if at least one rule extracted something
     */
    @Override
    public boolean extract(I in, List<O> out) {
      boolean extracted = false;
      for (ExtractRule<I, O> rule : rules) {
        if (rule.extract(in, out)) {
          extracted = true;
        }
      }
      return extracted;
    }

    @SafeVarargs
    public final void addRules(ExtractRule<I, O>... rules) {
      Collections.addAll(this.rules, rules);
    }

    public void addRules(Collection<ExtractRule<I, O>> rules) {
      this.rules.addAll(rules);
    }

    /**
     * Builds a printable list with one entry per rule. For an {@code AnnotationExtractRule}
     * the entry is the first non-null of its pattern, extract rule and filter rule (falling
     * back to the rule's own {@code toString()}); for any other rule it is the class name.
     *
     * @return string form of the list of rule descriptions
     */
    public String ruleList() {
      List<String> names = new ArrayList<>();
      for (ExtractRule<I, O> rule : rules) {
        if (rule instanceof AnnotationExtractRule) {
          AnnotationExtractRule aer = (AnnotationExtractRule) rule;
          String ruleString; // initialized below
          if (aer.pattern != null) {
            ruleString = aer.pattern.toString();
          } else if (aer.extractRule != null) {
            ruleString = aer.extractRule.toString();
          } else if (aer.filterRule != null) {
            ruleString = aer.filterRule.toString();
          } else {
            ruleString = aer.toString();
          }
          names.add(ruleString);
        } else {
          names.add(rule.getClass().getName());
        }
      }
      return names.toString();
    }

    @Override
    public String toString() {
      return "ListExtractRule[" + ruleList() + ']';
    }
  }

  /**
   * Extraction rule to apply a extraction rule on a particular CoreMap field.
   * Input is of type CoreMap, output is templated type O.
   * The environment is dereferenced unconditionally, so it must not be null.
   *
   * @param <T> type of the annotation field
   * @param <O> output type
   */
  public static class CoreMapExtractRule<T, O> implements ExtractRule<CoreMap, O> {
    final Env env;
    final Class annotationField;
    final ExtractRule<T, O> extractRule;

    public CoreMapExtractRule(Env env, Class annotationField, ExtractRule<T, O> extractRule) {
      this.annotationField = annotationField;
      this.extractRule = extractRule;
      this.env = env;
    }

    /**
     * Reads the annotation field from {@code cm} and runs the wrapped rule on it.
     * While the wrapped rule runs, {@code cm} is pushed on the environment under
     * {@code Expressions.VAR_SELF}; it is popped again even if the rule throws.
     */
    @Override
    public boolean extract(CoreMap cm, List<O> out) {
      env.push(Expressions.VAR_SELF, cm);
      try {
        T field = (T) cm.get(annotationField);
        return extractRule.extract(field, out);
      } finally {
        env.pop(Expressions.VAR_SELF);
      }
    }
  }

  /**
   * Extraction rule that treats a single CoreMap as a list/sequence of CoreMaps.
   * (A convenience class, for use with BasicSequenceExtractRule.)
   * Input is of type CoreMap, output is templated type O.
   *
   * @param <O> output type
   */
  public static class CoreMapToListExtractRule<O> implements ExtractRule<CoreMap, O> {
    final ExtractRule<List<? extends CoreMap>, O> extractRule;

    public CoreMapToListExtractRule(ExtractRule<List<? extends CoreMap>, O> extractRule) {
      this.extractRule = extractRule;
    }

    /** Wraps {@code cm} in a one-element list and runs the wrapped rule on that list. */
    @Override
    public boolean extract(CoreMap cm, List<O> out) {
      return extractRule.extract(Arrays.asList(cm), out);
    }
  }

  /**
   * Extraction rule.
   * Input is a list of CoreMaps, output is MatchedExpression.
   */
  public static class BasicSequenceExtractRule implements ExtractRule<List<? extends CoreMap>, MatchedExpression> {
    final MatchedExpression.SingleAnnotationExtractor extractor;

    public BasicSequenceExtractRule(MatchedExpression.SingleAnnotationExtractor extractor) {
      this.extractor = extractor;
    }

    /**
     * Applies the extractor to each element of {@code seq}; every element that yields a
     * non-null value produces one matched expression covering the interval {@code [i, i+1)}.
     *
     * @return {@code true} if at least one matched expression was added to {@code out}
     */
    @Override
    public boolean extract(List<? extends CoreMap> seq, List<MatchedExpression> out) {
      boolean extracted = false;
      for (int i = 0; i < seq.size(); i++) {
        CoreMap t = seq.get(i);
        Value v = extractor.apply(t);
        if (v != null) {
          // NOTE(review): the interval is passed as the first argument (as in
          // StringMatchedExpressionExtractor), whereas SequenceMatchedExpressionExtractor passes
          // its interval as the second argument - confirm which kind of offset is intended here.
          MatchedExpression te = extractor.createMatchedExpression(
                  Interval.toInterval(i, i + 1, Interval.INTERVAL_OPEN_END), null);
          out.add(te);
          extracted = true;
        }
      }
      return extracted;
    }
  }

  /**
   * Extraction rule backed by a single {@link SequencePattern}. As an {@link ExtractRule} it
   * collects one output per match found in the sequence; as a {@link Function} it requires
   * the pattern to match the whole sequence.
   *
   * @param <T> element type of the input sequence
   * @param <O> output type
   */
  public static class SequencePatternExtractRule<T, O>
          implements ExtractRule<List<? extends T>, O>, Function<List<? extends T>, O> {
    final SequencePattern<T> pattern;
    final Function<SequenceMatchResult<T>, O> extractor;
    final SequenceMatcher.FindType findType;
    final boolean matchWithResult;

    public SequencePatternExtractRule(Env env, String regex, Function<SequenceMatchResult<T>, O> extractor) {
      this(SequencePattern.compile(env, regex), extractor);
    }

    public SequencePatternExtractRule(SequencePattern<T> p, Function<SequenceMatchResult<T>, O> extractor) {
      this(p, extractor, null, false);
    }

    /**
     * @param p pattern to match
     * @param extractor converts each match into an output item
     * @param findType find type to set on the matcher; when {@code null} the matcher's own setting is kept
     * @param matchWithResult forwarded to the matcher's {@code setMatchWithResult}
     */
    public SequencePatternExtractRule(SequencePattern<T> p, Function<SequenceMatchResult<T>, O> extractor,
                                      SequenceMatcher.FindType findType, boolean matchWithResult) {
      this.extractor = extractor;
      this.pattern = p;
      this.findType = findType;
      this.matchWithResult = matchWithResult;
    }

    /**
     * Finds every match of the pattern in {@code seq} and adds the extractor's output for each.
     *
     * @return {@code true} if at least one match was found; {@code false} for a null sequence
     */
    @Override
    public boolean extract(List<? extends T> seq, List<O> out) {
      if (seq == null) return false;
      boolean extracted = false;
      SequenceMatcher<T> m = pattern.getMatcher(seq);
      if (findType != null) {
        m.setFindType(findType);
      }
      m.setMatchWithResult(matchWithResult);
      while (m.find()) {
        out.add(extractor.apply(m));
        extracted = true;
      }
      // System.err.println("SequencePattern " + pattern + " of type " + pattern.getClass() + " matched on " + extracted);
      return extracted;
    }

    /**
     * @return the extractor's output if the pattern matches all of {@code seq};
     *         {@code null} if it does not match or {@code seq} is null
     */
    @Override
    public O apply(List<? extends T> seq) {
      if (seq == null) return null;
      SequenceMatcher<T> m = pattern.getMatcher(seq);
      m.setMatchWithResult(matchWithResult);
      if (m.matches()) {
        return extractor.apply(m);
      } else {
        return null;
      }
    }
  } // end static class SequencePatternExtractRule

  /**
   * Extraction rule backed by a {@link MultiPatternMatcher}; matches are obtained from
   * {@code findNonOverlappingMaxScore}.
   *
   * @param <T> element type of the input sequence
   * @param <O> output type
   */
  public static class MultiSequencePatternExtractRule<T, O>
          implements ExtractRule<List<? extends T>, O>, Function<List<? extends T>, O> {
    final MultiPatternMatcher<T> matcher;
    final Function<SequenceMatchResult<T>, O> extractor;

    public MultiSequencePatternExtractRule(MultiPatternMatcher<T> matcher,
                                           Function<SequenceMatchResult<T>, O> extractor) {
      this.extractor = extractor;
      this.matcher = matcher;
    }

    /**
     * Adds the extractor's output for every match the matcher returns for {@code seq}.
     *
     * @return {@code true} if at least one match was returned; {@code false} for a null sequence
     */
    @Override
    public boolean extract(List<? extends T> seq, List<O> out) {
      if (seq == null) return false;
      boolean extracted = false;
      List<SequenceMatchResult<T>> matched = matcher.findNonOverlappingMaxScore(seq);
      for (SequenceMatchResult<T> m : matched) {
        out.add(extractor.apply(m));
        extracted = true;
      }
      return extracted;
    }

    /**
     * @return the extractor's output for the first match the matcher returns, or {@code null}
     *         if there are no matches or {@code seq} is null
     */
    @Override
    public O apply(List<? extends T> seq) {
      if (seq == null) return null;
      List<SequenceMatchResult<T>> matched = matcher.findNonOverlappingMaxScore(seq);
      if ( ! matched.isEmpty()) {
        return extractor.apply(matched.get(0));
      } else {
        return null;
      }
    }
  }

  /**
   * Extraction rule backed by a {@link Pattern} over strings. As an {@link ExtractRule} it
   * collects one output per match found in the string; as a {@link Function} it requires the
   * pattern to match the whole string.
   *
   * @param <O> output type
   */
  public static class StringPatternExtractRule<O> implements ExtractRule<String, O>, Function<String, O> {
    private final Pattern pattern;
    private final Function<MatchResult, O> extractor;

    public StringPatternExtractRule(Pattern pattern, Function<MatchResult, O> extractor) {
      this.pattern = pattern;
      this.extractor = extractor;
    }

    public StringPatternExtractRule(Env env, String regex, Function<MatchResult, O> extractor) {
      this(env, regex, extractor, false);
    }

    public StringPatternExtractRule(String regex, Function<MatchResult, O> extractor) {
      this(null, regex, extractor, false);
    }

    /**
     * @param env environment used to obtain the pattern; when {@code null} the regex is
     *            compiled directly with {@link Pattern#compile(String)}
     * @param regex regular expression to match
     * @param extractor converts each match into an output item
     * @param addWordBoundaries if true, the regex is wrapped as {@code \b(?:regex)\b}
     */
    public StringPatternExtractRule(Env env, String regex, Function<MatchResult, O> extractor,
                                    boolean addWordBoundaries) {
      this.extractor = extractor;
      if (addWordBoundaries) {
        regex = "\\b(?:" + regex + ")\\b";
      }
      if (env != null) {
        pattern = env.getStringPattern(regex);
      } else {
        pattern = Pattern.compile(regex);
      }
    }

    /**
     * Finds every match of the pattern in {@code str} and adds the extractor's output for each.
     *
     * @return {@code true} if at least one match was found; {@code false} for a null string
     */
    @Override
    public boolean extract(String str, List<O> out) {
      if (str == null) return false;
      boolean extracted = false;
      Matcher m = pattern.matcher(str);
      while (m.find()) {
        out.add(extractor.apply( m ));
        // System.err.println("StringPatternExtractRule: " + pattern + " extracted " + out.get(out.size() - 1)); // XXXX
        extracted = true;
      }
      return extracted;
    }

    /**
     * @return the extractor's output if the pattern matches all of {@code str};
     *         {@code null} if it does not match or {@code str} is null
     */
    @Override
    public O apply(String str) {
      if (str == null) return null;
      Matcher m = pattern.matcher(str);
      if (m.matches()) {
        return extractor.apply(m);
      } else {
        return null;
      }
    }
  } // end static class StringPatternExtractRule

  /**
   * Function that creates a {@link MatchedExpression} spanning one capture group of a string match.
   */
  public static class StringMatchedExpressionExtractor implements Function<MatchResult, MatchedExpression> {
    final MatchedExpression.SingleAnnotationExtractor extractor;
    final int group;

    public StringMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
      this.extractor = extractor;
      this.group = group;
    }

    /**
     * @param matched string match to read the group boundaries from
     * @return matched expression created from the interval {@code [start(group), end(group))},
     *         passed as the first argument of {@code createMatchedExpression}
     */
    @Override
    public MatchedExpression apply(MatchResult matched) {
      MatchedExpression te = extractor.createMatchedExpression(
              Interval.toInterval(matched.start(group), matched.end(group), Interval.INTERVAL_OPEN_END), null);
      return te;
    }
  }

  /**
   * Function that creates a {@link MatchedExpression} spanning one capture group of a sequence match.
   */
  public static class SequenceMatchedExpressionExtractor
          implements Function<SequenceMatchResult<CoreMap>, MatchedExpression> {
    final MatchedExpression.SingleAnnotationExtractor extractor;
    final int group;

    public SequenceMatchedExpressionExtractor(MatchedExpression.SingleAnnotationExtractor extractor, int group) {
      this.extractor = extractor;
      this.group = group;
    }

    /**
     * Creates the matched expression from the interval {@code [start(group), end(group))}, passed
     * as the second argument of {@code createMatchedExpression}. A priority or weight that is
     * still NaN afterwards is filled in from the match's priority and score respectively.
     *
     * @param matched sequence match to read the group boundaries from
     * @return the new matched expression
     */
    @Override
    public MatchedExpression apply(SequenceMatchResult<CoreMap> matched) {
      MatchedExpression te = extractor.createMatchedExpression(null,
              Interval.toInterval(matched.start(group), matched.end(group), Interval.INTERVAL_OPEN_END));
      if (Double.isNaN(te.priority)) {
        te.priority = matched.priority();
      }
      if (Double.isNaN(te.weight)) {
        te.weight = matched.score();
      }
      if (this.group != 0) {
        // Save context so value evaluation can happen
        te.context = matched.toBasicSequenceMatchResult();
      }
      return te;
    }
  }

  /**
   * Function that applies another function to one annotation field of a CoreMap.
   *
   * @param <T> type of the annotation field
   * @param <O> output type
   */
  public static class CoreMapFunctionApplier<T, O> implements Function<CoreMap, O> {
    final Env env;
    final Class annotationField;
    final Function<T, O> func;

    /**
     * @param env environment, may be {@code null}
     * @param annotationField key of the field to read from each CoreMap
     * @param func function applied to the field's value
     * @throws IllegalArgumentException if {@code annotationField} is null
     */
    public CoreMapFunctionApplier(Env env, Class annotationField, Function<T, O> func) {
      this.annotationField = annotationField;
      if (annotationField == null) {
        throw new IllegalArgumentException("Annotation field cannot be null");
      }
      this.func = func;
      this.env = env;
    }

    /**
     * Reads the annotation field from {@code cm} and applies the wrapped function to it.
     * When an environment is present, {@code cm} is pushed under {@code Expressions.VAR_SELF}
     * for the duration of the call and popped again even if the function throws.
     */
    @Override
    public O apply(CoreMap cm) {
      if (env != null) {
        env.push(Expressions.VAR_SELF, cm);
      }
      try {
        T field = (T) cm.get(annotationField);
        return func.apply(field);
      } finally {
        if (env != null) {
          env.pop(Expressions.VAR_SELF);
        }
      }
    }
  }

  /**
   * Function that applies a list-taking function to a single CoreMap by wrapping it in a
   * one-element list.
   *
   * @param <O> output type
   */
  public static class CoreMapToListFunctionApplier<O> implements Function<CoreMap, O> {
    final Env env;
    final Function<List<? extends CoreMap>, O> func;

    public CoreMapToListFunctionApplier(Env env, Function<List<? extends CoreMap>, O> func) {
      this.func = func;
      this.env = env;
    }

    /**
     * Applies the wrapped function to a singleton list holding {@code cm}.
     * When an environment is present, {@code cm} is pushed under {@code Expressions.VAR_SELF}
     * for the duration of the call and popped again even if the function throws.
     */
    @Override
    public O apply(CoreMap cm) {
      if (env != null) {
        env.push(Expressions.VAR_SELF, cm);
      }
      try {
        return func.apply(Collections.singletonList(cm));
      } finally {
        if (env != null) {
          env.pop(Expressions.VAR_SELF);
        }
      }
    }
  } // end static class CoreMapToListFunctionApplier

}