edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.parser.ParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParseException;
import edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser;
import edu.stanford.nlp.ling.tokensregex.types.Expression;
import edu.stanford.nlp.ling.tokensregex.types.Tags;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.function.Predicate;
/**
* Represents a list of assignment and extraction rules over sequence patterns.
* See {@link SequenceMatchRules} for the syntax of rules.
*
*
* Assignment rules are used to assign a value to a variable for later use in
* extraction rules or for expansions in patterns.
* Extraction rules are used to extract text/tokens matching regular expressions.
* Extraction rules are grouped into stages, with each stage consisting of the following:
*
* - Matching of rules over text and tokens. These rules are applied directly on the text and tokens fields of the {@code CoreMap}.
* - Matching of composite rules. Matched expressions are merged, and composite rules
* are applied recursively until no more changes to the matched expressions are detected.
* - Filtering of invalid expressions. In the final phase, a filtering stage removes invalid expressions.
*
* The different stages are numbered and are applied in numeric order.
*
*
* @author Angel Chang
* @see SequenceMatchRules
*/
public class CoreMapExpressionExtractor {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(CoreMapExpressionExtractor.class);
private static boolean verbose = false;
// TODO: Remove templating of MatchedExpressions (keep for now until TimeExpression rules can be decoupled)
private final Env env;
/* Keeps temporary tags created by extractor */
private boolean keepTags = false;
/* Collapses extraction rules - use with care */
private boolean collapseExtractionRules = false;
private final Class>> tokensAnnotationKey;
private final Map> stages;
/**
* Describes one stage of extraction.
* @param
*/
public static class Stage {
/** Whether to clear matched expressions from previous stages or not */
boolean clearMatched = false;
/**
* Limit the number of iterations for which the composite rules are applied
* (prevents badly formed rules from iterating forever)
*/
int limitIters = 50;
/**
* Stage id (stages are applied in numeric order from low to high)
*/
int stageId;
/** Rules to extract matched expressions directly from tokens */
SequenceMatchRules.ExtractRule basicExtractRule;
/** Rules to extract composite expressions (grouped in stages) */
SequenceMatchRules.ExtractRule, T> compositeExtractRule;
/** Filtering rule */
Predicate filterRule;
private static SequenceMatchRules.ExtractRule addRule(SequenceMatchRules.ExtractRule origRule,
SequenceMatchRules.ExtractRule rule) {
SequenceMatchRules.ListExtractRule r;
if (origRule instanceof SequenceMatchRules.ListExtractRule) {
r = (SequenceMatchRules.ListExtractRule) origRule;
} else {
r = new SequenceMatchRules.ListExtractRule<>();
if (origRule != null)
r.addRules(origRule);
}
r.addRules(rule);
return r;
}
private void addCompositeRule(SequenceMatchRules.ExtractRule, T> rule) {
compositeExtractRule = addRule(compositeExtractRule, rule);
}
private void addBasicRule(SequenceMatchRules.ExtractRule rule) {
basicExtractRule = addRule(basicExtractRule, rule);
}
private void addFilterRule(Predicate rule) {
Filters.DisjFilter r;
if (filterRule instanceof Filters.DisjFilter) {
r = (Filters.DisjFilter) filterRule;
r.addFilter(rule);
} else {
if (filterRule == null) {
r = new Filters.DisjFilter<>(rule);
} else {
r = new Filters.DisjFilter<>(filterRule, rule);
}
filterRule = r;
}
}
}
/**
 * Creates an empty instance with no rules.
 * Delegates to the Env constructor with a null environment; note that operations
 * which read environment defaults generally expect a non-null env — TODO confirm.
 */
public CoreMapExpressionExtractor() {
this(null);
}
/**
* Creates a default instance with the specified environment.
* (use the default tokens annotation key as specified in the environment)
* @param env Environment to use for binding variables and applying rules
*/
public CoreMapExpressionExtractor(Env env) {
this.stages = new HashMap<>();//Generics.newHashMap();
this.env = env;
this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
this.collapseExtractionRules = false;
if (env != null) {
this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
if (env.get("verbose") != null)
verbose = (env.get("verbose") != null) &&
Objects.equals((Boolean) env.get("verbose"), true);
}
}
/**
 * Creates an instance with the specified environment and list of rules.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param rules List of rules for this extractor (appended via appendRules)
 */
public CoreMapExpressionExtractor(Env env, List rules) {
this(env);
appendRules(rules);
}
/**
 * Add specified rules to this extractor.
 * Assignment rules are evaluated immediately against the environment; extraction
 * rules are routed into their numbered stages (created on first use).
 *
 * @param rules Rules to append (assignment or annotation extraction rules)
 */
public void appendRules(List<SequenceMatchRules.Rule> rules)
{
  if (verbose)
    log.info("Read " + rules.size() + " rules");
  // Put rules into stages
  if (collapseExtractionRules) {
    rules = collapse(rules);
    if (verbose)
      log.info("Collapsing into " + rules.size() + " rules");
  }
  for (SequenceMatchRules.Rule r : rules) {
    if (r instanceof SequenceMatchRules.AssignmentRule) {
      // Nothing to do
      // Assignments are added to environment as they are parsed
      ((SequenceMatchRules.AssignmentRule) r).evaluate(env);
    } else if (r instanceof SequenceMatchRules.AnnotationExtractRule) {
      SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) r;
      Stage stage = stages.get(aer.stage);
      if (stage == null) {
        stages.put(aer.stage, stage = new Stage<>());
        stage.stageId = aer.stage;
        // Guard against a null environment (possible via the no-arg constructor);
        // previously env.getDefaults() was dereferenced unconditionally and could NPE.
        if (env != null) {
          Boolean clearMatched = (Boolean) env.getDefaults().get("stage.clearMatched");
          if (clearMatched != null) {
            stage.clearMatched = clearMatched;
          }
          Integer limitIters = (Integer) env.getDefaults().get("stage.limitIters");
          if (limitIters != null) {
            stage.limitIters = limitIters;
          }
        }
      }
      if (aer.active) {
        if (SequenceMatchRules.FILTER_RULE_TYPE.equals(aer.ruleType)) {
          stage.addFilterRule(aer);
        } else if (aer.isComposite) {
          stage.addCompositeRule(aer);
        } else {
          stage.addBasicRule(aer);
        }
      } else {
        log.debug("Ignoring inactive rule: " + aer.name); // used to be INFO but annoyed Chris/users
      }
    }
  }
}
/** Creates a single rule matching any of the given patterns, taking everything else from {@code aerTemplate}. */
private SequenceMatchRules.AnnotationExtractRule createMergedRule(SequenceMatchRules.AnnotationExtractRule aerTemplate,
                                                                  List<TokenSequencePattern> patterns) {
  return SequenceMatchRules.createMultiTokenPatternRule(env, aerTemplate, patterns);
}
/**
 * Collapses runs of consecutive, mutually compatible TokensRegex extraction rules into
 * single multi-pattern rules (reduces the number of separate matching passes).
 *
 * @param rules Rules in their original order
 * @return New list with compatible consecutive tokensregex rules merged; other rules unchanged
 */
private List<SequenceMatchRules.Rule> collapse(List<SequenceMatchRules.Rule> rules) {
  List<SequenceMatchRules.Rule> collapsed = new ArrayList<>();
  List<TokenSequencePattern> patterns = null;
  SequenceMatchRules.AnnotationExtractRule aerTemplate = null;
  for (SequenceMatchRules.Rule rule : rules) {
    boolean ruleHandled = false;
    if (rule instanceof SequenceMatchRules.AnnotationExtractRule) {
      SequenceMatchRules.AnnotationExtractRule aer = (SequenceMatchRules.AnnotationExtractRule) rule;
      if (aer.hasTokensRegexPattern()
          && (aerTemplate == null || aerTemplate.isMostlyCompatible(aer))) {
        if (aerTemplate == null) {
          aerTemplate = aer;  // first rule of the group serves as the template
        }
        if (patterns == null) {
          patterns = new ArrayList<>();
        }
        patterns.add((TokenSequencePattern) aer.pattern);
        ruleHandled = true;
      }
    }
    // Did we handle this rule?
    if (!ruleHandled) {
      // Flush the pending group (if any) before emitting this rule unchanged.
      // NOTE(review): a tokensregex rule that is incompatible with the current template is
      // emitted standalone here rather than starting a new group — preserved as-is; confirm intended.
      if (aerTemplate != null) {
        collapsed.add(createMergedRule(aerTemplate, patterns));
        aerTemplate = null;
        patterns = null;
      }
      collapsed.add(rule);
    }
  }
  if (aerTemplate != null) {
    collapsed.add(createMergedRule(aerTemplate, patterns));
  }
  return collapsed;
}
/** Returns the environment used for binding variables and applying rules (may be null). */
public Env getEnv() {
return env;
}
public void setExtractRules(SequenceMatchRules.ExtractRule basicExtractRule,
SequenceMatchRules.ExtractRule, T> compositeExtractRule,
Predicate filterRule)
{
Stage stage = new Stage<>();
stage.basicExtractRule = basicExtractRule;
stage.compositeExtractRule = compositeExtractRule;
stage.filterRule = filterRule;
this.stages.clear();
this.stages.put(1, stage);
}
/**
 * Creates an extractor using the specified environment, reading the rules from the given filenames.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param filenames Rule files to read, applied in order
 * @return Extractor populated with the rules from all files
 * @throws RuntimeException if a file cannot be read or parsed
 */
public static CoreMapExpressionExtractor createExtractorFromFiles(Env env, String... filenames) throws RuntimeException {
return createExtractorFromFiles(env, Arrays.asList(filenames));
}
/**
 * Creates an extractor using the specified environment, reading the rules from the given filenames.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param filenames Rule files to read, applied in order
 * @return Extractor populated with the rules from all files
 * @throws RuntimeException if a file cannot be read or parsed (original exception preserved as cause)
 */
public static CoreMapExpressionExtractor createExtractorFromFiles(Env env, List<String> filenames) throws RuntimeException {
  CoreMapExpressionExtractor extractor = new CoreMapExpressionExtractor<>(env);
  for (String filename : filenames) {
    BufferedReader br = null;
    try {
      if (verbose)
        log.info("Reading TokensRegex rules from " + filename);
      br = IOUtils.readerFromString(filename);
      TokenSequenceParser parser = new TokenSequenceParser();
      parser.updateExpressionExtractor(extractor, br);
    } catch (Exception ex) {
      throw new RuntimeException("Error parsing file: " + filename, ex);
    } finally {
      // Close in finally so the reader is released even when parsing throws
      // (previously the reader leaked on the error path).
      IOUtils.closeIgnoringExceptions(br);
    }
  }
  return extractor;
}
/**
 * Creates an extractor using the specified environment, reading the rules from the given filename.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param filename Rule file to read
 * @return Extractor populated with the rules from the file
 * @throws RuntimeException if the file cannot be read or parsed
 */
public static CoreMapExpressionExtractor createExtractorFromFile(Env env, String filename) throws RuntimeException {
return createExtractorFromFiles(env, Collections.singletonList(filename));
}
/**
 * Creates an extractor using the specified environment, reading the rules from the given string.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param str Rule text to parse
 * @return Extractor populated with the parsed rules
 * @throws IOException if the rule text cannot be read
 * @throws ParseException if the rules are malformed
 * @throws TokenSequenceParseException if a token sequence pattern is invalid
 */
public static CoreMapExpressionExtractor createExtractorFromString(Env env, String str) throws IOException, ParseException, TokenSequenceParseException {
  // Parse the rule text directly into an extractor; no intermediate variable needed.
  return new TokenSequenceParser().getExpressionExtractor(env, new StringReader(str));
}
/**
 * Returns the value of the variable with the given name, evaluated in this extractor's environment.
 *
 * @param varname Name of the variable to look up
 * @return Evaluated value of the expression bound to {@code varname}
 * @throws RuntimeException if no expression is bound to the variable
 */
public Value getValue(String varname)
{
  Expression expr = (Expression) env.get(varname);
  if (expr == null) {
    // Guard clause; message grammar fixed ("Unable get" -> "Unable to get").
    throw new RuntimeException("Unable to get expression for variable " + varname);
  }
  return expr.evaluate(env);
}
/** Extracts matched expressions from {@code annotation} and appends their CoreMaps to {@code res}. */
private List<CoreMap> extractCoreMapsToList(List<CoreMap> res, CoreMap annotation) {
  List<T> exprs = extractExpressions(annotation);
  for (T expr : exprs) {
    res.add(expr.getAnnotation());
  }
  return res;
}
/**
 * Returns the list of CoreMaps for expressions that match this extractor's rules.
 *
 * @param annotation Annotation (CoreMap) holding the text/tokens to match against
 * @return List of CoreMaps, one per matched expression
 */
public List<CoreMap> extractCoreMaps(CoreMap annotation) {
  List<CoreMap> res = new ArrayList<>();
  return extractCoreMapsToList(res, annotation);
}
/**
 * Returns the list of merged tokens and original tokens: matched expressions replace the
 * token spans they cover, with unmatched tokens kept as-is.
 *
 * @param annotation Annotation (CoreMap) holding the text/tokens to match against
 * @return Tokens of the annotation with matched spans replaced by their merged CoreMaps
 */
public List<CoreMap> extractCoreMapsMergedWithTokens(CoreMap annotation) {
  List<CoreMap> res = extractCoreMaps(annotation);
  // Token offsets on matches are document-relative; shift them to be relative to this annotation.
  Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
  if (startTokenOffset == null) {
    startTokenOffset = 0;
  }
  final Integer startTokenOffsetFinal = startTokenOffset;
  List<CoreMap> merged = CollectionUtils.mergeListWithSortedMatchedPreAggregated(
      annotation.get(tokensAnnotationKey), res,
      (CoreMap in) -> Interval.toInterval(
          in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal,
          in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal)
  );
  return merged;
}
/** Flattens the given CoreMaps into their underlying tokens, using this extractor's tokens annotation key. */
public List<CoreMap> flatten(List<CoreMap> cms) {
  return flatten(cms, tokensAnnotationKey);
}
/**
 * Flattens CoreMaps: a CoreMap holding tokens under {@code key} is replaced by those tokens;
 * CoreMaps without tokens are kept as-is.
 */
private static List<CoreMap> flatten(List<CoreMap> cms, Class key) {
  List<CoreMap> res = new ArrayList<>();
  for (CoreMap cm : cms) {
    // Look the value up once (original called cm.get(key) for both the null check and the value).
    Object tokens = cm.get(key);
    if (tokens != null) {
      res.addAll((List<CoreMap>) tokens);
    } else {
      res.add(cm);
    }
  }
  return res;
}
private void cleanupTags(Collection objs, Map