All downloads are free. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.ling.tokensregex.Env Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import edu.stanford.nlp.ling.tokensregex.types.Tags;
import edu.stanford.nlp.pipeline.CoreMapAggregator;
import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator;
import java.util.function.Function;

import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.util.Pair;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Holds environment variables to be used for compiling string into a pattern.
 * Use {@link EnvLookup} to perform actual lookup (it will provide reasonable defaults).
 *
 * 

* Some of the types of variables to bind are: *

* *
    *
  • {@code SequencePattern} (compiled pattern)
  • *
  • {@code PatternExpr} (sequence pattern expression - precompiled)
  • *
  • {@code NodePattern} (pattern for matching one element)
  • *
  • {@code Class} (binding of CoreMap attribute to java Class)
  • *
*/ public class Env { /** * Parser that converts a string into a SequencePattern. * @see edu.stanford.nlp.ling.tokensregex.parser.TokenSequenceParser */ SequencePattern.Parser parser; /** * Mapping of variable names to their values */ private Map variables = new HashMap<>();//Generics.newHashMap(); /** * Mapping of per thread temporary variables to their values. */ private ThreadLocal> threadLocalVariables = new ThreadLocal<>(); /** * Mapping of variables that can be expanded in a regular expression for strings, * to their regular expressions. * The variable name must start with "$" and include only the alphanumeric characters * (it should follow the pattern {@code $[A-Za-z0-9_]+}). * Each variable is mapped to a pair, consisting of the {@code Pattern} representing * the name of the variable to be replaced, and a {@code String} representing the * regular expression (escaped) that is used to replace the name of the variable. */ private Map> stringRegexVariables = new HashMap<>(); //Generics.newHashMap(); /** * Default parameters (used when reading in rules for {@link SequenceMatchRules}. */ public Map defaults = new HashMap<>();//Generics.newHashMap(); /** * Default flags to use for string regular expressions match * @see java.util.regex.Pattern#compile(String,int) */ public int defaultStringPatternFlags = 0; /** * Default flags to use for string literal match * @see NodePattern#CASE_INSENSITIVE */ public int defaultStringMatchFlags = 0; public Class sequenceMatchResultExtractor; public Class stringMatchResultExtractor; /** * Annotation key to use to getting tokens (default is CoreAnnotations.TokensAnnotation.class) */ public Class defaultTokensAnnotationKey; /** * Annotation key to use to getting text (default is CoreAnnotations.TextAnnotation.class) */ public Class defaultTextAnnotationKey; /** * List of keys indicating the per-token annotations (default is null). 
* If specified, each token will be annotated with the extracted results from the * {@link #defaultResultsAnnotationExtractor}. * If null, then individual tokens that are matched are not annotated. */ public List defaultTokensResultAnnotationKey; /** * List of keys indicating what fields should be annotated for the aggregated CoreMap. * If specified, the aggregated CoreMap is annotated with the extracted results from the * {@link #defaultResultsAnnotationExtractor}. * If null, then the aggregated CoreMap is not annotated. */ public List defaultResultAnnotationKey; /** * Annotation key to use during composite phase for storing matched sequences and to match against. */ public Class defaultNestedResultsAnnotationKey; /** * How should the tokens be aggregated when collapsing a sequence of tokens into one CoreMap */ public Map defaultTokensAggregators; private CoreMapAggregator defaultTokensAggregator; /** * Whether we should merge and output CoreLabels or not */ public boolean aggregateToTokens; /** * How annotations are extracted from the MatchedExpression. * If the result type is a List and more than one annotation key is specified, * then the result is paired with the annotation key. * Example: If annotation key is [ner,normalized] and result is [CITY,San Francisco] * then the final CoreMap will have ner=CITY, normalized=San Francisco. * Otherwise, the result is treated as one object (all keys will be assigned that value). 
*/ Function defaultResultsAnnotationExtractor; /** * Interface for performing custom binding of values to the environment */ public interface Binder { void init(String prefix, Properties props); void bind(Env env); } public Env(SequencePattern.Parser p) { this.parser = p; } public void initDefaultBindings() { bind("FALSE", Expressions.FALSE); bind("TRUE", Expressions.TRUE); bind("NIL", Expressions.NIL); bind("ENV", this); bind("tags", Tags.TagsAnnotation.class); } public Map getDefaults() { return defaults; } public void setDefaults(Map defaults) { this.defaults = defaults; } public Map getDefaultTokensAggregators() { return defaultTokensAggregators; } public void setDefaultTokensAggregators(Map defaultTokensAggregators) { this.defaultTokensAggregators = defaultTokensAggregators; } public CoreMapAggregator getDefaultTokensAggregator() { if (defaultTokensAggregator == null && (defaultTokensAggregators != null || aggregateToTokens)) { CoreLabelTokenFactory tokenFactory = (aggregateToTokens)? new CoreLabelTokenFactory():null; Map aggregators = defaultTokensAggregators; if (aggregators == null) { aggregators = CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS; } defaultTokensAggregator = CoreMapAggregator.getAggregator(aggregators, null, tokenFactory); } return defaultTokensAggregator; } public Class getDefaultTextAnnotationKey() { return defaultTextAnnotationKey; } public void setDefaultTextAnnotationKey(Class defaultTextAnnotationKey) { this.defaultTextAnnotationKey = defaultTextAnnotationKey; } public Class getDefaultTokensAnnotationKey() { return defaultTokensAnnotationKey; } public void setDefaultTokensAnnotationKey(Class defaultTokensAnnotationKey) { this.defaultTokensAnnotationKey = defaultTokensAnnotationKey; } public List getDefaultTokensResultAnnotationKey() { return defaultTokensResultAnnotationKey; } public void setDefaultTokensResultAnnotationKey(Class... 
defaultTokensResultAnnotationKey) { this.defaultTokensResultAnnotationKey = Arrays.asList(defaultTokensResultAnnotationKey); } public void setDefaultTokensResultAnnotationKey(List defaultTokensResultAnnotationKey) { this.defaultTokensResultAnnotationKey = defaultTokensResultAnnotationKey; } public List getDefaultResultAnnotationKey() { return defaultResultAnnotationKey; } public void setDefaultResultAnnotationKey(Class... defaultResultAnnotationKey) { this.defaultResultAnnotationKey = Arrays.asList(defaultResultAnnotationKey); } public void setDefaultResultAnnotationKey(List defaultResultAnnotationKey) { this.defaultResultAnnotationKey = defaultResultAnnotationKey; } public Class getDefaultNestedResultsAnnotationKey() { return defaultNestedResultsAnnotationKey; } public void setDefaultNestedResultsAnnotationKey(Class defaultNestedResultsAnnotationKey) { this.defaultNestedResultsAnnotationKey = defaultNestedResultsAnnotationKey; } public Function getDefaultResultsAnnotationExtractor() { return defaultResultsAnnotationExtractor; } public void setDefaultResultsAnnotationExtractor(Function defaultResultsAnnotationExtractor) { this.defaultResultsAnnotationExtractor = defaultResultsAnnotationExtractor; } public Class getSequenceMatchResultExtractor() { return sequenceMatchResultExtractor; } public void setSequenceMatchResultExtractor(Class sequenceMatchResultExtractor) { this.sequenceMatchResultExtractor = sequenceMatchResultExtractor; } public Class getStringMatchResultExtractor() { return stringMatchResultExtractor; } public void setStringMatchResultExtractor(Class stringMatchResultExtractor) { this.stringMatchResultExtractor = stringMatchResultExtractor; } public Map getVariables() { return variables; } public void setVariables(Map variables) { this.variables = variables; } public void clearVariables() { this.variables.clear(); } public int getDefaultStringPatternFlags() { return defaultStringPatternFlags; } public void setDefaultStringPatternFlags(int 
defaultStringPatternFlags) { this.defaultStringPatternFlags = defaultStringPatternFlags; } public int getDefaultStringMatchFlags() { return defaultStringMatchFlags; } public void setDefaultStringMatchFlags(int defaultStringMatchFlags) { this.defaultStringMatchFlags = defaultStringMatchFlags; } private static final Pattern STRING_REGEX_VAR_NAME_PATTERN = Pattern.compile("\\$[A-Za-z0-9_]+"); public void bindStringRegex(String var, String regex) { // Enforce requirements on variable names ($alphanumeric_) if (!STRING_REGEX_VAR_NAME_PATTERN.matcher(var).matches()) { throw new IllegalArgumentException("StringRegex binding error: Invalid variable name " + var); } Pattern varPattern = Pattern.compile(Pattern.quote(var)); String replace = Matcher.quoteReplacement(regex); stringRegexVariables.put(var, new Pair<>(varPattern, replace)); } public String expandStringRegex(String regex) { // Replace all variables in regex String expanded = regex; for (Map.Entry> stringPairEntry : stringRegexVariables.entrySet()) { Pair p = stringPairEntry.getValue(); expanded = p.first().matcher(expanded).replaceAll(p.second()); } return expanded; } public Pattern getStringPattern(String regex) { String expanded = expandStringRegex(regex); return Pattern.compile(expanded, defaultStringPatternFlags); } public void bind(String name, Object obj) { if (obj != null) { variables.put(name, obj); } else { variables.remove(name); } } public void bind(String name, SequencePattern pattern) { bind(name, pattern.getPatternExpr()); } public void unbind(String name) { bind(name, null); } public NodePattern getNodePattern(String name) { Object obj = variables.get(name); if (obj != null) { if (obj instanceof SequencePattern) { SequencePattern seqPattern = (SequencePattern) obj; if (seqPattern.getPatternExpr() instanceof SequencePattern.NodePatternExpr) { return ((SequencePattern.NodePatternExpr) seqPattern.getPatternExpr()).nodePattern; } else { throw new Error("Invalid node pattern class: " + 
seqPattern.getPatternExpr().getClass() + " for variable " + name); } } else if (obj instanceof SequencePattern.NodePatternExpr) { SequencePattern.NodePatternExpr pe = (SequencePattern.NodePatternExpr) obj; return pe.nodePattern; } else if (obj instanceof NodePattern) { return (NodePattern) obj; } else if (obj instanceof String) { try { SequencePattern.NodePatternExpr pe = (SequencePattern.NodePatternExpr) parser.parseNode(this, (String) obj); return pe.nodePattern; } catch (Exception pex) { throw new RuntimeException("Error parsing " + obj + " to node pattern", pex); } } else { throw new Error("Invalid node pattern variable class: " + obj.getClass() + " for variable " + name); } } return null; } public SequencePattern.PatternExpr getSequencePatternExpr(String name, boolean copy) { Object obj = variables.get(name); if (obj != null) { if (obj instanceof SequencePattern) { SequencePattern seqPattern = (SequencePattern) obj; return seqPattern.getPatternExpr(); } else if (obj instanceof SequencePattern.PatternExpr) { SequencePattern.PatternExpr pe = (SequencePattern.PatternExpr) obj; return (copy)? 
pe.copy():pe; } else if (obj instanceof NodePattern) { return new SequencePattern.NodePatternExpr( (NodePattern) obj); } else if (obj instanceof String) { try { return parser.parseSequence(this, (String) obj); } catch (Exception pex) { throw new RuntimeException("Error parsing " + obj + " to sequence pattern", pex); } } else { throw new Error("Invalid sequence pattern variable class: " + obj.getClass()); } } return null; } public Object get(String name) { return variables.get(name); } // Functions for storing temporary thread specific variables // that are used when running tokensregex public void push(String name, Object value) { Map vars = threadLocalVariables.get(); if (vars == null) { threadLocalVariables.set(vars = new HashMap<>()); //Generics.newHashMap()); } Stack stack = (Stack) vars.get(name); if (stack == null) { vars.put(name, stack = new Stack<>()); } stack.push(value); } public Object pop(String name) { Map vars = threadLocalVariables.get(); if (vars == null) return null; Stack stack = (Stack) vars.get(name); if (stack == null || stack.isEmpty()) { return null; } else { return stack.pop(); } } public Object peek(String name) { Map vars = threadLocalVariables.get(); if (vars == null) return null; Stack stack = (Stack) vars.get(name); if (stack == null || stack.isEmpty()) { return null; } else { return stack.peek(); } } }