All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.ling.tokensregex.SequencePattern Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.util.*;

import java.util.*;
import java.util.function.Function;

/**
 * Generic Sequence Pattern for regular expressions.
 *
 * <p>
 * Similar to Java's {@link java.util.regex.Pattern} except it is for sequences over arbitrary types T instead
 * of just characters.
 * </p>
 *
 * <p>
 * A regular expression must first be compiled into
 * an instance of this class.  The resulting pattern can then be used to create
 * a {@link SequenceMatcher} object that can match arbitrary sequences of type T
 * against the regular expression.  All of the state involved in performing a match
 * resides in the matcher, so many matchers can share the same pattern.
 * </p>
 *
 * <p>
 * To support sequence matching on a new type T, the following is needed:
 * </p>
 * <ul>
 *   <li>Implement a {@link NodePattern} for matching type T</li>
 *   <li>Optionally define a language for node matches and implement {@link SequencePattern.Parser} to compile a
 *       regular expression into a SequencePattern.</li>
 *   <li>Optionally implement a {@link MultiPatternMatcher.NodePatternTrigger}
 *       for optimizing matches across multiple patterns</li>
 *   <li>Optionally implement a {@link NodesMatchChecker} to support backreferences</li>
 * </ul>
 * <p>
 * See {@link TokenSequencePattern} for an example of how this class can be extended
 * to support a specific type {@code T}.
 * </p>
 *
 * <p>
 * To use:
 * </p>
 * <pre><code>
 *   SequencePattern p = SequencePattern.compile(env, "....");
 *   SequenceMatcher m = p.getMatcher(tokens);
 *   while (m.find()) ....
 * </code></pre>
 *
 * <p>
 * To support a new type {@code T}:
 * </p>
 * <ol>
 *   <li>For a type {@code T} to be matchable, it has to have a corresponding {@code NodePattern<T>} that indicates
 *       whether a node is matched or not (see {@code CoreMapNodePattern} for example)</li>
 *   <li>To compile a string into the corresponding pattern, you will need to create a parser
 *       (see inner class {@code Parser}, {@code TokenSequencePattern} and {@code TokenSequenceParser.jj})</li>
 * </ol>
 *
 * <p>
 * SequencePattern supports the following standard regex features:
 * </p>
 * <ul>
 *   <li>Concatenation</li>
 *   <li>Or</li>
 *   <li>Groups (capturing / noncapturing)</li>
 *   <li>Quantifiers (greedy / nongreedy)</li>
 * </ul>
 *
 * <p>
 * SequencePattern also supports the following less standard features:
 * </p>
 * <ol>
 *   <li>Environment (see {@link Env}) with respect to which the patterns are compiled</li>
 *   <li>Binding of variables
 *       <br>Use {@link Env} to bind variables for use when compiling patterns
 *       <br>Can also bind names to groups (see {@link SequenceMatchResult} for accessor methods to retrieve matched groups)
 *   </li>
 *   <li>Backreference matches - need to specify how back references are to be matched using {@link NodesMatchChecker}</li>
 *   <li>Multinode matches - for matching of multiple nodes using non-regex (at least not regex over nodes) patterns
 *       (need to have corresponding {@link MultiNodePattern},
 *       see {@link MultiCoreMapNodePattern} for example)</li>
 *   <li>Conjunctions - conjunctions of sequence patterns (works for some cases)</li>
 * </ol>
 *
* * @author Angel Chang * @see SequenceMatcher */ public class SequencePattern { // TODO: // 1. Validate backref capture groupid // 2. Actions // 3. Inconsistent templating with T // 4. Match sequence begin/end (update TokensSequenceParser to map ^ => SEQ_BEGIN_PATTERN_EXPR, and $ to SEQ_END_PATTERN_EXPR) // 5. Update TokensSequenceParser to handle backref of other attributes (\9{attr1,attr2,...}) private String patternStr; private PatternExpr patternExpr; private SequenceMatchAction action; State root; int totalGroups = 0; // binding of group number to variable name VarGroupBindings varGroupBindings; // Priority associated with the pattern (higher priority patterns should take precedence over lower priority ones) double priority = 0.0; // Weight associated with the pattern double weight = 0.0; protected SequencePattern(SequencePattern.PatternExpr nodeSequencePattern) { this(null, nodeSequencePattern); } protected SequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern) { this(patternStr, nodeSequencePattern, null); } protected SequencePattern(String patternStr, SequencePattern.PatternExpr nodeSequencePattern, SequenceMatchAction action) { this.patternStr = patternStr; this.patternExpr = nodeSequencePattern; this.action = action; nodeSequencePattern = new GroupPatternExpr(nodeSequencePattern, true); nodeSequencePattern = nodeSequencePattern.optimize(); this.totalGroups = nodeSequencePattern.assignGroupIds(0); Frag f = nodeSequencePattern.build(); f.connect(MATCH_STATE); this.root = f.start; varGroupBindings = new VarGroupBindings(totalGroups+1); nodeSequencePattern.updateBindings(varGroupBindings); } @Override public String toString() { return this.pattern(); } public SequencePattern transform(NodePatternTransformer transformer) { if (action != null) { throw new UnsupportedOperationException("transform on actions not yet implemented"); } SequencePattern.PatternExpr transformedPattern = this.patternExpr.transform(transformer); // TODO: 
Make string unique by indicating this pattern was transformed return new SequencePattern(this.patternStr, transformedPattern, null); } public String pattern() { return patternStr; } protected PatternExpr getPatternExpr() { return patternExpr; } public double getPriority() { return priority; } public void setPriority(double priority) { this.priority = priority; } public double getWeight() { return weight; } public void setWeight(double weight) { this.weight = weight; } public SequenceMatchAction getAction() { return action; } public void setAction(SequenceMatchAction action) { this.action = action; } public int getTotalGroups() { return totalGroups; } // Compiles string (regex) to NFA for doing pattern simulation public static SequencePattern compile(Env env, String string) { try { Pair> p = env.parser.parseSequenceWithAction(env, string); return new SequencePattern(string, p.first(), p.second()); } catch (Exception ex) { throw new RuntimeException("Error compiling " + string + " using environment " + env); } //throw new UnsupportedOperationException("Compile from string not implemented"); } protected static SequencePattern compile(SequencePattern.PatternExpr nodeSequencePattern) { return new SequencePattern(nodeSequencePattern); } public SequenceMatcher getMatcher(List tokens) { return new SequenceMatcher(this, tokens); } public OUT findNodePattern(Function, OUT> filter) { Queue todo = new LinkedList(); Set seen = new HashSet(); todo.add(root); seen.add(root); while (!todo.isEmpty()) { State state = todo.poll(); if (state instanceof NodePatternState) { NodePattern pattern = ((NodePatternState) state).pattern; OUT res = filter.apply(pattern); if (res != null) return res; } if (state.next != null) { for (State s: state.next) { if (!seen.contains(s)) { seen.add(s); todo.add(s); } } } } return null; } // Parses string to PatternExpr public static interface Parser { public SequencePattern.PatternExpr parseSequence(Env env, String s) throws Exception; public Pair> 
parseSequenceWithAction(Env env, String s) throws Exception; public SequencePattern.PatternExpr parseNode(Env env, String s) throws Exception; } // Binding of variable names to groups // matches the group indices static class VarGroupBindings { final String[] varnames; // Assumes number of groups low protected VarGroupBindings(int size) { varnames = new String[size]; } protected void set(int index, String name) { varnames[index] = name; } } // Interface indicating when two nodes match protected static interface NodesMatchChecker { public boolean matches(T o1, T o2); } public static final NodesMatchChecker NODES_EQUAL_CHECKER = new NodesMatchChecker() { @Override public boolean matches(Object o1, Object o2) { return o1.equals(o2); } }; public static final PatternExpr ANY_NODE_PATTERN_EXPR = new NodePatternExpr(NodePattern.ANY_NODE); public static final PatternExpr SEQ_BEGIN_PATTERN_EXPR = new SequenceStartPatternExpr(); public static final PatternExpr SEQ_END_PATTERN_EXPR = new SequenceEndPatternExpr(); /** * Represents a sequence pattern expressions (before translating into NFA). */ public abstract static class PatternExpr { protected abstract Frag build(); /** * Assigns group ids to groups embedded in this patterns starting with at the specified number, * returns the next available group id. * * @param start Group id to start with * @return The next available group id */ protected abstract int assignGroupIds(int start); /** * Make a deep copy of the sequence pattern expressions */ protected abstract PatternExpr copy(); /** * Updates the binding of group to variable name * @param bindings */ protected abstract void updateBindings(VarGroupBindings bindings); protected Object value() { return null; } /** Returns an optimized version of this pattern - default is a noop */ protected PatternExpr optimize() { return this; } protected abstract PatternExpr transform(NodePatternTransformer transformer); } /** Represents one element to be matched. 
*/
  public static class NodePatternExpr extends PatternExpr {
    // Pattern used to test a single node
    final NodePattern nodePattern;

    public NodePatternExpr(NodePattern nodePattern) {
      this.nodePattern = nodePattern;
    }

    /** Builds a fragment consisting of a single state that matches one node. */
    @Override
    protected Frag build() {
      State s = new NodePatternState(nodePattern);
      return new Frag(s);
    }

    @Override
    protected PatternExpr copy() {
      return new NodePatternExpr(nodePattern);
    }

    // No groups in a single node pattern: the next available id is unchanged
    @Override
    protected int assignGroupIds(int start) { return start; }

    @Override
    protected void updateBindings(VarGroupBindings bindings) {}

    @Override
    protected PatternExpr transform(NodePatternTransformer transformer) {
      return new NodePatternExpr(transformer.transform(nodePattern));
    }

    @Override
    public String toString() {
      return nodePattern.toString();
    }
  }

  /** Represents a pattern that can match multiple nodes. */
  public static class MultiNodePatternExpr extends PatternExpr {
    private final MultiNodePattern multiNodePattern;

    public MultiNodePatternExpr(MultiNodePattern nodePattern) {
      this.multiNodePattern = nodePattern;
    }

    /** Builds a fragment consisting of a single multi-node matching state. */
    @Override
    protected Frag build() {
      State s = new MultiNodePatternState(multiNodePattern);
      return new Frag(s);
    }

    @Override
    protected PatternExpr copy() {
      return new MultiNodePatternExpr(multiNodePattern);
    }

    // No groups in a multi node pattern: the next available id is unchanged
    @Override
    protected int assignGroupIds(int start) { return start; }

    @Override
    protected void updateBindings(VarGroupBindings bindings) {}

    @Override
    protected PatternExpr transform(NodePatternTransformer transformer) {
      return new MultiNodePatternExpr(transformer.transform(multiNodePattern));
    }

    @Override
    public String toString() {
      return multiNodePattern.toString();
    }
  }

  /** Represents one element to be matched.
*/ public static class SpecialNodePatternExpr extends PatternExpr { private final String name; Factory stateFactory; public SpecialNodePatternExpr(String name) { this.name = name; } public SpecialNodePatternExpr(String name, Factory stateFactory) { this.name = name; this.stateFactory = stateFactory; } @Override protected Frag build() { State s = stateFactory.create(); return new Frag(s); } @Override protected PatternExpr copy() { return new SpecialNodePatternExpr(name, stateFactory); } @Override protected int assignGroupIds(int start) { return start; } @Override protected void updateBindings(VarGroupBindings bindings) {} @Override protected PatternExpr transform(NodePatternTransformer transformer) { return new SpecialNodePatternExpr(name, stateFactory); } public String toString() { return name; } } public static class SequenceStartPatternExpr extends SpecialNodePatternExpr implements Factory { public SequenceStartPatternExpr() { super("SEQ_START"); this.stateFactory = this; } @Override public State create() { return new SeqStartState(); } } public static class SequenceEndPatternExpr extends SpecialNodePatternExpr implements Factory { public SequenceEndPatternExpr() { super("SEQ_END"); this.stateFactory = this; } @Override public State create() { return new SeqEndState(); } } // Represents a sequence of patterns to be matched public static class SequencePatternExpr extends PatternExpr { final List patterns; public SequencePatternExpr(List patterns) { this.patterns = patterns; } public SequencePatternExpr(PatternExpr... 
patterns) { this.patterns = Arrays.asList(patterns); } @Override protected Frag build() { Frag frag = null; if (patterns.size() > 0) { PatternExpr first = patterns.get(0); frag = first.build(); for (int i = 1; i < patterns.size(); i++) { PatternExpr pattern = patterns.get(i); Frag f = pattern.build(); frag.connect(f); } } return frag; } @Override protected int assignGroupIds(int start) { int nextId = start; for (PatternExpr pattern : patterns) { nextId = pattern.assignGroupIds(nextId); } return nextId; } @Override protected void updateBindings(VarGroupBindings bindings) { for (PatternExpr pattern : patterns) { pattern.updateBindings(bindings); } } @Override protected PatternExpr copy() { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.copy()); } return new SequencePatternExpr(newPatterns); } @Override public PatternExpr optimize() { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.optimize()); } return new SequencePatternExpr(newPatterns); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.transform(transformer)); } return new SequencePatternExpr(newPatterns); } public String toString() { return StringUtils.join(patterns, " "); } } // Expression that indicates a back reference // Need to match a previously matched group somehow public static class BackRefPatternExpr extends PatternExpr { private NodesMatchChecker matcher; // How a match is determined private int captureGroupId = -1; // Indicates the previously matched group this need to match public BackRefPatternExpr(NodesMatchChecker matcher, int captureGroupId) { if (captureGroupId <= 0) { throw new IllegalArgumentException("Invalid captureGroupId=" + captureGroupId); } this.captureGroupId = captureGroupId; this.matcher = matcher; } @Override protected Frag build() { State s = new 
BackRefState(matcher, captureGroupId); return new Frag(s); } @Override protected int assignGroupIds(int start) { return start; } @Override protected void updateBindings(VarGroupBindings bindings) {} @Override protected PatternExpr copy() { return new BackRefPatternExpr(matcher, captureGroupId); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { // TODO: Implement me!!! throw new UnsupportedOperationException("BackRefPatternExpr.transform not implemented yet!!! Please implement me!!!"); } public String toString() { StringBuilder sb = new StringBuilder(); if (captureGroupId >= 0) { sb.append('\\').append(captureGroupId); } else { sb.append('\\'); } sb.append('{').append(matcher).append('}'); return sb.toString(); } } public static class ValuePatternExpr extends PatternExpr { private final PatternExpr expr; private final Object value; public ValuePatternExpr(PatternExpr expr, Object value) { this.expr = expr; this.value = value; } @Override protected Frag build() { Frag frag = expr.build(); frag.connect(new ValueState(value)); return frag; } @Override protected int assignGroupIds(int start) { return expr.assignGroupIds(start); } @Override protected PatternExpr copy() { return new ValuePatternExpr(expr.copy(), value); } @Override protected PatternExpr optimize() { return new ValuePatternExpr(expr.optimize(), value); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { return new ValuePatternExpr(expr.transform(transformer), value); } @Override protected void updateBindings(VarGroupBindings bindings) { expr.updateBindings(bindings); } } /** Expression that represents a group. */ public static class GroupPatternExpr extends PatternExpr { private final PatternExpr pattern; private final boolean capture; // Do capture or not? 
If do capture, an capture group id will be assigned private int captureGroupId; // -1 if this pattern is not part of a capture group or capture group not yet assigned, // otherwise, capture group number private final String varname; // Alternate variable with which to refer to this group public GroupPatternExpr(PatternExpr pattern) { this(pattern, true); } public GroupPatternExpr(PatternExpr pattern, boolean capture) { this(pattern, capture, -1, null); } public GroupPatternExpr(PatternExpr pattern, String varname) { this(pattern, true, -1, varname); } private GroupPatternExpr(PatternExpr pattern, boolean capture, int captureGroupId, String varname) { this.pattern = pattern; this.capture = capture; this.captureGroupId = captureGroupId; this.varname = varname; } @Override protected Frag build() { Frag f = pattern.build(); Frag frag = new Frag(new GroupStartState(captureGroupId, f.start), f.out); frag.connect(new GroupEndState(captureGroupId)); return frag; } @Override protected int assignGroupIds(int start) { int nextId = start; if (capture) { captureGroupId = nextId; nextId++; } return pattern.assignGroupIds(nextId); } @Override protected void updateBindings(VarGroupBindings bindings) { if (varname != null) { bindings.set(captureGroupId, varname); } pattern.updateBindings(bindings); } @Override protected PatternExpr copy() { return new GroupPatternExpr(pattern.copy(), capture, captureGroupId, varname); } @Override protected PatternExpr optimize() { return new GroupPatternExpr(pattern.optimize(), capture, captureGroupId, varname); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { return new GroupPatternExpr(pattern.transform(transformer), capture, captureGroupId, varname); } public String toString() { StringBuilder sb = new StringBuilder(); sb.append('('); if (!capture) { sb.append("?: "); } else if (varname != null) { sb.append('?').append(varname).append(' '); } sb.append(pattern); sb.append(')'); return sb.toString(); } } /** 
Expression that represents a pattern that repeats for a number of times. */ public static class RepeatPatternExpr extends PatternExpr { private final PatternExpr pattern; private final int minMatch; private final int maxMatch; private final boolean greedyMatch; public RepeatPatternExpr(PatternExpr pattern, int minMatch, int maxMatch) { this(pattern, minMatch, maxMatch, true); } public RepeatPatternExpr(PatternExpr pattern, int minMatch, int maxMatch, boolean greedy) { if (minMatch < 0) { throw new IllegalArgumentException("Invalid minMatch=" + minMatch); } if (maxMatch >= 0 && minMatch > maxMatch) { throw new IllegalArgumentException("Invalid minMatch=" + minMatch + ", maxMatch=" + maxMatch); } this.pattern = pattern; this.minMatch = minMatch; this.maxMatch = maxMatch; this.greedyMatch = greedy; } @Override protected Frag build() { Frag f = pattern.build(); if (minMatch == 1 && maxMatch == 1) { return f; } else if (minMatch <= 5 && maxMatch <= 5 && greedyMatch) { // Make copies if number of matches is low // Doesn't handle nongreedy matches yet // For non greedy match need to move curOut before the recursive connect // Create NFA fragment that // have child pattern repeating for minMatch times if (minMatch > 0) { // frag.start -> pattern NFA -> pattern NFA -> for (int i = 0; i < minMatch-1; i++) { Frag f2 = pattern.build(); f.connect(f2); } } else { // minMatch is 0 // frag.start -> f = new Frag(new State()); } if (maxMatch < 0) { // Unlimited (loop back to self) // -------- // \|/ | // ---> pattern NFA ---> Set curOut = f.out; Frag f2 = pattern.build(); f2.connect(f2); f.connect(f2); f.add(curOut); } else { // Limited number of times this pattern repeat, // just keep add pattern (with option of being done) until maxMatch reached // ----> pattern NFA ----> pattern NFA ---> // | | // --> ---> for (int i = minMatch; i < maxMatch; i++) { Set curOut = f.out; Frag f2 = pattern.build(); f.connect(f2); f.add(curOut); } } return f; } else { // More general but more 
expensive matching (when branching, need to keep state explicitly) State s = new RepeatState(f.start, minMatch, maxMatch, greedyMatch); f.connect(s); return new Frag(s); } } @Override protected int assignGroupIds(int start) { return pattern.assignGroupIds(start); } @Override protected void updateBindings(VarGroupBindings bindings) { pattern.updateBindings(bindings); } @Override protected PatternExpr copy() { return new RepeatPatternExpr(pattern.copy(), minMatch, maxMatch, greedyMatch); } @Override protected PatternExpr optimize() { return new RepeatPatternExpr(pattern.optimize(), minMatch, maxMatch, greedyMatch); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { return new RepeatPatternExpr(pattern.transform(transformer), minMatch, maxMatch, greedyMatch); } public String toString() { StringBuilder sb = new StringBuilder(); sb.append(pattern); sb.append('{').append(minMatch).append(',').append(maxMatch).append('}'); if (!greedyMatch) { sb.append('?'); } return sb.toString(); } } /** Expression that represents a disjunction. */ public static class OrPatternExpr extends PatternExpr { private final List patterns; public OrPatternExpr(List patterns) { this.patterns = patterns; } public OrPatternExpr(PatternExpr... patterns) { this.patterns = Arrays.asList(patterns); } @Override protected Frag build() { Frag frag = new Frag(); frag.start = new State(); // Create NFA fragment that // have one starting state that branches out to NFAs created by the children expressions // ---> pattern 1 ---> // | // ---> pattern 2 ---> // ... 
for (PatternExpr pattern : patterns) { // Build child NFA Frag f = pattern.build(); if (pattern.value() != null) { // Add value state to child NFA f.connect(new ValueState(pattern.value())); } // Add child NFA to next states of fragment start frag.start.add(f.start); // Add child NFA out (unlinked) states to out (unlinked) states of this fragment frag.add(f.out); } return frag; } @Override protected int assignGroupIds(int start) { int nextId = start; // assign group ids of child expressions for (PatternExpr pattern : patterns) { nextId = pattern.assignGroupIds(nextId); } return nextId; } @Override protected void updateBindings(VarGroupBindings bindings) { // update bindings of child expressions for (PatternExpr pattern : patterns) { pattern.updateBindings(bindings); } } @Override protected PatternExpr copy() { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.copy()); } return new OrPatternExpr(newPatterns); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.transform(transformer)); } return new OrPatternExpr(newPatterns); } public String toString() { return StringUtils.join(patterns, " | "); } // minimize size of or clauses to trigger optimization private static final int OPTIMIZE_MIN_SIZE = 5; @Override protected PatternExpr optimize() { if (patterns.size() <= OPTIMIZE_MIN_SIZE) { // Not enough patterns for fancy optimization List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.optimize()); } return new OrPatternExpr(newPatterns); } else { // More fancy optimization return optimizeOr(); } } private PatternExpr optimizeOr() { PatternExpr optimizedStringSeqs = optimizeOrStringSeqs(); // Go through patterns and get candidate sequences with the same start... 
return optimizedStringSeqs; } private PatternExpr optimizeOrStringSeqs() { // Try to collapse OR of NodePattern with just strings into a StringInSetAnnotationPattern List opts = new ArrayList(patterns.size()); // Map from annotation key (Class), ignoreCase (Boolean) to set of patterns/strings Map, Pair, Set>> stringPatterns = new HashMap, Pair, Set>>(); Map, Pair, Set>>> stringSeqPatterns = new HashMap, Pair, Set>>>(); // Go through patterns and get candidates for optimization for (PatternExpr p:patterns) { PatternExpr opt = p.optimize(); opts.add(opt); // Check for special patterns that we can optimize if (opt instanceof NodePatternExpr) { Pair pair = _getStringAnnotation_(opt); if (pair != null) { Boolean ignoreCase = pair.second.ignoreCase(); String target = pair.second.target; Pair key = Pair.makePair(pair.first, ignoreCase); Pair, Set> saved = stringPatterns.get(key); if (saved == null) { saved = new Pair, Set>(new ArrayList(), new HashSet()); stringPatterns.put(key, saved); } saved.first.add(opt); saved.second.add(target); } } else if (opt instanceof SequencePatternExpr) { SequencePatternExpr seq = (SequencePatternExpr) opt; if (seq.patterns.size() > 0) { boolean isStringSeq = true; Pair key = null; List strings = null; for (PatternExpr sp: seq.patterns) { // check if string match over same key Pair pair = _getStringAnnotation_(sp); if (pair != null) { if (key != null) { // check key if (key.first.equals(pair.first) && key.second.equals(pair.second.ignoreCase())) { // okay } else { isStringSeq = false; break; } } else { key = Pair.makePair(pair.first, pair.second.ignoreCase()); strings = new ArrayList(); } strings.add(pair.second.target); } else { isStringSeq = false; break; } } if (isStringSeq) { Pair, Set>> saved = stringSeqPatterns.get(key); if (saved == null) { saved = new Pair, Set>>(new ArrayList(), new HashSet>()); stringSeqPatterns.put(key, saved); } saved.first.add(opt); saved.second.add(strings); } } } } // Go over our maps and see if any of these 
strings should be optimized away // Keep track of things we have optimized away Map alreadyOptimized = new IdentityHashMap(); List finalOptimizedPatterns = new ArrayList(patterns.size()); // optimize strings for (Map.Entry, Pair, Set>> entry : stringPatterns.entrySet()) { Pair, Set> saved = entry.getValue(); Set set = saved.second; int flags = (entry.getKey().second)? NodePattern.CASE_INSENSITIVE:0; if (set.size() > OPTIMIZE_MIN_SIZE) { PatternExpr optimized = new NodePatternExpr( new CoreMapNodePattern(entry.getKey().first, new CoreMapNodePattern.StringInSetAnnotationPattern(set, flags))); finalOptimizedPatterns.add(optimized); for (PatternExpr p:saved.first) { alreadyOptimized.put(p, true); } } } // optimize string sequences for (Map.Entry, Pair, Set>>> entry : stringSeqPatterns.entrySet()) { Pair, Set>> saved = entry.getValue(); Set> set = saved.second; if (set.size() > OPTIMIZE_MIN_SIZE) { Pair key = entry.getKey(); PatternExpr optimized = new MultiNodePatternExpr( new MultiCoreMapNodePattern.StringSequenceAnnotationPattern(key.first(), set, key.second())); finalOptimizedPatterns.add(optimized); for (PatternExpr p:saved.first) { alreadyOptimized.put(p, true); } } } // Add back original stuff that we didn't optimize for (PatternExpr p: opts) { Boolean included = alreadyOptimized.get(p); if (included == null || !included) { finalOptimizedPatterns.add(p); } } return new OrPatternExpr(finalOptimizedPatterns); } private static Pair _getStringAnnotation_(PatternExpr p) { if (p instanceof NodePatternExpr) { NodePattern nodePattern = ((NodePatternExpr) p).nodePattern; if (nodePattern instanceof CoreMapNodePattern) { List> annotationPatterns = ((CoreMapNodePattern) nodePattern).getAnnotationPatterns(); if (annotationPatterns.size() == 1) { // Check if it is a string annotation pattern Pair pair = annotationPatterns.get(0); if (pair.second instanceof CoreMapNodePattern.StringAnnotationPattern) { return Pair.makePair(pair.first, 
(CoreMapNodePattern.StringAnnotationPattern) pair.second); } } } } return null; } } // Expression that represents a conjunction public static class AndPatternExpr extends PatternExpr { private final List patterns; public AndPatternExpr(List patterns) { this.patterns = patterns; } public AndPatternExpr(PatternExpr... patterns) { this.patterns = Arrays.asList(patterns); } @Override protected Frag build() { ConjStartState conjStart = new ConjStartState(patterns.size()); Frag frag = new Frag(); frag.start = conjStart; // Create NFA fragment that // have one starting state that branches out to NFAs created by the children expressions // AND START ---> pattern 1 ---> AND END (0/n) // | // ---> pattern 2 ---> AND END (1/n) // ... for (int i = 0; i < patterns.size(); i++) { PatternExpr pattern = patterns.get(i); // Build child NFA Frag f = pattern.build(); // Add child NFA to next states of fragment start frag.start.add(f.start); f.connect(new ConjEndState(conjStart, i)); // Add child NFA out (unlinked) states to out (unlinked) states of this fragment frag.add(f.out); } return frag; } @Override protected int assignGroupIds(int start) { int nextId = start; // assign group ids of child expressions for (PatternExpr pattern : patterns) { nextId = pattern.assignGroupIds(nextId); } return nextId; } @Override protected void updateBindings(VarGroupBindings bindings) { // update bindings of child expressions for (PatternExpr pattern : patterns) { pattern.updateBindings(bindings); } } @Override protected PatternExpr copy() { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.copy()); } return new AndPatternExpr(newPatterns); } @Override protected PatternExpr optimize() { List newPatterns = new ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.optimize()); } return new AndPatternExpr(newPatterns); } @Override protected PatternExpr transform(NodePatternTransformer transformer) { List newPatterns = new 
ArrayList(patterns.size()); for (PatternExpr p:patterns) { newPatterns.add(p.transform(transformer)); } return new AndPatternExpr(newPatterns); } public String toString() { return StringUtils.join(patterns, " & "); } } /****** NFA states for matching sequences *********/ // Patterns are converted to the NFA states // Assumes the matcher will step through the NFA states one token at a time /** * An accepting matching state */ protected static final State MATCH_STATE = new MatchState(); /** * Represents a state in the NFA corresponding to a regular expression for matching a sequence */ static class State { /** * Set of next states from this current state. * NOTE: Most of the time, next is just one state. */ Set next; boolean hasSavedValue; protected State() {} /** * Update the set of out states by unlinked states from this state * @param out - Current set of out states (to be updated by this function) */ protected void updateOutStates(Set out) { if (next == null) { out.add(this); } else { for (State s:next) { s.updateOutStates(out); } } } /** * Non-consuming match. * @param bid - Branch id * @param matchedStates - State of the matching so far (to be updated by the matching process) * @return true if match */ protected boolean match0(int bid, SequenceMatcher.MatchedStates matchedStates) { return match(bid, matchedStates, false); } /** * Consuming match. * @param bid - Branch id * @param matchedStates - State of the matching so far (to be updated by the matching process) * @return true if match */ protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates) { return match(bid, matchedStates, true); } protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume) { return match(bid, matchedStates, consume, null); } /** * Given the current matched states, attempts to run NFA from this state. 
* If consuming: tries to match the next element - goes through states until an element is consumed or match is false * If non-consuming: does not match the next element - goes through non element consuming states * In both cases, matchedStates should be updated as follows: * - matchedStates should be updated with the next state to be processed * @param bid - Branch id * @param matchedStates - State of the matching so far (to be updated by the matching process) * @param consume - Whether to consume the next element or not * @return true if match */ protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { boolean match = false; if (next != null) { int i = 0; for (State s:next) { i++; boolean m = s.match(matchedStates.branchStates.getBranchId(bid,i,next.size()), matchedStates, consume, this); if (m) { // NOTE: We don't break because other branches may have useful state information match = true; } } } return match; } /** * Add state to the set of next states. * @param nextState - state to add */ protected void add(State nextState) { if (next == null) { next = new LinkedHashSet(); } next.add(nextState); } public Object value(int bid, SequenceMatcher.MatchedStates matchedStates) { if (hasSavedValue) { HasInterval matchedInterval = matchedStates.getBranchStates().getMatchedInterval(bid, this); if (matchedInterval != null && matchedInterval instanceof ValuedInterval) { return ((ValuedInterval) matchedInterval).getValue(); } } return null; } } /** * Final accepting state. */ private static class MatchState extends State { @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // Always add this state back (effectively looping forever in this matching state) matchedStates.addState(bid, this); return false; } } /** * State with associated value. 
*/ private static class ValueState extends State { final Object value; private ValueState(Object value) { this.value = value; } @Override public Object value(int bid, SequenceMatcher.MatchedStates matchedStates) { return value; } } /** * State for matching one element/node */ private static class NodePatternState extends State { final NodePattern pattern; protected NodePatternState(NodePattern p) { this.pattern = p; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { if (consume) { // Get element and return if it matched or not T node = matchedStates.get(); // TODO: Fix type checking if (matchedStates.matcher.matchWithResult) { Object obj = pattern.matchWithResult(node); if (obj != null) { if (obj != Boolean.TRUE) { matchedStates.branchStates.setMatchedResult(bid, matchedStates.curPosition, obj); } // If matched, need to add next states to the queue of states to be processed matchedStates.addStates(bid, next); return true; } else { return false; } } else { if (node != null && pattern.match(node)) { // If matched, need to add next states to the queue of states to be processed matchedStates.addStates(bid, next); return true; } else { return false; } } } else { // Not consuming element - add this state back to queue of states to be processed // This state was not successfully matched matchedStates.addState(bid, this); return false; } } } /** * State for matching multiple elements/nodes. 
*/ private static class MultiNodePatternState extends State { private final MultiNodePattern pattern; protected MultiNodePatternState(MultiNodePattern p) { this.pattern = p; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { if (consume) { HasInterval matchedInterval = matchedStates.getBranchStates().getMatchedInterval(bid, this); int cur = matchedStates.curPosition; if (matchedInterval == null) { // Haven't tried to match this node before, try now // Get element and return if it matched or not List nodes = matchedStates.elements(); // TODO: Fix type checking Collection> matched = pattern.match(nodes, cur); // TODO: Check intervals are valid? Start at cur and ends after? if (matched != null && matched.size() > 0) { int nBranches = matched.size(); int i = 0; for (HasInterval interval:matched) { i++; int bid2 = matchedStates.getBranchStates().getBranchId(bid, i, nBranches); matchedStates.getBranchStates().setMatchedInterval(bid2, this, interval); // If matched, need to add next states to the queue of states to be processed // keep in current state until end node reached if (interval.getInterval().getEnd()-1 <= cur) { matchedStates.addStates(bid2, next); } else { matchedStates.addState(bid2, this); } } return true; } else { return false; } } else { // Previously matched this state - just need to step through until we get to end of matched interval if (matchedInterval.getInterval().getEnd()-1 <= cur) { matchedStates.addStates(bid, next); } else { matchedStates.addState(bid, this); } return true; } } else { // Not consuming element - add this state back to queue of states to be processed // This state was not successfully matched matchedStates.addState(bid, this); return false; } } } /** * State that matches a pattern that can occur multiple times. 
*/ private static class RepeatState extends State { private final State repeatStart; private final int minMatch; private final int maxMatch; private final boolean greedyMatch; public RepeatState(State start, int minMatch, int maxMatch, boolean greedyMatch) { this.repeatStart = start; this.minMatch = minMatch; this.maxMatch = maxMatch; this.greedyMatch = greedyMatch; if (minMatch < 0) { throw new IllegalArgumentException("Invalid minMatch=" + minMatch); } if (maxMatch >= 0 && minMatch > maxMatch) { throw new IllegalArgumentException("Invalid minMatch=" + minMatch + ", maxMatch=" + maxMatch); } } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // Get how many times this states has already been matched int matchedCount = matchedStates.getBranchStates().endMatchedCountInc(bid, this); // Get the minimum number of times we still need to match this state int minMatchLeft = minMatch - matchedCount; if (minMatchLeft < 0) { minMatchLeft = 0; } // Get the maximum number of times we can match this state int maxMatchLeft; if (maxMatch < 0) { // Indicate unlimited matching maxMatchLeft = maxMatch; } else { maxMatchLeft = maxMatch - matchedCount; if (maxMatch < 0) { // Already exceeded the maximum number of times we can match this state // indicate state not matched return false; } } boolean match = false; // See how many branching options there are... int totalBranches = 0; if (minMatchLeft == 0 && next != null) { totalBranches += next.size(); } if (maxMatchLeft != 0) { totalBranches++; } int i = 0; // branch index // Check if there we have met the minimum number of matches // If so, go ahead and try to match next state // (if we need to consume an element or end a group) if (minMatchLeft == 0 && next != null) { for (State s:next) { i++; // Increment branch index // Depending on greedy match or not, different priority to branches int pi = (greedyMatch && maxMatchLeft != 0)? 
i+1:i; int bid2 = matchedStates.getBranchStates().getBranchId(bid,pi,totalBranches); matchedStates.getBranchStates().clearMatchedCount(bid2, this); boolean m = s.match(bid2, matchedStates, consume); if (m) { match = true; } } } // Check if we have the option of matching more // (maxMatchLeft < 0 indicate unlimited, maxMatchLeft > 0 indicate we are still allowed more matches) if (maxMatchLeft != 0) { i++; // Increment branch index // Depending on greedy match or not, different priority to branches int pi = greedyMatch? 1:i; int bid2 = matchedStates.getBranchStates().getBranchId(bid,pi,totalBranches); if (consume) { // Consuming - try to see if repeating this pattern does anything boolean m = repeatStart.match(bid2, matchedStates, consume); if (m) { match = true; // Mark how many times we have matched this pattern matchedStates.getBranchStates().startMatchedCountInc(bid2, this); } } else { // Not consuming - don't do anything, just add this back to list of states to be processed matchedStates.addState(bid2, this); } } return match; } } /** * State for matching previously matched group. 
*/ static class BackRefState extends State { private final NodesMatchChecker matcher; private final int captureGroupId; public BackRefState(NodesMatchChecker matcher, int captureGroupId) { this.matcher = matcher; this.captureGroupId = captureGroupId; } protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, SequenceMatcher.MatchedGroup matchedGroup, int matchedNodes) { T node = matchedStates.get(); if (matcher.matches(node, matchedStates.elements().get(matchedGroup.matchBegin+matchedNodes))) { matchedNodes++; matchedStates.getBranchStates().setMatchStateInfo(bid, this, new Pair(matchedGroup, matchedNodes)); int len = matchedGroup.matchEnd - matchedGroup.matchBegin; if (len == matchedNodes) { matchedStates.addStates(bid, next); } else { matchedStates.addState(bid, this); } return true; } return false; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // Try to match previous node/nodes exactly if (consume) { // First element is group that is matched, second is number of nodes matched so far Pair backRefState = (Pair) matchedStates.getBranchStates().getMatchStateInfo(bid, this); if (backRefState == null) { // Haven't tried to match this node before, try now // Get element and return if it matched or not SequenceMatcher.MatchedGroup matchedGroup = matchedStates.getBranchStates().getMatchedGroup(bid, captureGroupId); if (matchedGroup != null) { // See if the first node matches if (matchedGroup.matchEnd > matchedGroup.matchBegin) { boolean matched = match(bid, matchedStates, matchedGroup, 0); return matched; } else { // TODO: Check handling of previous nodes that are zero elements? 
return super.match(bid, matchedStates, consume, prevState); } } return false; } else { SequenceMatcher.MatchedGroup matchedGroup = backRefState.first(); int matchedNodes = backRefState.second(); boolean matched = match(bid, matchedStates, matchedGroup, matchedNodes); return matched; } } else { // Not consuming, just add this state back to list of states to be processed matchedStates.addState(bid, this); return false; } } } /** * State for matching the start of a group. */ static class GroupStartState extends State { private final int captureGroupId; public GroupStartState(int captureGroupId, State startState) { this.captureGroupId = captureGroupId; add(startState); } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // We only mark start when about to consume elements if (consume) { // Start of group, mark start matchedStates.setGroupStart(bid, captureGroupId); return super.match(bid, matchedStates, consume, prevState); } else { // Not consuming, just add this state back to list of states to be processed matchedStates.addState(bid, this); return false; } } } /** * State for matching the end of a group. */ static class GroupEndState extends State { private final int captureGroupId; public GroupEndState(int captureGroupId) { this.captureGroupId = captureGroupId; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // Opposite of GroupStartState // Don't do anything when we are about to consume an element // Only we are done consuming, and preparing to go on to the next element // do we mark the end of the group if (consume) { return false; } else { Object v = (prevState != null)? 
prevState.value(bid, matchedStates):null; matchedStates.setGroupEnd(bid, captureGroupId, v); return super.match(bid, matchedStates, consume, prevState); } } } static class ConjMatchStateInfo { // A conjunction consists of several child expressions // When the conjunction state is entered, // we keep track of the branch id and the node index // we are on at that time (startBid and startPos) /** * The branch id when the conjunction state is entered */ private final int startBid; /** * The node index when the conjunction state is entered */ private final int startPos; /** * The number of child expressions making up the conjunction */ private final int childCount; /** * For each child expression, we keep track of the * set of branch ids that causes the child expression to * be satisfied (and their corresponding node index * when the expression is satisfied) */ private final Set>[] reachableChildBids; private ConjMatchStateInfo(int startBid, int childCount, int startPos) { this.startBid = startBid; this.startPos = startPos; this.childCount = childCount; this.reachableChildBids = new Set[childCount]; } private void addChildBid(int i, int bid, int pos) { if (reachableChildBids[i] == null) { reachableChildBids[i] = new ArraySet>(); } reachableChildBids[i].add(new Pair(bid,pos) ); } private boolean isAllChildMatched() { for (Set> v:reachableChildBids) { if (v == null || v.isEmpty()) return false; } return true; } /** * Returns true if there is a feasible combination of child branch ids that * causes all child expressions to be satisfied with * respect to the specified child expression * (assuming satisfaction with the specified branch and node index) * For other child expressions to have a compatible satisfiable branch, * that branch must also terminate with the same node index as this one. 
* * @param index - Index of the child expression * @param bid - Branch id that causes the indexed child to be satisfied * @param pos - Node index that causes the indexed child to be satisfied * @return whether there is a feasible combination that causes all * children to be satisfied with respect to specified child. */ private boolean isAllChildMatched(int index, int bid, int pos) { for (int i = 0; i < reachableChildBids.length; i++) { Set> v = reachableChildBids[i]; if (v == null || v.isEmpty()) return false; if (i != index) { boolean ok = false; for (Pair p:v) { if (p.second() == pos) { ok = true; break; } } if (!ok) { return false; } } } return true; } /** * Returns array of child branch ids that * causes all child expressions to be satisfied with * respect to the specified child expression * (assuming satisfaction with the specified branch and node index). * For other child expressions to have a compatible satisfiable branch, * that branch must also terminate with the same node index as this one. * * @param index - Index of the child expression * @param bid - Branch id that causes the indexed child to be satisfied * @param pos - Node index that causes the indexed child to be satisfied * @return array of child branch ids if there is a valid combination * null otherwise */ private int[] getAllChildMatchedBids(int index, int bid, int pos) { int[] matchedBids = new int[reachableChildBids.length]; for (int i = 0; i < reachableChildBids.length; i++) { Set> v = reachableChildBids[i]; if (v == null || v.isEmpty()) return null; if (i != index) { boolean ok = false; for (Pair p:v) { if (p.second() == pos) { ok = true; matchedBids[i] = p.first(); break; } } if (!ok) { return null; } } else { matchedBids[i] = bid; } } return matchedBids; } protected void updateKeepBids(Set bids) { // TODO: Is there a point when we don't need to keep these bids anymore? 
for (Set> v : reachableChildBids) { if (v != null) { for (Pair p : v) { bids.add(p.first()); } } } } } // States for matching conjunctions // - Basic, not well tested implementation that may not work for all cases ... // - Can be optimized to terminate earlier if one branch of the conjunction is known not to succeed // - May cause lots of states to be kept (not efficient) // - priority should be specified for conjunction branches (there can be conflicting greedy/nongreedy patterns) // (should we prioritize by order?) - currently behavior is not well defined /** * State for matching a conjunction */ static class ConjStartState extends State { private final int childCount; // Number of children that this conjunction consists of public ConjStartState(int childCount) { this.childCount = childCount; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { matchedStates.getBranchStates().setMatchStateInfo(bid, this, new ConjMatchStateInfo(bid, childCount, matchedStates.curPosition)); // Start of conjunction, mark start boolean allMatch = true; if (next != null) { int i = 0; for (State s:next) { i++; boolean m = s.match(matchedStates.getBranchStates().getBranchId(bid,i,next.size()), matchedStates, consume); if (!m) { allMatch = false; break; } } } return allMatch; } } /** * State for matching the end of a conjunction. 
*/ static class ConjEndState extends State { private final ConjStartState startState; private final int childIndex; public ConjEndState(ConjStartState startState, int childIndex) { this.startState = startState; this.childIndex = childIndex; } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { // Opposite of ConjStartState // Don't do anything when we are about to consume an element // Only we are done consuming, and preparing to go on to the next element // do we check if all branches matched if (consume) { return false; } else { // NOTE: There is a delayed matched here, in that we actually want to remember // which of the incoming branches succeeded // Use the bid of the corresponding ConjAndState? ConjMatchStateInfo stateInfo = (ConjMatchStateInfo) matchedStates.getBranchStates().getMatchStateInfo(bid, startState); if (stateInfo != null) { stateInfo.addChildBid(childIndex, bid, matchedStates.curPosition); int[] matchedBids = stateInfo.getAllChildMatchedBids(childIndex, bid, matchedStates.curPosition); if (matchedBids != null) { matchedStates.getBranchStates().addBidsToCollapse(bid, matchedBids); return super.match(bid, matchedStates, consume, prevState); } } return false; } } } /** * State for matching start of sequence. */ static class SeqStartState extends State { public SeqStartState() { } @Override protected boolean match(int bid, SequenceMatcher.MatchedStates matchedStates, boolean consume, State prevState) { if (consume) { if (matchedStates.curPosition == 0) { // Okay - try next return super.match(bid, matchedStates, consume, this); } } return false; } } /** * State for matching end of sequence. 
*/
  static class SeqEndState extends State {
    public SeqEndState() {
    }

    /**
     * Forwards to the successors only when not consuming and the current position is the
     * index of the last element.
     *
     * @return result of the successors in that case; false otherwise
     */
    @Override
    protected <T> boolean match(int bid, SequenceMatcher.MatchedStates<T> matchedStates, boolean consume, State prevState) {
      if (!consume && matchedStates.curPosition == matchedStates.elements().size()-1) {
        // Okay - try next
        return super.match(bid, matchedStates, consume, this);
      }
      return false;
    }
  }

  /**
   * Represents an incomplete NFA with a start State and a set of unlinked out states.
   */
  private static class Frag {
    /** Entry state of the fragment. */
    State start;
    /** States whose outgoing links are still to be connected. */
    Set<State> out;

    /** Creates an empty fragment; {@code start} and {@code out} are left null. */
    protected Frag() {
    }

    /**
     * Creates a fragment rooted at {@code start}; the out states are collected
     * via {@code start.updateOutStates}.
     */
    protected Frag(State start) {
      this.start = start;
      this.out = new LinkedHashSet<>();
      start.updateOutStates(out);
    }

    /** Creates a fragment from an explicit start state and out-state set (not copied). */
    protected Frag(State start, Set<State> out) {
      this.start = start;
      this.out = out;
    }

    /** Adds one out state, creating the out set if necessary. */
    protected void add(State outState) {
      if (out == null) {
        out = new LinkedHashSet<>();
      }
      out.add(outState);
    }

    /** Adds several out states, creating the out set if necessary. */
    protected void add(Collection<State> outStates) {
      if (out == null) {
        out = new LinkedHashSet<>();
      }
      out.addAll(outStates);
    }

    // Connect frag f to the out states of this frag
    // the out states of this frag is updated to be the out states of f
    protected void connect(Frag f) {
      for (State s : out) {
        s.add(f.start);
      }
      out = f.out;
    }

    // Connect state to the out states of this frag
    // the out states of this frag is updated to be the out states of state
    protected void connect(State state) {
      for (State s : out) {
        s.add(state);
      }
      out = new LinkedHashSet<>();
      state.updateOutStates(out);
    }
  } // end static class Frag

} // end class SequencePattern