All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.regex.Operation Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2020 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.regex;


import net.sf.saxon.expr.sort.EmptyIntIterator;
import net.sf.saxon.regex.charclass.CharacterClass;
import net.sf.saxon.regex.charclass.EmptyCharacterClass;
import net.sf.saxon.regex.charclass.IntSetCharacterClass;
import net.sf.saxon.regex.charclass.SingletonCharacterClass;
import net.sf.saxon.trans.UncheckedXPathException;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.util.FastStringBuffer;
import net.sf.saxon.z.*;

import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.function.IntPredicate;

/**
 * Represents an operation or instruction in the regular expression program. The class Operation
 * is abstract, and has concrete subclasses for each kind of operation/instruction
 */
public abstract class Operation {

    /**
     * Result of {@code matchesEmptyString()}: the operation is statically known to match a
     * zero-length string at the start of the input.
     */
    static final int MATCHES_ZLS_AT_START = 1;
    /**
     * Result of {@code matchesEmptyString()}: the operation is statically known to match a
     * zero-length string at the end of the input.
     */
    static final int MATCHES_ZLS_AT_END = 2;
    /**
     * Result of {@code matchesEmptyString()}: the operation is statically known to match a
     * zero-length string anywhere in the input. The value 7 has both the
     * {@link #MATCHES_ZLS_AT_START} (1) and {@link #MATCHES_ZLS_AT_END} (2) bits set, so a
     * bitwise test for either of those (as used by the sequence operation in this file) also
     * succeeds for this value.
     */
    static final int MATCHES_ZLS_ANYWHERE = 7;
    /**
     * Result of {@code matchesEmptyString()}: the operation is statically known never to match a
     * zero-length string. Code in this file checks for this value by equality rather than by
     * bit-testing.
     */
    static final int MATCHES_ZLS_NEVER = 1024;

    /**
     * Get an iterator returning all the matches for this operation.
     *
     * <p>Each value delivered by the iterator is the position in the input at which a match
     * starting at {@code position} ends. The concrete operations in this file deliver no values
     * when the operation cannot match at that position.</p>
     *
     * @param matcher  supplies the context for the matching; may be updated with information about
     *                 captured groups
     * @param position the start position to seek a match
     * @return an iterator returning the endpoints of all matches starting at the supplied position
     */

    public abstract IntIterator iterateMatches(REMatcher matcher, int position);

    /**
     * Get the length of the matches returned by this operation if they are fixed-length.
     *
     * <p>This default implementation reports variable length; subclasses whose matches always
     * have the same length override it.</p>
     *
     * @return the length of the matches, or -1 if the length is variable
     */

    public int getMatchLength() {
        // -1 is the "variable length" marker described in the @return clause
        return -1;
    }

    /**
     * Get the minimum length of the matches returned by this operation.
     *
     * <p>This default derives the answer from {@link #getMatchLength()}: when that method reports
     * a fixed length, the same value is the minimum; when it reports a negative value (variable
     * length), the minimum is taken to be zero.</p>
     *
     * @return the length of the shortest string that will match the operation
     */

    public int getMinimumMatchLength() {
        final int fixedLength = getMatchLength();
        if (fixedLength >= 0) {
            // Fixed-length operation: shortest match and only match length coincide
            return fixedLength;
        }
        // Variable length: nothing better than zero is known here
        return 0;
    }

    /**
     * Ask whether the regular expression is known, after static analysis, to match a
     * zero-length string
     * @return a value indicating whether the regex is statically known to match
     * a zero-length string. Specifically:
     * <ul>
     * <li>returns {@link #MATCHES_ZLS_AT_START} if the expression is statically known to match
     * a zero-length string at the start of the supplied input;</li>
     * <li>returns {@link #MATCHES_ZLS_AT_END} if it is statically known to match a zero-length
     * string at the end of the supplied input;</li>
     * <li>returns {@link #MATCHES_ZLS_ANYWHERE} if it is statically known to match a
     * zero-length string anywhere in the input;</li>
     * <li>returns {@link #MATCHES_ZLS_NEVER} if it is statically known that the
     * regex will never match a zero-length string.</li>
     * </ul>
* Returning 0 means that it is not known statically whether or not the regex * will match a zero-length string; this case typically arises when back-references * are involved. */ public abstract int matchesEmptyString(); /** * Ask whether the expression contains any capturing sub-expressions * @return true if the expression contains any capturing sub-expressions (but not * if it is a capturing expression itself, unless it contains nested capturing * expressions) */ public boolean containsCapturingExpressions() { return false; } /** * Get a CharacterClass identifying the set of characters that can appear as the first * character of a non-empty string that matches this term. This is allowed to be an * over-estimate (that is, the returned Character class must match every character * that can legitimately appear at the start of the matched string, but it can * also match other characters). * @param caseBlind true if case-blind matching is in force ("i" flag) */ public CharacterClass getInitialCharacterClass(boolean caseBlind) { return EmptyCharacterClass.getComplement(); } /** * Optimize the operation * @return the optimized operation * @param program the program being optimized * @param flags the regular expression flags */ public Operation optimize(REProgram program, REFlags flags) { return this; } /** * Display the operation as a regular expression, possibly in abbreviated form * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ public abstract String display(); /** * A choice of several branches */ public static class OpChoice extends Operation { List branches; OpChoice(List branches) { this.branches = branches; } @Override public int getMatchLength() { int fixed = branches.get(0).getMatchLength(); for (int i = 1; i < branches.size(); i++) { if (branches.get(i).getMatchLength() != fixed) { return -1; } } return fixed; } @Override public int getMinimumMatchLength() { int min = 
branches.get(0).getMinimumMatchLength(); for (int i = 1; i < branches.size(); i++) { int m = branches.get(i).getMinimumMatchLength(); if (m < min) { min = m; } } return min; } @Override public int matchesEmptyString() { int m = 0; for (Operation branch : branches) { int b = branch.matchesEmptyString(); if (b != MATCHES_ZLS_NEVER) { m |= b; } } return m; } @Override public boolean containsCapturingExpressions() { for (Operation o : branches) { if (o instanceof OpCapture || o.containsCapturingExpressions()) { return true; } } return false; } @Override public CharacterClass getInitialCharacterClass(boolean caseBlind) { CharacterClass result = EmptyCharacterClass.getInstance(); for (Operation o : branches) { result = RECompiler.makeUnion(result, o.getInitialCharacterClass(caseBlind)); } return result; } @Override public Operation optimize(REProgram program, REFlags flags) { for (int i=0; i branchIter = branches.iterator(); IntIterator currentIter = null; Operation currentOp = null; @Override public boolean hasNext() { while (true) { if (currentIter == null) { if (branchIter.hasNext()) { matcher.clearCapturedGroupsBeyond(position); currentOp = branchIter.next(); currentIter = currentOp.iterateMatches(matcher, position); } else { return false; } } if (currentIter.hasNext()) { return true; } else { currentIter = null; //continue; } } } @Override public int next() { return currentIter.next(); } }; } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.C64); fsb.append("(?:"); boolean first = true; for (Operation branch : branches) { if (first) { first = false; } else { fsb.cat('|'); } fsb.append(branch.display()); } fsb.append(")"); return fsb.toString(); } } /** * A sequence of multiple pieces */ public static class OpSequence 
extends Operation { private List operations; OpSequence(List operations) { this.operations = operations; } public List getOperations() { return operations; } @Override public int getMatchLength() { int len = 0; for (Operation o : operations) { int i = o.getMatchLength(); if (i == -1) { return -1; } len += i; } return len; } @Override public int getMinimumMatchLength() { int len = 0; for (Operation o : operations) { len += o.getMinimumMatchLength(); } return len; } @Override public int matchesEmptyString() { // The operation matches empty anywhere if every suboperation matches empty anywhere boolean matchesEmptyAnywhere = true; boolean matchesEmptyNowhere = false; for (Operation o : operations) { int m = o.matchesEmptyString(); if (m == MATCHES_ZLS_NEVER) { return MATCHES_ZLS_NEVER; } if (m != MATCHES_ZLS_ANYWHERE) { matchesEmptyAnywhere = false; break; } } if (matchesEmptyAnywhere) { return MATCHES_ZLS_ANYWHERE; } // The operation matches BOL if every suboperation matches BOL (which includes // the case of matching empty anywhere) boolean matchesBOL = true; for (Operation o : operations) { if ((o.matchesEmptyString() & MATCHES_ZLS_AT_START) == 0) { matchesBOL = false; break; } } if (matchesBOL) { return MATCHES_ZLS_AT_START; } // The operation matches EOL if every suboperation matches EOL (which includes // the case of matching empty anywhere) boolean matchesEOL = true; for (Operation o : operations) { if ((o.matchesEmptyString() & MATCHES_ZLS_AT_END) == 0) { matchesEOL = false; break; } } if (matchesEOL) { return MATCHES_ZLS_AT_END; } return 0; } @Override public boolean containsCapturingExpressions() { for (Operation o : operations) { if (o instanceof OpCapture || o.containsCapturingExpressions()) { return true; } } return false; } @Override public CharacterClass getInitialCharacterClass(boolean caseBlind) { CharacterClass result = EmptyCharacterClass.getInstance(); for (Operation o : operations) { result = RECompiler.makeUnion(result, 
o.getInitialCharacterClass(caseBlind)); if (o.matchesEmptyString() == MATCHES_ZLS_NEVER) { return result; } } return result; } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.C64); for (Operation op : operations) { fsb.append(op.display()); } return fsb.toString(); } @Override public Operation optimize(REProgram program, REFlags flags) { if (operations.size() == 0) { return new OpNothing(); } else if (operations.size() == 1) { return operations.get(0); } else { for (int i=0; i iterators = new Stack<>(); final REMatcher.State savedState = containsCapturingExpressions() ? matcher.captureState() : null; final int backtrackingLimit = matcher.getProgram().getBacktrackingLimit(); return new IntIterator() { private boolean primed = false; private int nextPos; /** * Advance the current iterator if possible, getting the first match for all subsequent * iterators in the sequence. If we get all the way to the end of the sequence, return the * position in the input string that we have reached. If we don't get all the way to the * end of the sequence, work backwards getting the next match for each term in the sequence * until we find a route through. * @return if we find a match for the whole sequence, return the position in the input string * at which the match ends. Otherwise return -1. 
*/ private int advance() { int counter = 0; while (!iterators.isEmpty()) { IntIterator top = iterators.peek(); while (top.hasNext()) { int p = top.next(); matcher.clearCapturedGroupsBeyond(p); int i = iterators.size(); if (i >= operations.size()) { return p; } top = operations.get(i).iterateMatches(matcher, p); iterators.push(top); } iterators.pop(); if (backtrackingLimit >=0 && counter++ > backtrackingLimit) { throw new UncheckedXPathException(new XPathException( "Regex backtracking limit exceeded processing " + matcher.operation.display() + ". Simplify the regular expression, " + "or set Feature.REGEX_BACKTRACKING_LIMIT to -1 to remove this limit." )); } } if (savedState != null) { matcher.resetState(savedState); } //matcher.clearCapturedGroupsBeyond(position); return -1; } @Override public boolean hasNext() { if (!primed) { iterators.push(operations.get(0).iterateMatches(matcher, position)); primed = true; } nextPos = advance(); return nextPos >= 0; } @Override public int next() { return nextPos; } }; } } /** * A match of a single character in the input against a set of permitted characters */ public static class OpCharClass extends Operation { private IntPredicate predicate; OpCharClass(IntPredicate predicate) { this.predicate = predicate; } public IntPredicate getPredicate() { return predicate; } @Override public int getMatchLength() { return 1; } @Override public int matchesEmptyString() { return MATCHES_ZLS_NEVER; } @Override public CharacterClass getInitialCharacterClass(boolean caseBlind) { if (predicate instanceof CharacterClass) { return (CharacterClass)predicate; } else { return super.getInitialCharacterClass(caseBlind); } } @Override public IntIterator iterateMatches(REMatcher matcher, int position) { //System.err.println("CharClass " + predicate + " matching at position " + position); UnicodeString in = matcher.search; if (position < in.uLength() && predicate.test(in.uCharAt(position))) { return new IntSingletonIterator(position + 1); } else { return 
EmptyIntIterator.getInstance(); } } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { if (predicate instanceof IntSetPredicate) { IntSet s = ((IntSetPredicate)predicate).getIntSet(); if (s instanceof IntSingletonSet) { return "" + (char)((IntSingletonSet)s).getMember(); } else if (s instanceof IntRangeSet) { FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.C64); IntRangeSet irs = (IntRangeSet)s; fsb.append("["); for (int i=0; i 0) { set = new IntHashSet(variants.length); set.add(ch); for (int v : variants) { set.add(v); } return new IntSetCharacterClass(set); } } return new SingletonCharacterClass(atom.uCharAt(0)); } @Override public IntIterator iterateMatches(REMatcher matcher, int position) { UnicodeString in = matcher.search; if (position + len > in.uLength()) { return EmptyIntIterator.getInstance(); } if (matcher.program.flags.isCaseIndependent()) { for (int i = 0; i < len; i++) { if (!matcher.equalCaseBlind(in.uCharAt(position + i), atom.uCharAt(i))) { return EmptyIntIterator.getInstance(); } } } else { for (int i = 0; i < len; i++) { if (in.uCharAt(position + i) != atom.uCharAt(i)) { return EmptyIntIterator.getInstance(); } } } return new IntSingletonIterator(position + len); } @Override public String display() { return atom.toString(); } } /** * Handle a greedy repetition (with possible min and max) where the * size of the repeated unit is fixed. */ public static class OpGreedyFixed extends OpRepeat { private int len; OpGreedyFixed(Operation op, int min, int max, int len) { super(op, min, max, true); this.len = len; } @Override public int getMatchLength() { return min == max ? 
min * len : -1; } @Override public int matchesEmptyString() { if (min == 0) { return MATCHES_ZLS_ANYWHERE; } return op.matchesEmptyString(); } @Override public Operation optimize(REProgram program, REFlags flags) { if (max == 0) { return new OpNothing(); } if (op.getMatchLength() == 0) { return op; } op = op.optimize(program, flags); return this; } @Override public IntIterator iterateMatches(REMatcher matcher, int position) { int guard = matcher.search.uLength(); if (max < Integer.MAX_VALUE) { guard = java.lang.Math.min(guard, position + len * max); } if (position >= guard && min > 0) { return EmptyIntIterator.getInstance(); } int p = position; int matches = 0; while (p <= guard) { IntIterator it = op.iterateMatches(matcher, p); boolean matched = false; if (it.hasNext()) { matched = true; it.next(); } if (matched) { matches++; p += len; if (matches == max) { break; } } else { break; } } if (matches < min) { return EmptyIntIterator.getInstance(); } return new IntStepIterator(p, -len, position + len * min); } } /** * Handle a repetition where there is no ambiguity; if the repeated * term is matched in the string, then it cannot match anything other than * the repeated term. It is also used when the number of occurrences is * fixed. In this situation there will never be any need for * backtracking, so there is no need to keep any information to support * backtracking, and in addition, there is no distinction between greedy * and reluctant matching. This operation is used only for a repeated * atom or CharClass, which also means that if the repeated term matches * then it can only match in one way; a typical example is the term "A*" * in the regex "A*B". 
*/ public static class OpUnambiguousRepeat extends OpRepeat { OpUnambiguousRepeat(Operation op, int min, int max) { super(op, min, max, true); } @Override public int matchesEmptyString() { if (min == 0) { return MATCHES_ZLS_ANYWHERE; } return op.matchesEmptyString(); } @Override public int getMatchLength() { if (op.getMatchLength() != -1 && min == max) { return op.getMatchLength()*min; } else { return -1; } } @Override public Operation optimize(REProgram program, REFlags flags) { op = op.optimize(program, flags); return this; } @Override public IntIterator iterateMatches(REMatcher matcher, int position) { int guard = matcher.search.uLength(); int p = position; int matches = 0; while (matches < max && p <= guard) { IntIterator it = op.iterateMatches(matcher, p); if (it.hasNext()) { matches++; p = it.next(); } else { break; } } if (matches < min) { return EmptyIntIterator.getInstance(); } else { return new IntSingletonIterator(p); } } } /** * Handle a repetition (with possible min and max) where the * size of the repeated unit is variable. */ public static class OpRepeat extends Operation { protected Operation op; protected int min; protected int max; boolean greedy; OpRepeat(Operation op, int min, int max, boolean greedy) { this.op = op; this.min = min; this.max = max; this.greedy = greedy; } /** * Get the operation being repeated * @return the child operation */ Operation getRepeatedOperation() { return op; } @Override public int matchesEmptyString() { if (min == 0) { return MATCHES_ZLS_ANYWHERE; } return op.matchesEmptyString(); } @Override public boolean containsCapturingExpressions() { return op instanceof OpCapture || op.containsCapturingExpressions(); } @Override public CharacterClass getInitialCharacterClass(boolean caseBlind) { return op.getInitialCharacterClass(caseBlind); } @Override public int getMatchLength() { return min == max && op.getMatchLength() >= 0 ? 
min * op.getMatchLength() : -1; } @Override public int getMinimumMatchLength() { return min * op.getMinimumMatchLength(); } @Override public Operation optimize(REProgram program, REFlags flags) { op = op.optimize(program, flags); if (min == 0 && op.matchesEmptyString() == MATCHES_ZLS_ANYWHERE) { // turns (a?)* into (a?)+ min = 1; } return this; } @Override public IntIterator iterateMatches(final REMatcher matcher, int position) { final Stack iterators = new Stack<>(); final Stack positions = new Stack<>(); final int bound = Math.min(max, matcher.search.uLength() - position + 1); int p = position; if (greedy) { // Prime the arrays first with iterators up to the maximum length, stopping if there is no match if (min == 0 && !matcher.history.isDuplicateZeroLengthMatch(this, position)) { // add a match at the current position if zero occurrences are allowed iterators.push(new IntSingletonIterator(position)); positions.push(p); } for (int i=0; i= min) { return !iterators.isEmpty(); } else if (iterators.isEmpty()) { return false; } else { do { advance(); } while (iterators.size() < min && !iterators.isEmpty()); return !iterators.isEmpty(); } } @Override public int next() { primed = false; return positions.peek(); } }; return new ForceProgressIterator(base); } else { // reluctant (non-greedy) repeat. 
// rewritten for bug 3902 IntIterator iter = new IntIterator() { private int pos = position; private int counter = 0; private void advance() { IntIterator it = op.iterateMatches(matcher, pos); if (it.hasNext()) { pos = it.next(); if (++counter > max) { pos = -1; } } else if (min == 0 && counter == 0) { counter++; } else { pos = -1; } } @Override public boolean hasNext() { do { advance(); } while (counter < min && pos >= 0); return pos >= 0; } @Override public int next() { return pos; } }; return new ForceProgressIterator(iter); } } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { String quantifier; if (min==0 && max==Integer.MAX_VALUE) { quantifier = "*"; } else if (min==1 && max==Integer.MAX_VALUE) { quantifier = "+"; } else if (min==0 && max==1) { quantifier = "?"; } else { quantifier = "{" + min + "," + max + "}"; } if (!greedy) { quantifier += "?"; } return op.display() + quantifier; } } /** * Handle a reluctant repetition (with possible min and max) where the * size of the repeated unit is fixed. */ public static class OpReluctantFixed extends OpRepeat { private int len; OpReluctantFixed(Operation op, int min, int max, int len) { super(op, min, max, false); this.len = len; } @Override public int getMatchLength() { return min == max ? 
min * len : -1; } @Override public int matchesEmptyString() { if (min == 0) { return MATCHES_ZLS_ANYWHERE; } return op.matchesEmptyString(); } @Override public Operation optimize(REProgram program, REFlags flags) { op = op.optimize(program, flags); return this; } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { return new IntIterator() { private int pos = position; private int count = 0; private boolean started = false; @Override public boolean hasNext() { if (!started) { started = true; while (count < min) { IntIterator child = op.iterateMatches(matcher, pos); if (child.hasNext()) { pos = child.next(); count++; } else { return false; } } return true; } if (count < max) { matcher.clearCapturedGroupsBeyond(pos); IntIterator child = op.iterateMatches(matcher, pos); if (child.hasNext()) { pos = child.next(); count++; return true; } } return false; } @Override public int next() { return pos; } }; } } /** * End of program */ public static class OpEndProgram extends Operation { @Override public int getMatchLength() { return 0; } @Override public int matchesEmptyString() { return MATCHES_ZLS_ANYWHERE; } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { // An anchored match is successful only if we are at the end of the string. 
// Otherwise, match has succeeded unconditionally if (matcher.anchoredMatch) { if (matcher.search.isEnd(position)) { return new IntSingletonIterator(position); } else { return EmptyIntIterator.getInstance(); } } else { matcher.setParenEnd(0, position); return new IntSingletonIterator(position); } } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "\\Z"; } } /** * Beginning of Line (^) */ public static class OpBOL extends Operation { @Override public int getMatchLength() { return 0; } @Override public int matchesEmptyString() { return MATCHES_ZLS_AT_START; } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { // Fail if we're not at the start of the string if (position != 0) { // If we're multiline matching, we could still be at the start of a line if (matcher.program.flags.isMultiLine()) { // Continue if at the start of a line if (matcher.isNewline(position - 1) && !matcher.search.isEnd(position)) { return new IntSingletonIterator(position); } } return EmptyIntIterator.getInstance(); } return new IntSingletonIterator(position); } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "^"; } } /** * End of Line ($) */ public static class OpEOL extends Operation { @Override public int getMatchLength() { return 0; } @Override public int matchesEmptyString() { return MATCHES_ZLS_AT_END; } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { // If we're not at the end of string UnicodeString search = matcher.search; if (matcher.program.flags.isMultiLine()) { if (search.isEnd(0) || search.isEnd(position) || 
matcher.isNewline(position)) { return new IntSingletonIterator(position); //match successful } else { return EmptyIntIterator.getInstance(); } } else { // In spec bug 16809 we decided that '$' does not match a trailing newline when not in multiline mode if (search.isEnd(0) || search.isEnd(position)) { return new IntSingletonIterator(position); } else { return EmptyIntIterator.getInstance(); } } } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "$"; } } /** * Open paren (captured group) */ public static class OpCapture extends Operation { int groupNr; Operation childOp; OpCapture(Operation childOp, int group) { this.childOp = childOp; this.groupNr = group; } @Override public int getMatchLength() { return childOp.getMatchLength(); } @Override public int getMinimumMatchLength() { return childOp.getMinimumMatchLength(); } @Override public int matchesEmptyString() { return childOp.matchesEmptyString(); } @Override public Operation optimize(REProgram program, REFlags flags) { childOp = childOp.optimize(program, flags); return this; } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { if ((matcher.program.optimizationFlags & REProgram.OPT_HASBACKREFS) != 0) { matcher.startBackref[groupNr] = position; } final IntIterator base = childOp.iterateMatches(matcher, position); return new IntIterator() { @Override public boolean hasNext() { return base.hasNext(); } @Override public int next() { int next = base.next(); // Increase valid paren count if (groupNr >= matcher.captureState.parenCount) { matcher.captureState.parenCount = groupNr + 1; } // Don't set paren if already set later on //if (matcher.getParenStart(groupNr) == -1) { matcher.setParenStart(groupNr, position); matcher.setParenEnd(groupNr, next); //} if 
((matcher.program.optimizationFlags & REProgram.OPT_HASBACKREFS) != 0) { matcher.startBackref[groupNr] = position; matcher.endBackref[groupNr] = next; } return next; } }; } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "(" + childOp.display() + ")"; } } /** * Back-reference */ public static class OpBackReference extends Operation { int groupNr; OpBackReference(int groupNr) { this.groupNr = groupNr; } /** * Ask whether the regular expression is known, after static analysis, to match a * zero-length string * * @return false. Returning true means that * the expression is known statically to match ""; returning false means that this cannot * be determined statically; it does not mean that the expression does not match "". * We cannot do the analysis statically where back-references are involved, so we return false. */ @Override public int matchesEmptyString() { return 0; // no information available } @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { // Get the start and end of the backref int s = matcher.startBackref[groupNr]; int e = matcher.endBackref[groupNr]; // We don't know the backref yet if (s == -1 || e == -1) { return EmptyIntIterator.getInstance(); } // The backref is empty size if (s == e) { return new IntSingletonIterator(position); } // Get the length of the backref int l = e - s; // If there's not enough input left, give up. UnicodeString search = matcher.search; if (search.isEnd(position + l - 1)) { return EmptyIntIterator.getInstance(); } // Case fold the backref? 
if (matcher.program.flags.isCaseIndependent()) { // Compare backref to input for (int i = 0; i < l; i++) { if (!matcher.equalCaseBlind(search.uCharAt(position + i), search.uCharAt(s + i))) { return EmptyIntIterator.getInstance(); } } } else { // Compare backref to input for (int i = 0; i < l; i++) { if (search.uCharAt(position + i) != search.uCharAt(s + i)) { return EmptyIntIterator.getInstance(); } } } return new IntSingletonIterator(position + l); } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "\\" + groupNr; } } /** * Match empty string */ public static class OpNothing extends Operation { @Override public IntIterator iterateMatches(final REMatcher matcher, final int position) { return new IntSingletonIterator(position); } @Override public int matchesEmptyString() { return MATCHES_ZLS_ANYWHERE; } @Override public int getMatchLength() { return 0; } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return "()"; } } /** * Operation that wraps a base operation and traces its execution */ public static class OpTrace extends Operation { private Operation base; private static int counter = 0; OpTrace(Operation base) { this.base = base; } @Override public IntIterator iterateMatches(REMatcher matcher, int position) { final IntIterator baseIter = base.iterateMatches(matcher, position); final int iterNr = counter++; String clName = baseIter.getClass().getName(); int lastDot = clName.lastIndexOf("."); String iterName = clName.substring(lastDot + 1); System.err.println("Iterating over " + base.getClass().getSimpleName() + " " + base.display() + " at position " + position + " returning " + 
iterName + " " + iterNr); return new IntIterator() { @Override public boolean hasNext() { boolean b = baseIter.hasNext(); System.err.println("IntIterator " + iterNr + " hasNext() = " + b); return b; } @Override public int next() { int n = baseIter.next(); System.err.println("IntIterator " + iterNr + " next() = " + n); return n; } }; } @Override public int getMatchLength() { return base.getMatchLength(); } @Override public int matchesEmptyString() { return base.matchesEmptyString(); } @Override public Operation optimize(REProgram program, REFlags flags) { base = base.optimize(program, flags); return this; } /** * Display the operation as a regular expression, possibly in abbreviated form * * @return the operation in a form that is recognizable as a regular expression or abbreviated * regular expression */ @Override public String display() { return base.display(); } } /** * The ForceProgressIterator is used to protect against non-termination; specifically, * iterators that return an infinite number of zero-length matches. After getting a * certain number of zero-length matches at the same position, hasNext() returns false. * (Potentially this gives problems with an expression such as (a?|b?|c?|d) that can * legitimately return more than one zero-length match). */ private static class ForceProgressIterator implements IntIterator { private IntIterator base; int countZeroLength = 0; int currentPos = -1; ForceProgressIterator(IntIterator base) { this.base = base; } @Override public boolean hasNext() { return countZeroLength <= 3 && base.hasNext(); } @Override public int next() { int p = base.next(); if (p == currentPos) { countZeroLength++; } else { countZeroLength = 0; currentPos = p; } return p; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy