All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.regex.RECompiler Maven / Gradle / Ivy

There is a newer version: 10.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2013 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Originally part of Apache's Jakarta project (downloaded January 2012),
 * this file has been extensively modified for integration into Saxon by
 * Michael Kay, Saxonica.
 */

package net.sf.saxon.regex;

import net.sf.saxon.tree.util.FastStringBuffer;
import net.sf.saxon.value.Whitespace;
import net.sf.saxon.z.*;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A regular expression compiler class.  This class compiles a pattern string into a
 * regular expression program interpretable by the RE evaluator class.  The 'recompile'
 * command line tool uses this compiler to pre-compile regular expressions for use
 * with RE.  For a description of the syntax accepted by RECompiler and what you can
 * do with regular expressions, see the documentation for the RE matcher class.
 *
 * @author Jonathan Locke
 * @author Michael McCallum
 * @version $Id: RECompiler.java 518156 2007-03-14 14:31:26Z vgritsenko $
 * @see net.sf.saxon.regex.REMatcher
 */

/*
 * Changes made for Saxon:
 *
 * - handle full Unicode repertoire (esp non-BMP characters) using UnicodeString class for
 *   both the source string and the regular expression
 * - added support for subtraction in a character class
 * - in a character range, changed the condition start < end to start <= end
 * - removed support for [:POSIX:] construct
 * - added support for \p{} and \P{} classes
 * - removed support for unsupported escapes: f, x, u, b, octal characters; added i and c
 * - changed the handling of hyphens within square brackets, and ^ appearing other than at the start
 * - changed the data structure used for the executable so that terms that match a character class
 *   now reference an IntPredicate that tests for membership of the character in a set
 * - added support for reluctant {n,m}? quantifiers
 * - allow a quantifier on a nullable expression
 * - allow a quantifier on '$' or '^'
 * - some constructs (back-references, non-capturing groups, etc) are conditional on which XPath/XSD version
 *   is in use
 * - regular expression flags are now fixed at the time the RE is compiled, this can no longer be deferred
 *   until the RE is evaluated
 * - split() function includes a zero-length string at the end of the returned sequence if the last
 *   separator is at the end of the string
 * - added support for the 'q' and 'x' flags; improved support for the 'i' flag
 * - added a method to determine whether there is an anchored match (for XSD use)
 * - tests for newline (e.g in multiline mode) now match \n only, as required by the XPath specification
 * - reorganised the executable program to use Operation objects rather than integer opcodes
 * - introduced optimization for non-backtracking + and * operators (with simple operands)
 */
public class RECompiler {
    // The compiled program: the Operation nodes emitted so far, addressed by list index.
    // The element type is declared explicitly because the compiler reads Operation
    // members (e.g. the 'next' offset) straight from instructions.get(i).
    ArrayList<Operation> instructions = new ArrayList<Operation>(20);

    // Input state for compiling regular expression
    UnicodeString pattern;                              // Input string
    int len;                                            // Length of the pattern string
    int idx;                                            // Current input index into pattern
    int parens;                                         // Total number of paren pairs

    // Node flags
    static final int NODE_NORMAL = 0;                   // No flags (nothing special)
    static final int NODE_NULLABLE = 1;                 // True if node is potentially null
    static final int NODE_TOPLEVEL = 2;                 // True if top level expr

    // {m,n} stacks
    static final int bracketUnbounded = -1;             // Unbounded value
    int bracketMin;                                     // Minimum number of matches
    int bracketOpt;                                     // Additional optional matches

    // Language-level switches; overwritten from the REFlags object in setFlags()
    boolean isXPath = true;
    boolean isXPath30 = true;
    boolean isXSD11 = false;

    // Group numbers that escape() accepts as back-reference targets
    IntHashSet captures = new IntHashSet();

    // Flags supplied through setFlags(); remains null until that method is called
    REFlags reFlags;

    // Warning messages collected during compilation; allocated lazily by warning()
    List<String> warnings;

    /**
     * Constructor. Performs no work of its own: the (initially empty) instruction list
     * and the set of captured groups are created by the field initializers of this class.
     * Flags are supplied separately through {@link #setFlags(REFlags)}.
     */
    public RECompiler() {

    }

    /**
     * Supply the flags that govern this compilation. The flags object is retained,
     * and the three language-level switches (XPath 2.0 extensions, XPath 3.0 extensions,
     * XSD 1.1 syntax) are copied out of it into fields of this compiler.
     *
     * @param flags the regular expression flags
     */

    public void setFlags(REFlags flags) {
        // Keep the flags object itself first, then cache the individual switches
        this.reFlags = flags;
        this.isXPath = flags.isAllowsXPath20Extensions();
        this.isXPath30 = flags.isAllowsXPath30Extensions();
        this.isXSD11 = flags.isAllowsXSD11Syntax();
    }


    /**
     * Insert an operation into the program at a given position. Operations already at
     * or after that position move up by one index (standard List.add(index, element)
     * behaviour).
     *
     * @param node     the operation to be inserted
     * @param insertAt the index at which the operation is to be placed
     */
    private void insertNode(Operation node, int insertAt) {
        this.instructions.add(insertAt, node);
    }

    /**
     * Record a warning message. The warnings list is only allocated when the first
     * warning is reported.
     *
     * @param s the text of the warning
     */
    private void warning(String s) {
        if (this.warnings == null) {
            // First warning of this compilation: create the list on demand
            this.warnings = new ArrayList<String>(4);
        }
        this.warnings.add(s);
    }

    /**
     * On completion of compilation, get any warnings that were generated.
     *
     * <p>If no warning was reported, an immutable empty list is returned. Otherwise the
     * compiler's own list is returned directly (not a copy).</p>
     *
     * @return the list of warning messages, never null
     */

    public List<String> getWarnings() {
        if (warnings == null) {
            return Collections.emptyList();
        }
        return warnings;
    }

    /**
     * Appends a node to the end of a node chain.
     *
     * <p>Each instruction holds a <code>next</code> value that is used here as a relative
     * offset: the successor of the instruction at index <code>i</code> is at index
     * <code>i + next</code>, and a value of 0 marks the last node of the chain. This method
     * follows the chain starting at <code>node</code> and then stores, in the last node
     * reached, the offset that leads to <code>pointTo</code>.</p>
     *
     * @param node    Start of node chain to traverse
     * @param pointTo Node to have the tail of the chain point to
     */
    void setNextOfEnd(int node, int pointTo) {
        //System.err.println("NEW nextOfEnd " + node + " " + pointTo);
        // Traverse the chain until the next offset is 0
        int next = instructions.get(node).next;
        // while the 'node' is not the last in the chain
        // and the 'node' is not the last in the program.
        while (next != 0 && node < instructions.size()) {
            // if the node we are supposed to point to is in the chain then
            // point to the end of the program instead.
            // Michael McCallum 
            // FIXME: This is a _hack_ to stop infinite programs.
            // I believe that the implementation of the reluctant matches is wrong but
            // have not worked out a better way yet.
            if (node == pointTo) {
                pointTo = instructions.size();
            }
            // Step to the successor: 'next' is relative to the current index
            node += next;
            next = instructions.get(node).next;
        }

        // If we have reached the end of the program then don't set the pointTo.
        // I'm not sure if this will break anything, but it passes all the tests.
        if (node < instructions.size()) {
            // Convert the absolute target index into an offset relative to the tail node
            int offset = pointTo - node;

            // Point the last node in the chain to pointTo.
            instructions.get(node).next = offset;
        }
    }

    /**
     * Signal a condition that indicates a defect in the compiler itself rather than
     * in the regular expression being compiled. This method never returns normally.
     *
     * @throws Error always, with the message "Internal error!"
     */
    void internalError() throws Error {
        final Error failure = new Error("Internal error!");
        throw failure;
    }

    /**
     * Report a syntax error in the regular expression. The exception is constructed
     * with the message and the current value of the input index. This method never
     * returns normally.
     *
     * @param s the error message
     * @throws net.sf.saxon.regex.RESyntaxException always
     */
    void syntaxError(String s) throws RESyntaxException {
        final int errorOffset = idx;
        throw new RESyntaxException(s, errorOffset);
    }

    /**
     * Match a bracket {m}, {m,} or {m,n} expression, leaving the results in the
     * bracket member variables: <code>bracketMin</code> receives m, and
     * <code>bracketOpt</code> receives the number of additional optional matches
     * (0 for {m}, <code>bracketUnbounded</code> for {m,}, n-m for {m,n}).
     * On return <code>idx</code> is positioned just after the closing brace.
     *
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    void bracket() throws RESyntaxException {
        // Current character must be a '{'
        if (idx >= len || pattern.charAt(idx++) != '{') {
            internalError();
        }

        // Get min ('m' of {m,n}) number
        bracketMin = readBracketNumber();

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // If end of expr, optional limit is 0
        if (pattern.charAt(idx) == '}') {
            idx++;
            bracketOpt = 0;
            return;
        }

        // Must have at least {m,} and maybe {m,n}.
        // (idx < len is already guaranteed by the check above.)
        if (pattern.charAt(idx++) != ',') {
            syntaxError("Expected comma");
        }

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // If {m,} max is unlimited
        if (pattern.charAt(idx) == '}') {
            idx++;
            bracketOpt = bracketUnbounded;
            return;
        }

        // Get max number; what is stored is the count of optional matches beyond the minimum
        bracketOpt = readBracketNumber() - bracketMin;

        // Optional repetitions must be >= 0
        if (bracketOpt < 0) {
            syntaxError("Bad range");
        }

        // Must have close brace
        if (idx >= len || pattern.charAt(idx++) != '}') {
            syntaxError("Missing close brace");
        }
    }

    /**
     * Read a run of one or more ASCII digits starting at the current input index,
     * advancing the index past them.
     *
     * @return the value of the digits as a decimal integer
     * @throws net.sf.saxon.regex.RESyntaxException if the current character is not a digit,
     * or if the digits do not form a value that fits in an int
     */
    private int readBracketNumber() throws RESyntaxException {
        // Next char must be a digit
        if (idx >= len || !isAsciiDigit(pattern.charAt(idx))) {
            syntaxError("Expected digit");
        }

        // A local, single-threaded buffer: StringBuilder avoids StringBuffer's locking
        StringBuilder digits = new StringBuilder();
        while (idx < len && isAsciiDigit(pattern.charAt(idx))) {
            digits.append((char) pattern.charAt(idx++));
        }
        try {
            return Integer.parseInt(digits.toString());
        } catch (NumberFormatException e) {
            // Only possible when the digit run overflows an int
            syntaxError("Expected valid number");
            return 0; // not reached: syntaxError always throws
        }
    }

    /**
     * Determine whether a code point is one of the ten ASCII decimal digits.
     * Digits from other scripts are deliberately not recognised.
     *
     * @param ch the character to be matched
     * @return true if the character is an ASCII digit (0-9)
     */

    private static boolean isAsciiDigit(int ch) {
        final boolean outsideDigitRange = ch < '0' || ch > '9';
        return !outsideDigitRange;
    }

    /**
     * Match an escape sequence starting at the current input index, which must be
     * positioned on a backslash. On successful return the input index has been advanced
     * past the whole escape (including the <code>{...}</code> part of a \p or \P escape
     * and any further digits of a multi-digit back-reference).
     *
     * <p>Octal escapes are rejected. A digit escape is treated as a back-reference,
     * which is only accepted outside square brackets, when XPath extensions are enabled,
     * and when the referenced group is in the set of recorded captures.</p>
     *
     * @param inSquareBrackets true if the escape appears within a character class
     * expression; back-references are rejected in that context
     * @return an IntPredicate that matches the character or characters represented
     * by this escape sequence. For a single-character escape this must be an IntValuePredicate;
     * for a back-reference it is a {@link BackReference}
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    IntPredicate escape(boolean inSquareBrackets) throws RESyntaxException {
        // "Shouldn't" happen
        if (pattern.charAt(idx) != '\\') {
            internalError();
        }

        // Escape shouldn't occur as last character in string!
        if (idx + 1 == len) {
            syntaxError("Escape terminates string");
        }

        // Switch on character after backslash
        idx += 2;
        int escapeChar = pattern.charAt(idx - 1);
        switch (escapeChar) {

            case 'n':
                return new IntValuePredicate('\n');
            case 'r':
                return new IntValuePredicate('\r');
            case 't':
                return new IntValuePredicate('\t');

            case '\\':
            case '|':
            case '.':
            case '-':
            case '^':
            case '?':
            case '*':
            case '+':
            case '{':
            case '}':
            case '(':
            case ')':
            case '[':
            case ']':
                return new IntValuePredicate(escapeChar);

            case '$':
                if (isXPath) {
                    return new IntValuePredicate(escapeChar);
                } else {
                    syntaxError("In XSD, '$' must not be escaped");
                }
                // no real fall-through: syntaxError always throws

            case 's':
                return Categories.ESCAPE_s;

            case 'S':
                return Categories.ESCAPE_S;

            case 'i':
                return Categories.ESCAPE_i;

            case 'I':
                return Categories.ESCAPE_I;

            case 'c':
                return Categories.ESCAPE_c;

            case 'C':
                return Categories.ESCAPE_C;

            case 'd':
                return Categories.ESCAPE_d;

            case 'D':
                return Categories.ESCAPE_D;

            case 'w':
                return Categories.ESCAPE_w;

            case 'W':
                return Categories.ESCAPE_W;


            case 'p':
            case 'P':

                // escapeChar is an int; it must be cast to char in the messages below,
                // otherwise string concatenation renders it as a decimal number.
                if (idx == len || pattern.charAt(idx) != '{') {
                    syntaxError("Expected '{' after \\" + (char) escapeChar);
                }
                // Search for the closing brace from the '{', then step past the '{'
                int close = pattern.indexOf('}', idx++);
                if (close == -1) {
                    syntaxError("No closing '}' after \\" + (char) escapeChar);
                }
                UnicodeString block = pattern.substring(idx, close);
                if (block.length() == 1 || block.length() == 2) {
                    // One or two characters: a category name such as L or Lu
                    IntPredicate primary = Categories.getCategory(block.toString());
                    if (primary == null) {
                        syntaxError("Unknown character category " + block.toString());
                    }
                    idx = close+1;
                    if (escapeChar == 'p') {
                        return primary;
                    } else {
                        return makeComplement(primary);
                    }
                } else if (block.toString().startsWith("Is")) {
                    // IsXxxx: a Unicode block name
                    String blockName = block.toString().substring(2);
                    IntSet uniBlock = UnicodeBlocks.getBlock(blockName);
                    if (uniBlock == null) {
                        // XSD 1.1 says this is not an error, but by default we reject it
                        if (reFlags.isAllowUnknownBlockNames()) {
                            warning("Unknown Unicode block: " + blockName);
                            idx = close+1;
                            return new IntSetPredicate(IntUniversalSet.getInstance());
                        } else {
                            syntaxError("Unknown Unicode block: " + blockName);
                        }
                    }
                    idx = close+1;
                    IntPredicate primary = new IntSetPredicate(uniBlock);
                    if (escapeChar == 'p') {
                        return primary;
                    } else {
                        return makeComplement(primary);
                    }
                } else {
                    syntaxError("Unknown character category: " + block);
                }
                // no real fall-through: every path above returns or throws

            case '0':
                syntaxError("Octal escapes not allowed");
                // no real fall-through: syntaxError always throws

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':

                if (inSquareBrackets) {
                    syntaxError("Backreference not allowed within character class");
                } else if (isXPath) {
                    int backRef = (escapeChar - '0');
                    // Absorb further digits only while the resulting number still
                    // designates an existing group
                    while (idx < len) {
                        int c1 = "0123456789".indexOf(pattern.charAt(idx));
                        if (c1 < 0) {
                            break;
                        } else {
                            int backRef2 = backRef * 10 + c1;
                            if (backRef2 > (parens - 1)) {
                                break;
                            } else {
                                backRef = backRef2;
                                idx++;
                            }
                        }

                    }
                    if (!captures.contains(backRef)) {
                        String explanation = (backRef > (parens - 1) ? "(no such group)" : "(group not yet closed)");
                        syntaxError("invalid backreference \\" + backRef + " " + explanation);
                    }
                    return new BackReference(backRef);
                } else {
                    syntaxError("digit not allowed after \\");
                }
                // no real fall-through: every path above returns or throws

            default:

                // Other characters not allowed in XSD regexes
                syntaxError("Escape character '" + (char)escapeChar + "' not allowed");
        }
        // Not reached in practice; present only to satisfy the compiler
        return null;
    }

    /**
     * Representation of a back-reference as returned by escape(). It is modelled as an
     * IntPredicate purely for convenience (this is a fiction); the value passed to the
     * superclass constructor is the group number, not a character.
     */

    class BackReference extends IntValuePredicate {
        public BackReference(int groupNumber) {
            super(groupNumber);
        }
    }


    /**
     * Compile a character class (in square brackets)
     *
     * @return an IntPredicate that tests whether a character matches this character class
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    IntPredicate parseCharacterClass() throws RESyntaxException {
        // Check for bad calling or empty class
        if (pattern.charAt(idx) != '[') {
            internalError();
        }

        // Check for unterminated or empty class
        if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
            syntaxError("Missing ']'");
        }

        // Parse class declaration
        int simpleChar;
        boolean positive = true;
        boolean definingRange = false;
        int rangeStart = -1;
        int rangeEnd;
        IntRangeSet range = new IntRangeSet();
        IntPredicate addend = null;
        IntPredicate subtrahend = null;
        if (thereFollows("^")) {
            if (thereFollows("^-[")) {
                syntaxError("Nothing before subtraction operator");
            } else if (thereFollows("^]")) {
                syntaxError("Empty negative character group");
            } else {
                positive = false;
                idx++;
            }
        } else if (thereFollows("-[")) {
            syntaxError("Nothing before subtraction operator");
        }
        while (idx < len && pattern.charAt(idx) != ']') {
            int ch = pattern.charAt(idx);
            simpleChar = -1;
            switch (ch) {
                case '[':
                    syntaxError("Unescaped '[' within square brackets");
                    break;
                case '\\': {
                    // Escape always advances the stream
                    IntPredicate cc = escape(true);
                    if (cc instanceof IntValuePredicate) {
                        simpleChar = ((IntValuePredicate) cc).getTarget();
                        break;
                    } else {
                        if (definingRange) {
                            syntaxError("Multi-character escape cannot follow '-'");
                        } else if (addend == null) {
                            addend = cc;
                        } else {
                            addend = makeUnion(addend, cc);
                        }
                        continue;
                    }
                }
                case '-':
                    if (thereFollows("-[")) {
                        idx++;
                        subtrahend = parseCharacterClass();
                        if (!thereFollows("]")) {
                            syntaxError("Expected closing ']' after subtraction");
                        }
                    } else if (thereFollows("-]")) {
                        simpleChar = '-';
                        idx++;
                    } else if (rangeStart >= 0) {
                        definingRange = true;
                        idx++;
                        continue;
                    } else if (definingRange) {
                        syntaxError("Bad range");
                    } else if (thereFollows("--") && !thereFollows("--[")) {
                        syntaxError("Unescaped hyphen as start of range");
                    } else if (!isXSD11 && pattern.charAt(idx-1) != '[' && pattern.charAt(idx-1) != '^' && !thereFollows("]") && !thereFollows("-[")) {
                        syntaxError("In XSD 1.0, hyphen is allowed only at the beginning or end of a positive character group");
                    } else {
                        simpleChar = '-';
                        idx++;
                    }
                    break;

                default:
                    simpleChar = ch;
                    idx++;
                    break;
            }

            // Handle simple character simpleChar
            if (definingRange) {
                // if we are defining a range make it now
                rangeEnd = simpleChar;

                // Actually create a range if the range is ok
                if (rangeStart > rangeEnd) {
                    syntaxError("Bad character range: start > end");
                    // TODO: not an error in XSD, merely a no-op?
                }
                range.addRange(rangeStart, rangeEnd);
                if (reFlags.isCaseIndependent()) {
                    // Special-case A-Z and a-z
                    if (rangeStart == 'a' && rangeEnd == 'z') {
                        range.addRange('A', 'Z');
                        for (int v=0; v= len) {
            return ret;
        }

        boolean greedy = true;
        int quantifierType = pattern.charAt(idx);
        switch (quantifierType) {
            case '?':
            case '*':

                // The current node can be null
                flags[0] |= NODE_NULLABLE;

                // Drop through

            case '+':

                // Eat quantifier character
                idx++;

                // Drop through

            case '{':

                if (quantifierType == '{') {
                    bracket();
                    if (bracketMin == 0) {
                        flags[0] |= NODE_NULLABLE;
                    }
                }



                Operation op = instructions.get(ret);
                if (op instanceof Operation.OpBOL || op instanceof Operation.OpEOL) {
                    // Pretty meaningless, but legal. If the quantifier allows zero occurrences, ignore the instruction.
                    // Otherwise, ignore the quantifier
                    if (quantifierType == '?' || quantifierType == '*' ||
                            (quantifierType == '{' && bracketMin == 0)) {
                        instructions.set(ret, new Operation.OpNothing());
                    } else {
                        quantifierType = 0;
                    }
                }
                if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
                    if (quantifierType == '?') {
                        // can ignore the quantifier
                        quantifierType = 0;
                    } else if (quantifierType == '+') {
                        // '*' and '+' are equivalent
                        quantifierType = '*';
                    } else if (quantifierType == '{') {
                        // bounds are meaningless
                        quantifierType = '*';
                    }
                }

        }

        // If the next character is a '?', make the quantifier non-greedy (reluctant)
        if (idx < len && pattern.charAt(idx) == '?') {
            if (!isXPath) {
                syntaxError("Reluctant quantifiers are not allowed in XSD");
            }
            idx++;
            greedy = false;
        }

        if (greedy) {
            // Actually do the quantifier now
            switch (quantifierType) {
                case '{': {
                    //bracket();
                    int bracketEnd = idx;
                    int bracketMin = this.bracketMin;
                    int bracketOpt = this.bracketOpt;

                    // Pointer to the last terminal
                    int pos = ret;

                    // Process min first
                    for (int c = 0; c < bracketMin; c++) {
                        // Rewind stream and run it through again - more matchers coming
                        idx = idxBeforeTerminal;
                        setNextOfEnd(pos, pos = terminal(terminalFlags));
                    }

                    // Do the right thing for maximum ({m,})
                    if (bracketOpt == bracketUnbounded) {
                        // Drop through now and quantifier expression.
                        // We are done with the {m,} expr, so skip rest
                        idx = bracketEnd;
                        Operation.OpStar op = new Operation.OpStar();
                        insertNode(op, pos);
                        setNextOfEnd(pos + 1, pos);
                        break;
                    } else if (bracketOpt > 0) {
                        int opt[] = new int[bracketOpt + 1];
                        // Surround first optional terminal with MAYBE
                        Operation.OpMaybe op = new Operation.OpMaybe();
                        insertNode(op, pos);
                        opt[0] = pos;

                        // Add all the rest optional terminals with preceding MAYBEs
                        for (int c = 1; c < bracketOpt; c++) {
                            op = new Operation.OpMaybe();
                            opt[c] = appendNode(op);
                            // Rewind stream and run it through again - more matchers coming
                            idx = idxBeforeTerminal;
                            terminal(terminalFlags);
                        }

                        // Tie ends together
                        int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
                        for (int c = 0; c < bracketOpt; c++) {
                            setNextOfEnd(opt[c], end);
                            setNextOfEnd(opt[c] + 1, opt[c + 1]);
                        }
                    } else {
                        // Rollback terminal - no opt matchers present
                        //lenInstruction = pos;
                        while (instructions.size() > pos) {
                            instructions.remove(instructions.size()-1);
                        }
                        Operation.OpNothing nothing = new Operation.OpNothing();
                        appendNode(nothing);
                    }

                    // We are done. Skip the remainder of the {m,n} expr
                    idx = bracketEnd;
                    break;
                }

                case '?': {
                    Operation.OpMaybe maybe = new Operation.OpMaybe();
                    insertNode(maybe, ret);
                    Operation.OpNothing nothing = new Operation.OpNothing();
                    int n = appendNode(nothing);
                    setNextOfEnd(ret, n);
                    setNextOfEnd(ret + 1, n);
                    break;
                }

                case '*': {
                    Operation.OpStar star = new Operation.OpStar();
                    insertNode(star, ret);
                    setNextOfEnd(ret + 1, ret);
                    break;
                }

                case '+': {
                    Operation.OpContinue continu = new Operation.OpContinue();
                    insertNode(continu, ret);
                    Operation.OpPlus plus = new Operation.OpPlus();
                    int n = appendNode(plus);
                    setNextOfEnd(ret + 1, n);
                    setNextOfEnd(n, ret);
                    break;
                }
            }
        } else {
            // Not greedy (reluctant): Actually do the quantifier now
            switch (quantifierType) {
                case '?': {
                    Operation.OpReluctantMaybe reluctantMaybe = new Operation.OpReluctantMaybe();
                    insertNode(reluctantMaybe, ret);
                    //nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
                    int n = appendNode(new Operation.OpNothing());
                    //int n = node(RE.OP_NOTHING, 0);
                    setNextOfEnd(ret, n);
                    setNextOfEnd(ret + 1, n);
                    break;
                }

                case '*': {
                    Operation.OpReluctantStar reluctantStar = new Operation.OpReluctantStar();
                    insertNode(reluctantStar, ret);
                    setNextOfEnd(ret + 1, ret);
                    break;
                }

                case '+': {
                    insertNode(new Operation.OpContinue(), ret);
                    //nodeInsert(RE.OP_CONTINUE, 0, ret);
                    int n = appendNode(new Operation.OpReluctantPlus());
                    //int n = node(RE.OP_RELUCTANTPLUS, 0);
                    setNextOfEnd(n, ret);
                    setNextOfEnd(ret + 1, n);
                    break;
                }

                case '{': {
                    // reluctant {..}? - added by MHK
                    //bracket();
                    int bracketEnd = idx;
                    int bracketMin = this.bracketMin;
                    int bracketOpt = this.bracketOpt;

                    // Pointer to the last terminal
                    int pos = ret;

                    // Process min first
                    for (int c = 0; c < bracketMin; c++) {
                        // Rewind stream and run it through again - more matchers coming
                        idx = idxBeforeTerminal;
                        setNextOfEnd(pos, pos = terminal(terminalFlags));
                    }

                    // Do the right thing for maximum ({m,})
                    if (bracketOpt == bracketUnbounded) {
                        // Drop through now and quantifier expression.
                        // We are done with the {m,} expr, so skip rest
                        idx = bracketEnd;
                        insertNode(new Operation.OpReluctantStar(), pos);
                        //nodeInsert(RE.OP_RELUCTANTSTAR, 0, pos);
                        setNextOfEnd(pos + 1, pos);
                        break;
                    } else if (bracketOpt > 0) {
                        int opt[] = new int[bracketOpt + 1];
                        // Surround first optional terminal with MAYBE
                        insertNode(new Operation.OpReluctantMaybe(), pos);
                        //nodeInsert(RE.OP_RELUCTANTMAYBE, 0, pos);
                        opt[0] = pos;

                        // Add the remaining optional terminals, each preceded by its own MAYBE
                        for (int c = 1; c < bracketOpt; c++) {
                            opt[c] = appendNode(new Operation.OpReluctantMaybe());
                            //opt[c] = node(RE.OP_RELUCTANTMAYBE, 0);
                            // Rewind stream and run it through again - more matchers coming
                            idx = idxBeforeTerminal;
                            terminal(terminalFlags);
                        }

                        // Tie ends together
                        int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
                        for (int c = 0; c < bracketOpt; c++) {
                            setNextOfEnd(opt[c], end);
                            setNextOfEnd(opt[c] + 1, opt[c + 1]);
                        }
                    } else {
                        // Rollback terminal - no opt matchers present
                        while (instructions.size() > pos) {
                            instructions.remove(instructions.size() - 1);
                        }
                        appendNode(new Operation.OpNothing());
                    }

                    // We are done. Skip the remainder of the {m,n} expr
                    idx = bracketEnd;
                    break;
                }
            }
        }

        return ret;
    }

    /**
     * Compiles a single alternative of an "or" expression: a run of (possibly quantified)
     * pieces that are linked one after another until a '|' or ')' character, or the end
     * of the pattern, is reached.
     *
     * @param compilerFlags One-element array used as an in/out flag word. NODE_NULLABLE is
     *                      OR-ed into element 0 when no piece compiled here left its own
     *                      flag word equal to NODE_NORMAL (this includes the case where
     *                      the branch contains no pieces at all).
     * @return Index of the first node of the branch; when the branch is empty this is the
     *         index of a freshly appended "nothing" node
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int branch(int[] compilerFlags) throws RESyntaxException {
        final int[] pieceFlags = new int[1];
        int first = -1;            // first node produced for this branch, -1 while none exists
        int previous = -1;         // most recently produced node, -1 while none exists
        boolean sawNormalPiece = false;

        while (idx < len) {
            // A '|' or ')' terminates the branch; it is left for the caller to consume
            int ch = pattern.charAt(idx);
            if (ch == '|' || ch == ')') {
                break;
            }

            // Compile the next piece with a fresh flag word
            pieceFlags[0] = NODE_NORMAL;
            int current = piece(pieceFlags);
            if (pieceFlags[0] == NODE_NORMAL) {
                sawNormalPiece = true;
            }

            // Link the end of the preceding piece's chain to the new piece
            if (previous != -1) {
                setNextOfEnd(previous, current);
            }
            if (first == -1) {
                first = current;
            }
            previous = current;
        }

        // Nothing was compiled: represent the empty branch with a "nothing" node
        if (first == -1) {
            first = appendNode(new Operation.OpNothing());
        }

        // Report nullability back to the caller through the flag word
        if (!sawNormalPiece) {
            compilerFlags[0] |= NODE_NULLABLE;
        }

        return first;
    }

    /**
     * Compiles an expression that may be enclosed in parentheses. The parentheses are
     * handled here (rather than in a lower-level routine) so that the tail of every
     * alternative can be connected to one shared ending node.
     *
     * <p>If NODE_TOPLEVEL is not set in the flag word and the current character is '(',
     * the group is compiled either as a non-capturing cluster (when it starts with "(?:")
     * or as a capturing group. Otherwise the expression is terminated with an
     * end-of-program node. NODE_TOPLEVEL is always cleared from the flag word.</p>
     *
     * @param compilerFlags Flag value passed by reference (one-element array)
     * @return Node index of expression in instruction array
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int expr(int[] compilerFlags) throws RESyntaxException {
        // Value of the paren counter on entry; used for the close node of a capturing group
        final int groupNumber = parens;
        boolean grouped = false;       // true when an opening '(' was consumed here
        boolean capturing = false;     // true when that '(' opened a capturing group
        int first = -1;

        // Emit an opening node unless this is the (paren-less) top-level call
        boolean nested = (compilerFlags[0] & NODE_TOPLEVEL) == 0;
        if (nested && pattern.charAt(idx) == '(') {
            grouped = true;
            boolean clusterSyntax =
                    idx + 2 < len && pattern.charAt(idx + 1) == '?' && pattern.charAt(idx + 2) == ':';
            if (clusterSyntax) {
                // "(?:" introduces a cluster, i.e. a group that is not captured
                if (!isXPath30) {
                    syntaxError("Non-capturing groups allowed only in XPath3.0");
                }
                idx += 3;
                first = appendNode(new Operation.OpOpenCluster());
            } else {
                capturing = true;
                idx++;
                first = appendNode(new Operation.OpOpen(parens++));
            }
        }
        compilerFlags[0] &= ~NODE_TOPLEVEL;

        // Compile the first alternative and attach it after the opening node, if any
        int current = branch(compilerFlags);
        if (first == -1) {
            first = current;
        } else {
            setNextOfEnd(first, current);
        }

        // Further alternatives: only now is the first one wrapped in a branch node,
        // since a branch node is needed only when there is more than one alternative
        if (idx < len && pattern.charAt(idx) == '|') {
            insertNode(new Operation.OpBranch(), current);
            do {
                idx++;
                int alternative = appendNode(new Operation.OpBranch());
                setNextOfEnd(current, alternative);
                current = alternative;
                branch(compilerFlags);
            } while (idx < len && pattern.charAt(idx) == '|');
        }

        // Emit the terminating node: a close node for a group, otherwise end-of-program
        int end;
        if (!grouped) {
            end = appendNode(new Operation.OpEndProgram());
        } else {
            if (idx >= len || pattern.charAt(idx) != ')') {
                syntaxError("Missing close paren");
            } else {
                idx++;
            }
            if (capturing) {
                end = appendNode(new Operation.OpClose(groupNumber));
                captures.add(groupNumber);
            } else {
                end = appendNode(new Operation.OpCloseCluster());
            }
        }

        // Attach the terminating node to the end of the main chain
        setNextOfEnd(first, end);

        // Follow the 'next' offsets from the first node; for each branch node met on the
        // way, make the end of its operand chain (which starts at the following
        // instruction) point at the terminating node. The walk stops once a node with a
        // zero 'next' offset has been processed.
        int step = instructions.get(first).next;
        for (int cursor = first; step != 0 && cursor < instructions.size(); cursor += step) {
            if (instructions.get(cursor) instanceof Operation.OpBranch) {
                setNextOfEnd(cursor + 1, end);
            }
            step = instructions.get(cursor).next;
        }

        return first;
    }

    /**
     * Compiles a regular expression pattern into a program runnable by the pattern
     * matcher class 'RE'.
     *
     * @param pattern Regular expression pattern to compile (see RECompiler class
     *                for details).
     * @return A compiled regular expression program.
     * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
     * @see RECompiler
     * @see net.sf.saxon.regex.REMatcher
     */
    public REProgram compile(UnicodeString pattern) throws RESyntaxException {
        // Initialize variables for compilation
        this.pattern = pattern;                         // Save pattern in instance variable
        len = pattern.length();                         // Precompute pattern length for speed
        idx = 0;                                        // Set parsing index to the first character
        parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
        boolean nullable = false;

        if (reFlags.isLiteral()) {

            // 'q' flag is set
            int ret = literalAtom();
            Operation.OpEndProgram endNode = new Operation.OpEndProgram();
            int end = appendNode(endNode);
            setNextOfEnd(ret, end);

        } else {

            if (reFlags.isAllowWhitespace()) {
                // 'x' flag is set. Preprocess the expression to strip whitespace, other than between
                // square brackets
                FastStringBuffer sb = new FastStringBuffer(pattern.length());
                int nesting = 0;
                boolean astral = false;
                boolean escaped = false;
                for (int i=0; i 65535) {
                        astral = true;
                    }
                    if (ch == '\\' && !escaped) {
                        escaped = true;
                        sb.appendWideChar(ch);
                    } else if (ch == '[' && !escaped) {
                        nesting++;
                        escaped = false;
                        sb.appendWideChar(ch);
                    } else if (ch == ']' && !escaped) {
                        nesting--;
                        escaped = false;
                        sb.appendWideChar(ch);
                    } else if (nesting==0 && Whitespace.isWhitespace(ch)) {
                        // no action
                    } else {
                        escaped = false;
                        sb.appendWideChar(ch);
                    }
                }
                if (astral) {
                    pattern = new GeneralUnicodeString(sb);
                } else {
                    pattern = new BMPString(sb);
                }
                this.pattern = pattern;
                this.len = pattern.length();
            }

            // Initialize pass by reference flags value
            int[] compilerFlags = {NODE_TOPLEVEL};

            // Parse expression
            expr(compilerFlags);

            nullable = (compilerFlags[0] & NODE_NULLABLE) != 0;

            // Should be at end of input
            if (idx != len) {
                if (pattern.charAt(idx) == ')') {
                    syntaxError("Unmatched close paren");
                }
                syntaxError("Unexpected input remains");
            }

        }

        // Return the result
        Operation[] ops = new Operation[instructions.size()];
        for (int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy