// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// net.sf.saxon.regex.RECompiler Maven / Gradle / Ivy
//
// There is a newer version: 12.5
// Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Originally part of Apache's Jakarta project (downloaded January 2012),
 * this file has been extensively modified for integration into Saxon by
 * Michael Kay, Saxonica.
 */

package net.sf.saxon.regex;

import net.sf.saxon.regex.charclass.*;
import net.sf.saxon.str.StringConstants;
import net.sf.saxon.str.UnicodeBuilder;
import net.sf.saxon.str.UnicodeString;
import net.sf.saxon.transpile.CSharp;
import net.sf.saxon.value.Whitespace;
import net.sf.saxon.z.*;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A regular expression compiler class.  This class compiles a pattern string into a
 * regular expression program interpretable by the RE evaluator class.  The 'recompile'
 * command line tool uses this compiler to pre-compile regular expressions for use
 * with RE.  For a description of the syntax accepted by RECompiler and what you can
 * do with regular expressions, see the documentation for the RE matcher class.
 *
 * @author Jonathan Locke
 * @author Michael McCallum
 * @version $Id: RECompiler.java 518156 2007-03-14 14:31:26Z vgritsenko $
 * @see net.sf.saxon.regex.REMatcher
 */

/*
 * Changes made for Saxon:
 *
 * - handle full Unicode repertoire (esp non-BMP characters) using UnicodeString class for
 *   both the source string and the regular expression
 * - added support for subtraction in a character class
 * - in a character range, changed the condition start < end to start <= end
 * - removed support for [:POSIX:] construct
 * - added support for \p{} and \P{} classes
 * - removed support for unsupported escapes: f, x, u, b, octal characters; added i and c
 * - changed the handling of hyphens within square brackets, and ^ appearing other than at the start
 * - changed the data structure used for the executable so that terms that match a character class
 *   now reference an IntPredicate that tests for membership of the character in a set
 * - added support for reluctant {n,m}? quantifiers
 * - allow a quantifier on a nullable expression
 * - allow a quantifier on '$' or '^'
 * - some constructs (back-references, non-capturing groups, etc) are conditional on which XPath/XSD version
 *   is in use
 * - regular expression flags are now fixed at the time the RE is compiled, this can no longer be deferred
 *   until the RE is evaluated
 * - split() function includes a zero-length string at the end of the returned sequence if the last
 *   separator is at the end of the string
 * - added support for the 'q' and 'x' flags; improved support for the 'i' flag
 * - added a method to determine whether there is an anchored match (for XSD use)
 * - tests for newline (e.g in multiline mode) now match \n only, as required by the XPath specification
 * - reorganised the executable program to use Operation objects rather than integer opcodes
 * - introduced optimization for non-backtracking + and * operators (with simple operands)
 *
 * Further changes made February 2014:
 * - complete rewrite of the run-time engine to use an interpreter approach directly on the parsed expression
 *   tree, bypassing the generation of a finite state machine. This achieves a substantial reduction in
 *   recursive depth; the old code had one level of recursion per input character in some cases. In addition
 *   the compiled code for expressions involving large finite counters is much more compact.
 */
public class RECompiler {

    // Input state for compiling regular expression
    UnicodeString pattern;                                     // The regular expression being compiled
    int len;                                            // Length of the pattern string
    int idx;                                            // Current input index into pattern
    int capturingOpenParenCount;                                         // Used by escape() as the upper bound (exclusive) on back-reference numbers

    // Node flags
    static final int NODE_NORMAL = 0;                   // No flags (nothing special)
    static final int NODE_TOPLEVEL = 2;                 // True if top level expr

    // Results of the most recent call on bracket(), which parses a {m,n} quantifier
    int bracketMin;                                     // Minimum number of matches
    int bracketMax;                                     // Maximum number of matches (Integer.MAX_VALUE for {m,})

    // Dialect switches; setFlags() overwrites all three from the supplied REFlags
    boolean isXPath = true;
    boolean isXPath30 = true;
    boolean isXSD11 = false;
    // Group numbers that a back-reference may refer to; escape() rejects any \N not in this set
    IntHashSet captures = new IntHashSet();
    // Set to true by escape() as soon as a valid back-reference has been parsed
    boolean hasBackReferences = false;

    // The flags supplied to setFlags(); null until that method has been called
    REFlags reFlags;

    // Warning messages collected so far; created lazily, so null means "no warnings"
    List warnings;

    // Compile-time switch: when true, trace() wraps operations in an OpTrace
    private final static boolean TRACING = false;

    /**
     * Constructor. Creates a compiler with the default field settings; the regular
     * expression flags are supplied separately by calling {@link #setFlags(REFlags)}.
     */
    public RECompiler() {
        // Nothing to do: all state is set up by the field initializers
    }

    /**
     * Supply the regular expression flags that govern this compilation. The flags object
     * is retained, and the three dialect switches (XPath 2.0 extensions, XPath 3.0
     * extensions, XSD 1.1 syntax) are copied out of it.
     *
     * @param flags the regular expression flags
     */

    public void setFlags(REFlags flags) {
        reFlags = flags;
        isXPath = reFlags.isAllowsXPath20Extensions();
        isXPath30 = reFlags.isAllowsXPath30Extensions();
        isXSD11 = reFlags.isAllowsXSD11Syntax();
    }


    /**
     * Record a warning message. The list of warnings is allocated when the first
     * warning is reported.
     *
     * @param s the text of the warning
     */
    private void warning(String s) {
        if (warnings != null) {
            warnings.add(s);
            return;
        }
        warnings = new ArrayList<>(4);
        warnings.add(s);
    }

    /**
     * Get the warnings reported so far. Intended to be called once compilation is complete.
     *
     * @return the list of warning messages; an empty list if no warning was reported
     */

    public List getWarnings() {
        return warnings == null ? Collections.emptyList() : warnings;
    }

    /**
     * Report a condition that indicates a bug in the compiler itself rather than
     * a fault in the regular expression. This method never returns normally.
     *
     * @throws Error always (specifically, an AssertionError)
     */
    void internalError() throws Error {
        final String message = "Internal error!";
        throw new AssertionError(message);
    }

    /**
     * Report a syntax error in the regular expression. The exception carries the
     * current input position. This method never returns normally.
     *
     * @param s the error message
     * @throws net.sf.saxon.regex.RESyntaxException
     *          always; the regular expression has invalid syntax.
     */
    void syntaxError(String s) throws RESyntaxException {
        final int position = idx;
        throw new RESyntaxException(s, position);
    }

    /**
     * Wrap an operation in tracing code, if tracing is enabled.
     *
     * @param base the operation to which trace code is to be added
     * @return the supplied operation unchanged if the TRACING switch is off or the operation
     * is already an OpTrace; otherwise a new OpTrace wrapping the supplied operation
     */

    static Operation trace(Operation base) {
        if (!TRACING || base instanceof OpTrace) {
            return base;
        }
        return new OpTrace(base);
    }

    /**
     * Parse a counted quantifier of the form {m}, {m,} or {m,n}, starting at the
     * opening brace. On return the input position is just after the closing brace,
     * and the bounds are available in {@code bracketMin} and {@code bracketMax}:
     * for {m} both are m; for {m,} the maximum is Integer.MAX_VALUE.
     *
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    void bracket() throws RESyntaxException {
        // Current character must be a '{'
        if (idx >= len || pattern.codePointAt(idx++) != '{') {
            internalError();
        }

        // Get min ('m' of {m,n}) number
        bracketMin = parseBracketNumber();

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // {m}: the maximum is the same as the minimum
        if (pattern.codePointAt(idx) == '}') {
            idx++;
            bracketMax = bracketMin;
            return;
        }

        // Must have at least {m,} and maybe {m,n}. (We know idx < len at this point.)
        if (pattern.codePointAt(idx++) != ',') {
            syntaxError("Expected comma");
        }

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // If {m,} max is unlimited
        if (pattern.codePointAt(idx) == '}') {
            idx++;
            bracketMax = Integer.MAX_VALUE;
            return;
        }

        // Get max ('n' of {m,n}) number
        bracketMax = parseBracketNumber();

        // The maximum must not be less than the minimum
        if (bracketMax < bracketMin) {
            syntaxError("Bad range");
        }

        // Must have close brace
        if (idx >= len || pattern.codePointAt(idx++) != '}') {
            syntaxError("Missing close brace");
        }
    }

    /**
     * Parse a non-empty run of ASCII digits at the current position as a decimal integer,
     * leaving the input position just after the last digit.
     *
     * @return the value of the number
     * @throws net.sf.saxon.regex.RESyntaxException
     *          if there is no digit at the current position, or if the digits do not
     *          form a number that fits in an int
     */
    private int parseBracketNumber() throws RESyntaxException {
        if (idx >= len || !isAsciiDigit(pattern.codePointAt(idx))) {
            syntaxError("Expected digit");
        }
        StringBuilder digits = new StringBuilder(16);
        while (idx < len && isAsciiDigit(pattern.codePointAt(idx))) {
            digits.appendCodePoint(pattern.codePointAt(idx++));
        }
        try {
            return Integer.parseInt(digits.toString());
        } catch (NumberFormatException e) {
            // Only possible when the digit string is too large for an int
            syntaxError("Expected valid number");
            return 0; // not reached: syntaxError() always throws
        }
    }

    /**
     * Test whether a code point is one of the ten ASCII decimal digits
     *
     * @param ch the character to be tested
     * @return true if the character is in the range '0' to '9' inclusive
     */

    private static boolean isAsciiDigit(int ch) {
        if (ch < '0') {
            return false;
        }
        return ch <= '9';
    }

    /**
     * Parse an escape sequence. On entry the current position must be at the backslash;
     * on successful return the position is just after the escape sequence.
     *
     * <p>The following escapes are recognized: the single-character escapes \n, \r, \t and
     * the escaped metacharacters; \$ (XPath only); the multi-character escapes
     * \s \S \i \I \c \C \d \D \w \W; category and block escapes \p{..} and \P{..}; and,
     * in XPath outside square brackets, back-references \1 to \9 (optionally followed by
     * further digits, for as long as the resulting number identifies an existing group).
     * Anything else is a syntax error.</p>
     *
     * @param inSquareBrackets true if the escape sequence is within square brackets
     * @return a CharacterClass that matches the character or characters represented
     *         by this escape sequence. For a single-character escape this is a
     *         SingletonCharacterClass; for a back-reference it is a {@link BackReference}
     *         holding the group number. The final {@code return null} is reached only if
     *         syntaxError() returns normally, which it never does.
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    CharacterClass escape(boolean inSquareBrackets) throws RESyntaxException {
        // "Shouldn't" happen
        if (pattern.codePointAt(idx) != '\\') {
            internalError();
        }

        // Escape shouldn't occur as last character in string!
        if (idx + 1 == len) {
            syntaxError("Escape terminates string");
        }

        // Consume the backslash and the following character, then switch on that character
        idx += 2;
        int escapeChar = pattern.codePointAt(idx - 1);
        switch (escapeChar) {

            case 'n':
                return new SingletonCharacterClass('\n');
            case 'r':
                return new SingletonCharacterClass('\r');
            case 't':
                return new SingletonCharacterClass('\t');

            case '\\':
            case '|':
            case '.':
            case '-':
            case '^':
            case '?':
            case '*':
            case '+':
            case '{':
            case '}':
            case '(':
            case ')':
            case '[':
            case ']':
                return new SingletonCharacterClass(escapeChar);

            case '$':
                if (isXPath) {
                    return new SingletonCharacterClass(escapeChar);
                } else {
                    syntaxError("In XSD, '$' must not be escaped");
                }
                break;

            case 's':
                return Categories.ESCAPE_s;

            case 'S':
                return Categories.ESCAPE_S;

            case 'i':
                return Categories.ESCAPE_i;

            case 'I':
                return Categories.ESCAPE_I;

            case 'c':
                return Categories.ESCAPE_c;

            case 'C':
                return Categories.ESCAPE_C;

            case 'd':
                return Categories.ESCAPE_d;

            case 'D':
                return Categories.ESCAPE_D;

            case 'w':
                return Categories.ESCAPE_w;

            case 'W':
                return Categories.ESCAPE_W;


            case 'p':
            case 'P':

                // Note: escapeChar is an int, so it must be cast to char when building a
                // message; otherwise the message would show the numeric code (112 or 80)
                if (idx == len) {
                    syntaxError("Expected '{' after \\" + (char) escapeChar);
                }
                if (pattern.codePointAt(idx) != '{') {
                    syntaxError("Expected '{' after \\" + (char) escapeChar);
                }
                int from = idx++;
                int close = (int)pattern.indexOf('}', from);
                if (close == -1) {
                    syntaxError("No closing '}' after \\" + (char) escapeChar);
                }
                String block = pattern.substring(idx, close).toString();
                if (block.length() == 1 || block.length() == 2) {
                    // One or two characters: a general category such as L or Lu
                    CharacterClass primary = Categories.getCategory(block);
                    if (primary == null) {
                        syntaxError("Unknown character category " + block);
                    }
                    idx = close + 1;
                    if (escapeChar == 'p') {
                        return primary;
                    } else {
                        return makeComplement(primary);
                    }
                } else if (block.startsWith("Is")) {
                    // IsXXXX: a Unicode block
                    String blockName = block.substring(2);
                    IntSet uniBlock = UnicodeBlocks.getBlock(blockName);
                    if (uniBlock == null) {
                        // XSD 1.1 says this is not an error, but by default we reject it
                        if (reFlags.isAllowUnknownBlockNames()) {
                            warning("Unknown Unicode block: " + blockName);
                            idx = close + 1;
                            return EmptyCharacterClass.getComplement();
                        } else {
                            syntaxError("Unknown Unicode block: " + blockName);
                        }
                    }
                    idx = close + 1;
                    IntSetCharacterClass primary = new IntSetCharacterClass(uniBlock);
                    if (escapeChar == 'p') {
                        return primary;
                    } else {
                        return makeComplement(primary);
                    }
                } else {
                    syntaxError("Unknown character category: " + block);
                }
                break;

            case '0':
                syntaxError("Octal escapes not allowed");
                break;

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':

                if (inSquareBrackets) {
                    syntaxError("Backreference not allowed within character class");
                } else if (isXPath) {
                    int backRef = escapeChar - '0';
                    // Absorb further digits for as long as the number remains a possible group number
                    while (idx < len) {
                        int c1 = (int)StringConstants.ZERO_TO_NINE.indexOf(pattern.codePointAt(idx));
                        if (c1 < 0) {
                            break;
                        } else {
                            int backRef2 = backRef * 10 + c1;
                            if (backRef2 > (capturingOpenParenCount - 1)) {
                                break;
                            } else {
                                backRef = backRef2;
                                idx++;
                            }
                        }

                    }
                    if (!captures.contains(backRef)) {
                        String explanation = backRef > (capturingOpenParenCount - 1) ? "(no such group)" : "(group not yet closed)";
                        syntaxError("invalid backreference \\" + backRef + " " + explanation);
                    }
                    hasBackReferences = true;
                    return new BackReference(backRef);
                } else {
                    syntaxError("digit not allowed after \\");
                }
                break;

            default:

                // Other characters not allowed in XSD regexes. The offending character may be
                // outside the BMP, so convert the code point rather than casting it to char.
                syntaxError("Escape character '" + new String(Character.toChars(escapeChar)) + "' not allowed");
                break;
        }
        return null;
    }

    /**
     * For convenience a back-reference is treated as an CharacterClass, although this a fiction
     */

    class BackReference extends SingletonCharacterClass {
        public BackReference(int number) {
            super(number);
        }
    }


    /**
     * Compile a character class expression (in square brackets). On entry the current
     * position must be at the opening '['; on return it is just after the matching ']'.
     *
     * <p>The class is built from three parts: a set of individual characters and ranges,
     * the union of any multi-character escapes such as \d (the "addend"), and an optional
     * subtracted class introduced by "-[" (the "subtrahend"). If the group starts with '^'
     * the union of the first two parts is complemented before the subtraction is applied.
     * When the case-independent flag is set, the case variants of every character and range
     * are added to the set as it is built.</p>
     *
     * @return a CharacterClass that tests whether a character matches this character class
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    CharacterClass parseCharacterClass() throws RESyntaxException {
        // Check for bad calling or empty class
        if (pattern.codePointAt(idx) != '[') {
            internalError();
        }

        // Check for unterminated or empty class
        int index = ++idx;
        if ((idx + 1) >= len || pattern.codePointAt(index) == ']') {
            syntaxError("Missing ']'");
        }

        // Parse class declaration
        int simpleChar;
        boolean positive = true;
        boolean definingRange = false;
        int rangeStart = -1;
        int rangeEnd;
        IntRangeSet range = new IntRangeSet();
        CharacterClass addend = null;
        CharacterClass subtrahend = null;
        if (thereFollows('^')) {
            if (thereFollows('^', '-', '[')) {
                syntaxError("Nothing before subtraction operator");
            } else if (thereFollows('^', ']')) {
                syntaxError("Empty negative character group");
            } else {
                positive = false;
                idx++;
            }
        } else if (thereFollows('-','[')) {
            syntaxError("Nothing before subtraction operator");
        }
        while (idx < len && pattern.codePointAt(idx) != ']') {
            int ch = pattern.codePointAt(idx);
            simpleChar = -1;
            switch (ch) {
                case '[':
                    syntaxError("Unescaped '[' within square brackets");
                    break;
                case '\\': {
                    // Escape always advances the stream
                    CharacterClass cc = escape(true);
                    if (cc instanceof SingletonCharacterClass) {
                        simpleChar = ((SingletonCharacterClass) cc).getCodepoint();
                        break;
                    } else {
                        if (definingRange) {
                            syntaxError("Multi-character escape cannot follow '-'");
                        } else if (addend == null) {
                            addend = cc;
                        } else {
                            addend = makeUnion(addend, cc);
                        }
                        continue;
                    }
                }
                case '-':
                    if (thereFollows('-','[')) {
                        idx++;
                        subtrahend = parseCharacterClass();
                        if (!thereFollows(']')) {
                            syntaxError("Expected closing ']' after subtraction");
                        }
                        // The subtraction contributes no character of its own, so there is
                        // nothing to add to the range set. We are now positioned at the
                        // closing ']', which terminates the loop.
                        continue;
                    } else if (thereFollows('-',']')) {
                        simpleChar = '-';
                        idx++;
                    } else if (rangeStart >= 0) {
                        definingRange = true;
                        idx++;
                        continue;
                    } else if (definingRange) {
                        syntaxError("Bad range");
                    } else if (thereFollows('-','-') && !thereFollows('-','-','[')) {
                        syntaxError("Unescaped hyphen as start of range");
                    } else if (!isXSD11 && pattern.codePointAt(idx - 1) != '[' && pattern.codePointAt(idx - 1) != '^' && !thereFollows(']') && !thereFollows('-','[')) {
                        syntaxError("In XSD 1.0, hyphen is allowed only at the beginning or end of a positive character group");
                    } else {
                        simpleChar = '-';
                        idx++;
                    }
                    break;

                default:
                    simpleChar = ch;
                    idx++;
                    break;
            }

            // Handle simple character simpleChar
            if (definingRange) {
                // if we are defining a range make it now
                rangeEnd = simpleChar;

                // Actually create a range if the range is ok
                if (rangeStart > rangeEnd) {
                    syntaxError("Bad character range: start > end");
                    // Technically this is not an error in XSD, merely a no-op; but it is so
                    // utterly pointless that it is almost certainly a mistake; and we have no
                    // way of indicating warnings.
                }
                range.addRange(rangeStart, rangeEnd);
                if (reFlags.isCaseIndependent()) {
                    // Special-case A-Z and a-z
                    if (rangeStart == 'a' && rangeEnd == 'z') {
                        range.addRange('A', 'Z');
                        for (int v = 0; v < CaseVariants.ROMAN_VARIANTS.length; v++) {
                            range.add(CaseVariants.ROMAN_VARIANTS[v]);
                        }
                    } else if (rangeStart == 'A' && rangeEnd == 'Z') {
                        range.addRange('a', 'z');
                        for (int v = 0; v < CaseVariants.ROMAN_VARIANTS.length; v++) {
                            range.add(CaseVariants.ROMAN_VARIANTS[v]);
                        }
                    } else {
                        for (int k = rangeStart; k <= rangeEnd; k++) {
                            int[] variants = CaseVariants.getCaseVariants(k);
                            for (int variant : variants) {
                                range.add(variant);
                            }
                        }
                    }
                }

                // We are done defining the range
                definingRange = false;
                rangeStart = -1;
            } else {
                // If simple character and not start of range, include it (see XSD 1.1 rules)
                if (thereFollows('-')) {
                    if (thereFollows('-','[') || thereFollows('-',']') || thereFollows('-','-','[')) {
                        // The hyphen that follows is not a range operator, so this is an
                        // ordinary member of the class and needs its case variants like any other
                        addWithCaseVariants(range, simpleChar);
                    } else if (thereFollows('-','-')) {
                        syntaxError("Unescaped hyphen cannot act as end of range");
                    } else {
                        rangeStart = simpleChar;
                    }
                } else {
                    addWithCaseVariants(range, simpleChar);
                }
            }
        }

        // Shouldn't be out of input
        if (idx == len) {
            syntaxError("Unterminated character class");
        }

        // Absorb the ']' end of class marker
        idx++;
        CharacterClass result = new IntSetCharacterClass(range);
        if (addend != null) {
            result = makeUnion(result, addend);
        }
        if (!positive) {
            result = makeComplement(result);
        }
        if (subtrahend != null) {
            result = makeDifference(result, subtrahend);
        }
        return result;
    }

    /**
     * Add a single character to a set and, if the case-independent flag is set,
     * add all its case variants as well.
     *
     * @param range the set to which the character is to be added
     * @param ch    the code point to be added
     */
    private void addWithCaseVariants(IntRangeSet range, int ch) {
        range.add(ch);
        if (reFlags.isCaseIndependent()) {
            int[] variants = CaseVariants.getCaseVariants(ch);
            for (int variant : variants) {
                range.add(variant);
            }
        }
    }

    /**
     * Test whether the string starting at the current position is equal to some specified
     * sequence of characters
     *
     * @param chars the string being tested, as an array of characters which must not include surrogates
     * @return true if the specified string is present
     */

    private boolean thereFollows(int... chars) {
        if (idx + chars.length > len) {
            return false;
        }
        for (int i=0; i p1.test(ch) || p2.test(ch));
        } else {
            return new IntSetCharacterClass(is1.union(is2));
        }
    }

    /**
     * Make the difference of two character classes (matches if p1 matches and p2 does not match)
     *
     * @param p1 the first
     * @param p2 the second
     * @return the result. If either operand is the empty character class, p1 is returned
     * unchanged; if both operands can supply their members as an IntSet the result is based
     * on the difference of the two sets; otherwise it tests the two operands directly.
     */

    public static CharacterClass makeDifference(CharacterClass p1, CharacterClass p2) {
        if (p1 == EmptyCharacterClass.getInstance()) {
            return p1;
        }
        if (p2 == EmptyCharacterClass.getInstance()) {
            return p1;
        }
        IntSet is1 = p1.getIntSet();
        IntSet is2 = p2.getIntSet();
        if (is1 == null || is2 == null) {
            // Test the operands directly, rather than constructing a new
            // difference predicate every time a character is tested
            return new PredicateCharacterClass(ch -> p1.test(ch) && !p2.test(ch));
        } else {
            return new IntSetCharacterClass(is1.except(is2));
        }
    }

    /**
     * Make the complement of a character class (matches if p1 does not match)
     *
     * @param p1 the operand
     * @return the result. If the operand is itself an InverseCharacterClass, the class
     * that it inverts is returned, rather than wrapping it a second time.
     */

    public static CharacterClass makeComplement(CharacterClass p1) {
        return p1 instanceof InverseCharacterClass
                ? ((InverseCharacterClass) p1).getComplement()
                : new InverseCharacterClass(p1);
    }

    /**
     * Absorb an atomic character string.  This method is a little tricky because
     * it can un-include the last character of string if a quantifier operator follows.
     * This is correct because *+? have higher precedence than concatenation (thus
     * ABC* means AB(C*) and NOT (ABC)*).
     *
     * <p>Characters are accumulated until the end of the pattern, a metacharacter, or an
     * escape that is not an IntValuePredicate is reached; the terminating construct is
     * not consumed.</p>
     *
     * @return an OpAtom operation matching the accumulated string (wrapped by trace())
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    Operation parseAtom() throws RESyntaxException {

        // Length of atom (number of characters added to ub so far)
        int lenAtom = 0;

        // Loop while we've got input

        // The characters of the atom are accumulated here
        UnicodeBuilder ub = new UnicodeBuilder();

        // Avoid "break loop" construct to allow conversion to C#
        boolean breakAtomLoop = false;
        while (idx < len) {
            // Is there a next char?
            if ((idx + 1) < len) {
                // c is the character that follows the current character (or current escape)
                int c = pattern.codePointAt(idx + 1);

                // If the next 'char' is an escape, look past the whole escape
                if (pattern.codePointAt(idx) == '\\') {
                    // escape() is called here only to find where the escape ends;
                    // its result is discarded and the position is restored afterwards
                    int idxEscape = idx;
                    escape(false);
                    if (idx < len) {
                        c = pattern.codePointAt(idx);
                    }
                    idx = idxEscape;
                }

                // Switch on next char
                switch (c) {
                    case '{':
                    case '?':
                    case '*':
                    case '+':

                        // If the next character is a quantifier operator and our atom is non-empty, the
                        // current character should bind to the quantifier operator rather than the atom
                        if (lenAtom != 0) {
                            breakAtomLoop = true;
                        }
                        break;
                }
            }
            if (breakAtomLoop) {
                break;
            }

            // Switch on current char
            switch (pattern.codePointAt(idx)) {
                // These characters end the atom; they are left for the caller to deal with
                case ']':
                case '.':
                case '[':
                case '(':
                case ')':
                case '|':
                    breakAtomLoop = true;
                    break;

                case '{':
                case '?':
                case '*':
                case '+':

                    // We should have an atom by now
                    if (lenAtom == 0) {
                        // No atom before quantifier
                        syntaxError("No expression before quantifier");
                    }
                    breakAtomLoop = true;
                    break;

                case '}':
                    syntaxError("Unescaped right curly brace");
                    breakAtomLoop = true;
                    break;

                case '\\': {
                    // Get the escaped character (advances input automatically)
                    int idxBeforeEscape = idx;
                    CharacterClass charClass = escape(false);

                    // Check if it's a simple escape (as opposed to, say, a backreference)
                    if (!(charClass instanceof IntValuePredicate)) {
                        // Not a simple escape, so backup to where we were before the escape.
                        idx = idxBeforeEscape;
                        breakAtomLoop = true;
                        break;
                    }

                    // Add escaped char to atom
                    ub.append(((IntValuePredicate) charClass).getTarget());
                    lenAtom++;
                    break;
                }

                case '^':
                case '$':
                    if (isXPath) {
                        // In XPath these are metacharacters, so they end the atom
                        breakAtomLoop = true;
                        break;
                    }
                    // else fall through ($ is not a metacharacter in XSD)
                    // NOTE(review): presumably the next line makes the fall-through explicit
                    // for the C# conversion; in Java control simply falls into the default case
                    CSharp.emitCode("goto default;");

                default:

                    // Add normal character to atom
                    int index = idx++;
                    ub.append(pattern.codePointAt(index));
                    lenAtom++;
                    break;
            }
            if (breakAtomLoop) {
                break;
            }
        }

        // This shouldn't happen: it means no character at all was absorbed
        if (ub.isEmpty()) {
            internalError();
        }

        // Return the instruction
        return trace(new OpAtom(ub.toUnicodeString()));
    }


    /**
     * Parse one terminal of the regular expression, starting at the current input position.
     *
     * <p>Anchors (XPath only), the dot wildcard, bracketed character classes, parenthesized
     * groups, back-references and escapes that denote a character class are each turned
     * directly into an operation. Everything else, including a simple single-character
     * escape, is handed over to {@code parseAtom()}.</p>
     *
     * @param flags flags passed by reference; forwarded to {@code parseExpr} when the terminal
     *              is a parenthesized group
     * @return the operation representing the terminal
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    Operation parseTerminal(int[] flags) throws RESyntaxException {
        final int startChar = pattern.codePointAt(idx);
        switch (startChar) {
            case '^':
                // Outside XPath, '^' is not an anchor: leave it for the atom parser
                if (!isXPath) {
                    break;
                }
                idx++;
                return trace(new OpBOL());

            case '$':
                // Outside XPath, '$' is not an anchor: leave it for the atom parser
                if (!isXPath) {
                    break;
                }
                idx++;
                return trace(new OpEOL());

            case '.': {
                idx++;
                IntPredicateProxy dotPredicate;
                if (!reFlags.isSingleLine()) {
                    // without the 's' flag, "." matches everything except \n and \r. See also bug 15594.
                    dotPredicate = IntPredicateLambda.of(cp -> !(cp == '\n' || cp == '\r'));
                } else {
                    // with the 's' flag, '.' matches everything
                    dotPredicate = IntSetPredicate.ALWAYS_TRUE;
                }
                return trace(new OpCharClass(dotPredicate));
            }

            case '[': {
                CharacterClass bracketed = parseCharacterClass();
                return trace(new OpCharClass(bracketed));
            }

            case '(':
                return parseExpr(flags);

            case '\\': {
                // escape() moves the input position, so remember where the escape started
                final int escapeStart = idx;
                CharacterClass esc = escape(false);

                if (esc instanceof BackReference) {
                    // the "character class" is really a carrier for the group number
                    int groupNr = ((BackReference) esc).getCodepoint();
                    if (groupNr >= capturingOpenParenCount) {
                        syntaxError("Bad backreference");
                    }
                    return trace(new OpBackReference(groupNr));
                }
                if (!(esc instanceof IntSingletonSet)) {
                    // an escape denoting a set of characters
                    return trace(new OpCharClass(esc));
                }
                // A simple escape belongs in an atom: rewind and let parseAtom() re-read it
                idx = escapeStart;
                break;
            }

            case '?':
            case '*':
            case '+':
            case '{':
                syntaxError("No expression before quantifier");
                break;

            case ')':
                syntaxError("Unexpected closing ')'");
                break;

            case ']':
                syntaxError("Unexpected closing ']'");
                break;

            case '|':
                internalError();
                break;

            case 0:
                syntaxError("Unexpected end of input");
                break;
        }

        // Whatever got here was not handled above, so it is treated as the start of an atom.
        return parseAtom();
    }

    /**
     * Compile a piece: one terminal followed by an optional quantifier
     * ({@code ?}, {@code *}, {@code +} or <code>{n,m}</code>) and, after a quantifier,
     * an optional {@code ?} that makes the quantifier reluctant.
     *
     * <p>Special cases:</p>
     * <ul>
     *     <li>A quantifier on {@code ^} or {@code $} is legal but pointless: if it allows zero
     *     occurrences the whole piece is replaced by a no-op, otherwise the quantifier is ignored.
     *     In both cases a following reluctance marker is consumed.</li>
     *     <li>If the terminal can match the empty string anywhere, {@code ?} is ignored,
     *     {@code +} is treated as {@code *}, and the lower bound of <code>{n,m}</code> is
     *     treated as zero. The upper bound is retained because it still limits how much
     *     input the piece can consume.</li>
     * </ul>
     *
     * @param flags single-element array passed by reference; the flags reported by the
     *              terminal are OR-ed into element 0
     * @return the operation for the terminal, wrapped in a repeat operation where needed
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    Operation piece(int[] flags) throws RESyntaxException {

        // Values to pass by reference to parseTerminal()
        int[] terminalFlags = {NODE_NORMAL};

        // Get terminal symbol
        Operation ret = parseTerminal(terminalFlags);

        // Or in flags from terminal symbol
        flags[0] |= terminalFlags[0];

        // Nothing can follow the terminal at end of input
        if (idx >= len) {
            return ret;
        }

        // Set when the operand of a {n,m} quantifier can itself match the empty string
        boolean nullableOperand = false;
        int quantifierType = pattern.codePointAt(idx);
        switch (quantifierType) {
            case '?':
            case '*':
            case '+':

                // Eat quantifier character
                idx++;

                // Drop through
                CSharp.emitCode("goto case '{';");

            case '{':

                if (quantifierType == '{') {
                    // bracket() parses the bounds into bracketMin/bracketMax
                    bracket();
                }

                if (ret instanceof OpBOL || ret instanceof OpEOL) {
                    // Pretty meaningless, but legal. If the quantifier allows zero occurrences, ignore the instruction.
                    // Otherwise, ignore the quantifier
                    if (quantifierType == '?' || quantifierType == '*' ||
                            (quantifierType == '{' && bracketMin == 0)) {
                        // A reluctance marker belongs to the quantifier just consumed; swallow it here,
                        // otherwise the next piece would start with a dangling '?'
                        skipReluctanceMarker();
                        return new OpNothing();
                    } else {
                        quantifierType = 0;
                    }
                }
                if (ret.matchesEmptyString() == Operation.MATCHES_ZLS_ANYWHERE) {
                    if (quantifierType == '?') {
                        // can ignore the quantifier
                        quantifierType = 0;
                    } else if (quantifierType == '+') {
                        // '*' and '+' are equivalent
                        quantifierType = '*';
                    } else if (quantifierType == '{') {
                        // Only the lower bound is meaningless: (a?){2} must not match "aaa",
                        // and (a*){0} must match nothing but the empty string
                        nullableOperand = true;
                    }
                }
                break;

        }

        // If the next character is a '?', make the quantifier non-greedy (reluctant)
        boolean greedy = !skipReluctanceMarker();

        int min = 1;
        int max = 1;
        switch (quantifierType) {
            case '{':
                // NOTE(review): assumes the repeat operations accept min=0 with a finite max for
                // an operand that can match the empty string - confirm against OpRepeat
                min = nullableOperand ? 0 : this.bracketMin;
                max = this.bracketMax;
                break;
            case '?':
                min = 0;
                max = 1;
                break;
            case '+':
                min = 1;
                max = Integer.MAX_VALUE;
                break;
            case '*':
                min = 0;
                max = Integer.MAX_VALUE;
                break;
        }

        Operation result;
        if (max == 0) {
            // Zero repetitions: the operand never takes part in a match
            result = new OpNothing();
        } else if (min == 1 && max == 1) {
            // No effective quantifier
            return ret;
        } else {
            // getMatchLength() == -1 selects the general repeat operation;
            // any other value selects the fixed-length variants
            int fixedLength = ret.getMatchLength();
            if (fixedLength == -1) {
                result = new OpRepeat(ret, min, max, greedy);
            } else if (greedy) {
                result = new OpGreedyFixed(ret, min, max, fixedLength);
            } else {
                result = new OpReluctantFixed(ret, min, max, fixedLength);
            }
        }
        return trace(result);

    }

    /**
     * Consume a {@code ?} reluctance marker at the current input position, if there is one.
     *
     * @return true if a marker was present (and has been consumed), false otherwise
     * @throws net.sf.saxon.regex.RESyntaxException
     *          if a marker is present but the regex dialect is not XPath
     */
    private boolean skipReluctanceMarker() throws RESyntaxException {
        if (idx < len && pattern.codePointAt(idx) == '?') {
            if (!isXPath) {
                syntaxError("Reluctant quantifiers are not allowed in XSD");
            }
            idx++;
            return true;
        }
        return false;
    }

    /**
     * Compile one branch of an alternation, that is, a concatenation of pieces.
     * Parsing stops at the end of the pattern, or at an unconsumed {@code |} or {@code )}.
     *
     * @return the operation for the branch; a no-op operation if the branch is empty
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    Operation parseBranch() throws RESyntaxException {
        Operation sequence = null;
        final int[] pieceFlags = new int[1];

        // Collect each (possibly quantified) piece and concatenate it onto the sequence
        while (idx < len) {
            final int next = pattern.codePointAt(idx);
            if (next == '|' || next == ')') {
                break;
            }
            pieceFlags[0] = NODE_NORMAL;
            Operation nextPiece = piece(pieceFlags);
            sequence = (sequence == null) ? nextPiece : makeSequence(sequence, nextPiece);
        }

        // An empty branch is represented by a no-op
        return (sequence == null) ? new OpNothing() : sequence;
    }

    /**
     * Compile an expression consisting of one or more branches separated by {@code |},
     * optionally enclosed in parentheses. Parenthesis matching is done at this level.
     *
     * <p>When called for the top level (the {@code NODE_TOPLEVEL} bit is set on entry) no
     * parentheses are expected and an end-of-program operation is appended to the result.
     * Otherwise, if the input is positioned at {@code (}, the group is either a
     * non-capturing group {@code (?:...)} (XPath 3.0 only) or a capturing group, which is
     * allocated the next group number and wrapped in a capture operation.</p>
     *
     * @param compilerFlags flag value passed by reference; the {@code NODE_TOPLEVEL} bit is
     *                      cleared by this call
     * @return the operation representing the expression
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     */
    private Operation parseExpr(int[] compilerFlags) throws RESyntaxException {
        boolean parenthesized = false;
        boolean capturing = true;
        int group = 0;
        List<Operation> branches = new ArrayList<>();

        // Consume the opening paren unless we were called from the top level (which has no parens)
        if ((compilerFlags[0] & NODE_TOPLEVEL) == 0 && pattern.codePointAt(idx) == '(') {
            parenthesized = true;
            if (idx + 2 < len && pattern.codePointAt(idx + 1) == '?' && pattern.codePointAt(idx + 2) == ':') {
                // "(?:" introduces a non-capturing group
                if (!isXPath30) {
                    syntaxError("Non-capturing groups allowed only in XPath3.0");
                }
                idx += 3;
                capturing = false;
            } else {
                // Capturing group: groups are numbered in order of their opening paren
                idx++;
                group = capturingOpenParenCount++;
            }
        }
        compilerFlags[0] &= ~NODE_TOPLEVEL;

        // First branch, then any further branches introduced by '|'
        branches.add(parseBranch());
        while (idx < len && pattern.codePointAt(idx) == '|') {
            idx++;
            branches.add(parseBranch());
        }

        // A single branch needs no choice operation
        Operation op = branches.size() == 1 ? branches.get(0) : new OpChoice(branches);

        if (parenthesized) {
            // The group must be closed by a matching paren
            if (idx < len && pattern.codePointAt(idx) == ')') {
                idx++;
            } else {
                syntaxError("Missing close paren");
            }
            if (capturing) {
                op = new OpCapture(op, group);
                captures.add(group);
            }
        } else {
            // Unparenthesized expression: terminate it with an end-of-program operation
            op = makeSequence(op, new OpEndProgram());
        }

        return op;
    }

    /**
     * Concatenate two operations into a single sequence.
     *
     * <p>If either operand is already an {@code OpSequence}, its operation list is extended
     * in place and that operand is returned; a new sequence is only allocated when neither
     * operand is a sequence.</p>
     *
     * @param o1 the operation to be matched first
     * @param o2 the operation to be matched second
     * @return a sequence operation that performs {@code o1} followed by {@code o2}
     */
    private static Operation makeSequence(Operation o1, Operation o2) {
        if (o1 instanceof OpSequence) {
            // Extend the existing left-hand sequence
            List<Operation> head = ((OpSequence) o1).getOperations();
            if (o2 instanceof OpSequence) {
                head.addAll(((OpSequence) o2).getOperations());
            } else {
                head.add(o2);
            }
            return o1;
        }
        if (o2 instanceof OpSequence) {
            // Prepend to the existing right-hand sequence
            List<Operation> tail = ((OpSequence) o2).getOperations();
            tail.add(0, o1);
            return o2;
        }
        List<Operation> list = new ArrayList<>(4);
        list.add(o1);
        list.add(o2);
        return trace(new OpSequence(list));
    }

    /**
     * Compiles a regular expression pattern into a program runnable by the pattern
     * matcher class 'RE'.
     *
     * <p>With the 'q' (literal) flag the pattern is not parsed at all: the program simply
     * matches the pattern text as a string. With the 'x' flag, whitespace outside square
     * brackets is removed from the pattern before it is parsed; all subsequent processing,
     * including error detection, works on the stripped pattern.</p>
     *
     * @param pattern Regular expression pattern to compile (see RECompiler class
     *                for details).
     * @return A compiled regular expression program.
     * @throws net.sf.saxon.regex.RESyntaxException
     *          Thrown if the regular expression has invalid syntax.
     * @see RECompiler
     * @see net.sf.saxon.regex.REMatcher
     */
    public REProgram compile(UnicodeString pattern) throws RESyntaxException {
        // Initialize variables for compilation
        this.pattern = pattern;                         // Save pattern in instance variable
        len = this.pattern.length32();                  // Precompute pattern length for speed
        idx = 0;                                        // Set parsing index to the first character
        capturingOpenParenCount = 1;                    // Set paren level to 1 (the implicit outer parens)

        if (reFlags.isLiteral()) {
            // 'q' flag is set: the whole pattern becomes a single string node
            Operation ret = new OpAtom(this.pattern);
            Operation endNode = new OpEndProgram();
            Operation seq = makeSequence(ret, endNode);
            return new REProgram(seq, capturingOpenParenCount, reFlags);
        }

        if (reFlags.isAllowWhitespace()) {
            // 'x' flag is set. Preprocess the expression to strip whitespace, other than between
            // square brackets
            UnicodeBuilder sb = new UnicodeBuilder();
            int nesting = 0;            // depth of unescaped '[' ... ']' nesting
            boolean escaped = false;    // true if the previous retained character was an unescaped backslash
            IntIterator iter = this.pattern.codePoints();
            while (iter.hasNext()) {
                int ch = iter.next();
                if (!escaped && ch == '\\') {
                    escaped = true;
                    sb.append(ch);
                } else if (!escaped && ch == '[') {
                    nesting++;
                    sb.append(ch);
                } else if (!escaped && ch == ']') {
                    nesting--;
                    sb.append(ch);
                } else if (nesting == 0 && Whitespace.isWhite(ch)) {
                    // Dropped. Note that the 'escaped' state is deliberately left unchanged here.
                } else {
                    escaped = false;
                    sb.append(ch);
                }
            }
            this.pattern = sb.toUnicodeString();
            this.len = this.pattern.length32();
        }

        // Initialize pass by reference flags value
        int[] compilerFlags = {NODE_TOPLEVEL};

        // Parse expression
        Operation exp = parseExpr(compilerFlags);

        // Should be at end of input
        if (idx != len) {
            // Inspect the pattern actually parsed (this.pattern): with the 'x' flag the
            // method parameter still refers to the unstripped text, which idx does not index
            if (idx < len && this.pattern.codePointAt(idx) == ')') {
                syntaxError("Unmatched close paren");
            }
            syntaxError("Unexpected input remains");
        }

        REProgram program = new REProgram(exp, capturingOpenParenCount, reFlags);
        if (hasBackReferences) {
            program.optimizationFlags |= REProgram.OPT_HASBACKREFS;
        }
        return program;

    }


    /**
     * Test whether two branches are provably unambiguous, meaning that if one of them matches
     * then the other cannot. This is used for optimization only, so a result of false merely
     * means "not proven"; a result of true must be dependable.
     *
     * <p>The decision is taken as follows: an end-of-program second branch is unambiguous
     * unless the first branch is reluctant; a {@code ^} or {@code $} second branch is always
     * unambiguous; a second branch that is a repeat with a minimum of zero is never treated
     * as unambiguous; otherwise the branches are unambiguous if their initial character
     * classes are disjoint.</p>
     *
     * @param op0 the first branch
     * @param op1 the second branch
     * @param caseBlind true if the "i" flag is in force
     * @param reluctant true if the first branch is a repeat branch with a reluctant quantifier
     * @return true if it can be established that there is no input sequence that will match both instructions
     */

    public static boolean noAmbiguity(Operation op0, Operation op1, boolean caseBlind, boolean reluctant) {
        if (op1 instanceof OpEndProgram) {
            return !reluctant;
        }

        final boolean isAnchor = op1 instanceof OpBOL || op1 instanceof OpEOL;
        if (isAnchor) {
            return true;
        }

        final boolean isOptionalRepeat = op1 instanceof OpRepeat && ((OpRepeat) op1).min == 0;
        if (isOptionalRepeat) {
            return false; //Bug 3429
        }

        // Fall back to comparing the sets of characters with which each branch can start
        return op0.getInitialCharacterClass(caseBlind)
                .isDisjoint(op1.getInitialCharacterClass(caseBlind));
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy