net.sf.saxon.regex.RECompiler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of saxon-he Show documentation
Show all versions of saxon-he Show documentation
An OSGi bundle for Saxon-HE
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2013 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Originally part of Apache's Jakarta project (downloaded January 2012),
* this file has been extensively modified for integration into Saxon by
* Michael Kay, Saxonica.
*/
package net.sf.saxon.regex;
import net.sf.saxon.tree.util.FastStringBuffer;
import net.sf.saxon.value.Whitespace;
import net.sf.saxon.z.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* A regular expression compiler class. This class compiles a pattern string into a
* regular expression program interpretable by the RE evaluator class. The 'recompile'
* command line tool uses this compiler to pre-compile regular expressions for use
* with RE. For a description of the syntax accepted by RECompiler and what you can
* do with regular expressions, see the documentation for the RE matcher class.
*
* @author Jonathan Locke
* @author Michael McCallum
* @version $Id: RECompiler.java 518156 2007-03-14 14:31:26Z vgritsenko $
* @see net.sf.saxon.regex.REMatcher
*/
/*
* Changes made for Saxon:
*
* - handle full Unicode repertoire (esp non-BMP characters) using UnicodeString class for
* both the source string and the regular expression
* - added support for subtraction in a character class
* - in a character range, changed the condition start < end to start <= end
* - removed support for [:POSIX:] construct
* - added support for \p{} and \P{} classes
* - removed support for unsupported escapes: f, x, u, b, octal characters; added i and c
* - changed the handling of hyphens within square brackets, and ^ appearing other than at the start
* - changed the data structure used for the executable so that terms that match a character class
* now reference an IntPredicate that tests for membership of the character in a set
* - added support for reluctant {n,m}? quantifiers
* - allow a quantifier on a nullable expression
* - allow a quantifier on '$' or '^'
* - some constructs (back-references, non-capturing groups, etc) are conditional on which XPath/XSD version
* is in use
* - regular expression flags are now fixed at the time the RE is compiled, this can no longer be deferred
* until the RE is evaluated
* - split() function includes a zero-length string at the end of the returned sequence if the last
* separator is at the end of the string
* - added support for the 'q' and 'x' flags; improved support for the 'i' flag
* - added a method to determine whether there is an anchored match (for XSD use)
* - tests for newline (e.g in multiline mode) now match \n only, as required by the XPath specification
* - reorganised the executable program to use Operation objects rather than integer opcodes
* - introduced optimization for non-backtracking + and * operators (with simple operands)
*/
public class RECompiler {
// The compiled program: Operation nodes addressed by their index in this list. insertNode() adds
// Operation objects, and setNextOfEnd() links them through the relative offset kept in each node's 'next'.
// NOTE(review): declared as a raw ArrayList although callers read instructions.get(i).next directly;
// the element type was presumably Operation - confirm against the original source.
ArrayList instructions = new ArrayList(20);
// Input state for compiling regular expression
UnicodeString pattern; // The regular expression being compiled
int len; // Length of the pattern string
int idx; // Current parse position within the pattern
int parens; // Number to give the next capturing group; compile() starts it at 1, expr() increments it per '('
// Node flags
static final int NODE_NORMAL = 0; // No flags (nothing special)
static final int NODE_NULLABLE = 1; // Set when the node may match without consuming input
static final int NODE_TOPLEVEL = 2; // Set when expr() is invoked for the whole pattern rather than a parenthesized group
// Results of the most recent call to bracket(), i.e. of parsing a {m,n} quantifier
static final int bracketUnbounded = -1; // Value of bracketOpt meaning "no upper bound" ({m,})
int bracketMin; // Minimum number of matches (the 'm' of {m,n})
int bracketOpt; // Additional optional matches (n - m), or bracketUnbounded
// Dialect switches; setFlags() overwrites these from the supplied REFlags
boolean isXPath = true;
boolean isXPath30 = true;
boolean isXSD11 = false;
// Numbers of the capturing groups that have been closed so far; expr() adds to it and
// escape() consults it to validate back-references
IntHashSet captures = new IntHashSet();
// Flags supplied through setFlags(); remains null until that method is called
REFlags reFlags;
// Warning messages, created lazily by warning(); null while there are none
List warnings;
/**
 * Constructor. Creates a compiler whose instruction list is initially empty. The flags
 * must be supplied separately through {@link #setFlags(REFlags)} before compiling, because
 * this constructor leaves {@code reFlags} unset.
 */
public RECompiler() {
}
/**
 * Supply the flags that control compilation. Besides remembering the flags object itself,
 * this caches the three dialect switches (XPath 2.0 extensions, XPath 3.0 extensions,
 * XSD 1.1 syntax) that the parser consults while reading the pattern.
 *
 * @param flags the regular expression flags
 */
public void setFlags(REFlags flags) {
    reFlags = flags;
    this.isXPath = flags.isAllowsXPath20Extensions();
    this.isXPath30 = flags.isAllowsXPath30Extensions();
    this.isXSD11 = flags.isAllowsXSD11Syntax();
}
/**
 * Insert an operation into the compiled program at a given index.
 *
 * @param node     the operation to insert
 * @param insertAt the index in the instruction list at which the operation is placed
 */
private void insertNode(Operation node, int insertAt) {
    this.instructions.add(insertAt, node);
}
/**
 * Record a warning message produced during compilation. The list that holds the
 * warnings is allocated only when the first warning arrives.
 *
 * @param s the warning message
 */
private void warning(String s) {
    if (warnings == null) {
        // Typed list rather than a raw ArrayList: only String messages are ever stored
        warnings = new ArrayList<String>(4);
    }
    warnings.add(s);
}
/**
 * On completion of compilation, get any warnings that were generated.
 *
 * @return the list of warning messages; an empty list (never null) if no warning was recorded
 */
public List<String> getWarnings() {
    if (warnings == null) {
        // Nothing was recorded, so the lazily-created list does not exist yet
        return Collections.emptyList();
    } else {
        return warnings;
    }
}
/**
 * Make the last node of a chain point at a given node.
 *
 * <p>The chain is followed from {@code node} by repeatedly adding each node's relative
 * {@code next} offset until a node with a zero offset is found. That node's offset is then
 * set so that it leads to {@code pointTo}.</p>
 *
 * @param node    Start of node chain to traverse
 * @param pointTo Node to have the tail of the chain point to
 */
void setNextOfEnd(int node, int pointTo) {
    int tail = node;
    int target = pointTo;
    for (int step = instructions.get(tail).next;
         step != 0 && tail < instructions.size();
         step = instructions.get(tail).next) {
        // Michael McCallum's guard against cyclic ("infinite") programs: when the requested
        // target is itself a member of the chain, aim at the end of the program instead.
        // Marked as a hack in the original: reluctant matching is suspected to be the real culprit.
        if (tail == target) {
            target = instructions.size();
        }
        tail += step;
    }
    // If the walk ran off the end of the program there is no node to update
    if (tail < instructions.size()) {
        instructions.get(tail).next = target - tail;
    }
}
/**
 * Report a condition that indicates a defect in the compiler itself rather than in the
 * regular expression being compiled. This method never returns normally.
 *
 * @throws Error always, with the message "Internal error!"
 */
void internalError() throws Error {
    final Error failure = new Error("Internal error!");
    throw failure;
}
/**
 * Report a syntax error in the regular expression. The exception carries the current
 * parse position as well as the message. This method never returns normally.
 *
 * @param s the error message
 * @throws net.sf.saxon.regex.RESyntaxException always
 */
void syntaxError(String s) throws RESyntaxException {
    final int position = idx;
    throw new RESyntaxException(s, position);
}
/**
 * Parse a counted quantifier of the form {m}, {m,} or {m,n} starting at the current
 * position, leaving the outcome in {@code bracketMin} and {@code bracketOpt}:
 * {@code bracketOpt} is 0 for {m}, {@code bracketUnbounded} for {m,}, and n - m for {m,n}.
 * On success the parse position is just after the closing brace.
 *
 * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
 */
void bracket() throws RESyntaxException {
    // We should only ever be called when positioned on the opening brace
    if (idx >= len || pattern.charAt(idx++) != '{') {
        internalError();
    }

    // The lower bound is mandatory and starts with a digit
    if (idx >= len || !isAsciiDigit(pattern.charAt(idx))) {
        syntaxError("Expected digit");
    }
    StringBuilder digits = new StringBuilder();
    for (; idx < len && isAsciiDigit(pattern.charAt(idx)); idx++) {
        digits.append((char) pattern.charAt(idx));
    }
    try {
        bracketMin = Integer.parseInt(digits.toString());
    } catch (NumberFormatException e) {
        syntaxError("Expected valid number");
    }

    // Something must follow the lower bound
    if (idx >= len) {
        syntaxError("Expected comma or right bracket");
    }

    // {m}: exact count, so there are no optional repetitions
    if (pattern.charAt(idx) == '}') {
        idx++;
        bracketOpt = 0;
        return;
    }

    // Otherwise only {m,} or {m,n} remain, so a comma is required
    if (idx >= len || pattern.charAt(idx++) != ',') {
        syntaxError("Expected comma");
    }
    if (idx >= len) {
        syntaxError("Expected comma or right bracket");
    }

    // {m,}: no upper limit
    if (pattern.charAt(idx) == '}') {
        idx++;
        bracketOpt = bracketUnbounded;
        return;
    }

    // {m,n}: read the upper bound
    if (idx >= len || !isAsciiDigit(pattern.charAt(idx))) {
        syntaxError("Expected digit");
    }
    digits.setLength(0);
    for (; idx < len && isAsciiDigit(pattern.charAt(idx)); idx++) {
        digits.append((char) pattern.charAt(idx));
    }
    try {
        int upper = Integer.parseInt(digits.toString());
        bracketOpt = upper - bracketMin;
    } catch (NumberFormatException e) {
        syntaxError("Expected valid number");
    }

    // The upper bound must not be below the lower bound
    if (bracketOpt < 0) {
        syntaxError("Bad range");
    }

    // Finally the closing brace
    if (idx >= len || pattern.charAt(idx++) != '}') {
        syntaxError("Missing close brace");
    }
}
/**
 * Decide whether a character is one of the ten ASCII decimal digits. Digits from other
 * scripts do not qualify.
 *
 * @param ch the character (code point) to be tested
 * @return true if the character is an ASCII digit (0-9)
 */
private static boolean isAsciiDigit(int ch) {
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
/**
 * Parse an escape sequence. On entry the current position must be at the backslash; on a
 * successful return the position has been advanced past the whole escape (including the
 * {...} part of \p{..} and \P{..}, and all digits consumed for a back-reference).
 *
 * <p>Octal escapes are rejected. A digit 1-9 after the backslash is treated as a
 * back-reference, which is accepted only outside square brackets and only when XPath
 * extensions are enabled.</p>
 *
 * @param inSquareBrackets true if the escape appears within a character class expression
 * @return an IntPredicate that matches the character or characters represented
 * by this escape sequence. For a single-character escape this must be an IntValuePredicate.
 * A back-reference is returned as a {@link BackReference}.
 * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
 */
IntPredicate escape(boolean inSquareBrackets) throws RESyntaxException {
    // "Shouldn't" happen
    if (pattern.charAt(idx) != '\\') {
        internalError();
    }

    // Escape shouldn't occur as last character in string!
    if (idx + 1 == len) {
        syntaxError("Escape terminates string");
    }

    // Switch on character after backslash
    idx += 2;
    int escapeChar = pattern.charAt(idx - 1);
    switch (escapeChar) {
        case 'n':
            return new IntValuePredicate('\n');
        case 'r':
            return new IntValuePredicate('\r');
        case 't':
            return new IntValuePredicate('\t');
        case '\\':
        case '|':
        case '.':
        case '-':
        case '^':
        case '?':
        case '*':
        case '+':
        case '{':
        case '}':
        case '(':
        case ')':
        case '[':
        case ']':
            return new IntValuePredicate(escapeChar);
        case '$':
            if (isXPath) {
                return new IntValuePredicate(escapeChar);
            } else {
                syntaxError("In XSD, '$' must not be escaped");
            }
            // not reached in practice: syntaxError() always throws
        case 's':
            return Categories.ESCAPE_s;
        case 'S':
            return Categories.ESCAPE_S;
        case 'i':
            return Categories.ESCAPE_i;
        case 'I':
            return Categories.ESCAPE_I;
        case 'c':
            return Categories.ESCAPE_c;
        case 'C':
            return Categories.ESCAPE_C;
        case 'd':
            return Categories.ESCAPE_d;
        case 'D':
            return Categories.ESCAPE_D;
        case 'w':
            return Categories.ESCAPE_w;
        case 'W':
            return Categories.ESCAPE_W;
        case 'p':
        case 'P':
            // escapeChar is held as an int, so it must be cast before being concatenated into a
            // message; otherwise the text would show its numeric value (e.g. "\112") instead of "\p"
            if (idx == len) {
                syntaxError("Expected '{' after \\" + (char) escapeChar);
            }
            if (pattern.charAt(idx) != '{') {
                syntaxError("Expected '{' after \\" + (char) escapeChar);
            }
            int close = pattern.indexOf('}', idx++);
            if (close == -1) {
                syntaxError("No closing '}' after \\" + (char) escapeChar);
            }
            UnicodeString block = pattern.substring(idx, close);
            if (block.length() == 1 || block.length() == 2) {
                // One or two characters: a general category name such as L or Lu
                IntPredicate primary = Categories.getCategory(block.toString());
                if (primary == null) {
                    syntaxError("Unknown character category " + block.toString());
                }
                idx = close + 1;
                if (escapeChar == 'p') {
                    return primary;
                } else {
                    return makeComplement(primary);
                }
            } else if (block.toString().startsWith("Is")) {
                // "IsXxx": a named Unicode block
                String blockName = block.toString().substring(2);
                IntSet uniBlock = UnicodeBlocks.getBlock(blockName);
                if (uniBlock == null) {
                    // XSD 1.1 says this is not an error, but by default we reject it
                    if (reFlags.isAllowUnknownBlockNames()) {
                        warning("Unknown Unicode block: " + blockName);
                        idx = close + 1;
                        return new IntSetPredicate(IntUniversalSet.getInstance());
                    } else {
                        syntaxError("Unknown Unicode block: " + blockName);
                    }
                }
                idx = close + 1;
                IntPredicate primary = new IntSetPredicate(uniBlock);
                if (escapeChar == 'p') {
                    return primary;
                } else {
                    return makeComplement(primary);
                }
            } else {
                syntaxError("Unknown character category: " + block);
            }
            // not reached in practice: syntaxError() always throws
        case '0':
            syntaxError("Octal escapes not allowed");
            // not reached in practice: syntaxError() always throws
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            if (inSquareBrackets) {
                syntaxError("Backreference not allowed within character class");
            } else if (isXPath) {
                int backRef = (escapeChar - '0');
                // Absorb further digits for as long as the resulting number still names an
                // existing group; a digit that would take it out of range is left in the input
                while (idx < len) {
                    int c1 = "0123456789".indexOf(pattern.charAt(idx));
                    if (c1 < 0) {
                        break;
                    } else {
                        int backRef2 = backRef * 10 + c1;
                        if (backRef2 > (parens - 1)) {
                            break;
                        } else {
                            backRef = backRef2;
                            idx++;
                        }
                    }
                }
                // Only groups that have already been closed may be referenced
                if (!captures.contains(backRef)) {
                    String explanation = (backRef > (parens - 1) ? "(no such group)" : "(group not yet closed)");
                    syntaxError("invalid backreference \\" + backRef + " " + explanation);
                }
                return new BackReference(backRef);
            } else {
                syntaxError("digit not allowed after \\");
            }
            // not reached in practice: syntaxError() always throws
        default:
            // Other characters not allowed in XSD regexes. The character is rendered from its
            // code point so that a non-BMP character is not truncated to 16 bits in the message.
            syntaxError("Escape character '" + new String(Character.toChars(escapeChar)) + "' not allowed");
    }
    return null;
}
/**
 * Represents a back-reference to a captured group. Purely for convenience it is modelled as
 * an IntPredicate, although this is a fiction: the value handed to the superclass is the
 * number of the referenced group, not a character to be matched.
 */
class BackReference extends IntValuePredicate {
    public BackReference(int groupNumber) {
        super(groupNumber);
    }
}
/**
* Compile a character class (in square brackets)
*
* @return an IntPredicate that tests whether a character matches this character class
* @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
*/
IntPredicate parseCharacterClass() throws RESyntaxException {
// Check for bad calling or empty class
if (pattern.charAt(idx) != '[') {
internalError();
}
// Check for unterminated or empty class
if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
syntaxError("Missing ']'");
}
// Parse class declaration
int simpleChar;
boolean positive = true;
boolean definingRange = false;
int rangeStart = -1;
int rangeEnd;
IntRangeSet range = new IntRangeSet();
IntPredicate addend = null;
IntPredicate subtrahend = null;
if (thereFollows("^")) {
if (thereFollows("^-[")) {
syntaxError("Nothing before subtraction operator");
} else if (thereFollows("^]")) {
syntaxError("Empty negative character group");
} else {
positive = false;
idx++;
}
} else if (thereFollows("-[")) {
syntaxError("Nothing before subtraction operator");
}
while (idx < len && pattern.charAt(idx) != ']') {
int ch = pattern.charAt(idx);
simpleChar = -1;
switch (ch) {
case '[':
syntaxError("Unescaped '[' within square brackets");
break;
case '\\': {
// Escape always advances the stream
IntPredicate cc = escape(true);
if (cc instanceof IntValuePredicate) {
simpleChar = ((IntValuePredicate) cc).getTarget();
break;
} else {
if (definingRange) {
syntaxError("Multi-character escape cannot follow '-'");
} else if (addend == null) {
addend = cc;
} else {
addend = makeUnion(addend, cc);
}
continue;
}
}
case '-':
if (thereFollows("-[")) {
idx++;
subtrahend = parseCharacterClass();
if (!thereFollows("]")) {
syntaxError("Expected closing ']' after subtraction");
}
} else if (thereFollows("-]")) {
simpleChar = '-';
idx++;
} else if (rangeStart >= 0) {
definingRange = true;
idx++;
continue;
} else if (definingRange) {
syntaxError("Bad range");
} else if (thereFollows("--") && !thereFollows("--[")) {
syntaxError("Unescaped hyphen as start of range");
} else if (!isXSD11 && pattern.charAt(idx-1) != '[' && pattern.charAt(idx-1) != '^' && !thereFollows("]") && !thereFollows("-[")) {
syntaxError("In XSD 1.0, hyphen is allowed only at the beginning or end of a positive character group");
} else {
simpleChar = '-';
idx++;
}
break;
default:
simpleChar = ch;
idx++;
break;
}
// Handle simple character simpleChar
if (definingRange) {
// if we are defining a range make it now
rangeEnd = simpleChar;
// Actually create a range if the range is ok
if (rangeStart > rangeEnd) {
syntaxError("Bad character range: start > end");
// TODO: not an error in XSD, merely a no-op?
}
range.addRange(rangeStart, rangeEnd);
if (reFlags.isCaseIndependent()) {
// Special-case A-Z and a-z
if (rangeStart == 'a' && rangeEnd == 'z') {
range.addRange('A', 'Z');
for (int v=0; v= len) {
return ret;
}
boolean greedy = true;
int quantifierType = pattern.charAt(idx);
switch (quantifierType) {
case '?':
case '*':
// The current node can be null
flags[0] |= NODE_NULLABLE;
// Drop through
case '+':
// Eat quantifier character
idx++;
// Drop through
case '{':
if (quantifierType == '{') {
bracket();
if (bracketMin == 0) {
flags[0] |= NODE_NULLABLE;
}
}
Operation op = instructions.get(ret);
if (op instanceof Operation.OpBOL || op instanceof Operation.OpEOL) {
// Pretty meaningless, but legal. If the quantifier allows zero occurrences, ignore the instruction.
// Otherwise, ignore the quantifier
if (quantifierType == '?' || quantifierType == '*' ||
(quantifierType == '{' && bracketMin == 0)) {
instructions.set(ret, new Operation.OpNothing());
} else {
quantifierType = 0;
}
}
if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
if (quantifierType == '?') {
// can ignore the quantifier
quantifierType = 0;
} else if (quantifierType == '+') {
// '*' and '+' are equivalent
quantifierType = '*';
} else if (quantifierType == '{') {
// bounds are meaningless
quantifierType = '*';
}
}
}
// If the next character is a '?', make the quantifier non-greedy (reluctant)
if (idx < len && pattern.charAt(idx) == '?') {
if (!isXPath) {
syntaxError("Reluctant quantifiers are not allowed in XSD");
}
idx++;
greedy = false;
}
if (greedy) {
// Actually do the quantifier now
switch (quantifierType) {
case '{': {
//bracket();
int bracketEnd = idx;
int bracketMin = this.bracketMin;
int bracketOpt = this.bracketOpt;
// Pointer to the last terminal
int pos = ret;
// Process min first
for (int c = 0; c < bracketMin; c++) {
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
setNextOfEnd(pos, pos = terminal(terminalFlags));
}
// Do the right thing for maximum ({m,})
if (bracketOpt == bracketUnbounded) {
// Drop through now and quantifier expression.
// We are done with the {m,} expr, so skip rest
idx = bracketEnd;
Operation.OpStar op = new Operation.OpStar();
insertNode(op, pos);
setNextOfEnd(pos + 1, pos);
break;
} else if (bracketOpt > 0) {
int opt[] = new int[bracketOpt + 1];
// Surround first optional terminal with MAYBE
Operation.OpMaybe op = new Operation.OpMaybe();
insertNode(op, pos);
opt[0] = pos;
// Add all the rest optional terminals with preceding MAYBEs
for (int c = 1; c < bracketOpt; c++) {
op = new Operation.OpMaybe();
opt[c] = appendNode(op);
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
terminal(terminalFlags);
}
// Tie ends together
int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
for (int c = 0; c < bracketOpt; c++) {
setNextOfEnd(opt[c], end);
setNextOfEnd(opt[c] + 1, opt[c + 1]);
}
} else {
// Rollback terminal - no opt matchers present
//lenInstruction = pos;
while (instructions.size() > pos) {
instructions.remove(instructions.size()-1);
}
Operation.OpNothing nothing = new Operation.OpNothing();
appendNode(nothing);
}
// We are done. skip the reminder of {m,n} expr
idx = bracketEnd;
break;
}
case '?': {
Operation.OpMaybe maybe = new Operation.OpMaybe();
insertNode(maybe, ret);
Operation.OpNothing nothing = new Operation.OpNothing();
int n = appendNode(nothing);
setNextOfEnd(ret, n);
setNextOfEnd(ret + 1, n);
break;
}
case '*': {
Operation.OpStar star = new Operation.OpStar();
insertNode(star, ret);
setNextOfEnd(ret + 1, ret);
break;
}
case '+': {
Operation.OpContinue continu = new Operation.OpContinue();
insertNode(continu, ret);
Operation.OpPlus plus = new Operation.OpPlus();
int n = appendNode(plus);
setNextOfEnd(ret + 1, n);
setNextOfEnd(n, ret);
break;
}
}
} else {
// Not greedy (reluctant): Actually do the quantifier now
switch (quantifierType) {
case '?': {
Operation.OpReluctantMaybe reluctantMaybe = new Operation.OpReluctantMaybe();
insertNode(reluctantMaybe, ret);
//nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
int n = appendNode(new Operation.OpNothing());
//int n = node(RE.OP_NOTHING, 0);
setNextOfEnd(ret, n);
setNextOfEnd(ret + 1, n);
break;
}
case '*': {
Operation.OpReluctantStar reluctantStar = new Operation.OpReluctantStar();
insertNode(reluctantStar, ret);
setNextOfEnd(ret + 1, ret);
break;
}
case '+': {
insertNode(new Operation.OpContinue(), ret);
//nodeInsert(RE.OP_CONTINUE, 0, ret);
int n = appendNode(new Operation.OpReluctantPlus());
//int n = node(RE.OP_RELUCTANTPLUS, 0);
setNextOfEnd(n, ret);
setNextOfEnd(ret + 1, n);
break;
}
case '{': {
// reluctant {..}? - added by MHK
//bracket();
int bracketEnd = idx;
int bracketMin = this.bracketMin;
int bracketOpt = this.bracketOpt;
// Pointer to the last terminal
int pos = ret;
// Process min first
for (int c = 0; c < bracketMin; c++) {
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
setNextOfEnd(pos, pos = terminal(terminalFlags));
}
// Do the right thing for maximum ({m,})
if (bracketOpt == bracketUnbounded) {
// Drop through now and quantifier expression.
// We are done with the {m,} expr, so skip rest
idx = bracketEnd;
insertNode(new Operation.OpReluctantStar(), pos);
//nodeInsert(RE.OP_RELUCTANTSTAR, 0, pos);
setNextOfEnd(pos + 1, pos);
break;
} else if (bracketOpt > 0) {
int opt[] = new int[bracketOpt + 1];
// Surround first optional terminal with MAYBE
insertNode(new Operation.OpReluctantMaybe(), pos);
//nodeInsert(RE.OP_RELUCTANTMAYBE, 0, pos);
opt[0] = pos;
// Add all the rest optional terminals with preceeding MAYBEs
for (int c = 1; c < bracketOpt; c++) {
opt[c] = appendNode(new Operation.OpReluctantMaybe());
//opt[c] = node(RE.OP_RELUCTANTMAYBE, 0);
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
terminal(terminalFlags);
}
// Tie ends together
int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
for (int c = 0; c < bracketOpt; c++) {
setNextOfEnd(opt[c], end);
setNextOfEnd(opt[c] + 1, opt[c + 1]);
}
} else {
// Rollback terminal - no opt matchers present
while (instructions.size() > pos) {
instructions.remove(instructions.size() - 1);
}
appendNode(new Operation.OpNothing());
}
// We are done. skip the reminder of {m,n} expr
idx = bracketEnd;
break;
}
}
}
return ret;
}
/**
 * Compile the body of one alternative of an "or" expression, that is, a sequence of
 * (possibly quantified) pieces up to the next '|' or ')' or the end of the pattern.
 * The pieces are chained together in order (this implements concatenation).
 *
 * @param compilerFlags Flags passed by reference; NODE_NULLABLE is added when no piece
 *                      in the branch reported plain NODE_NORMAL flags (including when the
 *                      branch is empty)
 * @return Pointer to first node in the branch
 * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
 */
int branch(int[] compilerFlags) throws RESyntaxException {
    final int[] pieceFlags = new int[1];
    int first = -1;
    int previous = -1;
    boolean allNullable = true;

    while (idx < len) {
        int c = pattern.charAt(idx);
        if (c == '|' || c == ')') {
            // End of this alternative
            break;
        }

        // Compile the next possibly-quantified piece
        pieceFlags[0] = NODE_NORMAL;
        int current = piece(pieceFlags);
        if (pieceFlags[0] == NODE_NORMAL) {
            allNullable = false;
        }

        // Hang the new piece on the end of what we have so far
        if (previous != -1) {
            setNextOfEnd(previous, current);
        }
        previous = current;

        if (first == -1) {
            first = current;
        }
    }

    // An empty branch is represented by a single do-nothing node
    if (first == -1) {
        first = appendNode(new Operation.OpNothing());
    }

    if (allNullable) {
        compilerFlags[0] |= NODE_NULLABLE;
    }
    return first;
}
/**
 * Compile an expression with possible parens around it. Paren matching
 * is done at this level so we can tie the branch tails together.
 *
 * <p>When NODE_TOPLEVEL is not set in the flags and the current character is '(', the
 * group is opened either as a capturing group or, for "(?:", as a non-capturing cluster.
 * The alternatives separated by '|' are then compiled, a closing node (close-group,
 * close-cluster or end-of-program) is appended, and the tail of every alternative is
 * linked to that closing node.</p>
 *
 * @param compilerFlags Flag value passed by reference; NODE_TOPLEVEL is cleared by this method
 * @return Node index of expression in instruction array
 * @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
 */
int expr(int[] compilerFlags) throws RESyntaxException {
// Create open paren node unless we were called from the top level (which has no parens)
// paren: -1 = no enclosing parens, 1 = capturing group, 2 = non-capturing cluster
int paren = -1;
int ret = -1;
// Remember the group number before it is incremented below; this is the number used
// for the matching close node
int closeParens = parens;
if ((compilerFlags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
// if its a cluster ( rather than a proper subexpression ie with backrefs )
if (idx + 2 < len && pattern.charAt(idx + 1) == '?' && pattern.charAt(idx + 2) == ':') {
if (!isXPath30) {
syntaxError("Non-capturing groups allowed only in XPath3.0");
}
paren = 2;
// skip over "(?:"
idx += 3;
ret = appendNode(new Operation.OpOpenCluster());
} else {
paren = 1;
idx++;
ret = appendNode(new Operation.OpOpen(parens++));
}
}
// Clear the top-level flag in the shared flags value for everything compiled from here on
compilerFlags[0] &= ~NODE_TOPLEVEL;
// Process contents of first branch node
boolean open = false;
int branch = branch(compilerFlags);
if (ret == -1) {
ret = branch;
} else {
setNextOfEnd(ret, branch);
}
// Loop through branches
while (idx < len && pattern.charAt(idx) == '|') {
// Now open the first branch since there are more than one
if (!open) {
Operation.OpBranch op = new Operation.OpBranch();
insertNode(op, branch);
open = true;
}
idx++;
// Arguments are evaluated left to right: the first argument is the index of the previous
// branch node, and only then is 'branch' reassigned to the newly appended branch node
setNextOfEnd(branch, branch = appendNode(new Operation.OpBranch()));
// Compile the alternative itself; its start index is not needed here
branch(compilerFlags);
}
// Create an ending node (either a close paren or an OP_END)
int end;
if (paren > 0) {
if (idx < len && pattern.charAt(idx) == ')') {
idx++;
} else {
syntaxError("Missing close paren");
}
if (paren == 1) {
end = appendNode(new Operation.OpClose(closeParens));
// Record the group as closed; escape() only accepts back-references to groups in this set
captures.add(closeParens);
} else {
end = appendNode(new Operation.OpCloseCluster());
}
} else {
end = appendNode(new Operation.OpEndProgram());
}
// Append the ending node to the ret nodelist
setNextOfEnd(ret, end);
// Hook the ends of each branch to the end node
int currentNode = ret;
int nextNodeOffset = instructions.get(currentNode).next;
// Follow the chain from ret for as long as the current node has a non-zero next offset
// and we are still inside the program
while (nextNodeOffset != 0 && currentNode < instructions.size()) {
// If branch, make the end of the branch's operand chain point to the end node.
if (instructions.get(currentNode) instanceof Operation.OpBranch) {
setNextOfEnd(currentNode + 1, end);
}
nextNodeOffset = instructions.get(currentNode).next;
currentNode += nextNodeOffset;
}
// Return the node list
return ret;
}
/**
* Compiles a regular expression pattern into a program runnable by the pattern
* matcher class 'RE'.
*
* @param pattern Regular expression pattern to compile (see RECompiler class
* for details).
* @return A compiled regular expression program.
* @throws net.sf.saxon.regex.RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see net.sf.saxon.regex.REMatcher
*/
public REProgram compile(UnicodeString pattern) throws RESyntaxException {
// Initialize variables for compilation
this.pattern = pattern; // Save pattern in instance variable
len = pattern.length(); // Precompute pattern length for speed
idx = 0; // Set parsing index to the first character
parens = 1; // Set paren level to 1 (the implicit outer parens)
boolean nullable = false;
if (reFlags.isLiteral()) {
// 'q' flag is set
int ret = literalAtom();
Operation.OpEndProgram endNode = new Operation.OpEndProgram();
int end = appendNode(endNode);
setNextOfEnd(ret, end);
} else {
if (reFlags.isAllowWhitespace()) {
// 'x' flag is set. Preprocess the expression to strip whitespace, other than between
// square brackets
FastStringBuffer sb = new FastStringBuffer(pattern.length());
int nesting = 0;
boolean astral = false;
boolean escaped = false;
for (int i=0; i 65535) {
astral = true;
}
if (ch == '\\' && !escaped) {
escaped = true;
sb.appendWideChar(ch);
} else if (ch == '[' && !escaped) {
nesting++;
escaped = false;
sb.appendWideChar(ch);
} else if (ch == ']' && !escaped) {
nesting--;
escaped = false;
sb.appendWideChar(ch);
} else if (nesting==0 && Whitespace.isWhitespace(ch)) {
// no action
} else {
escaped = false;
sb.appendWideChar(ch);
}
}
if (astral) {
pattern = new GeneralUnicodeString(sb);
} else {
pattern = new BMPString(sb);
}
this.pattern = pattern;
this.len = pattern.length();
}
// Initialize pass by reference flags value
int[] compilerFlags = {NODE_TOPLEVEL};
// Parse expression
expr(compilerFlags);
nullable = (compilerFlags[0] & NODE_NULLABLE) != 0;
// Should be at end of input
if (idx != len) {
if (pattern.charAt(idx) == ')') {
syntaxError("Unmatched close paren");
}
syntaxError("Unexpected input remains");
}
}
// Return the result
Operation[] ops = new Operation[instructions.size()];
for (int i=0; i