org.mozilla.javascript.regexp.NativeRegExp Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of js Show documentation
Show all versions of js Show documentation
Rhino is an open-source implementation of JavaScript written entirely in Java. It is typically embedded into Java applications to provide scripting to end users.
The newest version!
/* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Rhino code, released
* May 6, 1998.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1997-1999
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Norris Boyd
* Igor Bukanov
* Brendan Eich
* Matthias Radestock
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License Version 2 or later (the "GPL"), in which
* case the provisions of the GPL are applicable instead of those above. If
* you wish to allow use of your version of this file only under the terms of
* the GPL and not to allow others to use your version of this file under the
* MPL, indicate your decision by deleting the provisions above and replacing
* them with the notice and other provisions required by the GPL. If you do
* not delete the provisions above, a recipient may use your version of this
* file under either the MPL or the GPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.javascript.regexp;
import java.io.Serializable;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.IdFunctionObject;
import org.mozilla.javascript.IdScriptableObject;
import org.mozilla.javascript.Kit;
import org.mozilla.javascript.ScriptRuntime;
import org.mozilla.javascript.Scriptable;
import org.mozilla.javascript.ScriptableObject;
import org.mozilla.javascript.Undefined;
/**
* This class implements the RegExp native object.
*
* Revision History:
* Implementation in C by Brendan Eich
* Initial port to Java by Norris Boyd from jsregexp.c version 1.36
* Merged up to version 1.38, which included Unicode support.
* Merged bug fixes in version 1.39.
* Merged JSFUN13_BRANCH changes up to 1.32.2.13
*
* @author Brendan Eich
* @author Norris Boyd
*/
public class NativeRegExp extends IdScriptableObject implements Function
{
// Serialization version identifier for this class.
static final long serialVersionUID = 4965263491464903264L;
// NOTE(review): no use of this tag is visible in this part of the file;
// presumably it identifies this class's id-functions - confirm.
private static final Object REGEXP_TAG = new Object();
// Bit flags that compileRE ORs together into RECompiled.flags.
public static final int JSREG_GLOB = 0x1; // 'g' flag: global
public static final int JSREG_FOLD = 0x2; // 'i' flag: fold
public static final int JSREG_MULTILINE = 0x4; // 'm' flag: multiline
//type of match to perform
public static final int TEST = 0;
public static final int MATCH = 1;
public static final int PREFIX = 2;
// Compile-time switch: when true, compileRE prints the flat source, the
// emitted program bytes and the anchor character to System.out.
private static final boolean debug = false;
/*
 * Opcodes of the compiled regexp program. The parser stores them in
 * RENode.op and emitREBytecode writes them into RECompiled.program.
 * Commented-out entries are opcodes that are not used here.
 * NOTE(review): the gaps are presumably kept so the numbering stays aligned
 * with the C implementation this was ported from - confirm before renumbering.
 */
private static final byte REOP_EMPTY = 0; /* match rest of input against rest of r.e. */
private static final byte REOP_ALT = 1; /* alternative subexpressions in kid and next */
private static final byte REOP_BOL = 2; /* beginning of input (or line if multiline) */
private static final byte REOP_EOL = 3; /* end of input (or line if multiline) */
private static final byte REOP_WBDRY = 4; /* match "" at word boundary */
private static final byte REOP_WNONBDRY = 5; /* match "" at word non-boundary */
private static final byte REOP_QUANT = 6; /* quantified atom: atom{1,2} */
private static final byte REOP_STAR = 7; /* zero or more occurrences of kid */
private static final byte REOP_PLUS = 8; /* one or more occurrences of kid */
private static final byte REOP_OPT = 9; /* optional subexpression in kid */
private static final byte REOP_LPAREN = 10; /* left paren bytecode: kid is u.num'th sub-regexp */
private static final byte REOP_RPAREN = 11; /* right paren bytecode */
private static final byte REOP_DOT = 12; /* stands for any character */
// private static final byte REOP_CCLASS = 13; /* character class: [a-f] */
private static final byte REOP_DIGIT = 14; /* match a digit char: [0-9] */
private static final byte REOP_NONDIGIT = 15; /* match a non-digit char: [^0-9] */
private static final byte REOP_ALNUM = 16; /* match an alphanumeric char: [0-9a-z_A-Z] */
private static final byte REOP_NONALNUM = 17; /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
private static final byte REOP_SPACE = 18; /* match a whitespace char */
private static final byte REOP_NONSPACE = 19; /* match a non-whitespace char */
private static final byte REOP_BACKREF = 20; /* back-reference (e.g., \1) to a parenthetical */
private static final byte REOP_FLAT = 21; /* match a flat string */
private static final byte REOP_FLAT1 = 22; /* match a single char */
private static final byte REOP_JUMP = 23; /* for deoptimized closure loops */
// private static final byte REOP_DOTSTAR = 24; /* optimize .* to use a single opcode */
// private static final byte REOP_ANCHOR = 25; /* like .* but skips left context to unanchored r.e. */
// private static final byte REOP_EOLONLY = 26; /* $ not preceded by any pattern */
// private static final byte REOP_UCFLAT = 27; /* flat Unicode string; len immediate counts chars */
private static final byte REOP_UCFLAT1 = 28; /* single Unicode char */
// private static final byte REOP_UCCLASS = 29; /* Unicode character class, vector of chars to match */
// private static final byte REOP_NUCCLASS = 30; /* negated Unicode character class */
// private static final byte REOP_BACKREFi = 31; /* case-independent REOP_BACKREF */
private static final byte REOP_FLATi = 32; /* case-independent REOP_FLAT */
private static final byte REOP_FLAT1i = 33; /* case-independent REOP_FLAT1 */
// private static final byte REOP_UCFLATi = 34; /* case-independent REOP_UCFLAT */
private static final byte REOP_UCFLAT1i = 35; /* case-independent REOP_UCFLAT1 */
// private static final byte REOP_ANCHOR1 = 36; /* first-char discriminating REOP_ANCHOR */
// private static final byte REOP_NCCLASS = 37; /* negated 8-bit character class */
// private static final byte REOP_DOTSTARMIN = 38; /* ungreedy version of REOP_DOTSTAR */
// private static final byte REOP_LPARENNON = 39; /* non-capturing version of REOP_LPAREN */
// private static final byte REOP_RPARENNON = 40; /* non-capturing version of REOP_RPAREN */
private static final byte REOP_ASSERT = 41; /* zero width positive lookahead assertion */
private static final byte REOP_ASSERT_NOT = 42; /* zero width negative lookahead assertion */
private static final byte REOP_ASSERTTEST = 43; /* sentinel at end of assertion child */
private static final byte REOP_ASSERTNOTTEST = 44; /* sentinel at end of !assertion child */
private static final byte REOP_MINIMALSTAR = 45; /* non-greedy version of * */
private static final byte REOP_MINIMALPLUS = 46; /* non-greedy version of + */
private static final byte REOP_MINIMALOPT = 47; /* non-greedy version of ? */
private static final byte REOP_MINIMALQUANT = 48; /* non-greedy version of {} */
private static final byte REOP_ENDCHILD = 49; /* sentinel at end of quantifier child */
private static final byte REOP_CLASS = 50; /* character class with index */
private static final byte REOP_REPEAT = 51; /* directs execution of greedy quantifier */
private static final byte REOP_MINIMALREPEAT = 52; /* directs execution of non-greedy quantifier */
private static final byte REOP_END = 53;
/**
 * Creates the RegExp prototype and constructor objects and defines the
 * constructor in {@code scope} under the name "RegExp".
 *
 * @param cx     current context, used to compile the prototype's empty pattern
 * @param scope  scope that receives the "RegExp" property
 * @param sealed when true, both the prototype and the constructor are sealed
 */
public static void init(Context cx, Scriptable scope, boolean sealed)
{
    // The prototype is itself a RegExp object compiled from the empty pattern.
    NativeRegExp regexpProto = new NativeRegExp();
    regexpProto.re = (RECompiled)compileRE(cx, "", null, false);
    regexpProto.activatePrototypeMap(MAX_PROTOTYPE_ID);
    regexpProto.setParentScope(scope);
    regexpProto.setPrototype(getObjectPrototype(scope));

    NativeRegExpCtor regexpCtor = new NativeRegExpCtor();
    // Bug #324006: ECMA-262 15.10.6.1 requires RegExp.prototype.constructor
    // to initially be the builtin RegExp constructor.
    regexpProto.put("constructor", regexpProto, regexpCtor);

    ScriptRuntime.setFunctionProtoAndParent(regexpCtor, scope);
    regexpCtor.setImmunePrototypeProperty(regexpProto);

    if (sealed) {
        regexpProto.sealObject();
        regexpCtor.sealObject();
    }

    defineProperty(scope, "RegExp", regexpCtor, ScriptableObject.DONTENUM);
}
/**
 * Wraps an already compiled regular expression.
 *
 * @param scope          scope passed to ScriptRuntime.setObjectProtoAndParent
 * @param regexpCompiled compiled form; must be an RECompiled instance
 */
NativeRegExp(Scriptable scope, Object regexpCompiled)
{
    RECompiled compiled = (RECompiled)regexpCompiled;
    this.lastIndex = 0;
    this.re = compiled;
    ScriptRuntime.setObjectProtoAndParent(this, scope);
}
/**
 * @return the class name of RegExp objects, always "RegExp"
 */
@Override
public String getClassName()
{
    final String className = "RegExp";
    return className;
}
/**
 * Calling a RegExp object as a function runs a MATCH-type execution of it
 * against the arguments; {@code thisObj} is not consulted.
 */
public Object call(Context cx, Scriptable scope, Scriptable thisObj,
                   Object[] args)
{
    Object matchResult = execSub(cx, scope, args, MATCH);
    return matchResult;
}
/**
 * Using a RegExp object as a constructor runs a MATCH-type execution of it
 * and returns the result cast to Scriptable.
 */
public Scriptable construct(Context cx, Scriptable scope, Object[] args)
{
    Object matchResult = execSub(cx, scope, args, MATCH);
    return (Scriptable)matchResult;
}
/**
 * Re-initialises this object, either by copying another RegExp
 * (args[0] is a NativeRegExp) or by compiling args[0] as pattern source with
 * optional flags in args[1].
 *
 * @return this object
 */
Scriptable compile(Context cx, Scriptable scope, Object[] args)
{
    final boolean flagsGiven = args.length > 1 && args[1] != Undefined.instance;

    if (args.length > 0 && args[0] instanceof NativeRegExp) {
        if (flagsGiven) {
            // Flags may not be combined with an existing RegExp object.
            throw ScriptRuntime.typeError0("msg.bad.regexp.compile");
        }
        NativeRegExp other = (NativeRegExp) args[0];
        this.re = other.re;
        this.lastIndex = other.lastIndex;
        return this;
    }

    String patternSource;
    if (args.length == 0) {
        patternSource = "";
    } else {
        patternSource = ScriptRuntime.toString(args[0]);
    }
    String flagString = null;
    if (flagsGiven) {
        flagString = ScriptRuntime.toString(args[1]);
    }

    this.re = (RECompiled)compileRE(cx, patternSource, flagString, false);
    this.lastIndex = 0;
    return this;
}
/**
 * Renders the expression as a regexp literal: the source between slashes
 * followed by the flag letters g, i, m, in that order.
 */
@Override
public String toString()
{
    StringBuilder literal = new StringBuilder();
    literal.append('/');
    if (re.source.length == 0) {
        // See bugzilla 226045: an empty source is shown as an empty group.
        literal.append("(?:)");
    } else {
        literal.append(re.source);
    }
    literal.append('/');

    final int flags = re.flags;
    if ((flags & JSREG_GLOB) != 0) {
        literal.append('g');
    }
    if ((flags & JSREG_FOLD) != 0) {
        literal.append('i');
    }
    if ((flags & JSREG_MULTILINE) != 0) {
        literal.append('m');
    }
    return literal.toString();
}
/** Creates an uninitialised instance; init() uses this to build the prototype. */
NativeRegExp()
{
}
/** Returns the context's regexp proxy, cast to RegExpImpl. */
private static RegExpImpl getImpl(Context cx)
{
    Object proxy = ScriptRuntime.getRegExpProxy(cx);
    return (RegExpImpl) proxy;
}
/**
 * Runs this expression against args[0], or against the RegExpImpl's stored
 * input when no argument is given, honouring and updating lastIndex when the
 * global flag is set.
 *
 * @return the value produced by executeRegExp, or null when the start index
 *         lies outside the input string
 */
private Object execSub(Context cx, Scriptable scopeObj,
                       Object[] args, int matchType)
{
    RegExpImpl impl = getImpl(cx);

    String input;
    if (args.length != 0) {
        input = ScriptRuntime.toString(args[0]);
    } else {
        input = impl.input;
        if (input == null) {
            reportError("msg.no.re.input.for", toString());
        }
    }

    // Only a global expression resumes from lastIndex.
    double startIndex = ((re.flags & JSREG_GLOB) != 0) ? lastIndex : 0;
    if (startIndex < 0 || input.length() < startIndex) {
        lastIndex = 0;
        return null;
    }

    int[] position = { (int)startIndex };
    Object outcome = executeRegExp(cx, scopeObj, impl, input, position, matchType);
    if ((re.flags & JSREG_GLOB) != 0) {
        boolean failed = (outcome == null || outcome == Undefined.instance);
        lastIndex = failed ? 0 : position[0];
    }
    return outcome;
}
/**
 * Compiles pattern source into an RECompiled program.
 *
 * @param cx     current context, handed to the CompilerState
 * @param str    pattern source
 * @param global flag letters ('g', 'i', 'm'), or null for none; any other
 *               letter is reported through reportError
 * @param flat   when true and the source is non-empty, the whole source is
 *               treated as one literal instead of being parsed
 * @return the RECompiled object, or null when parsing fails
 */
static Object compileRE(Context cx, String str, String global, boolean flat)
{
    RECompiled regexp = new RECompiled();
    regexp.source = str.toCharArray();
    int length = str.length();

    int flags = 0;
    int flagCount = (global == null) ? 0 : global.length();
    for (int i = 0; i < flagCount; i++) {
        char flag = global.charAt(i);
        switch (flag) {
        case 'g':
            flags |= JSREG_GLOB;
            break;
        case 'i':
            flags |= JSREG_FOLD;
            break;
        case 'm':
            flags |= JSREG_MULTILINE;
            break;
        default:
            reportError("msg.invalid.re.flag", String.valueOf(flag));
            break;
        }
    }
    regexp.flags = flags;

    CompilerState state = new CompilerState(cx, regexp.source, length, flags);
    if (flat && length > 0) {
        if (debug) {
            System.out.println("flat = \"" + str + "\"");
        }
        RENode literal = new RENode(REOP_FLAT);
        literal.chr = state.cpbegin[0];
        literal.length = length;
        literal.flatIndex = 0;
        state.result = literal;
        state.progLength += 5;
    } else if (!parseDisjunction(state)) {
        return null;
    }

    // One extra byte for the trailing REOP_END.
    regexp.program = new byte[state.progLength + 1];
    if (state.classCount != 0) {
        regexp.classList = new RECharSet[state.classCount];
        regexp.classCount = state.classCount;
    }
    int endPC = emitREBytecode(state, regexp, 0, state.result);
    regexp.program[endPC++] = REOP_END;

    if (debug) {
        System.out.println("Prog. length = " + endPC);
        for (int i = 0; i < endPC; i++) {
            System.out.print(regexp.program[i]);
            if (i < (endPC - 1)) System.out.print(", ");
        }
        System.out.println();
    }
    regexp.parenCount = state.parenCount;

    // If the program starts with a literal, record its first character.
    byte firstOp = regexp.program[0];
    switch (firstOp) {
    case REOP_UCFLAT1:
    case REOP_UCFLAT1i:
        regexp.anchorCh = (char)getIndex(regexp.program, 1);
        break;
    case REOP_FLAT1:
    case REOP_FLAT1i:
        regexp.anchorCh = (char)(regexp.program[1] & 0xFF);
        break;
    case REOP_FLAT:
    case REOP_FLATi: {
        int sourceIndex = getIndex(regexp.program, 1);
        regexp.anchorCh = regexp.source[sourceIndex];
        break;
    }
    }

    if (debug) {
        if (regexp.anchorCh >= 0) {
            System.out.println("Anchor ch = '" + (char)regexp.anchorCh + "'");
        }
    }
    return regexp;
}
/** Returns true only for the ASCII decimal digits '0' through '9'. */
static boolean isDigit(char c)
{
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/**
 * Returns true when {@code c} is a word character in the ECMA-262 sense,
 * i.e. one of a-z, A-Z, 0-9 or '_'.
 *
 * Character.isLetter is deliberately not used: it accepts every Unicode
 * letter, which is a wider set than the specification allows.
 */
private static boolean isWord(char c)
{
    return ('a' <= c && c <= 'z')
        || ('A' <= c && c <= 'Z')
        || ('0' <= c && c <= '9')
        || c == '_';
}
/** Delegates to ScriptRuntime.isJSLineTerminator to classify {@code c}. */
private static boolean isLineTerm(char c)
{
    boolean terminator = ScriptRuntime.isJSLineTerminator(c);
    return terminator;
}
/**
 * Returns true when {@code c} is one of the explicitly listed white space
 * and line terminator code points, or when its low 16 bits name a character
 * of Unicode category SPACE_SEPARATOR.
 */
private static boolean isREWhiteSpace(int c)
{
    switch (c) {
    case 0x0020: // space
    case 0x0009: // tab
    case 0x000A: // line feed
    case 0x000D: // carriage return
    case 0x2028: // line separator
    case 0x2029: // paragraph separator
    case 0x000C: // form feed
    case 0x000B: // vertical tab
    case 0x00A0: // no-break space
        return true;
    default:
        return Character.getType((char)c) == Character.SPACE_SEPARATOR;
    }
}
/*
 * Upper-cases a single character for case-insensitive matching:
 *
 * 1. ASCII 'a'..'z' map to 'A'..'Z'; every other ASCII character is
 *    returned unchanged.
 * 2. For a non-ASCII character, let cu be Character.toUpperCase(ch).
 * 3. If cu is an ASCII character, return ch, so that a non-ASCII character
 *    never folds onto an ASCII one.
 * 4. Otherwise return cu.
 */
private static char upcase(char ch)
{
    if (ch >= 128) {
        char upper = Character.toUpperCase(ch);
        return (upper < 128) ? ch : upper;
    }
    boolean asciiLower = ('a' <= ch && ch <= 'z');
    return asciiLower ? (char)(ch + ('A' - 'a')) : ch;
}
/*
 * Lower-cases a single character. ASCII 'A'..'Z' map to 'a'..'z' and other
 * ASCII characters are returned unchanged. A non-ASCII character is mapped
 * with Character.toLowerCase unless the result would be an ASCII character,
 * in which case the original character is returned.
 */
private static char downcase(char ch)
{
    if (ch >= 128) {
        char lower = Character.toLowerCase(ch);
        return (lower < 128) ? ch : lower;
    }
    boolean asciiUpper = ('A' <= ch && ch <= 'Z');
    return asciiUpper ? (char)(ch + ('a' - 'A')) : ch;
}
/*
 * Converts an ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
 * numeric value; returns -1 for any other input.
 */
private static int toASCIIHexDigit(int c)
{
    if ('0' <= c && c <= '9') {
        return c - '0';
    }
    if (c > '9') {
        // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'.
        int folded = c | 0x20;
        if (folded >= 'a' && folded <= 'f') {
            return folded - 'a' + 10;
        }
    }
    return -1;
}
/*
 * Top-down regular expression grammar, based closely on Perl4.
 *
 * regexp: altern              A regular expression is one or more
 *         altern '|' regexp   alternatives separated by vertical bar.
 *
 * On success state.result holds either the single alternative or a
 * REOP_ALT node whose kid is the first alternative and whose kid2 is the
 * (recursively parsed) rest. Returns false when a nested parse fails.
 */
private static boolean parseDisjunction(CompilerState state)
{
    if (!parseAlternative(state)) {
        return false;
    }

    char[] pattern = state.cpbegin;
    int pos = state.cp;
    boolean moreAlternatives = (pos != pattern.length && pattern[pos] == '|');
    if (!moreAlternatives) {
        return true;
    }

    ++state.cp;
    RENode alternation = new RENode(REOP_ALT);
    alternation.kid = state.result;
    if (!parseDisjunction(state)) {
        return false;
    }
    alternation.kid2 = state.result;
    state.result = alternation;
    /* ALT, <next>, ..., JUMP, <end> ... JUMP, <end> */
    state.progLength += 9;
    return true;
}
/*
 * altern: item          An alternative is one or more items,
 *         item altern   concatenated together.
 *
 * Terms are parsed until the end of input, a '|', or (inside parentheses)
 * a ')' is reached, and are chained through RENode.next. state.result is
 * the head of the chain, or a REOP_EMPTY node when no term was parsed.
 */
private static boolean parseAlternative(CompilerState state)
{
    char[] pattern = state.cpbegin;
    RENode first = null;
    RENode last = null;

    for (;;) {
        boolean atTerminator =
            state.cp == state.cpend
            || pattern[state.cp] == '|'
            || (state.parenNesting != 0 && pattern[state.cp] == ')');
        if (atTerminator) {
            state.result = (first != null) ? first : new RENode(REOP_EMPTY);
            return true;
        }

        if (!parseTerm(state)) {
            return false;
        }

        if (first == null) {
            first = state.result;
            continue;
        }

        // The second term hangs off the head, later ones off the tail.
        RENode attachPoint = (last != null) ? last : first;
        attachPoint.next = state.result;
        last = state.result;
        while (last.next != null) {
            last = last.next;
        }
    }
}
/*
 * Calculate the total size of the bitmap required for a class expression.
 *
 * Scans the class body src[index..end) and stores in target.bmsize the
 * largest character code the class can contain. With the 'i' flag each
 * character contributes the larger of its upper- and lower-case forms.
 * A class containing \D, \s, \S, \w or \W gets bmsize 65535 and the scan
 * stops there.
 *
 * Returns false after reporting "msg.bad.range" when a range is reversed or
 * when a class escape is used as the end of a range; returns true otherwise.
 */
private static boolean
calculateBitmapSize(CompilerState state, RENode target, char[] src,
int index, int end)
{
char rangeStart = 0;
char c;
int n;
int nDigits;
int i;
int max = 0;
boolean inRange = false;
target.bmsize = 0;
// An empty class needs no bitmap.
if (index == end)
return true;
// A leading '^' only negates the class; it does not affect the size.
if (src[index] == '^')
++index;
while (index != end) {
// localMax is the code of the character described by the current item.
int localMax = 0;
nDigits = 2;
switch (src[index]) {
case '\\':
++index;
c = src[index++];
switch (c) {
case 'b':
localMax = 0x8;
break;
case 'f':
localMax = 0xC;
break;
case 'n':
localMax = 0xA;
break;
case 'r':
localMax = 0xD;
break;
case 't':
localMax = 0x9;
break;
case 'v':
localMax = 0xB;
break;
case 'c':
// NOTE(review): index already points at the character after 'c', yet the
// test inspects src[index + 1] while the assignment consumes src[index].
// This looks like an off-by-one; any change must be mirrored in the code
// that later fills the class bitmap - confirm before touching.
if (((index + 1) < end) && Character.isLetter(src[index + 1]))
localMax = (char)(src[index++] & 0x1F);
else
localMax = '\\';
break;
case 'u':
// \u takes four hex digits, \x takes two.
nDigits += 2;
// fall thru...
case 'x':
n = 0;
for (i = 0; (i < nDigits) && (index < end); i++) {
c = src[index++];
n = Kit.xDigitToInt(c, n);
if (n < 0) {
// Back off to accepting the original
// '\' as a literal
index -= (i + 1);
n = '\\';
break;
}
}
localMax = n;
break;
case 'd':
if (inRange) {
reportError("msg.bad.range", "");
return false;
}
localMax = '9';
break;
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
if (inRange) {
reportError("msg.bad.range", "");
return false;
}
// These classes are sized for the full 16-bit range.
target.bmsize = 65535;
return true;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/*
* This is a non-ECMA extension - decimal escapes (in this
* case, octal!) are supposed to be an error inside class
* ranges, but supported here for backwards compatibility.
*
*/
n = (c - '0');
c = src[index];
if ('0' <= c && c <= '7') {
index++;
n = 8 * n + (c - '0');
c = src[index];
if ('0' <= c && c <= '7') {
index++;
i = 8 * n + (c - '0');
// A third digit is only accepted while the value stays within 0377.
if (i <= 0377)
n = i;
else
index--;
}
}
localMax = n;
break;
default:
// Any other escaped character stands for itself.
localMax = c;
break;
}
break;
default:
localMax = src[index++];
break;
}
if (inRange) {
// localMax is the upper end of a range started earlier.
if (rangeStart > localMax) {
reportError("msg.bad.range", "");
return false;
}
inRange = false;
}
else {
// A '-' starts a range only when at least one more character follows it.
if (index < (end - 1)) {
if (src[index] == '-') {
++index;
inRange = true;
rangeStart = (char)localMax;
continue;
}
}
}
if ((state.flags & JSREG_FOLD) != 0){
char cu = upcase((char)localMax);
char cd = downcase((char)localMax);
localMax = (cu >= cd) ? cu : cd;
}
if (localMax > max)
max = localMax;
}
target.bmsize = max;
return true;
}
/*
* item: assertion An item is either an assertion or
* quantatom a quantified atom.
*
* assertion: '^' Assertions match beginning of string
* (or line if the class static property
* RegExp.multiline is true).
* '$' End of string (or line if the class
* static property RegExp.multiline is
* true).
* '\b' Word boundary (between \w and \W).
* '\B' Word non-boundary.
*
* quantatom: atom An unquantified atom.
* quantatom '{' n ',' m '}'
* Atom must occur between n and m times.
* quantatom '{' n ',' '}' Atom must occur at least n times.
* quantatom '{' n '}' Atom must occur exactly n times.
* quantatom '*' Zero or more times (same as {0,}).
* quantatom '+' One or more times (same as {1,}).
* quantatom '?' Zero or one time (same as {0,1}).
*
* any of which can be optionally followed by '?' for ungreedy
*
* atom: '(' regexp ')' A parenthesized regexp (what matched
* can be addressed using a backreference,
* see '\' n below).
* '.' Matches any char except '\n'.
* '[' classlist ']' A character class.
* '[' '^' classlist ']' A negated character class.
* '\f' Form Feed.
* '\n' Newline (Line Feed).
* '\r' Carriage Return.
* '\t' Horizontal Tab.
* '\v' Vertical Tab.
* '\d' A digit (same as [0-9]).
* '\D' A non-digit.
* '\w' A word character, [0-9a-z_A-Z].
* '\W' A non-word character.
* '\s' A whitespace character, [ \b\f\n\r\t\v].
* '\S' A non-whitespace character.
* '\' n A backreference to the nth (n decimal
* and positive) parenthesized expression.
* '\' octal An octal escape sequence (octal must be
* two or three digits long, unless it is
* 0 for the null character).
* '\x' hex A hex escape (hex must be two digits).
* '\c' ctrl A control character, ctrl is a letter.
* '\' literalatomchar Any character except one of the above
* that follow '\' in an atom.
* otheratomchar Any character not first among the other
* atom right-hand sides.
*/
/*
 * Stores in state.result a one-character REOP_FLAT node for c. The node's
 * flatIndex is -1, i.e. it carries the character in chr rather than
 * pointing into the pattern source. Three program bytes are reserved.
 */
private static void doFlat(CompilerState state, char c)
{
    RENode literal = new RENode(REOP_FLAT);
    literal.chr = c;
    literal.length = 1;
    literal.flatIndex = -1;
    state.result = literal;
    state.progLength += 3;
}
/*
 * Reads the remaining digits of a decimal number whose first digit c has
 * already been consumed, advancing state.cp past every digit.
 *
 * Once the running value would reach the overflow threshold derived from
 * maxValue, the value is pinned to maxValue and, after all digits have been
 * skipped, overflowMessageId is reported together with the digits that
 * follow the first one.
 */
private static int
getDecimalValue(char c, CompilerState state, int maxValue,
                String overflowMessageId)
{
    final char[] pattern = state.cpbegin;
    final int firstPos = state.cp;
    int result = c - '0';
    boolean overflowed = false;

    while (state.cp != state.cpend && isDigit(pattern[state.cp])) {
        if (!overflowed) {
            int digit = pattern[state.cp] - '0';
            if (result < (maxValue - digit) / 10) {
                result = result * 10 + digit;
            } else {
                overflowed = true;
                result = maxValue;
            }
        }
        ++state.cp;
    }

    if (overflowed) {
        reportError(overflowMessageId,
                    String.valueOf(pattern, firstPos, state.cp - firstPos));
    }
    return result;
}
/*
 * Parses one term at state.cp: an assertion, or an atom optionally followed
 * by a quantifier ('*', '+', '?', '{n}', '{n,}', '{n,m}', each optionally
 * followed by '?' for non-greedy matching).
 *
 * On success state.result holds the node for the term (a REOP_QUANT node
 * wrapping the atom when a quantifier was present), state.cp is positioned
 * after the term and state.progLength has been increased by the number of
 * program bytes the term needs. Returns false after reporting an error.
 *
 * The caller must guarantee state.cp < state.cpend on entry.
 */
private static boolean
parseTerm(CompilerState state)
{
    char[] src = state.cpbegin;
    char c = src[state.cp++];
    int nDigits = 2;
    int parenBaseCount = state.parenCount;
    int num, tmp;
    RENode term;
    int termStart;

    switch (c) {
    /* assertions and atoms */
    case '^':
        state.result = new RENode(REOP_BOL);
        state.progLength++;
        return true;
    case '$':
        state.result = new RENode(REOP_EOL);
        state.progLength++;
        return true;
    case '\\':
        if (state.cp < state.cpend) {
            c = src[state.cp++];
            switch (c) {
            /* assertion escapes */
            case 'b' :
                state.result = new RENode(REOP_WBDRY);
                state.progLength++;
                return true;
            case 'B':
                state.result = new RENode(REOP_WNONBDRY);
                state.progLength++;
                return true;
            /* Decimal escape */
            case '0':
                /*
                 * Under 'strict' ECMA 3, we interpret \0 as NUL and don't
                 * accept octal. However, (XXX and since Rhino doesn't have
                 * a 'strict' mode) we'll just behave the old way for
                 * compatibility reasons.
                 * (see http://bugzilla.mozilla.org/show_bug.cgi?id=141078)
                 */
                reportWarning(state.cx, "msg.bad.backref", "");
                /* octal escape */
                num = 0;
                while (state.cp < state.cpend) {
                    c = src[state.cp];
                    if ((c >= '0') && (c <= '7')) {
                        state.cp++;
                        tmp = 8 * num + (c - '0');
                        if (tmp > 0377)
                            break;
                        num = tmp;
                    }
                    else
                        break;
                }
                c = (char)(num);
                doFlat(state, c);
                break;
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                termStart = state.cp - 1;
                num = getDecimalValue(c, state, 0xFFFF,
                                      "msg.overlarge.backref");
                if (num > state.parenCount)
                    reportWarning(state.cx, "msg.bad.backref", "");
                /*
                 * n > 9 or > count of parentheses,
                 * then treat as octal instead.
                 */
                if ((num > 9) && (num > state.parenCount)) {
                    state.cp = termStart;
                    num = 0;
                    while (state.cp < state.cpend) {
                        c = src[state.cp];
                        if ((c >= '0') && (c <= '7')) {
                            state.cp++;
                            tmp = 8 * num + (c - '0');
                            if (tmp > 0377)
                                break;
                            num = tmp;
                        }
                        else
                            break;
                    }
                    c = (char)(num);
                    doFlat(state, c);
                    break;
                }
                /* otherwise, it's a back-reference */
                state.result = new RENode(REOP_BACKREF);
                state.result.parenIndex = num - 1;
                state.progLength += 3;
                break;
            /* Control escape */
            case 'f':
                c = 0xC;
                doFlat(state, c);
                break;
            case 'n':
                c = 0xA;
                doFlat(state, c);
                break;
            case 'r':
                c = 0xD;
                doFlat(state, c);
                break;
            case 't':
                c = 0x9;
                doFlat(state, c);
                break;
            case 'v':
                c = 0xB;
                doFlat(state, c);
                break;
            /* Control letter */
            case 'c':
                /*
                 * state.cp already points at the character after 'c', so
                 * that is the character to test and to consume.
                 */
                if ((state.cp < state.cpend) &&
                    Character.isLetter(src[state.cp]))
                    c = (char)(src[state.cp++] & 0x1F);
                else {
                    /* back off to accepting the original '\' as a literal */
                    --state.cp;
                    c = '\\';
                }
                doFlat(state, c);
                break;
            /* UnicodeEscapeSequence */
            case 'u':
                nDigits += 2;
                // fall thru...
            /* HexEscapeSequence */
            case 'x':
                {
                    int n = 0;
                    int i;
                    for (i = 0; (i < nDigits)
                             && (state.cp < state.cpend); i++) {
                        c = src[state.cp++];
                        n = Kit.xDigitToInt(c, n);
                        if (n < 0) {
                            // Back off to accepting the original
                            // 'u' or 'x' as a literal
                            state.cp -= (i + 2);
                            n = src[state.cp++];
                            break;
                        }
                    }
                    c = (char)(n);
                }
                doFlat(state, c);
                break;
            /* Character class escapes */
            case 'd':
                state.result = new RENode(REOP_DIGIT);
                state.progLength++;
                break;
            case 'D':
                state.result = new RENode(REOP_NONDIGIT);
                state.progLength++;
                break;
            case 's':
                state.result = new RENode(REOP_SPACE);
                state.progLength++;
                break;
            case 'S':
                state.result = new RENode(REOP_NONSPACE);
                state.progLength++;
                break;
            case 'w':
                state.result = new RENode(REOP_ALNUM);
                state.progLength++;
                break;
            case 'W':
                state.result = new RENode(REOP_NONALNUM);
                state.progLength++;
                break;
            /* IdentityEscape */
            default:
                state.result = new RENode(REOP_FLAT);
                state.result.chr = c;
                state.result.length = 1;
                state.result.flatIndex = state.cp - 1;
                state.progLength += 3;
                break;
            }
            break;
        }
        else {
            /* a trailing '\' is an error */
            reportError("msg.trail.backslash", "");
            return false;
        }
    case '(': {
        RENode result = null;
        termStart = state.cp;
        if (state.cp + 1 < state.cpend && src[state.cp] == '?'
            && ((c = src[state.cp + 1]) == '=' || c == '!' || c == ':'))
        {
            state.cp += 2;
            if (c == '=') {
                result = new RENode(REOP_ASSERT);
                /* ASSERT, <next>, ... ASSERTTEST */
                state.progLength += 4;
            } else if (c == '!') {
                result = new RENode(REOP_ASSERT_NOT);
                /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
                state.progLength += 4;
            }
            /* '(?:' groups without capturing, so no wrapper node is made */
        } else {
            result = new RENode(REOP_LPAREN);
            /* LPAREN, <index>, ... RPAREN, <index> */
            state.progLength += 6;
            result.parenIndex = state.parenCount++;
        }
        ++state.parenNesting;
        if (!parseDisjunction(state))
            return false;
        if (state.cp == state.cpend || src[state.cp] != ')') {
            reportError("msg.unterm.paren", "");
            return false;
        }
        ++state.cp;
        --state.parenNesting;
        if (result != null) {
            result.kid = state.result;
            state.result = result;
        }
        break;
    }
    case ')':
        reportError("msg.re.unmatched.right.paren", "");
        return false;
    case '[':
        state.result = new RENode(REOP_CLASS);
        termStart = state.cp;
        state.result.startIndex = termStart;
        /* find the closing ']', skipping over escaped characters */
        while (true) {
            if (state.cp == state.cpend) {
                reportError("msg.unterm.class", "");
                return false;
            }
            if (src[state.cp] == '\\')
                state.cp++;
            else {
                if (src[state.cp] == ']') {
                    state.result.kidlen = state.cp - termStart;
                    break;
                }
            }
            state.cp++;
        }
        state.result.index = state.classCount++;
        /*
         * Call calculateBitmapSize now as we want any errors it finds
         * to be reported during the parse phase, not at execution.
         */
        if (!calculateBitmapSize(state, state.result, src, termStart, state.cp++))
            return false;
        state.progLength += 3; /* CLASS, <index> */
        break;
    case '.':
        state.result = new RENode(REOP_DOT);
        state.progLength++;
        break;
    case '*':
    case '+':
    case '?':
        reportError("msg.bad.quant", String.valueOf(src[state.cp - 1]));
        return false;
    default:
        state.result = new RENode(REOP_FLAT);
        state.result.chr = c;
        state.result.length = 1;
        state.result.flatIndex = state.cp - 1;
        state.progLength += 3;
        break;
    }

    term = state.result;
    if (state.cp == state.cpend) {
        return true;
    }

    boolean hasQ = false;
    switch (src[state.cp]) {
    case '+':
        state.result = new RENode(REOP_QUANT);
        state.result.min = 1;
        state.result.max = -1;
        /* <PLUS>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '*':
        state.result = new RENode(REOP_QUANT);
        state.result.min = 0;
        state.result.max = -1;
        /* <STAR>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '?':
        state.result = new RENode(REOP_QUANT);
        state.result.min = 0;
        state.result.max = 1;
        /* <OPT>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '{':  /* balance '}' */
    {
        int min = 0;
        int max = -1;
        int leftCurl = state.cp;
        /*
         * For Perl etc. compatibility, if the quantifier does not match
         * \{\d+(,\d*)?\} exactly, back off from it being a quantifier and
         * chew it up as a literal atom next time instead.
         *
         * Every read is bounds-checked: a pattern that ends in the middle
         * of the braces is not a quantifier either. '\0' stands in for
         * "ran off the end"; it is neither ',' nor '}' nor a digit.
         */
        ++state.cp;
        c = (state.cp < state.cpend) ? src[state.cp] : '\0';
        if (isDigit(c)) {
            ++state.cp;
            min = getDecimalValue(c, state, 0xFFFF,
                                  "msg.overlarge.min");
            c = (state.cp < state.cpend) ? src[state.cp] : '\0';
            if (c == ',') {
                ++state.cp;
                c = (state.cp < state.cpend) ? src[state.cp] : '\0';
                if (isDigit(c)) {
                    ++state.cp;
                    max = getDecimalValue(c, state, 0xFFFF,
                                          "msg.overlarge.max");
                    if (state.cp < state.cpend) {
                        c = src[state.cp];
                        if (min > max) {
                            reportError("msg.max.lt.min",
                                        String.valueOf(c));
                            return false;
                        }
                    } else {
                        c = '\0';
                    }
                }
            } else {
                max = min;
            }
            /* balance '{' */
            if (c == '}') {
                state.result = new RENode(REOP_QUANT);
                state.result.min = min;
                state.result.max = max;
                // QUANT, <min>, <max>, <parencount>,
                // <parenindex>, <next> ... <ENDCHILD>
                state.progLength += 12;
                hasQ = true;
            }
        }
        if (!hasQ) {
            state.cp = leftCurl;
        }
        break;
    }
    }
    if (!hasQ)
        return true;

    /* skip the quantifier's last character and wrap the atom */
    ++state.cp;
    state.result.kid = term;
    state.result.parenIndex = parenBaseCount;
    state.result.parenCount = state.parenCount - parenBaseCount;
    if ((state.cp < state.cpend) && (src[state.cp] == '?')) {
        ++state.cp;
        state.result.greedy = false;
    }
    else
        state.result.greedy = true;
    return true;
}
/*
 * Stores at array[from..from+1] the distance from 'from' to 'pc'.
 * A target before 'from' is treated as an internal error.
 */
private static void resolveForwardJump(byte[] array, int from, int pc)
{
    if (from > pc) {
        throw Kit.codeBug();
    }
    int distance = pc - from;
    addIndex(array, from, distance);
}
/* Reads a jump offset; offsets use the same two-byte encoding as indexes. */
private static int getOffset(byte[] array, int pc)
{
    int offset = getIndex(array, pc);
    return offset;
}
/*
 * Writes index as two bytes, high byte first, at array[pc] and
 * array[pc + 1], and returns the position following them.
 * A negative index is an internal error; an index above 0xFFFF is reported
 * as "Too complex regexp".
 */
private static int addIndex(byte[] array, int pc, int index)
{
    if (index < 0) {
        throw Kit.codeBug();
    }
    if (index > 0xFFFF) {
        throw Context.reportRuntimeError("Too complex regexp");
    }
    byte highByte = (byte)(index >> 8);
    byte lowByte = (byte)(index);
    array[pc] = highByte;
    array[pc + 1] = lowByte;
    return pc + 2;
}
/*
 * Reads the unsigned 16-bit value stored high byte first at array[pc] and
 * array[pc + 1].
 */
private static int getIndex(byte[] array, int pc)
{
    int highByte = array[pc] & 0xFF;
    int lowByte = array[pc + 1] & 0xFF;
    return (highByte << 8) | lowByte;
}
// Number of program bytes reserved for a jump offset; emitREBytecode skips
// this many bytes for each fix-up it later fills in via resolveForwardJump.
private static final int OFFSET_LEN = 2;
// NOTE(review): presumably the byte width of an index operand as written by
// addIndex; no use is visible in this part of the file - confirm.
private static final int INDEX_LEN = 2;
/*
 * Emits the bytecode for the node chain starting at t into re.program,
 * beginning at position pc, and returns the position after the last byte
 * written. Child nodes (kid, kid2) are emitted by recursive calls.
 *
 * Each node's opcode is written first; some cases then overwrite that byte
 * (program[pc - 1]) with a more specific opcode. Forward jumps are emitted
 * as OFFSET_LEN placeholder bytes that resolveForwardJump fills in once the
 * target position is known.
 */
private static int
emitREBytecode(CompilerState state, RECompiled re, int pc, RENode t)
{
RENode nextAlt;
int nextAltFixup, nextTermFixup;
byte[] program = re.program;
while (t != null) {
program[pc++] = t.op;
switch (t.op) {
case REOP_EMPTY:
// An empty node emits nothing: take back the opcode byte just written.
--pc;
break;
case REOP_ALT:
nextAlt = t.kid2;
nextAltFixup = pc; /* address of next alternate */
pc += OFFSET_LEN;
pc = emitREBytecode(state, re, pc, t.kid);
program[pc++] = REOP_JUMP;
nextTermFixup = pc; /* address of following term */
pc += OFFSET_LEN;
resolveForwardJump(program, nextAltFixup, pc);
pc = emitREBytecode(state, re, pc, nextAlt);
program[pc++] = REOP_JUMP;
nextAltFixup = pc;
pc += OFFSET_LEN;
// Both JUMPs land on the code that follows the whole alternation.
resolveForwardJump(program, nextTermFixup, pc);
resolveForwardJump(program, nextAltFixup, pc);
break;
case REOP_FLAT:
/*
* Consecutize FLAT's if possible.
*/
// Only nodes that point into the pattern source (flatIndex != -1) and are
// adjacent there can be merged into one longer literal.
if (t.flatIndex != -1) {
while ((t.next != null) && (t.next.op == REOP_FLAT)
&& ((t.flatIndex + t.length)
== t.next.flatIndex)) {
t.length += t.next.length;
t.next = t.next.next;
}
}
if ((t.flatIndex != -1) && (t.length > 1)) {
// Multi-character literal: operands are source index and length.
if ((state.flags & JSREG_FOLD) != 0)
program[pc - 1] = REOP_FLATi;
else
program[pc - 1] = REOP_FLAT;
pc = addIndex(program, pc, t.flatIndex);
pc = addIndex(program, pc, t.length);
}
else {
// Single character: one operand byte if it fits in 8 bits, else two.
if (t.chr < 256) {
if ((state.flags & JSREG_FOLD) != 0)
program[pc - 1] = REOP_FLAT1i;
else
program[pc - 1] = REOP_FLAT1;
program[pc++] = (byte)(t.chr);
}
else {
if ((state.flags & JSREG_FOLD) != 0)
program[pc - 1] = REOP_UCFLAT1i;
else
program[pc - 1] = REOP_UCFLAT1;
pc = addIndex(program, pc, t.chr);
}
}
break;
case REOP_LPAREN:
pc = addIndex(program, pc, t.parenIndex);
pc = emitREBytecode(state, re, pc, t.kid);
program[pc++] = REOP_RPAREN;
pc = addIndex(program, pc, t.parenIndex);
break;
case REOP_BACKREF:
pc = addIndex(program, pc, t.parenIndex);
break;
case REOP_ASSERT:
nextTermFixup = pc;
pc += OFFSET_LEN;
pc = emitREBytecode(state, re, pc, t.kid);
program[pc++] = REOP_ASSERTTEST;
resolveForwardJump(program, nextTermFixup, pc);
break;
case REOP_ASSERT_NOT:
nextTermFixup = pc;
pc += OFFSET_LEN;
pc = emitREBytecode(state, re, pc, t.kid);
program[pc++] = REOP_ASSERTNOTTEST;
resolveForwardJump(program, nextTermFixup, pc);
break;
case REOP_QUANT:
// {0,}, {0,1} and {1,} get dedicated opcodes without min/max operands;
// every other range keeps REOP_QUANT (or its minimal variant) plus operands.
if ((t.min == 0) && (t.max == -1))
program[pc - 1] = (t.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
else
if ((t.min == 0) && (t.max == 1))
program[pc - 1] = (t.greedy) ? REOP_OPT : REOP_MINIMALOPT;
else
if ((t.min == 1) && (t.max == -1))
program[pc - 1] = (t.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
else {
if (!t.greedy) program[pc - 1] = REOP_MINIMALQUANT;
pc = addIndex(program, pc, t.min);
// max can be -1 which addIndex does not accept
pc = addIndex(program, pc, t.max + 1);
}
pc = addIndex(program, pc, t.parenCount);
pc = addIndex(program, pc, t.parenIndex);
nextTermFixup = pc;
pc += OFFSET_LEN;
pc = emitREBytecode(state, re, pc, t.kid);
program[pc++] = REOP_ENDCHILD;
resolveForwardJump(program, nextTermFixup, pc);
break;
case REOP_CLASS:
pc = addIndex(program, pc, t.index);
// Register the class; it is described by its bitmap size and by the
// position and length of its body in the pattern source.
re.classList[t.index] = new RECharSet(t.bmsize, t.startIndex,
t.kidlen);
break;
default:
// All remaining opcodes have no operands.
break;
}
t = t.next;
}
return pc;
}
/*
 * Push a new frame onto the program-state stack of gData.
 *
 * The frame links to the previous stack top and records the quantifier
 * bounds (min/max), the current input position (gData.cp), an optional
 * backtrack-stack entry to restore later, and the continuation (pc/op)
 * that was active when the frame was created.
 */
private static void
pushProgState(REGlobalData gData, int min, int max,
              REBackTrackData backTrackLastToSave,
              int continuation_pc, int continuation_op)
{
    REProgState enclosing = gData.stateStackTop;
    int position = gData.cp;
    REProgState frame = new REProgState(enclosing, min, max, position,
                                        backTrackLastToSave,
                                        continuation_pc, continuation_op);
    gData.stateStackTop = frame;
}
/*
 * Remove and return the top frame of the program-state stack.
 * The stack must not be empty.
 */
private static REProgState
popProgState(REGlobalData gData)
{
    REProgState top = gData.stateStackTop;
    gData.stateStackTop = top.previous;
    return top;
}
/*
 * Push a backtrack entry that resumes execution with the given op at the
 * given program counter. The REBackTrackData constructor captures the
 * current matcher state from gData and links to the previous stack top.
 */
private static void
pushBackTrackState(REGlobalData gData, byte op, int target)
{
    REBackTrackData entry = new REBackTrackData(gData, op, target);
    gData.backTrackStackTop = entry;
}
/*
 * Consecutive literal characters.
 *
 * Compares 'length' characters of the regexp source, starting at offset
 * matchChars, with the input starting at gData.cp. On success gData.cp is
 * advanced past the matched text; on failure it is left untouched.
 */
private static boolean
flatNMatcher(REGlobalData gData, int matchChars,
             int length, char[] chars, int end)
{
    final int inputStart = gData.cp;
    if (inputStart + length > end) {
        // not enough input left for the whole literal
        return false;
    }
    int k = 0;
    while (k < length) {
        char expected = gData.regexp.source[matchChars + k];
        if (expected != chars[inputStart + k]) {
            return false;
        }
        k++;
    }
    gData.cp = inputStart + length;
    return true;
}
/*
 * Case-insensitive variant of flatNMatcher: both the regexp source
 * character and the input character are passed through upcase() before
 * being compared. gData.cp is advanced only when the whole run matches.
 */
private static boolean
flatNIMatcher(REGlobalData gData, int matchChars,
              int length, char[] chars, int end)
{
    final int inputStart = gData.cp;
    if (inputStart + length > end) {
        // not enough input left for the whole literal
        return false;
    }
    int k = 0;
    while (k < length) {
        char expected = upcase(gData.regexp.source[matchChars + k]);
        char actual = upcase(chars[inputStart + k]);
        if (expected != actual) {
            return false;
        }
        k++;
    }
    gData.cp = inputStart + length;
    return true;
}
/*
 * Back-reference matching, following the ECMAScript algorithm:
 *
 1. Evaluate DecimalEscape to obtain an EscapeValue E.
 2. If E is not a character then go to step 6.
 3. Let ch be E's character.
 4. Let A be a one-element RECharSet containing the character ch.
 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
 6. E must be an integer. Let n be that integer.
 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
 8. Return an internal Matcher closure that takes two arguments, a State x
    and a Continuation c, and performs the following:
     1. Let cap be x's captures internal array.
     2. Let s be cap[n].
     3. If s is undefined, then call c(x) and return its result.
     4. Let e be x's endIndex.
     5. Let len be s's length.
     6. Let f be e+len.
     7. If f>InputLength, return failure.
     8. If there exists an integer i between 0 (inclusive) and len (exclusive)
        such that Canonicalize(s[i]) is not the same character as
        Canonicalize(Input [e+i]), then return failure.
     9. Let y be the State (f, cap).
     10. Call c(y) and return its result.
 *
 * Only step 8 is implemented here: the capture identified by parenIndex is
 * compared with the input at gData.cp, and gData.cp is advanced on success.
 */
private static boolean
backrefMatcher(REGlobalData gData, int parenIndex,
               char[] chars, int end)
{
    final int captureStart = gData.parens_index(parenIndex);
    if (captureStart == -1) {
        // capture is undefined: succeeds without consuming input (step 8.3)
        return true;
    }

    final int captureLen = gData.parens_length(parenIndex);
    final int inputStart = gData.cp;
    if (inputStart + captureLen > end) {
        return false;
    }

    final boolean foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;
    for (int k = 0; k < captureLen; k++) {
        char captured = chars[captureStart + k];
        char candidate = chars[inputStart + k];
        boolean same = foldCase
                       ? upcase(captured) == upcase(candidate)
                       : captured == candidate;
        if (!same) {
            return false;
        }
    }
    gData.cp = inputStart + captureLen;
    return true;
}
/*
 * Add a single character to the RECharSet by setting its bit in the
 * bitmap (eight characters per byte, lowest bit first).
 * Throws RuntimeException when the character code exceeds cs.length.
 */
private static void
addCharacterToCharSet(RECharSet cs, char c)
{
    if (c > cs.length) {
        throw new RuntimeException();
    }
    int slot = c >> 3;          // c / 8
    int mask = 1 << (c & 0x7);  // bit within the byte
    cs.bits[slot] |= mask;
}
/*
 * Add a character range, c1 to c2 (inclusive) to the RECharSet.
 * Throws RuntimeException when c2 exceeds cs.length or the range is
 * reversed (c1 > c2).
 */
private static void
addCharacterRangeToCharSet(RECharSet cs, char c1, char c2)
{
    if ((c2 > cs.length) || (c1 > c2)) {
        throw new RuntimeException();
    }

    final int firstByte = c1 >> 3;
    final int lastByte = c2 >> 3;
    final int firstBit = c1 & 0x7;
    final int lastBit = c2 & 0x7;

    if (firstByte == lastByte) {
        // the whole range lives in one byte: a run of (lastBit - firstBit + 1)
        // one-bits shifted up to firstBit
        int run = 0xFF >> (7 - (lastBit - firstBit));
        cs.bits[firstByte] |= run << firstBit;
        return;
    }

    // upper part of the first byte
    cs.bits[firstByte] |= 0xFF << firstBit;
    // every byte strictly in between is fully covered
    int slot = firstByte + 1;
    while (slot < lastByte) {
        cs.bits[slot] = (byte)0xFF;
        slot++;
    }
    // lower part of the last byte
    cs.bits[lastByte] |= 0xFF >> (7 - lastBit);
}
/*
 * Compile the source of the class into a RECharSet.
 * The work is done at most once per set: the conversion runs while holding
 * the set's monitor and the 'converted' flag is raised only afterwards.
 */
private static void
processCharSet(REGlobalData gData, RECharSet charSet)
{
    synchronized (charSet) {
        if (charSet.converted) {
            return;
        }
        processCharSetImpl(gData, charSet);
        charSet.converted = true;
    }
}
/*
 * Build the bitmap of a character class from its source text.
 *
 * The class body is read from gData.regexp.source, starting at
 * charSet.startIndex and spanning charSet.strlength characters. The method
 * allocates charSet.bits ((charSet.length / 8) + 1 bytes), sets
 * charSet.sense (false when the body starts with '^', true otherwise) and
 * then sets one bit per member character. When JSREG_FOLD is set both the
 * upcase() and downcase() forms of every character / range are added.
 */
private static void
processCharSetImpl(REGlobalData gData, RECharSet charSet)
{
int src = charSet.startIndex;
int end = src + charSet.strlength;
char rangeStart = 0, thisCh;
int byteLength;
char c;
int n;
int nDigits;
int i;
boolean inRange = false;
charSet.sense = true;
byteLength = (charSet.length / 8) + 1;
charSet.bits = new byte[byteLength];
// An empty class body leaves an all-zero bitmap with sense == true.
if (src == end)
return;
// A leading '^' negates the class.
if (gData.regexp.source[src] == '^') {
charSet.sense = false;
++src;
}
while (src != end) {
// Number of hex digits for \x; bumped to 4 for \u below.
nDigits = 2;
switch (gData.regexp.source[src]) {
case '\\':
++src;
c = gData.regexp.source[src++];
// src now points at the character following the escape letter.
switch (c) {
case 'b':
thisCh = 0x8;
break;
case 'f':
thisCh = 0xC;
break;
case 'n':
thisCh = 0xA;
break;
case 'r':
thisCh = 0xD;
break;
case 't':
thisCh = 0x9;
break;
case 'v':
thisCh = 0xB;
break;
case 'c':
// NOTE(review): src was already advanced past 'c' above, so this tests
// the character one position AFTER the control letter while the next
// line consumes the character AT src - looks like an off-by-one; confirm
// against the bitmap sizing code before changing it.
if (((src + 1) < end) && isWord(gData.regexp.source[src + 1]))
thisCh = (char)(gData.regexp.source[src++] & 0x1F);
else {
// Not a control escape: re-read 'c' as a literal and add '\' itself.
--src;
thisCh = '\\';
}
break;
case 'u':
nDigits += 2;
// fall thru
case 'x':
n = 0;
for (i = 0; (i < nDigits) && (src < end); i++) {
c = gData.regexp.source[src++];
int digit = toASCIIHexDigit(c);
if (digit < 0) {
/* back off to accepting the original '\'
* as a literal
*/
src -= (i + 1);
n = '\\';
break;
}
n = (n << 4) | digit;
}
thisCh = (char)(n);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/*
* This is a non-ECMA extension - decimal escapes (in this
* case, octal!) are supposed to be an error inside class
* ranges, but supported here for backwards compatibility.
*
*/
// Up to three octal digits are consumed; the third one is dropped again
// when the value would exceed 0377.
n = (c - '0');
c = gData.regexp.source[src];
if ('0' <= c && c <= '7') {
src++;
n = 8 * n + (c - '0');
c = gData.regexp.source[src];
if ('0' <= c && c <= '7') {
src++;
i = 8 * n + (c - '0');
if (i <= 0377)
n = i;
else
src--;
}
}
thisCh = (char)(n);
break;
// The class escapes below add their members directly and skip the
// single-character / range handling at the bottom of the loop.
case 'd':
addCharacterRangeToCharSet(charSet, '0', '9');
continue; /* don't need range processing */
case 'D':
addCharacterRangeToCharSet(charSet, (char)0, (char)('0' - 1));
addCharacterRangeToCharSet(charSet, (char)('9' + 1),
(char)(charSet.length));
continue;
case 's':
for (i = charSet.length; i >= 0; i--)
if (isREWhiteSpace(i))
addCharacterToCharSet(charSet, (char)(i));
continue;
case 'S':
for (i = charSet.length; i >= 0; i--)
if (!isREWhiteSpace(i))
addCharacterToCharSet(charSet, (char)(i));
continue;
case 'w':
for (i = charSet.length; i >= 0; i--)
if (isWord((char)i))
addCharacterToCharSet(charSet, (char)(i));
continue;
case 'W':
for (i = charSet.length; i >= 0; i--)
if (!isWord((char)i))
addCharacterToCharSet(charSet, (char)(i));
continue;
default:
// Any other escaped character stands for itself.
thisCh = c;
break;
}
break;
default:
thisCh = gData.regexp.source[src++];
break;
}
if (inRange) {
// thisCh is the upper end of a pending "rangeStart-thisCh" range.
if ((gData.regexp.flags & JSREG_FOLD) != 0) {
addCharacterRangeToCharSet(charSet,
upcase(rangeStart),
upcase(thisCh));
addCharacterRangeToCharSet(charSet,
downcase(rangeStart),
downcase(thisCh));
} else {
addCharacterRangeToCharSet(charSet, rangeStart, thisCh);
}
inRange = false;
}
else {
if ((gData.regexp.flags & JSREG_FOLD) != 0) {
addCharacterToCharSet(charSet, upcase(thisCh));
addCharacterToCharSet(charSet, downcase(thisCh));
} else {
addCharacterToCharSet(charSet, thisCh);
}
// A '-' starts a range only when at least one more character follows it;
// a trailing '-' is handled as an ordinary character on the next pass.
if (src < (end - 1)) {
if (gData.regexp.source[src] == '-') {
++src;
inRange = true;
rangeStart = thisCh;
}
}
}
}
}
/*
 * Test whether ch belongs to the character class.
 * The bitmap is built on first use. A character is a member when the set
 * is non-empty, ch is within the bitmap and its bit is set. For a negated
 * class (sense == false, i.e. the ^ flag was specified) non-membership is
 * the success case.
 */
private static boolean
classMatcher(REGlobalData gData, RECharSet charSet, char ch)
{
    if (!charSet.converted) {
        processCharSet(gData, charSet);
    }
    boolean member = (charSet.length != 0)
                     && (ch <= charSet.length)
                     && ((charSet.bits[ch >> 3] & (1 << (ch & 0x7))) != 0);
    return charSet.sense ? member : !member;
}
/*
 * Interpreter loop for the compiled regexp bytecode in gData.regexp.program.
 *
 * Execution starts at pc 0 with gData.cp as the current input position and
 * 'end' as the exclusive input limit. Simple matching ops set 'result' and
 * 'break' out of the switch to the shared code at the bottom of the loop,
 * which either fetches the next op (success) or resumes from the top of the
 * backtrack stack (failure). Control ops choose the next op themselves and
 * 'continue' the loop directly.
 *
 * Returns true when REOP_END is reached, false when an op fails and the
 * backtrack stack is empty. On success gData.cp is the position reached by
 * the match.
 */
private static boolean
executeREBytecode(REGlobalData gData, char[] chars, int end)
{
int pc = 0;
byte program[] = gData.regexp.program;
// The continuation is the op/pc to resume at when a quantified child
// finishes (see REOP_ENDCHILD).
int currentContinuation_op;
int currentContinuation_pc;
boolean result = false;
currentContinuation_pc = 0;
currentContinuation_op = REOP_END;
if (debug) {
System.out.println("Input = \"" + new String(chars) + "\", start at " + gData.cp);
}
int op = program[pc++];
for (;;) {
if (debug) {
System.out.println("Testing at " + gData.cp + ", op = " + op);
}
switch (op) {
case REOP_EMPTY:
result = true;
break;
case REOP_BOL:
// Matches at index 0, or after a line terminator in multiline mode.
if (gData.cp != 0) {
if (gData.multiline ||
((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
if (!isLineTerm(chars[gData.cp - 1])) {
result = false;
break;
}
}
else {
result = false;
break;
}
}
result = true;
break;
case REOP_EOL:
// Matches at 'end', or before a line terminator in multiline mode.
if (gData.cp != end) {
if (gData.multiline ||
((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
if (!isLineTerm(chars[gData.cp])) {
result = false;
break;
}
}
else {
result = false;
break;
}
}
result = true;
break;
case REOP_WBDRY:
// True when exactly one of the characters before / at cp is a word
// character (input edges count as non-word).
result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1]))
^ !((gData.cp < end) && isWord(chars[gData.cp])));
break;
case REOP_WNONBDRY:
// Complement of REOP_WBDRY.
result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1]))
^ ((gData.cp < end) && isWord(chars[gData.cp])));
break;
// The single-character ops below consume one input character on success.
case REOP_DOT:
result = (gData.cp != end && !isLineTerm(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_DIGIT:
result = (gData.cp != end && isDigit(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_NONDIGIT:
result = (gData.cp != end && !isDigit(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_SPACE:
result = (gData.cp != end && isREWhiteSpace(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_NONSPACE:
result = (gData.cp != end && !isREWhiteSpace(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_ALNUM:
result = (gData.cp != end && isWord(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_NONALNUM:
result = (gData.cp != end && !isWord(chars[gData.cp]));
if (result) {
gData.cp++;
}
break;
case REOP_FLAT:
{
// Operands: offset into the regexp source, then literal length.
int offset = getIndex(program, pc);
pc += INDEX_LEN;
int length = getIndex(program, pc);
pc += INDEX_LEN;
result = flatNMatcher(gData, offset, length, chars, end);
}
break;
case REOP_FLATi:
{
// Same operands as REOP_FLAT, compared case-insensitively.
int offset = getIndex(program, pc);
pc += INDEX_LEN;
int length = getIndex(program, pc);
pc += INDEX_LEN;
result = flatNIMatcher(gData, offset, length, chars, end);
}
break;
case REOP_FLAT1:
{
// Operand: one byte holding a character code below 256.
char matchCh = (char)(program[pc++] & 0xFF);
result = (gData.cp != end && chars[gData.cp] == matchCh);
if (result) {
gData.cp++;
}
}
break;
case REOP_FLAT1i:
{
char matchCh = (char)(program[pc++] & 0xFF);
result = (gData.cp != end
&& upcase(chars[gData.cp]) == upcase(matchCh));
if (result) {
gData.cp++;
}
}
break;
case REOP_UCFLAT1:
{
// Operand: the character code stored as an index.
char matchCh = (char)getIndex(program, pc);
pc += INDEX_LEN;
result = (gData.cp != end && chars[gData.cp] == matchCh);
if (result) {
gData.cp++;
}
}
break;
case REOP_UCFLAT1i:
{
char matchCh = (char)getIndex(program, pc);
pc += INDEX_LEN;
result = (gData.cp != end
&& upcase(chars[gData.cp]) == upcase(matchCh));
if (result) {
gData.cp++;
}
}
break;
case REOP_ALT:
{
// Save the continuation, register the second alternative as the
// backtrack target, then run the first alternative.
int nextpc;
byte nextop;
pushProgState(gData, 0, 0, null,
currentContinuation_pc,
currentContinuation_op);
nextpc = pc + getOffset(program, pc);
nextop = program[nextpc++];
pushBackTrackState(gData, nextop, nextpc);
pc += INDEX_LEN;
op = program[pc++];
}
continue;
case REOP_JUMP:
{
// Pop a program state, restore its continuation and jump forward by
// the stored offset.
int offset;
REProgState state = popProgState(gData);
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
offset = getOffset(program, pc);
pc += offset;
op = program[pc++];
}
continue;
case REOP_LPAREN:
{
// Record the capture start; its length is filled in by REOP_RPAREN.
int parenIndex = getIndex(program, pc);
pc += INDEX_LEN;
gData.set_parens(parenIndex, gData.cp, 0);
op = program[pc++];
}
continue;
case REOP_RPAREN:
{
int cap_index;
int parenIndex = getIndex(program, pc);
pc += INDEX_LEN;
cap_index = gData.parens_index(parenIndex);
gData.set_parens(parenIndex, cap_index,
gData.cp - cap_index);
if (parenIndex > gData.lastParen)
gData.lastParen = parenIndex;
op = program[pc++];
}
continue;
case REOP_BACKREF:
{
int parenIndex = getIndex(program, pc);
pc += INDEX_LEN;
result = backrefMatcher(gData, parenIndex, chars, end);
}
break;
case REOP_CLASS:
{
// Operand: index into gData.regexp.classList.
int index = getIndex(program, pc);
pc += INDEX_LEN;
if (gData.cp != end) {
if (classMatcher(gData, gData.regexp.classList[index],
chars[gData.cp]))
{
gData.cp++;
result = true;
break;
}
}
result = false;
}
break;
case REOP_ASSERT:
case REOP_ASSERT_NOT:
{
// Remember cp and the current backtrack stack top in a program state,
// then push a backtrack entry targeting the matching *TEST op so that a
// failure inside the assertion body also ends up at the test.
byte testOp;
pushProgState(gData, 0, 0, gData.backTrackStackTop,
currentContinuation_pc,
currentContinuation_op);
if (op == REOP_ASSERT) {
testOp = REOP_ASSERTTEST;
} else {
testOp = REOP_ASSERTNOTTEST;
}
pushBackTrackState(gData, testOp,
pc + getOffset(program, pc));
pc += INDEX_LEN;
op = program[pc++];
}
continue;
case REOP_ASSERTTEST:
case REOP_ASSERTNOTTEST:
{
// Restore the input position and backtrack stack saved by the assert
// op, so the assertion consumes no input. REOP_ASSERTTEST keeps 'result';
// REOP_ASSERTNOTTEST inverts it.
REProgState state = popProgState(gData);
gData.cp = state.index;
gData.backTrackStackTop = state.backTrack;
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
if (result) {
if (op == REOP_ASSERTTEST) {
result = true;
} else {
result = false;
}
} else {
if (op == REOP_ASSERTTEST) {
// Do nothing
} else {
result = true;
}
}
}
break;
case REOP_STAR:
case REOP_PLUS:
case REOP_OPT:
case REOP_QUANT:
case REOP_MINIMALSTAR:
case REOP_MINIMALPLUS:
case REOP_MINIMALOPT:
case REOP_MINIMALQUANT:
{
// Decode the quantifier bounds; max == -1 means unbounded.
int min, max;
boolean greedy = false;
switch (op) {
case REOP_STAR:
greedy = true;
// fallthrough
case REOP_MINIMALSTAR:
min = 0;
max = -1;
break;
case REOP_PLUS:
greedy = true;
// fallthrough
case REOP_MINIMALPLUS:
min = 1;
max = -1;
break;
case REOP_OPT:
greedy = true;
// fallthrough
case REOP_MINIMALOPT:
min = 0;
max = 1;
break;
case REOP_QUANT:
greedy = true;
// fallthrough
case REOP_MINIMALQUANT:
min = getOffset(program, pc);
pc += INDEX_LEN;
// See comments in emitREBytecode for " - 1" reason
max = getOffset(program, pc) - 1;
pc += INDEX_LEN;
break;
default:
throw Kit.codeBug();
}
pushProgState(gData, min, max, null,
currentContinuation_pc,
currentContinuation_op);
if (greedy) {
// Greedy: try the child first; REOP_REPEAT runs both when the child
// completes (as continuation) and when it fails (as backtrack target).
currentContinuation_op = REOP_REPEAT;
currentContinuation_pc = pc;
pushBackTrackState(gData, REOP_REPEAT, pc);
/* Step over <parenCount>, <parenIndex> & <next> */
pc += 3 * INDEX_LEN;
op = program[pc++];
} else {
if (min != 0) {
// Non-greedy with a mandatory iteration: run the child first.
currentContinuation_op = REOP_MINIMALREPEAT;
currentContinuation_pc = pc;
/* <parenCount>, <parenIndex> & <next> */
pc += 3 * INDEX_LEN;
op = program[pc++];
} else {
// Non-greedy with min == 0: skip the child now and leave a backtrack
// entry that tries it later.
pushBackTrackState(gData, REOP_MINIMALREPEAT, pc);
popProgState(gData);
pc += 2 * INDEX_LEN; // <parenCount> & <parenIndex>
pc = pc + getOffset(program, pc);
op = program[pc++];
}
}
}
continue;
case REOP_ENDCHILD:
// Use the current continuation.
pc = currentContinuation_pc;
op = currentContinuation_op;
continue;
case REOP_REPEAT:
{
REProgState state = popProgState(gData);
if (!result) {
//
// There's been a failure, see if we have enough
// children.
//
if (state.min == 0)
result = true;
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
pc += 2 * INDEX_LEN; /* <parenCount> & <parenIndex> */
pc = pc + getOffset(program, pc);
break;
}
else {
if (state.min == 0 && gData.cp == state.index) {
// matched an empty string, that'll get us nowhere
result = false;
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
pc += 2 * INDEX_LEN;
pc = pc + getOffset(program, pc);
break;
}
// One more child matched: count it against the remaining bounds.
int new_min = state.min, new_max = state.max;
if (new_min != 0) new_min--;
if (new_max != -1) new_max--;
if (new_max == 0) {
// Upper bound reached: leave the quantifier successfully.
result = true;
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
pc += 2 * INDEX_LEN;
pc = pc + getOffset(program, pc);
break;
}
// Try another iteration of the child.
pushProgState(gData, new_min, new_max, null,
state.continuation_pc,
state.continuation_op);
currentContinuation_op = REOP_REPEAT;
currentContinuation_pc = pc;
pushBackTrackState(gData, REOP_REPEAT, pc);
int parenCount = getIndex(program, pc);
pc += INDEX_LEN;
int parenIndex = getIndex(program, pc);
pc += 2 * INDEX_LEN;
op = program[pc++];
// Reset the captures contained in the child before re-running it.
for (int k = 0; k < parenCount; k++) {
gData.set_parens(parenIndex + k, -1, 0);
}
}
}
continue;
case REOP_MINIMALREPEAT:
{
REProgState state = popProgState(gData);
if (!result) {
//
// Non-greedy failure - try to consume another child.
//
if (state.max == -1 || state.max > 0) {
pushProgState(gData, state.min, state.max, null,
state.continuation_pc,
state.continuation_op);
currentContinuation_op = REOP_MINIMALREPEAT;
currentContinuation_pc = pc;
int parenCount = getIndex(program, pc);
pc += INDEX_LEN;
int parenIndex = getIndex(program, pc);
pc += 2 * INDEX_LEN;
// Reset the captures contained in the child before re-running it.
for (int k = 0; k < parenCount; k++) {
gData.set_parens(parenIndex + k, -1, 0);
}
op = program[pc++];
continue;
} else {
// Don't need to adjust pc since we're going to pop.
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
break;
}
} else {
if (state.min == 0 && gData.cp == state.index) {
// Matched an empty string, that'll get us nowhere.
result = false;
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
break;
}
int new_min = state.min, new_max = state.max;
if (new_min != 0) new_min--;
if (new_max != -1) new_max--;
pushProgState(gData, new_min, new_max, null,
state.continuation_pc,
state.continuation_op);
if (new_min != 0) {
// Mandatory iterations remain: run the child again.
currentContinuation_op = REOP_MINIMALREPEAT;
currentContinuation_pc = pc;
int parenCount = getIndex(program, pc);
pc += INDEX_LEN;
int parenIndex = getIndex(program, pc);
pc += 2 * INDEX_LEN;
for (int k = 0; k < parenCount; k++) {
gData.set_parens(parenIndex + k, -1, 0);
}
op = program[pc++];
} else {
// Minimum satisfied: continue after the quantifier, leaving a
// backtrack entry that tries one more child if the rest fails.
currentContinuation_pc = state.continuation_pc;
currentContinuation_op = state.continuation_op;
pushBackTrackState(gData, REOP_MINIMALREPEAT, pc);
popProgState(gData);
pc += 2 * INDEX_LEN;
pc = pc + getOffset(program, pc);
op = program[pc++];
}
continue;
}
}
case REOP_END:
return true;
default:
throw Kit.codeBug();
}
/*
* If the match failed and there's a backtrack option, take it.
* Otherwise this is a complete and utter failure.
*/
if (!result) {
REBackTrackData backTrackData = gData.backTrackStackTop;
if (backTrackData != null) {
// Restore the matcher state captured when the entry was pushed and
// resume at the op/pc it names.
gData.backTrackStackTop = backTrackData.previous;
gData.lastParen = backTrackData.lastParen;
// XXX: If backTrackData will no longer be used, then
// there is no need to clone backTrackData.parens
if (backTrackData.parens != null) {
gData.parens = backTrackData.parens.clone();
}
gData.cp = backTrackData.cp;
gData.stateStackTop = backTrackData.stateStackTop;
currentContinuation_op
= gData.stateStackTop.continuation_op;
currentContinuation_pc
= gData.stateStackTop.continuation_pc;
pc = backTrackData.continuation_pc;
op = backTrackData.continuation_op;
continue;
}
else
return false;
}
// The op succeeded: fetch the next one.
op = program[pc++];
}
}
/*
 * Search chars[start..end] for the first position at which the compiled
 * regexp matches.
 *
 * gData is (re)initialised for this regexp, then every candidate position
 * from start up to and including end is tried with executeREBytecode. On
 * success gData.skipped holds the number of characters skipped before the
 * match and true is returned; gData.cp is left as set by the interpreter.
 */
private static boolean
matchRegExp(REGlobalData gData, RECompiled re,
            char[] chars, int start, int end, boolean multiline)
{
    gData.parens = (re.parenCount != 0) ? new long[re.parenCount] : null;
    gData.backTrackStackTop = null;
    gData.stateStackTop = null;
    gData.multiline = multiline;
    gData.regexp = re;
    gData.lastParen = 0;

    final int anchorCh = re.anchorCh;
    final boolean foldCase = (re.flags & JSREG_FOLD) != 0;

    //
    // The position just beyond the last character is a valid candidate
    // too, so that end-of-input/line conditions can be detected.
    //
    int pos = start;
    while (pos <= end) {
        //
        // When the regexp starts with a literal, advance to the next
        // occurrence of that literal, or give up if there is none.
        //
        if (anchorCh >= 0) {
            while (true) {
                if (pos == end) {
                    return false;
                }
                char candidate = chars[pos];
                if (candidate == anchorCh) {
                    break;
                }
                if (foldCase && upcase(candidate) == upcase((char)anchorCh)) {
                    break;
                }
                ++pos;
            }
        }

        gData.cp = pos;
        for (int p = 0; p < re.parenCount; p++) {
            gData.set_parens(p, -1, 0);
        }

        boolean matched = executeREBytecode(gData, chars, end);
        gData.backTrackStackTop = null;
        gData.stateStackTop = null;
        if (matched) {
            gData.skipped = pos - start;
            return true;
        }
        ++pos;
    }
    return false;
}
/*
 * Match this regexp against str, starting at indexp[0], and publish the
 * outcome.
 *
 * indexp is assumed to be an array of length 1; on success indexp[0] is
 * updated to the index just past the match.
 *
 * Return value:
 *   - no match: Undefined.instance when matchType is PREFIX, else null;
 *   - match with matchType TEST: Boolean.TRUE (no array is allocated);
 *   - any other match: a new Array with element 0 bound to the matched
 *     string, elements 1..re.parenCount bound to the paren matches (or
 *     Undefined.instance), plus "index" and "input" properties.
 *
 * On a match the statics held in res (parens, lastParen, lastMatch,
 * leftContext, rightContext) are updated as well.
 */
Object executeRegExp(Context cx, Scriptable scopeObj, RegExpImpl res,
                     String str, int indexp[], int matchType)
{
    REGlobalData gData = new REGlobalData();

    int start = indexp[0];
    char[] charArray = str.toCharArray();
    int end = charArray.length;
    if (start > end)
        start = end;
    //
    // Call the matcher to do the real work.
    //
    boolean matches = matchRegExp(gData, re, charArray, start, end,
                                  res.multiline);
    if (!matches) {
        if (matchType != PREFIX) return null;
        return Undefined.instance;
    }

    int ep = gData.cp;                  // index just past the match
    indexp[0] = ep;
    int index = start + gData.skipped;  // index of the first matched char
    int matchlen = ep - index;

    Object result;
    Scriptable obj;

    if (matchType == TEST) {
        /*
         * Testing for a match and updating cx.regExpImpl: don't allocate
         * an array object, do return true.
         */
        result = Boolean.TRUE;
        obj = null;
    }
    else {
        /*
         * The array returned on match has element 0 bound to the matched
         * string, elements 1 through re.parenCount bound to the paren
         * matches, an index property telling the length of the left context,
         * and an input property referring to the input string.
         */
        Scriptable scope = getTopLevelScope(scopeObj);
        result = ScriptRuntime.newObject(cx, scope, "Array", null);
        obj = (Scriptable) result;

        String matchstr = new String(charArray, index, matchlen);
        obj.put(0, obj, matchstr);
    }

    if (re.parenCount == 0) {
        res.parens = null;
        res.lastParen = SubString.emptySubString;
    } else {
        // lastParen ends up as the last capture that participated in the
        // match (null when none did).
        SubString parsub = null;
        res.parens = new SubString[re.parenCount];
        for (int num = 0; num < re.parenCount; num++) {
            int cap_index = gData.parens_index(num);
            if (cap_index != -1) {
                int cap_length = gData.parens_length(num);
                parsub = new SubString(charArray, cap_index, cap_length);
                res.parens[num] = parsub;
                if (matchType == TEST) continue;
                obj.put(num+1, obj, parsub.toString());
            }
            else {
                if (matchType != TEST)
                    obj.put(num+1, obj, Undefined.instance);
            }
        }
        res.lastParen = parsub;
    }

    if (matchType != TEST) {
        /*
         * Define the index and input properties last for better for/in loop
         * order (so they come after the elements).
         */
        // Integer.valueOf replaces the deprecated Integer(int) constructor.
        obj.put("index", obj, Integer.valueOf(index));
        obj.put("input", obj, str);
    }

    if (res.lastMatch == null) {
        res.lastMatch = new SubString();
        res.leftContext = new SubString();
        res.rightContext = new SubString();
    }
    res.lastMatch.charArray = charArray;
    res.lastMatch.index = index;
    res.lastMatch.length = matchlen;

    res.leftContext.charArray = charArray;
    if (cx.getLanguageVersion() == Context.VERSION_1_2) {
        /*
         * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
         * in scalar contexts, and unintentionally for the string.match "list"
         * pseudo-context.  On "hi there bye", the following would result:
         *
         * Language     while(/ /g){print("$`");}   s/ /$`/g
         * perl4.036    "hi", "there"               "hihitherehi therebye"
         * perl5        "hi", "hi there"            "hihitherehi therebye"
         * js1.2        "hi", "there"               "hihitheretherebye"
         *
         * Insofar as JS1.2 always defined $` as "left context from the last
         * match" for global regexps, it was more consistent than perl4.
         */
        res.leftContext.index = start;
        res.leftContext.length = gData.skipped;
    } else {
        /*
         * For JS1.3 and ECMAv2, emulate Perl5 exactly:
         *
         * js1.3        "hi", "hi there"            "hihitherehi therebye"
         */
        res.leftContext.index = 0;
        res.leftContext.length = start + gData.skipped;
    }

    res.rightContext.charArray = charArray;
    res.rightContext.index = ep;
    res.rightContext.length = end - ep;

    return result;
}
/** Returns the flag bits of the compiled regular expression. */
int getFlags()
{
    final int flags = re.flags;
    return flags;
}
/*
 * Report a warning built from messageId and arg, but only when the
 * context has FEATURE_STRICT_MODE enabled; otherwise do nothing.
 */
private static void reportWarning(Context cx, String messageId, String arg)
{
    if (!cx.hasFeature(Context.FEATURE_STRICT_MODE)) {
        return;
    }
    Context.reportWarning(ScriptRuntime.getMessage1(messageId, arg));
}
/*
 * Throw a script-level SyntaxError whose message is built from messageId
 * and arg. This method never returns normally.
 */
private static void reportError(String messageId, String arg)
{
    throw ScriptRuntime.constructError(
        "SyntaxError", ScriptRuntime.getMessage1(messageId, arg));
}
// Ids of the instance properties of a RegExp object, as resolved by
// findInstanceIdInfo and named by getInstanceIdName. MAX_INSTANCE_ID is the
// highest id in use and is what getMaxInstanceId reports.
// #string_id_map#
private static final int
Id_lastIndex = 1,
Id_source = 2,
Id_global = 3,
Id_ignoreCase = 4,
Id_multiline = 5,
MAX_INSTANCE_ID = 5;
/** Returns the highest instance property id defined by this class. */
@Override
protected int getMaxInstanceId() {
    return MAX_INSTANCE_ID;
}
/*
 * Map an instance property name to its id and attributes.
 *
 * The generated section picks a candidate by length and first character and
 * then verifies it with a full string comparison. Unknown names are
 * delegated to the superclass. lastIndex is PERMANENT | DONTENUM; source,
 * global, ignoreCase and multiline are additionally READONLY.
 */
@Override
protected int findInstanceIdInfo(String s)
{
int id;
// #generated# Last update: 2007-05-09 08:16:24 EDT
L0: { id = 0; String X = null; int c;
int s_length = s.length();
if (s_length==6) {
c=s.charAt(0);
if (c=='g') { X="global";id=Id_global; }
else if (c=='s') { X="source";id=Id_source; }
}
else if (s_length==9) {
c=s.charAt(0);
if (c=='l') { X="lastIndex";id=Id_lastIndex; }
else if (c=='m') { X="multiline";id=Id_multiline; }
}
else if (s_length==10) { X="ignoreCase";id=Id_ignoreCase; }
if (X!=null && X!=s && !X.equals(s)) id = 0;
break L0;
}
// #/generated#
// #/string_id_map#
// id == 0 means the name is not one of ours.
if (id == 0) return super.findInstanceIdInfo(s);
int attr;
switch (id) {
case Id_lastIndex:
attr = PERMANENT | DONTENUM;
break;
case Id_source:
case Id_global:
case Id_ignoreCase:
case Id_multiline:
attr = PERMANENT | READONLY | DONTENUM;
break;
default:
throw new IllegalStateException();
}
return instanceIdInfo(attr, id);
}
/**
 * Returns the property name for an instance id defined by this class and
 * defers to the superclass for any other id.
 */
@Override
protected String getInstanceIdName(int id)
{
    final String name;
    switch (id) {
        case Id_lastIndex:
            name = "lastIndex";
            break;
        case Id_source:
            name = "source";
            break;
        case Id_global:
            name = "global";
            break;
        case Id_ignoreCase:
            name = "ignoreCase";
            break;
        case Id_multiline:
            name = "multiline";
            break;
        default:
            name = super.getInstanceIdName(id);
            break;
    }
    return name;
}
/**
 * Returns the current value of an instance property: lastIndex as a
 * wrapped number, source as a fresh String, and the three flag properties
 * as wrapped booleans derived from re.flags. Other ids go to the
 * superclass.
 */
@Override
protected Object getInstanceIdValue(int id)
{
    final int flagMask;
    switch (id) {
        case Id_lastIndex:
            return ScriptRuntime.wrapNumber(lastIndex);
        case Id_source:
            return new String(re.source);
        case Id_global:
            flagMask = JSREG_GLOB;
            break;
        case Id_ignoreCase:
            flagMask = JSREG_FOLD;
            break;
        case Id_multiline:
            flagMask = JSREG_MULTILINE;
            break;
        default:
            return super.getInstanceIdValue(id);
    }
    return ScriptRuntime.wrapBoolean((re.flags & flagMask) != 0);
}
/**
 * Stores a new value for an instance property. Only lastIndex is handled
 * here (converted with ScriptRuntime.toNumber); every other id is passed
 * to the superclass.
 */
@Override
protected void setInstanceIdValue(int id, Object value)
{
    if (id != Id_lastIndex) {
        super.setInstanceIdValue(id, value);
        return;
    }
    lastIndex = ScriptRuntime.toNumber(value);
}
/**
 * Registers the prototype method identified by id under its script name
 * and arity.
 *
 * @throws IllegalArgumentException if id is not a known prototype id
 */
@Override
protected void initPrototypeId(int id)
{
    final String name;
    final int arity;
    switch (id) {
        case Id_compile:
            name = "compile";
            arity = 1;
            break;
        case Id_toString:
            name = "toString";
            arity = 0;
            break;
        case Id_toSource:
            name = "toSource";
            arity = 0;
            break;
        case Id_exec:
            name = "exec";
            arity = 1;
            break;
        case Id_test:
            name = "test";
            arity = 1;
            break;
        case Id_prefix:
            name = "prefix";
            arity = 1;
            break;
        default:
            throw new IllegalArgumentException(String.valueOf(id));
    }
    initPrototypeMethod(REGEXP_TAG, id, name, arity);
}
/**
 * Dispatches a call to one of the RegExp prototype methods. Functions not
 * tagged with REGEXP_TAG are forwarded to the superclass. "test" always
 * yields Boolean.TRUE or Boolean.FALSE.
 *
 * @throws IllegalArgumentException if the method id is not recognised
 */
@Override
public Object execIdCall(IdFunctionObject f, Context cx, Scriptable scope,
                         Scriptable thisObj, Object[] args)
{
    if (!f.hasTag(REGEXP_TAG)) {
        return super.execIdCall(f, cx, scope, thisObj, args);
    }
    final int id = f.methodId();
    switch (id) {
        case Id_compile:
            return realThis(thisObj, f).compile(cx, scope, args);

        case Id_toString:
        case Id_toSource:
            return realThis(thisObj, f).toString();

        case Id_exec:
            return realThis(thisObj, f).execSub(cx, scope, args, MATCH);

        case Id_test: {
            Object outcome = realThis(thisObj, f).execSub(cx, scope, args, TEST);
            return Boolean.valueOf(Boolean.TRUE.equals(outcome));
        }

        case Id_prefix:
            return realThis(thisObj, f).execSub(cx, scope, args, PREFIX);

        default:
            throw new IllegalArgumentException(String.valueOf(id));
    }
}
/*
 * Return thisObj as a NativeRegExp, or throw the incompatible-call error
 * for f when it is anything else.
 */
private static NativeRegExp realThis(Scriptable thisObj, IdFunctionObject f)
{
    if (thisObj instanceof NativeRegExp) {
        return (NativeRegExp)thisObj;
    }
    throw incompatibleCallError(f);
}
// #string_id_map#
/*
 * Map a prototype method name to its id, or 0 when the name is unknown.
 * The generated section picks a candidate by length (and one
 * distinguishing character where needed) and then verifies it with a full
 * string comparison.
 */
@Override
protected int findPrototypeId(String s)
{
int id;
// #generated# Last update: 2007-05-09 08:16:24 EDT
L0: { id = 0; String X = null; int c;
L: switch (s.length()) {
case 4: c=s.charAt(0);
if (c=='e') { X="exec";id=Id_exec; }
else if (c=='t') { X="test";id=Id_test; }
break L;
case 6: X="prefix";id=Id_prefix; break L;
case 7: X="compile";id=Id_compile; break L;
case 8: c=s.charAt(3);
if (c=='o') { X="toSource";id=Id_toSource; }
else if (c=='t') { X="toString";id=Id_toString; }
break L;
}
if (X!=null && X!=s && !X.equals(s)) id = 0;
break L0;
}
// #/generated#
return id;
}
// Ids of the prototype methods, as returned by findPrototypeId and consumed
// by initPrototypeId and execIdCall.
private static final int
Id_compile = 1,
Id_toString = 2,
Id_toSource = 3,
Id_exec = 4,
Id_test = 5,
Id_prefix = 6,
MAX_PROTOTYPE_ID = 6;
// #/string_id_map#
// Compiled form of this regular expression (source, flags, bytecode, classes).
private RECompiled re;
double lastIndex; /* index after last match, for //g iterator */
} // class NativeRegExp
/**
 * The compiled form of a regular expression: its source text, flags,
 * bytecode program and the character classes the program refers to.
 */
class RECompiled implements Serializable
{
    static final long serialVersionUID = -6144956577595844213L;

    /** locked source string, sans // */
    char[] source;

    /** number of parenthesized submatches */
    int parenCount;

    /** flags */
    int flags;

    /** regular expression bytecode */
    byte[] program;

    /** count [...] bitmaps */
    int classCount;

    /** list of [...] bitmaps */
    RECharSet[] classList;

    /** if >= 0, then re starts with this literal char */
    int anchorCh = -1;
}
/**
 * A node of the parsed regular expression tree. Which of the fields are
 * meaningful depends on the node's op.
 */
class RENode {

    RENode(byte op) {
        this.op = op;
    }

    /** r.e. op bytecode */
    byte op;
    /** next in concatenation order */
    RENode next;
    /** first operand */
    RENode kid;
    /** second operand */
    RENode kid2;

    /** could be a number */
    int num;
    /** or a parenthesis index */
    int parenIndex;

    // --- or a range (quantifier) ---
    int min;
    int max;
    int parenCount;
    boolean greedy;

    // --- or a character class ---
    int startIndex;
    /** length of string at kid, in chars */
    int kidlen;
    /** bitmap size, based on max char code */
    int bmsize;
    /** index into class list */
    int index;

    // --- or a literal sequence ---
    /** of one character */
    char chr;
    /** or many (via the index) */
    int length;
    /** which is -1 if not sourced */
    int flatIndex;
}
/**
 * Mutable state carried through the compilation of one regular
 * expression: the source buffer with its cursor and limit, the flags, and
 * the counters accumulated while parsing.
 */
class CompilerState {

    CompilerState(Context cx, char[] source, int length, int flags) {
        this.cx = cx;
        this.flags = flags;

        // source buffer: cursor at the beginning, limit at 'length'
        this.cpbegin = source;
        this.cpend = length;
        this.cp = 0;

        // counters start from zero
        this.parenCount = 0;
        this.classCount = 0;
        this.progLength = 0;
    }

    Context cx;
    char[] cpbegin;
    int cpend;
    int cp;
    int flags;
    int parenCount;
    int parenNesting;
    /** number of [] encountered */
    int classCount;
    /** estimated bytecode length */
    int progLength;
    RENode result;
}
/**
 * One frame of the matcher's program-state stack: quantifier bounds, the
 * input position at the time of the push, the continuation to restore and
 * an optional backtrack-stack entry.
 */
class REProgState
{
    REProgState(REProgState previous, int min, int max, int index,
                REBackTrackData backTrack,
                int continuation_pc, int continuation_op)
    {
        this.previous = previous;
        this.backTrack = backTrack;
        this.min = min;
        this.max = max;
        this.index = index;
        this.continuation_pc = continuation_pc;
        this.continuation_op = continuation_op;
    }

    /** previous state in stack */
    REProgState previous;

    /** current quantifier min */
    int min;
    /** current quantifier max */
    int max;
    /** progress in text */
    int index;
    int continuation_op;
    int continuation_pc;
    /** used by ASSERT_ to recover state */
    REBackTrackData backTrack;
}
/**
 * A backtrack-stack entry: the op/pc to resume at together with a
 * snapshot of the matcher state (captures, input position, program-state
 * stack top) taken from the REGlobalData at construction time.
 */
class REBackTrackData {

    REBackTrackData(REGlobalData gData, int op, int pc) {
        // where to resume
        continuation_op = op;
        continuation_pc = pc;

        // link to the entry below and snapshot the matcher state
        previous = gData.backTrackStackTop;
        lastParen = gData.lastParen;
        long[] liveParens = gData.parens;
        if (liveParens != null) {
            // private copy, so later capture updates do not affect the snapshot
            parens = liveParens.clone();
        }
        cp = gData.cp;
        stateStackTop = gData.stateStackTop;
    }

    REBackTrackData previous;

    /** where to backtrack to */
    int continuation_op;
    int continuation_pc;
    int lastParen;
    /** parenthesis captures */
    long[] parens;
    /** char buffer index */
    int cp;
    /** state of op that backtracked */
    REProgState stateStackTop;
}
/**
 * Per-match working state of the regexp interpreter.
 *
 * Each capture is packed into one long of {@code parens}: the start index
 * in the low 32 bits and the length in the high 32 bits.
 */
class REGlobalData {
    boolean multiline;
    /** the RE in execution */
    RECompiled regexp;
    /** highest paren set so far */
    int lastParen;
    /** chars skipped anchoring this r.e. */
    int skipped;

    /** char buffer index */
    int cp;
    /** parens captures */
    long[] parens;

    /** stack of state of current ancestors */
    REProgState stateStackTop;

    /** last matched-so-far position */
    REBackTrackData backTrackStackTop;

    /**
     * Get start of parenthesis capture contents, -1 for empty.
     */
    int parens_index(int i) {
        long packed = parens[i];
        return (int) packed;
    }

    /**
     * Get length of parenthesis capture contents.
     */
    int parens_length(int i) {
        long packed = parens[i];
        return (int) (packed >>> 32);
    }

    /**
     * Store the start index and length of capture {@code i}.
     */
    void set_parens(int i, int index, int length) {
        long low = index & 0xffffffffL;
        long high = ((long) length) << 32;
        parens[i] = low | high;
    }
}
/*
 * This class holds a bitmap representation of a class from a regexp.
 * A list of these is referenced by the classList field of RECompiled.
 * The initial state has startIndex set to the offset in the original
 * regexp source of the beginning of the class contents. The first use of
 * the class converts the source representation into a bitmap.
 *
 * The bitmap state (converted, sense, bits) is transient, so it is not
 * part of the serialized form.
 */
final class RECharSet implements Serializable
{
    static final long serialVersionUID = 7931787979395898394L;

    RECharSet(int length, int startIndex, int strlength) {
        this.length = length;
        this.startIndex = startIndex;
        this.strlength = strlength;
    }

    int length;
    int startIndex;
    int strlength;

    transient volatile boolean converted;
    transient volatile boolean sense;
    transient volatile byte[] bits;
}