All downloads are free. Search and download functionality uses the official Maven repository.

org.mozilla.javascript.regexp.NativeRegExp Maven / Gradle / Ivy

Go to download

Rhino is an open-source implementation of JavaScript written entirely in Java. It is typically embedded into Java applications to provide scripting to end users.

The newest version!
/* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

package org.mozilla.javascript.regexp;

import java.io.Serializable;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.IdFunctionObject;
import org.mozilla.javascript.IdScriptableObject;
import org.mozilla.javascript.Kit;
import org.mozilla.javascript.NativeObject;
import org.mozilla.javascript.ScriptRuntime;
import org.mozilla.javascript.ScriptRuntimeES6;
import org.mozilla.javascript.Scriptable;
import org.mozilla.javascript.ScriptableObject;
import org.mozilla.javascript.Symbol;
import org.mozilla.javascript.SymbolKey;
import org.mozilla.javascript.TopLevel;
import org.mozilla.javascript.Undefined;

/**
 * This class implements the RegExp native object.
 *
 * <p>Revision History: Implementation in C by Brendan Eich. Initial port to Java by Norris Boyd
 * from jsregexp.c version 1.36. Merged up to version 1.38, which included Unicode support. Merged
 * bug fixes in version 1.39. Merged JSFUN13_BRANCH changes up to 1.32.2.13.
 *
 * @author Brendan Eich
 * @author Norris Boyd
 */
public class NativeRegExp extends IdScriptableObject {
    private static final long serialVersionUID = 4965263491464903264L;

    private static final Object REGEXP_TAG = new Object();

    // Flag bits stored in the compiled regexp's flags field; each one maps to the
    // single-letter flag named in its trailing comment.
    public static final int JSREG_GLOB = 0x1; // 'g' flag: global
    public static final int JSREG_FOLD = 0x2; // 'i' flag: fold
    public static final int JSREG_MULTILINE = 0x4; // 'm' flag: multiline
    public static final int JSREG_DOTALL = 0x8; // 's' flag: dotAll
    public static final int JSREG_STICKY = 0x10; // 'y' flag: sticky

    // type of match to perform
    public static final int TEST = 0;
    public static final int MATCH = 1;
    public static final int PREFIX = 2;

    // Compile-time switch guarding the System.out diagnostics in this class.
    private static final boolean debug = false;

    // Regexp bytecode opcodes. Values are not contiguous: the gaps belong to opcodes
    // that are commented out below or not declared here.
    private static final byte REOP_SIMPLE_START = 1; /* start of 'simple opcodes' */
    private static final byte REOP_EMPTY = 1; /* match rest of input against rest of r.e. */
    private static final byte REOP_BOL = 2; /* beginning of input (or line if multiline) */
    private static final byte REOP_EOL = 3; /* end of input (or line if multiline) */
    private static final byte REOP_WBDRY = 4; /* match "" at word boundary */
    private static final byte REOP_WNONBDRY = 5; /* match "" at word non-boundary */
    private static final byte REOP_DOT = 6; /* stands for any character */
    private static final byte REOP_DIGIT = 7; /* match a digit char: [0-9] */
    private static final byte REOP_NONDIGIT = 8; /* match a non-digit char: [^0-9] */
    private static final byte REOP_ALNUM = 9; /* match an alphanumeric char: [0-9a-z_A-Z] */
    private static final byte REOP_NONALNUM = 10; /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
    private static final byte REOP_SPACE = 11; /* match a whitespace char */
    private static final byte REOP_NONSPACE = 12; /* match a non-whitespace char */
    private static final byte REOP_BACKREF = 13; /* back-reference (e.g., \1) to a parenthetical */
    private static final byte REOP_FLAT = 14; /* match a flat string */
    private static final byte REOP_FLAT1 = 15; /* match a single char */
    private static final byte REOP_FLATi = 16; /* case-independent REOP_FLAT */
    private static final byte REOP_FLAT1i = 17; /* case-independent REOP_FLAT1 */
    private static final byte REOP_UCFLAT1 = 18; /* single Unicode char */
    private static final byte REOP_UCFLAT1i = 19; /* case-independent REOP_UCFLAT1 */
    //    private static final byte REOP_UCFLAT = 20; /* flat Unicode string; len immediate
    // counts chars */
    //    private static final byte REOP_UCFLATi = 21; /* case-independent REOP_UCFLAT */
    private static final byte REOP_CLASS = 22; /* character class with index */
    private static final byte REOP_NCLASS = 23; /* negated character class with index */
    private static final byte REOP_SIMPLE_END = 23; /* end of 'simple opcodes' */
    private static final byte REOP_QUANT = 25; /* quantified atom: atom{1,2} */
    private static final byte REOP_STAR = 26; /* zero or more occurrences of kid */
    private static final byte REOP_PLUS = 27; /* one or more occurrences of kid */
    private static final byte REOP_OPT = 28; /* optional subexpression in kid */
    private static final byte REOP_LPAREN = 29; /* left paren bytecode: kid is u.num'th sub-regexp */
    private static final byte REOP_RPAREN = 30; /* right paren bytecode */
    private static final byte REOP_ALT = 31; /* alternative subexpressions in kid and next */
    private static final byte REOP_JUMP = 32; /* for deoptimized closure loops */
    //    private static final byte REOP_DOTSTAR = 33; /* optimize .* to use a single opcode
    // */
    //    private static final byte REOP_ANCHOR = 34; /* like .* but skips left context to
    // unanchored r.e. */
    //    private static final byte REOP_EOLONLY = 35; /* $ not preceded by any pattern */
    //    private static final byte REOP_BACKREFi = 37; /* case-independent REOP_BACKREF */
    //    private static final byte REOP_LPARENNON = 40; /* non-capturing version of REOP_LPAREN
    // */
    private static final byte REOP_ASSERT = 41; /* zero width positive lookahead assertion */
    private static final byte REOP_ASSERT_NOT = 42; /* zero width negative lookahead assertion */
    private static final byte REOP_ASSERTTEST = 43; /* sentinel at end of assertion child */
    private static final byte REOP_ASSERTNOTTEST = 44; /* sentinel at end of !assertion child */
    private static final byte REOP_MINIMALSTAR = 45; /* non-greedy version of * */
    private static final byte REOP_MINIMALPLUS = 46; /* non-greedy version of + */
    private static final byte REOP_MINIMALOPT = 47; /* non-greedy version of ?
*/
    private static final byte REOP_MINIMALQUANT = 48; /* non-greedy version of {} */
    private static final byte REOP_ENDCHILD = 49; /* sentinel at end of quantifier child */
    private static final byte REOP_REPEAT = 51; /* directs execution of greedy quantifier */
    private static final byte REOP_MINIMALREPEAT = 52; /* directs execution of non-greedy quantifier */
    private static final byte REOP_ALTPREREQ = 53; /* prerequisite for ALT, either of two chars */
    private static final byte REOP_ALTPREREQi = 54; /* case-independent REOP_ALTPREREQ */
    private static final byte REOP_ALTPREREQ2 = 55; /* prerequisite for ALT, a char or a class */
    //    private static final byte REOP_ENDALT = 56; /* end of final alternate */
    private static final byte REOP_END = 57;

    // Value stored in anchorCh when the compiled program starts with REOP_BOL
    // (or is an ALT whose two alternatives both start with REOP_BOL).
    private static final int ANCHOR_BOL = -2;

    /**
     * Installs the {@code RegExp} constructor and its prototype object into {@code scope}.
     *
     * <p>The prototype is created for the context's language version, given a compiled empty
     * pattern, and linked to the scope's Object prototype. A {@code NativeRegExpCtor} is then
     * created, exposed as the prototype's non-enumerable {@code constructor} property, and
     * defined on {@code scope} under the name {@code "RegExp"}.
     *
     * @param cx the current context; supplies the language version and is passed to the compiler
     * @param scope the scope that receives the {@code RegExp} property
     * @param sealed when true, both the prototype and the constructor are sealed before the
     *     constructor is defined on {@code scope}
     */
    public static void init(Context cx, Scriptable scope, boolean sealed) {
        NativeRegExp proto = NativeRegExpInstantiator.withLanguageVersion(cx.getLanguageVersion());
        // The prototype itself carries a compiled empty pattern.
        proto.re = compileRE(cx, "", null, false);
        proto.activatePrototypeMap(MAX_PROTOTYPE_ID);
        proto.setParentScope(scope);
        proto.setPrototype(getObjectPrototype(scope));

        NativeRegExpCtor ctor = new NativeRegExpCtor();
        // Bug #324006: ECMA-262 15.10.6.1 says "The initial value of
        // RegExp.prototype.constructor is the builtin RegExp constructor."
        proto.defineProperty("constructor", ctor, ScriptableObject.DONTENUM);

        ScriptRuntime.setFunctionProtoAndParent(ctor, cx, scope);

        ctor.setImmunePrototypeProperty(proto);

        if (sealed) {
            proto.sealObject();
            ctor.sealObject();
        }

        defineProperty(scope, "RegExp", ctor, ScriptableObject.DONTENUM);
        // NOTE(review): presumably installs the Symbol.species accessor on the constructor;
        // confirm against ScriptRuntimeES6.
        ScriptRuntimeES6.addSymbolSpecies(cx, scope, ctor);
    }

    /**
     * Creates a RegExp instance around an already compiled pattern, resets {@code lastIndex} to
     * zero and attaches the built-in RegExp prototype and parent scope.
     *
     * @param scope scope used to look up the built-in RegExp prototype
     * @param regexpCompiled the compiled pattern this object wraps
     */
    NativeRegExp(Scriptable scope, RECompiled regexpCompiled) {
        this.re = regexpCompiled;
        setLastIndex(ScriptRuntime.zeroObj);
        ScriptRuntime.setBuiltinProtoAndParent(this, scope, TopLevel.Builtins.RegExp);
    }

    /** Returns the class name of this object, always {@code "RegExp"}. */
    @Override
    public String getClassName() {
        return "RegExp";
    }

    /**
     * Gets the value to be returned by the typeof operator called on this object.
* * @see org.mozilla.javascript.ScriptableObject#getTypeOf() * @return "object" */ @Override public String getTypeOf() { return "object"; } Scriptable compile(Context cx, Scriptable scope, Object[] args) { if (args.length > 0 && args[0] instanceof NativeRegExp) { if (args.length > 1 && args[1] != Undefined.instance) { // report error throw ScriptRuntime.typeErrorById("msg.bad.regexp.compile"); } NativeRegExp thatObj = (NativeRegExp) args[0]; this.re = thatObj.re; setLastIndex(thatObj.lastIndex); return this; } String s = args.length == 0 || args[0] instanceof Undefined ? "" : escapeRegExp(args[0]); String global = args.length > 1 && args[1] != Undefined.instance ? ScriptRuntime.toString(args[1]) : null; this.re = compileRE(cx, s, global, false); setLastIndex(ScriptRuntime.zeroObj); return this; } @Override public String toString() { StringBuilder buf = new StringBuilder(); buf.append('/'); if (re.source.length != 0) { buf.append(re.source); } else { // See bugzilla 226045 buf.append("(?:)"); } buf.append('/'); appendFlags(buf); return buf.toString(); } private void appendFlags(StringBuilder buf) { if ((re.flags & JSREG_GLOB) != 0) buf.append('g'); if ((re.flags & JSREG_FOLD) != 0) buf.append('i'); if ((re.flags & JSREG_MULTILINE) != 0) buf.append('m'); if ((re.flags & JSREG_DOTALL) != 0) buf.append('s'); if ((re.flags & JSREG_STICKY) != 0) buf.append('y'); } NativeRegExp() {} private static RegExpImpl getImpl(Context cx) { return (RegExpImpl) ScriptRuntime.getRegExpProxy(cx); } private static String escapeRegExp(Object src) { String s = ScriptRuntime.toString(src); // Escape any naked slashes in regexp source, see bug #510265 StringBuilder sb = null; // instantiated only if necessary int start = 0; int slash = s.indexOf('/'); while (slash > -1) { if (slash == start || s.charAt(slash - 1) != '\\') { if (sb == null) { sb = new StringBuilder(); } sb.append(s, start, slash); sb.append("\\/"); start = slash + 1; } slash = s.indexOf('/', slash + 1); } if (sb != null) { 
sb.append(s, start, s.length()); s = sb.toString(); } return s; } Object execSub(Context cx, Scriptable scopeObj, Object[] args, int matchType) { RegExpImpl reImpl = getImpl(cx); String str; if (args.length == 0) { str = reImpl.input; if (str == null) { str = ScriptRuntime.toString(Undefined.instance); } } else { str = ScriptRuntime.toString(args[0]); } boolean globalOrSticky = (re.flags & JSREG_GLOB) != 0 || (re.flags & JSREG_STICKY) != 0; double d = 0; if (globalOrSticky) { d = ScriptRuntime.toInteger(lastIndex); if (d < 0 || str.length() < d) { setLastIndex(ScriptRuntime.zeroObj); return null; } } int[] indexp = {(int) d}; Object rval = executeRegExp(cx, scopeObj, reImpl, str, indexp, matchType); if (globalOrSticky) { if (rval == null || rval == Undefined.instance) { setLastIndex(ScriptRuntime.zeroObj); } else { setLastIndex(Double.valueOf(indexp[0])); } } return rval; } static RECompiled compileRE(Context cx, String str, String global, boolean flat) { RECompiled regexp = new RECompiled(str); int length = str.length(); int flags = 0; if (global != null) { for (int i = 0; i < global.length(); i++) { char c = global.charAt(i); int f = 0; if (c == 'g') { f = JSREG_GLOB; } else if (c == 'i') { f = JSREG_FOLD; } else if (c == 'm') { f = JSREG_MULTILINE; } else if (c == 's') { f = JSREG_DOTALL; } else if (c == 'y') { f = JSREG_STICKY; } else { reportError("msg.invalid.re.flag", String.valueOf(c)); } if ((flags & f) != 0) { reportError("msg.invalid.re.flag", String.valueOf(c)); } flags |= f; } } regexp.flags = flags; CompilerState state = new CompilerState(cx, regexp.source, length, flags); if (flat && length > 0) { if (debug) { System.out.println("flat = \"" + str + "\""); } state.result = new RENode(REOP_FLAT); state.result.chr = state.cpbegin[0]; state.result.length = length; state.result.flatIndex = 0; state.progLength += 5; } else { if (!parseDisjunction(state)) return null; // Need to reparse if pattern contains invalid backreferences: // "Note: if the number of 
left parentheses is less than the number // specified in \#, the \# is taken as an octal escape" if (state.maxBackReference > state.parenCount) { state = new CompilerState(cx, regexp.source, length, flags); state.backReferenceLimit = state.parenCount; if (!parseDisjunction(state)) return null; } } regexp.program = new byte[state.progLength + 1]; if (state.classCount != 0) { regexp.classList = new RECharSet[state.classCount]; regexp.classCount = state.classCount; } int endPC = emitREBytecode(state, regexp, 0, state.result); regexp.program[endPC++] = REOP_END; if (debug) { System.out.println("Prog. length = " + endPC); for (int i = 0; i < endPC; i++) { System.out.print(regexp.program[i]); if (i < (endPC - 1)) System.out.print(", "); } System.out.println(); } regexp.parenCount = state.parenCount; // If re starts with literal, init anchorCh accordingly switch (regexp.program[0]) { case REOP_UCFLAT1: case REOP_UCFLAT1i: regexp.anchorCh = (char) getIndex(regexp.program, 1); break; case REOP_FLAT1: case REOP_FLAT1i: regexp.anchorCh = (char) (regexp.program[1] & 0xFF); break; case REOP_FLAT: case REOP_FLATi: int k = getIndex(regexp.program, 1); regexp.anchorCh = regexp.source[k]; break; case REOP_BOL: regexp.anchorCh = ANCHOR_BOL; break; case REOP_ALT: RENode n = state.result; if (n.kid.op == REOP_BOL && n.kid2.op == REOP_BOL) { regexp.anchorCh = ANCHOR_BOL; } break; } if (debug) { if (regexp.anchorCh >= 0) { System.out.println("Anchor ch = '" + (char) regexp.anchorCh + "'"); } } return regexp; } static boolean isDigit(char c) { return '0' <= c && c <= '9'; } private static boolean isWord(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || isDigit(c) || c == '_'; } private static boolean isControlLetter(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); } private static boolean isLineTerm(char c) { return ScriptRuntime.isJSLineTerminator(c); } private static boolean isREWhiteSpace(int c) { return 
ScriptRuntime.isJSWhitespaceOrLineTerminator(c); } /* * * 1. If IgnoreCase is false, return ch. * 2. Let u be ch converted to upper case as if by calling * String.prototype.toUpperCase on the one-character string ch. * 3. If u does not consist of a single character, return ch. * 4. Let cu be u's character. * 5. If ch's code point value is greater than or equal to decimal 128 and cu's * code point value is less than decimal 128, then return ch. * 6. Return cu. */ private static char upcase(char ch) { if (ch < 128) { if ('a' <= ch && ch <= 'z') { return (char) (ch + ('A' - 'a')); } return ch; } char cu = Character.toUpperCase(ch); return (cu < 128) ? ch : cu; } private static char downcase(char ch) { if (ch < 128) { if ('A' <= ch && ch <= 'Z') { return (char) (ch + ('a' - 'A')); } return ch; } char cl = Character.toLowerCase(ch); return (cl < 128) ? ch : cl; } /* * Validates and converts hex ascii value. */ private static int toASCIIHexDigit(int c) { if (c < '0') return -1; if (c <= '9') { return c - '0'; } c |= 0x20; if ('a' <= c && c <= 'f') { return c - 'a' + 10; } return -1; } /* * Top-down regular expression grammar, based closely on Perl4. * * regexp: altern A regular expression is one or more * altern '|' regexp alternatives separated by vertical bar. */ private static boolean parseDisjunction(CompilerState state) { if (!parseAlternative(state)) return false; char[] source = state.cpbegin; int index = state.cp; if (index != source.length && source[index] == '|') { RENode result; ++state.cp; result = new RENode(REOP_ALT); result.kid = state.result; if (!parseDisjunction(state)) return false; result.kid2 = state.result; state.result = result; /* * Look at both alternates to see if there's a FLAT or a CLASS at * the start of each. If so, use a prerequisite match. */ if (result.kid.op == REOP_FLAT && result.kid2.op == REOP_FLAT) { result.op = (state.flags & JSREG_FOLD) == 0 ? 
REOP_ALTPREREQ : REOP_ALTPREREQi; result.chr = result.kid.chr; result.index = result.kid2.chr; /* ALTPREREQ, uch1, uch2, , ..., JUMP, ... JUMP, */ state.progLength += 13; } else if (result.kid.op == REOP_CLASS && result.kid.index < 256 && result.kid2.op == REOP_FLAT && (state.flags & JSREG_FOLD) == 0) { result.op = REOP_ALTPREREQ2; result.chr = result.kid2.chr; result.index = result.kid.index; /* ALTPREREQ2, uch1, uch2, , ..., JUMP, ... JUMP, */ state.progLength += 13; } else if (result.kid.op == REOP_FLAT && result.kid2.op == REOP_CLASS && result.kid2.index < 256 && (state.flags & JSREG_FOLD) == 0) { result.op = REOP_ALTPREREQ2; result.chr = result.kid.chr; result.index = result.kid2.index; /* ALTPREREQ2, uch1, uch2, , ..., JUMP, ... JUMP, */ state.progLength += 13; } else { /* ALT, , ..., JUMP, ... JUMP, */ state.progLength += 9; } } return true; } /* * altern: item An alternative is one or more items, * item altern concatenated together. */ private static boolean parseAlternative(CompilerState state) { RENode headTerm = null; RENode tailTerm = null; char[] source = state.cpbegin; while (true) { if (state.cp == state.cpend || source[state.cp] == '|' || (state.parenNesting != 0 && source[state.cp] == ')')) { if (headTerm == null) { state.result = new RENode(REOP_EMPTY); } else state.result = headTerm; return true; } if (!parseTerm(state)) return false; if (headTerm == null) { headTerm = state.result; tailTerm = headTerm; } else tailTerm.next = state.result; while (tailTerm.next != null) tailTerm = tailTerm.next; } } /* calculate the total size of the bitmap required for a class expression */ private static boolean calculateBitmapSize( CompilerState state, RENode target, char[] src, int index, int end) { char rangeStart = 0; char c; int n; int nDigits; int i; int max = 0; boolean inRange = false; target.bmsize = 0; target.sense = true; if (index == end) return true; if (src[index] == '^') { ++index; target.sense = false; } while (index != end) { int localMax = 0; 
nDigits = 2; switch (src[index]) { case '\\': ++index; c = src[index++]; switch (c) { case 'b': localMax = 0x8; break; case 'f': localMax = 0xC; break; case 'n': localMax = 0xA; break; case 'r': localMax = 0xD; break; case 't': localMax = 0x9; break; case 'v': localMax = 0xB; break; case 'c': if ((index < end) && isControlLetter(src[index])) localMax = (char) (src[index++] & 0x1F); else --index; localMax = '\\'; break; case 'u': nDigits += 2; // fall through case 'x': n = 0; for (i = 0; (i < nDigits) && (index < end); i++) { c = src[index++]; n = Kit.xDigitToInt(c, n); if (n < 0) { // Back off to accepting the original // '\' as a literal index -= (i + 1); n = '\\'; break; } } localMax = n; break; case 'd': if (inRange) { target.bmsize = 65536; return true; } localMax = '9'; break; case 'D': case 'w': case 'W': case 'S': case 's': target.bmsize = 65536; return true; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': /* * This is a non-ECMA extension - decimal escapes (in this * case, octal!) are supposed to be an error inside class * ranges, but supported here for backwards compatibility. * */ n = (c - '0'); c = src[index]; if ('0' <= c && c <= '7') { index++; n = 8 * n + (c - '0'); c = src[index]; if ('0' <= c && c <= '7') { index++; i = 8 * n + (c - '0'); if (i <= 0377) n = i; else index--; } } localMax = n; break; default: localMax = c; break; } break; default: localMax = src[index++]; break; } if (inRange) { if (rangeStart > localMax) { reportError("msg.bad.range", ""); return false; } inRange = false; } else { if (index < (end - 1)) { if (src[index] == '-') { ++index; inRange = true; rangeStart = (char) localMax; continue; } } } if ((state.flags & JSREG_FOLD) != 0) { char cu = upcase((char) localMax); char cd = downcase((char) localMax); localMax = (cu >= cd) ? 
cu : cd; } if (localMax > max) max = localMax; } target.bmsize = max + 1; return true; } /* * item: assertion An item is either an assertion or * quantatom a quantified atom. * * assertion: '^' Assertions match beginning of string * (or line if the class static property * RegExp.multiline is true). * '$' End of string (or line if the class * static property RegExp.multiline is * true). * '\b' Word boundary (between \w and \W). * '\B' Word non-boundary. * * quantatom: atom An unquantified atom. * quantatom '{' n ',' m '}' * Atom must occur between n and m times. * quantatom '{' n ',' '}' Atom must occur at least n times. * quantatom '{' n '}' Atom must occur exactly n times. * quantatom '*' Zero or more times (same as {0,}). * quantatom '+' One or more times (same as {1,}). * quantatom '?' Zero or one time (same as {0,1}). * * any of which can be optionally followed by '?' for ungreedy * * atom: '(' regexp ')' A parenthesized regexp (what matched * can be addressed using a backreference, * see '\' n below). * '.' Matches any char except '\n'. * '[' classlist ']' A character class. * '[' '^' classlist ']' A negated character class. * '\f' Form Feed. * '\n' Newline (Line Feed). * '\r' Carriage Return. * '\t' Horizontal Tab. * '\v' Vertical Tab. * '\d' A digit (same as [0-9]). * '\D' A non-digit. * '\w' A word character, [0-9a-z_A-Z]. * '\W' A non-word character. * '\s' A whitespace character, [ \b\f\n\r\t\v]. * '\S' A non-whitespace character. * '\' n A backreference to the nth (n decimal * and positive) parenthesized expression. * '\' octal An octal escape sequence (octal must be * two or three digits long, unless it is * 0 for the null character). * '\x' hex A hex escape (hex must be two digits). * '\c' ctrl A control character, ctrl is a letter. * '\' literalatomchar Any character except one of the above * that follow '\' in an atom. * otheratomchar Any character not first among the other * atom right-hand sides. 
*/ private static void doFlat(CompilerState state, char c) { state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = -1; state.progLength += 3; } private static int getDecimalValue( char c, CompilerState state, int maxValue, String overflowMessageId) { boolean overflow = false; int start = state.cp; char[] src = state.cpbegin; int value = c - '0'; for (; state.cp != state.cpend; ++state.cp) { c = src[state.cp]; if (!isDigit(c)) { break; } if (!overflow) { int v = value * 10 + (c - '0'); if (v < maxValue) { value = v; } else { overflow = true; value = maxValue; } } } if (overflow) { reportError(overflowMessageId, String.valueOf(src, start, state.cp - start)); } return value; } private static boolean parseTerm(CompilerState state) { char[] src = state.cpbegin; char c = src[state.cp++]; int nDigits = 2; int parenBaseCount = state.parenCount; int num; RENode term; int termStart; switch (c) { /* assertions and atoms */ case '^': state.result = new RENode(REOP_BOL); state.progLength++; return true; case '$': state.result = new RENode(REOP_EOL); state.progLength++; return true; case '\\': if (state.cp < state.cpend) { c = src[state.cp++]; switch (c) { /* assertion escapes */ case 'b': state.result = new RENode(REOP_WBDRY); state.progLength++; return true; case 'B': state.result = new RENode(REOP_WNONBDRY); state.progLength++; return true; /* Decimal escape */ case '0': /* * We're deliberately violating the ECMA 5.1 specification and allow octal * escapes to follow spidermonkey and general 'web reality': * http://wiki.ecmascript.org/doku.php?id=harmony:regexp_match_web_reality * http://wiki.ecmascript.org/doku.php?id=strawman:match_web_reality_spec */ reportWarning(state.cx, "msg.bad.backref", ""); /* octal escape */ num = 0; // follow spidermonkey and allow multiple leading zeros, // e.g. 
let /\0000/ match the string "\0" while (num < 040 && state.cp < state.cpend) { c = src[state.cp]; if ((c >= '0') && (c <= '7')) { state.cp++; num = 8 * num + (c - '0'); } else break; } c = (char) (num); doFlat(state, c); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': termStart = state.cp - 1; num = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.backref"); if (num > state.backReferenceLimit) reportWarning(state.cx, "msg.bad.backref", ""); /* * n > count of parentheses, then treat as octal instead. * Also see note above concerning 'web reality' */ if (num > state.backReferenceLimit) { state.cp = termStart; if (c >= '8') { // invalid octal escape, follow spidermonkey and // treat as \\8 resp. \\9 c = '\\'; doFlat(state, c); break; } state.cp++; num = c - '0'; while (num < 040 && state.cp < state.cpend) { c = src[state.cp]; if ((c >= '0') && (c <= '7')) { state.cp++; num = 8 * num + (c - '0'); } else break; } c = (char) (num); doFlat(state, c); break; } /* otherwise, it's a back-reference */ state.result = new RENode(REOP_BACKREF); state.result.parenIndex = num - 1; state.progLength += 3; if (state.maxBackReference < num) { state.maxBackReference = num; } break; /* Control escape */ case 'f': c = 0xC; doFlat(state, c); break; case 'n': c = 0xA; doFlat(state, c); break; case 'r': c = 0xD; doFlat(state, c); break; case 't': c = 0x9; doFlat(state, c); break; case 'v': c = 0xB; doFlat(state, c); break; /* Control letter */ case 'c': if ((state.cp < state.cpend) && isControlLetter(src[state.cp])) c = (char) (src[state.cp++] & 0x1F); else { /* back off to accepting the original '\' as a literal */ --state.cp; c = '\\'; } doFlat(state, c); break; /* UnicodeEscapeSequence */ case 'u': nDigits += 2; /* fall through */ case 'x': /* HexEscapeSequence */ { int n = 0; int i; for (i = 0; (i < nDigits) && (state.cp < state.cpend); i++) { c = src[state.cp++]; n = Kit.xDigitToInt(c, n); if (n < 0) { // Back off to accepting the 
original // 'u' or 'x' as a literal state.cp -= (i + 2); n = src[state.cp++]; break; } } c = (char) (n); } doFlat(state, c); break; /* Character class escapes */ case 'd': state.result = new RENode(REOP_DIGIT); state.progLength++; break; case 'D': state.result = new RENode(REOP_NONDIGIT); state.progLength++; break; case 's': state.result = new RENode(REOP_SPACE); state.progLength++; break; case 'S': state.result = new RENode(REOP_NONSPACE); state.progLength++; break; case 'w': state.result = new RENode(REOP_ALNUM); state.progLength++; break; case 'W': state.result = new RENode(REOP_NONALNUM); state.progLength++; break; /* IdentityEscape */ default: state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = state.cp - 1; state.progLength += 3; break; } break; } /* a trailing '\' is an error */ reportError("msg.trail.backslash", ""); return false; case '(': { RENode result = null; termStart = state.cp; if (state.cp + 1 < state.cpend && src[state.cp] == '?' && ((c = src[state.cp + 1]) == '=' || c == '!' || c == ':')) { state.cp += 2; if (c == '=') { result = new RENode(REOP_ASSERT); /* ASSERT, , ... ASSERTTEST */ state.progLength += 4; } else if (c == '!') { result = new RENode(REOP_ASSERT_NOT); /* ASSERTNOT, , ... ASSERTNOTTEST */ state.progLength += 4; } } else { result = new RENode(REOP_LPAREN); /* LPAREN, , ... 
RPAREN, */ state.progLength += 6; result.parenIndex = state.parenCount++; } ++state.parenNesting; if (!parseDisjunction(state)) return false; if (state.cp == state.cpend || src[state.cp] != ')') { reportError("msg.unterm.paren", ""); return false; } ++state.cp; --state.parenNesting; if (result != null) { result.kid = state.result; state.result = result; } break; } case ')': reportError("msg.re.unmatched.right.paren", ""); return false; case '[': state.result = new RENode(REOP_CLASS); termStart = state.cp; state.result.startIndex = termStart; while (true) { if (state.cp == state.cpend) { reportError("msg.unterm.class", ""); return false; } if (src[state.cp] == '\\') state.cp++; else { if (src[state.cp] == ']') { state.result.kidlen = state.cp - termStart; break; } } state.cp++; } state.result.index = state.classCount++; /* * Call calculateBitmapSize now as we want any errors it finds * to be reported during the parse phase, not at execution. */ if (!calculateBitmapSize(state, state.result, src, termStart, state.cp++)) return false; state.progLength += 3; /* CLASS, */ break; case '.': state.result = new RENode(REOP_DOT); state.progLength++; break; case '*': case '+': case '?': reportError("msg.bad.quant", String.valueOf(src[state.cp - 1])); return false; default: state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = state.cp - 1; state.progLength += 3; break; } term = state.result; if (state.cp == state.cpend) { return true; } boolean hasQ = false; switch (src[state.cp]) { case '+': state.result = new RENode(REOP_QUANT); state.result.min = 1; state.result.max = -1; /* , , , ... */ state.progLength += 8; hasQ = true; break; case '*': state.result = new RENode(REOP_QUANT); state.result.min = 0; state.result.max = -1; /* , , , ... */ state.progLength += 8; hasQ = true; break; case '?': state.result = new RENode(REOP_QUANT); state.result.min = 0; state.result.max = 1; /* , , , ... 
*/ state.progLength += 8; hasQ = true; break; case '{': /* balance '}' */ { int min = 0; int max = -1; int leftCurl = state.cp; /* For Perl etc. compatibility, if quantifier does not match * \{\d+(,\d*)?\} exactly back off from it * being a quantifier, and chew it up as a literal * atom next time instead. */ if (++state.cp < src.length && isDigit(c = src[state.cp])) { ++state.cp; min = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.min"); if (state.cp < src.length) { c = src[state.cp]; if (c == ',' && ++state.cp < src.length) { c = src[state.cp]; if (isDigit(c) && ++state.cp < src.length) { max = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.max"); c = src[state.cp]; if (min > max) { String msg = ScriptRuntime.getMessageById( "msg.max.lt.min", Integer.valueOf(max), Integer.valueOf(min)); throw ScriptRuntime.constructError("SyntaxError", msg); } } } else { max = min; } /* balance '{' */ if (c == '}') { state.result = new RENode(REOP_QUANT); state.result.min = min; state.result.max = max; // QUANT, , , , // , ... 
state.progLength += 12; hasQ = true; } } } if (!hasQ) { state.cp = leftCurl; } break; } } if (!hasQ) return true; ++state.cp; state.result.kid = term; state.result.parenIndex = parenBaseCount; state.result.parenCount = state.parenCount - parenBaseCount; if ((state.cp < state.cpend) && (src[state.cp] == '?')) { ++state.cp; state.result.greedy = false; } else state.result.greedy = true; return true; } private static void resolveForwardJump(byte[] array, int from, int pc) { if (from > pc) throw Kit.codeBug(); addIndex(array, from, pc - from); } private static int getOffset(byte[] array, int pc) { return getIndex(array, pc); } private static int addIndex(byte[] array, int pc, int index) { if (index < 0) throw Kit.codeBug(); if (index > 0xFFFF) throw Context.reportRuntimeError("Too complex regexp"); array[pc] = (byte) (index >> 8); array[pc + 1] = (byte) (index); return pc + 2; } private static int getIndex(byte[] array, int pc) { return ((array[pc] & 0xFF) << 8) | (array[pc + 1] & 0xFF); } private static final int INDEX_LEN = 2; private static int emitREBytecode(CompilerState state, RECompiled re, int pc, RENode t) { RENode nextAlt; int nextAltFixup, nextTermFixup; byte[] program = re.program; while (t != null) { program[pc++] = t.op; switch (t.op) { case REOP_EMPTY: --pc; break; case REOP_ALTPREREQ: case REOP_ALTPREREQi: case REOP_ALTPREREQ2: boolean ignoreCase = t.op == REOP_ALTPREREQi; addIndex(program, pc, ignoreCase ? upcase(t.chr) : t.chr); pc += INDEX_LEN; addIndex(program, pc, ignoreCase ? 
upcase((char) t.index) : t.index); pc += INDEX_LEN; // fall through to REOP_ALT case REOP_ALT: nextAlt = t.kid2; nextAltFixup = pc; /* address of next alternate */ pc += INDEX_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_JUMP; nextTermFixup = pc; /* address of following term */ pc += INDEX_LEN; resolveForwardJump(program, nextAltFixup, pc); pc = emitREBytecode(state, re, pc, nextAlt); program[pc++] = REOP_JUMP; nextAltFixup = pc; pc += INDEX_LEN; resolveForwardJump(program, nextTermFixup, pc); resolveForwardJump(program, nextAltFixup, pc); break; case REOP_FLAT: /* * Consecutize FLAT's if possible. */ if (t.flatIndex != -1) { while ((t.next != null) && (t.next.op == REOP_FLAT) && ((t.flatIndex + t.length) == t.next.flatIndex)) { t.length += t.next.length; t.next = t.next.next; } } if ((t.flatIndex != -1) && (t.length > 1)) { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_FLATi; else program[pc - 1] = REOP_FLAT; pc = addIndex(program, pc, t.flatIndex); pc = addIndex(program, pc, t.length); } else { if (t.chr < 256) { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_FLAT1i; else program[pc - 1] = REOP_FLAT1; program[pc++] = (byte) (t.chr); } else { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_UCFLAT1i; else program[pc - 1] = REOP_UCFLAT1; pc = addIndex(program, pc, t.chr); } } break; case REOP_LPAREN: pc = addIndex(program, pc, t.parenIndex); pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_RPAREN; pc = addIndex(program, pc, t.parenIndex); break; case REOP_BACKREF: pc = addIndex(program, pc, t.parenIndex); break; case REOP_ASSERT: nextTermFixup = pc; pc += INDEX_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ASSERTTEST; resolveForwardJump(program, nextTermFixup, pc); break; case REOP_ASSERT_NOT: nextTermFixup = pc; pc += INDEX_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ASSERTNOTTEST; resolveForwardJump(program, nextTermFixup, pc); break; case 
REOP_QUANT: if ((t.min == 0) && (t.max == -1)) program[pc - 1] = (t.greedy) ? REOP_STAR : REOP_MINIMALSTAR; else if ((t.min == 0) && (t.max == 1)) program[pc - 1] = (t.greedy) ? REOP_OPT : REOP_MINIMALOPT; else if ((t.min == 1) && (t.max == -1)) program[pc - 1] = (t.greedy) ? REOP_PLUS : REOP_MINIMALPLUS; else { if (!t.greedy) program[pc - 1] = REOP_MINIMALQUANT; pc = addIndex(program, pc, t.min); // max can be -1 which addIndex does not accept pc = addIndex(program, pc, t.max + 1); } pc = addIndex(program, pc, t.parenCount); pc = addIndex(program, pc, t.parenIndex); nextTermFixup = pc; pc += INDEX_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ENDCHILD; resolveForwardJump(program, nextTermFixup, pc); break; case REOP_CLASS: if (!t.sense) program[pc - 1] = REOP_NCLASS; pc = addIndex(program, pc, t.index); re.classList[t.index] = new RECharSet( t.bmsize, t.startIndex, t.kidlen, t.sense); break; default: break; } t = t.next; } return pc; } private static void pushProgState( REGlobalData gData, int min, int max, int cp, REBackTrackData backTrackLastToSave, int continuationOp, int continuationPc) { gData.stateStackTop = new REProgState( gData.stateStackTop, min, max, cp, backTrackLastToSave, continuationOp, continuationPc); } private static REProgState popProgState(REGlobalData gData) { REProgState state = gData.stateStackTop; gData.stateStackTop = state.previous; return state; } private static void pushBackTrackState(REGlobalData gData, byte op, int pc) { REProgState state = gData.stateStackTop; gData.backTrackStackTop = new REBackTrackData( gData, op, pc, gData.cp, state.continuationOp, state.continuationPc); } private static void pushBackTrackState( REGlobalData gData, byte op, int pc, int cp, int continuationOp, int continuationPc) { gData.backTrackStackTop = new REBackTrackData(gData, op, pc, cp, continuationOp, continuationPc); } /* * Consecutive literal characters. 
*/ private static boolean flatNMatcher( REGlobalData gData, int matchChars, int length, String input, int end) { if ((gData.cp + length) > end) return false; for (int i = 0; i < length; i++) { if (gData.regexp.source[matchChars + i] != input.charAt(gData.cp + i)) { return false; } } gData.cp += length; return true; } private static boolean flatNIMatcher( REGlobalData gData, int matchChars, int length, String input, int end) { if ((gData.cp + length) > end) return false; char[] source = gData.regexp.source; for (int i = 0; i < length; i++) { char c1 = source[matchChars + i]; char c2 = input.charAt(gData.cp + i); if (c1 != c2 && upcase(c1) != upcase(c2)) { return false; } } gData.cp += length; return true; } /* 1. Evaluate DecimalEscape to obtain an EscapeValue E. 2. If E is not a character then go to step 6. 3. Let ch be E's character. 4. Let A be a one-element RECharSet containing the character ch. 5. Call CharacterSetMatcher(A, false) and return its Matcher result. 6. E must be an integer. Let n be that integer. 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. 8. Return an internal Matcher closure that takes two arguments, a State x and a Continuation c, and performs the following: 1. Let cap be x's captures internal array. 2. Let s be cap[n]. 3. If s is undefined, then call c(x) and return its result. 4. Let e be x's endIndex. 5. Let len be s's length. 6. Let f be e+len. 7. If f>InputLength, return failure. 8. If there exists an integer i between 0 (inclusive) and len (exclusive) such that Canonicalize(s[i]) is not the same character as Canonicalize(Input [e+i]), then return failure. 9. Let y be the State (f, cap). 10. Call c(y) and return its result. 
*/

    /**
     * Matches the text captured by group {@code parenIndex} against the input at
     * {@code gData.cp}, ignoring case when the regexp has the fold flag.
     *
     * @return {@code false} if there is no such group or the text does not match; {@code true}
     *     otherwise, with {@code gData.cp} advanced past the matched text. A group that has not
     *     captured anything matches without consuming input.
     */
    private static boolean backrefMatcher(
            REGlobalData gData, int parenIndex, String input, int end) {
        if (gData.parens == null || parenIndex >= gData.parens.length) {
            return false;
        }

        final int captureStart = gData.parensIndex(parenIndex);
        if (captureStart == -1) {
            // the group did not take part in the match so far
            return true;
        }

        final int captureLen = gData.parensLength(parenIndex);
        final int here = gData.cp;
        if (here + captureLen > end) {
            return false;
        }

        final boolean foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;
        if (!foldCase) {
            if (!input.regionMatches(captureStart, input, here, captureLen)) {
                return false;
            }
        } else {
            for (int k = 0; k < captureLen; k++) {
                char captured = input.charAt(captureStart + k);
                char current = input.charAt(here + k);
                if (captured != current && upcase(captured) != upcase(current)) {
                    return false;
                }
            }
        }

        gData.cp = here + captureLen;
        return true;
    }

    /**
     * Sets the bit for {@code c} in the bitmap of {@code cs}.
     *
     * @throws RuntimeException a script SyntaxError if {@code c} lies outside the set
     */
    private static void addCharacterToCharSet(RECharSet cs, char c) {
        if (c >= cs.length) {
            throw ScriptRuntime.constructError("SyntaxError", "invalid range in character class");
        }
        cs.bits[c >> 3] |= 1 << (c & 0x7);
    }

    /**
     * Sets the bits for every character from {@code c1} to {@code c2}, both inclusive, in the
     * bitmap of {@code cs}.
     *
     * @throws RuntimeException a script SyntaxError if the range is reversed or reaches outside
     *     the set
     */
    private static void addCharacterRangeToCharSet(RECharSet cs, char c1, char c2) {
        if (c2 >= cs.length || c1 > c2) {
            throw ScriptRuntime.constructError("SyntaxError", "invalid range in character class");
        }

        final int firstByte = c1 >> 3;
        final int lastByte = c2 >> 3;
        final int lowBit = c1 & 0x7;
        final int highBit = c2 & 0x7;

        if (firstByte == lastByte) {
            // the whole range lives in one byte
            cs.bits[firstByte] |= (0xFF >> (7 - (highBit - lowBit))) << lowBit;
            return;
        }

        // upper part of the first byte, all bytes in between, lower part of the last byte
        cs.bits[firstByte] |= 0xFF << lowBit;
        for (int b = firstByte + 1; b < lastByte; b++) {
            cs.bits[b] = (byte) 0xFF;
        }
        cs.bits[lastByte] |= 0xFF >> (7 - highBit);
    }

    /**
     * Builds the bitmap of {@code charSet} from the pattern source unless that has been done
     * already; the check and the conversion run while holding the monitor of {@code charSet}.
     */
    private static void processCharSet(REGlobalData gData, RECharSet charSet) {
        synchronized (charSet) {
            if (charSet.converted) {
                return;
            }
            processCharSetImpl(gData, charSet);
            charSet.converted = true;
        }
    }

    /**
     * Allocates {@code charSet.bits} and fills it by walking the class body found in the pattern
     * source at {@code charSet.startIndex} ({@code charSet.strlength} characters).
     */
    private static void processCharSetImpl(REGlobalData gData, RECharSet charSet) {
        int pos = charSet.startIndex;
        final int limit = pos + charSet.strlength;

        charSet.bits = new byte[(charSet.length + 7) / 8];

        if (pos == limit) {
            return;
        }

        final char[] source = gData.regexp.source;
        final boolean foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;

        if (source[pos] != '^') {
            assert (charSet.sense);
        } else {
            assert (!charSet.sense);
            ++pos;
        }

        boolean pendingRange = false; // true right after "x-" has been read
        char rangeLow = 0;

        while (pos != limit) {
            char current;
            if (source[pos] != '\\') {
                current = source[pos++];
            } else {
                ++pos;
                char escaped = source[pos++];
                switch (escaped) {
                    case 'b':
                        current = 0x8;
                        break;
                    case 'f':
                        current = 0xC;
                        break;
                    case 'n':
                        current = 0xA;
                        break;
                    case 'r':
                        current = 0xD;
                        break;
                    case 't':
                        current = 0x9;
                        break;
                    case 'v':
                        current = 0xB;
                        break;
                    case 'c':
                        if ((pos < limit) && isControlLetter(source[pos])) {
                            current = (char) (source[pos++] & 0x1F);
                        } else {
                            // no control letter: re-read the 'c' next time and
                            // take the backslash literally
                            --pos;
                            current = '\\';
                        }
                        break;
                    case 'u':
                    case 'x':
                        {
                            // 'u' takes four hex digits, 'x' takes two
                            final int wanted = (escaped == 'u') ? 4 : 2;
                            int value = 0;
                            for (int k = 0; (k < wanted) && (pos < limit); k++) {
                                int digit = toASCIIHexDigit(source[pos++]);
                                if (digit < 0) {
                                    /* back off to accepting the original '\'
                                     * as a literal
                                     */
                                    pos -= (k + 1);
                                    value = '\\';
                                    break;
                                }
                                value = (value << 4) | digit;
                            }
                            current = (char) value;
                        }
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        {
                            /*
                             * This is a non-ECMA extension - decimal escapes (in this
                             * case, octal!) are supposed to be an error inside class
                             * ranges, but supported here for backwards compatibility.
                             */
                            int value = escaped - '0';
                            char next = source[pos];
                            if ('0' <= next && next <= '7') {
                                pos++;
                                value = 8 * value + (next - '0');
                                next = source[pos];
                                if ('0' <= next && next <= '7') {
                                    pos++;
                                    int wider = 8 * value + (next - '0');
                                    if (wider <= 0377) {
                                        value = wider;
                                    } else {
                                        // a third digit would exceed 0377: give it back
                                        pos--;
                                    }
                                }
                            }
                            current = (char) value;
                        }
                        break;
                    case 'd':
                    case 'D':
                    case 's':
                    case 'S':
                    case 'w':
                    case 'W':
                        // A class escape cannot be the upper end of a range, so a
                        // pending '-' is added as a literal character.
                        if (pendingRange) {
                            addCharacterToCharSet(charSet, '-');
                            pendingRange = false;
                        }
                        switch (escaped) {
                            case 'd':
                                addCharacterRangeToCharSet(charSet, '0', '9');
                                break;
                            case 'D':
                                addCharacterRangeToCharSet(charSet, (char) 0, (char) ('0' - 1));
                                addCharacterRangeToCharSet(
                                        charSet, (char) ('9' + 1), (char) (charSet.length - 1));
                                break;
                            case 's':
                                for (int k = charSet.length - 1; k >= 0; k--) {
                                    if (isREWhiteSpace(k)) {
                                        addCharacterToCharSet(charSet, (char) k);
                                    }
                                }
                                break;
                            case 'S':
                                for (int k = charSet.length - 1; k >= 0; k--) {
                                    if (!isREWhiteSpace(k)) {
                                        addCharacterToCharSet(charSet, (char) k);
                                    }
                                }
                                break;
                            case 'w':
                                for (int k = charSet.length - 1; k >= 0; k--) {
                                    if (isWord((char) k)) {
                                        addCharacterToCharSet(charSet, (char) k);
                                    }
                                }
                                break;
                            default: // 'W'
                                for (int k = charSet.length - 1; k >= 0; k--) {
                                    if (!isWord((char) k)) {
                                        addCharacterToCharSet(charSet, (char) k);
                                    }
                                }
                                break;
                        }
                        continue; /* don't need range processing */
                    default:
                        current = escaped;
                        break;
                }
            }

            if (pendingRange) {
                if (foldCase) {
                    assert (rangeLow <= current);
                    char ch = rangeLow;
                    while (ch <= current) {
                        addCharacterToCharSet(charSet, ch);
                        char upper = upcase(ch);
                        char lower = downcase(ch);
                        if (ch != upper) addCharacterToCharSet(charSet, upper);
                        if (ch != lower) addCharacterToCharSet(charSet, lower);
                        if (++ch == 0) break; // overflow
                    }
                } else {
                    addCharacterRangeToCharSet(charSet, rangeLow, current);
                }
                pendingRange = false;
                continue;
            }

            if (foldCase) {
                addCharacterToCharSet(charSet, upcase(current));
                addCharacterToCharSet(charSet, downcase(current));
            } else {
                addCharacterToCharSet(charSet, current);
            }
            // A '-' opens a range only if at least one more character follows it.
            if (pos < (limit - 1) && source[pos] == '-') {
                ++pos;
                pendingRange = true;
                rangeLow = current;
            }
        }
    }

    /*
     * Initialize the character set if this is the first call.
     * Test the bit - if the ^ flag was specified, non-inclusion is a success
     */
    private static boolean classMatcher(REGlobalData gData, RECharSet charSet, char ch) {
        if (!charSet.converted) {
            processCharSet(gData, charSet);
        }

        boolean absent =
                charSet.length == 0
                        || ch >= charSet.length
                        || (charSet.bits[ch >> 3] & (1 << (ch & 0x7))) == 0;
        return absent != charSet.sense;
    }

    /** Tells whether {@code op} lies in the REOP_SIMPLE_START..REOP_SIMPLE_END opcode range. */
    private static boolean reopIsSimple(int op) {
        return REOP_SIMPLE_START <= op && op <= REOP_SIMPLE_END;
    }

    /*
     * Apply the current op against the given input to see if
     * it's going to match or fail. Return false if we don't
     * get a match, true if we do and update the state of the
     * input and pc if the update flag is true.
*/ private static int simpleMatch( REGlobalData gData, String input, int op, byte[] program, int pc, int end, boolean updatecp) { boolean result = false; char matchCh; int parenIndex; int offset, length, index; int startcp = gData.cp; switch (op) { case REOP_EMPTY: result = true; break; case REOP_BOL: if (gData.cp != 0) { if (!gData.multiline || !isLineTerm(input.charAt(gData.cp - 1))) { break; } } result = true; break; case REOP_EOL: if (gData.cp != end) { if (!gData.multiline || !isLineTerm(input.charAt(gData.cp))) { break; } } result = true; break; case REOP_WBDRY: result = ((gData.cp == 0 || !isWord(input.charAt(gData.cp - 1))) ^ !((gData.cp < end) && isWord(input.charAt(gData.cp)))); break; case REOP_WNONBDRY: result = ((gData.cp == 0 || !isWord(input.charAt(gData.cp - 1))) ^ ((gData.cp < end) && isWord(input.charAt(gData.cp)))); break; case REOP_DOT: if (gData.cp != end && ((gData.regexp.flags & JSREG_DOTALL) != 0 || !isLineTerm(input.charAt(gData.cp)))) { result = true; gData.cp++; } break; case REOP_DIGIT: if (gData.cp != end && isDigit(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_NONDIGIT: if (gData.cp != end && !isDigit(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_ALNUM: if (gData.cp != end && isWord(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_NONALNUM: if (gData.cp != end && !isWord(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_SPACE: if (gData.cp != end && isREWhiteSpace(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_NONSPACE: if (gData.cp != end && !isREWhiteSpace(input.charAt(gData.cp))) { result = true; gData.cp++; } break; case REOP_BACKREF: { parenIndex = getIndex(program, pc); pc += INDEX_LEN; result = backrefMatcher(gData, parenIndex, input, end); } break; case REOP_FLAT: { offset = getIndex(program, pc); pc += INDEX_LEN; length = getIndex(program, pc); pc += INDEX_LEN; result = flatNMatcher(gData, offset, 
length, input, end); } break; case REOP_FLAT1: { matchCh = (char) (program[pc++] & 0xFF); if (gData.cp != end && input.charAt(gData.cp) == matchCh) { result = true; gData.cp++; } } break; case REOP_FLATi: { offset = getIndex(program, pc); pc += INDEX_LEN; length = getIndex(program, pc); pc += INDEX_LEN; result = flatNIMatcher(gData, offset, length, input, end); } break; case REOP_FLAT1i: { matchCh = (char) (program[pc++] & 0xFF); if (gData.cp != end) { char c = input.charAt(gData.cp); if (matchCh == c || upcase(matchCh) == upcase(c)) { result = true; gData.cp++; } } } break; case REOP_UCFLAT1: { matchCh = (char) getIndex(program, pc); pc += INDEX_LEN; if (gData.cp != end && input.charAt(gData.cp) == matchCh) { result = true; gData.cp++; } } break; case REOP_UCFLAT1i: { matchCh = (char) getIndex(program, pc); pc += INDEX_LEN; if (gData.cp != end) { char c = input.charAt(gData.cp); if (matchCh == c || upcase(matchCh) == upcase(c)) { result = true; gData.cp++; } } } break; case REOP_CLASS: case REOP_NCLASS: { index = getIndex(program, pc); pc += INDEX_LEN; if (gData.cp != end) { if (classMatcher( gData, gData.regexp.classList[index], input.charAt(gData.cp))) { gData.cp++; result = true; break; } } } break; default: throw Kit.codeBug(); } if (result) { if (!updatecp) gData.cp = startcp; return pc; } gData.cp = startcp; return -1; } private static boolean executeREBytecode( Context cx, REGlobalData gData, String input, int end) { int pc = 0; byte[] program = gData.regexp.program; int continuationOp = REOP_END; int continuationPc = 0; boolean result = false; int op = program[pc++]; /* * If the first node is a simple match, step the index into the string * until that match is made, or fail if it can't be found at all. 
*/ if (gData.regexp.anchorCh < 0 && reopIsSimple(op)) { boolean anchor = false; while (gData.cp <= end) { int match = simpleMatch(gData, input, op, program, pc, end, true); if (match >= 0) { anchor = true; pc = match; /* accept skip to next opcode */ op = program[pc++]; break; } gData.skipped++; gData.cp++; } if (!anchor) return false; } for (; ; ) { ScriptRuntime.addInstructionCount(cx, 5); if (reopIsSimple(op)) { int match = simpleMatch(gData, input, op, program, pc, end, true); result = match >= 0; if (result) pc = match; /* accept skip to next opcode */ } else { switchStatement: switch (op) { case REOP_ALTPREREQ: case REOP_ALTPREREQi: case REOP_ALTPREREQ2: { char matchCh1 = (char) getIndex(program, pc); pc += INDEX_LEN; char matchCh2 = (char) getIndex(program, pc); pc += INDEX_LEN; if (gData.cp == end) { result = false; break; } char c = input.charAt(gData.cp); if (op == REOP_ALTPREREQ2) { if (c != matchCh1 && !classMatcher( gData, gData.regexp.classList[matchCh2], c)) { result = false; break; } } else { if (op == REOP_ALTPREREQi) c = upcase(c); if (c != matchCh1 && c != matchCh2) { result = false; break; } } } /* else false thru... 
*/ // fall through case REOP_ALT: { int nextpc = pc + getOffset(program, pc); pc += INDEX_LEN; op = program[pc++]; int startcp = gData.cp; if (reopIsSimple(op)) { int match = simpleMatch(gData, input, op, program, pc, end, true); if (match < 0) { op = program[nextpc++]; pc = nextpc; continue; } result = true; pc = match; op = program[pc++]; } byte nextop = program[nextpc++]; pushBackTrackState( gData, nextop, nextpc, startcp, continuationOp, continuationPc); } continue; case REOP_JUMP: { int offset = getOffset(program, pc); pc += offset; op = program[pc++]; } continue; case REOP_LPAREN: { int parenIndex = getIndex(program, pc); pc += INDEX_LEN; gData.setParens(parenIndex, gData.cp, 0); op = program[pc++]; } continue; case REOP_RPAREN: { int parenIndex = getIndex(program, pc); pc += INDEX_LEN; int cap_index = gData.parensIndex(parenIndex); gData.setParens(parenIndex, cap_index, gData.cp - cap_index); op = program[pc++]; } continue; case REOP_ASSERT: { int nextpc = pc + getIndex(program, pc); /* start of term after ASSERT */ pc += INDEX_LEN; /* start of ASSERT child */ op = program[pc++]; if (reopIsSimple(op) && simpleMatch(gData, input, op, program, pc, end, false) < 0) { result = false; break; } pushProgState( gData, 0, 0, gData.cp, gData.backTrackStackTop, continuationOp, continuationPc); pushBackTrackState(gData, REOP_ASSERTTEST, nextpc); } continue; case REOP_ASSERT_NOT: { int nextpc = pc + getIndex(program, pc); /* start of term after ASSERT */ pc += INDEX_LEN; /* start of ASSERT child */ op = program[pc++]; if (reopIsSimple(op)) { int match = simpleMatch(gData, input, op, program, pc, end, false); if (match >= 0 && program[match] == REOP_ASSERTNOTTEST) { result = false; break; } } pushProgState( gData, 0, 0, gData.cp, gData.backTrackStackTop, continuationOp, continuationPc); pushBackTrackState(gData, REOP_ASSERTNOTTEST, nextpc); } continue; case REOP_ASSERTTEST: case REOP_ASSERTNOTTEST: { REProgState state = popProgState(gData); gData.cp = state.index; 
gData.backTrackStackTop = state.backTrack; continuationPc = state.continuationPc; continuationOp = state.continuationOp; if (op == REOP_ASSERTNOTTEST) { result = !result; } } break; case REOP_STAR: case REOP_PLUS: case REOP_OPT: case REOP_QUANT: case REOP_MINIMALSTAR: case REOP_MINIMALPLUS: case REOP_MINIMALOPT: case REOP_MINIMALQUANT: { int min, max; boolean greedy = false; switch (op) { case REOP_STAR: greedy = true; // fallthrough case REOP_MINIMALSTAR: min = 0; max = -1; break; case REOP_PLUS: greedy = true; // fallthrough case REOP_MINIMALPLUS: min = 1; max = -1; break; case REOP_OPT: greedy = true; // fallthrough case REOP_MINIMALOPT: min = 0; max = 1; break; case REOP_QUANT: greedy = true; // fallthrough case REOP_MINIMALQUANT: min = getOffset(program, pc); pc += INDEX_LEN; // See comments in emitREBytecode for " - 1" reason max = getOffset(program, pc) - 1; pc += INDEX_LEN; break; default: throw Kit.codeBug(); } pushProgState( gData, min, max, gData.cp, null, continuationOp, continuationPc); if (greedy) { pushBackTrackState(gData, REOP_REPEAT, pc); continuationOp = REOP_REPEAT; continuationPc = pc; /* Step over , & */ pc += 3 * INDEX_LEN; op = program[pc++]; } else { if (min != 0) { continuationOp = REOP_MINIMALREPEAT; continuationPc = pc; /* & */ pc += 3 * INDEX_LEN; op = program[pc++]; } else { pushBackTrackState(gData, REOP_MINIMALREPEAT, pc); popProgState(gData); pc += 2 * INDEX_LEN; // & pc = pc + getOffset(program, pc); op = program[pc++]; } } } continue; case REOP_ENDCHILD: /* marks the end of a quantifier child */ // If we have not gotten a result here, it is because of an // empty match. Do the same thing REOP_EMPTY would do. result = true; // Use the current continuation. pc = continuationPc; op = continuationOp; continue; case REOP_REPEAT: { int nextpc, nextop; do { REProgState state = popProgState(gData); if (!result) { // Failed, see if we have enough children. 
if (state.min == 0) result = true; continuationPc = state.continuationPc; continuationOp = state.continuationOp; pc += 2 * INDEX_LEN; /* & */ pc += getOffset(program, pc); break switchStatement; } if (state.min == 0 && (gData.cp == state.index || state.max == 0)) { // matched an empty string or an {0} quantifier, that'll get us // nowhere result = false; continuationPc = state.continuationPc; continuationOp = state.continuationOp; pc += 2 * INDEX_LEN; pc += getOffset(program, pc); break switchStatement; } int new_min = state.min, new_max = state.max; if (new_min != 0) new_min--; if (new_max != -1) new_max--; if (new_max == 0) { result = true; continuationPc = state.continuationPc; continuationOp = state.continuationOp; pc += 2 * INDEX_LEN; pc += getOffset(program, pc); break switchStatement; } nextpc = pc + 3 * INDEX_LEN; nextop = program[nextpc]; int startcp = gData.cp; if (reopIsSimple(nextop)) { nextpc++; int match = simpleMatch( gData, input, nextop, program, nextpc, end, true); if (match < 0) { result = (new_min == 0); continuationPc = state.continuationPc; continuationOp = state.continuationOp; pc += 2 * INDEX_LEN; /* & */ pc += getOffset(program, pc); break switchStatement; } result = true; nextpc = match; } continuationOp = REOP_REPEAT; continuationPc = pc; pushProgState( gData, new_min, new_max, startcp, null, state.continuationOp, state.continuationPc); if (new_min == 0) { pushBackTrackState( gData, REOP_REPEAT, pc, startcp, state.continuationOp, state.continuationPc); int parenCount = getIndex(program, pc); int parenIndex = getIndex(program, pc + INDEX_LEN); for (int k = 0; k < parenCount; k++) { gData.setParens(parenIndex + k, -1, 0); } } } while (program[nextpc] == REOP_ENDCHILD); pc = nextpc; op = program[pc++]; } continue; case REOP_MINIMALREPEAT: { REProgState state = popProgState(gData); if (!result) { // // Non-greedy failure - try to consume another child. 
// if (state.max == -1 || state.max > 0) { pushProgState( gData, state.min, state.max, gData.cp, null, state.continuationOp, state.continuationPc); continuationOp = REOP_MINIMALREPEAT; continuationPc = pc; int parenCount = getIndex(program, pc); pc += INDEX_LEN; int parenIndex = getIndex(program, pc); pc += 2 * INDEX_LEN; for (int k = 0; k < parenCount; k++) { gData.setParens(parenIndex + k, -1, 0); } op = program[pc++]; continue; } // Don't need to adjust pc since we're going to pop. continuationPc = state.continuationPc; continuationOp = state.continuationOp; break; } if (state.min == 0 && gData.cp == state.index) { // Matched an empty string, that'll get us nowhere. result = false; continuationPc = state.continuationPc; continuationOp = state.continuationOp; break; } int new_min = state.min, new_max = state.max; if (new_min != 0) new_min--; if (new_max != -1) new_max--; pushProgState( gData, new_min, new_max, gData.cp, null, state.continuationOp, state.continuationPc); if (new_min != 0) { continuationOp = REOP_MINIMALREPEAT; continuationPc = pc; int parenCount = getIndex(program, pc); pc += INDEX_LEN; int parenIndex = getIndex(program, pc); pc += 2 * INDEX_LEN; for (int k = 0; k < parenCount; k++) { gData.setParens(parenIndex + k, -1, 0); } op = program[pc++]; } else { continuationPc = state.continuationPc; continuationOp = state.continuationOp; pushBackTrackState(gData, REOP_MINIMALREPEAT, pc); popProgState(gData); pc += 2 * INDEX_LEN; pc = pc + getOffset(program, pc); op = program[pc++]; } continue; } case REOP_END: return true; default: throw Kit.codeBug("invalid bytecode"); } } /* * If the match failed and there's a backtrack option, take it. * Otherwise this is a complete and utter failure. 
 */
            if (!result) {
                REBackTrackData backTrackData = gData.backTrackStackTop;
                if (backTrackData != null) {
                    // Pop the newest backtrack record and restore everything it captured:
                    // captures, input position, state stack, continuation and the op/pc to resume.
                    gData.backTrackStackTop = backTrackData.previous;
                    gData.parens = backTrackData.parens;
                    gData.cp = backTrackData.cp;
                    gData.stateStackTop = backTrackData.stateStackTop;
                    continuationOp = backTrackData.continuationOp;
                    continuationPc = backTrackData.continuationPc;
                    pc = backTrackData.pc;
                    op = backTrackData.op;
                    continue;
                }
                // No backtrack record left: this attempt has failed.
                return false;
            }

            op = program[pc++];
        }
    }

    /**
     * Searches {@code input} for a match of {@code re}, trying candidate start positions from
     * {@code start} up to and including {@code end}.
     *
     * <p>Before each attempt every capture slot in {@code gData.parens} is reset to -1, {@code
     * gData.cp} is set to the candidate position and {@code gData.skipped} to its distance from
     * {@code start}; the program is then run through {@code executeREBytecode}. Both interpreter
     * stacks in {@code gData} are cleared before the first attempt and after every attempt.
     *
     * @param cx context handed through to {@code executeREBytecode}
     * @param gData per-match working state; (re)initialised here
     * @param re compiled regular expression to run
     * @param input text being searched
     * @param start first candidate position
     * @param end last candidate position (inclusive); {@code executeRegExp} passes the input length
     * @param multiline extra multiline switch, OR-ed with the {@code JSREG_MULTILINE} flag of
     *     {@code re}
     * @return {@code true} as soon as one attempt succeeds, {@code false} otherwise
     */
    private static boolean matchRegExp(
            Context cx,
            REGlobalData gData,
            RECompiled re,
            String input,
            int start,
            int end,
            boolean multiline) {
        if (re.parenCount != 0) {
            gData.parens = new long[re.parenCount];
        } else {
            gData.parens = null;
        }

        gData.backTrackStackTop = null;
        gData.stateStackTop = null;

        gData.multiline = multiline || (re.flags & JSREG_MULTILINE) != 0;
        gData.regexp = re;

        int anchorCh = gData.regexp.anchorCh;
        //
        // have to include the position beyond the last character
        //  in order to detect end-of-input/line condition
        //
        for (int i = start; i <= end; ++i) {
            //
            // If the first node is a literal match, step the index into
            // the string until that match is made, or fail if it can't be
            // found at all.
            //
            if (anchorCh >= 0) {
                for (; ; ) {
                    if (i == end) {
                        return false;
                    }
                    char matchCh = input.charAt(i);
                    // With the fold ('i') flag the comparison is done on upcase()d characters.
                    if (matchCh == anchorCh
                            || ((gData.regexp.flags & JSREG_FOLD) != 0
                                    && upcase(matchCh) == upcase((char) anchorCh))) {
                        break;
                    }
                    // Sticky: the literal must sit at the candidate position itself, no scanning.
                    if ((gData.regexp.flags & JSREG_STICKY) != 0) {
                        return false;
                    }
                    ++i;
                }
            }
            gData.cp = i;
            gData.skipped = i - start;
            for (int j = 0; j < re.parenCount; j++) {
                gData.parens[j] = -1L;
            }
            boolean result = executeREBytecode(cx, gData, input, end);

            gData.backTrackStackTop = null;
            gData.stateStackTop = null;
            if (result) {
                return true;
            }
            // NOTE(review): ANCHOR_BOL is declared outside this span; presumably it marks a
            // pattern anchored at beginning of line, so without multiline no later start
            // position is tried - confirm against the compiler.
            if (anchorCh == ANCHOR_BOL && !gData.multiline) {
                gData.skipped = end;
                return false;
            }
            // Sticky: only the first candidate position is ever attempted.
            if ((gData.regexp.flags & JSREG_STICKY) != 0) {
                return false;
            }
            // Resume from the position recorded in gData.skipped; the loop's ++i then advances.
            // NOTE(review): gData.skipped is re-read instead of reusing i - presumably the
            // interpreter may have changed it; confirm.
            i = start + gData.skipped;
        }
        return false;
    }

    /**
     * Runs this regexp against {@code str} and publishes the outcome to {@code res}.
     *
     * <p>The search starts at {@code indexp[0]} (clamped to {@code str.length()}). On success
     * {@code indexp[0]} is overwritten with the index just past the match, and {@code res.parens},
     * {@code res.lastParen}, {@code res.lastMatch}, {@code res.leftContext} and {@code
     * res.rightContext} are updated.
     *
     * <p>indexp is assumed to be an array of length 1
     *
     * @param cx current context; used to allocate the result array and to read the language
     *     version
     * @param scope scope passed to {@code cx.newArray}
     * @param res shared regexp state that receives the last-match information
     * @param str input string
     * @param indexp in: start index; out (on success only): end index of the match
     * @param matchType one of {@code TEST}, {@code MATCH}, {@code PREFIX}
     * @return on failure {@code null}, or {@code Undefined.instance} when {@code matchType} is
     *     {@code PREFIX}; on success {@code Boolean.TRUE} for {@code TEST}, otherwise the newly
     *     created match array
     */
    Object executeRegExp(
            Context cx, Scriptable scope, RegExpImpl res, String str, int[] indexp, int matchType) {
        REGlobalData gData = new REGlobalData();

        int start = indexp[0];
        int end = str.length();
        if (start > end) start = end;
        //
        // Call the recursive matcher to do the real work.
        //
        boolean matches = matchRegExp(cx, gData, re, str, start, end, res.multiline);
        if (!matches) {
            if (matchType != PREFIX) return null;
            return Undefined.instance;
        }
        // gData.cp is where the matcher stopped, i.e. the end of the match; the match started
        // at start + gData.skipped. After the subtraction below, index is the match start.
        int index = gData.cp;
        int ep = indexp[0] = index;
        int matchlen = ep - (start + gData.skipped);
        index -= matchlen;
        Object result;
        Scriptable obj;

        if (matchType == TEST) {
            /*
             * Testing for a match and updating cx.regExpImpl: don't allocate
             * an array object, do return true.
             */
            result = Boolean.TRUE;
            obj = null;
        } else {
            /*
             * The array returned on match has element 0 bound to the matched
             * string, elements 1 through re.parenCount bound to the paren
             * matches, an index property telling the length of the left context,
             * and an input property referring to the input string.
             */
            result = cx.newArray(scope, 0);
            obj = (Scriptable) result;

            String matchstr = str.substring(index, index + matchlen);
            obj.put(0, obj, matchstr);
        }

        if (re.parenCount == 0) {
            res.parens = null;
            res.lastParen = new SubString();
        } else {
            SubString parsub = null;
            int num;
            res.parens = new SubString[re.parenCount];
            for (num = 0; num < re.parenCount; num++) {
                int cap_index = gData.parensIndex(num);
                if (cap_index != -1) {
                    int cap_length = gData.parensLength(num);
                    parsub = new SubString(str, cap_index, cap_length);
                    res.parens[num] = parsub;
                    if (matchType != TEST) obj.put(num + 1, obj, parsub.toString());
                } else {
                    // Group did not participate: res.parens[num] stays null and the array
                    // element (when an array is built) is undefined.
                    if (matchType != TEST) obj.put(num + 1, obj, Undefined.instance);
                }
            }
            // parsub now refers to the highest-numbered group that captured, or null if none.
            res.lastParen = parsub;
        }

        if (!(matchType == TEST)) {
            /*
             * Define the index and input properties last for better for/in loop
             * order (so they come after the elements).
             */
            obj.put("index", obj, Integer.valueOf(start + gData.skipped));
            obj.put("input", obj, str);
        }

        if (res.lastMatch == null) {
            res.lastMatch = new SubString();
            res.leftContext = new SubString();
            res.rightContext = new SubString();
        }
        res.lastMatch.str = str;
        res.lastMatch.index = index;
        res.lastMatch.length = matchlen;

        res.leftContext.str = str;
        if (cx.getLanguageVersion() == Context.VERSION_1_2) {
            /*
             * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
             * in scalar contexts, and unintentionally for the string.match "list"
             * pseudo-context.  On "hi there bye", the following would result:
             *
             * Language     while(/ /g){print("$`");}   s/ /$`/g
             * perl4.036    "hi", "there"               "hihitherehi therebye"
             * perl5        "hi", "hi there"            "hihitherehi therebye"
             * js1.2        "hi", "there"               "hihitheretherebye"
             *
             * Insofar as JS1.2 always defined $` as "left context from the last
             * match" for global regexps, it was more consistent than perl4.
             */
            res.leftContext.index = start;
            res.leftContext.length = gData.skipped;
        } else {
            /*
             * For JS1.3 and ECMAv2, emulate Perl5 exactly:
             *
             * js1.3        "hi", "hi there"            "hihitherehi therebye"
             */
            res.leftContext.index = 0;
            res.leftContext.length = start + gData.skipped;
        }

        res.rightContext.str = str;
        res.rightContext.index = ep;
        res.rightContext.length = end - ep;

        return result;
    }

    /** Returns the flag bits ({@code JSREG_*}) of the compiled regexp. */
    int getFlags() {
        return re.flags;
    }

    /**
     * Reports the message identified by {@code messageId} as a warning, but only when {@code cx}
     * has {@code Context.FEATURE_STRICT_MODE} enabled; otherwise does nothing.
     */
    private static void reportWarning(Context cx, String messageId, String arg) {
        if (cx.hasFeature(Context.FEATURE_STRICT_MODE)) {
            String msg = ScriptRuntime.getMessageById(messageId, arg);
            Context.reportWarning(msg);
        }
    }

    /**
     * Always throws: builds the message for {@code messageId}/{@code arg} and throws the result
     * of {@code ScriptRuntime.constructError("SyntaxError", msg)}.
     */
    private static void reportError(String messageId, String arg) {
        String msg = ScriptRuntime.getMessageById(messageId, arg);
        throw ScriptRuntime.constructError("SyntaxError", msg);
    }

    // Ids of the instance properties handled directly by this class.
    private static final int Id_lastIndex = 1,
            Id_source = 2,
            Id_flags = 3,
            Id_global = 4,
            Id_ignoreCase = 5,
            Id_multiline = 6,
            Id_dotAll = 7,
            Id_sticky = 8,
            MAX_INSTANCE_ID = 8;

    /** Returns the highest instance-property id used by this class. */
    @Override
    protected int getMaxInstanceId() {
        return MAX_INSTANCE_ID;
    }

    /**
     * Maps an instance property name to its id and attributes. Unknown names are delegated to
     * the superclass. {@code lastIndex} uses the current {@code lastIndexAttr}; every other
     * property handled here is {@code PERMANENT | READONLY | DONTENUM}.
     */
    @Override
    protected int findInstanceIdInfo(String s) {
        int id;
        switch (s) {
            case "lastIndex":
                id = Id_lastIndex;
                break;
            case "source":
                id = Id_source;
                break;
            case "flags":
                id = Id_flags;
                break;
            case "global":
                id = Id_global;
                break;
            case "ignoreCase":
                id = Id_ignoreCase;
                break;
            case "multiline":
                id = Id_multiline;
                break;
            case "dotAll":
                id = Id_dotAll;
                break;
            case "sticky":
                id = Id_sticky;
                break;
            default:
                id = 0;
                break;
        }

        if (id == 0) return super.findInstanceIdInfo(s);

        int attr;
        switch (id) {
            case Id_lastIndex:
                attr = lastIndexAttr;
                break;
            case Id_source:
            case Id_flags:
            case Id_global:
            case Id_ignoreCase:
            case Id_multiline:
            case Id_dotAll:
            case Id_sticky:
                attr = PERMANENT | READONLY | DONTENUM;
                break;
            default:
                throw new IllegalStateException();
        }
        return instanceIdInfo(attr, id);
    }

    /** Inverse of the name lookup: returns the property name for an instance id. */
    @Override
    protected String getInstanceIdName(int id) {
        switch (id) {
            case Id_lastIndex:
                return "lastIndex";
            case Id_source:
                return "source";
            case Id_flags:
                return "flags";
            case Id_global:
                return "global";
            case Id_ignoreCase:
                return "ignoreCase";
            case Id_multiline:
                return "multiline";
            case Id_dotAll:
                return "dotAll";
            case Id_sticky:
                return "sticky";
        }
        return super.getInstanceIdName(id);
    }

    /**
     * Returns the current value of an instance property: the stored {@code lastIndex}, the
     * source text, the flags string produced by {@code appendFlags}, or a boolean derived from
     * the corresponding bit in {@code re.flags}.
     */
    @Override
    protected Object getInstanceIdValue(int id) {
        switch (id) {
            case Id_lastIndex:
                return lastIndex;
            case Id_source:
                // re.source is a char[]; a new String is built on every read.
                return new String(re.source);
            case Id_flags:
                {
                    StringBuilder buf = new StringBuilder();
                    appendFlags(buf);
                    return buf.toString();
                }
            case Id_global:
                return ScriptRuntime.wrapBoolean((re.flags & JSREG_GLOB) != 0);
            case Id_ignoreCase:
                return ScriptRuntime.wrapBoolean((re.flags & JSREG_FOLD) != 0);
            case Id_multiline:
                return ScriptRuntime.wrapBoolean((re.flags & JSREG_MULTILINE) != 0);
            case Id_dotAll:
                return ScriptRuntime.wrapBoolean((re.flags & JSREG_DOTALL) != 0);
            case Id_sticky:
                return ScriptRuntime.wrapBoolean((re.flags & JSREG_STICKY) != 0);
        }
        return super.getInstanceIdValue(id);
    }

    /**
     * Stores a new {@code lastIndex} value, throwing the {@code msg.modify.readonly} type error
     * when the property's attributes currently include {@code READONLY}.
     */
    private void setLastIndex(Object value) {
        if ((lastIndexAttr & READONLY) != 0) {
            throw ScriptRuntime.typeErrorById("msg.modify.readonly", "lastIndex");
        }
        lastIndex = value;
    }

    /**
     * Writes an instance property. Only {@code lastIndex} is actually stored; writes to the
     * other ids handled by this class are silently ignored, and anything else goes to the
     * superclass.
     */
    @Override
    protected void setInstanceIdValue(int id, Object value) {
        switch (id) {
            case Id_lastIndex:
                setLastIndex(value);
                return;
            case Id_source:
            case Id_flags:
            case Id_global:
            case Id_ignoreCase:
            case Id_multiline:
            case Id_dotAll:
            case Id_sticky:
                return;
        }
        super.setInstanceIdValue(id, value);
    }

    /**
     * Changes property attributes. For {@code lastIndex} the new attributes are kept in {@code
     * lastIndexAttr}; every other id is delegated to the superclass.
     */
    @Override
    protected void setInstanceIdAttributes(int id, int attr) {
        switch (id) {
            case Id_lastIndex:
                lastIndexAttr = attr;
                return;
        }
        super.setInstanceIdAttributes(id, attr);
    }

    /**
     * Initialises the prototype method for {@code id} via {@code initPrototypeMethod}, tagged
     * with {@code REGEXP_TAG}. The two symbol-keyed methods are handled first; the rest are
     * registered by name and arity.
     *
     * @throws IllegalArgumentException if {@code id} is not a known prototype id
     */
    @Override
    protected void initPrototypeId(int id) {
        if (id == SymbolId_match) {
            initPrototypeMethod(REGEXP_TAG, id, SymbolKey.MATCH, "[Symbol.match]", 1);
            return;
        }
        if (id == SymbolId_search) {
            initPrototypeMethod(REGEXP_TAG, id, SymbolKey.SEARCH, "[Symbol.search]", 1);
            return;
        }

        String s;
        int arity;
        switch (id) {
            case Id_compile:
                arity = 2;
                s = "compile";
                break;
            case Id_toString:
                arity = 0;
                s = "toString";
                break;
            case Id_toSource:
                arity = 0;
                s = "toSource";
                break;
            case Id_exec:
                arity = 1;
                s = "exec";
                break;
            case Id_test:
                arity = 1;
                s = "test";
                break;
            case Id_prefix:
                arity = 1;
                s = "prefix";
                break;
            default:
                throw new IllegalArgumentException(String.valueOf(id));
        }
        initPrototypeMethod(REGEXP_TAG, id, s, arity);
    }

    /**
     * Dispatches a call to one of the prototype methods registered with {@code REGEXP_TAG},
     * selected by {@code f.methodId()}. Functions carrying a different tag are forwarded to the
     * superclass.
     *
     * @throws IllegalArgumentException if the method id is not handled here
     */
    @Override
    public Object execIdCall(
            IdFunctionObject f, Context cx, Scriptable scope, Scriptable thisObj, Object[] args) {
        if (!f.hasTag(REGEXP_TAG)) {
            return super.execIdCall(f, cx, scope, thisObj, args);
        }
        int id = f.methodId();
        switch (id) {
            case Id_compile:
                return realThis(thisObj, f).compile(cx, scope, args);

            case Id_toString:
                // thisObj != scope is a strange hack but i had no better idea for the moment
                if (thisObj != scope && thisObj instanceof NativeObject) {
                    // Plain-object receiver: build "/source/flags" from its own "source" and
                    // "flags" properties, using "undefined" for a missing one.
                    Object sourceObj = thisObj.get("source", thisObj);
                    String source =
                            sourceObj.equals(NOT_FOUND) ? "undefined" : escapeRegExp(sourceObj);
                    Object flagsObj = thisObj.get("flags", thisObj);
                    String flags = flagsObj.equals(NOT_FOUND) ? "undefined" : flagsObj.toString();

                    return "/" + source + "/" + flags;
                }
                return realThis(thisObj, f).toString();

            case Id_toSource:
                return realThis(thisObj, f).toString();

            case Id_exec:
                return realThis(thisObj, f).execSub(cx, scope, args, MATCH);

            case Id_test:
                {
                    // Anything other than Boolean.TRUE from execSub counts as "no match".
                    Object x = realThis(thisObj, f).execSub(cx, scope, args, TEST);
                    return Boolean.TRUE.equals(x) ? Boolean.TRUE : Boolean.FALSE;
                }

            case Id_prefix:
                return realThis(thisObj, f).execSub(cx, scope, args, PREFIX);

            case SymbolId_match:
                return realThis(thisObj, f).execSub(cx, scope, args, MATCH);

            case SymbolId_search:
                // Run a MATCH and report the "index" property of the result, or -1 on no match.
                Scriptable scriptable =
                        (Scriptable) realThis(thisObj, f).execSub(cx, scope, args, MATCH);
                return scriptable == null ? -1 : scriptable.get("index", scriptable);
        }
        throw new IllegalArgumentException(String.valueOf(id));
    }

    /** Type-checks {@code thisObj} as a {@code NativeRegExp} via {@code ensureType}. */
    private static NativeRegExp realThis(Scriptable thisObj, IdFunctionObject f) {
        return ensureType(thisObj, NativeRegExp.class, f);
    }

    /** Maps a symbol key to its prototype id; returns 0 for symbols not handled here. */
    @Override
    protected int findPrototypeId(Symbol k) {
        if (SymbolKey.MATCH.equals(k)) {
            return SymbolId_match;
        }
        if (SymbolKey.SEARCH.equals(k)) {
            return SymbolId_search;
        }
        return 0;
    }

    /** Maps a prototype method name to its id; returns 0 for names not handled here. */
    @Override
    protected int findPrototypeId(String s) {
        int id;
        switch (s) {
            case "compile":
                id = Id_compile;
                break;
            case "toString":
                id = Id_toString;
                break;
            case "toSource":
                id = Id_toSource;
                break;
            case "exec":
                id = Id_exec;
                break;
            case "test":
                id = Id_test;
                break;
            case "prefix":
                id = Id_prefix;
                break;
            default:
                id = 0;
                break;
        }
        return id;
    }

    // Ids of the prototype methods; the two Symbol-keyed ones come last.
    private static final int Id_compile = 1,
            Id_toString = 2,
            Id_toSource = 3,
            Id_exec = 4,
            Id_test = 5,
            Id_prefix = 6,
            SymbolId_match = 7,
            SymbolId_search = 8,
            MAX_PROTOTYPE_ID = SymbolId_search;

    private RECompiled re;
    Object lastIndex = ScriptRuntime.zeroObj; /* index after last match, for //g iterator */
    private int lastIndexAttr = DONTENUM | PERMANENT;
} // class NativeRegExp

/** Compiled form of a regular expression: its source text, flags and bytecode. */
class RECompiled implements Serializable {
    private static final long serialVersionUID = -6144956577595844213L;

    final char[] source; /* locked source string, sans // */
    int parenCount; /* number of parenthesized submatches */
    int flags; /* flags */
    byte[] program; /* regular expression bytecode */
    int classCount; /* count [...] bitmaps */
    RECharSet[] classList; /* list of [...] bitmaps */
    int anchorCh = -1; /* if >= 0, then re starts with this literal char */

    RECompiled(String str) {
        this.source = str.toCharArray();
    }
}

/**
 * Node of the regexp parse tree. Which group of fields is meaningful depends on {@code op}, as
 * the "or a ..." comments below indicate.
 */
class RENode {

    RENode(byte op) {
        this.op = op;
    }

    byte op; /* r.e. op bytecode */
    RENode next; /* next in concatenation order */
    RENode kid; /* first operand */

    RENode kid2; /* second operand */
    int parenIndex; /* or a parenthesis index */

    /* or a range */
    int min;
    int max;
    int parenCount;
    boolean greedy;

    /* or a character class */
    int startIndex;
    int kidlen; /* length of string at kid, in chars */
    int bmsize; /* bitmap size, based on max char code */
    int index; /* index into class list */
    boolean sense;

    /* or a literal sequence */
    char chr; /* of one character */
    int length; /* or many (via the index) */
    int flatIndex; /* which is -1 if not sourced */
}

/** Mutable state carried while a regexp source is being parsed and compiled. */
class CompilerState {

    CompilerState(Context cx, char[] source, int length, int flags) {
        this.cx = cx;
        this.cpbegin = source;
        this.cp = 0;
        this.cpend = length;
        this.flags = flags;
        this.backReferenceLimit = Integer.MAX_VALUE;
        this.maxBackReference = 0;
        this.parenCount = 0;
        this.classCount = 0;
        this.progLength = 0;
    }

    Context cx;
    char[] cpbegin; // source characters, as passed to the constructor
    int cpend; // value of the constructor's length argument
    int cp; // starts at 0
    int flags;
    int backReferenceLimit;
    int maxBackReference;
    int parenCount;
    int parenNesting;
    int classCount; /* number of [] encountered */
    int progLength; /* estimated bytecode length */
    RENode result;
}

/** Immutable entry of the interpreter's state stack (linked through {@code previous}). */
class REProgState {
    REProgState(
            REProgState previous,
            int min,
            int max,
            int index,
            REBackTrackData backTrack,
            int continuationOp,
            int continuationPc) {
        this.previous = previous;
        this.min = min;
        this.max = max;
        this.index = index;
        this.continuationOp = continuationOp;
        this.continuationPc = continuationPc;
        this.backTrack = backTrack;
    }

    final REProgState previous; // previous state in stack

    final int min; /* current quantifier min */
    final int max; /* current quantifier max */
    final int index; /* progress in text */
    final int continuationOp;
    final int continuationPc;
    final REBackTrackData backTrack; // used by ASSERT_  to recover state
}

/**
 * Immutable backtrack record. The constructor links the new record to the current top of {@code
 * gData}'s backtrack stack and snapshots {@code gData.stateStackTop}. The {@code parens} array
 * is captured by reference, not copied; {@code REGlobalData.setParens} clones the live array
 * before writing while it is still shared with the top record.
 */
class REBackTrackData {

    REBackTrackData(
            REGlobalData gData, int op, int pc, int cp, int continuationOp, int continuationPc) {
        previous = gData.backTrackStackTop;
        this.op = op;
        this.pc = pc;
        this.cp = cp;
        this.continuationOp = continuationOp;
        this.continuationPc = continuationPc;
        parens = gData.parens;
        stateStackTop = gData.stateStackTop;
    }

    final REBackTrackData previous;

    final int op; /* operator */
    final int pc; /* bytecode pointer */
    final int cp; /* char buffer index */
    final int continuationOp; /* continuation op */
    final int continuationPc; /* continuation pc */
    final long[] parens; /* parenthesis captures */
    final REProgState stateStackTop; /* state of op that backtracked */
}

/** Working state of one match attempt, shared between the matcher entry points. */
class REGlobalData {
    boolean multiline;
    RECompiled regexp; /* the RE in execution */
    int skipped; /* chars skipped anchoring this r.e. */

    int cp; /* char buffer index */
    long[] parens; /* parens captures */

    REProgState stateStackTop; /* stack of state of current ancestors */

    REBackTrackData backTrackStackTop; /* last matched-so-far position */

    /** Get start of parenthesis capture contents, -1 for empty. */
    int parensIndex(int i) {
        // The start index lives in the low 32 bits of the packed slot.
        return (int) (parens[i]);
    }

    /** Get length of parenthesis capture contents. */
    int parensLength(int i) {
        // The length lives in the high 32 bits of the packed slot.
        return (int) (parens[i] >>> 32);
    }

    /**
     * Stores capture {@code i} as a single long: {@code index} in the low 32 bits and {@code
     * length} in the high 32 bits.
     */
    void setParens(int i, int index, int length) {
        // clone parens array if it is shared with backtrack state
        if (backTrackStackTop != null && backTrackStackTop.parens == parens) {
            parens = parens.clone();
        }
        parens[i] = (index & 0xffffffffL) | ((long) length << 32);
    }
}

/*
 * This struct holds a bitmap representation of a class from a regexp.
 * There's a list of these referenced by the classList field in the NativeRegExp
 * struct below. The initial state has startIndex set to the offset in the
 * original regexp source of the beginning of the class contents. The first
 * use of the class converts the source representation into a bitmap.
* */ final class RECharSet implements Serializable { private static final long serialVersionUID = 7931787979395898394L; RECharSet(int length, int startIndex, int strlength, boolean sense) { this.length = length; this.startIndex = startIndex; this.strlength = strlength; this.sense = sense; } final int length; final int startIndex; final int strlength; final boolean sense; transient volatile boolean converted; transient volatile byte[] bits; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy