org.mozilla.javascript.regexp.RegExpImpl Maven / Gradle / Ivy
The newest version!
/* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
package org.mozilla.javascript.regexp;
import org.mozilla.javascript.*;
/**
*
*/
public class RegExpImpl implements RegExpProxy {
public boolean isRegExp(Scriptable obj) {
return obj instanceof NativeRegExp;
}
public Object compileRegExp(Context cx, String source, String flags)
{
return NativeRegExp.compileRE(cx, source, flags, false);
}
public Scriptable wrapRegExp(Context cx, Scriptable scope,
Object compiled)
{
return new NativeRegExp(scope, (RECompiled) compiled);
}
public Object action(Context cx, Scriptable scope,
Scriptable thisObj, Object[] args,
int actionType)
{
GlobData data = new GlobData();
data.mode = actionType;
data.str = ScriptRuntime.toString(thisObj);
switch (actionType) {
case RA_MATCH:
{
NativeRegExp re = createRegExp(cx, scope, args, 1, false);
Object rval = matchOrReplace(cx, scope, thisObj, args,
this, data, re);
return data.arrayobj == null ? rval : data.arrayobj;
}
case RA_SEARCH:
{
NativeRegExp re = createRegExp(cx, scope, args, 1, false);
return matchOrReplace(cx, scope, thisObj, args,
this, data, re);
}
case RA_REPLACE:
{
boolean useRE = (args.length > 0 && args[0] instanceof NativeRegExp)
|| args.length > 2;
NativeRegExp re = null;
String search = null;
if (useRE) {
re = createRegExp(cx, scope, args, 2, true);
} else {
Object arg0 = args.length < 1 ? Undefined.instance : args[0];
search = ScriptRuntime.toString(arg0);
}
Object arg1 = args.length < 2 ? Undefined.instance : args[1];
String repstr = null;
Function lambda = null;
if (arg1 instanceof Function) {
lambda = (Function) arg1;
} else {
repstr = ScriptRuntime.toString(arg1);
}
data.lambda = lambda;
data.repstr = repstr;
data.dollar = repstr == null ? -1 : repstr.indexOf('$');
data.charBuf = null;
data.leftIndex = 0;
Object val;
if (useRE) {
val = matchOrReplace(cx, scope, thisObj, args,
this, data, re);
} else {
String str = data.str;
int index = str.indexOf(search);
if (index >= 0) {
int slen = search.length();
this.lastParen = null;
this.leftContext = new SubString(str, 0, index);
this.lastMatch = new SubString(str, index, slen);
this.rightContext = new SubString(str, index + slen, str.length() - index - slen);
val = Boolean.TRUE;
} else {
val = Boolean.FALSE;
}
}
if (data.charBuf == null) {
if (data.global || val == null
|| !val.equals(Boolean.TRUE))
{
/* Didn't match even once. */
return data.str;
}
SubString lc = this.leftContext;
replace_glob(data, cx, scope, this, lc.index, lc.length);
}
SubString rc = this.rightContext;
data.charBuf.append(rc.str, rc.index, rc.index + rc.length);
return data.charBuf.toString();
}
default:
throw Kit.codeBug();
}
}
private static NativeRegExp createRegExp(Context cx, Scriptable scope,
Object[] args, int optarg,
boolean forceFlat)
{
NativeRegExp re;
Scriptable topScope = ScriptableObject.getTopLevelScope(scope);
if (args.length == 0 || args[0] == Undefined.instance) {
RECompiled compiled = NativeRegExp.compileRE(cx, "", "", false);
re = new NativeRegExp(topScope, compiled);
} else if (args[0] instanceof NativeRegExp) {
re = (NativeRegExp) args[0];
} else {
String src = ScriptRuntime.toString(args[0]);
String opt;
if (optarg < args.length) {
args[0] = src;
opt = ScriptRuntime.toString(args[optarg]);
} else {
opt = null;
}
RECompiled compiled = NativeRegExp.compileRE(cx, src, opt, forceFlat);
re = new NativeRegExp(topScope, compiled);
}
return re;
}
/**
* Analog of C match_or_replace.
*/
private static Object matchOrReplace(Context cx, Scriptable scope,
Scriptable thisObj, Object[] args,
RegExpImpl reImpl,
GlobData data, NativeRegExp re)
{
String str = data.str;
data.global = (re.getFlags() & NativeRegExp.JSREG_GLOB) != 0;
int[] indexp = { 0 };
Object result = null;
if (data.mode == RA_SEARCH) {
result = re.executeRegExp(cx, scope, reImpl,
str, indexp, NativeRegExp.TEST);
if (result != null && result.equals(Boolean.TRUE))
result = Integer.valueOf(reImpl.leftContext.length);
else
result = Integer.valueOf(-1);
} else if (data.global) {
re.lastIndex = 0d;
for (int count = 0; indexp[0] <= str.length(); count++) {
result = re.executeRegExp(cx, scope, reImpl,
str, indexp, NativeRegExp.TEST);
if (result == null || !result.equals(Boolean.TRUE))
break;
if (data.mode == RA_MATCH) {
match_glob(data, cx, scope, count, reImpl);
} else {
if (data.mode != RA_REPLACE) Kit.codeBug();
SubString lastMatch = reImpl.lastMatch;
int leftIndex = data.leftIndex;
int leftlen = lastMatch.index - leftIndex;
data.leftIndex = lastMatch.index + lastMatch.length;
replace_glob(data, cx, scope, reImpl, leftIndex, leftlen);
}
if (reImpl.lastMatch.length == 0) {
if (indexp[0] == str.length())
break;
indexp[0]++;
}
}
} else {
result = re.executeRegExp(cx, scope, reImpl, str, indexp,
((data.mode == RA_REPLACE)
? NativeRegExp.TEST
: NativeRegExp.MATCH));
}
return result;
}
public int find_split(Context cx, Scriptable scope, String target,
String separator, Scriptable reObj,
int[] ip, int[] matchlen,
boolean[] matched, String[][] parensp)
{
int i = ip[0];
int length = target.length();
int result;
int version = cx.getLanguageVersion();
NativeRegExp re = (NativeRegExp) reObj;
again:
while (true) { // imitating C label
/* JS1.2 deviated from Perl by never matching at end of string. */
int ipsave = ip[0]; // reuse ip to save object creation
ip[0] = i;
Object ret = re.executeRegExp(cx, scope, this, target, ip,
NativeRegExp.TEST);
if (ret != Boolean.TRUE) {
// Mismatch: ensure our caller advances i past end of string.
ip[0] = ipsave;
matchlen[0] = 1;
matched[0] = false;
return length;
}
i = ip[0];
ip[0] = ipsave;
matched[0] = true;
SubString sep = this.lastMatch;
matchlen[0] = sep.length;
if (matchlen[0] == 0) {
/*
* Empty string match: never split on an empty
* match at the start of a find_split cycle. Same
* rule as for an empty global match in
* match_or_replace.
*/
if (i == ip[0]) {
/*
* "Bump-along" to avoid sticking at an empty
* match, but don't bump past end of string --
* our caller must do that by adding
* sep->length to our return value.
*/
if (i == length) {
if (version == Context.VERSION_1_2) {
matchlen[0] = 1;
result = i;
}
else
result = -1;
break;
}
i++;
continue again; // imitating C goto
}
}
// PR_ASSERT((size_t)i >= sep->length);
result = i - matchlen[0];
break;
}
int size = (parens == null) ? 0 : parens.length;
parensp[0] = new String[size];
for (int num = 0; num < size; num++) {
SubString parsub = getParenSubString(num);
parensp[0][num] = parsub.toString();
}
return result;
}
/**
* Analog of REGEXP_PAREN_SUBSTRING in C jsregexp.h.
* Assumes zero-based; i.e., for $3, i==2
*/
SubString getParenSubString(int i)
{
if (parens != null && i < parens.length) {
SubString parsub = parens[i];
if (parsub != null) {
return parsub;
}
}
return SubString.emptySubString;
}
/*
* Analog of match_glob() in jsstr.c
*/
private static void match_glob(GlobData mdata, Context cx,
Scriptable scope, int count,
RegExpImpl reImpl)
{
if (mdata.arrayobj == null) {
mdata.arrayobj = cx.newArray(scope, 0);
}
SubString matchsub = reImpl.lastMatch;
String matchstr = matchsub.toString();
mdata.arrayobj.put(count, mdata.arrayobj, matchstr);
}
/*
* Analog of replace_glob() in jsstr.c
*/
private static void replace_glob(GlobData rdata, Context cx,
Scriptable scope, RegExpImpl reImpl,
int leftIndex, int leftlen)
{
int replen;
String lambdaStr;
if (rdata.lambda != null) {
// invoke lambda function with args lastMatch, $1, $2, ... $n,
// leftContext.length, whole string.
SubString[] parens = reImpl.parens;
int parenCount = (parens == null) ? 0 : parens.length;
Object[] args = new Object[parenCount + 3];
args[0] = reImpl.lastMatch.toString();
for (int i=0; i < parenCount; i++) {
SubString sub = parens[i];
if (sub != null) {
args[i+1] = sub.toString();
} else {
args[i+1] = Undefined.instance;
}
}
args[parenCount+1] = Integer.valueOf(reImpl.leftContext.length);
args[parenCount+2] = rdata.str;
// This is a hack to prevent expose of reImpl data to
// JS function which can run new regexps modifing
// regexp that are used later by the engine.
// TODO: redesign is necessary
if (reImpl != ScriptRuntime.getRegExpProxy(cx)) Kit.codeBug();
RegExpImpl re2 = new RegExpImpl();
re2.multiline = reImpl.multiline;
re2.input = reImpl.input;
ScriptRuntime.setRegExpProxy(cx, re2);
try {
Scriptable parent = ScriptableObject.getTopLevelScope(scope);
Object result = rdata.lambda.call(cx, parent, parent, args);
lambdaStr = ScriptRuntime.toString(result);
} finally {
ScriptRuntime.setRegExpProxy(cx, reImpl);
}
replen = lambdaStr.length();
} else {
lambdaStr = null;
replen = rdata.repstr.length();
if (rdata.dollar >= 0) {
int[] skip = new int[1];
int dp = rdata.dollar;
do {
SubString sub = interpretDollar(cx, reImpl, rdata.repstr,
dp, skip);
if (sub != null) {
replen += sub.length - skip[0];
dp += skip[0];
} else {
++dp;
}
dp = rdata.repstr.indexOf('$', dp);
} while (dp >= 0);
}
}
int growth = leftlen + replen + reImpl.rightContext.length;
StringBuilder charBuf = rdata.charBuf;
if (charBuf == null) {
charBuf = new StringBuilder(growth);
rdata.charBuf = charBuf;
} else {
charBuf.ensureCapacity(rdata.charBuf.length() + growth);
}
charBuf.append(reImpl.leftContext.str, leftIndex, leftIndex + leftlen);
if (rdata.lambda != null) {
charBuf.append(lambdaStr);
} else {
do_replace(rdata, cx, reImpl);
}
}
private static SubString interpretDollar(Context cx, RegExpImpl res,
String da, int dp, int[] skip)
{
char dc;
int num, tmp;
if (da.charAt(dp) != '$') Kit.codeBug();
/* Allow a real backslash (literal "\\") to escape "$1" etc. */
int version = cx.getLanguageVersion();
if (version != Context.VERSION_DEFAULT
&& version <= Context.VERSION_1_4)
{
if (dp > 0 && da.charAt(dp - 1) == '\\')
return null;
}
int daL = da.length();
if (dp + 1 >= daL)
return null;
/* Interpret all Perl match-induced dollar variables. */
dc = da.charAt(dp + 1);
if (NativeRegExp.isDigit(dc)) {
int cp;
if (version != Context.VERSION_DEFAULT
&& version <= Context.VERSION_1_4)
{
if (dc == '0')
return null;
/* Check for overflow to avoid gobbling arbitrary decimal digits. */
num = 0;
cp = dp;
while (++cp < daL && NativeRegExp.isDigit(dc = da.charAt(cp)))
{
tmp = 10 * num + (dc - '0');
if (tmp < num)
break;
num = tmp;
}
}
else { /* ECMA 3, 1-9 or 01-99 */
int parenCount = (res.parens == null) ? 0 : res.parens.length;
num = dc - '0';
if (num > parenCount)
return null;
cp = dp + 2;
if ((dp + 2) < daL) {
dc = da.charAt(dp + 2);
if (NativeRegExp.isDigit(dc)) {
tmp = 10 * num + (dc - '0');
if (tmp <= parenCount) {
cp++;
num = tmp;
}
}
}
if (num == 0) return null; /* $0 or $00 is not valid */
}
/* Adjust num from 1 $n-origin to 0 array-index-origin. */
num--;
skip[0] = cp - dp;
return res.getParenSubString(num);
}
skip[0] = 2;
switch (dc) {
case '$':
return new SubString("$");
case '&':
return res.lastMatch;
case '+':
return res.lastParen;
case '`':
if (version == Context.VERSION_1_2) {
/*
* JS1.2 imitated the Perl4 bug where left context at each step
* in an iterative use of a global regexp started from last match,
* not from the start of the target string. But Perl4 does start
* $` at the beginning of the target string when it is used in a
* substitution, so we emulate that special case here.
*/
res.leftContext.index = 0;
res.leftContext.length = res.lastMatch.index;
}
return res.leftContext;
case '\'':
return res.rightContext;
}
return null;
}
/**
* Analog of do_replace in jsstr.c
*/
private static void do_replace(GlobData rdata, Context cx,
RegExpImpl regExpImpl)
{
StringBuilder charBuf = rdata.charBuf;
int cp = 0;
String da = rdata.repstr;
int dp = rdata.dollar;
if (dp != -1) {
int[] skip = new int[1];
do {
int len = dp - cp;
charBuf.append(da.substring(cp, dp));
cp = dp;
SubString sub = interpretDollar(cx, regExpImpl, da,
dp, skip);
if (sub != null) {
len = sub.length;
if (len > 0) {
charBuf.append(sub.str, sub.index, sub.index + len);
}
cp += skip[0];
dp += skip[0];
} else {
++dp;
}
dp = da.indexOf('$', dp);
} while (dp >= 0);
}
int daL = da.length();
if (daL > cp) {
charBuf.append(da.substring(cp, daL));
}
}
/*
* See ECMA 15.5.4.8. Modified to match JS 1.2 - optionally takes
* a limit argument and accepts a regular expression as the split
* argument.
*/
public Object js_split(Context cx, Scriptable scope,
String target, Object[] args)
{
// create an empty Array to return;
Scriptable result = cx.newArray(scope, 0);
// Use the second argument as the split limit, if given.
boolean limited = (args.length > 1) && (args[1] != Undefined.instance);
long limit = 0; // Initialize to avoid warning.
if (limited) {
/* Clamp limit between 0 and 1 + string length. */
limit = ScriptRuntime.toUint32(args[1]);
if (limit > target.length())
limit = 1 + target.length();
}
// return an array consisting of the target if no separator given
if (args.length < 1 || args[0] == Undefined.instance) {
result.put(0, result, target);
return result;
}
String separator = null;
int[] matchlen = new int[1];
Scriptable re = null;
RegExpProxy reProxy = null;
if (args[0] instanceof Scriptable) {
reProxy = ScriptRuntime.getRegExpProxy(cx);
if (reProxy != null) {
Scriptable test = (Scriptable)args[0];
if (reProxy.isRegExp(test)) {
re = test;
}
}
}
if (re == null) {
separator = ScriptRuntime.toString(args[0]);
matchlen[0] = separator.length();
}
// split target with separator or re
int[] ip = { 0 };
int match;
int len = 0;
boolean[] matched = { false };
String[][] parens = { null };
int version = cx.getLanguageVersion();
while ((match = find_split(cx, scope, target, separator, version,
reProxy, re, ip, matchlen, matched, parens))
>= 0)
{
if ((limited && len >= limit) || (match > target.length()))
break;
String substr;
if (target.length() == 0)
substr = target;
else
substr = target.substring(ip[0], match);
result.put(len, result, substr);
len++;
/*
* Imitate perl's feature of including parenthesized substrings
* that matched part of the delimiter in the new array, after the
* split substring that was delimited.
*/
if (re != null && matched[0]) {
int size = parens[0].length;
for (int num = 0; num < size; num++) {
if (limited && len >= limit)
break;
result.put(len, result, parens[0][num]);
len++;
}
matched[0] = false;
}
ip[0] = match + matchlen[0];
if (version < Context.VERSION_1_3
&& version != Context.VERSION_DEFAULT)
{
/*
* Deviate from ECMA to imitate Perl, which omits a final
* split unless a limit argument is given and big enough.
*/
if (!limited && ip[0] == target.length())
break;
}
}
return result;
}
/*
* Used by js_split to find the next split point in target,
* starting at offset ip and looking either for the given
* separator substring, or for the next re match. ip and
* matchlen must be reference variables (assumed to be arrays of
* length 1) so they can be updated in the leading whitespace or
* re case.
*
* Return -1 on end of string, >= 0 for a valid index of the next
* separator occurrence if found, or the string length if no
* separator is found.
*/
private static int find_split(Context cx, Scriptable scope, String target,
String separator, int version,
RegExpProxy reProxy, Scriptable re,
int[] ip, int[] matchlen, boolean[] matched,
String[][] parensp)
{
int i = ip[0];
int length = target.length();
/*
* Perl4 special case for str.split(' '), only if the user has selected
* JavaScript1.2 explicitly. Split on whitespace, and skip leading w/s.
* Strange but true, apparently modeled after awk.
*/
if (version == Context.VERSION_1_2 &&
re == null && separator.length() == 1 && separator.charAt(0) == ' ')
{
/* Skip leading whitespace if at front of str. */
if (i == 0) {
while (i < length && Character.isWhitespace(target.charAt(i)))
i++;
ip[0] = i;
}
/* Don't delimit whitespace at end of string. */
if (i == length)
return -1;
/* Skip over the non-whitespace chars. */
while (i < length
&& !Character.isWhitespace(target.charAt(i)))
i++;
/* Now skip the next run of whitespace. */
int j = i;
while (j < length && Character.isWhitespace(target.charAt(j)))
j++;
/* Update matchlen to count delimiter chars. */
matchlen[0] = j - i;
return i;
}
/*
* Stop if past end of string. If at end of string, we will
* return target length, so that
*
* "ab,".split(',') => new Array("ab", "")
*
* and the resulting array converts back to the string "ab,"
* for symmetry. NB: This differs from perl, which drops the
* trailing empty substring if the LIMIT argument is omitted.
*/
if (i > length)
return -1;
/*
* Match a regular expression against the separator at or
* above index i. Return -1 at end of string instead of
* trying for a match, so we don't get stuck in a loop.
*/
if (re != null) {
return reProxy.find_split(cx, scope, target, separator, re,
ip, matchlen, matched, parensp);
}
/*
* Deviate from ECMA by never splitting an empty string by any separator
* string into a non-empty array (an array of length 1 that contains the
* empty string).
*/
if (version != Context.VERSION_DEFAULT && version < Context.VERSION_1_3
&& length == 0)
return -1;
/*
* Special case: if sep is the empty string, split str into
* one character substrings. Let our caller worry about
* whether to split once at end of string into an empty
* substring.
*
* For 1.2 compatibility, at the end of the string, we return the length as
* the result, and set the separator length to 1 -- this allows the caller
* to include an additional null string at the end of the substring list.
*/
if (separator.length() == 0) {
if (version == Context.VERSION_1_2) {
if (i == length) {
matchlen[0] = 1;
return i;
}
return i + 1;
}
return (i == length) ? -1 : i + 1;
}
/* Punt to j.l.s.indexOf; return target length if separator is
* not found.
*/
if (ip[0] >= length)
return length;
i = target.indexOf(separator, ip[0]);
return (i != -1) ? i : length;
}
protected String input; /* input string to match (perl $_, GC root) */
protected boolean multiline; /* whether input contains newlines (perl $*) */
protected SubString[] parens; /* Vector of SubString; last set of parens
matched (perl $1, $2) */
protected SubString lastMatch; /* last string matched (perl $&) */
protected SubString lastParen; /* last paren matched (perl $+) */
protected SubString leftContext; /* input to left of last match (perl $`) */
protected SubString rightContext; /* input to right of last match (perl $') */
}
final class GlobData
{
int mode; /* input: return index, match object, or void */
boolean global; /* output: whether regexp was global */
String str; /* output: 'this' parameter object as string */
// match-specific data
Scriptable arrayobj;
// replace-specific data
Function lambda; /* replacement function object or null */
String repstr; /* replacement string */
int dollar = -1; /* -1 or index of first $ in repstr */
StringBuilder charBuf; /* result characters, null initially */
int leftIndex; /* leftContext index, always 0 for JS1.2 */
}