// com.google.re2j.Parser (Maven / Gradle / Ivy artifact listing)
// The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/parse.go
// TODO(adonovan):
// - Eliminate allocations (new int[], new Regexp[], new ArrayList) by
// recycling old arrays on a freelist.
package com.google.re2j;
import java.util.ArrayList;
/**
* A parser of regular expression patterns.
*
* The only public entry point is {@link #parse(String pattern, int flags)}.
*/
class Parser {
// Unexpected error: reported when the parser detects a state that should be
// impossible for any input (e.g. the stack-underflow check in
// parseRightParen).
private static final String ERR_INTERNAL_ERROR =
"regexp/syntax: internal error";
// Parse errors: each constant below is the message of a
// PatternSyntaxException thrown for a malformed pattern.
private static final String ERR_INVALID_CHAR_CLASS =
"invalid character class";
private static final String ERR_INVALID_CHAR_RANGE =
"invalid character class range";
private static final String ERR_INVALID_ESCAPE =
"invalid escape sequence";
private static final String ERR_INVALID_NAMED_CAPTURE =
"invalid named capture";
private static final String ERR_INVALID_PERL_OP =
"invalid or unsupported Perl syntax";
// A repetition operator directly follows another one (Perl mode only).
private static final String ERR_INVALID_REPEAT_OP =
"invalid nested repetition operator";
// A {min,max} count is out of range or has min > max.
private static final String ERR_INVALID_REPEAT_SIZE =
"invalid repeat count";
private static final String ERR_MISSING_BRACKET =
"missing closing ]";
private static final String ERR_MISSING_PAREN =
"missing closing )";
// A repetition operator has nothing to its left to repeat.
private static final String ERR_MISSING_REPEAT_ARGUMENT =
"missing argument to repetition operator";
private static final String ERR_TRAILING_BACKSLASH =
"trailing backslash at end of expression";
// Hack to expose ArrayList.removeRange().
private static class Stack extends ArrayList {
@Override
public void removeRange(int fromIndex, int toIndex) {
super.removeRange(fromIndex, toIndex);
}
}
// The complete pattern being parsed; also reported in "missing closing )"
// errors.
private final String wholeRegexp;
// Flags control the behavior of the parser and record information about
// regexp context.
private int flags; // parse mode flags
// Stack of parsed expressions.
private final Stack stack = new Stack();
// Head of the free list of recycled Regexp nodes. The list is threaded
// through subs[0] of each node (see newRegexp and reuse).
private Regexp free;
private int numCap = 0; // number of capturing groups seen
// Creates a parser for the pattern |wholeRegexp| whose initial parse mode
// flags are |flags|.
Parser(String wholeRegexp, int flags) {
this.wholeRegexp = wholeRegexp;
this.flags = flags;
}
// newRegexp returns a node whose operator is |op|. The head of the free list
// is recycled when it can hand over its successor through subs[0];
// otherwise a brand-new node is allocated.
private Regexp newRegexp(Regexp.Op op) {
  Regexp head = free;
  boolean recyclable =
      head != null && head.subs != null && head.subs.length > 0;
  if (!recyclable) {
    return new Regexp(op);
  }
  // Unlink the head, then wipe it before handing it out again.
  free = head.subs[0];
  head.reinit();
  head.op = op;
  return head;
}
// reuse makes |re| the new head of the free list. When |re| has at least one
// sub-expression slot, the previous head is remembered in subs[0] so that
// newRegexp can follow the chain later.
private void reuse(Regexp re) {
  Regexp[] slots = re.subs;
  if (slots != null && slots.length > 0) {
    slots[0] = free;
  }
  free = re;
}
// Parse stack manipulation.
// pop removes the topmost entry of the parse stack and returns it.
private Regexp pop() {
  int top = stack.size() - 1;
  return stack.remove(top);
}
// popToPseudo removes every entry above the topmost pseudo-operator (or the
// whole stack if there is none) and returns the removed entries in stack
// order, bottom first.
private Regexp[] popToPseudo() {
  int top = stack.size();
  int base = top;
  while (base > 0) {
    if (stack.get(base - 1).op.isPseudo()) {
      break;
    }
    base--;
  }
  Regexp[] popped = stack.subList(base, top).toArray(new Regexp[top - base]);
  stack.removeRange(base, top);
  return popped;
}
// push places |re| on the parse stack and returns it, after first trying to
// simplify it: a CHAR_CLASS that denotes a single rune, or a rune together
// with its case-folded twin, is rewritten as a LITERAL.
// Returns null when maybeConcat() stored that rune in a node that is already
// on the stack, in which case nothing new is pushed.
private Regexp push(Regexp re) {
  if (re.op == Regexp.Op.CHAR_CLASS) {
    int[] ranges = re.runes;

    // Collapse range [x-x] -> single rune x.
    if (ranges.length == 2 && ranges[0] == ranges[1]) {
      int exact = flags & ~RE2.FOLD_CASE;
      if (maybeConcat(ranges[0], exact)) {
        return null;
      }
      re.op = Regexp.Op.LITERAL;
      re.runes = new int[] { ranges[0] };
      re.flags = exact;
      stack.add(re);
      return re;
    }

    // Case-insensitive rune like [Aa] or [Δδ]: either two one-rune ranges
    // that fold to each other, or one two-rune range whose ends do.
    boolean twoFoldedSingletons =
        ranges.length == 4 &&
        ranges[0] == ranges[1] &&
        ranges[2] == ranges[3] &&
        Unicode.simpleFold(ranges[0]) == ranges[2] &&
        Unicode.simpleFold(ranges[2]) == ranges[0];
    boolean adjacentFoldedPair =
        ranges.length == 2 &&
        ranges[0] + 1 == ranges[1] &&
        Unicode.simpleFold(ranges[0]) == ranges[1] &&
        Unicode.simpleFold(ranges[1]) == ranges[0];
    if (twoFoldedSingletons || adjacentFoldedPair) {
      int folded = flags | RE2.FOLD_CASE;
      if (maybeConcat(ranges[0], folded)) {
        return null;
      }
      // Rewrite as (case-insensitive) literal.
      re.op = Regexp.Op.LITERAL;
      re.runes = new int[] { ranges[0] };
      re.flags = folded;
      stack.add(re);
      return re;
    }
  }

  // Anything else: just finish any pending incremental concatenation.
  maybeConcat(-1, 0);
  stack.add(re);
  return re;
}
// maybeConcat implements incremental concatenation of literal runes into
// string nodes. The parser calls it before each push, so only the two
// topmost stack entries can need merging. Because a push is about to
// happen, the topmost literal can no longer be the operand of an operator
// such as * (otherwise ab* would turn into (ab)*).
//
// When the two topmost entries are literals with the same case-folding
// setting, the top one is appended to the one below it. If r >= 0 the freed
// top node is then recycled in place to hold r with the given flags and the
// method returns true; otherwise the top node is popped and released.
// Returns false whenever r was not pushed.
private boolean maybeConcat(int r, int flags) {
  int depth = stack.size();
  if (depth < 2) {
    return false;
  }
  Regexp top = stack.get(depth - 1);
  Regexp below = stack.get(depth - 2);
  boolean bothLiteral =
      top.op == Regexp.Op.LITERAL && below.op == Regexp.Op.LITERAL;
  if (!bothLiteral) {
    return false;
  }
  if ((top.flags & RE2.FOLD_CASE) != (below.flags & RE2.FOLD_CASE)) {
    return false;
  }

  // Append the top literal to the one beneath it.
  below.runes = concatRunes(below.runes, top.runes);

  if (r < 0) {
    // Nothing to store: drop the now-redundant top node.
    pop();
    reuse(top);
    return false; // did not push r
  }

  // Recycle the top node in place as the literal for r.
  top.runes = new int[] { r };
  top.flags = flags;
  return true;
}
// newLiteral builds a one-rune LITERAL node carrying |flags|. Under
// RE2.FOLD_CASE the stored rune is the smallest member of r's fold orbit.
private Regexp newLiteral(int r, int flags) {
  Regexp lit = newRegexp(Regexp.Op.LITERAL);
  lit.flags = flags;
  int stored = (flags & RE2.FOLD_CASE) != 0 ? minFoldRune(r) : r;
  lit.runes = new int[] { stored };
  return lit;
}
// minFoldRune walks the Unicode.simpleFold() cycle that starts at r and
// returns the smallest rune met on the way. Runes outside
// [Unicode.MIN_FOLD, Unicode.MAX_FOLD] are returned unchanged.
private static int minFoldRune(int r) {
  if (r < Unicode.MIN_FOLD || r > Unicode.MAX_FOLD) {
    return r;
  }
  int smallest = r;
  int next = Unicode.simpleFold(r);
  while (next != r) {
    smallest = Math.min(smallest, next);
    next = Unicode.simpleFold(next);
  }
  return smallest;
}
// literal pushes a literal regexp for the rune r, built with the current
// flags, onto the stack. It returns nothing.
private void literal(int r) {
  Regexp lit = newLiteral(r, flags);
  push(lit);
}
// op creates a node for the given operator, stamps it with the current
// flags, pushes it and returns whatever push() returns.
private Regexp op(Regexp.Op op) {
  Regexp node = newRegexp(op);
  node.flags = flags;
  Regexp pushed = push(node);
  return pushed;
}
// repeat replaces the top stack element with itself repeated according to
// op, min, max.
// beforePos is the cursor position at which the repetition operator starts;
// lastRepeatPos is the start of an immediately preceding repetition
// operator, or -1 if there was none.
// Pre: t is positioned just after the repetition operator.
// Post: in Perl mode t has also moved past an optional '?', which toggles
// RE2.NON_GREEDY for the new node; otherwise t is unchanged.
// Throws PatternSyntaxException for stacked operators (Perl mode) or when
// there is nothing on the stack to repeat.
private void repeat(Regexp.Op op, int min, int max, int beforePos,
    StringIterator t, int lastRepeatPos)
    throws PatternSyntaxException {
  int nodeFlags = this.flags;
  boolean perlMode = (nodeFlags & RE2.PERL_X) != 0;
  if (perlMode) {
    boolean lazyMarker = t.more() && t.lookingAt('?');
    if (lazyMarker) {
      t.skip(1); // '?'
      nodeFlags ^= RE2.NON_GREEDY;
    }
    if (lastRepeatPos != -1) {
      // In Perl it is not allowed to stack repetition operators:
      // a** is a syntax error, not a doubled star, and a++ means
      // something else entirely, which we don't support!
      throw new PatternSyntaxException(
          ERR_INVALID_REPEAT_OP, t.from(lastRepeatPos));
    }
  }

  // The operand is whatever sits on top of the stack; a pseudo-operator
  // such as '(' or '|' cannot be repeated.
  int top = stack.size() - 1;
  if (top < 0 || stack.get(top).op.isPseudo()) {
    throw new PatternSyntaxException(
        ERR_MISSING_REPEAT_ARGUMENT, t.from(beforePos));
  }
  Regexp operand = stack.get(top);

  Regexp wrapper = newRegexp(op);
  wrapper.min = min;
  wrapper.max = max;
  wrapper.flags = nodeFlags;
  wrapper.subs = new Regexp[] { operand };
  stack.set(top, wrapper);
}
// concat replaces everything above the topmost '|' or '(' with its
// concatenation and returns the result of pushing it. Nothing above the
// pseudo-operator yields an EMPTY_MATCH node.
private Regexp concat() {
  maybeConcat(-1, 0);

  // Everything down to the pseudo-operator | or ( takes part.
  Regexp[] parts = popToPseudo();

  Regexp result = (parts.length == 0)
      ? newRegexp(Regexp.Op.EMPTY_MATCH)
      : collapse(parts, Regexp.Op.CONCAT);
  return push(result);
}
// alternate replaces everything above the topmost '(' with its alternation
// and returns the result of pushing it. (By now no '|' remains above the
// '('.)
private Regexp alternate() {
  Regexp[] branches = popToPseudo();

  // Empty alternate is a special case (shouldn't happen but easy to handle).
  if (branches.length == 0) {
    return push(newRegexp(Regexp.Op.NO_MATCH));
  }

  // Only the last branch can still be unclean; the earlier ones were cleaned
  // by swapVerticalBar.
  cleanAlt(branches[branches.length - 1]);
  return push(collapse(branches, Regexp.Op.ALTERNATE));
}
// cleanAlt cleans re for eventual inclusion in an alternation. Only
// CHAR_CLASS nodes are touched: the class is normalized, and a class that
// covers every rune, or every rune except '\n', is rewritten as ANY_CHAR or
// ANY_CHAR_NOT_NL respectively.
private void cleanAlt(Regexp re) {
  if (re.op != Regexp.Op.CHAR_CLASS) {
    return;
  }
  int[] cleaned = new CharClass(re.runes).cleanClass().toArray();
  re.runes = cleaned;

  boolean coversEverything =
      cleaned.length == 2 &&
      cleaned[0] == 0 &&
      cleaned[1] == Unicode.MAX_RUNE;
  if (coversEverything) {
    re.runes = null;
    re.op = Regexp.Op.ANY_CHAR;
    return;
  }

  boolean coversAllButNewline =
      cleaned.length == 4 &&
      cleaned[0] == 0 &&
      cleaned[1] == '\n' - 1 &&
      cleaned[2] == '\n' + 1 &&
      cleaned[3] == Unicode.MAX_RUNE;
  if (coversAllButNewline) {
    re.runes = null;
    re.op = Regexp.Op.ANY_CHAR_NOT_NL;
  }
}
// collapse returns the result of applying op to subs.
// Children that already have operator op are hoisted into the new node, so
// there is never a concat of a concat or an alternate of an alternate.
// A single-element subs is returned as is. Alternations are additionally
// passed through factor(), and an alternation that factors down to a single
// branch is replaced by that branch.
private Regexp collapse(Regexp[] subs, Regexp.Op op) {
  if (subs.length == 1) {
    return subs[0];
  }

  // First pass: size of the flattened child list.
  int total = 0;
  for (int k = 0; k < subs.length; k++) {
    Regexp child = subs[k];
    total += (child.op == op) ? child.subs.length : 1;
  }

  // Second pass: copy, splicing in the children of same-op nodes.
  Regexp[] flat = new Regexp[total];
  int next = 0;
  for (Regexp child : subs) {
    if (child.op != op) {
      flat[next++] = child;
      continue;
    }
    int count = child.subs.length;
    System.arraycopy(child.subs, 0, flat, next, count);
    next += count;
    reuse(child);
  }

  Regexp result = newRegexp(op);
  result.subs = flat;
  if (op != Regexp.Op.ALTERNATE) {
    return result;
  }

  result.subs = factor(result.subs, result.flags);
  if (result.subs.length != 1) {
    return result;
  }
  // Everything factored into one branch: the wrapper is no longer needed.
  Regexp only = result.subs[0];
  reuse(result);
  return only;
}
// factor factors common prefixes from the alternation list |array|. It
// returns a replacement list; each round below writes its output into the
// front of |array|, and Regexps that drop out are handed to reuse().
//
// For example,
// ABC|ABD|AEF|BCX|BCY
// simplifies by literal prefix extraction to
// A(B(C|D)|EF)|BC(X|Y)
// which simplifies by character class introduction to
// A(B[CD]|EF)|BC[XY]
//
// The work is done in four rounds: common literal prefixes, common leading
// sub-expressions, runs of single runes / character classes, and runs of
// empty matches.
//
// NOTE(review): the |flags| parameter is not read anywhere in this method.
private Regexp[] factor(Regexp[] array, int flags) {
// Fewer than two alternatives: nothing can be shared.
if (array.length < 2) {
return array;
}
// The following code is subtle, because it's a literal Java
// translation of code that makes clever use of Go "slices".
// A slice is a triple (array, offset, length), and the Go
// implementation uses two slices, |sub| and |out| backed by the
// same array. In Java, we have to be explicit about all of these
// variables, so:
//
// Go Java
// sub (array, s, lensub)
// out (array, 0, lenout) // (always a prefix of |array|)
//
// In the comments we'll use the logical notation of Go slices, e.g. sub[i]
// even though the Java code will read array[s + i].
int s = 0; // offset of first |sub| within array.
int lensub = array.length; // = len(sub)
int lenout = 0; // = len(out)
// Round 1: Factor out common literal prefixes.
// Note: (str, strlen) and (istr, istrlen) are like Go slices
// onto a prefix of some Regexp's runes array (hence offset=0).
int[] str = null;
int strlen = 0;
int strflags = 0;
int start = 0;
// The loop deliberately runs one step past the end (i == lensub) so that
// the final run is flushed by the same code as every other run.
for (int i = 0; i <= lensub; i++) {
// Invariant: the Regexps that were in sub[0:start] have been
// used or marked for reuse, and the slice space has been reused
// for out (lenout <= start).
//
// Invariant: sub[start:i] consists of regexps that all begin
// with str as modified by strflags.
int[] istr = null;
int istrlen = 0;
int iflags = 0;
if (i < lensub) {
// NB, we inlined Go's leadingString() since Java has no pair return.
Regexp re = array[s + i];
if (re.op == Regexp.Op.CONCAT && re.subs.length > 0) {
re = re.subs[0];
}
if (re.op == Regexp.Op.LITERAL) {
istr = re.runes;
istrlen = re.runes.length;
iflags = re.flags & RE2.FOLD_CASE;
}
// istr is the leading literal string that re begins with.
// The string refers to storage in re or its children.
if (iflags == strflags) {
// Length of the prefix shared by the run so far and sub[i].
int same = 0;
while (same < strlen &&
same < istrlen &&
str[same] == istr[same]) {
same++;
}
if (same > 0) {
// Matches at least one rune in current range.
// Keep going around.
strlen = same;
continue;
}
}
}
// Found end of a run with common leading literal string:
// sub[start:i] all begin with str[0:strlen], but sub[i]
// does not even begin with str[0].
//
// Factor out common string and append factored expression to out.
if (i == start) {
// Nothing to do - run of length 0.
} else if (i == start + 1) {
// Just one: don't bother factoring.
array[lenout++] = array[s + start];
} else {
// Construct factored form: prefix(suffix1|suffix2|...)
Regexp prefix = newRegexp(Regexp.Op.LITERAL);
prefix.flags = strflags;
prefix.runes = Utils.subarray(str, 0, strlen);
for (int j = start; j < i; j++) {
array[s + j] = removeLeadingString(array[s + j], strlen);
}
// Recurse.
Regexp suffix =
collapse(subarray(array, s + start, s + i), Regexp.Op.ALTERNATE);
Regexp re = newRegexp(Regexp.Op.CONCAT);
re.subs = new Regexp[] { prefix, suffix };
array[lenout++] = re;
}
// Prepare for next iteration.
start = i;
str = istr;
strlen = istrlen;
strflags = iflags;
}
// In Go: sub = out
lensub = lenout;
s = 0;
// Round 2: Factor out common complex prefixes,
// just the first piece of each concatenation,
// whatever it is. This is good enough a lot of the time.
start = 0;
lenout = 0;
Regexp first = null;
for (int i = 0; i <= lensub; i++) {
// Invariant: the Regexps that were in sub[0:start] have been
// used or marked for reuse, and the slice space has been reused
// for out (lenout <= start).
//
// Invariant: sub[start:i] consists of regexps that all begin with
// first.
Regexp ifirst = null;
if (i < lensub) {
ifirst = leadingRegexp(array[s + i]);
if (first != null && first.equals(ifirst)) {
continue;
}
}
// Found end of a run with common leading regexp:
// sub[start:i] all begin with first but sub[i] does not.
//
// Factor out common regexp and append factored expression to out.
if (i == start) {
// Nothing to do - run of length 0.
} else if (i == start + 1) {
// Just one: don't bother factoring.
array[lenout++] = array[s + start];
} else {
// Construct factored form: prefix(suffix1|suffix2|...)
Regexp prefix = first;
for (int j = start; j < i; j++) {
// sub[start]'s leading regexp lives on as |prefix|, so only the
// leading regexps of the other members may be recycled.
boolean reuse = j != start; // prefix came from sub[start]
array[s + j] = removeLeadingRegexp(array[s + j], reuse);
}
// recurse
Regexp suffix =
collapse(subarray(array, s + start, s + i), Regexp.Op.ALTERNATE);
Regexp re = newRegexp(Regexp.Op.CONCAT);
re.subs = new Regexp[] { prefix, suffix };
array[lenout++] = re;
}
// Prepare for next iteration.
start = i;
first = ifirst;
}
// In Go: sub = out
lensub = lenout;
s = 0;
// Round 3: Collapse runs of single literals into character classes.
start = 0;
lenout = 0;
for (int i = 0; i <= lensub; i++) {
// Invariant: the Regexps that were in sub[0:start] have been
// used or marked for reuse, and the slice space has been reused
// for out (lenout <= start).
//
// Invariant: sub[start:i] consists of regexps that are either
// literal runes or character classes.
if (i < lensub && isCharClass(array[s + i])) {
continue;
}
// sub[i] is not a char or char class;
// emit char class for sub[start:i]...
if (i == start) {
// Nothing to do - run of length 0.
} else if (i == start + 1) {
// Just one: nothing to merge it with.
array[lenout++] = array[s + start];
} else {
// Make new char class.
// Start with most complex regexp in sub[start]: the one with the
// highest op ordinal and, among equal ops, the most runes.
int max = start;
for (int j = start + 1; j < i; j++) {
Regexp subMax = array[s + max], subJ = array[s + j];
if (subMax.op.ordinal() < subJ.op.ordinal() ||
subMax.op == subJ.op && subMax.runes.length < subJ.runes.length) {
max = j;
}
}
// swap sub[start], sub[max].
Regexp tmp = array[s + start];
array[s + start] = array[s + max];
array[s + max] = tmp;
// Fold every other member of the run into sub[start].
for (int j = start + 1; j < i; j++) {
mergeCharClass(array[s + start], array[s + j]);
reuse(array[s + j]);
}
cleanAlt(array[s + start]);
array[lenout++] = array[s + start];
}
// ... and then emit sub[i].
if (i < lensub) {
array[lenout++] = array[s + i];
}
// sub[i] was emitted as is, so the next run starts after it.
start = i + 1;
}
// In Go: sub = out
lensub = lenout;
s = 0;
// Round 4: Collapse runs of empty matches into a single empty match.
start = 0;
lenout = 0;
for (int i = 0; i < lensub; ++i) {
// Skip an empty match that is directly followed by another one, so that
// only the last of each run is kept.
if (i + 1 < lensub &&
array[s + i].op == Regexp.Op.EMPTY_MATCH &&
array[s + i + 1].op == Regexp.Op.EMPTY_MATCH) {
continue;
}
array[lenout++] = array[s + i];
}
// In Go: sub = out
lensub = lenout;
s = 0;
return subarray(array, s, lensub);
}
// removeLeadingString strips the first n leading runes from re and returns
// the node that should replace re.
// For a concatenation the runes come off its first child; if that child
// becomes an empty match it is dropped, which may shrink or dissolve the
// concatenation. A literal that loses all of its runes becomes EMPTY_MATCH.
// Any other node is returned untouched.
private Regexp removeLeadingString(Regexp re, int n) {
  boolean nonEmptyConcat =
      re.op == Regexp.Op.CONCAT && re.subs.length > 0;

  if (!nonEmptyConcat) {
    if (re.op == Regexp.Op.LITERAL) {
      re.runes = Utils.subarray(re.runes, n, re.runes.length);
      if (re.runes.length == 0) {
        re.op = Regexp.Op.EMPTY_MATCH;
      }
    }
    return re;
  }

  // Removing a leading string in a concatenation
  // might simplify the concatenation.
  Regexp head = removeLeadingString(re.subs[0], n);
  re.subs[0] = head;
  if (head.op != Regexp.Op.EMPTY_MATCH) {
    return re;
  }

  reuse(head);
  int count = re.subs.length;
  if (count <= 1) {
    // Impossible but handle.
    re.op = Regexp.Op.EMPTY_MATCH;
    re.subs = null;
    return re;
  }
  if (count == 2) {
    // Only one child is left: it stands in for the concatenation.
    Regexp survivor = re.subs[1];
    reuse(re);
    return survivor;
  }
  re.subs = subarray(re.subs, 1, re.subs.length);
  return re;
}
// leadingRegexp returns the regexp that re begins with: the first child of
// a non-empty concatenation, otherwise re itself. It returns null when that
// regexp is an empty match.
// The result refers to storage in re or its children.
private static Regexp leadingRegexp(Regexp re) {
  Regexp head = re;
  if (re.op == Regexp.Op.CONCAT && re.subs.length > 0) {
    head = re.subs[0];
  }
  return head.op == Regexp.Op.EMPTY_MATCH ? null : head;
}
// removeLeadingRegexp removes the leading regexp of re and returns the node
// that should replace re.
// If reuse is true, the removed regexp is passed to reuse().
// A concatenation loses its first child; with no children left it turns
// into EMPTY_MATCH, and with one child left that child replaces it.
// Anything else is its own leading regexp, so an EMPTY_MATCH replaces it.
private Regexp removeLeadingRegexp(Regexp re, boolean reuse) {
  boolean nonEmptyConcat =
      re.op == Regexp.Op.CONCAT && re.subs.length > 0;

  if (!nonEmptyConcat) {
    if (reuse) {
      reuse(re);
    }
    return newRegexp(Regexp.Op.EMPTY_MATCH);
  }

  if (reuse) {
    reuse(re.subs[0]);
  }
  re.subs = subarray(re.subs, 1, re.subs.length);

  int remaining = re.subs.length;
  if (remaining == 0) {
    re.op = Regexp.Op.EMPTY_MATCH;
    re.subs = Regexp.EMPTY_SUBS;
    return re;
  }
  if (remaining == 1) {
    Regexp survivor = re.subs[0];
    reuse(re);
    return survivor;
  }
  return re;
}
// literalRegexp returns a freshly allocated LITERAL node that carries
// |flags| and whose runes are the code points of s.
private static Regexp literalRegexp(String s, int flags) {
  Regexp lit = new Regexp(Regexp.Op.LITERAL);
  int[] runes = Utils.stringToRunes(s);
  lit.flags = flags;
  lit.runes = runes;
  return lit;
}
// Parsing.
// StringIterator: a stream of runes with an opaque cursor, permitting
// rewinding. The units of the cursor are not specified beyond the
// fact that ASCII characters are single width. (Cursor positions
// could be UTF-8 byte indices, UTF-16 code indices or rune indices.)
//
// In particular, be careful with:
// - skip(int): only use this to advance over ASCII characters
//   since these always have a width of 1.
// - skipString(String): only use this to advance over strings which are
//   known to be at the current position, e.g. due to prior call to
//   lookingAt().
// Only use pop() to advance over possibly non-ASCII runes.
private static class StringIterator {
  private final String str; // a stream of UTF-16 codes
  private int pos = 0; // current position in UTF-16 string

  StringIterator(String str) {
    this.str = str;
  }

  // Returns the cursor position. Do not interpret the result!
  int pos() {
    return pos;
  }

  // Resets the cursor position to a previous value returned by pos().
  void rewindTo(int pos) {
    this.pos = pos;
  }

  // Returns true unless the stream is exhausted.
  boolean more() {
    return pos < str.length();
  }

  // Returns the rune at the cursor position.
  // Precondition: |more()|.
  int peek() {
    return str.codePointAt(pos);
  }

  // Advances the cursor by |n| positions, which must be ASCII runes.
  //
  // (In practice, this is only ever used to skip over regexp
  // metacharacters that are ASCII, so there is no numeric difference
  // between indices into UTF-8 bytes, UTF-16 codes and runes.)
  void skip(int n) {
    pos += n;
  }

  // Advances the cursor by the number of cursor positions in |s|.
  void skipString(String s) {
    pos += s.length();
  }

  // Returns the rune at the cursor position, and advances the cursor
  // past it. Precondition: |more()|.
  int pop() {
    int r = str.codePointAt(pos);
    pos += Character.charCount(r);
    return r;
  }

  // Equivalent to peek() == c, but cheaper: |c| is a single UTF-16 code
  // unit, so no surrogate decoding is needed. Precondition: |more()|.
  boolean lookingAt(char c) {
    return str.charAt(pos) == c;
  }

  // Reports whether the unread part of the stream starts with |s|.
  // The comparison is done in place, without materializing rest(), so a
  // call costs O(|s|) rather than O(remaining input). It simply answers
  // false when the cursor has been moved past the end of the stream.
  boolean lookingAt(String s) {
    return str.startsWith(s, pos);
  }

  // Returns the rest of the pattern as a Java UTF-16 string.
  String rest() {
    return str.substring(pos);
  }

  // Returns the substring from |beforePos| to the current position.
  // |beforePos| must have been previously returned by |pos()|.
  String from(int beforePos) {
    return str.substring(beforePos, pos);
  }

  @Override
  public String toString() {
    return rest();
  }
}
/**
 * Parses the regular expression {@code pattern} under the mode flags
 * {@code flags} and returns the resulting syntax tree.
 *
 * @throws PatternSyntaxException if the pattern is malformed
 */
static Regexp parse(String pattern, int flags)
    throws PatternSyntaxException {
  Parser parser = new Parser(pattern, flags);
  return parser.parseInternal();
}
// parseInternal parses wholeRegexp under the current flags and returns the
// root of the resulting syntax tree.
//
// With RE2.LITERAL set, the whole pattern becomes a single LITERAL node.
// Otherwise the pattern is scanned left to right; each construct is turned
// into nodes on the parse stack, and the stack is folded into one tree at
// the end.
//
// Throws PatternSyntaxException if the pattern is malformed, including when
// more than one entry remains on the stack at the end (reported as a
// missing ')').
private Regexp parseInternal() throws PatternSyntaxException {
  if ((flags & RE2.LITERAL) != 0) {
    // Trivial parser for literal string.
    return literalRegexp(wholeRegexp, flags);
  }

  // Otherwise, must do real work.

  // lastRepeatPos is where the previous construct started if it was a
  // repetition operator, else -1; repeat() uses it to reject a** and a++.
  int lastRepeatPos = -1, min = -1, max = -1;
  StringIterator t = new StringIterator(wholeRegexp);
  while (t.more()) {
    int repeatPos = -1;
    bigswitch:
    switch (t.peek()) {
      default:
        literal(t.pop());
        break;

      case '(':
        if ((flags & RE2.PERL_X) != 0 && t.lookingAt("(?")) {
          // Flag changes and non-capturing groups.
          parsePerlFlags(t);
          break;
        }
        op(Regexp.Op.LEFT_PAREN).cap = ++numCap;
        t.skip(1); // '('
        break;

      case '|':
        parseVerticalBar();
        t.skip(1); // '|'
        break;

      case ')':
        parseRightParen();
        t.skip(1); // ')'
        break;

      case '^':
        if ((flags & RE2.ONE_LINE) != 0) {
          op(Regexp.Op.BEGIN_TEXT);
        } else {
          op(Regexp.Op.BEGIN_LINE);
        }
        t.skip(1); // '^'
        break;

      case '$':
        if ((flags & RE2.ONE_LINE) != 0) {
          op(Regexp.Op.END_TEXT).flags |= RE2.WAS_DOLLAR;
        } else {
          op(Regexp.Op.END_LINE);
        }
        t.skip(1); // '$'
        break;

      case '.':
        if ((flags & RE2.DOT_NL) != 0) {
          op(Regexp.Op.ANY_CHAR);
        } else {
          op(Regexp.Op.ANY_CHAR_NOT_NL);
        }
        t.skip(1); // '.'
        break;

      case '[':
        parseClass(t);
        break;

      case '*':
      case '+':
      case '?': {
        repeatPos = t.pos();
        Regexp.Op op = null;
        switch (t.pop()) {
          case '*': op = Regexp.Op.STAR; break;
          case '+': op = Regexp.Op.PLUS; break;
          case '?': op = Regexp.Op.QUEST; break;
        }
        repeat(op, min, max, repeatPos, t, lastRepeatPos);
        // (min and max are now dead.)
        break;
      }

      case '{': {
        repeatPos = t.pos();
        int minMax = parseRepeat(t);
        if (minMax < 0) {
          // If the repeat cannot be parsed, { is a literal.
          t.rewindTo(repeatPos);
          literal(t.pop()); // '{'
          break;
        }
        // parseRepeat packs min into the high and max into the low halfword.
        min = minMax >> 16;
        max = (short) (minMax & 0xffff); // sign extend
        repeat(Regexp.Op.REPEAT, min, max, repeatPos, t, lastRepeatPos);
        break;
      }

      case '\\': {
        int savedPos = t.pos();
        t.skip(1); // '\\'
        if ((flags & RE2.PERL_X) != 0 && t.more()) {
          int c = t.pop();
          switch (c) {
            case 'A':
              op(Regexp.Op.BEGIN_TEXT);
              break bigswitch;
            case 'b':
              op(Regexp.Op.WORD_BOUNDARY);
              break bigswitch;
            case 'B':
              op(Regexp.Op.NO_WORD_BOUNDARY);
              break bigswitch;
            case 'C':
              // any byte; not supported
              throw new PatternSyntaxException(ERR_INVALID_ESCAPE, "\\C");
            case 'Q': {
              // \Q ... \E: the ... is always literals
              String lit = t.rest();
              int i = lit.indexOf("\\E");
              if (i >= 0) {
                lit = lit.substring(0, i);
              }
              t.skipString(lit);
              if (i >= 0) {
                // Only step over the terminator when there is one, so the
                // cursor never runs past the end of the pattern.
                t.skipString("\\E");
              }
              // Push the quoted text one rune at a time, exactly like
              // unquoted literals. maybeConcat() merges them lazily, so a
              // following repetition operator applies to the last rune only
              // (\Qab\E* means ab*, not (ab)*), and an empty \Q\E pushes
              // nothing.
              for (int j = 0; j < lit.length(); ) {
                int rune = lit.codePointAt(j);
                literal(rune);
                j += Character.charCount(rune);
              }
              break bigswitch;
            }
            case 'z':
              op(Regexp.Op.END_TEXT);
              break bigswitch;
            default:
              // Not one of the escapes handled here; fall out to the
              // generic handling below.
              break;
          }
        }

        // Every check below expects the cursor to be on the backslash, so
        // rewind no matter how we got here (Perl mode off, nothing after
        // the backslash, or an escape the switch above does not handle).
        t.rewindTo(savedPos);

        Regexp re = newRegexp(Regexp.Op.CHAR_CLASS);
        re.flags = flags;

        // Look for Unicode character group like \p{Han}
        if (t.lookingAt("\\p") || t.lookingAt("\\P")) {
          CharClass cc = new CharClass();
          if (parseUnicodeClass(t, cc)) {
            re.runes = cc.toArray();
            push(re);
            break bigswitch;
          }
        }

        // Perl character class escape.
        CharClass cc = new CharClass();
        if (parsePerlClassEscape(t, cc)) {
          re.runes = cc.toArray();
          push(re);
          break bigswitch;
        }

        t.rewindTo(savedPos);
        reuse(re);

        // Ordinary single-character escape.
        literal(parseEscape(t));
        break;
      }
    }
    lastRepeatPos = repeatPos;
  }

  // End of input: fold what is left on the stack into a single tree, the
  // same way a closing parenthesis would.
  concat();
  if (swapVerticalBar()) {
    pop(); // pop vertical bar
  }
  alternate();

  int n = stack.size();
  if (n != 1) {
    throw new PatternSyntaxException(ERR_MISSING_PAREN, wholeRegexp);
  }
  return stack.get(0);
}
// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
// If |t| is not of that form, it returns -1.
// If |t| has the right form but a count is too big (over 1000, or too long
// to parse) or min > max, it throws PatternSyntaxException.
// On success, returns a nonnegative number encoding min/max in the
// high/low signed halfwords of the result. (Note: min >= 0; max may
// be -1.)
//
// On success, advances |t| beyond the repeat; otherwise |t.pos()| is
// undefined.
private static int parseRepeat(StringIterator t)
    throws PatternSyntaxException {
  int start = t.pos();
  if (!(t.more() && t.lookingAt('{'))) {
    return -1;
  }
  t.skip(1); // '{'

  int lo = parseInt(t); // (can be -2)
  if (lo == -1 || !t.more()) {
    return -1;
  }

  // Without a comma the single count is both bounds.
  int hi = lo;
  if (t.lookingAt(',')) {
    t.skip(1); // ','
    if (!t.more()) {
      return -1;
    }
    if (t.lookingAt('}')) {
      hi = -1; // open-ended
    } else {
      hi = parseInt(t); // (can be -2)
      if (hi == -1) {
        return -1;
      }
    }
  }

  if (!(t.more() && t.lookingAt('}'))) {
    return -1;
  }
  t.skip(1); // '}'

  boolean badMin = lo < 0 || lo > 1000;
  boolean badMax = hi == -2 || hi > 1000;
  boolean inverted = hi >= 0 && lo > hi;
  if (badMin || badMax || inverted) {
    // Numbers were negative or too big, or max is present and min > max.
    throw new PatternSyntaxException(ERR_INVALID_REPEAT_SIZE, t.from(start));
  }
  return (lo << 16) | (hi & 0xffff); // success
}
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It also handles named capturing groups,
// (?P<name> and (?<name>.
// Pre: t at "(?". Post: t after ")" for a pure flag setting, or after the
// group opener (":" or ">") when a group is started.
// Increments numCap for a named capture, and updates this.flags for a flag
// setting.
// Throws PatternSyntaxException for a malformed name or flag sequence.
private void parsePerlFlags(StringIterator t) throws PatternSyntaxException {
int startPos = t.pos();
// Check for named captures, first introduced in Python's regexp library.
// As usual, there are three slightly different syntaxes:
//
// (?P<name>expr) the original, introduced by Python
// (?<name>expr) the .NET alteration, adopted by Perl 5.10
// (?'name'expr) another .NET alteration, adopted by Perl 5.10
//
// Perl 5.10 gave in and implemented the Python version too,
// but they claim that the last two are the preferred forms.
// PCRE and languages based on it (specifically, PHP and Ruby)
// support all three as well. EcmaScript 4 uses only the Python form.
//
// In both the open source world (via Code Search) and the
// Google source tree, (?P<name>expr) is the dominant form.
// Java Pattern uses (?<name>expr), so we implement both.
String s = t.rest();
if (s.startsWith("(?P<") || s.startsWith("(?<")) {
// Pull out name.
// startLength is the length of the opener before the name:
// 4 for "(?P<", 3 for "(?<".
int startLength = 4;
if(s.startsWith("(?<")) {
startLength = 3;
}
int end = s.indexOf('>');
if (end < 0) {
throw new PatternSyntaxException(ERR_INVALID_NAMED_CAPTURE, s);
}
String name = s.substring(startLength, end); // "name"
t.skipString(name);
t.skip(startLength + 1); // "(?P<>" or "(?<>"
if (!isValidCaptureName(name)) {
throw new PatternSyntaxException(
ERR_INVALID_NAMED_CAPTURE, s.substring(0, end)); // e.g. "(?P<name", without the closing '>'
}
// Like ordinary capture, but named.
Regexp re = op(Regexp.Op.LEFT_PAREN);
re.cap = ++numCap;
re.name = name;
return;
}
// Non-capturing group. Might also twiddle Perl flags.
t.skip(2); // "(?"
// Work on a local copy; this.flags is only updated once the whole flag
// sequence has been accepted.
int flags = this.flags;
int sign = +1;
boolean sawFlag = false;
loop:
while (t.more()) {
int c = t.pop();
switch (c) {
default:
// Unknown flag character: report the error below.
break loop;
// Flags.
case 'i':
flags |= RE2.FOLD_CASE;
sawFlag = true;
break;
case 'm':
flags &= ~RE2.ONE_LINE;
sawFlag = true;
break;
case 's':
flags |= RE2.DOT_NL;
sawFlag = true;
break;
case 'U':
flags |= RE2.NON_GREEDY;
sawFlag = true;
break;
// Switch to negation.
case '-':
if (sign < 0) {
// A second '-' is an error.
break loop;
}
sign = -1;
// Invert flags so that | above turn into &~ and vice versa.
// We'll invert flags again before using it below.
flags = ~flags;
sawFlag = false;
break;
// End of flags, starting group or not.
case ':':
case ')':
if (sign < 0) {
if (!sawFlag) {
// A '-' must be followed by at least one flag.
break loop;
}
flags = ~flags;
}
if (c == ':') {
// Open new group
op(Regexp.Op.LEFT_PAREN);
}
this.flags = flags;
return;
}
}
throw new PatternSyntaxException(ERR_INVALID_PERL_OP, t.from(startPos));
}
// isValidCaptureName reports whether name is a valid capture name, i.e. a
// non-empty string in which every character is '_' or accepted by
// Utils.isalnum().
// PCRE limits names to 32 bytes.
// Python rejects names starting with digits.
// We don't enforce either of those.
private static boolean isValidCaptureName(String name) {
  int length = name.length();
  for (int k = 0; k < length; k++) {
    char ch = name.charAt(k);
    boolean allowed = ch == '_' || Utils.isalnum(ch);
    if (!allowed) {
      return false;
    }
  }
  return length > 0;
}
// parseInt parses a nonnegative decimal integer at the cursor of |t| and
// advances |t| past its digits.
// Returns the value, or:
//   -1 => bad format (no digits, or a leading zero on a multi-digit number);
//   -2 => format ok, but the number has more than 8 digits and is treated
//         as overflow.
private static int parseInt(StringIterator t) {
  int start = t.pos();
  int c;
  while (t.more() && (c = t.peek()) >= '0' && c <= '9') {
    t.skip(1); // digit
  }
  String n = t.from(start);
  if (n.isEmpty() ||
      n.length() > 1 && n.charAt(0) == '0') { // disallow leading zeros
    return -1; // bad format
  }
  if (n.length() > 8) {
    return -2; // overflow
  }
  // At most 8 decimal digits, so this always fits in an int. parseInt()
  // yields the primitive directly instead of boxing through valueOf().
  return Integer.parseInt(n, 10);
}
// isCharClass reports whether re can be represented as a character class:
// a single-rune literal string, a char class, ., or .|\n.
private static boolean isCharClass(Regexp re) {
  Regexp.Op kind = re.op;
  if (kind == Regexp.Op.LITERAL) {
    return re.runes.length == 1;
  }
  return kind == Regexp.Op.CHAR_CLASS
      || kind == Regexp.Op.ANY_CHAR_NOT_NL
      || kind == Regexp.Op.ANY_CHAR;
}
// matchRune reports whether re, taken as a single-rune matcher, matches r.
// Only LITERAL, CHAR_CLASS, ANY_CHAR_NOT_NL and ANY_CHAR nodes can match;
// any other node yields false.
private static boolean matchRune(Regexp re, int r) {
  switch (re.op) {
    case ANY_CHAR:
      return true;
    case ANY_CHAR_NOT_NL:
      return r != '\n';
    case LITERAL:
      return re.runes.length == 1 && re.runes[0] == r;
    case CHAR_CLASS: {
      // runes holds inclusive [lo, hi] pairs.
      int[] ranges = re.runes;
      int lo = 0;
      while (lo < ranges.length) {
        if (ranges[lo] <= r && r <= ranges[lo + 1]) {
          return true;
        }
        lo += 2;
      }
      return false;
    }
    default:
      return false;
  }
}
// parseVerticalBar handles a | in the input.
private void parseVerticalBar() {
  concat();

  // The concatenation we just parsed is on top of the stack.
  // If it sits above a VERTICAL_BAR, swapVerticalBar() moves it below
  // (things below a VERTICAL_BAR become an alternation) and we are done.
  boolean swapped = swapVerticalBar();
  if (swapped) {
    return;
  }
  // Otherwise, push a new vertical bar.
  op(Regexp.Op.VERTICAL_BAR);
}
// mergeCharClass makes dst = dst|src.
// The caller must ensure that dst.op >= src.op,
// to reduce the amount of copying.
private static void mergeCharClass(Regexp dst, Regexp src) {
  switch (dst.op) {
    case ANY_CHAR:
      // Already matches everything; src doesn't add anything.
      break;

    case ANY_CHAR_NOT_NL:
      // The only rune src could contribute is \n.
      if (matchRune(src, '\n')) {
        dst.op = Regexp.Op.ANY_CHAR;
      }
      break;

    case CHAR_CLASS: {
      // src is simpler, so either literal or char class.
      int[] merged = (src.op == Regexp.Op.LITERAL)
          ? new CharClass(dst.runes).
              appendLiteral(src.runes[0], src.flags).
              toArray()
          : new CharClass(dst.runes).appendClass(src.runes).toArray();
      dst.runes = merged;
      break;
    }

    case LITERAL: {
      // Both are literals; identical ones need no merging.
      boolean identical =
          src.runes[0] == dst.runes[0] && src.flags == dst.flags;
      if (!identical) {
        dst.op = Regexp.Op.CHAR_CLASS;
        dst.runes = new CharClass().
            appendLiteral(dst.runes[0], dst.flags).
            appendLiteral(src.runes[0], src.flags).
            toArray();
      }
      break;
    }
  }
}
// If the top of the stack is an element followed by a VERTICAL_BAR,
// swapVerticalBar moves the element below the bar and returns true.
// When the elements above and below the bar can both be represented as a
// character class, they are merged into the lower one instead, which
// likewise leaves the bar on top.
// Otherwise it returns false and the stack is unchanged.
private boolean swapVerticalBar() {
  int n = stack.size();
  if (n < 2) {
    return false;
  }
  Regexp top = stack.get(n - 1);
  Regexp bar = stack.get(n - 2);
  if (bar.op != Regexp.Op.VERTICAL_BAR) {
    return false;
  }

  if (n >= 3) {
    Regexp below = stack.get(n - 3);
    if (isCharClass(top) && isCharClass(below)) {
      // Literal or char class on both sides of the bar: merge the two into
      // a single char class. Keep the more complex one as the target.
      Regexp target = below;
      Regexp other = top;
      if (top.op.ordinal() > below.op.ordinal()) {
        target = top;
        other = below;
        stack.set(n - 3, target);
      }
      mergeCharClass(target, other);
      reuse(other);
      pop();
      return true;
    }
    // |below| is now out of reach. Clean opportunistically.
    cleanAlt(below);
  }

  stack.set(n - 2, top);
  stack.set(n - 1, bar);
  return true;
}
// parseRightParen handles a ')' in the input: it finishes the pending
// concatenation and alternation, then matches the result against the
// LEFT_PAREN marker beneath it on the stack.
//
// Throws PatternSyntaxException if fewer than two entries remain on the
// stack, or if the entry under the group body is not a LEFT_PAREN.
private void parseRightParen() throws PatternSyntaxException {
  concat();
  if (swapVerticalBar()) {
    pop(); // pop vertical bar
  }
  alternate();

  if (stack.size() < 2) {
    throw new PatternSyntaxException(ERR_INTERNAL_ERROR, "stack underflow");
  }
  Regexp body = pop();
  Regexp opener = pop();
  if (opener.op != Regexp.Op.LEFT_PAREN) {
    throw new PatternSyntaxException(ERR_MISSING_PAREN, wholeRegexp);
  }

  // Restore the flags that were in effect when the paren was opened.
  this.flags = opener.flags;

  Regexp result = body; // cap == 0: the parens were just for grouping.
  if (opener.cap != 0) {
    // Capturing group: turn the marker into the CAPTURE node itself.
    opener.op = Regexp.Op.CAPTURE;
    opener.subs = new Regexp[] { body };
    result = opener;
  }
  push(result);
}
// parseEscape parses an escape sequence at the beginning of |t|
// and returns the rune it denotes.
// Pre: t at '\\'. Post: t after the escape.
//
// Forms accepted, as implemented below:
//   - a backslash followed by any character for which Utils.isalnum is
//     false: the character itself;
//   - octal: \0 plus up to two more octal digits, or \1..\7 immediately
//     followed by an octal digit (at most three digits in total);
//   - hexadecimal: \xHH (exactly two hex digits) or \x{H...} (one or
//     more hex digits, value at most Unicode.MAX_RUNE);
//   - C escapes: \a \f \n \r \t \v.
//
// Throws PatternSyntaxException with ERR_TRAILING_BACKSLASH if nothing
// follows the backslash, and with ERR_INVALID_ESCAPE (plus the text
// consumed so far) for every other malformed or unsupported escape,
// including a lone \1..\7, which would be a backreference.
@SuppressWarnings("fallthrough") // disables *all* fallthru checking. Lame.
private static int parseEscape(StringIterator t)
    throws PatternSyntaxException {
  int startPos = t.pos();
  t.skip(1); // '\\'
  if (!t.more()) {
    throw new PatternSyntaxException(ERR_TRAILING_BACKSLASH);
  }
  int c = t.pop();
  bigswitch:
  switch (c) {
    default:
      if (!Utils.isalnum(c)) {
        // Escaped non-word characters are always themselves.
        // PCRE is not quite so rigorous: it accepts things like
        // \q, but we don't. We once rejected \_, but too many
        // programs and people insist on using it, so allow \_.
        return c;
      }
      break;

    // Octal escapes.
    case '1': case '2': case '3': case '4': case '5': case '6': case '7':
      // Single non-zero digit is a backreference; not supported
      if (!t.more() || t.peek() < '0' || t.peek() > '7') {
        break;
      }
      /* fallthrough */
    case '0':
      // Consume up to three octal digits; already have one.
      int r = c - '0';
      for (int i = 1; i < 3; i++) {
        if (!t.more() || t.peek() < '0' || t.peek() > '7') {
          break;
        }
        r = r * 8 + t.peek() - '0';
        t.skip(1); // digit
      }
      return r;

    // Hexadecimal escapes.
    case 'x':
      if (!t.more()) {
        break;
      }
      c = t.pop();
      if (c == '{') {
        // Any number of digits in braces.
        // Perl accepts any text at all; it ignores all text
        // after the first non-hex digit. We require only hex digits,
        // and at least one.
        int nhex = 0;
        r = 0;
        for (;;) {
          if (!t.more()) {
            break bigswitch;
          }
          c = t.pop();
          if (c == '}') {
            break;
          }
          int v = Utils.unhex(c);
          if (v < 0) {
            break bigswitch;
          }
          r = r * 16 + v;
          // Checked on every digit, so r cannot overflow an int.
          if (r > Unicode.MAX_RUNE) {
            break bigswitch;
          }
          nhex++;
        }
        if (nhex == 0) {
          break bigswitch;
        }
        return r;
      }

      // Easy case: two hex digits.
      int x = Utils.unhex(c);
      // The second digit may be missing (e.g. the pattern ends in "\x1");
      // report that as an invalid escape rather than popping past the
      // end of the input.
      if (!t.more()) {
        break;
      }
      c = t.pop();
      int y = Utils.unhex(c);
      if (x < 0 || y < 0) {
        break;
      }
      return x * 16 + y;

    // C escapes. There is no case 'b', to avoid misparsing
    // the Perl word-boundary \b as the C backspace \b
    // when in POSIX mode. In Perl, /\b/ means word-boundary
    // but /[\b]/ means backspace. We don't support that.
    // If you want a backspace, embed a literal backspace
    // character or use \x08.
    case 'a':
      return 7; // No \a in Java
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return 11; // No \v in Java
  }
  // Every path that breaks out of the switch is an invalid escape.
  throw new PatternSyntaxException(ERR_INVALID_ESCAPE, t.from(startPos));
}
// parseClassChar reads one character of a character class and returns it.
// wholeClassPos is the position at which the enclosing class "[..."
// begins; it is used only to build the error text.
// Pre: t at class char; Post: t after it.
// Throws PatternSyntaxException (ERR_MISSING_BRACKET) if the input is
// exhausted, or whatever parseEscape throws for a bad escape.
private static int parseClassChar(StringIterator t, int wholeClassPos)
    throws PatternSyntaxException {
  if (t.more()) {
    // Regular escape sequences are honoured here even though many
    // characters would not need escaping inside a class.
    return t.lookingAt('\\') ? parseEscape(t) : t.pop();
  }
  throw new PatternSyntaxException(
      ERR_MISSING_BRACKET, t.from(wholeClassPos));
}
// parsePerlClassEscape tries to read a Perl character class escape such
// as \d at the start of |t|. When the two characters read name an entry
// in CharGroup.PERL_GROUPS, that group is appended to cc (case-folded if
// FOLD_CASE is set) and true is returned with the iterator past the
// escape. In every other case false is returned and the iterator
// position is undefined.
private boolean parsePerlClassEscape(StringIterator t, CharClass cc) {
  final int escapeStart = t.pos();
  if ((flags & RE2.PERL_X) == 0) {
    return false;
  }
  if (!t.more()) {
    return false;
  }
  if (t.pop() != '\\') { // consume '\\'
    return false;
  }
  if (!t.more()) {
    return false;
  }
  t.pop(); // e.g. advance past 'd' in "\\d"

  CharGroup group = CharGroup.PERL_GROUPS.get(t.from(escapeStart));
  if (group == null) {
    return false;
  }
  boolean foldCase = (flags & RE2.FOLD_CASE) != 0;
  cc.appendGroup(group, foldCase);
  return true;
}
// parseNamedClass parses a leading POSIX named character class like
// [:alnum:] from the beginning of t. If one is present, it appends the
// characters to cc, advances the iterator, and returns true.
// Pre: t at "[:". Post: t after ":]".
// On failure (no class of that name), throws PatternSyntaxException.
// On misparse (no closing ":]"), returns false; t.pos() is undefined.
private boolean parseNamedClass(StringIterator t, CharClass cc)
    throws PatternSyntaxException {
  // (Go precondition check deleted.)
  String cls = t.rest();
  int i = cls.indexOf(":]");
  if (i < 0) {
    return false;
  }
  String name = cls.substring(0, i + 2); // "[:alnum:]"
  t.skipString(name);
  CharGroup g = CharGroup.POSIX_GROUPS.get(name);
  // An unknown name yields null from the lookup (unlike the Go original,
  // where a missing map entry is a zero-valued struct), so test for null
  // before touching g.sign.
  if (g == null || g.sign == 0) {
    throw new PatternSyntaxException(ERR_INVALID_CHAR_RANGE, name);
  }
  cc.appendGroup(g, (flags & RE2.FOLD_CASE) != 0);
  return true;
}
// RangeTables are represented as int[][], a list of triples (start, end,
// stride).
// ANY_TABLE holds the single triple {0, Unicode.MAX_RUNE, 1}; unicodeTable()
// returns it for the name "Any".
private static final int[][] ANY_TABLE = {
{0, Unicode.MAX_RUNE, 1},
};
// unicodeTable() returns the Unicode RangeTable identified by name
// (pair.first) and the table of additional fold-equivalent code points
// (pair.second). The second component is whatever the corresponding
// FOLD_* map holds for |name|.
// "Any" is special-cased to ANY_TABLE for both components; otherwise the
// name is looked up first among the categories and then among the scripts.
// Returns null if |name| does not identify a Unicode character range.
private static Pair<int[][], int[][]> unicodeTable(String name) {
  // Special case: "Any" means any.
  if (name.equals("Any")) {
    return Pair.of(ANY_TABLE, ANY_TABLE);
  }
  int[][] table = UnicodeTables.CATEGORIES.get(name);
  if (table != null) {
    return Pair.of(table, UnicodeTables.FOLD_CATEGORIES.get(name));
  }
  table = UnicodeTables.SCRIPTS.get(name);
  if (table != null) {
    return Pair.of(table, UnicodeTables.FOLD_SCRIPT.get(name));
  }
  return null;
}
// parseUnicodeClass() parses a leading Unicode character class like \p{Han}
// from the beginning of t. If one is present, it appends the characters
// to |cc|, advances |t| and returns true.
//
// Returns false if such a pattern is not present or UNICODE_GROUPS
// flag is not enabled; |t.pos()| is not advanced in this case.
// Indicates error by throwing PatternSyntaxException
// (ERR_INVALID_CHAR_RANGE): nothing follows \p or \P, a '{' has no
// closing '}', or the name is not known to unicodeTable().
private boolean parseUnicodeClass(StringIterator t, CharClass cc)
    throws PatternSyntaxException {
  int startPos = t.pos();
  if ((flags & RE2.UNICODE_GROUPS) == 0 ||
      !t.lookingAt("\\p") && !t.lookingAt("\\P")) {
    return false;
  }
  t.skip(1); // '\\'
  // Committed to parse or throw exception.
  int sign = +1;
  int c = t.pop(); // 'p' or 'P'
  if (c == 'P') {
    sign = -1;
  }
  // A pattern may end right after "\p"; report that as a syntax error
  // instead of popping past the end of the input.
  if (!t.more()) {
    t.rewindTo(startPos);
    throw new PatternSyntaxException(ERR_INVALID_CHAR_RANGE, t.rest());
  }
  c = t.pop();
  String name;
  if (c != '{') {
    // Single-letter name.
    name = Utils.runeToString(c);
  } else {
    // Name is in braces.
    String rest = t.rest();
    int end = rest.indexOf('}');
    if (end < 0) {
      t.rewindTo(startPos);
      throw new PatternSyntaxException(ERR_INVALID_CHAR_RANGE, t.rest());
    }
    name = rest.substring(0, end); // e.g. "Han"
    t.skipString(name);
    t.skip(1); // '}'
    // Don't use skip(end) because it assumes UTF-16 coding, and
    // StringIterator doesn't guarantee that.
  }

  // Group can have leading negation too.
  // \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
  if (!name.isEmpty() && name.charAt(0) == '^') {
    sign = -sign;
    name = name.substring(1);
  }

  Pair<int[][], int[][]> pair = unicodeTable(name);
  if (pair == null) {
    throw new PatternSyntaxException(
        ERR_INVALID_CHAR_RANGE, t.from(startPos));
  }
  int[][] tab = pair.first;
  int[][] fold = pair.second; // fold-equivalent table

  // Variation of CharClass.appendGroup() for tables.
  if ((flags & RE2.FOLD_CASE) == 0 || fold == null) {
    cc.appendTableWithSign(tab, sign);
  } else {
    // Merge and clean tab and fold in a temporary buffer.
    // This is necessary for the negative case and just tidy
    // for the positive case.
    int[] tmp = new CharClass().
        appendTable(tab).
        appendTable(fold).
        cleanClass().
        toArray();
    cc.appendClassWithSign(tmp, sign);
  }
  return true;
}
// parseClass reads a bracketed character class from |t| and pushes the
// resulting CHAR_CLASS node onto the parse stack.
//
// NOTES:
// Pre: t at '['; Post: t after ']'.
// Mutates stack. Advances iterator. May throw PatternSyntaxException
// (missing ']', bad range, or any error raised while parsing an item).
private void parseClass(StringIterator t) throws PatternSyntaxException {
  final int classStart = t.pos();
  t.skip(1); // '['
  Regexp re = newRegexp(Regexp.Op.CHAR_CLASS);
  re.flags = flags;
  CharClass cc = new CharClass();

  boolean negated = false;
  if (t.more() && t.lookingAt('^')) {
    negated = true;
    t.skip(1); // '^'
    // Unless the class is allowed to match \n, put \n in now so that
    // the negation at the end takes it back out.
    if ((flags & RE2.CLASS_NL) == 0) {
      cc.appendRange('\n', '\n');
    }
  }

  // ']' and '-' are okay as the first char in a class, hence |first|.
  for (boolean first = true;
       first || !t.more() || t.peek() != ']';
       first = false) {
    // POSIX: - is only okay unescaped as first or last in class.
    // Perl: - is okay anywhere.
    if (!first &&
        (flags & RE2.PERL_X) == 0 &&
        t.more() && t.lookingAt('-')) {
      String remaining = t.rest();
      if (remaining.equals("-") || !remaining.startsWith("-]")) {
        t.rewindTo(classStart);
        throw new PatternSyntaxException(ERR_INVALID_CHAR_RANGE, t.rest());
      }
    }

    final int itemStart = t.pos();

    // POSIX named class, e.g. [:alnum:].
    if (t.lookingAt("[:")) {
      if (parseNamedClass(t, cc)) {
        continue;
      }
      t.rewindTo(itemStart);
    }

    // Unicode group such as \p{Han}, then Perl class symbols
    // (extension) such as \d.
    if (parseUnicodeClass(t, cc) || parsePerlClassEscape(t, cc)) {
      continue;
    }
    t.rewindTo(itemStart);

    // Single character or simple range.
    final int lo = parseClassChar(t, classStart);
    int hi = lo;
    if (t.more() && t.lookingAt('-')) {
      t.skip(1); // '-'
      if (!t.more() || !t.lookingAt(']')) {
        hi = parseClassChar(t, classStart);
        if (hi < lo) {
          throw new PatternSyntaxException(
              ERR_INVALID_CHAR_RANGE, t.from(itemStart));
        }
      } else {
        // [a-] means (a|-): step back so the '-' is read as an
        // ordinary member on the next pass.
        t.skip(-1);
      }
    }

    if ((flags & RE2.FOLD_CASE) != 0) {
      cc.appendFoldedRange(lo, hi);
    } else {
      cc.appendRange(lo, hi);
    }
  }
  t.skip(1); // ']'

  cc.cleanClass();
  if (negated) {
    cc.negateClass();
  }
  re.runes = cc.toArray();
  push(re);
}
//// Utilities
// Returns a freshly allocated array holding array[start], ...,
// array[end - 1], in that order.
static Regexp[] subarray(Regexp[] array, int start, int end) {
  final int length = end - start;
  Regexp[] copy = new Regexp[length];
  for (int offset = 0; offset < length; offset++) {
    copy[offset] = array[start + offset];
  }
  return copy;
}
private static class Pair {
final F first;
final S second;
Pair(F first, S second) {
this.first = first;
this.second = second;
}
static Pair of(F first, S second) {
return new Pair(first, second);
}
}
// concatRunes returns a new array consisting of the elements of x
// followed by the elements of y. Neither argument is modified.
private static int[] concatRunes(int[] x, int[] y) {
  final int xLen = x.length;
  final int yLen = y.length;
  final int[] joined = new int[xLen + yLen];
  System.arraycopy(y, 0, joined, xLen, yLen);
  System.arraycopy(x, 0, joined, 0, xLen);
  return joined;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy