com.basistech.tclre.Lex Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tcl-regex Show documentation
Show all versions of tcl-regex Show documentation
Java port of the regex engine from Tcl
The newest version!
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.tclre;
/**
* from regc_lex.c
* Note that this continues the pattern of all the state living
* in the 'Compiler' object.
*/
class Lex {
/* lexical contexts */
private static final int L_ERE = 1; /* mainline ERE/ARE */
private static final int L_BRE = 2; /* mainline BRE */
private static final int L_Q = 3; /* Flags.REG_QUOTE */
private static final int L_EBND = 4; /* ERE/ARE bound */
private static final int L_BBND = 5; /* BRE bound */
private static final int L_BRACK = 6; /* brackets */
private static final int L_CEL = 7; /* collating element */
private static final int L_ECL = 8; /* equivalence class */
private static final int L_CCL = 9; /* character class */
/*
* string constants to interpolate as expansions of things like \d
*/
//CHECKSTYLE:OFF
private static final char backd[] = { /* \d */
'[', '[', ':',
'd', 'i', 'g', 'i', 't',
':', ']', ']'
};
private static final char backD[] = { /* \D */
'[', '^', '[', ':',
'd', 'i', 'g', 'i', 't',
':', ']', ']'
};
private static final char brbackd[] = { /* \d within brackets */
'[', ':',
'd', 'i', 'g', 'i', 't',
':', ']'
};
private static final char backs[] = { /* \s */
'[', '[', ':',
's', 'p', 'a', 'c', 'e',
':', ']', ']'
};
private static final char backS[] = { /* \S */
'[', '^', '[', ':',
's', 'p', 'a', 'c', 'e',
':', ']', ']'
};
private static final char brbacks[] = { /* \s within brackets */
'[', ':',
's', 'p', 'a', 'c', 'e',
':', ']'
};
private static final char backw[] = { /* \w */
'[', '[', ':',
'a', 'l', 'n', 'u', 'm',
':', ']', '_', ']'
};
private static final char backW[] = { /* \W */
'[', '^', '[', ':',
'a', 'l', 'n', 'u', 'm',
':', ']', '_', ']'
};
private static final char brbackw[] = { /* \w within brackets */
'[', ':',
'a', 'l', 'n', 'u', 'm',
':', ']', '_'
};
//CHECKSTYLE:ON
private Compiler v;
Lex(Compiler v) {
this.v = v;
}
boolean see(int t) {
return v.nexttype == t;
}
private char charAt(int index) {
return v.pattern[index];
}
private char charAtNow() {
return charAt(v.now);
}
private char charAtNowAdvance() {
return charAt(v.now++);
}
private char charAtNowPlus(int offset) {
return charAt(v.now + offset);
}
/* scanning macros (know about v) */
boolean ateos() {
return v.now >= v.stop;
}
boolean have(int n) {
return v.stop - v.now >= n;
}
boolean next1(char c) {
return !ateos() && charAtNow() == c;
}
boolean next2(char a, char b) {
return have(2) && charAtNow() == a && charAtNowPlus(1) == b;
}
boolean next3(char a, char b, char c) {
return have(3)
&& charAtNow() == a
&& charAtNowPlus(1) == b
&& charAtNowPlus(2) == c;
}
void set(int c) {
v.nexttype = c;
}
void set(int c, int n) {
v.nexttype = c;
v.nextvalue = n;
}
boolean ret(int c) {
set(c);
return true;
}
boolean retv(int c, int n) {
set(c, n);
return true;
}
boolean lasttype(int t) {
return v.lasttype == t;
}
void intocon(int c) {
v.lexcon = c;
}
boolean incon(int c) {
return v.lexcon == c;
}
/**
* lexstart - set up lexical stuff, scan leading options
*/
void lexstart() throws RegexException {
prefixes(); /* may turn on new type bits etc. */
if (0 != (v.cflags & Flags.REG_QUOTE)) {
assert 0 == (v.cflags & (Flags.REG_ADVANCED | Flags.REG_EXPANDED | Flags.REG_NEWLINE));
intocon(L_Q);
} else if (0 != (v.cflags & Flags.REG_EXTENDED)) {
assert 0 == (v.cflags & Flags.REG_QUOTE);
intocon(L_ERE);
} else {
assert 0 == (v.cflags & (Flags.REG_QUOTE | Flags.REG_ADVF));
intocon(L_BRE);
}
v.nexttype = Compiler.EMPTY; /* remember we were at the start */
next(); /* set up the first token */
}
boolean iscalpha(char c) {
return c < 0x80 && Character.isLetter(c);
}
boolean iscdigit(char x) {
return x < 0x80 && Character.isDigit(x);
}
boolean iscalnum(char x) {
return x < 0x80 && Character.isLetterOrDigit(x);
}
boolean iscspace(char x) {
return x < 0x80 && Character.isSpaceChar(x);
}
/**
* prefixes - implement various special prefixes
*/
void prefixes() throws RegexException {
/* literal string doesn't get any of this stuff */
if (0 != (v.cflags & Flags.REG_QUOTE)) {
return;
}
/* initial "***" gets special things */
if (have(4) && next3('*', '*', '*')) {
switch (charAtNowPlus(3)) {
case '?': /* "***?" error, msg shows version */
throw new RegexException("REG_BADPAT");
case '=': /* "***=" shifts to literal string */
v.note(Flags.REG_UNONPOSIX);
v.cflags |= Flags.REG_QUOTE;
v.cflags &= ~(Flags.REG_ADVANCED | Flags.REG_EXPANDED | Flags.REG_NEWLINE);
v.now += 4;
return; /* and there can be no more prefixes */
case ':': /* "***:" shifts to AREs */
v.note(Flags.REG_UNONPOSIX);
v.cflags |= Flags.REG_ADVANCED;
v.now += 4;
break;
default: /* otherwise *** is just an error */
throw new RegexException("REG_BADRPT");
}
}
/* BREs and EREs don't get embedded options */
if ((v.cflags & Flags.REG_ADVANCED) != Flags.REG_ADVANCED) {
return;
}
/* embedded options (AREs only) */
if (have(3) && next2('(', '?') && iscalpha(charAtNowPlus(2))) {
v.note(Flags.REG_UNONPOSIX);
v.now += 2;
for (; !ateos() && iscalpha(charAtNow()); v.now++) {
switch (charAtNow()) {
case 'b': /* BREs (but why???) */
v.cflags &= ~(Flags.REG_ADVANCED | Flags.REG_QUOTE);
break;
case 'c': /* case sensitive */
v.cflags &= ~Flags.REG_ICASE;
break;
case 'e': /* plain EREs */
v.cflags |= Flags.REG_EXTENDED;
v.cflags &= ~(Flags.REG_ADVF | Flags.REG_QUOTE);
break;
case 'i': /* case insensitive */
v.cflags |= Flags.REG_ICASE;
break;
case 'm': /* Perloid synonym for n */
case 'n': /* \n affects ^ $ . [^ */
v.cflags |= Flags.REG_NEWLINE;
break;
case 'p': /* ~Perl, \n affects . [^ */
v.cflags |= Flags.REG_NLSTOP;
v.cflags &= ~Flags.REG_NLANCH;
break;
case 'q': /* literal string */
v.cflags |= Flags.REG_QUOTE;
v.cflags &= ~Flags.REG_ADVANCED;
break;
case 's': /* single line, \n ordinary */
v.cflags &= ~Flags.REG_NEWLINE;
break;
case 't': /* tight syntax */
v.cflags &= ~Flags.REG_EXPANDED;
break;
case 'w': /* weird, \n affects ^ $ only */
v.cflags &= ~Flags.REG_NLSTOP;
v.cflags |= Flags.REG_NLANCH;
break;
case 'x': /* expanded syntax */
v.cflags |= Flags.REG_EXPANDED;
break;
default:
throw new RegexException("REG_BADOPT");
}
}
if (!next1(')')) {
throw new RegexException("REG_BADOPT");
}
v.now++;
if (0 != (v.cflags & Flags.REG_QUOTE)) {
v.cflags &= ~(Flags.REG_EXPANDED | Flags.REG_NEWLINE);
}
}
}
/**
* lexnest - "call a subroutine", interpolating string at the lexical level
* Note, this is not a very general facility. There are a number of
* implicit assumptions about what sorts of strings can be subroutines.
*/
void lexnest(char[] interpolated) {
assert v.savepattern == null; /* only one level of nesting */
v.savepattern = v.pattern;
v.savenow = v.now;
v.savestop = v.stop;
v.savenow = v.now;
v.pattern = interpolated;
v.now = 0;
v.stop = v.pattern.length;
}
/**
* lexword - interpolate a bracket expression for word characters
* Possibly ought to inquire whether there is a "word" character class.
*/
void lexword() {
lexnest(backw);
}
int digitval(char c) {
return c - '0';
}
void note(long n) {
v.note(n);
}
//CHECKSTYLE:OFF
/**
* next - get next token
*/
boolean next() throws RegexException {
char c;
/* remember flavor of last token */
v.lasttype = v.nexttype;
/* REG_BOSONLY */
if (v.nexttype == Compiler.EMPTY && (0 != (v.cflags & Flags.REG_BOSONLY))) {
/* at start of a REG_BOSONLY RE */
return retv(Compiler.SBEGIN, (char)0); /* same as \A */
}
/* if we're nested and we've hit end, return to outer level */
if (v.savepattern != null && ateos()) {
v.now = v.savenow;
v.stop = v.savestop;
v.savenow = -1;
v.savestop = -1;
v.pattern = v.savepattern;
v.savepattern = null; // mark that it's not saved.
}
/* skip white space etc. if appropriate (not in literal or []) */
if (0 != (v.cflags & Flags.REG_EXPANDED)) {
switch (v.lexcon) {
case L_ERE:
case L_BRE:
case L_EBND:
case L_BBND:
skip();
break;
}
}
/* handle EOS, depending on context */
if (ateos()) {
switch (v.lexcon) {
case L_ERE:
case L_BRE:
case L_Q:
return ret(Compiler.EOS);
case L_EBND:
case L_BBND:
throw new RegexException("Unbalanced braces.");
case L_BRACK:
case L_CEL:
case L_ECL:
case L_CCL:
throw new RegexException("Unbalanced brackets.");
}
assert false;
}
/* okay, time to actually get a character */
c = charAtNowAdvance();
/* deal with the easy contexts, punt EREs to code below */
switch (v.lexcon) {
case L_BRE: /* punt BREs to separate function */
return brenext(c);
case L_ERE: /* see below */
break;
case L_Q: /* literal strings are easy */
return retv(Compiler.PLAIN, c);
case L_BBND: /* bounds are fairly simple */
case L_EBND:
switch (c) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return retv(Compiler.DIGIT, digitval(c));
case ',':
return ret(',');
case '}': /* ERE bound ends with } */
if (incon(L_EBND)) {
intocon(L_ERE);
if (0 != (v.cflags & Flags.REG_ADVF) && next1('?')) {
v.now++;
note(Flags.REG_UNONPOSIX);
return retv('}', 0);
}
return retv('}', 1);
} else {
throw new RegexException("Errors.REG_BADBR");
}
case '\\': /* BRE bound ends with \} */
if (incon(L_BBND) && next1('}')) {
v.now++;
intocon(L_BRE);
return ret('}');
} else {
throw new RegexException("Errors.REG_BADBR");
}
default:
throw new RegexException("Errors.REG_BADBR");
}
case L_BRACK: /* brackets are not too hard */
switch (c) {
case ']':
if (lasttype('[')) {
return retv(Compiler.PLAIN, c);
} else {
intocon(0 != (v.cflags & Flags.REG_EXTENDED) ? L_ERE : L_BRE);
return ret(']');
}
case '\\':
note(Flags.REG_UBBS);
if (0 == (v.cflags & Flags.REG_ADVF)) {
return retv(Compiler.PLAIN, c);
}
note(Flags.REG_UNONPOSIX);
if (ateos()) {
throw new RegexException("REG_EESCAPE");
}
lexescape();
switch (v.nexttype) { /* not all escapes okay here */
case Compiler.PLAIN:
return true;
case Compiler.CCLASS:
switch (v.nextvalue) {
case 'd':
lexnest(brbackd);
break;
case 's':
lexnest(brbacks);
break;
case 'w':
lexnest(brbackw);
break;
default:
throw new RegexException("Errors.REG_EESCAPE");
}
/* lexnest done, back up and try again */
v.nexttype = v.lasttype;
return next();
}
/* not one of the acceptable escapes */
throw new RegexException("Errors.REG_EESCAPE");
case '-':
if (lasttype('[') || next1(']')) {
return retv(Compiler.PLAIN, c);
} else {
return retv(Compiler.RANGE, c);
}
case '[':
if (ateos()) {
throw new RegexException("Errors.REG_EBRACK");
}
switch (charAtNowAdvance()) {
case '.':
intocon(L_CEL);
/* might or might not be locale-specific */
return ret(Compiler.COLLEL);
case '=':
intocon(L_ECL);
note(Flags.REG_ULOCALE);
return ret(Compiler.ECLASS);
case ':':
intocon(L_CCL);
note(Flags.REG_ULOCALE);
return ret(Compiler.CCLASS);
default: /* oops */
v.now--;
return retv(Compiler.PLAIN, c);
}
default:
return retv(Compiler.PLAIN, c);
}
case L_CEL: /* collating elements are easy */
if (c == '.' && next1(']')) {
v.now++;
intocon(L_BRACK);
return retv(Compiler.END, '.');
} else {
return retv(Compiler.PLAIN, c);
}
case L_ECL: /* ditto equivalence classes */
if (c == '=' && next1(']')) {
v.now++;
intocon(L_BRACK);
return retv(Compiler.END, '=');
} else {
return retv(Compiler.PLAIN, c);
}
case L_CCL: /* ditto character classes */
if (c == ':' && next1(']')) {
v.now++;
intocon(L_BRACK);
return retv(Compiler.END, ':');
} else {
return retv(Compiler.PLAIN, c);
}
default:
assert false;
break;
}
/* that got rid of everything except EREs and AREs */
assert incon(L_ERE);
/* deal with EREs and AREs, except for backslashes */
switch (c) {
case '|':
return ret('|');
case '*':
if (0 != (v.cflags & Flags.REG_ADVF) && next1('?')) {
v.now++;
note(Flags.REG_UNONPOSIX);
return retv('*', 0);
}
return retv('*', 1);
case '+':
if (0 != (v.cflags & Flags.REG_ADVF) && next1('?')) {
v.now++;
note(Flags.REG_UNONPOSIX);
return retv('+', 0);
}
return retv('+', 1);
case '?':
if (0 != (v.cflags & Flags.REG_ADVF) && next1('?')) {
v.now++;
note(Flags.REG_UNONPOSIX);
return retv('?', 0);
}
return retv('?', 1);
case '{': /* bounds start or plain character */
if (0 != (v.cflags & Flags.REG_EXPANDED)) {
skip();
}
if (ateos() || !iscdigit(charAtNow())) {
note(Flags.REG_UBRACES);
note(Flags.REG_UUNSPEC);
return retv(Compiler.PLAIN, c);
} else {
note(Flags.REG_UBOUNDS);
intocon(L_EBND);
return ret('{');
}
case '(': /* parenthesis, or advanced extension */
if (0 != (v.cflags & Flags.REG_ADVF) && next1('?')) {
note(Flags.REG_UNONPOSIX);
v.now++;
char flagChar = charAtNowAdvance();
switch (flagChar) {
case ':': /* non-capturing paren */
return retv('(', 0);
case '#': /* comment */
while (!ateos() && charAtNow() != ')') {
v.now++;
}
if (!ateos()) {
v.now++;
}
assert v.nexttype == v.lasttype;
return next();
case '=': /* positive lookahead */
note(Flags.REG_ULOOKAHEAD);
return retv(Compiler.LACON, 1);
case '!': /* negative lookahead */
note(Flags.REG_ULOOKAHEAD);
return retv(Compiler.LACON, 0);
default:
throw new RegexException(String.format("Invalid flag after '(?': %c", flagChar));
}
}
if (0 != (v.cflags & Flags.REG_NOSUB) || 0 != (v.cflags & Flags.REG_NOCAPT)) {
return retv('(', 0); /* all parens non-capturing */
} else {
return retv('(', 1);
}
case ')':
if (lasttype('(')) {
note(Flags.REG_UUNSPEC);
}
return retv(')', c);
case '[': /* easy except for [[:<:]] and [[:>:]] */
if (have(6) && charAtNow() == '['
&& charAtNowPlus(1) == ':'
&& (charAtNowPlus(2) == '<' || charAtNowPlus(2) == '>')
&& charAtNowPlus(3) == ':'
&& charAtNowPlus(4) == ']'
&& charAtNowPlus(5) == ']') {
c = charAtNowPlus(2);
v.now += 6;
note(Flags.REG_UNONPOSIX);
return ret((c == '<') ? '<' : '>');
}
intocon(L_BRACK);
if (next1('^')) {
v.now++;
return retv('[', 0);
}
return retv('[', 1);
case '.':
return ret('.');
case '^':
return ret('^');
case '$':
return ret('$');
case '\\': /* mostly punt backslashes to code below */
if (ateos()) {
throw new RegexException("REG_EESCAPE");
}
break;
default: /* ordinary character */
return retv(Compiler.PLAIN, c);
}
/* ERE/ARE backslash handling; backslash already eaten */
assert !ateos();
if (0 == (v.cflags & Flags.REG_ADVF)) { /* only AREs have non-trivial escapes */
if (iscalnum(charAtNow())) {
note(Flags.REG_UBSALNUM);
note(Flags.REG_UUNSPEC);
}
return retv(Compiler.PLAIN, charAtNowAdvance());
}
lexescape();
if (v.nexttype == Compiler.CCLASS) { /* fudge at lexical level */
switch (v.nextvalue) {
case 'd':
lexnest(backd);
break;
case 'D':
lexnest(backD);
break;
case 's':
lexnest(backs);
break;
case 'S':
lexnest(backS);
break;
case 'w':
lexnest(backw);
break;
case 'W':
lexnest(backW);
break;
default:
throw new RuntimeException("Invalid escape " + Character.toString((char)v.nextvalue));
}
/* lexnest done, back up and try again */
v.nexttype = v.lasttype;
return next();
}
/* otherwise, lexescape has already done the work */
return true;
}
//CHECKSTYLE:ON
/**
* brenext - get next BRE token
* This is much like EREs except for all the stupid backslashes and the
*/
boolean brenext(char pc) throws RegexException {
char c = pc;
switch (c) {
case '*':
if (lasttype(Compiler.EMPTY) || lasttype('(') || lasttype('^')) {
return retv(Compiler.PLAIN, c);
}
return ret('*');
case '[':
//CHECKSTYLE:OFF
if (have(6) && charAtNow() == '['
&& charAtNowPlus(1) == ':'
&& (charAtNowPlus(2) == '<' || charAtNowPlus(2) == '>')
&& charAtNowPlus(3) == ':'
&& charAtNowPlus(4) == ']'
&& charAtNowPlus(5) == ']') {
c = charAtNowPlus(2);
v.now += 6;
note(Flags.REG_UNONPOSIX);
return ret((c == '<') ? '<' : '>');
//CHECKSTYLE:ON
}
intocon(L_BRACK);
if (next1('^')) {
v.now++;
return retv('[', 0);
}
return retv('[', 1);
case '.':
return ret('.');
case '^':
if (lasttype(Compiler.EMPTY)) {
return ret('^');
}
if (lasttype('(')) {
note(Flags.REG_UUNSPEC);
return ret('^');
}
return retv(Compiler.PLAIN, c);
case '$':
if (0 != (v.cflags & Flags.REG_EXPANDED)) {
skip();
}
if (ateos()) {
return ret('$');
}
if (next2('\\', ')')) {
note(Flags.REG_UUNSPEC);
return ret('$');
}
return retv(Compiler.PLAIN, c);
case '\\':
break; /* see below */
default:
return retv(Compiler.PLAIN, c);
}
assert c == '\\';
if (ateos()) {
throw new RegexException("REG_EESCAPE");
}
c = charAtNowAdvance();
switch (c) {
case '{':
intocon(L_BBND);
note(Flags.REG_UBOUNDS);
return ret('{');
case '(':
return retv('(', 1);
case ')':
return retv(')', c);
case '<':
note(Flags.REG_UNONPOSIX);
return ret('<');
case '>':
note(Flags.REG_UNONPOSIX);
return ret('>');
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
note(Flags.REG_UBACKREF);
return retv(Compiler.BACKREF, digitval(c));
default:
if (iscalnum(c)) {
note(Flags.REG_UBSALNUM);
note(Flags.REG_UUNSPEC);
}
return retv(Compiler.PLAIN, c);
}
}
void skip() {
int start = v.now;
assert 0 != (v.cflags & Flags.REG_EXPANDED);
for (;;) {
while (!ateos() && iscspace(charAtNow())) {
v.now++;
}
if (ateos() || charAtNow() != '#') {
break; /* NOTE BREAK OUT */
}
assert next1('#');
while (!ateos() && charAtNow() != '\n') {
v.now++;
}
/* leave the newline to be picked up by the iscspace loop */
}
if (v.now != start) {
note(Flags.REG_UNONPOSIX);
}
}
/**
* lexescape - parse an ARE backslash escape (backslash already eaten)
* Note slightly nonstandard use of the CCLASS type code.
*/
//CHECKSTYLE:OFF
boolean lexescape() throws RegexException {
int c;
int save;
assert 0 != (v.cflags & Flags.REG_ADVF);
assert !ateos();
c = charAtNowAdvance();
if (!iscalnum((char)c)) {
return retv(Compiler.PLAIN, c);
}
note(Flags.REG_UNONPOSIX);
switch (c) {
case 'a':
return retv(Compiler.PLAIN, '\007');
case 'A':
return retv(Compiler.SBEGIN, 0);
case 'b':
return retv(Compiler.PLAIN, '\b');
case 'B':
return retv(Compiler.PLAIN, '\\');
case 'c':
if (ateos()) {
throw new RegexException("Incomplete \\c escape.");
}
return retv(Compiler.PLAIN, (char)(charAtNowAdvance() & 037));
case 'd':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 'd');
case 'D':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 'D');
case 'e':
return retv(Compiler.PLAIN, '\033');
case 'f':
return retv(Compiler.PLAIN, '\f');
case 'm':
return ret('<');
case 'M':
return ret('>');
case 'n':
return retv(Compiler.PLAIN, '\n');
case 'r':
return retv(Compiler.PLAIN, '\r');
case 's':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 's');
case 'S':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 'S');
case 't':
return retv(Compiler.PLAIN, '\t');
case 'u':
c = lexdigits(16, 4, 4);
return retv(Compiler.PLAIN, c);
case 'U':
c = lexdigits(16, 8, 8);
// This escape is for UTF-32 characters. There are, ahem, certain requirements.
if (c > Character.MAX_CODE_POINT) {
throw new RegexException("Invalid UTF-32 escape.");
}
return retv(Compiler.PLAIN, c);
case 'v':
return retv(Compiler.PLAIN, '\u000b');
case 'w':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 'w');
case 'W':
note(Flags.REG_ULOCALE);
return retv(Compiler.CCLASS, 'W');
case 'x':
c = lexdigits(16, 1, 255); /* REs >255 long outside spec */
return retv(Compiler.PLAIN, c);
case 'y':
note(Flags.REG_ULOCALE);
return retv(Compiler.WBDRY, 0);
case 'Y':
note(Flags.REG_ULOCALE);
return retv(Compiler.NWBDRY, 0);
case 'Z':
return retv(Compiler.SEND, 0);
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
save = v.now;
v.now--; /* put first digit back */
c = lexdigits(10, 1, 255); /* REs >255 long outside spec */
/* ugly heuristic (first test is "exactly 1 digit?") */
if (v.now - save == 0 || c <= v.getSubs().size()) {
note(Flags.REG_UBACKREF);
return retv(Compiler.BACKREF, (char)c);
}
/* oops, doesn't look like it's a backref after all... */
v.now = save;
/* and fall through into octal number */
case '0':
v.now--; /* put first digit back */
c = lexdigits(8, 1, 3);
return retv(Compiler.PLAIN, c);
default:
throw new RegexException("Invalid escape"); // unknown escape.
}
}
//CHECKSTYLE:ON
/*
- lexdigits - slurp up digits and return codepoint value
*/
/* chr value; errors signalled via ERR */
private int lexdigits(int base, int minlen, int maxlen) throws RegexException {
int n; /* unsigned to avoid overflow misbehavior */
int len;
int c;
int d;
final char ub = (char)base;
n = 0;
for (len = 0; len < maxlen && !ateos(); len++) {
c = charAtNowAdvance();
switch (c) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
d = digitval((char)c);
break;
case 'a':
case 'A':
d = 10;
break;
case 'b':
case 'B':
d = 11;
break;
case 'c':
case 'C':
d = 12;
break;
case 'd':
case 'D':
d = 13;
break;
case 'e':
case 'E':
d = 14;
break;
case 'f':
case 'F':
d = 15;
break;
default:
v.now--; /* oops, not a digit at all */
d = -1;
break;
}
if (d >= base) { /* not a plausible digit */
v.now--;
d = -1;
}
if (d < 0) {
break; /* NOTE BREAK OUT */
}
n = n * ub + d;
}
if (len < minlen) {
throw new RegexException("Not enough digits.");
}
return n;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy