com.basistech.tclre.Compiler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tcl-regex Show documentation
Show all versions of tcl-regex Show documentation
Java port of the regex engine from Tcl
The newest version!
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.tclre;
import java.io.Serializable;
import java.util.EnumSet;
import java.util.List;
import com.google.common.collect.Lists;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* from regcomp.c
*/
//CHECKSTYLE:OFF
class Compiler {
//CHECKSTYLE:ON
/* token type codes, some also used as NFA arc types */
static final int EMPTY = 'n'; /* no token present */
static final int EOS = 'e'; /* end of string */
static final int PLAIN = 'p'; /* ordinary character */
static final int DIGIT = 'd'; /* digit (in bound) */
static final int BACKREF = 'b'; /* back reference */
static final int COLLEL = 'I'; /* start of [. */
static final int ECLASS = 'E'; /* start of [= */
static final int CCLASS = 'C'; /* start of [: */
static final int END = 'X'; /* end of [. [= [: */
static final int RANGE = 'R'; /* - within [] which might be range delim. */
static final int LACON = 'L'; /* lookahead constraint subRE */
static final int AHEAD = 'a'; /* color-lookahead arc */
static final int BEHIND = 'r'; /* color-lookbehind arc */
static final int WBDRY = 'w'; /* word boundary constraint */
static final int NWBDRY = 'W'; /* non-word-boundary constraint */
static final int SBEGIN = 'A'; /* beginning of string (even if not BOL) */
static final int SEND = 'Z'; /* end of string (even if not EOL) */
static final int PREFER = 'P'; /* length preference */
/* largest repetition count accepted in a bound; scannum() rejects anything larger */
static final int DUPMAX = 255;
/* sentinel used as the upper bound of an unbounded quantifier ('*', '+', '{m,}') */
static final int INFINITY = 256;
/* reduced repetition classes produced by reduce() and switched on in repeat(): */
/* SOME stands for any count greater than 1, INF stands for INFINITY */
static final int SOME = 2;
static final int INF = 3;
private static final Logger LOG = LoggerFactory.getLogger(Compiler.class);
/* verbose NFA/tree dumps are only produced when the tclre.debug system property is set (to any value) */
private static final boolean IS_DEBUG = System.getProperty("tclre.debug") != null;
int now; /* scan pointer into string */
int stop; /* end of string */
char[] savepattern;
int savenow = -1; /* saved now and stop for "subroutine call" */
int savestop = -1;
int cflags; /* copy of compile flags */
int lasttype; /* type of previous token */
int nexttype; /* type of next token */
int nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see lex.c) */
char[] pattern;
private ColorMap cm; /* character color map */
private short nlcolor; /* color of newline */
private State wordchrs; /* state in nfa holding word-char outarcs */
/* compile() builds NFAs for entries starting at index 1; index 0 is not processed there */
private List lacons; /* lookahead-constraint vector */
private Lex lex;
// this bit of cleanup to allow some mocking in testing the color map.
private Nfa nfa; /* the NFA */
private long info;
/* the flag set exactly as passed by the caller; handed to the resulting pattern unchanged */
private final EnumSet originalFlags;
/* slot (subno - 1) holds the subRE of capture group subno */
private final List subs; /* subRE pointer vector */
/**
 * Constructor does minimal setup; construct, then call compile().
 * It validates the flag combination, captures the pattern text and translates
 * the {@link PatternFlags} set into the bit flags used by the rest of the compiler.
 *
 * @param pattern the regular expression source text
 * @param flags the user-visible flags; QUOTE may not be combined with
 *              ADVANCED, EXPANDED, NLANCH or NLSTOP
 * @throws IllegalArgumentException if QUOTE is combined with one of those flags
 * @throws RuntimeException if a flag has no bit-flag mapping
 */
private Compiler(String pattern, EnumSet flags) {
    if (flags.contains(PatternFlags.QUOTE)
        && (flags.contains(PatternFlags.ADVANCED)
        || flags.contains(PatternFlags.EXPANDED)
        || flags.contains(PatternFlags.NLANCH)
        || flags.contains(PatternFlags.NLSTOP))) {
        throw new IllegalArgumentException("Invalid flag combination");
    }
    this.pattern = pattern.toCharArray();
    this.originalFlags = flags;
    // Map from EnumSet, which is how we want users to see this some time, to bitflags.
    // At some point we might push the enum sets all the way down.
    for (PatternFlags f : flags) {
        switch (f) {
        case BASIC:
            this.cflags |= Flags.REG_BASIC;
            break;
        case EXTENDED:
            // ERE syntax selects REG_EXTENDED. It must not be confused with
            // REG_EXPANDED (the whitespace/comment-tolerant syntax handled below);
            // parseqatom() compares against REG_EXTENDED to decide whether an
            // unbalanced ')' is legal.
            this.cflags |= Flags.REG_EXTENDED;
            break;
        case ADVANCED:
            // ARE syntax is the extended syntax plus the advanced-features bit.
            this.cflags |= Flags.REG_ADVF;
            this.cflags |= Flags.REG_EXTENDED;
            break;
        case QUOTE:
            this.cflags |= Flags.REG_QUOTE;
            break;
        case ICASE:
            this.cflags |= Flags.REG_ICASE;
            break;
        case NOSUB:
            this.cflags |= Flags.REG_NOSUB;
            break;
        case EXPANDED:
            this.cflags |= Flags.REG_EXPANDED;
            break;
        case NLSTOP:
            this.cflags |= Flags.REG_NLSTOP;
            break;
        case NLANCH:
            this.cflags |= Flags.REG_NLANCH;
            break;
        default:
            throw new RuntimeException("Can't handle " + f);
        }
    }
    subs = Lists.newArrayListWithCapacity(10);
    lacons = Lists.newArrayList();
    // the lexer is 'over there' but shared state here, for now at least.
    lex = new Lex(this);
}
/**
 * The official API into this class: builds a throw-away compiler for the
 * given pattern and flags and runs it.
 *
 * @param pattern the pattern
 * @param flags the flags
 * @return the compiled regexp
 * @throws RegexException if the pattern cannot be compiled
 */
static RePattern compile(String pattern, EnumSet flags) throws RegexException {
    return new Compiler(pattern, flags).compile();
}
/**
 * Runs the whole compilation: lexes and parses the pattern into an NFA plus
 * a subRE tree, numbers and marks the tree, builds compacted NFAs for every
 * tree node and lookahead constraint, derives the search NFA, and packages
 * the result.
 *
 * @return the compiled pattern
 * @throws RegexException if the pattern is syntactically invalid
 */
private RePattern compile() throws RegexException {
    // Verbose dumps require both the logger level and the tclre.debug property.
    final boolean debugging = LOG.isDebugEnabled() && IS_DEBUG;

    stop = pattern.length;
    nlcolor = Constants.COLORLESS;
    info = 0;
    cm = new ColorMap(this);
    nfa = new Nfa(cm);
    // No MCESS support, so no initialization of it.

    /* Parsing */
    lex.lexstart();
    if (0 != (cflags & Flags.REG_NLSTOP) || 0 != (cflags & Flags.REG_NLANCH)) {
        /* assign newline a unique color */
        nlcolor = cm.subcolor(newline());
        cm.okcolors(nfa);
    }
    Subre tree = parse(EOS, PLAIN, nfa.init, nfa.finalState);
    assert see(EOS); /* even if error; ISERR() => see(EOS) */
    assert tree != null;

    /* finish setup of nfa and its subre tree */
    nfa.specialcolors();
    if (debugging) {
        LOG.debug("========= RAW ==========");
        nfa.dumpnfa();
        LOG.debug(tree.dumpst(true));
    }
    optst(tree);
    int ntree = numst(tree, 1);
    markst(tree);
    if (debugging) {
        LOG.debug("========= TREE FIXED ==========");
        LOG.debug(tree.dumpst(true));
    }

    /* build compacted NFAs for tree and lacons */
    info |= nfatree(tree);
    // lacons start at 1.
    for (int i = 1; i < lacons.size(); i++) {
        // Gate this like every other debug dump so the format call is skipped
        // unless debugging was explicitly requested.
        if (debugging) {
            LOG.debug(String.format("========= LA%d ==========", i));
        }
        nfanode(lacons.get(i));
    }
    if (0 != (tree.flags & Subre.SHORTER)) {
        note(Flags.REG_USHORTEST);
    }

    /* build compacted NFAs for tree, lacons, fast search */
    if (debugging) {
        LOG.debug("========= SEARCH ==========");
    }
    /* can sacrifice main NFA now, so use it as work area */
    nfa.optimize();
    makesearch(nfa);
    Cnfa search = nfa.compact();

    /* looks okay, package it up */
    int nsub = subs.size();
    SubstringComparator compare = new Comparer(0 != (cflags & Flags.REG_ICASE));
    Guts guts = new Guts(cflags, info, nsub, new RuntimeSubexpression(tree),
        search, ntree, cm, compare, lacons);
    return new HsrePattern(new String(pattern, 0, pattern.length), originalFlags, info, nsub, guts);
}
/**
 * Packs two reduced repetition classes (each in 0..3) into a single switch key.
 *
 * @param a the class of the lower bound
 * @param b the class of the upper bound
 * @return {@code 4 * a + b}
 */
private static int pair(int a, int b) {
    final int radix = 4;
    return radix * a + b;
}
/**
 * Collapses a repetition count into one of the classes used by repeat().
 *
 * @param x a repetition count, or INFINITY
 * @return INF for INFINITY, SOME for any other count above 1, otherwise x itself
 */
private static int reduce(int x) {
    if (x == INFINITY) {
        return INF;
    }
    return (x > 1) ? SOME : x;
}
/**
 * @return the character that compile() gives its own color when
 *         REG_NLSTOP or REG_NLANCH is in effect
 */
private char newline() {
    final char lineFeed = '\n';
    return lineFeed;
}
/**
 * Tests the lookahead token without consuming it.
 *
 * @param t a token type code
 * @return true if the next token has type t
 */
boolean see(int t) {
    final boolean matches = (t == nexttype);
    return matches;
}
/**
 * @return the live (not copied) vector of capture-group subREs
 */
List getSubs() {
    return this.subs;
}
/**
 * Compares two equally long regions of one character sequence, one UTF-16
 * code unit at a time, optionally ignoring case.
 */
private static class Comparer implements SubstringComparator, Serializable {
    static final long serialVersionUID = 1L;
    private final boolean caseInsensitive;

    Comparer(boolean caseInsensitive) {
        this.caseInsensitive = caseInsensitive;
    }

    /*TODO: this isn't right for surrogate pairs, and it's pretty heavy for case-insensitive comparison.
     */
    /**
     * @param data the sequence holding both regions
     * @param start1 offset of the first region
     * @param start2 offset of the second region
     * @param length number of chars to compare
     * @return 0 if the regions are equal, otherwise the result of the first
     *         differing position
     */
    @Override
    public int compare(CharSequence data, int start1, int start2, int length) {
        int offset = 0;
        while (offset < length) {
            final int first = data.charAt(start1 + offset);
            final int second = data.charAt(start2 + offset);
            final int difference = caseInsensitive
                ? Normalizer.compare(first, second, Normalizer.COMPARE_IGNORE_CASE)
                : first - second;
            if (difference != 0) {
                return difference;
            }
            offset++;
        }
        return 0;
    }
}
/**
 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
 * NFA must have been optimize()d already.
 * Works in three steps: add self-loops on the pre state unless every
 * out-arc of pre is a beginning-of-string color, collect the successor
 * states of pre that also have in-arcs from elsewhere, and split each of
 * those into a "no progress" state (the original) and a "progress" copy.
 *
 * @param nfa the NFA to modify in place
 */
private void makesearch(Nfa nfa) {
Arc a;
Arc b;
State pre = nfa.pre;
State s;
State s2;
State slist;
/* no loops are needed if it's anchored */
/* the loop leaves a non-null only if some out-arc of pre is not a BOS/BOL color */
for (a = pre.outs; a != null; a = a.outchain) {
assert a.type == PLAIN;
if (a.co != nfa.bos[0] && a.co != nfa.bos[1]) {
break;
}
}
if (a != null) {
/* add implicit .* in front */
cm.rainbow(nfa, PLAIN, Constants.COLORLESS, pre, pre);
/* and ^* and \A* too -- not always necessary, but harmless */
nfa.newarc(PLAIN, nfa.bos[0], pre, pre);
nfa.newarc(PLAIN, nfa.bos[1], pre, pre);
}
/*
* Now here's the subtle part. Because many REs have no lookback
* constraints, often knowing when you were in the pre state tells
* you little; it's the next state(s) that are informative. But
* some of them may have other inarcs, i.e. it may be possible to
* make actual progress and then return to one of them. We must
* de-optimize such cases, splitting each such state into progress
* and no-progress states.
*/
/* first, make a list of the states */
/* the list is threaded through State.tmp, with slist as its head */
slist = null;
for (a = pre.outs; a != null; a = a.outchain) {
s = a.to;
/* b ends up non-null iff s has an in-arc that does not come from pre */
for (b = s.ins; b != null; b = b.inchain) {
if (b.from != pre) {
break;
}
}
if (b != null) { /* must be split */
if (s.tmp == null) { /* if not already in the list */
/* (fixes bugs 505048, 230589, */
/* 840258, 504785) */
s.tmp = slist;
slist = s;
}
}
}
/* do the splits */
for (s = slist; s != null; s = s2) {
/* s2 is first the new "progress" copy, then reused as the next list element */
s2 = nfa.newstate();
copyouts(nfa, s, s2);
/* b caches the next in-arc because freearc() unlinks a */
for (a = s.ins; a != null; a = b) {
b = a.inchain;
if (a.from != pre) {
/* redirect every in-arc that does not come from pre to the copy */
cparc(nfa, a, a.from, s2);
nfa.freearc(a);
}
}
s2 = s.tmp;
s.tmp = null; /* clean up while we're at it */
}
}
/**
 * cparc - allocate a new arc within an NFA, copying type and color from an
 * existing arc but using the supplied endpoints.
 *
 * @param nfa the NFA that receives the new arc
 * @param oa the arc whose type and color are copied
 * @param from source state of the new arc
 * @param to destination state of the new arc
 */
private void cparc(Nfa nfa, Arc oa, State from, State to) {
    nfa.newarc(
        oa.type,
        oa.co,
        from,
        to);
}
/**
 * moveins - move all in arcs of a state to another state.
 * Each arc is re-created through cparc() and the original freed, instead
 * of being re-pointed, so that the duplicate suppression built into
 * newarc applies to the moved arcs.
 *
 * @param nfa the NFA owning both states
 * @param old the state that loses its in-arcs
 * @param newState the state that receives them
 */
private void moveins(Nfa nfa, State old, State newState) {
    assert old != newState;
    // Always take the current head of the chain: freearc() unlinks it.
    for (Arc incoming = old.ins; incoming != null; incoming = old.ins) {
        cparc(nfa, incoming, incoming.from, newState);
        nfa.freearc(incoming);
    }
    assert old.nins == 0;
    assert old.ins == null;
}
/**
 * copyouts - copy all out arcs of a state to another state; the source
 * state keeps its arcs.
 *
 * @param nfa the NFA owning both states
 * @param old the state whose out-arcs are copied
 * @param newState the state that receives the copies
 */
private void copyouts(Nfa nfa, State old, State newState) {
    assert old != newState;
    Arc outgoing = old.outs;
    while (outgoing != null) {
        cparc(nfa, outgoing, newState, outgoing.to);
        outgoing = outgoing.outchain;
    }
}
/**
 * cloneouts - copy out arcs of a state to another state pair, modifying type.
 * For every out-arc of {@code old} an arc of the given type and the same
 * color is created between {@code from} and {@code to}.
 *
 * @param nfa the NFA that receives the new arcs
 * @param old the state whose out-arc colors are replicated
 * @param from source state of every new arc
 * @param to destination state of every new arc
 * @param type arc type given to every new arc
 */
private void cloneouts(Nfa nfa, State old, State from, State to, int type) {
    assert old != from;
    Arc template = old.outs;
    while (template != null) {
        nfa.newarc(type, template.co, from, to);
        template = template.outchain;
    }
}
/**
 * optst - optimize a subRE subtree.
 * At present this only walks the tree; no node is changed.
 *
 * @param t root of the subtree, may be null
 */
private void optst(Subre t) {
    if (t == null) {
        return;
    }
    /* recurse through children; a null child returns immediately */
    optst(t.left);
    optst(t.right);
}
/**
 * numst - number tree nodes (assigning retry indexes) in pre-order:
 * the node itself, then its left subtree, then its right subtree.
 *
 * @param t root of the subtree, must not be null
 * @param start the number to give to {@code t}
 * @return next number
 */
private int numst(Subre t, int start) {
    assert t != null;
    int next = start;
    t.retry = (short)next;
    next++;
    if (t.left != null) {
        next = numst(t.left, next);
    }
    if (t.right != null) {
        next = numst(t.right, next);
    }
    return next;
}
/**
 * markst - mark tree nodes as INUSE, for the node and all its descendants.
 *
 * @param t root of the subtree, must not be null
 */
private void markst(Subre t) {
    assert t != null;
    t.flags |= Subre.INUSE;
    final Subre leftChild = t.left;
    if (leftChild != null) {
        markst(leftChild);
    }
    final Subre rightChild = t.right;
    if (rightChild != null) {
        markst(rightChild);
    }
}
/**
 * nfatree - turn a subRE subtree into a tree of compacted NFAs.
 * Children are processed first (their results are discarded), then the
 * node itself.
 *
 * @param t root of the subtree; must be non-null and have a begin state
 * @return optimize results from top node
 * @throws RegexException propagated from nfanode()
 */
private long nfatree(Subre t) throws RegexException {
    assert t != null && t.begin != null;
    final Subre leftChild = t.left;
    if (leftChild != null) {
        nfatree(leftChild);
    }
    final Subre rightChild = t.right;
    if (rightChild != null) {
        nfatree(rightChild);
    }
    return nfanode(t);
}
/**
 * nfanode - do one NFA for nfatree.
 * Copies the node's sub-NFA into a fresh NFA, optimizes it and stores the
 * compacted form in {@code t.cnfa}.
 *
 * @param t the tree node; its begin state must be set
 * @return results of {@link Nfa#optimize()}
 * @throws RegexException propagated from the NFA operations
 */
private long nfanode(Subre t) throws RegexException {
    assert t.begin != null;
    if (LOG.isDebugEnabled() && IS_DEBUG) {
        LOG.debug(String.format("========= TREE NODE %s ==========", t.shortId()));
    }
    final Nfa scratch = new Nfa(nfa);
    scratch.dupnfa(t.begin, t.end, scratch.init, scratch.finalState);
    scratch.specialcolors();
    final long optimizeResult = scratch.optimize();
    t.cnfa = scratch.compact();
    // freenfa ... depend on our friend the GC.
    return optimizeResult;
}
/**
 * Shifts the flag word left by two bits (LONGER -> MIXED).
 */
private int lmix(int f) {
    final int shifted = f << 2;
    return shifted;
}
/**
 * Shifts the flag word left by one bit (SHORTER -> MIXED).
 */
private int smix(int f) {
    final int shifted = f << 1;
    return shifted;
}
/**
 * Computes the flags a node passes up to its parent: the LOCAL bits are
 * dropped, and MIXED is added when both lmix(f) and smix(f) have it set.
 *
 * @param f a subRE flag word
 * @return the flag word to propagate upward
 */
int up(int f) {
    final int withoutLocal = f & ~Subre.LOCAL;
    final int mixedIfBoth = lmix(f) & smix(f) & Subre.MIXED;
    return withoutLocal | mixedIfBoth;
}
/**
 * Consumes the next token if it has the given type.
 *
 * @param t the token type to look for
 * @return false if the next token is of another type (nothing is consumed);
 *         otherwise the result of advancing the lexer
 * @throws RegexException propagated from the lexer
 */
private boolean eat(char t) throws RegexException {
    if (!see(t)) {
        return false;
    }
    return lex.next();
}
/**
 * @param f a subRE flag word
 * @return true if any of MIXED, CAP or BACKR is set in f
 */
private boolean messy(int f) {
    final int messyBits = Subre.MIXED | Subre.CAP | Subre.BACKR;
    return (f & messyBits) != 0;
}
/**
 * parse - parse an RE
 * This is actually just the top level, which parses a bunch of branches
 * tied together with '|'. They appear in the tree as the left children
 * of a chain of '|' subres.
 *
 * @param stopper the token type that ends this RE: ')' or EOS
 * @param type passed through to parsebranch()
 * @param initState NFA state every branch starts from
 * @param finalState NFA state every branch ends in
 * @return the subRE for the whole alternation; when there is only one
 *         branch, that branch's subRE itself
 * @throws RegexException if the RE ends without the expected stopper
 */
private Subre parse(int stopper, int type, State initState, State finalState) throws RegexException {
    assert stopper == ')' || stopper == EOS;

    final Subre head = new Subre('|', Subre.LONGER, initState, finalState); /* top level */
    Subre current = head; /* current branch */
    boolean needNewNode = false; /* false only for the first branch */

    do { /* a branch */
        if (needNewNode) {
            /* need a place to hang it */
            current.right = new Subre('|', Subre.LONGER, initState, finalState);
            current = current.right;
        }
        needNewNode = true;

        /* scaffolding for branch */
        final State branchLeft = nfa.newstate();
        final State branchRight = nfa.newstate();
        nfa.emptyarc(initState, branchLeft);
        nfa.emptyarc(branchRight, finalState);

        current.left = parsebranch(stopper, type, branchLeft, branchRight, false);
        current.flags |= up(current.flags | current.left.flags);
        if ((current.flags & ~head.flags) != 0) { /* new flags */
            for (Subre earlier = head; earlier != current; earlier = earlier.right) {
                earlier.flags |= current.flags;
            }
        }
    } while (eat('|'));

    assert see(stopper) || see(EOS);
    if (!see(stopper)) {
        assert stopper == ')' && see(EOS);
        //ERR(REG_EPAREN);
        throw new RegexException("Unbalanced parentheses.");
    }

    /* optimize out simple cases */
    if (current == head) { /* only one branch */
        assert current.right == null;
        final Subre onlyBranch = current.left;
        current.left = null;
        return onlyBranch;
    }
    if (!messy(head.flags)) { /* no interesting innards */
        head.left = null;
        head.right = null;
        head.op = '=';
    }
    return head;
}
/**
 * parsebranch - parse one branch of an RE
 * This mostly manages concatenation, working closely with parseqatom().
 * Concatenated things are bundled up as much as possible, with separate
 * ',' nodes introduced only when necessary due to substructure.
 *
 * @param stopper the token type that ends the enclosing RE
 * @param type passed through to parseqatom()
 * @param left NFA state the branch starts from
 * @param right NFA state the branch ends in
 * @param partial true when this is the tail of a branch; suppresses the
 *                REG_UUNSPEC note for an empty branch
 * @return the subRE for the branch
 * @throws RegexException propagated from parseqatom()
 */
private Subre parsebranch(int stopper, int type, State left, State right, boolean partial) throws RegexException {
    final Subre branch = new Subre('=', 0, left, right); /* op '=' is tentative */
    State atomStart = left; /* left end of current construct */
    boolean empty = true; /* nothing in this branch yet? */

    while (!(see('|') || see(stopper) || see(EOS))) {
        if (!empty) { /* implicit concat operator */
            atomStart = nfa.newstate();
            moveins(nfa, right, atomStart);
        }
        empty = false;
        /* NB, recursion in parseqatom() may swallow rest of branch */
        parseqatom(stopper, type, atomStart, right, branch);
    }

    if (empty) { /* empty branch */
        if (!partial) {
            note(Flags.REG_UUNSPEC);
        }
        nfa.emptyarc(left, right);
    }
    return branch;
}
//CHECKSTYLE:OFF
/**
 * parseqatom - parse one quantified atom or constraint of a branch.
 * Constraints (anchors, word boundaries, lookahead) add their arcs between
 * lp and rp and return immediately. Otherwise an atom is parsed, then an
 * optional quantifier; simple cases are handled by repeat() directly on the
 * NFA, while "messy" cases (capturing parens, back references, mixed
 * length preferences) build subRE structure under {@code top} and then
 * parse the rest of the branch recursively.
 *
 * @param stopper the token type that ends the enclosing RE
 * @param type LACON when parsing inside a lookahead constraint, otherwise PLAIN
 * @param lp left state to hang the construct on; must have no out-arcs
 * @param rp right state to hang the construct on; must have no in-arcs
 * @param top the subRE of the enclosing branch, updated in place
 * @throws RegexException on any syntax error
 */
private void parseqatom(int stopper, int type, State lp, State rp, Subre top) throws RegexException {
State s; /* temporaries for new states */
State s2;
int m;
int n;
Subre atom; /* atom's subtree */
Subre t;
boolean cap; /* capturing parens? */
int pos; /* positive lookahead? */
int subno; /* capturing-parens or backref number */
int atomtype;
int qprefer; /* quantifier short/long preference */
int f;
AtomSetter atomp;
/* initial bookkeeping */
atom = null;
assert lp.nouts == 0; /* must string new code */
assert rp.nins == 0; /* between lp and rp */
subno = 0; /* just to shut lint up */
/* an atom or constraint... */
atomtype = nexttype;
switch (atomtype) {
/* first, constraints, which end by returning */
case '^':
nfa.newarc('^', (short)1, lp, rp);
if (0 != (cflags & Flags.REG_NLANCH)) {
nfa.newarc(BEHIND, nlcolor, lp, rp);
}
lex.next();
return;
case '$':
nfa.newarc('$', (short)1, lp, rp);
if (0 != (cflags & Flags.REG_NLANCH)) {
nfa.newarc(AHEAD, nlcolor, lp, rp);
}
lex.next();
return;
case SBEGIN:
nfa.newarc('^', (short)1, lp, rp); /* BOL */
nfa.newarc('^', (short)0, lp, rp); /* or BOS */
lex.next();
return;
case SEND:
nfa.newarc('$', (short)1, lp, rp); /* EOL */
nfa.newarc('$', (short)0, lp, rp); /* or EOS */
lex.next();
return;
case '<':
/* start of word: non-word char behind, word char ahead */
wordchrs(); /* does next() */
s = nfa.newstate();
nonword(BEHIND, lp, s);
word(AHEAD, s, rp);
return;
case '>':
/* end of word: word char behind, non-word char ahead */
wordchrs(); /* does next() */
s = nfa.newstate();
word(BEHIND, lp, s);
nonword(AHEAD, s, rp);
return;
case WBDRY:
/* either of the two word-edge paths above */
wordchrs(); /* does next() */
s = nfa.newstate();
nonword(BEHIND, lp, s);
word(AHEAD, s, rp);
s = nfa.newstate();
word(BEHIND, lp, s);
nonword(AHEAD, s, rp);
return;
case NWBDRY:
/* same kind of character on both sides */
wordchrs(); /* does next() */
s = nfa.newstate();
word(BEHIND, lp, s);
word(AHEAD, s, rp);
s = nfa.newstate();
nonword(BEHIND, lp, s);
nonword(AHEAD, s, rp);
return;
case LACON: /* lookahead constraint */
pos = nextvalue;
lex.next();
s = nfa.newstate();
s2 = nfa.newstate();
parse(')', LACON, s, s2); // parse for side-effect.
assert see(')');
lex.next();
n = newlacon(s, s2, pos);
nfa.newarc(LACON, (short)n, lp, rp);
return;
/* then errors, to get them out of the way */
case '*':
case '+':
case '?':
case '{':
throw new RegexException("Pattern syntax error (*+?} misplaced).");
/* then plain characters, and minor variants on that theme */
case ')': /* unbalanced paren */
if ((cflags & Flags.REG_ADVANCED) != Flags.REG_EXTENDED) {
throw new RegexException("Unbalanced parenthesis.");
}
/* legal in EREs due to specification botch */
note(Flags.REG_UPBOTCH);
/* fallthrough into case PLAIN */
case PLAIN:
// look out for surrogates as ordinary chars.
if (nextvalue < Character.MAX_VALUE && Character.isHighSurrogate((char)nextvalue)) {
char high = (char)nextvalue;
lex.next();
/* the following token's value is taken as the low surrogate without being checked */
char low = (char)nextvalue;
int codepoint = Character.toCodePoint(high, low);
onechr(codepoint, lp, rp);
} else {
onechr(nextvalue, lp, rp);
}
cm.okcolors(nfa);
lex.next();
break;
case '[':
if (nextvalue == 1) {
bracket(lp, rp);
} else {
cbracket(lp, rp);
}
assert see(']');
lex.next();
break;
case '.':
cm.rainbow(nfa, PLAIN,
(0 != (cflags & Flags.REG_NLSTOP)) ? nlcolor : Constants.COLORLESS,
lp, rp);
lex.next();
break;
/* and finally the ugly stuff */
case '(': /* value flags as capturing or non */
if (type == LACON) {
cap = false;
} else {
cap = nextvalue != 0;
}
if (cap) {
subno = subs.size() + 1; // first subno is 1.
/*
* This recurses via a call to parse just below.
* So, the size() just above has to reflect this new sub,
* even though we won't create the object until a little further
* down.
*/
subs.add(null);
} else {
atomtype = PLAIN; /* something that's not '(' */
}
lex.next();
/* need new endpoints because tree will contain pointers */
s = nfa.newstate();
s2 = nfa.newstate();
nfa.emptyarc(lp, s);
nfa.emptyarc(s2, rp);
atom = parse(')', PLAIN, s, s2);
assert see(')');
lex.next();
if (cap) {
// we can't assert anything about the size of 'subs', recursion may have added to it.
// but we can check that nothing has used our slot.
assert subs.get(subno - 1) == null;
subs.set(subno - 1, atom);
t = new Subre('(', atom.flags | Subre.CAP, lp, rp);
t.subno = subno;
t.left = atom;
atom = t;
}
/* postpone everything else pending possible {0} */
break;
case BACKREF: /* the Feature From The Black Lagoon */
if (type == LACON) {
throw new RegexException("REG_ESUBREG");
}
if (nextvalue > subs.size()) {
throw new RegexException(String.format("Backreference to %d out of range of defined subexpressions (%d)", nextvalue, subs
.size()));
}
if (subs.get(nextvalue - 1) == null) { // \1 is first backref, living in slot 0.
throw new RegexException(String.format("Backreference to %d refers to non-capturing group.", nextvalue));
}
assert nextvalue > 0;
atom = new Subre('b', Subre.BACKR, lp, rp);
subno = nextvalue ;
atom.subno = subno;
nfa.emptyarc(lp, rp); /* temporarily, so there's something */
lex.next();
break;
default:
throw new RuntimeException("Impossible type in lex");
}
/* ...and an atom may be followed by a quantifier */
/* m and n become the lower and upper repetition bounds; n may be INFINITY */
switch (nexttype) {
case '*':
m = 0;
n = INFINITY;
qprefer = (nextvalue != 0) ? Subre.LONGER : Subre.SHORTER;
lex.next();
break;
case '+':
m = 1;
n = INFINITY;
qprefer = (nextvalue != 0) ? Subre.LONGER : Subre.SHORTER;
lex.next();
break;
case '?':
m = 0;
n = 1;
qprefer = (nextvalue != 0) ? Subre.LONGER : Subre.SHORTER;
lex.next();
break;
case '{':
lex.next();
m = scannum();
if (eat(',')) {
if (see(DIGIT)) {
n = scannum();
} else {
n = INFINITY;
}
if (m > n) {
throw new RegexException("First quantity is larger than second quantity in {m,n} quantifier.");
}
/* {m,n} exercises preference, even if it's {m,m} */
qprefer = (nextvalue != 0) ? Subre.LONGER : Subre.SHORTER;
} else {
n = m;
/* {m} passes operand's preference through */
qprefer = 0;
}
if (!see('}')) { /* catches errors too */
throw new RegexException("Invalid syntax for {m,n} quantifier.");
}
lex.next();
break;
default: /* no quantifier */
m = 1;
n = 1;
qprefer = 0;
break;
}
/* annoying special case: {0} or {0,0} cancels everything */
if (m == 0 && n == 0) {
if (atomtype == '(') {
/*
* NOTE(review): subno was assigned subs.size() + 1 before the slot was
* added, so subno is at most subs.size() here and this assertion looks
* like it cannot hold when assertions are enabled. Also, if nested
* capturing groups added slots after ours, remove() drops the last
* slot rather than slot subno - 1 -- confirm the intended behavior.
*/
assert subno == subs.size() - 1;
subs.remove(subs.size() - 1);
}
delsub(nfa, lp, rp);
nfa.emptyarc(lp, rp);
return;
}
/* if not a messy case, avoid hard part */
assert !messy(top.flags);
f = top.flags | qprefer | ((atom != null) ? atom.flags : 0);
if (atomtype != '(' && atomtype != BACKREF && !messy(up(f))) {
if (!(m == 1 && n == 1)) {
repeat(lp, rp, m, n);
}
top.flags = f;
return;
}
/*
* hard part: something messy
* That is, capturing parens, back reference, short/long clash, or
* an atom with substructure containing one of those.
*/
/* now we'll need a subre for the contents even if they're boring */
if (atom == null) {
atom = new Subre('=', 0, lp, rp);
}
/*
* prepare a general-purpose state skeleton
*
* --. [s] ---prefix--. [begin] ---atom--. [end] ----rest--. [rp]
* / /
* [lp] ---. [s2] ----bypass---------------------
*
* where bypass is an empty, and prefix is some repetitions of atom
*/
s = nfa.newstate(); /* first, new endpoints for the atom */
s2 = nfa.newstate();
nfa.moveouts(lp, s);
nfa.moveins(rp, s2);
atom.begin = s;
atom.end = s2;
s = nfa.newstate(); /* and spots for prefix and bypass */
s2 = nfa.newstate();
nfa.emptyarc(lp, s);
nfa.emptyarc(lp, s2);
/* break remaining subRE into x{...} and what follows */
t = new Subre('.', Subre.combine(qprefer, atom.flags), lp, rp);
t.left = atom;
/* atomp stands in for a pointer to the tree slot that currently holds the atom */
final Subre target = t;
atomp = new AtomSetter() {
@Override
public void set(Subre s) {
target.left = s;
}
};
/* here we should recurse... but we must postpone that to the end */
/* split top into prefix and remaining */
assert top.op == '=' && top.left == null && top.right == null;
top.left = new Subre('=', top.flags, top.begin, lp);
top.op = '.';
top.right = t;
/* if it's a backref, now is the time to replicate the subNFA */
if (atomtype == BACKREF) {
assert atom.begin.nouts == 1; /* just the EMPTY */
delsub(nfa, atom.begin, atom.end);
assert subs.get(subno - 1) != null;
/* and here's why the recursion got postponed: it must */
/* wait until the skeleton is filled in, because it may */
/* hit a backref that wants to copy the filled-in skeleton */
nfa.dupnfa(subs.get(subno - 1).begin, subs.get(subno - 1).end,
atom.begin, atom.end);
}
/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
if (m == 0) {
nfa.emptyarc(s2, atom.end); /* the bypass */
assert Subre.pref(qprefer) != 0;
f = Subre.combine(qprefer, atom.flags);
t = new Subre('|', f, lp, atom.end);
t.left = atom;
t.right = new Subre('|', Subre.pref(f), s2, atom.end);
t.right.left = new Subre('=', 0, s2, atom.end);
atomp.set(t);
/* the atom now lives under the new '|' node, so retarget the setter */
final Subre target2 = t;
atomp = new AtomSetter() {
@Override
public void set(Subre s) {
target2.left = s;
}
};
m = 1;
}
/* deal with the rest of the quantifier */
if (atomtype == BACKREF) {
/* special case: backrefs have internal quantifiers */
nfa.emptyarc(s, atom.begin); /* empty prefix */
/* just stuff everything into atom */
repeat(atom.begin, atom.end, m, n);
atom.min = (short)m;
atom.max = (short)n;
atom.flags |= Subre.combine(qprefer, atom.flags);
} else if (m == 1 && n == 1) {
/* no/vacuous quantifier: done */
nfa.emptyarc(s, atom.begin); /* empty prefix */
} else {
/* turn x{m,n} into x{m-1,n-1}x, with capturing */
/* parens in only second x */
nfa.dupnfa(atom.begin, atom.end, s, atom.begin);
assert m >= 1 && m != INFINITY && n >= 1;
repeat(s, atom.begin, m - 1, (n == INFINITY) ? n : n - 1);
f = Subre.combine(qprefer, atom.flags);
t = new Subre('.', f, s, atom.end); /* prefix and atom */
t.left = new Subre('=', Subre.pref(f), s, atom.begin);
t.right = atom;
atomp.set(t);
}
/* and finally, look after that postponed recursion */
t = top.right;
if (!(see('|') || see(stopper) || see(EOS))) {
t.right = parsebranch(stopper, type, atom.end, rp, true);
} else {
nfa.emptyarc(atom.end, rp);
t.right = new Subre('=', 0, atom.end, rp);
}
assert see('|') || see(stopper) || see(EOS);
t.flags |= Subre.combine(t.flags, t.right.flags);
top.flags |= Subre.combine(top.flags, t.flags);
}
//CHECKSTYLE:ON
/**
 * delsub - delete a sub-NFA, leaving only its two end states.
 * The right end is marked through its tmp field before the traversal and
 * both marks are cleared afterwards.
 *
 * @param nfa the NFA being edited
 * @param lp left end of the sub-NFA; ends up with no out-arcs
 * @param rp right end of the sub-NFA; ends up with no in-arcs
 */
private void delsub(Nfa nfa, State lp, State rp) {
    rp.tmp = rp; /* mark end */
    deltraverse(nfa, lp, lp);
    assert lp.nouts == 0 && rp.nins == 0; /* did the job */
    assert lp.no != State.FREESTATE && rp.no != State.FREESTATE; /* no more */
    rp.tmp = null; /* unmark end */
    lp.tmp = null; /* and begin */
}
/**
 * deltraverse - the recursive heart of delsub
 * This routine's basic job is to destroy all out-arcs of the state.
 * States whose tmp field is set are treated as in progress and are not
 * entered again; a target state is freed once it has no in-arcs left and
 * is not marked.
 *
 * @param nfa the NFA being edited
 * @param leftend the state the whole traversal started from
 * @param s the state whose out-arcs are destroyed
 */
private void deltraverse(Nfa nfa, State leftend, State s) {
    /* nothing to do, or already in progress */
    if (s.nouts == 0 || s.tmp != null) {
        return;
    }
    s.tmp = s; /* mark as in progress */

    /* always take the current head of the chain: freearc() unlinks it */
    for (Arc out = s.outs; out != null; out = s.outs) {
        final State target = out.to;
        deltraverse(nfa, leftend, target);
        assert target.nouts == 0 || target.tmp != null;
        nfa.freearc(out);
        final boolean orphaned = target.nins == 0 && target.tmp == null;
        if (orphaned) {
            assert target.nouts == 0;
            nfa.freestate(target);
        }
    }

    assert s.no != State.FREESTATE; /* we're still here */
    assert s == leftend || s.nins != 0; /* and still reachable */
    assert s.nouts == 0; /* but have no outarcs */
    s.tmp = null; /* we're done here */
}
/**
 * nonword - generate arcs for non-word-character ahead or behind.
 * Adds both anchor arcs ('$' when looking ahead, '^' when looking behind)
 * plus the complement of the word-character colors.
 *
 * @param dir AHEAD or BEHIND
 * @param lp source state of the new arcs
 * @param rp destination state of the new arcs
 */
private void nonword(int dir, State lp, State rp) {
    final int anchor;
    if (dir == AHEAD) {
        anchor = '$';
    } else {
        anchor = '^';
    }
    assert dir == AHEAD || dir == BEHIND;
    final short[] anchorColors = {1, 0};
    for (short color : anchorColors) {
        nfa.newarc(anchor, color, lp, rp);
    }
    cm.colorcomplement(nfa, dir, wordchrs, lp, rp);
    /* (no need for special attention to \n) */
}
/**
 * word - generate arcs for word character ahead or behind, one per
 * out-arc of the word-character state.
 *
 * @param dir AHEAD or BEHIND; used as the type of the new arcs
 * @param lp source state of the new arcs
 * @param rp destination state of the new arcs
 */
private void word(int dir, State lp, State rp) {
    assert dir == AHEAD || dir == BEHIND;
    final State wordCharList = wordchrs;
    cloneouts(nfa, wordCharList, lp, rp, dir);
    /* (no need for special attention to \n) */
}
/**
 * scannum - scan a number made of consecutive DIGIT tokens.
 * Scanning stops early once the accumulated value reaches DUPMAX, so a
 * remaining digit or a value above DUPMAX is reported as an error.
 *
 * @return value <= DUPMAX
 * @throws RegexException if the number is larger than DUPMAX
 */
private int scannum() throws RegexException {
    int value = 0;
    while (true) {
        if (!see(DIGIT) || value >= DUPMAX) {
            break;
        }
        value = value * 10 + nextvalue;
        lex.next();
    }
    final boolean tooLarge = see(DIGIT) || value > DUPMAX;
    if (tooLarge) {
        throw new RegexException("Unvalid reference number.");
    }
    return value;
}
/**
 * repeat - replicate subNFA for quantifiers
 * The duplication sequences used here are chosen carefully so that any
 * pointers starting out pointing into the subexpression end up pointing into
 * the last occurrence. (Note that it may not be strung between the same
 * left and right end states, however!) This used to be important for the
 * subRE tree, although the important bits are now handled by the in-line
 * code in parse(), and when this is called, it doesn't matter any more.
 *
 * @param lp left end of the sub-NFA to repeat
 * @param rp right end of the sub-NFA to repeat
 * @param m minimum number of repetitions
 * @param n maximum number of repetitions, or INFINITY
 * @throws RegexException propagated from the recursive calls
 */
private void repeat(State lp, State rp, int m, int n) throws RegexException {
    /* classify both bounds as 0, 1, SOME or INF and dispatch on the pair */
    final int shape = pair(reduce(m), reduce(n));
    switch (shape) {
    case 4 * 0 + 0: /* (0, 0): empty string */
        // never get here; other code optimizes this out.
        delsub(nfa, lp, rp);
        nfa.emptyarc(lp, rp);
        break;
    case 4 * 0 + 1: /* (0, 1): do as x| */
        nfa.emptyarc(lp, rp);
        break;
    case 4 * 0 + SOME: /* (0, SOME): do as x{1,n}| */
        repeat(lp, rp, 1, n);
        nfa.emptyarc(lp, rp);
        break;
    case 4 * 0 + INF: { /* (0, INF): loop x around */
        final State hub = nfa.newstate();
        nfa.moveouts(lp, hub);
        nfa.moveins(rp, hub);
        nfa.emptyarc(lp, hub);
        nfa.emptyarc(hub, rp);
        break;
    }
    case 4 * 1 + 1: /* (1, 1): no action required */
        break;
    case 4 * 1 + SOME: { /* (1, SOME): do as x{0,n-1}x = (x{1,n-1}|)x */
        final State join = nfa.newstate();
        nfa.moveouts(lp, join);
        nfa.dupnfa(join, rp, lp, join);
        repeat(lp, join, 1, n - 1);
        nfa.emptyarc(lp, join);
        break;
    }
    case 4 * 1 + INF: { /* (1, INF): add loopback arc */
        final State loopHead = nfa.newstate();
        final State loopTail = nfa.newstate();
        nfa.moveouts(lp, loopHead);
        nfa.moveins(rp, loopTail);
        nfa.emptyarc(lp, loopHead);
        nfa.emptyarc(loopTail, rp);
        nfa.emptyarc(loopTail, loopHead);
        break;
    }
    case 4 * SOME + SOME: { /* (SOME, SOME): do as x{m-1,n-1}x */
        final State join = nfa.newstate();
        nfa.moveouts(lp, join);
        nfa.dupnfa(join, rp, lp, join);
        repeat(lp, join, m - 1, n - 1);
        break;
    }
    case 4 * SOME + INF: { /* (SOME, INF): do as x{m-1,}x */
        final State join = nfa.newstate();
        nfa.moveouts(lp, join);
        nfa.dupnfa(join, rp, lp, join);
        repeat(lp, join, m - 1, n);
        break;
    }
    default:
        throw new RuntimeException("Impossible quantification");
    }
}
/**
 * wordchrs - set up word-chr list for word-boundary stuff, if needed
 * The list is kept as a bunch of arcs between two dummy states; it's
 * disposed of by the unreachable-states sweep in NFA optimization.
 * Does NEXT(). Must not be called from any unusual lexical context.
 * This should be reconciled with the \w etc. handling in lex.c, and
 * should be cleaned up to reduce dependencies on input scanning.
 *
 * @throws RegexException propagated from the lexer or bracket()
 */
private void wordchrs() throws RegexException {
    if (wordchrs != null) {
        /* list already built; just consume the token */
        lex.next(); /* for consistency */
        return;
    }

    final State listStart = nfa.newstate();
    final State listEnd = nfa.newstate();
    /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
    lex.lexword();
    lex.next();
    assert savepattern != null && see('[');
    bracket(listStart, listEnd);
    assert savepattern != null && see(']');
    lex.next();
    wordchrs = listStart;
}
/**
 * bracket - handle non-complemented bracket expression
 * Also called from cbracket for complemented bracket expressions.
 * Consumes the opening '[' and every item up to, but not including, the
 * closing ']'.
 *
 * @param lp source state of the arcs to add
 * @param rp destination state of the arcs to add
 * @throws RegexException propagated from brackpart()
 */
private void bracket(State lp, State rp) throws RegexException {
    assert see('[');
    lex.next();
    while (!(see(']') || see(EOS))) {
        brackpart(lp, rp);
    }
    assert see(']');
    cm.okcolors(nfa);
}
//CHECKSTYLE:OFF
/**
 * brackpart - handle one item (or range) within a bracket expression.
 * An item is a plain character, a collating element, an equivalence class
 * or a character class; a plain character or collating element may also
 * start a range. The matching arcs are added between lp and rp.
 *
 * @param lp source state of the arcs to add
 * @param rp destination state of the arcs to add
 * @throws RegexException on malformed ranges, missing or unknown names
 */
private void brackpart(State lp, State rp) throws RegexException {
    UnicodeSet set;
    /*
     * OK, well; if the user uses \U the item that comes next can be a full codepoint.
     * If the user does not, it might be part of a surrogate.
     */
    int c;
    // start and end chars of a range
    int startc;
    int endc;
    int ele;

    /* parse something, get rid of special cases, take shortcuts */
    switch (nexttype) {
    case RANGE: /* a-b-c or other botch */
        throw new RegexException("Invalid syntax in range expression.");
    case PLAIN:
        c = nextvalue;
        lex.next();
        if (c <= Character.MAX_VALUE && Character.isHighSurrogate((char)c)) {
            // some idiot could write a high surrogate and then immediate \\Uxxxxx, woof.
            char low = (char)nextvalue;
            lex.next();
            startc = Character.toCodePoint((char)c, low);
        } else {
            startc = c;
        }
        /* shortcut for ordinary char (not range, not MCCE leader) */
        if (!see(RANGE)) {
            onechr(startc, lp, rp);
            return;
        }
        break;
    // COLLEL and ECLASS are of dubious utility and don't try to get surrogates right.
    case COLLEL:
        String charName = scanplain();
        if (charName.length() == 0) {
            throw new RegexException("Missing character name for collation.");
        }
        ele = Locale.element(charName);
        if (ele == -1) {
            throw new RegexException("Invalid character name " + charName);
        } else {
            startc = (char)ele;
        }
        break;
    case ECLASS:
        charName = scanplain();
        if (charName.length() == 0) {
            throw new RegexException("Unterminated or invalid equivalence class.");
        }
        ele = Locale.element(charName);
        if (ele == -1) {
            throw new RegexException("Invalid character name " + charName);
        } else {
            startc = (char)ele;
        }
        set = Locale.eclass((char)startc, 0 != (cflags & Flags.REG_ICASE));
        dovec(set, lp, rp);
        return;
    case CCLASS:
        String className = scanplain();
        if (className.length() == 0) {
            throw new RegexException("Missing class name for char class.");
        }
        set = Locale.cclass(className, 0 != (cflags & Flags.REG_ICASE));
        dovec(set, lp, rp);
        return;
    default:
        throw new RegexException("Impossible lexical state.");
    }

    if (see(RANGE)) {
        lex.next();
        switch (nexttype) {
        case PLAIN:
        case RANGE:
            c = nextvalue;
            lex.next();
            if (c <= Character.MAX_VALUE && Character.isHighSurrogate((char)c)) {
                char low = (char)nextvalue;
                lex.next();
                endc = Character.toCodePoint((char)c, low);
            } else {
                endc = c;
            }
            break;
        case COLLEL:
            String charName = scanplain();
            if (charName.length() == 0) {
                throw new RegexException("Missing character name in collation.");
            }
            // look up named character.
            ele = Locale.element(charName);
            if (ele == -1) {
                throw new RegexException("Invalid character name " + charName);
            }
            // The named element is the end of the range. Without this
            // assignment the range would silently end at code point 0.
            endc = (char)ele;
            break;
        default:
            throw new RegexException("Invalid syntax in range.");
        }
    } else {
        endc = startc;
    }

    set = new UnicodeSet(startc, endc);
    if (0 != (cflags & Flags.REG_ICASE)) {
        set.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
    }
    dovec(set, lp, rp);
}
//CHECKSTYLE:ON
/**
 * scanplain - collect the PLAIN tokens inside a [. [= or [: construct.
 * Must be entered with the opening token current; on return the closing
 * END token has been consumed. Certain bits of trickery in lex.c know
 * that this code does not try to look past the final bracket of the
 * [. etc.
 *
 * @return the scanned text as a string. Offsets cannot be returned
 * instead: a nested pattern is popped by the final lex.next() in here,
 * which would invalidate them.
 */
private String scanplain() throws RegexException {
    final int from = now;
    assert see(COLLEL) || see(ECLASS) || see(CCLASS);
    lex.next();
    /* 'to' trails the lexer: it is the value of 'now' while the last PLAIN was current */
    int to = now;
    for (; see(PLAIN); lex.next()) {
        to = now;
    }
    final String text = new String(pattern, from, to - from);
    assert see(END);
    lex.next();
    return text;
}
/**
 * cbracket - handle complemented bracket expression
 * The bracket expression is first built between two scratch states via
 * bracket(), and the complement of those arcs is then laid down between
 * lp and rp. (Invoking rainbow() and deleting arcs as the b.e. is seen
 * would be the alternative, but that gets messy.)
 *
 * @param lp state the complemented arcs leave from; must have no out-arcs yet
 * @param rp state the complemented arcs arrive at
 */
private void cbracket(State lp, State rp) throws RegexException {
    final State scratchFrom = nfa.newstate();
    final State scratchTo = nfa.newstate();
    bracket(scratchFrom, scratchTo);

    /* with REG_NLSTOP, newline joins the scratch set and so drops out of the complement */
    final boolean newlineStops = (cflags & Flags.REG_NLSTOP) != 0;
    if (newlineStops) {
        nfa.newarc(PLAIN, nlcolor, scratchFrom, scratchTo);
    }

    assert lp.nouts == 0; /* all outarcs will be ours */

    /* easy part of complementing */
    cm.colorcomplement(nfa, PLAIN, scratchFrom, lp, rp);

    // No MCCE in Java.
    nfa.dropstate(scratchFrom);
    assert scratchTo.nins == 0;
    nfa.freestate(scratchTo);
}
/**
 * newlacon - allocate a lookahead-constraint subRE
 * Slot 0 of the lacons list is never used for a real entry; it is filled
 * with a null placeholder the first time through.
 *
 * @param begin first state handed to the new Subre
 * @param end second state handed to the new Subre
 * @param pos value stored in the new Subre's subno field
 * @return lacon number, i.e. the index of the new entry in the lacons list
 */
private int newlacon(State begin, State end, int pos) {
    if (lacons.isEmpty()) {
        // reserve slot 0
        lacons.add(null);
    }
    final Subre constraint = new Subre((char)0, 0, begin, end);
    constraint.subno = pos;
    // the entry lands at the current size, so capture that before appending
    final int laconNumber = lacons.size();
    lacons.add(constraint);
    return laconNumber;
}
/**
 * onechr - fill in arcs for a plain character, and possible case complements
 * A shortcut for the common case: when case does not matter a single arc
 * suffices, otherwise all case variants go through dovec().
 *
 * @param c the character (code point) to add
 * @param lp state the arcs leave from
 * @param rp state the arcs arrive at
 */
private void onechr(int c, State lp, State rp) throws RegexException {
    final boolean caseInsensitive = (cflags & Flags.REG_ICASE) != 0;
    if (caseInsensitive) {
        /* rats, need general case anyway... */
        dovec(Locale.allcases(c), lp, rp);
    } else {
        nfa.newarc(PLAIN, cm.subcolor(c), lp, rp);
    }
}
/**
 * dovec - fill in arcs for every range held in a UnicodeSet
 * (the Java stand-in for a cvec; all kinds of MCCE complexity removed).
 *
 * @param set the characters to add, walked one range at a time
 * @param lp state the arcs leave from
 * @param rp state the arcs arrive at
 */
private void dovec(UnicodeSet set, State lp, State rp) throws RegexException {
    final int total = set.getRangeCount();
    int index = 0;
    while (index < total) {
        /*
         * Note: ICU operates in UTF-32 here, and the ColorMap is happy to play along.
         */
        final int lo = set.getRangeStart(index);
        final int hi = set.getRangeEnd(index);
        if (LOG.isDebugEnabled() && IS_DEBUG) {
            LOG.debug(String.format("%s %d %4x %4x", set, index, lo, hi));
        }
        //TODO: this arc is probably redundant.
        if (lo == hi) {
            nfa.newarc(PLAIN, cm.subcolor(lo), lp, rp);
        }
        cm.subrange(lo, hi, lp, rp);
        index++;
    }
}
/**
 * note - OR the given bits into the {@code info} field.
 *
 * @param b the bits to set
 */
void note(long b) {
info |= b;
}
/**
 * Callback that is handed a {@link Subre}.
 * NOTE(review): presumably used to store a parsed atom somewhere chosen by
 * the caller -- confirm against the implementations elsewhere in this file.
 */
interface AtomSetter {
/**
 * Receive the given sub-RE.
 *
 * @param s the Subre being passed to the callback
 */
void set(Subre s);
}
/**
 * @return the value of this compiler's {@code nfa} field
 */
Nfa getNfa() {
return nfa;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy