com.google.re2j.Compiler Maven / Gradle / Ivy
/*
* Copyright (c) 2020 The Go Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
*/
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/compile.go
package com.google.re2j;
/**
* Compiler from {@code Regexp} (RE2 abstract syntax) to {@code RE2} (compiled regular expression).
*
* The only entry point is {@link #compileRegexp}.
*/
class Compiler {
/**
* A fragment of a compiled regular expression program.
*
* @see http://swtch.com/~rsc/regexp/regexp1.html
*/
private static class Frag {
final int i; // an instruction address (pc).
int out; // a patch list; see explanation in Prog.java
Frag() {
this(0, 0);
}
Frag(int i) {
this(i, 0);
}
Frag(int i, int out) {
this.i = i;
this.out = out;
}
}
private final Prog prog = new Prog(); // Program being built
private Compiler() {
newInst(Inst.FAIL); // always the first instruction
}
static Prog compileRegexp(Regexp re) {
Compiler c = new Compiler();
Frag f = c.compile(re);
c.prog.patch(f.out, c.newInst(Inst.MATCH).i);
c.prog.start = f.i;
return c.prog;
}
private Frag newInst(int op) {
// TODO(rsc): impose length limit.
prog.addInst(op);
return new Frag(prog.numInst() - 1);
}
// Returns a no-op fragment. Sometimes unavoidable.
private Frag nop() {
Frag f = newInst(Inst.NOP);
f.out = f.i << 1;
return f;
}
private Frag fail() {
return new Frag();
}
// Given fragment a, returns (a) capturing as \n.
// Given a fragment a, returns a fragment with capturing parens around a.
private Frag cap(int arg) {
Frag f = newInst(Inst.CAPTURE);
f.out = f.i << 1;
prog.getInst(f.i).arg = arg;
if (prog.numCap < arg + 1) {
prog.numCap = arg + 1;
}
return f;
}
// Given fragments a and b, returns ab; a|b
private Frag cat(Frag f1, Frag f2) {
// concat of failure is failure
if (f1.i == 0 || f2.i == 0) {
return fail();
}
// TODO(rsc): elide nop
prog.patch(f1.out, f2.i);
return new Frag(f1.i, f2.out);
}
// Given fragments for a and b, returns fragment for a|b.
private Frag alt(Frag f1, Frag f2) {
// alt of failure is other
if (f1.i == 0) {
return f2;
}
if (f2.i == 0) {
return f1;
}
Frag f = newInst(Inst.ALT);
Inst i = prog.getInst(f.i);
i.out = f1.i;
i.arg = f2.i;
f.out = prog.append(f1.out, f2.out);
return f;
}
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
private Frag quest(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
i.arg = f1.i;
f.out = f.i << 1;
} else {
i.out = f1.i;
f.out = f.i << 1 | 1;
}
f.out = prog.append(f.out, f1.out);
return f;
}
// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
private Frag star(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
i.arg = f1.i;
f.out = f.i << 1;
} else {
i.out = f1.i;
f.out = f.i << 1 | 1;
}
prog.patch(f1.out, f.i);
return f;
}
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
private Frag plus(Frag f1, boolean nongreedy) {
return new Frag(f1.i, star(f1, nongreedy).out);
}
// op is a bitmask of EMPTY_* flags.
private Frag empty(int op) {
Frag f = newInst(Inst.EMPTY_WIDTH);
prog.getInst(f.i).arg = op;
f.out = f.i << 1;
return f;
}
private Frag rune(int rune, int flags) {
return rune(new int[] {rune}, flags);
}
// flags : parser flags
private Frag rune(int[] runes, int flags) {
Frag f = newInst(Inst.RUNE);
Inst i = prog.getInst(f.i);
i.runes = runes;
flags &= RE2.FOLD_CASE; // only relevant flag is FoldCase
if (runes.length != 1 || Unicode.simpleFold(runes[0]) == runes[0]) {
flags &= ~RE2.FOLD_CASE; // and sometimes not even that
}
i.arg = flags;
f.out = f.i << 1;
// Special cases for exec machine.
if (((flags & RE2.FOLD_CASE) == 0 && runes.length == 1)
|| (runes.length == 2 && runes[0] == runes[1])) {
i.op = Inst.RUNE1;
} else if (runes.length == 2 && runes[0] == 0 && runes[1] == Unicode.MAX_RUNE) {
i.op = Inst.RUNE_ANY;
} else if (runes.length == 4
&& runes[0] == 0
&& runes[1] == '\n' - 1
&& runes[2] == '\n' + 1
&& runes[3] == Unicode.MAX_RUNE) {
i.op = Inst.RUNE_ANY_NOT_NL;
}
return f;
}
private static final int[] ANY_RUNE_NOT_NL = {0, '\n' - 1, '\n' + 1, Unicode.MAX_RUNE};
private static final int[] ANY_RUNE = {0, Unicode.MAX_RUNE};
private Frag compile(Regexp re) {
switch (re.op) {
case NO_MATCH:
return fail();
case EMPTY_MATCH:
return nop();
case LITERAL:
if (re.runes.length == 0) {
return nop();
} else {
Frag f = null;
for (int r : re.runes) {
Frag f1 = rune(r, re.flags);
f = (f == null) ? f1 : cat(f, f1);
}
return f;
}
case CHAR_CLASS:
return rune(re.runes, re.flags);
case ANY_CHAR_NOT_NL:
return rune(ANY_RUNE_NOT_NL, 0);
case ANY_CHAR:
return rune(ANY_RUNE, 0);
case BEGIN_LINE:
return empty(Utils.EMPTY_BEGIN_LINE);
case END_LINE:
return empty(Utils.EMPTY_END_LINE);
case BEGIN_TEXT:
return empty(Utils.EMPTY_BEGIN_TEXT);
case END_TEXT:
return empty(Utils.EMPTY_END_TEXT);
case WORD_BOUNDARY:
return empty(Utils.EMPTY_WORD_BOUNDARY);
case NO_WORD_BOUNDARY:
return empty(Utils.EMPTY_NO_WORD_BOUNDARY);
case CAPTURE:
{
Frag bra = cap(re.cap << 1), sub = compile(re.subs[0]), ket = cap(re.cap << 1 | 1);
return cat(cat(bra, sub), ket);
}
case STAR:
return star(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case PLUS:
return plus(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case QUEST:
return quest(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case CONCAT:
if (re.subs.length == 0) {
return nop();
} else {
Frag f = null;
for (Regexp sub : re.subs) {
Frag f1 = compile(sub);
f = (f == null) ? f1 : cat(f, f1);
}
return f;
}
case ALTERNATE:
{
if (re.subs.length == 0) {
return nop();
} else {
Frag f = null;
for (Regexp sub : re.subs) {
Frag f1 = compile(sub);
f = (f == null) ? f1 : alt(f, f1);
}
return f;
}
}
default:
throw new IllegalStateException("regexp: unhandled case in compile");
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy