com.google.re2j.Compiler Maven / Gradle / Ivy
The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/compile.go
package com.google.re2j;
import java.util.LinkedList;
import java.util.List;
import static com.google.re2j.Inst.Op.BYTE;
import static com.google.re2j.Inst.Op.BYTE1;
import static com.google.re2j.RE2.FOLD_CASE;
import static com.google.re2j.Unicode.RUNE_SELF;
import static com.google.re2j.Unicode.UTF_MAX;
import static com.google.re2j.Unicode.codePointToUtf8;
import static com.google.re2j.Unicode.maxRune;
import static com.google.re2j.Unicode.simpleFold;
/**
* Compiler from {@code Regexp} (RE2 abstract syntax) to {@code RE2} (compiled regular expression).
*
* The only entry point is {@link #compileRegexp}.
*/
class Compiler {
/**
* A fragment of a compiled regular expression program.
*
* @see http://swtch.com/~rsc/regexp/regexp1.html
*/
private static class Frag {
final int i; // an instruction address (pc).
int out; // a patch list; see explanation in Prog.java
Frag() {
this(0, 0);
}
Frag(int i) {
this(i, 0);
}
Frag(int i, int out) {
this.i = i;
this.out = out;
}
}
private final Prog prog = new Prog(); // Program being built
private boolean reversed;
private Compiler() {
newInst(Inst.Op.FAIL); // always the first instruction
}
static Prog compileRegexp(Regexp re, boolean reversed) {
Compiler c = new Compiler();
return c.compileToProgram(re, reversed);
}
private Frag newInst(Inst.Op op) {
// TODO(rsc): impose length limit.
prog.addInst(op);
return new Frag(prog.numInst() - 1);
}
// Returns a no-op fragment. Sometimes unavoidable.
private Frag nop() {
Frag f = newInst(Inst.Op.NOP);
f.out = f.i << 1;
return f;
}
private Frag match() {
return newInst(Inst.Op.MATCH);
}
private Frag fail() {
return new Frag();
}
// Given fragment a, returns (a) capturing as \n.
// Given a fragment a, returns a fragment with capturing parens around a.
private Frag cap(int arg) {
Frag f = newInst(Inst.Op.CAPTURE);
f.out = f.i << 1;
prog.getInst(f.i).arg = arg;
if (prog.numCap < arg + 1) {
prog.numCap = arg + 1;
}
return f;
}
// Given fragments a and b, returns ab; a|b
private Frag cat(Frag f1, Frag f2) {
// concat of failure is failure
if (f1.i == 0 || f2.i == 0) {
return fail();
}
if (reversed) {
prog.patch(f2.out, f1.i);
return new Frag(f2.i, f1.out);
} else {
prog.patch(f1.out, f2.i);
return new Frag(f1.i, f2.out);
}
}
// Given fragments for a and b, returns fragment for a|b.
private Frag alt(Frag f1, Frag f2) {
// alt of failure is other
if (f1.i == 0) {
return f2;
}
if (f2.i == 0) {
return f1;
}
Frag f = newInst(Inst.Op.ALT);
Inst i = prog.getInst(f.i);
i.out = f1.i;
i.arg = f2.i;
f.out = prog.append(f1.out, f2.out);
return f;
}
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
private Frag quest(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.Op.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
i.arg = f1.i;
f.out = f.i << 1;
} else {
i.out = f1.i;
f.out = f.i << 1 | 1;
}
f.out = prog.append(f.out, f1.out);
return f;
}
// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
private Frag star(Frag f1, boolean nongreedy) {
Frag f = newInst(Inst.Op.ALT);
Inst i = prog.getInst(f.i);
if (nongreedy) {
i.arg = f1.i;
f.out = f.i << 1;
} else {
i.out = f1.i;
f.out = f.i<<1 | 1;
}
prog.patch(f1.out, f.i);
return f;
}
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
private Frag plus(Frag f1, boolean nongreedy) {
return new Frag(f1.i, star(f1, nongreedy).out);
}
// op is a bitmask of EMPTY_* flags.
private Frag empty(int op) {
Frag f = newInst(Inst.Op.EMPTY_WIDTH);
prog.getInst(f.i).arg = op;
f.out = f.i << 1;
return f;
}
private Frag rune(int rune, int flags) {
if ((flags & FOLD_CASE) == 0) {
return rune(new int[]{rune}, flags);
} else {
return alt(rune(new int[]{rune}, flags), rune(new int[]{simpleFold(rune)}, flags));
}
}
// flags : parser flags
private Frag rune(int[] runes, int flags) {
if (runes.length == 1) {
return rune(runes[0], runes[0], flags);
} else if (runes.length > 1) {
List expandedRunes = new LinkedList<>();
for (int i = 0; i < runes.length; i += 2) {
expandRuneRange(runes[i], runes[i + 1], expandedRunes);
}
Frag alt = rune(expandedRunes.get(0), expandedRunes.get(1), flags);
for (int i = 2; i < expandedRunes.size(); i += 2) {
alt = alt(alt, rune(expandedRunes.get(i), expandedRunes.get(i + 1), flags));
}
return alt;
} else {
return fail();
}
}
private void expandRuneRange(int lo, int hi, List runes) {
// Split range into same-length sized ranges.
for (int i = 1; i < UTF_MAX; i++) {
int max = maxRune(i);
if (lo <= max && max < hi) {
expandRuneRange(lo, max, runes);
expandRuneRange(max + 1, hi, runes);
return;
}
}
// ASCII range is always a special case.
if (hi < RUNE_SELF) {
runes.add(lo);
runes.add(hi);
return;
}
// Split range into sections that agree on leading bytes.
for (int i = 1; i < UTF_MAX; i++) {
int m = (1 << (6 * i)) - 1; // last i bytes of a UTF-8 sequence
if ((lo & ~m) != (hi & ~m)) {
if ((lo & m) != 0) {
expandRuneRange(lo, lo | m, runes);
expandRuneRange((lo | m) + 1, hi, runes);
return;
}
if ((hi & m) != m) {
expandRuneRange(lo, (hi & ~m) - 1, runes);
expandRuneRange(hi & ~m, hi, runes);
return;
}
}
}
runes.add(lo);
runes.add(hi);
}
private Frag rune(int lo, int hi, int flags) {
return bytes(codePointToUtf8(lo), codePointToUtf8(hi), flags);
}
private Frag bytes(byte[] lo, byte[] hi, int flags) {
Frag prefix = byteRange(new byte[]{lo[0], hi[0]}, flags);
for (int i = 1; i < lo.length; ++i) {
prefix = cat(prefix, byteRange(new byte[]{lo[i], hi[i]}, flags));
}
return prefix;
}
private Frag byteRange(byte[] byteRanges, int flags) {
Frag f = newInst(BYTE);
Inst i = prog.getInst(f.i);
i.byteRanges = byteRanges;
i.arg = flags;
f.out = f.i << 1;
// Special cases for exec machine.
if (byteRanges.length == 1 || (byteRanges.length == 2 && byteRanges[0] == byteRanges[1])) {
i.op = BYTE1;
i.byteRanges = new byte[]{byteRanges[0]};
}
return f;
}
private static final int[] ANY_RUNE_NOT_NL = {
0, '\n' - 1, '\n' + 1, Unicode.MAX_RUNE
};
private static final int[] ANY_RUNE = { 0, Unicode.MAX_RUNE };
private static final byte[] ANY_BYTE = { 0, (byte) 0xff };
private Prog compileToProgram(Regexp re, boolean reversed) {
this.reversed = reversed;
Frag f = compile(re);
this.reversed = false;
// Finish by putting Match node at end, and record start.
// Turn off c.reversed (if it is set) to force the remaining concatenations to behave normally.
Frag all = cat(f, match());
prog.start = all.i;
// Also create unanchored version, which starts with a .*? loop.
if ((prog.startCond() & Utils.EMPTY_BEGIN_TEXT) == 0) {
Frag unanchored = cat(star(byteRange(ANY_BYTE, 0), true), all);
prog.startUnanchored = unanchored.i;
} else {
prog.startUnanchored = prog.start;
}
return prog;
}
private Frag compile(Regexp re) {
switch (re.op) {
case NO_MATCH:
return fail();
case EMPTY_MATCH:
return nop();
case LITERAL:
if (re.runes.length == 0) {
return nop();
} else {
Frag f = null;
for (int r : re.runes) {
Frag f1 = rune(r, re.flags);
f = (f == null) ? f1 : cat(f, f1);
}
return f;
}
case CHAR_CLASS:
return rune(re.runes, re.flags);
case ANY_CHAR_NOT_NL:
return rune(ANY_RUNE_NOT_NL, 0);
case ANY_CHAR:
return rune(ANY_RUNE, 0);
case BEGIN_LINE:
return empty(reversed ? Utils.EMPTY_END_LINE : Utils.EMPTY_BEGIN_LINE);
case END_LINE:
return empty(reversed ? Utils.EMPTY_BEGIN_LINE : Utils.EMPTY_END_LINE);
case BEGIN_TEXT:
return empty(reversed ? Utils.EMPTY_END_TEXT : Utils.EMPTY_BEGIN_TEXT);
case END_TEXT:
return empty(reversed ? Utils.EMPTY_BEGIN_TEXT : Utils.EMPTY_END_TEXT);
case WORD_BOUNDARY:
return empty(Utils.EMPTY_WORD_BOUNDARY);
case NO_WORD_BOUNDARY:
return empty(Utils.EMPTY_NO_WORD_BOUNDARY);
case CAPTURE: {
Frag bra = cap(re.cap << 1),
sub = compile(re.subs[0]),
ket = cap(re.cap << 1 | 1);
return cat(cat(bra, sub), ket);
}
case STAR:
return star(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case PLUS:
return plus(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case QUEST:
return quest(compile(re.subs[0]), (re.flags & RE2.NON_GREEDY) != 0);
case CONCAT:
if (re.subs.length == 0) {
return nop();
} else {
Frag f = null;
for (Regexp sub : re.subs) {
Frag f1 = compile(sub);
f = (f == null) ? f1 : cat(f, f1);
}
return f;
}
case ALTERNATE: {
if (re.subs.length == 0) {
return nop();
} else {
Frag f = null;
for (Regexp sub : re.subs) {
Frag f1 = compile(sub);
f = (f == null) ? f1 : alt(f, f1);
}
return f;
}
}
default:
throw new IllegalStateException("regexp: unhandled case in compile");
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy