com.google.re2j.Regexp Maven / Gradle / Ivy
The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/regexp.go
package com.google.re2j;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* Regular expression abstract syntax tree.
* Produced by parser, used by compiler.
* NB, this corresponds to {@code syntax.regexp} in the Go implementation;
* Go's {@code regexp} is called {@code RE2} in Java.
*/
class Regexp {
enum Op {
NO_MATCH, // Matches no strings.
EMPTY_MATCH, // Matches empty string.
LITERAL, // Matches runes[] sequence
CHAR_CLASS, // Matches Runes interpreted as range pair list
ANY_CHAR_NOT_NL, // Matches any character except '\n'
ANY_CHAR, // Matches any character
BEGIN_LINE, // Matches empty string at end of line
END_LINE, // Matches empty string at end of line
BEGIN_TEXT, // Matches empty string at beginning of text
END_TEXT, // Matches empty string at end of text
WORD_BOUNDARY, // Matches word boundary `\b`
NO_WORD_BOUNDARY, // Matches word non-boundary `\B`
CAPTURE, // Capturing subexpr with index cap, optional name name
STAR, // Matches subs[0] zero or more times.
PLUS, // Matches subs[0] one or more times.
QUEST, // Matches subs[0] zero or one times.
REPEAT, // Matches subs[0] [min, max] times; max=-1 => no limit.
CONCAT, // Matches concatenation of subs[]
ALTERNATE, // Matches union of subs[]
// Pseudo ops, used internally by Parser for parsing stack:
LEFT_PAREN,
VERTICAL_BAR;
boolean isPseudo() {
return ordinal() >= LEFT_PAREN.ordinal();
}
}
static final Regexp[] EMPTY_SUBS = {};
Op op; // operator
int flags; // bitmap of parse flags
Regexp[] subs; // subexpressions, if any. Never null.
// subs[0] is used as the freelist.
int[] runes; // matched runes, for LITERAL, CHAR_CLASS
int min, max; // min, max for REPEAT
int cap; // capturing index, for CAPTURE
String name; // capturing name, for CAPTURE
// Do update copy ctor when adding new fields!
Regexp(Op op) {
this.op = op;
}
// Shallow copy constructor.
Regexp(Regexp that) {
this.op = that.op;
this.flags = that.flags;
this.subs = that.subs;
this.runes = that.runes;
this.min = that.min;
this.max = that.max;
this.cap = that.cap;
this.name = that.name;
}
void reinit() {
this.flags = 0;
subs = EMPTY_SUBS;
runes = null;
cap = min = max = 0;
name = null;
}
@Override public String toString() {
StringBuilder out = new StringBuilder();
appendTo(out);
return out.toString();
}
private static void quoteIfHyphen(StringBuilder out, int rune) {
if (rune == '-') {
out.append('\\');
}
}
// appendTo() appends the Perl syntax for |this| regular expression to |out|.
private void appendTo(StringBuilder out) {
switch (op) {
case NO_MATCH:
out.append("[^\\x00-\\x{10FFFF}]");
break;
case EMPTY_MATCH:
out.append("(?:)");
break;
case STAR:
case PLUS:
case QUEST:
case REPEAT: {
Regexp sub = subs[0];
if (sub.op.ordinal() > Op.CAPTURE.ordinal() ||
sub.op == Op.LITERAL && sub.runes.length > 1) {
out.append("(?:");
sub.appendTo(out);
out.append(')');
} else {
sub.appendTo(out);
}
switch (op) {
case STAR:
out.append('*');
break;
case PLUS:
out.append('+');
break;
case QUEST:
out.append('?');
break;
case REPEAT:
out.append('{').append(min);
if (min != max) {
out.append(',');
if (max >= 0) {
out.append(max);
}
}
out.append('}');
break;
}
if ((flags & RE2.NON_GREEDY) != 0) {
out.append('?');
}
break;
}
case CONCAT:
for (Regexp sub : subs) {
if (sub.op == Op.ALTERNATE) {
out.append("(?:");
sub.appendTo(out);
out.append(')');
} else {
sub.appendTo(out);
}
}
break;
case ALTERNATE: {
String sep = "";
for (Regexp sub : subs) {
out.append(sep);
sep = "|";
sub.appendTo(out);
}
break;
}
case LITERAL:
if ((flags & RE2.FOLD_CASE) != 0) {
out.append("(?i:");
}
for (int rune : runes) {
Utils.escapeRune(out, rune);
}
if ((flags & RE2.FOLD_CASE) != 0) {
out.append(')');
}
break;
case ANY_CHAR_NOT_NL:
out.append("(?-s:.)");
break;
case ANY_CHAR:
out.append("(?s:.)");
break;
case CAPTURE:
if (name == null || name.isEmpty()) {
out.append('(');
} else {
out.append("(?P<");
out.append(name);
out.append(">");
}
if (subs[0].op != Op.EMPTY_MATCH) {
subs[0].appendTo(out);
}
out.append(')');
break;
case BEGIN_TEXT:
out.append("\\A");
break;
case END_TEXT:
if ((flags & RE2.WAS_DOLLAR) != 0) {
out.append("(?-m:$)");
} else {
out.append("\\z");
}
break;
case BEGIN_LINE:
out.append('^');
break;
case END_LINE:
out.append('$');
break;
case WORD_BOUNDARY:
out.append("\\b");
break;
case NO_WORD_BOUNDARY:
out.append("\\B");
break;
case CHAR_CLASS:
if (runes.length % 2 != 0) {
out.append("[invalid char class]");
break;
}
out.append('[');
if (runes.length == 0) {
out.append("^\\x00-\\x{10FFFF}");
} else if (runes[0] == 0 &&
runes[runes.length - 1] == Unicode.MAX_RUNE) {
// Contains 0 and MAX_RUNE. Probably a negated class.
// Print the gaps.
out.append('^');
for (int i = 1; i < runes.length - 1; i += 2) {
int lo = runes[i] + 1;
int hi = runes[i+1] - 1;
quoteIfHyphen(out, lo);
Utils.escapeRune(out, lo);
if (lo != hi) {
out.append('-');
quoteIfHyphen(out, hi);
Utils.escapeRune(out, hi);
}
}
} else {
for (int i = 0; i < runes.length; i += 2) {
int lo = runes[i];
int hi = runes[i + 1];
quoteIfHyphen(out, lo);
Utils.escapeRune(out, lo);
if (lo != hi) {
out.append('-');
quoteIfHyphen(out, hi);
Utils.escapeRune(out, hi);
}
}
}
out.append(']');
break;
default: // incl. pseudos
out.append(op);
break;
}
}
// maxCap() walks the regexp to find the maximum capture index.
int maxCap() {
int m = 0;
if (op == Op.CAPTURE) {
m = cap;
}
if (subs != null) {
for (Regexp sub : subs) {
int n = sub.maxCap();
if (m < n) {
m = n;
}
}
}
return m;
}
// namedGroupIndexes() walks the regexp to build a map of indexes for named groups.
Map namedGroupIndexes() {
Map indexes = new HashMap<>();
putNamedGroupIndexes(indexes);
return indexes;
}
private void putNamedGroupIndexes(Map indexes) {
if (op == Op.CAPTURE && name != null) {
indexes.put(name, cap);
}
if (subs != null) {
for (Regexp sub: subs) {
sub.putNamedGroupIndexes(indexes);
}
}
}
// equals() returns true if this and that have identical structure.
@Override public boolean equals(Object that) {
if (!(that instanceof Regexp)) {
return false;
}
Regexp x = this;
Regexp y = (Regexp) that;
if (x.op != y.op) {
return false;
}
switch (x.op) {
case END_TEXT:
// The parse flags remember whether this is \z or \Z.
if ((x.flags & RE2.WAS_DOLLAR) != (y.flags & RE2.WAS_DOLLAR)) {
return false;
}
break;
case LITERAL:
case CHAR_CLASS:
if (!Arrays.equals(x.runes, y.runes)) {
return false;
}
break;
case ALTERNATE:
case CONCAT:
if (x.subs.length != y.subs.length) {
return false;
}
for (int i = 0; i < x.subs.length; ++i) {
if (!x.subs[i].equals(y.subs[i])) {
return false;
}
}
break;
case STAR:
case PLUS:
case QUEST:
if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
!x.subs[0].equals(y.subs[0])) {
return false;
}
break;
case REPEAT:
if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
x.min != y.min || x.max != y.max || !x.subs[0].equals(y.subs[0])) {
return false;
}
break;
case CAPTURE:
if (x.cap != y.cap || x.name != y.name ||
!x.subs[0].equals(y.subs[0])) {
return false;
}
break;
}
return true;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy