com.google.javascript.jscomp.regex.RegExpTree Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of closure-compiler-unshaded Show documentation
Show all versions of closure-compiler-unshaded Show documentation
Closure Compiler is a JavaScript optimizing compiler. It parses your
JavaScript, analyzes it, removes dead code and rewrites and minimizes
what's left. It also checks syntax, variable references, and types, and
warns about common JavaScript pitfalls. It is used in many of Google's
JavaScript apps, including Gmail, Google Web Search, Google Maps, and
Google Docs.
/*
* Copyright 2011 The Closure Compiler Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.javascript.jscomp.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Math.min;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import org.jspecify.nullness.Nullable;
/**
* An AST for JavaScript regular expressions.
*/
public abstract class RegExpTree {
/**
* Returns a simpler regular expression that is semantically the same assuming
* the given flags.
* @param flags Regular expression flags, e.g. {@code "igm"}.
*/
public abstract RegExpTree simplify(String flags);
/**
* True if the presence or absence of an {@code "i"} flag would change the
* meaning of this regular expression.
*/
public abstract boolean isCaseSensitive();
/**
* True if the regular expression contains an anchor : {@code ^} or {@code $}.
*/
public abstract boolean containsAnchor();
/**
* True if the regular expression contains capturing groups.
*/
public final boolean hasCapturingGroup() {
return numCapturingGroups() != 0;
}
/**
* The number of capturing groups.
*/
public abstract int numCapturingGroups();
/**
* The children of this node.
*/
public abstract List extends RegExpTree> children();
/**
* Appends this regular expression source to the given buffer.
*/
protected abstract void appendSourceCode(StringBuilder sb);
protected abstract void appendDebugInfo(StringBuilder sb);
@Override
public final String toString() {
StringBuilder sb = new StringBuilder();
sb.append('/');
appendSourceCode(sb);
// Don't emit a regular expression that looks like a line comment start.
if (sb.length() == 1) {
sb.append("(?:)");
}
sb.append('/');
return sb.toString();
}
@Override
public abstract boolean equals(Object o);
@Override
public abstract int hashCode();
private enum ParentheticalType {
CAPTURING,
NONCAPTURING,
POSITIVE_LOOKAHEAD,
NEGATIVE_LOOKAHEAD,
POSITIVE_LOOKBEHIND,
NEGATIVE_LOOKBEHIND,
NAMED_GROUPS,
}
/**
* Parses a regular expression to an AST.
*
* @param pattern The {@code foo} From {@code /foo/i}.
* @param flags The {@code i} From {@code /foo/i}.
*/
public static RegExpTree parseRegExp(
final String pattern, final String flags) {
/** A recursive descent parser that closes over pattern and flags above. */
class Parser {
/** The number of characters in pattern consumed. */
int pos;
/** The number of capturing groups seen so far. */
int numCapturingGroups;
/** The names of capturing groups in the regex expression */
Set capturingGroupNames = new HashSet<>();
/** The length of pattern. */
final int limit = pattern.length();
/** Boolean indicating whether we should look for named capture group backreferences */
boolean lookForNamedCaptureBackreferences;
RegExpTree parseTopLevel() {
// First assume there are no named capture backreferences,
// because the spec says they should only be recognized
// if the pattern contains at least one named capture group.
this.pos = 0;
this.numCapturingGroups = 0;
this.lookForNamedCaptureBackreferences = false;
RegExpTree out = parse();
// If there were named capture groups, we must parse the pattern string again
// so we can check for any backreferences to them.
if (!capturingGroupNames.isEmpty()) {
this.pos = 0;
this.numCapturingGroups = 0;
this.lookForNamedCaptureBackreferences = true;
out = parse();
}
if (pos < limit) { // Unmatched closed paren maybe.
throw new IllegalArgumentException(pattern.substring(pos));
}
return out;
}
RegExpTree parse() {
// Collects ["foo", "bar", "baz"] for /foo|bar|baz/.
ImmutableList.Builder alternatives = null;
// The last item parsed within an alternation.
RegExpTree preceder = null;
topLoop:
while (pos < limit) {
char ch = pattern.charAt(pos);
RegExpTree atom;
switch (ch) {
case '[':
atom = parseCharset();
break;
case '(':
atom = parseParenthetical();
break;
case ')':
break topLoop;
case '\\':
atom = parseEscape();
break;
case '^':
case '$':
atom = new Anchor(ch);
++pos;
break;
case '.':
// We represent . as a character set to make it easy to simplify
// things like /.|[\r\n]/.
atom = DOT_CHARSET;
++pos;
break;
case '|':
// An alternative may be empty as in /foo||bar/.
// The '|' is consumed below.
atom = Empty.INSTANCE;
break;
default:
// Find a run of concatenated characters to avoid building a
// tree node per literal character.
int start = pos;
int end = pos + 1;
charsLoop:
while (end < limit) {
switch (pattern.charAt(end)) {
case '[':
case '(':
case ')':
case '\\':
case '^':
case '$':
case '|':
case '.':
case '*':
case '+':
case '?':
case '{':
break charsLoop;
default:
// Repetition binds more tightly than concatenation.
// Only consume up to "foo" in /foob*/ so that the suffix
// operator parser below has the right precedence.
if (end + 1 >= limit
|| !isRepetitionStart(pattern.charAt(end + 1))) {
++end;
} else {
break charsLoop;
}
}
}
atom = new Text(pattern.substring(start, end));
pos = end;
break;
}
if (pos < limit && isRepetitionStart(pattern.charAt(pos))) {
atom = parseRepetition(atom);
}
if (preceder == null) {
preceder = atom;
} else {
preceder = new Concatenation(preceder, atom);
}
// If this is an alternative in a alternation, then add it to the
// list of complete alternatives, and reset the parser state for the
// next alternative.
if (pos < limit && pattern.charAt(pos) == '|') {
if (alternatives == null) {
alternatives = ImmutableList.builder();
}
alternatives.add(preceder);
preceder = null;
++pos;
}
}
// An alternative may have no parsed content blank as in /foo|/.
if (preceder == null) { preceder = Empty.INSTANCE; }
if (alternatives != null) {
alternatives.add(preceder);
return new Alternation(alternatives.build());
} else {
return preceder;
}
}
/**
* Handles capturing groups {@code (...)},
* non-capturing groups {@code (?:...)}, and lookahead assertions
* {@code (?=...)}.
*/
private RegExpTree parseParenthetical() {
checkState(pattern.charAt(pos) == '(');
int start = pos;
++pos;
ParentheticalType type;
String captureName = null;
if (pos < limit && pattern.charAt(pos) == '?') {
if (pos + 1 < limit) {
char ch = pattern.charAt(pos + 1);
switch (ch) {
// (?:...) Non-capturing groups.
case ':':
pos += 2;
type = ParentheticalType.NONCAPTURING;
break;
// (?=...) and (?!...) Lookahead Assertions
case '=':
pos += 2;
type = ParentheticalType.POSITIVE_LOOKAHEAD;
break;
case '!':
pos += 2;
type = ParentheticalType.NEGATIVE_LOOKAHEAD;
break;
// (?<=...) and (?) named groups
case '<':
if (pos + 2 < limit && pattern.charAt(pos + 2) == '=') {
pos += 3;
type = ParentheticalType.POSITIVE_LOOKBEHIND;
break;
} else if (pos + 2 < limit && pattern.charAt(pos + 2) == '!') {
pos += 3;
type = ParentheticalType.NEGATIVE_LOOKBEHIND;
break;
} else {
pos += 2;
captureName = scanNamedGroupName();
capturingGroupNames.add(captureName);
type = ParentheticalType.NAMED_GROUPS;
break;
}
default:
throw new IllegalArgumentException(
"Malformed parenthetical: " + pattern.substring(start));
}
} else {
throw new IllegalArgumentException(
"Malformed parenthetical: " + pattern.substring(start));
}
} else {
type = ParentheticalType.CAPTURING;
}
RegExpTree body = parse();
if (pos < limit && pattern.charAt(pos) == ')') {
++pos;
} else {
throw new IllegalArgumentException(
"Unclosed parenthetical group: " + pattern.substring(start));
}
switch (type) {
case CAPTURING:
++numCapturingGroups;
return new CapturingGroup(body);
case NONCAPTURING:
return body;
case POSITIVE_LOOKAHEAD:
return new LookaheadAssertion(body, true);
case NEGATIVE_LOOKAHEAD:
return new LookaheadAssertion(body, false);
case POSITIVE_LOOKBEHIND:
return new LookbehindAssertion(body, true);
case NEGATIVE_LOOKBEHIND:
return new LookbehindAssertion(body, false);
case NAMED_GROUPS:
if (captureName != null) {
++numCapturingGroups;
return new NamedCaptureGroup(body, captureName);
} else {
throw new IllegalArgumentException(
"Malformed named capture group: " + pattern.substring(start));
}
}
throw new AssertionError("Unrecognized ParentheticalType " + type);
}
/**
* Helper that scans the pattern for a named group name. Assumes that {@code pos} points to
* the character after '<'
*
* @return the group name
*/
private String scanNamedGroupName() {
int start = pos;
int end = pos;
if (!isIdentifierStart(pattern.charAt(start))) {
throw new IllegalArgumentException(
"Invalid capture group name: <" + pattern.substring(start));
}
++end;
while (end < limit) {
if (pattern.charAt(end) == '>') {
pos = end + 1;
return pattern.substring(start, end);
} else if (isIdentifierPart(pattern.charAt(end))) {
++end;
} else {
throw new IllegalArgumentException(
"Invalid capture group name: <" + pattern.substring(start));
}
}
throw new IllegalArgumentException(
"Malformed named capture group: <" + pattern.substring(start));
}
/**
* Parses a square bracketed character set.
* Standalone character groups (@code /\d/} are handled by
* {@link #parseEscape}.
*/
private RegExpTree parseCharset() {
checkState(pattern.charAt(pos) == '[');
++pos;
boolean isCaseInsensitive = flags.indexOf('i') >= 0;
boolean inverse = pos < limit && pattern.charAt(pos) == '^';
if (inverse) { ++pos; }
CharRanges ranges = CharRanges.EMPTY;
CharRanges ieExplicits = CharRanges.EMPTY;
while (pos < limit && pattern.charAt(pos) != ']') {
char ch = pattern.charAt(pos);
int start;
if (ch == '\\') {
++pos;
char possibleGroupName = pattern.charAt(pos);
CharRanges group = NAMED_CHAR_GROUPS.get(possibleGroupName);
if (group != null) {
++pos;
ranges = ranges.union(group);
continue;
}
start = parseEscapeChar();
} else {
start = ch;
++pos;
}
int end = start;
if (pos + 1 < limit && pattern.charAt(pos) == '-'
&& pattern.charAt(pos + 1) != ']') {
++pos;
ch = pattern.charAt(pos);
if (ch == '\\') {
++pos;
end = parseEscapeChar();
} else {
end = ch;
++pos;
}
}
CharRanges range = CharRanges.inclusive(start, end);
ranges = ranges.union(range);
if (IE_SPEC_ERRORS.contains(start) && IE_SPEC_ERRORS.contains(end)) {
ieExplicits = ieExplicits.union(range.intersection(IE_SPEC_ERRORS));
}
if (isCaseInsensitive) {
// If the flags contain the 'i' flag, then it is not correct to
// say that [^a-z] contains the letter 'A', or that [a-z] does not
// contain the letter 'A'.
// We expand out letter groups here so that parse returns something
// that is valid independent of flags.
// Calls to simplify(flags) may later reintroduce flag assumptions.
// but without this step, later steps might conflate
// /[a-z]/i
// and
// /[^\0-`{-\uffff]/i
// which matches nothing because the information about whether the
// ^ is present has been lost during optimizations and charset
// unionizing as in /[...]|[^...]/.
ranges = CaseCanonicalize.expandToAllMatched(ranges);
}
}
++pos; // Consume ']'
if (inverse) {
ranges = CharRanges.ALL_CODE_UNITS.difference(ranges);
}
return new Charset(ranges, ieExplicits);
}
/**
* Parses an escape to a code point. Some of the characters parsed here have special meanings
* in various contexts, so contexts must filter those instead. E.g. '\b' means a different
* thing inside a charset than without.
*/
private int parseEscapeChar() {
char ch = pattern.charAt(pos++);
switch (ch) {
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'u':
return (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{')
? parseBracedUnicodeEscape()
: parseHex(4);
case 'v': return '\u000b';
case 'x': return parseHex(2);
default:
if ('0' <= ch && ch <= '7') {
char codeUnit = (char) (ch - '0');
// Allow octal literals in the range \0-\377.
// \41 might be a group, but \041 is not a group.
// We read, but do not emit octal literals since they
// are deprecated in ES5.
int octLimit = min(limit, pos + (ch <= '3' ? 2 : 1) + (ch == '0' ? 1 : 0));
while (pos < octLimit) {
ch = pattern.charAt(pos);
if ('0' <= ch && ch <= '7') {
codeUnit = (char) ((codeUnit << 3) + (ch - '0'));
++pos;
} else {
break;
}
}
return codeUnit;
}
return ch;
}
}
/**
* Parses an escape that appears outside a charset.
*/
private RegExpTree parseEscape() {
checkState(pattern.charAt(pos) == '\\');
int start = pos;
++pos;
char ch = pattern.charAt(pos);
if (ch == 'b' || ch == 'B') {
++pos;
return new WordBoundary(ch);
} else if ((ch == 'p' || ch == 'P') && flags.contains("u")) {
// handle ES2018 unicode property tests, e.g.
// /\p{ASCII_Hex_Digit=true}/ only hex digits
// /\P{Script=Greek}/ no greek letters
boolean negated = ch == 'P';
++pos;
if (pos < limit && pattern.charAt(pos) == '{') {
StringBuilder lhs = new StringBuilder();
while (++pos < limit
&& ((ch = pattern.charAt(pos)) == '_'
|| ('a' <= ch && ch <= 'z')
|| ('A' <= ch && ch <= 'Z')
|| ('0' <= ch && ch <= '9'))) {
lhs.append(ch);
}
if (pos < limit && ch == '}') {
// Case of shorthand like /\p{ASCII_Hex_Digit}/u
++pos;
return new UnicodePropertyEscape(null, lhs.toString(), negated);
} else if (pos < limit && ch == '=') {
// Case of having '=' like /\p{Script=Greek}/u
StringBuilder rhs = new StringBuilder();
while (++pos < limit
&& ((ch = pattern.charAt(pos)) == '_'
|| ('a' <= ch && ch <= 'z')
|| ('A' <= ch && ch <= 'Z')
|| ('0' <= ch && ch <= '9'))) {
rhs.append(ch);
}
if (pos < limit && ch == '}') {
++pos;
return new UnicodePropertyEscape(lhs.toString(), rhs.toString(), negated);
} else {
throw new IllegalArgumentException(
"Malformed Unicode Property Escape: expected '}' after "
+ pattern.substring(start, pos));
}
} else {
throw new IllegalArgumentException(
"Malformed Unicode Property Escape: expected '=' or '}' after "
+ pattern.substring(start, pos));
}
} else {
throw new IllegalArgumentException(
"Malformed Unicode Property Escape: expected '{' after "
+ pattern.substring(start, pos));
}
} else if ('1' <= ch && ch <= '9') {
++pos;
int possibleGroupIndex = ch - '0';
if (numCapturingGroups >= possibleGroupIndex) {
if (pos < limit) {
char next = pattern.charAt(pos);
if ('0' <= next && next <= '9') {
int twoDigitGroupIndex = possibleGroupIndex * 10 + (next - '0');
if (numCapturingGroups >= twoDigitGroupIndex) {
++pos;
possibleGroupIndex = twoDigitGroupIndex;
}
}
}
return new BackReference(possibleGroupIndex);
} else {
// \1 - \7 are octal escapes if there is no such group.
// \8 and \9 are the literal characters '8' and '9' if there
// is no such group.
return new Text(Character.toString(
possibleGroupIndex <= 7 ? (char) possibleGroupIndex : ch));
}
} else if (lookForNamedCaptureBackreferences
&& ch == 'k'
&& pos + 1 < limit
&& pattern.charAt(pos + 1) == '<'
// According to the spec
// https://github.com/tc39/proposal-regexp-named-groups#backwards-compatibility-of-new-syntax
// we want to treat \k as a normal string if there are no named capturing groups present
&& !capturingGroupNames.isEmpty()) {
pos += 2;
String potentialName = scanNamedGroupName();
if (!capturingGroupNames.contains(potentialName)) {
throw new IllegalArgumentException(
"Invalid named capture referenced: " + pattern.substring(start));
}
return new NamedBackReference(potentialName);
} else {
CharRanges charGroup = NAMED_CHAR_GROUPS.get(ch);
if (charGroup != null) { // Handle \d, etc.
++pos;
return new Charset(charGroup, CharRanges.EMPTY);
}
return new Text(new String(Character.toChars(parseEscapeChar())));
}
}
/** Parses n hex digits to a code-unit. */
private int parseHex(int n) {
if (pos + n > limit) {
throw new IllegalArgumentException(
"Abbreviated hex escape " + pattern.substring(pos));
}
if (n > 7) {
// We need to guard the MSB to prevent overflow.
throw new IllegalArgumentException(
"Cannot parse hexadecimal encoding wider than 28 bits: "
+ pattern.substring(pos, pos + n));
}
int result = 0;
while (--n >= 0) {
char ch = pattern.charAt(pos);
int digit;
if ('0' <= ch && ch <= '9') {
digit = ch - '0';
} else if ('a' <= ch && ch <= 'f') {
digit = ch + (10 - 'a');
} else if ('A' <= ch && ch <= 'F') {
digit = ch + (10 - 'A');
} else {
throw new IllegalArgumentException(pattern.substring(pos));
}
++pos;
result = (result << 4) | digit;
}
return result;
}
private int parseBracedUnicodeEscape() {
int openBrace = pos;
checkState(pattern.charAt(pos++) == '{');
int closeBrace = pos;
while (closeBrace < limit && pattern.charAt(closeBrace) != '}') {
closeBrace++;
}
if (closeBrace == limit) {
throw new IllegalArgumentException(
"Malformed unicode escape: expected '}' after " + pattern.substring(openBrace));
} else if (closeBrace == pos) {
throw new IllegalArgumentException("Empty unicode escape");
}
int result = parseHex(closeBrace - pos);
if (result > 0x10FFFF) {
throw new IllegalArgumentException(
"Unicode must be at most 0x10FFFF: " + pattern.substring(openBrace + 1, pos));
}
pos++; // Consume the close brace.
return result;
}
private boolean isRepetitionStart(char ch) {
switch (ch) {
case '?':
case '*':
case '+':
case '{':
return true;
default:
return false;
}
}
/**
* Parse a repetition. {@code x?} is treated as a repetition --
* an optional production can be matched 0 or 1 time.
*/
private RegExpTree parseRepetition(RegExpTree body) {
if (pos == limit) { return body; }
int min;
int max;
switch (pattern.charAt(pos)) {
case '+':
++pos;
min = 1;
max = Integer.MAX_VALUE;
break;
case '*':
++pos;
min = 0;
max = Integer.MAX_VALUE;
break;
case '?':
++pos;
min = 0;
max = 1;
break;
case '{':
++pos;
int start = pos;
int end = pattern.indexOf('}', start);
if (end < 0) {
pos = start - 1;
return body;
}
String counts = pattern.substring(start, end);
pos = end + 1;
int comma = counts.indexOf(',');
try {
min = Integer.parseInt(
comma >= 0 ? counts.substring(0, comma) : counts);
max = comma >= 0
? comma + 1 != counts.length()
? Integer.parseInt(counts.substring(comma + 1))
: Integer.MAX_VALUE
: min;
} catch (NumberFormatException ex) {
min = max = -1;
}
if (min < 0 || min > max) {
// Treat the open curly bracket literally.
pos = start - 1;
return body;
}
break;
default:
return body;
}
boolean greedy = true;
if (pos < limit && pattern.charAt(pos) == '?') {
greedy = false;
++pos;
}
return new Repetition(body, min, max, greedy);
}
}
return new Parser().parseTopLevel();
}
/**
* @param ch the character
* @return true if the character is a valid Javascript identifier start character.
*/
private static boolean isIdentifierStart(char ch) {
// TODO(yitingwang) This and the one in Scanner.java should share the same implementation
// Most code is written in pure ASCII create a fast path here.
if (ch <= 127) {
return ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch == '_' || ch == '$'));
}
// Workaround b/36459436
// When running under GWT, Character.isLetter only handles ASCII
// Angular relies heavily on U+0275 (Latin Barred O)
return ch == 0x0275
// TODO: UnicodeLetter also includes Letter Number (NI)
|| Character.isLetter(ch);
}
/**
* @param ch the character
* @return true if the character is allowed in Javascript identifiers.
*/
private static boolean isIdentifierPart(char ch) {
// TODO(yitingwang) This and the one in Scanner.java should share the same implementation
// Most code is written in pure ASCII create a fast path here.
if (ch <= 127) {
return ((ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z')
|| (ch >= '0' && ch <= '9')
|| (ch == '_' || ch == '$')); // _ or $
}
// TODO: identifier part character classes
// CombiningMark
// Non-Spacing mark (Mn)
// Combining spacing mark(Mc)
// Connector punctuation (Pc)
// Zero Width Non-Joiner
// Zero Width Joiner
return isIdentifierStart(ch) || Character.isDigit(ch);
}
/**
* True if, but not necessarily always when the, given regular expression
* must match the whole input or none of it.
*/
public static boolean matchesWholeInput(RegExpTree t, String flags) {
if (flags.indexOf('m') >= 0) { return false; }
if (!(t instanceof Concatenation)) {
return false;
}
Concatenation c = (Concatenation) t;
if (c.elements.isEmpty()) { return false; }
RegExpTree first = c.elements.get(0);
RegExpTree last = Iterables.getLast(c.elements);
if (!(first instanceof Anchor && last instanceof Anchor)) { return false; }
return ((Anchor) first).type == '^' && ((Anchor) last).type == '$';
}
/** Represents a node that never has children such as an anchor or charset. */
public abstract static class RegExpTreeAtom extends RegExpTree {
@Override
public boolean isCaseSensitive() {
return false;
}
@Override
public boolean containsAnchor() {
return false;
}
@Override
public final int numCapturingGroups() {
return 0;
}
@Override
public final ImmutableList extends RegExpTree> children() {
return ImmutableList.of();
}
}
/** Represents an empty portion of a RegExp such as the middle of "||" */
public static final class Empty extends RegExpTreeAtom {
static final Empty INSTANCE = new Empty();
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
// No output
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// No output
}
@Override
public boolean equals(Object o) {
return o instanceof Empty;
}
@Override
public int hashCode() {
return 0x7ee06141;
}
}
/** Represents an anchor, namely ^ or $. */
public static final class Anchor extends RegExpTreeAtom {
final char type;
Anchor(char type) { this.type = type; }
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
public boolean containsAnchor() {
return true;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(type);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(type);
}
@Override
public boolean equals(Object o) {
return o instanceof Anchor && type == ((Anchor) o).type;
}
@Override
public int hashCode() {
return type ^ 0xe85317ff;
}
}
/** Represents \b or \B */
public static final class WordBoundary extends RegExpTreeAtom {
final char type;
WordBoundary(char type) {
this.type = type;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('\\').append(type);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(type);
}
@Override
public boolean equals(Object o) {
return o instanceof WordBoundary && type == ((WordBoundary) o).type;
}
@Override
public int hashCode() {
return 0x5673aa29 ^ type;
}
}
/** Represents a reference to a previous group such as \1 or \2 */
public static final class BackReference extends RegExpTreeAtom {
final int groupIndex;
BackReference(int groupIndex) {
checkArgument(groupIndex >= 0 && groupIndex <= 99);
this.groupIndex = groupIndex;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('\\').append(groupIndex);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(groupIndex);
}
@Override
public boolean equals(Object o) {
return o instanceof BackReference
&& groupIndex == ((BackReference) o).groupIndex;
}
@Override
public int hashCode() {
return 0xff072663 ^ groupIndex;
}
}
/** Represents a reference to a previous named group */
public static final class NamedBackReference extends RegExpTreeAtom {
final String groupName;
NamedBackReference(String groupName) {
this.groupName = groupName;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append("\\k<").append(groupName).append('>');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(groupName);
}
@Override
public boolean equals(Object o) {
return o instanceof NamedBackReference
&& groupName.equals(((NamedBackReference) o).groupName);
}
@Override
public int hashCode() {
return Objects.hashCode(groupName);
}
}
/** Represents a run of non-special characters such as "foobar" */
public static final class Text extends RegExpTreeAtom {
final String text;
Text(String text) {
this.text = text;
}
/**
* @param ch The code-unit to escape.
* @param next The next code-unit or -1 if indeterminable.
*/
private static void escapeRegularCharOnto(
char ch, int next, StringBuilder sb) {
switch (ch) {
case '$':
case '^':
case '*':
case '(':
case ')':
case '+':
case '[':
case '|':
case '.':
case '/':
case '?':
sb.append('\\').append(ch);
break;
case '{':
// If possibly part of a repetition, then escape.
// Concatenation is handled by the digitsMightBleed check.
if ('0' <= next && next <= '9') {
sb.append('\\');
}
sb.append(ch);
break;
default:
escapeCharOnto(ch, sb);
}
}
@Override
public RegExpTree simplify(String flags) {
int n = text.length();
if (n == 0) {
return Empty.INSTANCE;
}
if (flags.indexOf('i') >= 0) {
String canonicalized = CaseCanonicalize.caseCanonicalize(text);
if (!text.equals(canonicalized)) {
return new Text(canonicalized);
}
}
return this;
}
@Override
public boolean isCaseSensitive() {
for (int i = 0, n = text.length(); i < n; ++i) {
if (CaseCanonicalize.CASE_SENSITIVE.contains(text.charAt(i))) {
return true;
}
}
return false;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
for (int i = 0, n = text.length(); i < n; ++i) {
escapeRegularCharOnto(text.charAt(i), i + 1 < n ? text.charAt(i + 1) : -1, sb);
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append('`').append(text).append('`');
}
@Override
public boolean equals(Object o) {
return o instanceof Text && text.equals(((Text) o).text);
}
@Override
public int hashCode() {
return text.hashCode() ^ 0x617e310;
}
}
/** Represents a repeating item such as ...+, ...*, or ...{0,1} */
public static final class Repetition extends RegExpTree {
final RegExpTree body;
final int min;
final int max;
final boolean greedy;
Repetition(RegExpTree body, int min, int max, boolean greedy) {
this.body = body;
this.min = min;
this.max = max;
this.greedy = greedy;
}
@Override
public RegExpTree simplify(String flags) {
RegExpTree body = this.body.simplify(flags);
if (max == 0 && !body.hasCapturingGroup()) { return Empty.INSTANCE; }
if (body instanceof Empty || NEVER_MATCHES.equals(body)) { return body; }
int min = this.min;
int max = this.max;
if (body instanceof Repetition) {
Repetition rbody = (Repetition) body;
if (rbody.greedy == greedy) {
long lmin = ((long) min) * rbody.min;
long lmax = ((long) max) * rbody.max;
if (lmin < Integer.MAX_VALUE) {
body = rbody.body;
min = (int) lmin;
max = lmax >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) lmax;
}
}
}
if (min == 1 && max == 1) { return body; }
boolean greedy = this.greedy || min == max;
return body.equals(this.body) && min == this.min && max == this.max
&& greedy == this.greedy
? this
: new Repetition(body, min, max, greedy).simplify(flags);
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return body.numCapturingGroups();
}
@Override
public ImmutableList extends RegExpTree> children() {
return ImmutableList.of(body);
}
private void appendBodySourceCode(StringBuilder sb) {
if (body instanceof Alternation || body instanceof Concatenation
|| body instanceof Repetition
|| (body instanceof Text && ((Text) body).text.length() > 1)) {
sb.append("(?:");
body.appendSourceCode(sb);
sb.append(')');
} else {
body.appendSourceCode(sb);
}
}
private static int suffixLen(int min, int max) {
// This mirrors the branches that renders a suffix in appendSourceCode below.
if (max == Integer.MAX_VALUE) {
switch (min) {
case 0: // *
case 1: // +
return 1;
default:
return 3 + numDecimalDigits(min); // {3,}
}
}
if (min == 0 && max == 1) {
return 1; // ?
}
if (min == max) {
if (min == 1) {
return 0; // No suffix needed for {1}.
}
return 2 + numDecimalDigits(min); // {4}
}
return 3 + numDecimalDigits(min) + numDecimalDigits(max); // {2,7}
}
private static int numDecimalDigits(int n) {
if (n < 0) {
// Negative values should not be passed in.
throw new AssertionError();
// If changing this code to support negative values,
// Integer.MIN_VALUE is a corner-case..
}
int nDigits = 1;
while (n >= 10) {
++nDigits;
n /= 10;
}
return nDigits;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
int bodyStart = sb.length();
appendBodySourceCode(sb);
int bodyEnd = sb.length();
int bodyLen = bodyEnd - bodyStart;
int min = this.min;
int max = this.max;
if ((min >= 2 && max == Integer.MAX_VALUE) || max - min <= 1) {
int expanded =
// If min == max then we want to try expanding to the limit and
// attach the empty suffix, which is equivalent to min = max = 1,
// i.e. /a/ vs /a{1}/.
min == max
// Give aa+ preference over aaa*.
|| max == Integer.MAX_VALUE
? min - 1
: min;
int expandedMin = min - expanded;
int expandedMax = max == Integer.MAX_VALUE ? max : max - expanded;
int suffixLen = suffixLen(min, max);
int expandedSuffixLen = suffixLen(expandedMin, expandedMax);
if (bodyLen * expanded + expandedSuffixLen < suffixLen
&& !body.hasCapturingGroup()) {
// a{2} -> aa
// a{2,} -> aa+
// a{2,3} -> aaa?
while (--expanded >= 0) {
sb.append(sb, bodyStart, bodyEnd);
}
min = expandedMin;
max = expandedMax;
}
}
if (max == Integer.MAX_VALUE) {
switch (min) {
case 0: sb.append('*'); break;
case 1: sb.append('+'); break;
default:
sb.append('{').append(min).append(",}");
}
} else if (min == 0 && max == 1) {
sb.append('?');
} else if (min == max) {
if (min != 1) {
sb.append('{').append(min).append('}');
}
} else {
sb.append('{').append(min).append(',').append(max).append('}');
}
if (!greedy) {
sb.append('?');
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(" min=").append(min).append(", max=").append(max);
if (!greedy) { sb.append(" not_greedy"); }
}
@Override
public boolean equals(Object o) {
if (!(o instanceof Repetition)) { return false; }
Repetition that = (Repetition) o;
return this.body.equals(that.body)
&& this.min == that.min
&& this.max == that.max
&& this.greedy == that.greedy;
}
@Override
public int hashCode() {
return min + 31 * (max + 31 * ((greedy ? 1 : 0) + 31 * body.hashCode()));
}
}
/** Represents the possibilities ["foo", "bar" ] for a RegExp /foo|bar/ */
public static final class Alternation extends RegExpTree {
final ImmutableList alternatives;
Alternation(List extends RegExpTree> alternatives) {
this.alternatives = ImmutableList.copyOf(alternatives);
}
@Override
public RegExpTree simplify(String flags) {
List alternatives = new ArrayList<>();
for (RegExpTree alternative : this.alternatives) {
alternative = alternative.simplify(flags);
if (alternative instanceof Alternation) {
alternatives.addAll(((Alternation) alternative).alternatives);
} else {
alternatives.add(alternative);
}
}
// Remove duplicates
RegExpTree last = null;
for (Iterator it = alternatives.iterator(); it.hasNext();) {
RegExpTree alternative = it.next();
if (alternative.equals(NEVER_MATCHES)) { continue; }
if (alternative.equals(last) && !alternative.hasCapturingGroup()) {
it.remove();
} else {
last = alternative;
}
}
// Collapse character alternatives into character sets.
for (int i = 0, n = alternatives.size(); i < n; ++i) {
RegExpTree alternative = alternatives.get(i);
if ((alternative instanceof Text
&& ((Text) alternative).text.length() == 1)
|| alternative instanceof Charset) {
int end = i;
int nCharsets = 0;
while (end < n) {
RegExpTree follower = alternatives.get(end);
if (follower instanceof Charset) {
++nCharsets;
} else if (!(follower instanceof Text
&& ((Text) follower).text.length() == 1)) {
break;
}
++end;
}
if (end - i >= 3 || (nCharsets != 0 && end - i >= 2)) {
int[] members = new int[end - i - nCharsets];
int memberIdx = 0;
CharRanges chars = CharRanges.EMPTY;
CharRanges ieExplicits = CharRanges.EMPTY;
List charAlternatives = alternatives.subList(i, end);
for (RegExpTree charAlternative : charAlternatives) {
if (charAlternative instanceof Text) {
char ch = ((Text) charAlternative).text.charAt(0);
members[memberIdx++] = ch;
if (IE_SPEC_ERRORS.contains(ch)) {
ieExplicits = ieExplicits.union(CharRanges.inclusive(ch, ch));
}
} else if (charAlternative instanceof Charset) {
Charset cs = (Charset) charAlternative;
chars = chars.union(cs.ranges);
ieExplicits = ieExplicits.union(cs.ieExplicits);
}
}
chars = chars.union(CharRanges.withMembers(members));
charAlternatives.clear();
charAlternatives.add(
new Charset(chars, ieExplicits).simplify(flags));
n = alternatives.size();
}
}
}
switch (alternatives.size()) {
case 0: return Empty.INSTANCE;
case 1: return alternatives.get(0);
case 2:
if (alternatives.get(1) instanceof Empty) { // (?:a|) -> a?
return new Repetition(alternatives.get(0), 0, 1, true);
} else if (alternatives.get(0) instanceof Empty) {
return new Repetition(alternatives.get(1), 0, 1, false);
}
break;
}
// TODO: maybe pull out common prefix or suffix
return alternatives.equals(this.alternatives)
? this : new Alternation(alternatives);
}
@Override
public boolean isCaseSensitive() {
for (RegExpTree alternative : alternatives) {
if (alternative.isCaseSensitive()) { return true; }
}
return false;
}
@Override
public boolean containsAnchor() {
for (RegExpTree alternative : alternatives) {
if (alternative.containsAnchor()) { return true; }
}
return false;
}
@Override
public int numCapturingGroups() {
int n = 0;
for (RegExpTree alternative : alternatives) {
n += alternative.numCapturingGroups();
}
return n;
}
@Override
public ImmutableList extends RegExpTree> children() {
return alternatives;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
for (int i = 0, n = alternatives.size(); i < n; ++i) {
if (i != 0) {
sb.append('|');
}
alternatives.get(i).appendSourceCode(sb);
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return this == o || (
(o instanceof Alternation)
&& alternatives.equals(((Alternation) o).alternatives));
}
@Override
public int hashCode() {
return 0x51b57cd1 ^ alternatives.hashCode();
}
}
private static final RegExpTree NEVER_MATCHES = new LookaheadAssertion(
Empty.INSTANCE, false);
/** Represents a lookahead assertion such as (?=...) or (?!...) */
public static final class LookaheadAssertion extends RegExpTree {
final RegExpTree body;
final boolean positive;
LookaheadAssertion(RegExpTree body, boolean positive) {
this.body = body;
this.positive = positive;
}
@Override
public RegExpTree simplify(String flags) {
RegExpTree simpleBody = body.simplify(flags);
if (simpleBody instanceof Empty) {
if (positive) { // Always true
return simpleBody;
}
}
return new LookaheadAssertion(simpleBody, positive);
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return body.numCapturingGroups();
}
@Override
public ImmutableList extends RegExpTree> children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(positive ? "(?=" : "(?!");
body.appendSourceCode(sb);
sb.append(')');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(positive ? "positive" : "negative");
}
@Override
public boolean equals(Object o) {
if (!(o instanceof LookaheadAssertion)) { return false; }
LookaheadAssertion that = (LookaheadAssertion) o;
return this.positive == that.positive && this.body.equals(that.body);
}
@Override
public int hashCode() {
return 0x723aba9 ^ body.hashCode();
}
}
/** Represents a lookbehind assertion such as {@code (?<=...) } or {@code (? children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(positive ? "(?<=" : "(? children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('(');
body.appendSourceCode(sb);
sb.append(')');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return o instanceof CapturingGroup
&& body.equals(((CapturingGroup) o).body);
}
@Override
public int hashCode() {
return 0x55781738 ^ body.hashCode();
}
}
/** Represents a named capture group */
public static final class NamedCaptureGroup extends RegExpTree {
final RegExpTree body;
final String name;
NamedCaptureGroup(RegExpTree body, String name) {
this.body = body;
this.name = name;
}
@Override
public RegExpTree simplify(String flags) {
return new NamedCaptureGroup(body.simplify(flags), name);
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return 1 + body.numCapturingGroups();
}
@Override
public ImmutableList extends RegExpTree> children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append("(?<");
sb.append(name);
sb.append('>');
body.appendSourceCode(sb);
sb.append(')');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(" name=").append(name);
}
@Override
public boolean equals(Object o) {
return o instanceof NamedCaptureGroup
&& name.equals(((NamedCaptureGroup) o).name)
&& body.equals(((NamedCaptureGroup) o).body);
}
@Override
public int hashCode() {
return Objects.hashCode(name) ^ body.hashCode();
}
}
/** Represents a Unicode Property Escape such as in /\p{Script=Greek}/u */
public static final class UnicodePropertyEscape extends RegExpTreeAtom {
private final String propertyName;
private final String propertyValue;
private final boolean negated;
UnicodePropertyEscape(@Nullable String propertyName, String propertyValue, boolean negated) {
checkState(propertyValue != null);
checkArgument(
propertyName == null || !propertyName.isEmpty(),
"if '=' is present in a unicode property escape, the name cannot be empty");
checkArgument(!propertyValue.isEmpty(), "unicode property escape value cannot be empty");
this.propertyName = propertyName;
this.propertyValue = propertyValue;
this.negated = negated;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(negated ? "\\P{" : "\\p{");
if (propertyName != null) {
sb.append(propertyName);
sb.append('=');
}
sb.append(propertyValue);
sb.append("}");
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides properties and possible negation.
}
@Override
public boolean equals(Object o) {
return (o instanceof UnicodePropertyEscape)
&& negated == ((UnicodePropertyEscape) o).negated
&& Objects.equals(propertyName, ((UnicodePropertyEscape) o).propertyName)
&& Objects.equals(propertyValue, ((UnicodePropertyEscape) o).propertyValue);
}
@Override
public int hashCode() {
return Objects.hash(negated, propertyName, propertyValue);
}
}
private static final CharRanges DIGITS = CharRanges.inclusive('0', '9');
private static final CharRanges UCASE_LETTERS
= CharRanges.inclusive('A', 'Z');
private static final CharRanges LCASE_LETTERS
= CharRanges.inclusive('a', 'z');
private static final CharRanges LETTERS = UCASE_LETTERS.union(LCASE_LETTERS);
private static final CharRanges WORD_CHARS = DIGITS
.union(LETTERS).union(CharRanges.withMembers('_'));
private static final CharRanges INVERSE_WORD_CHARS
= CharRanges.ALL_CODE_UNITS.difference(WORD_CHARS);
private static final CharRanges SPACE_CHARS = CharRanges.withMembers(
'\t', '\n', '\u000b', '\u000c', '\r', ' ', '\u00a0',
// Unicode 3.0 Zs
'\u1680', '\u180e', '\u2000', '\u2001',
'\u2002', '\u2003', '\u2004', '\u2005',
'\u2006', '\u2007', '\u2008', '\u2009',
'\u200a',
// Line terminator chars
'\u2028', '\u2029',
// Unicode 3.0 Zs
'\u202f', '\u205f', '\u3000',
// Byte order marker is a space character in ES5 but not ES3.
'\ufeff'
);
/** IE is broken around \s. IE (6, 7, 8 at least), only recognize these. */
private static final CharRanges IE_SPACE_CHARS = CharRanges.withMembers(
'\t', '\n', '\u000b', '\u000c', '\r', ' '
);
/** IE is broken around \s. IE (6, 7, 8 at least), only recognize these. */
private static final CharRanges IE_SPEC_ERRORS = SPACE_CHARS.difference(
IE_SPACE_CHARS);
private static final ImmutableMap NAMED_CHAR_GROUPS =
ImmutableMap.builder()
.put('d', DIGITS)
.put('D', CharRanges.ALL_CODE_UNITS.difference(DIGITS))
.put('s', SPACE_CHARS)
.put('S', CharRanges.ALL_CODE_UNITS.difference(SPACE_CHARS))
.put('w', WORD_CHARS)
.put('W', INVERSE_WORD_CHARS)
.buildOrThrow();
private static final Charset DOT_CHARSET = new Charset(
CharRanges.ALL_CODE_UNITS.difference(
CharRanges.withMembers('\n', '\r', '\u2028', '\u2029')),
CharRanges.EMPTY);
/** Represents a set of possible characters structured as [a-zA-Z] or [^a-zA-Z] */
public static final class Charset extends RegExpTreeAtom {
final CharRanges ranges;
/**
* Code units that were mentioned explicitly and that might be matched by
* a group according to ECMAScript 5 but would not because of specification
* violations in IE.
*/
final CharRanges ieExplicits;
Charset(CharRanges ranges, CharRanges ieExplicits) {
this.ranges = ranges;
this.ieExplicits = ieExplicits;
}
private static int complexityWordFolded(CharRanges ranges) {
return min(
complexityWordFoldedHelper(ranges),
1 + complexityWordFoldedHelper(CharRanges.ALL_CODE_UNITS.difference(ranges)));
}
private static int complexityWordFoldedHelper(CharRanges ranges) {
int complexity = DecomposedCharset.complexity(ranges);
if (ranges.containsAll(WORD_CHARS)) {
complexity =
min(complexity, 1 + DecomposedCharset.complexity(ranges.difference(WORD_CHARS)));
}
if (ranges.containsAll(INVERSE_WORD_CHARS)) {
complexity =
min(
complexity,
1 + DecomposedCharset.complexity(ranges.difference(INVERSE_WORD_CHARS)));
}
return complexity;
}
@Override
public RegExpTree simplify(String flags) {
if (ranges.isEmpty()) {
return NEVER_MATCHES;
}
CharRanges best = ranges;
if (flags.indexOf('i') >= 0) {
Set options = new LinkedHashSet<>();
options.add(CaseCanonicalize.expandToAllMatched(ranges));
options.add(CaseCanonicalize.reduceToMinimum(ranges));
CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS);
CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS);
CharRanges lcaseLettersToUpper = lcaseLetters.shift(-32);
CharRanges ucaseLettersToLower = ucaseLetters.shift(32);
options.add(ranges.union(ucaseLettersToLower));
options.add(ranges.union(lcaseLettersToUpper));
options.add(ranges.union(lcaseLettersToUpper)
.union(ucaseLettersToLower));
options.add(ranges.union(ucaseLettersToLower).difference(ucaseLetters));
options.add(ranges.union(lcaseLettersToUpper).difference(lcaseLetters));
int bestComplexity = complexityWordFolded(ranges);
for (CharRanges option : options) {
int complexity = complexityWordFolded(option);
if (complexity < bestComplexity) {
bestComplexity = complexity;
best = option;
}
}
}
if (best.getNumRanges() == 1
&& best.end(0) - best.start(0) == 1) {
return new Text(Character.toString((char) best.start(0)));
}
if (!best.equals(ranges)) {
return new Charset(best, ieExplicits);
}
return this;
}
@Override
public boolean isCaseSensitive() {
// We could test
// !ranges.equals(CaseCanonicalize.expandToAllMatched(ranges))
// but we get better optimizations by leaving the 'i' flag on in most
// cases.
// Check whether skipping all the character groups that are known
// case-insensitive leaves us with something that matches the above
// definition.
CharRanges withoutNamedGroups = decompose().ranges;
return !withoutNamedGroups.equals(
CaseCanonicalize.expandToAllMatched(withoutNamedGroups));
}
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
StringBuilder namedGroups = new StringBuilder();
CharRanges rangesInterIeExplicits = ranges.intersection(ieExplicits);
while (true) {
char groupName = 0;
CharRanges simplest = null;
int minComplexity = DecomposedCharset.complexity(ranges);
for (Map.Entry namedGroup
: NAMED_CHAR_GROUPS.entrySet()) {
CharRanges group = namedGroup.getValue();
if (ranges.containsAll(group)) {
CharRanges withoutGroup = ranges.difference(group).union(
rangesInterIeExplicits);
int complexity = DecomposedCharset.complexity(withoutGroup);
if (complexity < minComplexity) {
simplest = withoutGroup;
groupName = namedGroup.getKey().charValue();
minComplexity = complexity;
}
}
}
if (simplest != null) {
namedGroups.append('\\').append(groupName);
ranges = simplest;
} else {
break;
}
}
return new DecomposedCharset(inverted, ranges, namedGroups.toString());
}
DecomposedCharset decompose() {
CharRanges negRanges = CharRanges.ALL_CODE_UNITS.difference(ranges);
if (!ieExplicits.isEmpty()) {
if (negRanges.intersection(ieExplicits).isEmpty()) {
return decompose(ranges, false);
} else if (ranges.intersection(ieExplicits).isEmpty()) {
return decompose(negRanges, true);
}
}
DecomposedCharset positive = decompose(ranges, false);
DecomposedCharset negative = decompose(negRanges, true);
return positive.complexity() <= negative.complexity() ? positive : negative;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
if (DOT_CHARSET.ranges.equals(ranges)) {
sb.append('.');
return;
}
decompose().appendSourceCode(sb);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(ranges);
}
@Override
public boolean equals(Object o) {
return o instanceof Charset && ranges.equals(((Charset) o).ranges);
}
@Override
public int hashCode() {
return ranges.hashCode() ^ 0xdede2246;
}
}
/** Internal representation for [] charsets */
static final class DecomposedCharset {
boolean inverted;
final CharRanges ranges;
final String namedGroups;
DecomposedCharset(
boolean inverted, CharRanges ranges, String namedGroups) {
this.inverted = inverted;
this.ranges = ranges;
this.namedGroups = namedGroups;
}
int complexity() {
return (inverted ? 1 : 0) + namedGroups.length() + complexity(ranges);
}
void appendSourceCode(StringBuilder sb) {
if (ranges.isEmpty()) {
if (!inverted && namedGroups.length() == 2) {
sb.append(namedGroups);
return;
} else if (ranges.isEmpty() && namedGroups.isEmpty()) {
sb.append(inverted ? "[\\S\\s]" : "(?!)");
return;
}
}
sb.append('[');
if (inverted) { sb.append('^'); }
sb.append(namedGroups);
boolean rangesStartCharset = !inverted && namedGroups.isEmpty();
boolean emitDashAtEnd = false;
for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
char start = (char) ranges.start(i);
char end = (char) (ranges.end(i) - 1);
switch (end - start) {
case 0:
if (start == '-') {
// Put it at the end where it doesn't need escaping.
emitDashAtEnd = true;
} else {
escapeRangeCharOnto(
start, rangesStartCharset, i == 0, i + 1 == n, sb);
}
break;
case 1:
escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
escapeRangeCharOnto(
end, rangesStartCharset, false, i + 1 == n, sb);
break;
default:
escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
sb.append('-');
escapeRangeCharOnto(end, rangesStartCharset, false, true, sb);
break;
}
}
if (emitDashAtEnd) { sb.append('-'); }
sb.append(']');
}
static void escapeRangeCharOnto(
char ch, boolean startIsFlush, boolean atStart, boolean atEnd,
StringBuilder sb) {
switch (ch) {
case '\b':
sb.append("\\b");
break;
case '^':
sb.append(atStart && startIsFlush ? "\\^" : "^");
break;
case '-':
sb.append(atStart || atEnd ? "-" : "\\-");
break;
case '\\':
case ']':
sb.append('\\').append(ch);
break;
default:
escapeCharOnto(ch, sb);
}
}
static int complexity(CharRanges ranges) {
int complexity = 0;
for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
int start = ranges.start(i);
int end = ranges.end(i) - 1;
if (start < 0x20 || start >= 0x7f) {
complexity += start >= 0x100 ? 6 : 4;
} else {
++complexity;
}
switch (end - start) {
case 0: continue;
case 1: break;
default: complexity += 1;
}
if (end < 0x20 || end >= 0x7f) {
complexity += end >= 0x100 ? 6 : 4;
} else {
++complexity;
}
}
return complexity;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof DecomposedCharset)) {
return false;
}
DecomposedCharset that = (DecomposedCharset) o;
return this.inverted = that.inverted
&& this.ranges.equals(that.ranges)
&& this.namedGroups.equals(that.namedGroups);
}
@Override
public int hashCode() {
return ranges.hashCode()
+ 31 * (namedGroups.hashCode() + (inverted ? 1 : 0));
}
}
/** Represents a series of nodes chained one after another such as (?:...)[a-z]*(...) */
public static final class Concatenation extends RegExpTree {
final ImmutableList elements;
Concatenation(RegExpTree a, RegExpTree b) {
elements = ImmutableList.of(a, b);
}
Concatenation(List extends RegExpTree> elements) {
this.elements = ImmutableList.copyOf(elements);
}
@Override
public RegExpTree simplify(final String flags) {
class Simplifier {
final List simplified = new ArrayList<>();
void simplify(RegExpTree t) {
if (t instanceof Concatenation) {
for (RegExpTree child : ((Concatenation) t).elements) {
simplify(child);
}
} else if (t instanceof Empty) {
// Do nothing
} else {
int lastIndex = simplified.size() - 1;
if (lastIndex >= 0) {
RegExpTree pairwise = simplifyPairwise(
simplified.get(lastIndex), t);
if (pairwise != null) {
simplified.set(lastIndex, pairwise);
return;
}
}
simplified.add(t);
}
}
@Nullable RegExpTree simplifyPairwise(RegExpTree before, RegExpTree after) {
if (before instanceof Text && after instanceof Text) {
return new Text(
((Text) before).text + ((Text) after).text).simplify(flags);
}
// Fold adjacent repetitions.
int beforeMin = 1;
int beforeMax = 1;
RegExpTree beforeBody = before;
boolean beforeGreedy = false;
if (before instanceof Repetition) {
Repetition r = (Repetition) before;
beforeMin = r.min;
beforeMax = r.max;
beforeBody = r.body;
beforeGreedy = r.greedy;
}
int afterMin = 1;
int afterMax = 1;
RegExpTree afterBody = after;
boolean afterGreedy = false;
if (after instanceof Repetition) {
Repetition r = (Repetition) after;
afterMin = r.min;
afterMax = r.max;
afterBody = r.body;
afterGreedy = r.greedy;
}
if (beforeBody.equals(afterBody)
&& !beforeBody.hasCapturingGroup()) {
long lmin = ((long) beforeMin) + afterMin;
long lmax = ((long) beforeMax) + afterMax;
if (lmin < Integer.MAX_VALUE) {
int min = (int) lmin;
int max = lmax >= Integer.MAX_VALUE
? Integer.MAX_VALUE : (int) lmax;
return new Repetition(
beforeBody, min, max,
beforeGreedy || afterGreedy || min == max);
}
}
return null;
}
}
Simplifier s = new Simplifier();
for (RegExpTree element : elements) {
s.simplify(element.simplify(flags));
}
switch (s.simplified.size()) {
case 0: return Empty.INSTANCE;
case 1: return s.simplified.get(0);
default: return new Concatenation(s.simplified);
}
}
@Override
public boolean isCaseSensitive() {
for (RegExpTree element : elements) {
if (element.isCaseSensitive()) {
return true;
}
}
return false;
}
@Override
public boolean containsAnchor() {
for (RegExpTree element : elements) {
if (element.containsAnchor()) {
return true;
}
}
return false;
}
@Override
public int numCapturingGroups() {
int n = 0;
for (RegExpTree element : elements) {
n += element.numCapturingGroups();
}
return n;
}
@Override
public ImmutableList extends RegExpTree> children() {
return elements;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
// True if the last content written might consume
// decimal digits written subsequently.
boolean digitsMightBleed = false;
for (RegExpTree element : elements) {
boolean parenthesize = false;
if (element instanceof Alternation
|| element instanceof Concatenation) {
parenthesize = true;
}
if (parenthesize) {
sb.append("(?:");
element.appendSourceCode(sb);
sb.append(')');
} else {
int start = sb.length();
element.appendSourceCode(sb);
if (digitsMightBleed && sb.length() > start) {
char firstChar = sb.charAt(start);
if ('0' <= firstChar && firstChar <= '9') {
// Bleeding happened.
// If the last character would be ambiguous
// with a repetition, escape it.
if (sb.charAt(start - 1) == '{') {
// Concatenation from optimization of
// /{(?:0,}/ -> /\{0,}/
sb.insert(start - 1, '\\');
} else {
// Or parenthesize otherwise.
// Concatenation from optimization of
// /(.)\1(?:0)/ -> /(.)\1(?:0)/.
sb.insert(start, "(?:").append(')');
}
}
}
}
digitsMightBleed = (
// \1(?:0) bleeds if there are 10 or more
// capturing groups preceding.
(element instanceof BackReference
&& ((BackReference) element).groupIndex < 10)
// foo{(?:10}) bleeds.
|| (element instanceof Text
&& ((Text) element).text.endsWith("{")));
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return o instanceof Concatenation
&& elements.equals(((Concatenation) o).elements);
}
@Override
public int hashCode() {
return 0x20997e3e ^ elements.hashCode();
}
}
static void escapeCharOnto(char ch, StringBuilder sb) {
switch (ch) {
case '\u0000':
sb.append("\\0");
break;
case '\f':
sb.append("\\f");
break;
case '\t':
sb.append("\\t");
break;
case '\n':
sb.append("\\n");
break;
case '\r':
sb.append("\\r");
break;
case '\\':
sb.append("\\\\");
break;
default:
if (ch < 0x20 || ch >= 0x7f) {
if (ch >= 0x100) {
sb.append("\\u");
sb.append("0123456789abcdef".charAt((ch >> 12) & 0xf));
sb.append("0123456789abcdef".charAt((ch >> 8) & 0xf));
sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
sb.append("0123456789abcdef".charAt((ch) & 0xf));
} else {
sb.append("\\x");
sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
sb.append("0123456789abcdef".charAt((ch) & 0xf));
}
} else {
sb.append(ch);
}
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy