// com.google.re2j.RE2 (Maven / Gradle / Ivy)
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go
// Beware, submatch results may pin a large underlying String into
// memory. Consider creating explicit string copies if submatches are
// long-lived and inputs are large.
//
// The JDK API supports incremental processing of the input without
// necessarily consuming it all; we do not attempt to do so.
// The Java API emphasises UTF-16 Strings, not UTF-8 byte[] as in Go, as
// the primary input datatype, and the method names have been changed to
// reflect this.
package com.google.re2j;
import com.google.re2j.DFA.DFATooManyStatesException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import io.airlift.slice.Slices;
import static com.google.re2j.MachineInput.EOF;
import static com.google.re2j.Options.Algorithm.DFA;
import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA;
import static com.google.re2j.RE2.Anchor.UNANCHORED;
import static com.google.re2j.RE2.MatchKind.FIRST_MATCH;
import static com.google.re2j.RE2.MatchKind.LONGEST_MATCH;
/**
* An RE2 class instance is a compiled representation of an RE2 regular
* expression, independent of the public Java-like Pattern/Matcher API.
*
* <p>This class also contains various implementation helpers for RE2
* regular expressions.
*
* <p>Use the {@link #quoteMeta(String)} utility function to quote all
* regular expression metacharacters in an arbitrary string.
*
* <p>See the {@code Matcher} and {@code Pattern} classes for the public
* API, and the package-level documentation for an overview of how to use
* this API.
*/
class RE2 {
// (In the Go implementation this structure is just called "Regexp".)
//// Parser flags.
// Each flag is a distinct bit so that flags can be combined with '|'.
// Fold case during matching (case-insensitive).
static final int FOLD_CASE = 0x01;
// Treat pattern as a literal string instead of a regexp.
static final int LITERAL = 0x02;
// Allow character classes like [^a-z] and [[:space:]] to match newline.
static final int CLASS_NL = 0x04;
// Allow '.' to match newline.
static final int DOT_NL = 0x08;
// Treat ^ and $ as only matching at beginning and end of text, not
// around embedded newlines. (Perl's default).
static final int ONE_LINE = 0x10;
// Make repetition operators default to non-greedy.
static final int NON_GREEDY = 0x20;
// Allow Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C (any byte) is not supported.
static final int PERL_X = 0x40;
// Allow \p{Han}, \P{Han} for Unicode group and negation.
static final int UNICODE_GROUPS = 0x80;
// Regexp END_TEXT was $, not \z. Internal use only.
static final int WAS_DOLLAR = 0x100;
// Both newline-matching flags together: character classes and '.'.
static final int MATCH_NL = CLASS_NL | DOT_NL;
// As close to Perl as possible.
static final int PERL = CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS;
// POSIX syntax. No flag bits are set.
static final int POSIX = 0;
//// Anchors
enum Anchor {
  UNANCHORED,
  ANCHOR_START,
  ANCHOR_BOTH;

  /** True only for {@code UNANCHORED}. */
  boolean isUnanchored() {
    return UNANCHORED == this;
  }

  /** True only for {@code ANCHOR_BOTH}, the sole constant that pins the end. */
  boolean isAnchorEnd() {
    return ANCHOR_BOTH == this;
  }

  /** True for {@code ANCHOR_START} and {@code ANCHOR_BOTH}. */
  boolean isAnchorStart() {
    switch (this) {
      case ANCHOR_START:
      case ANCHOR_BOTH:
        return true;
      default:
        return false;
    }
  }

  /** True only for {@code ANCHOR_BOTH}. */
  boolean isAnchorBoth() {
    return ANCHOR_BOTH == this;
  }
}
// Kind of match to look for (for anchor != ANCHOR_BOTH).
//
// LONGEST_MATCH mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// compile() selects FIRST_MATCH; compilePOSIX() selects LONGEST_MATCH.
enum MatchKind {
FIRST_MATCH, // like Perl, PCRE
LONGEST_MATCH // like egrep or POSIX
}
//// RE2 instance members.
final String expr; // as passed to Compile
final Prog prog; // compiled program
final Prog reverseProg; // program for matching reversed text
final int cond; // EMPTY_* bitmask: empty-width conditions
                // required at start of match
final int numSubexp; // number of capturing groups, taken before simplification
// NOTE(review): left as a raw Map because the key/value types are not
// visible in this file; presumably group name -> group index - confirm
// against Regexp.namedGroupIndexes() before parameterizing.
final Map namedGroupIndexes;
final Options options;
MatchKind matchKind;
Slice prefixUTF8; // required UTF-8 prefix in unanchored matches
boolean prefixComplete; // true iff prefix is the entire regexp

// Cache of machines for running regexp.
// One NFAMachine per thread, created lazily on first use. The type
// argument matters: with a raw ThreadLocal, get() returns Object and
// cannot be passed to doExecute(Machine, ...).
final ThreadLocal<NFAMachine> nfaMachine = new ThreadLocal<NFAMachine>() {
  @Override
  protected NFAMachine initialValue() {
    return new NFAMachine(RE2.this);
  }
};

// Current DFA machine. Null when the configured algorithm does not use a
// DFA, or after falling back to the NFA (see handleTooManyDFAStatesException).
volatile DFAMachine dfaMachine;

// How many more times the DFA machine may be rebuilt after a
// DFATooManyStatesException. Only initialised when a DFA algorithm is
// configured; null otherwise.
AtomicInteger numberOfDFARetriesLeft;
// This is visible for testing.
// Copy constructor: builds a new instance from the expression, programs,
// options and prefix data of re2. Machine state is not shared: the delegated
// constructor creates a new DFAMachine and retry counter when a DFA algorithm
// is configured, and each instance has its own nfaMachine thread-local.
RE2(RE2 re2) {
// Copy everything.
this(re2.expr, re2.prog, re2.reverseProg, re2.numSubexp, re2.namedGroupIndexes, re2.matchKind,
re2.options, re2.prefixComplete, re2.prefixUTF8);
}
private RE2(String expr, Prog prog, Prog reverseProg, int numSubexp,
Map namedGroupIndexes, MatchKind matchKind,
Options options, boolean prefixComplete, Slice prefixUTF8) {
this.expr = expr;
this.prog = prog;
this.reverseProg = reverseProg;
this.numSubexp = numSubexp;
this.namedGroupIndexes = namedGroupIndexes;
this.options = options;
this.cond = prog.startCond();
this.matchKind = matchKind;
this.prefixComplete = prefixComplete;
this.prefixUTF8 = prefixUTF8;
if (options.getAlgorithm() == DFA || options.getAlgorithm() == DFA_FALLBACK_TO_NFA) {
this.dfaMachine = new DFAMachine(this, options.getMaximumNumberOfDFAStates());
this.numberOfDFARetriesLeft = new AtomicInteger(options.getNumberOfDFARetries());
}
}
/**
* Parses a regular expression and returns, if successful, an
* {@code RE2} instance that can be used to match against text.
*
* <p>When matching against text, the regexp returns a match that
* begins as early as possible in the input (leftmost), and among those
* it chooses the one that a backtracking search would have found first.
* This so-called leftmost-first matching is the same semantics
* that Perl, Python, and other implementations use, although this
* package implements it without the expense of backtracking.
* For POSIX leftmost-longest matching, see {@link #compilePOSIX}.
*
* <p>Delegates to {@code compileImpl} with the {@code PERL} flag set and
* {@code FIRST_MATCH} semantics.
*
* @param expr the regular expression source text
* @param options passed through unchanged to {@code compileImpl}
* @return the compiled {@code RE2}
* @throws PatternSyntaxException declared by {@code compileImpl};
*     presumably raised when {@code expr} cannot be parsed
*/
static RE2 compile(String expr, Options options) throws PatternSyntaxException {
return compileImpl(expr, PERL, FIRST_MATCH, options);
}
/**
* {@code compilePOSIX} is like {@link #compile} but restricts the
* regular expression to POSIX ERE (egrep) syntax and changes the
* match semantics to leftmost-longest.
*
* <p>That is, when matching against text, the regexp returns a match that
* begins as early as possible in the input (leftmost), and among those
* it chooses a match that is as long as possible.
* This so-called leftmost-longest matching is the same semantics
* that early regular expression implementations used and that POSIX
* specifies.
*
* <p>However, there can be multiple leftmost-longest matches, with different
* submatch choices, and here this package diverges from POSIX.
* Among the possible leftmost-longest matches, this package chooses
* the one that a backtracking search would have found first, while POSIX
* specifies that the match be chosen to maximize the length of the first
* subexpression, then the second, and so on from left to right.
* The POSIX rule is computationally prohibitive and not even well-defined.
* See http://swtch.com/~rsc/regexp/regexp2.html#posix
*
* <p>Delegates to {@code compileImpl} with the {@code POSIX} flag set and
* {@code LONGEST_MATCH} semantics.
*
* @param expr the regular expression source text
* @param options passed through unchanged to {@code compileImpl}
* @return the compiled {@code RE2}
* @throws PatternSyntaxException declared by {@code compileImpl};
*     presumably raised when {@code expr} cannot be parsed
*/
static RE2 compilePOSIX(String expr, Options options) throws PatternSyntaxException {
return compileImpl(expr, POSIX, LONGEST_MATCH, options);
}
// Exposed to ExecTests.
// Pipeline: parse -> simplify -> compile forward and reversed programs ->
// extract the literal prefix -> build the RE2 instance.
static RE2 compileImpl(String expr, int mode, MatchKind matchKind, Options options)
    throws PatternSyntaxException {
  Regexp parsed = Parser.parse(expr, mode);
  // Read the capture count first: it may shrink during simplify.
  int captureCount = parsed.maxCap();
  Regexp simplified = Simplify.simplify(parsed);

  Prog forward = Compiler.compileRegexp(simplified, false);
  Prog backward = Compiler.compileRegexp(simplified, true);

  SliceOutput prefixSink = new DynamicSliceOutput(forward.numInst());
  boolean prefixIsWholeRegexp = forward.prefix(prefixSink);
  Slice prefix = prefixSink.slice();

  return new RE2(
      expr,
      forward,
      backward,
      captureCount,
      simplified.namedGroupIndexes(),
      matchKind,
      options,
      prefixIsWholeRegexp,
      prefix);
}
/**
* Returns the number of parenthesized subexpressions in this regular
* expression.
*
* @return the value of {@code numSubexp} recorded at construction
*/
int numberOfCapturingGroups() {
return numSubexp;
}
/**
* Returns the expression text this instance was constructed with.
*/
@Override
public String toString() {
return expr;
}
// doExecute() finds the leftmost match in the input and returns
// the position of its subexpressions, or null when nothing matches.
// Derived from exec.go.
//
// The DFA machine is tried first when one is installed. If it gives up with
// DFATooManyStatesException, the handler decides whether to rebuild the DFA,
// drop it, or rethrow; when it returns normally this call is answered by the
// calling thread's NFA machine.
private int[] doExecute(MachineInput in, int pos, Anchor anchor, int ncap) {
  // Read the volatile field once so the whole call sees one machine.
  final DFAMachine dfa = dfaMachine;
  if (dfa != null) {
    try {
      return doExecute(dfa, in, pos, anchor, ncap);
    } catch (DFATooManyStatesException tooManyStates) {
      handleTooManyDFAStatesException(tooManyStates, dfa);
      // Handler returned normally: fall through to the NFA.
    }
  }
  return doExecute(nfaMachine.get(), in, pos, anchor, ncap);
}
// Runs one machine over the input. Returns a freshly allocated array of
// ncap submatch positions on success, or null when there is no match.
private int[] doExecute(Machine machine, MachineInput in, int pos, Anchor anchor, int ncap) {
  int[] captures = new int[ncap];
  if (machine.match(in, pos, anchor, captures)) {
    return captures;
  }
  return null;
}
// Reacts to a DFA machine exceeding its state budget.
//  - The failing machine is no longer the installed one: do nothing, so a
//    newer DFAMachine instance is not penalized.
//  - Retries remain: install a fresh DFAMachine.
//  - Retries exhausted, DFA_FALLBACK_TO_NFA: drop the DFA and notify the
//    events listener, if one is configured.
//  - Retries exhausted otherwise: rethrow e, keeping the old DFAMachine so
//    other threads can fail too.
private synchronized void handleTooManyDFAStatesException(DFATooManyStatesException e, DFAMachine currentDFAMachine) {
  if (currentDFAMachine != dfaMachine) {
    return;
  }

  boolean retriesRemain = numberOfDFARetriesLeft.decrementAndGet() >= 0;
  if (retriesRemain) {
    dfaMachine = new DFAMachine(this, options.getMaximumNumberOfDFAStates());
    return;
  }

  if (options.getAlgorithm() != DFA_FALLBACK_TO_NFA) {
    throw e;
  }

  dfaMachine = null;
  if (options.getEventsListener() != null) {
    options.getEventsListener().fallbackToNFA();
  }
}
/**
* Returns true iff this regexp matches the {@link Slice} {@code s}.
* The search is unanchored, starts at position 0, and requests no
* submatch positions.
*
* @param s the input, read as UTF-8
* @return true if a match was found
*/
boolean match(Slice s) {
return doExecute(MachineInput.fromUTF8(s), 0, UNANCHORED, 0) != null;
}
/**
 * Matches the regular expression against {@code input}, beginning the
 * search at byte position {@code start}, with the given anchoring.
 * On success the submatch boundaries, as [start, end) pairs of byte
 * offsets, are copied into {@code group} when it is non-null. The number
 * of pairs computed is {@code ngroup}; it is most efficient not to ask
 * for submatch boundaries.
 *
 * @param input the input {@link Slice}
 * @param start the beginning position in the input
 * @param anchor the anchoring flag (UNANCHORED, ANCHOR_START, ANCHOR_BOTH)
 * @param group the array to fill with submatch positions; may be null
 * @param ngroup the number of array pairs to fill in
 * @return true if a match was found
 */
boolean match(Slice input, int start, Anchor anchor, int[] group,
    int ngroup) {
  if (input.length() < start) {
    return false;
  }

  // TODO(afrozm): We suspect that the correct code should look something
  // like the following:
  // doExecute(MachineInput.fromUTF16(input), start, anchor, 2*ngroup);
  //
  // In Russ' own words:
  // That is, I believe doExecute needs to know the bounds of the whole input
  // as well as the bounds of the subpiece that is being searched.
  int[] positions = doExecute(MachineInput.fromUTF8(input), start, anchor, 2 * ngroup);
  boolean found = positions != null;
  if (found && group != null) {
    System.arraycopy(positions, 0, group, 0, positions.length);
  }
  return found;
}
/**
* Returns true iff textual regular expression {@code pattern}
* matches {@link Slice} {@code s}.
*
* <p>More complicated queries need to use {@link #compile} and the
* full {@code RE2} interface.
*
* <p>The pattern is compiled on every call.
*
* @param pattern the regular expression source text
* @param s the input, read as UTF-8
* @param options passed to {@link #compile}
* @return true if a match was found
* @throws PatternSyntaxException declared by {@link #compile}
*/
// This is visible for testing.
static boolean match(String pattern, Slice s, Options options) throws PatternSyntaxException {
return compile(pattern, options).match(s);
}
// This is visible for testing.
// Callback that replaceAllFunc invokes to obtain the replacement for a match.
interface ReplaceFunc {
// orig is the matched portion of the source; the return value is written
// to the output in its place.
Slice replace(Slice orig);
}
/**
 * Returns a copy of {@code src} in which all matches for this regexp
 * have been replaced by {@code repl}. No support is provided for
 * expressions (e.g. {@code \1} or {@code $1}) in the replacement
 * {@link Slice}.
 *
 * @param src the input to search
 * @param repl the fixed replacement written for every match
 * @return the rewritten copy of {@code src}
 */
// This is visible for testing.
Slice replaceAll(Slice src, final Slice repl) {
  // No artificial cap on the number of replacements. replaceAllFunc moves
  // its search position forward by at least one on every iteration, so it
  // performs at most src.length() + 1 replacements on its own. The previous
  // bound, 2 * src.length() + 1, overflowed int for inputs of 2^30 bytes or
  // more and produced a negative cap that stopped after one replacement.
  return replaceAllFunc(src, new ReplaceFunc() {
    @Override public Slice replace(Slice orig) { return repl; }
  }, Integer.MAX_VALUE);
}
/**
 * Returns a copy of {@code src} in which only the first match for this
 * regexp has been replaced by {@code repl}. Expressions such as
 * {@code \1} or {@code $1} in the replacement {@link Slice} are not
 * supported.
 */
// This is visible for testing.
Slice replaceFirst(Slice src, final Slice repl) {
  // Replacement callback that ignores the matched text.
  final ReplaceFunc constantReplacement = new ReplaceFunc() {
    @Override
    public Slice replace(Slice orig) {
      return repl;
    }
  };
  return replaceAllFunc(src, constantReplacement, 1);
}
/**
* Returns a copy of {@code src} in which at most {@code maxReplaces} matches
* for this regexp have been replaced by the return value of function
* {@code repl} (whose first argument is the matched text). No support is
* provided for expressions (e.g. {@code \1} or {@code $1}) in the
* replacement {@link Slice}.
*
* @param src the input to search
* @param repl callback producing the replacement for each match
* @param maxReplaces upper bound on the number of replacements performed
* @return the rewritten copy of {@code src}
*/
// This is visible for testing.
Slice replaceAllFunc(Slice src, ReplaceFunc repl, int maxReplaces) {
int lastMatchEnd = 0; // end position of the most recent match
int searchPos = 0; // position where we next look for a match
SliceOutput buf = new DynamicSliceOutput(src.length());
MachineInput input = MachineInput.fromUTF8(src);
int numReplaces = 0;
// searchPos == src.length() is still searched so that an empty match at
// the very end of the input can be found.
while (searchPos <= src.length()) {
int[] a = doExecute(input, searchPos, UNANCHORED, 2);
if (a == null || a.length == 0) {
break; // no more matches
}
// Copy the unmatched characters before this match.
buf.writeBytes(src, lastMatchEnd, a[0] - lastMatchEnd);
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
// FIXME(adonovan), FIXME(afrozm) - JDK seems to be doing exactly this
// put a replacement for a pattern that also matches empty and non-empty
// strings. The fix would not just be a[1] >= lastMatchEnd, there are a
// few corner cases in that as well, and there are tests which will fail
// when that case is touched (happens only at the end of the input string
// though).
if (a[1] > lastMatchEnd || a[0] == 0) {
buf.writeBytes(repl.replace(src.slice(a[0], a[1] - a[0])));
// Increment the replace count.
++numReplaces;
}
lastMatchEnd = a[1];
// Advance past this match. The search position always moves forward by at
// least one, even when the match ends at or before searchPos, so the loop
// terminates.
if (searchPos + 1 > a[1]) {
searchPos++;
} else {
searchPos = a[1];
}
if (numReplaces >= maxReplaces) {
// Should never be greater though.
break;
}
}
// Copy the unmatched characters after the last match.
buf.writeBytes(src, lastMatchEnd, src.length() - lastMatchEnd);
return buf.slice();
}
/**
 * Escapes every regular expression metacharacter in {@code s} with a
 * backslash, so that the result is a regular expression matching the
 * literal text. For example,
 * {@code quoteMeta("[foo]").equals("\\[foo\\]")}.
 *
 * @param s the literal text to quote
 * @return {@code s} with each metacharacter preceded by a backslash
 */
static String quoteMeta(String s) {
  final int length = s.length();
  // Worst case: every character needs an escape.
  StringBuilder quoted = new StringBuilder(2 * length);
  // Iterating by char is safe: every metacharacter is a single UTF-16 unit.
  for (int index = 0; index < length; index++) {
    char ch = s.charAt(index);
    switch (ch) {
      case '\\':
      case '.':
      case '+':
      case '*':
      case '?':
      case '(':
      case ')':
      case '|':
      case '[':
      case ']':
      case '{':
      case '}':
      case '^':
      case '$':
        quoted.append('\\');
        break;
      default:
        break;
    }
    quoted.append(ch);
  }
  return quoted.toString();
}
// The program may record fewer capture values than the regexp has
// capturing expressions. For example, "(a){0}" compiles to an empty
// program whose maximum capture is 0, yet callers still need an entry
// for \1. pad() extends a with -1s up to (1 + numSubexp) pairs; when a
// is already long enough it is returned as-is, so the result may alias a.
private int[] pad(int[] a) {
  if (a == null) {
    return null; // No match.
  }
  final int wanted = 2 * (numSubexp + 1);
  if (a.length >= wanted) {
    return a;
  }
  int[] padded = Arrays.copyOf(a, wanted);
  Arrays.fill(padded, a.length, wanted, -1);
  return padded;
}
// Callback through which allMatches reports each accepted match.
private interface DeliverFunc {
// Called iteratively with a list of submatch indices in the same
// unit as the MachineInput cursor.
void deliver(int[] x);
}
// Find matches in input.
// Scans input for successive matches and passes each accepted one, padded
// via pad(), to deliver. At most n matches are delivered; a negative n is
// replaced by end + 1. An empty match that begins where the previous match
// ended is skipped.
private void allMatches(MachineInput input, int n, DeliverFunc deliver) {
int end = input.endPos();
if (n < 0) {
// Caller asked for no explicit limit.
n = end + 1;
}
// pos: where the next search starts; i: matches delivered so far;
// prevMatchEnd: end of the previous match, or -1 before the first one.
for (int pos = 0, i = 0, prevMatchEnd = -1; i < n && pos <= end; ) {
int[] matches = doExecute(input, pos, UNANCHORED, prog.numCap);
if (matches == null || matches.length == 0) {
break;
}
boolean accept = true;
if (matches[1] == pos) {
// We've found an empty match.
if (matches[0] == prevMatchEnd) {
// We don't allow an empty match right
// after a previous match, so ignore it.
accept = false;
}
// Step forward so the next search does not find the same empty match.
// At EOF, move past the end so the loop condition stops the scan.
byte b = input.getByte(pos);
if (b == EOF) {
pos = end + 1;
} else {
pos++;
}
} else {
// Non-empty match: resume searching where it ended.
pos = matches[1];
}
prevMatchEnd = matches[1];
if (accept) {
deliver.deliver(pad(matches));
i++;
}
}
}
// Legacy Go-style interface; preserved (package-private) for better
// test coverage.
//
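// The Go-derived design has 16 methods that match a regular expression
// and identify the matched text, with names matched by the regular
// expression below. This class implements the eight variants without
// 'UTF8' in their names; all of them take a UTF-8 encoded Slice.
//
// find(All)?(UTF8)?(Submatch)?(Index)?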
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a
// preceding match are ignored. The return value is an array
// containing the successive return values of the corresponding
// non-All routine. These routines take an extra integer argument, n;
// if n >= 0, the function returns at most n matches/submatches.
//
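// In the original API, 'UTF8' in the name meant the argument was a UTF-8
// encoded byte[] array rather than a UTF-16 encoded java.lang.String, with
// return values adjusted as appropriate. No method in this class carries
// 'UTF8' in its name: every find method here takes a UTF-8 encoded Slice.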
//
// If 'Submatch' is present, the return value is a list identifying
// the successive submatches of the expression. Submatches are
// matches of parenthesized subexpressions within the regular
// expression, numbered from left to right in order of opening
// parenthesis. Submatch 0 is the match of the entire expression,
// submatch 1 the match of the first parenthesized subexpression, and
// so on.
//
// If 'Index' is present, matches and submatches are identified by
// byte index pairs within the input string: result[2*n:2*n+1]
// identifies the indexes of the nth submatch. The pair for n==0
// identifies the match of the entire expression. If 'Index' is not
// present, the match is identified by the text of the match/submatch.
// If an index is negative, it means that subexpression did not match
// any string in the input.
/**
 * Returns a {@link Slice} holding the text of the leftmost match in
 * {@code s} of this regular expression.
 *
 * <p>An empty {@link Slice} is returned both when there is no match and
 * when the expression successfully matches an empty {@link Slice}. Use
 * {@link #findIndex} or {@link #findSubmatch} when the two cases must be
 * told apart.
 */
// This is visible for testing.
Slice find(Slice s) {
  int[] span = doExecute(MachineInput.fromUTF8(s), 0, UNANCHORED, 2);
  return span == null ? Slices.EMPTY_SLICE : s.slice(span[0], span[1] - span[0]);
}
/**
 * Returns a two-element array of integers giving the location of the
 * leftmost match in {@code s} of this regular expression. The matched
 * text is {@code s.slice(loc[0], loc[1] - loc[0])}.
 *
 * <p>A return value of null indicates no match.
 */
// This is visible for testing.
int[] findIndex(Slice s) {
  // doExecute already yields null when nothing matches.
  return doExecute(MachineInput.fromUTF8(s), 0, UNANCHORED, 2);
}
/**
 * Returns an array of {@link Slice}s holding the text of the leftmost
 * match of the regular expression in {@code s} followed by the matches,
 * if any, of its subexpressions. Element 0 is the whole match; an
 * element is null when the corresponding subexpression did not
 * participate in the match.
 *
 * <p>A return value of null indicates no match.
 */
// This is visible for testing.
Slice[] findSubmatch(Slice s) {
  int[] caps = doExecute(MachineInput.fromUTF8(s), 0, UNANCHORED, prog.numCap);
  if (caps == null) {
    return null;
  }
  Slice[] groups = new Slice[numSubexp + 1];
  // 'lo' is the index in caps of group g's start offset.
  for (int g = 0, lo = 0; g < groups.length; g++, lo += 2) {
    if (lo >= caps.length || caps[lo] < 0) {
      continue; // group not recorded by the program, or did not match
    }
    groups[g] = s.slice(caps[lo], caps[lo + 1] - caps[lo]);
  }
  return groups;
}
/**
* Returns an array holding the index pairs identifying the leftmost
* match of this regular expression in {@code s} and the matches, if
* any, of its subexpressions, as defined by the Submatch description above.
* The result is padded with -1 entries so that it always holds
* {@code 1 + numSubexp} pairs.
*
* <p>A return value of null indicates no match.
*/
// This is visible for testing.
int[] findSubmatchIndex(Slice s) {
return pad(doExecute(MachineInput.fromUTF8(s), 0, UNANCHORED, prog.numCap));
}
/**
 * {@code findAll} is the All version of {@link #find}; it returns a list
 * of up to {@code n} successive non-overlapping matches of the
 * expression. Empty matches abutting a preceding match are ignored.
 *
 * <p>A return value of null indicates no match.
 *
 * @param s the input, read as UTF-8
 * @param n maximum number of matches to return; negative for no limit
 * @return the matched texts in order of occurrence, or null if none
 */
// This is visible for testing.
List<Slice> findAll(final Slice s, int n) {
  final List<Slice> result = new ArrayList<Slice>();
  allMatches(MachineInput.fromUTF8(s), n, new DeliverFunc() {
    @Override public void deliver(int[] match) {
      // match[0], match[1] bound the whole match.
      result.add(s.slice(match[0], match[1] - match[0]));
    }});
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllIndex} is the All version of {@link #findIndex}; it
 * returns a list of up to {@code n} successive non-overlapping matches of
 * the expression, each as a two-element [start, end) index pair. Empty
 * matches abutting a preceding match are ignored.
 *
 * <p>A return value of null indicates no match.
 *
 * @param s the input, read as UTF-8
 * @param n maximum number of matches to return; negative for no limit
 * @return the index pairs in order of occurrence, or null if none
 */
// This is visible for testing.
List<int[]> findAllIndex(Slice s, int n) {
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(MachineInput.fromUTF8(s), n, new DeliverFunc() {
    @Override public void deliver(int[] match) {
      // Keep only the whole-match pair.
      result.add(Utils.subarray(match, 0, 2));
    }});
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllSubmatch} is the All version of {@link #findSubmatch}; it
 * returns a list of up to {@code n} successive non-overlapping matches of
 * the expression. Each element holds the text of the whole match at
 * index 0 followed by the text of each subexpression; an entry is null
 * when that subexpression did not participate in the match.
 *
 * <p>A return value of null indicates no match.
 *
 * @param s the input, read as UTF-8
 * @param n maximum number of matches to return; negative for no limit
 * @return per-match submatch texts in order of occurrence, or null if none
 */
// This is visible for testing.
List<Slice[]> findAllSubmatch(final Slice s, int n) {
  final List<Slice[]> result = new ArrayList<Slice[]>();
  allMatches(MachineInput.fromUTF8(s), n, new DeliverFunc() {
    @Override public void deliver(int[] match) {
      Slice[] slice = new Slice[match.length / 2];
      for (int j = 0; j < slice.length; ++j) {
        // A negative start index marks a group that did not match.
        if (match[2 * j] >= 0) {
          int begin = match[2 * j];
          int end = match[2 * j + 1];
          slice[j] = s.slice(begin, end - begin);
        }
      }
      result.add(slice);
    }});
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllSubmatchIndex} is the All version of
 * {@link #findSubmatchIndex}; it returns a list of up to {@code n}
 * successive non-overlapping matches of the expression. Each element is
 * the padded array of [start, end) index pairs for the whole match and
 * its subexpressions, with -1 for subexpressions that did not match.
 *
 * <p>A return value of null indicates no match.
 *
 * @param s the input, read as UTF-8
 * @param n maximum number of matches to return; negative for no limit
 * @return per-match index arrays in order of occurrence, or null if none
 */
// This is visible for testing.
List<int[]> findAllSubmatchIndex(Slice s, int n) {
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(MachineInput.fromUTF8(s), n, new DeliverFunc() {
    @Override public void deliver(int[] match) {
      result.add(match);
    }});
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
}