// com.google.re2j.RE2 (Maven / Gradle / Ivy)
/*
* Copyright (c) 2020 The Go Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
*/
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go
// Beware, submatch results may pin a large underlying String into
// memory. Consider creating explicit string copies if submatches are
// long-lived and inputs are large.
//
// The JDK API supports incremental processing of the input without
// necessarily consuming it all; we do not attempt to do so.
// The Java API emphasises UTF-16 Strings, not UTF-8 byte[] as in Go, as
// the primary input datatype, and the method names have been changed to
// reflect this.
package com.google.re2j;
import com.google.re2j.MatcherInput.Encoding;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
/**
* An RE2 class instance is a compiled representation of an RE2 regular expression, independent of
* the public Java-like Pattern/Matcher API.
*
*
* This class also contains various implementation helpers for RE2 regular expressions.
*
*
* Use the {@link #quoteMeta(String)} utility function to quote all regular expression
* metacharacters in an arbitrary string.
*
*
* See the {@code Matcher} and {@code Pattern} classes for the public API, and the package-level documentation for an overview of how to use this API.
*/
class RE2 {
// (In the Go implementation this structure is just called "Regexp".)
//// Parser flags.
// These are bit flags; they are combined with bitwise OR (see MATCH_NL and PERL below)
// and passed as the `mode` argument of compileImpl().
// Fold case during matching (case-insensitive).
static final int FOLD_CASE = 0x01;
// Treat pattern as a literal string instead of a regexp.
static final int LITERAL = 0x02;
// Allow character classes like [^a-z] and [[:space:]] to match newline.
static final int CLASS_NL = 0x04;
// Allow '.' to match newline.
static final int DOT_NL = 0x08;
// Treat ^ and $ as only matching at beginning and end of text, not
// around embedded newlines. (Perl's default).
static final int ONE_LINE = 0x10;
// Make repetition operators default to non-greedy.
static final int NON_GREEDY = 0x20;
// Allow Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C (any byte) is not supported.
static final int PERL_X = 0x40;
// Allow \p{Han}, \P{Han} for Unicode group and negation.
static final int UNICODE_GROUPS = 0x80;
// Regexp END_TEXT was $, not \z. Internal use only.
static final int WAS_DOLLAR = 0x100;
// Allow both character classes and '.' to match newline (CLASS_NL | DOT_NL).
static final int MATCH_NL = CLASS_NL | DOT_NL;
// As close to Perl as possible. This is the mode used by compile().
static final int PERL = CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS;
// POSIX syntax: no flag bits set. This is the mode used by compilePOSIX().
static final int POSIX = 0;
//// Anchors
// Values accepted as the `anchor` argument of match() and doExecute().
static final int UNANCHORED = 0;
static final int ANCHOR_START = 1;
static final int ANCHOR_BOTH = 2;
//// RE2 instance members.
final String expr; // as passed to Compile
final Prog prog; // compiled program
final int cond; // EMPTY_* bitmask: empty-width conditions
// required at start of match
final int numSubexp; // number of parenthesized subexpressions (see numberOfCapturingGroups())
boolean longest; // true for leftmost-longest matching (set by compilePOSIX())
String prefix; // required UTF-16 prefix in unanchored matches
byte[] prefixUTF8; // required UTF-8 prefix in unanchored matches
boolean prefixComplete; // true iff prefix is the entire regexp
int prefixRune; // first rune in prefix
// Cache of machines for running regexp. Forms a Treiber stack.
// The element type must be Machine: get() and put() read the head into a Machine
// variable and follow its `next` link, which does not compile against a raw reference.
private final AtomicReference<Machine> pooled = new AtomicReference<Machine>();
// Named capture groups, copied from the parsed Regexp by compileImpl().
// NOTE(review): declared as a raw Map here; presumably Map<String, Integer>
// (group name -> group index) -- confirm against Regexp.namedGroups before tightening.
public Map namedGroups;
/**
 * Compiles {@code expr} with {@link #compile} and copies the resulting state into this instance.
 *
 * <p>This is visible for testing.
 *
 * @param expr the regular expression source text
 * @throws PatternSyntaxException declared by {@link #compile}; propagated unchanged
 */
RE2(String expr) {
  RE2 re2 = RE2.compile(expr);
  // Copy everything.
  this.expr = re2.expr;
  this.prog = re2.prog;
  this.cond = re2.cond;
  this.numSubexp = re2.numSubexp;
  this.longest = re2.longest;
  this.prefix = re2.prefix;
  this.prefixUTF8 = re2.prefixUTF8;
  this.prefixComplete = re2.prefixComplete;
  this.prefixRune = re2.prefixRune;
  // compileImpl() also populates namedGroups; without this copy the field would stay null
  // on instances built through this constructor.
  this.namedGroups = re2.namedGroups;
}
/**
 * Builds an instance around an already compiled program. The start condition is taken from
 * {@code prog.startCond()}; the prefix fields are filled in afterwards by the caller.
 *
 * @param expr the source text of the expression
 * @param prog the compiled program
 * @param numSubexp the number of capturing subexpressions
 * @param longest whether leftmost-longest matching is requested
 */
private RE2(String expr, Prog prog, int numSubexp, boolean longest) {
  this.longest = longest;
  this.numSubexp = numSubexp;
  this.prog = prog;
  this.cond = prog.startCond();
  this.expr = expr;
}
/**
 * Parses {@code expr} using the {@link #PERL} flag set and returns an {@code RE2} that can be
 * used to match against text.
 *
 * <p>The resulting regexp reports the leftmost match and, among the candidates starting there,
 * the one a backtracking search would have found first ("leftmost-first"). These are the
 * semantics used by Perl, Python and others, but this package implements them without the
 * expense of backtracking. For POSIX leftmost-longest matching, see {@link #compilePOSIX}.
 *
 * @param expr the regular expression source text
 * @return the compiled expression
 * @throws PatternSyntaxException declared by {@link #compileImpl}; propagated unchanged
 */
static RE2 compile(String expr) throws PatternSyntaxException {
  final boolean leftmostLongest = false;
  return compileImpl(expr, PERL, leftmostLongest);
}
/**
 * Like {@link #compile}, but restricts the expression to POSIX ERE (egrep) syntax and switches
 * the match semantics to leftmost-longest.
 *
 * <p>When matching against text, the regexp returns a match that begins as early as possible
 * in the input (leftmost) and, among those, one that is as long as possible. This is the
 * behaviour of early regular expression implementations and the one POSIX specifies.
 *
 * <p>There can be several leftmost-longest matches that differ in their submatch choices, and
 * here this package diverges from POSIX. It picks the one a backtracking search would have
 * found first, whereas POSIX asks for the match that maximizes the length of the first
 * subexpression, then the second, and so on from left to right. The POSIX rule is
 * computationally prohibitive and not even well-defined. See
 * http://swtch.com/~rsc/regexp/regexp2.html#posix
 *
 * @param expr the regular expression source text
 * @return the compiled expression
 * @throws PatternSyntaxException declared by {@link #compileImpl}; propagated unchanged
 */
static RE2 compilePOSIX(String expr) throws PatternSyntaxException {
  final boolean leftmostLongest = true;
  return compileImpl(expr, POSIX, leftmostLongest);
}
/**
 * Parses, simplifies and compiles {@code expr}, then derives the literal-prefix fields
 * ({@code prefix}, {@code prefixUTF8}, {@code prefixComplete}, {@code prefixRune}) and copies
 * the named groups from the parsed expression.
 *
 * <p>Exposed to ExecTests.
 *
 * @param expr the regular expression source text
 * @param mode bitwise OR of the parser flags defined on this class (e.g. {@link #PERL})
 * @param longest whether the result should use leftmost-longest matching
 * @return the compiled expression
 * @throws PatternSyntaxException declared here; raised by the parse/compile steps
 */
static RE2 compileImpl(String expr, int mode, boolean longest) throws PatternSyntaxException {
  Regexp re = Parser.parse(expr, mode);
  int maxCap = re.maxCap(); // (may shrink during simplify)
  re = Simplify.simplify(re);
  Prog prog = Compiler.compileRegexp(re);
  RE2 re2 = new RE2(expr, prog, maxCap, longest);
  StringBuilder prefixBuilder = new StringBuilder();
  // prog.prefix() writes the prefix into the builder; its boolean result is stored as
  // prefixComplete.
  re2.prefixComplete = prog.prefix(prefixBuilder);
  re2.prefix = prefixBuilder.toString();
  try {
    re2.prefixUTF8 = re2.prefix.getBytes("UTF-8");
  } catch (UnsupportedEncodingException e) {
    // UTF-8 support is mandatory on every JVM. Keep the cause so that a broken runtime
    // is diagnosable instead of surfacing as a bare "can't happen".
    throw new IllegalStateException("can't happen", e);
  }
  if (!re2.prefix.isEmpty()) {
    re2.prefixRune = re2.prefix.codePointAt(0);
  }
  re2.namedGroups = re.namedGroups;
  return re2;
}
/**
 * Reports how many parenthesized subexpressions this regular expression contains.
 *
 * @return the value of {@code numSubexp}
 */
int numberOfCapturingGroups() {
  return this.numSubexp;
}
// get() pops a machine from |this|'s machine cache so it can be reused for a match,
// avoiding an allocation. Returns null when the cache is empty.
Machine get() {
  while (true) {
    Machine top = pooled.get();
    // Done when the stack is empty, or when we managed to swing the head to the next node.
    // Otherwise another thread changed the head in between: retry.
    if (top == null || pooled.compareAndSet(top, top.next)) {
      return top;
    }
  }
}
// Empties the machine cache by setting the head of the pooled stack to null.
void reset() {
  this.pooled.set(null);
}
// put() returns a machine to |this|'s machine cache. There is no attempt to
// limit the size of the cache, so it will grow to the maximum number of
// simultaneous matches run using |this|. (The cache empties when |this|
// gets garbage collected or reset is called.)
//
// m is the machine to push back onto the stack.
// isNew is true when m was freshly allocated by the caller and therefore has never been
// in the stack (see doExecute()); false when m was popped from the stack and is being reused.
@SuppressWarnings("MakeAlwaysEqual") // for ErrorProne, see below
void put(Machine m, boolean isNew) {
// To avoid allocation in the single-thread or uncontended case, reuse a node only if
// it was the only element in the stack when it was popped, and it's the only element
// in the stack when it's pushed back after use.
Machine head;
do {
head = pooled.get();
if (!isNew && head != null) {
// If an element had a null next pointer and it was previously in the stack, another thread
// might be trying to pop it out right now, and if it sees the same node now in the
// stack the pop will succeed, but the new top of the stack will be the stale (null) value
// of next. Allocate a new Machine so that the CAS will not succeed if this node has been
// popped and re-pushed.
m = new Machine(m);
isNew = true;
}
// Without this comparison, TSAN will complain about a race condition:
// Thread A, B, and C all attempt to do a match on the same pattern.
//
// A: Allocates Machine 1; executes match; put machine 1. State is now:
//
// pooled -> machine 1 -> null
//
// B reads pooled, sees machine 1
//
// C reads pooled, sees machine 1
//
// B successfully CASes pooled to null
//
// B executes match; put machine 1, which involves setting machine1.next to
// null (even though it's already null); preempted before CAS
//
// C resumes, and reads machine1.next in order to execute cas(head, head.next)
//
// There is no happens-before relationship between B's redundant null write
// and C's read, thus triggering TSAN.
//
// A future release of ErrorProne may want to make the assignment unconditionally. The
// @SuppressWarnings("MakeAlwaysEqual") on this method is intended to prevent that from happening.
if (m.next != head) {
m.next = head;
}
// Retry from the top if another thread changed the head since we read it.
} while (!pooled.compareAndSet(head, m));
}
/** Returns the source text this expression was compiled from. */
@Override
public String toString() {
  return this.expr;
}
// doExecute() runs a machine over the input, looking for the leftmost match starting the
// search at |pos| with the given anchoring, and returns the positions of its
// subexpressions (|ncap| capture slots are requested), or null if there is no match.
// Derived from exec.go.
private int[] doExecute(MachineInput in, int pos, int anchor, int ncap) {
  Machine cached = get();
  final Machine machine;
  final boolean fresh;
  // The Treiber stack cannot reuse nodes, unless the node to be reused has only ever been at
  // the bottom of the stack (i.e., next == null).
  if (cached == null) {
    machine = new Machine(this);
    fresh = true;
  } else if (cached.next != null) {
    machine = new Machine(cached);
    fresh = true;
  } else {
    machine = cached;
    fresh = false;
  }
  machine.init(ncap);
  int[] captures = null;
  if (machine.match(in, pos, anchor)) {
    captures = machine.submatches();
  }
  // Hand the machine back to the cache for later matches.
  put(machine, fresh);
  return captures;
}
/**
 * Tests whether this regexp matches anywhere in {@code s}.
 *
 * @param s the text to search
 * @return true iff an unanchored search starting at position 0 finds a match
 */
boolean match(CharSequence s) {
  MachineInput text = MachineInput.fromUTF16(s);
  int[] captures = doExecute(text, 0, UNANCHORED, 0);
  return captures != null;
}
/**
 * Convenience overload: wraps {@code input} with {@code MatcherInput.utf16} and delegates to
 * {@link #match(MatcherInput, int, int, int, int[], int)} with the remaining arguments
 * unchanged.
 */
boolean match(CharSequence input, int start, int end, int anchor, int[] group, int ngroup) {
  MatcherInput wrapped = MatcherInput.utf16(input);
  return match(wrapped, start, end, anchor, group, ngroup);
}
/**
 * Matches the regular expression against {@code input}, searching from position
 * {@code start} in an input truncated at position {@code end}, with the given anchoring.
 * On success the submatch boundaries are copied into {@code group} as [start, end) index
 * pairs. It is most efficient not to ask for submatch boundaries.
 *
 * @param input the input, either UTF-16 or UTF-8 encoded (see {@code input.getEncoding()})
 * @param start the position at which the search begins
 * @param end the end position of the input to consider
 * @param anchor the anchoring flag (UNANCHORED, ANCHOR_START, ANCHOR_BOTH)
 * @param group the array to fill with submatch positions; may be null to skip the copy
 * @param ngroup the number of index pairs requested ({@code 2 * ngroup} capture slots)
 * @return true if a match was found; false if not, or if {@code start > end}
 */
boolean match(MatcherInput input, int start, int end, int anchor, int[] group, int ngroup) {
  if (start > end) {
    return false;
  }
  // TODO(afrozm): We suspect that the correct code should look something
  // like the following:
  // doExecute(MachineInput.fromUTF16(input), start, anchor, 2*ngroup);
  //
  // In Russ' own words:
  // That is, I believe doExecute needs to know the bounds of the whole input
  // as well as the bounds of the subpiece that is being searched.
  final MachineInput machineInput;
  if (input.getEncoding() == Encoding.UTF_16) {
    machineInput = MachineInput.fromUTF16(input.asCharSequence(), 0, end);
  } else {
    machineInput = MachineInput.fromUTF8(input.asBytes(), 0, end);
  }
  int[] found = doExecute(machineInput, start, anchor, 2 * ngroup);
  boolean matched = found != null;
  if (matched && group != null) {
    System.arraycopy(found, 0, group, 0, found.length);
  }
  return matched;
}
/**
 * Tests whether this regexp matches anywhere in the UTF-8 byte array {@code b}.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded text to search
 * @return true iff an unanchored search starting at position 0 finds a match
 */
boolean matchUTF8(byte[] b) {
  MachineInput bytes = MachineInput.fromUTF8(b);
  int[] captures = doExecute(bytes, 0, UNANCHORED, 0);
  return captures != null;
}
/**
 * One-shot helper: compiles the textual regular expression {@code pattern} and reports
 * whether it matches string {@code s}.
 *
 * <p>More complicated queries need to use {@link #compile} and the full {@code RE2} interface.
 *
 * <p>This is visible for testing.
 *
 * @param pattern the regular expression source text
 * @param s the text to search
 * @return true iff the compiled pattern matches {@code s}
 * @throws PatternSyntaxException declared by {@link #compile}; propagated unchanged
 */
static boolean match(String pattern, CharSequence s) throws PatternSyntaxException {
  RE2 compiled = compile(pattern);
  return compiled.match(s);
}
// This is visible for testing.
// Callback used by replaceAllFunc() to compute the replacement text for a single match.
interface ReplaceFunc {
// Receives the matched substring and returns the text to put in its place.
String replace(String orig);
}
/**
 * Returns a copy of {@code src} in which all matches for this regexp have been replaced by
 * {@code repl}. No support is provided for expressions (e.g. {@code \1} or {@code $1}) in the
 * replacement string.
 *
 * <p>This is visible for testing.
 *
 * @param src the text to search
 * @param repl the literal replacement inserted for every match
 * @return the text with every match replaced
 */
String replaceAll(String src, final String repl) {
  // No real limit is wanted here. replaceAllFunc() advances its search position by at least
  // one on every iteration, so it performs at most src.length() + 1 replacements anyway.
  // The previous bound, 2 * src.length() + 1, overflowed to a negative number for inputs of
  // 2^30 chars or more, which made replaceAllFunc() stop after its first iteration.
  return replaceAllFunc(
      src,
      new ReplaceFunc() {
        @Override
        public String replace(String orig) {
          return repl;
        }
      },
      Integer.MAX_VALUE);
}
/**
 * Returns a copy of {@code src} in which only the first match for this regexp has been
 * replaced by {@code repl}. Expressions such as {@code \1} or {@code $1} in the replacement
 * string are not interpreted.
 *
 * <p>This is visible for testing.
 *
 * @param src the text to search
 * @param repl the literal replacement text
 * @return the text with at most one match replaced
 */
String replaceFirst(String src, final String repl) {
  // The callback ignores the matched text and always yields the fixed replacement.
  ReplaceFunc constant =
      new ReplaceFunc() {
        @Override
        public String replace(String orig) {
          return repl;
        }
      };
  return replaceAllFunc(src, constant, 1);
}
/**
 * Returns a copy of {@code src} in which at most {@code maxReplaces} matches for this regexp
 * have been replaced by the return value of function {@code repl} (whose argument is the
 * matched string). No support is provided for expressions (e.g. {@code \1} or {@code $1}) in
 * the replacement string.
 *
 * <p>This is visible for testing.
 *
 * @param src the text to search
 * @param repl callback producing the replacement for each matched substring
 * @param maxReplaces upper bound on the number of replacements; if it is zero or negative no
 *     replacement is made and the result equals {@code src}
 * @return the text with the replacements applied
 */
String replaceAllFunc(String src, ReplaceFunc repl, int maxReplaces) {
  int lastMatchEnd = 0; // end position of the most recent match
  int searchPos = 0; // position where we next look for a match
  StringBuilder buf = new StringBuilder();
  MachineInput input = MachineInput.fromUTF16(src);
  int numReplaces = 0;
  // The replacement budget is checked before each search, so a budget of zero (or less)
  // never triggers a replacement; checking only after a replacement let one slip through.
  while (numReplaces < maxReplaces && searchPos <= src.length()) {
    int[] a = doExecute(input, searchPos, UNANCHORED, 2);
    if (a == null || a.length == 0) {
      break; // no more matches
    }
    // Copy the unmatched characters before this match.
    buf.append(src, lastMatchEnd, a[0]);
    // Now insert a copy of the replacement string, but not for a
    // match of the empty string immediately after another match.
    // (Otherwise, we get double replacement for patterns that
    // match both empty and nonempty strings.)
    // FIXME(adonovan), FIXME(afrozm) - JDK seems to be doing exactly this
    // put a replacement for a pattern that also matches empty and non-empty
    // strings. The fix would not just be a[1] >= lastMatchEnd, there are a
    // few corner cases in that as well, and there are tests which will fail
    // when that case is touched (happens only at the end of the input string
    // though).
    if (a[1] > lastMatchEnd || a[0] == 0) {
      buf.append(repl.replace(src.substring(a[0], a[1])));
      // Increment the replace count.
      ++numReplaces;
    }
    lastMatchEnd = a[1];
    // Advance past this match; always advance at least one character.
    // The low three bits of step()'s result are used as the width of the rune at searchPos.
    int width = input.step(searchPos) & 0x7;
    if (searchPos + width > a[1]) {
      searchPos += width;
    } else if (searchPos + 1 > a[1]) {
      // This clause is only needed at the end of the input
      // string. In that case, DecodeRuneInString returns width=0.
      searchPos++;
    } else {
      searchPos = a[1];
    }
  }
  // Copy the unmatched characters after the last match.
  buf.append(src, lastMatchEnd, src.length());
  return buf.toString();
}
/**
 * Escapes every regular expression metacharacter in {@code s} with a backslash, so that the
 * result is a regular expression matching the literal text. For example,
 * {@code quoteMeta("[foo]").equals("\\[foo\\]")}.
 *
 * @param s the literal text to quote
 * @return {@code s} with a backslash inserted before each metacharacter
 */
static String quoteMeta(String s) {
  final String metachars = "\\.+*?()|[]{}^$";
  final int n = s.length();
  StringBuilder quoted = new StringBuilder(2 * n);
  // Iterating over chars is sufficient: every metacharacter fits in one UTF-16 code unit.
  int i = 0;
  while (i < n) {
    char ch = s.charAt(i++);
    if (metachars.indexOf(ch) != -1) {
      quoted.append('\\');
    }
    quoted.append(ch);
  }
  return quoted.toString();
}
// The program may yield fewer capture values than the regexp has capturing
// expressions. For example, "(a){0}" turns into an empty program, so the
// maximum capture in the program is 0 but we still need to return an
// entry for \1. pad() therefore extends |a| to (1 + numSubexp) * 2 slots,
// filling the added slots with -1. Returns null for null (no match); when
// no padding is needed the very same array is returned.
private int[] pad(int[] a) {
  if (a == null) {
    return null; // No match.
  }
  final int wanted = 2 * (numSubexp + 1);
  if (a.length >= wanted) {
    return a;
  }
  int[] padded = Arrays.copyOf(a, wanted);
  Arrays.fill(padded, a.length, wanted, -1);
  return padded;
}
// Callback through which allMatches() hands each accepted match to its caller.
private interface DeliverFunc {
// Called iteratively with a list of submatch indices in the same
// unit as the MachineInput cursor.
void deliver(int[] x);
}
// Find matches in input.
//
// Repeatedly runs an unanchored search over |input| and passes each accepted match
// (padded by pad()) to |deliver|. At most |n| matches are delivered; a negative |n|
// is replaced by input.endPos() + 1. An empty match that starts exactly where the
// previous match ended is skipped.
private void allMatches(MachineInput input, int n, DeliverFunc deliver) {
int end = input.endPos();
if (n < 0) {
n = end + 1;
}
// pos: where the next search starts; i: matches delivered so far;
// prevMatchEnd: end of the previous match, or -1 before the first one.
for (int pos = 0, i = 0, prevMatchEnd = -1; i < n && pos <= end; ) {
int[] matches = doExecute(input, pos, UNANCHORED, prog.numCap);
if (matches == null || matches.length == 0) {
break;
}
boolean accept = true;
if (matches[1] == pos) {
// We've found an empty match.
if (matches[0] == prevMatchEnd) {
// We don't allow an empty match right
// after a previous match, so ignore it.
accept = false;
}
// Step over one rune so the next search makes progress.
int r = input.step(pos);
if (r < 0) { // EOF
pos = end + 1;
} else {
// The low three bits of step()'s result are used as the width to advance by.
pos += r & 0x7;
}
} else {
pos = matches[1];
}
prevMatchEnd = matches[1];
if (accept) {
deliver.deliver(pad(matches));
i++;
}
}
}
// Legacy Go-style interface; preserved (package-private) for better
// test coverage.
//
// There are 16 methods of RE2 that match a regular expression and
// identify the matched text. Their names are matched by this regular
// expression:
//
// find(All)?(UTF8)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a
// preceding match are ignored. The return value is an array
// containing the successive return values of the corresponding
// non-All routine. These routines take an extra integer argument, n;
// if n >= 0, the function returns at most n matches/submatches.
//
// If 'UTF8' is present, the argument is a UTF-8 encoded byte[] array;
// otherwise it is a UTF-16 encoded java.lang.String; return values
// are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a list identifying
// the successive submatches of the expression. Submatches are
// matches of parenthesized subexpressions within the regular
// expression, numbered from left to right in order of opening
// parenthesis. Submatch 0 is the match of the entire expression,
// submatch 1 the match of the first parenthesized subexpression, and
// so on.
//
// If 'Index' is present, matches and submatches are identified by
// byte index pairs within the input string: result[2*n:2*n+1]
// identifies the indexes of the nth submatch. The pair for n==0
// identifies the match of the entire expression. If 'Index' is not
// present, the match is identified by the text of the match/submatch.
// If an index is negative, it means that subexpression did not match
// any string in the input.
/**
 * Finds the leftmost match of this regular expression in {@code b} and returns its text.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @return the bytes of the match, or null if there is no match
 */
byte[] findUTF8(byte[] b) {
  int[] span = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, 2);
  return span == null ? null : Utils.subarray(b, span[0], span[1]);
}
/**
 * Locates the leftmost match of this regular expression in {@code b}. The match itself is at
 * {@code b[loc[0]...loc[1]]}.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @return a two-element array holding the match location, or null if there is no match
 */
int[] findUTF8Index(byte[] b) {
  int[] span = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, 2);
  return span == null ? null : Utils.subarray(span, 0, 2);
}
/**
 * Finds the leftmost match of this regular expression in {@code s} and returns its text.
 *
 * <p>An empty string is returned when there is no match, but also when the expression
 * successfully matches an empty string. Use {@link #findIndex} or {@link #findSubmatch} if
 * it is necessary to distinguish these cases.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @return the matched text, or {@code ""} if there is no match
 */
String find(String s) {
  int[] span = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 2);
  if (span != null) {
    return s.substring(span[0], span[1]);
  }
  return "";
}
/**
 * Locates the leftmost match of this regular expression in {@code s}. The match itself is at
 * {@code s.substring(loc[0], loc[1])}.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @return the array produced by the match for two capture slots, or null if there is no match
 */
int[] findIndex(String s) {
  MachineInput text = MachineInput.fromUTF16(s);
  return doExecute(text, 0, UNANCHORED, 2);
}
/**
 * Returns an array of byte arrays holding the text of the leftmost match of the regular
 * expression in {@code b} and the matches, if any, of its subexpressions, as defined by the
 * Submatch description above. Entries for subexpressions that did not participate in the
 * match are left null.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @return an array of {@code 1 + numSubexp} entries, or null if there is no match
 */
byte[][] findUTF8Submatch(byte[] b) {
  int[] caps = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, prog.numCap);
  if (caps == null) {
    return null;
  }
  final int groups = 1 + numSubexp;
  byte[][] texts = new byte[groups][];
  // lo is the index of the start offset of group g inside caps.
  for (int g = 0, lo = 0; g < groups; g++, lo += 2) {
    if (lo < caps.length && caps[lo] >= 0) {
      texts[g] = Utils.subarray(b, caps[lo], caps[lo + 1]);
    }
  }
  return texts;
}
/**
 * Returns an array holding the index pairs identifying the leftmost match of this regular
 * expression in {@code b} and the matches, if any, of its subexpressions, as defined by the
 * Submatch and Index descriptions above.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @return the index pairs, padded with -1 by {@code pad}, or null if there is no match
 */
int[] findUTF8SubmatchIndex(byte[] b) {
  int[] caps = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, prog.numCap);
  return pad(caps);
}
/**
 * Returns an array of strings holding the text of the leftmost match of the regular
 * expression in {@code s} and the matches, if any, of its subexpressions, as defined by the
 * Submatch description above. Entries for subexpressions that did not participate in the
 * match are left null.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @return an array of {@code 1 + numSubexp} entries, or null if there is no match
 */
String[] findSubmatch(String s) {
  int[] caps = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, prog.numCap);
  if (caps == null) {
    return null;
  }
  final int groups = 1 + numSubexp;
  String[] texts = new String[groups];
  // lo is the index of the start offset of group g inside caps.
  for (int g = 0, lo = 0; g < groups; g++, lo += 2) {
    if (lo < caps.length && caps[lo] >= 0) {
      texts[g] = s.substring(caps[lo], caps[lo + 1]);
    }
  }
  return texts;
}
/**
 * Returns an array holding the index pairs identifying the leftmost match of this regular
 * expression in {@code s} and the matches, if any, of its subexpressions, as defined by the
 * Submatch description above.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @return the index pairs, padded with -1 by {@code pad}, or null if there is no match
 */
int[] findSubmatchIndex(String s) {
  int[] caps = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, prog.numCap);
  return pad(caps);
}
/**
 * {@code findAllUTF8()} is the All version of {@link #findUTF8}; it returns a list of up to
 * {@code n} successive matches of the expression, as defined by the All description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>TODO(adonovan): think about defining a byte slice view class, like a read-only Go slice
 * backed by |b|.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to return; negative means no limit
 * @return the matched byte ranges in order, or null if there is no match
 */
List<byte[]> findAllUTF8(final byte[] b, int n) {
  // Element type spelled out (was a raw List) so callers get byte[] without unchecked casts.
  final List<byte[]> result = new ArrayList<byte[]>();
  allMatches(
      MachineInput.fromUTF8(b),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(Utils.subarray(b, match[0], match[1]));
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllUTF8Index} is the All version of {@link #findUTF8Index}; it returns a list of
 * up to {@code n} successive matches of the expression, as defined by the All description
 * above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to return; negative means no limit
 * @return two-element location arrays in order, or null if there is no match
 */
List<int[]> findAllUTF8Index(final byte[] b, int n) {
  // Element type spelled out (was a raw List) so callers get int[] without unchecked casts.
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(
      MachineInput.fromUTF8(b),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(Utils.subarray(match, 0, 2));
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAll} is the All version of {@link #find}; it returns a list of up to {@code n}
 * successive matches of the expression, as defined by the All description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @param n the maximum number of matches to return; negative means no limit
 * @return the matched substrings in order, or null if there is no match
 */
List<String> findAll(final String s, int n) {
  // Element type spelled out (was a raw List) so callers get String without unchecked casts.
  final List<String> result = new ArrayList<String>();
  allMatches(
      MachineInput.fromUTF16(s),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(s.substring(match[0], match[1]));
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllIndex} is the All version of {@link #findIndex}; it returns a list of up to
 * {@code n} successive matches of the expression, as defined by the All description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @param n the maximum number of matches to return; negative means no limit
 * @return two-element location arrays in order, or null if there is no match
 */
List<int[]> findAllIndex(String s, int n) {
  // Element type spelled out (was a raw List) so callers get int[] without unchecked casts.
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(
      MachineInput.fromUTF16(s),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(Utils.subarray(match, 0, 2));
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllUTF8Submatch} is the All version of {@link #findUTF8Submatch}; it returns a
 * list of up to {@code n} successive matches of the expression, as defined by the All
 * description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to return; negative means no limit
 * @return one {@code byte[][]} per match (null entries for groups that did not match), or
 *     null if there is no match
 */
List<byte[][]> findAllUTF8Submatch(final byte[] b, int n) {
  // Element type spelled out (was a raw List) so callers get byte[][] without unchecked casts.
  final List<byte[][]> result = new ArrayList<byte[][]>();
  allMatches(
      MachineInput.fromUTF8(b),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          // match holds [start, end) pairs; a negative start marks an unmatched group.
          byte[][] slice = new byte[match.length / 2][];
          for (int j = 0; j < slice.length; ++j) {
            if (match[2 * j] >= 0) {
              slice[j] = Utils.subarray(b, match[2 * j], match[2 * j + 1]);
            }
          }
          result.add(slice);
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllUTF8SubmatchIndex} is the All version of {@link #findUTF8SubmatchIndex}; it
 * returns a list of up to {@code n} successive matches of the expression, as defined by the
 * All description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to return; negative means no limit
 * @return the padded index-pair arrays in order, or null if there is no match
 */
List<int[]> findAllUTF8SubmatchIndex(byte[] b, int n) {
  // Element type spelled out (was a raw List) so callers get int[] without unchecked casts.
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(
      MachineInput.fromUTF8(b),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(match);
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllSubmatch} is the All version of {@link #findSubmatch}; it returns a list of
 * up to {@code n} successive matches of the expression, as defined by the All description
 * above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @param n the maximum number of matches to return; negative means no limit
 * @return one {@code String[]} per match (null entries for groups that did not match), or
 *     null if there is no match
 */
List<String[]> findAllSubmatch(final String s, int n) {
  // Element type spelled out (was a raw List) so callers get String[] without unchecked casts.
  final List<String[]> result = new ArrayList<String[]>();
  allMatches(
      MachineInput.fromUTF16(s),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          // match holds [start, end) pairs; a negative start marks an unmatched group.
          String[] slice = new String[match.length / 2];
          for (int j = 0; j < slice.length; ++j) {
            if (match[2 * j] >= 0) {
              slice[j] = s.substring(match[2 * j], match[2 * j + 1]);
            }
          }
          result.add(slice);
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
/**
 * {@code findAllSubmatchIndex} is the All version of {@link #findSubmatchIndex}; it returns a
 * list of up to {@code n} successive matches of the expression, as defined by the All
 * description above.
 *
 * <p>A return value of null indicates no match.
 *
 * <p>This is visible for testing.
 *
 * @param s the input string
 * @param n the maximum number of matches to return; negative means no limit
 * @return the padded index-pair arrays in order, or null if there is no match
 */
List<int[]> findAllSubmatchIndex(String s, int n) {
  // Element type spelled out (was a raw List) so callers get int[] without unchecked casts.
  final List<int[]> result = new ArrayList<int[]>();
  allMatches(
      MachineInput.fromUTF16(s),
      n,
      new DeliverFunc() {
        @Override
        public void deliver(int[] match) {
          result.add(match);
        }
      });
  if (result.isEmpty()) {
    return null;
  }
  return result;
}
}