regexodus.Pattern Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jtransc-rt Show documentation
Show all versions of jtransc-rt Show documentation
JVM AOT compiler currently generating JavaScript, C++, Haxe, with initial focus on Kotlin and games.
/**
* Copyright (c) 2001, Sergey A. Samokhodkin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form
* must reproduce the above copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided with the distribution.
* - Neither the name of jregex nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @version 1.2_01
*/
package regexodus;
import com.jtransc.annotation.JTranscInvisible;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.HashMap;
/**
* A handle for a precompiled regular expression; core operations should be identical to java.util.regex.Pattern .
* Pattern should be no different.
*
* To match a regular expression myExpr against a text myString one should first
* create a Pattern object:
* Pattern p = new Pattern(myExpr);
*
* or
* Pattern p = Pattern.compile(myExpr);
*
* then obtain a Matcher object:
* Matcher matcher=p.matcher(myText);
*
* The latter is an automaton that actually performs a search. It provides the following methods:
*
* - search for matching substrings : matcher.find() or matcher.findAll();
* - test whether the text matches the whole pattern : matcher.matches();
* - test whether the text matches the beginning of the pattern : matcher.matchesPrefix();
* - search with custom options : matcher.find(int options);
* - replace simply : matcher.replaceFirst(String) or matcher.replaceAll(String) or matcher.replaceAmount(String, int);
* - for more complex replacement or to obtain a Replacer that should last, use the Pattern : p.replacer()
*
*
* Flags
*
* Flags (see REFlags interface) change the meaning of some regular expression elements at compile-time. Only Unicode
* matching (for predefined char classes like \\w; doesn't affect user-defined Unicode character classes) is
* enabled by default, but specifying the flags manually disregards the defaults.
* These flags may be passed both as string(see Pattern(String,String)) and as bitwise OR of:
*
* - REFlags.IGNORE_CASE - enables case insensitivity
* - REFlags.MULTILINE - forces "^" and "$" to match both at the start and the end of line;
* - REFlags.DOTALL - forces "." to match eols('\r' and '\n' in ASCII);
* - REFlags.IGNORE_SPACES - literal spaces in expression are ignored for better readability;
* - REFlags.UNICODE - the predefined classes('\w','\d',etc) are referenced to Unicode;
* - REFlags.XML_SCHEMA - permits XML Schema regular expressions syntax extensions.
*
*
* Multithreading
* Pattern instances are not thread-safe, and neither are Matcher objects.
*
* Special Syntax
*
* RegExodus adds some features to Java's standard regexes and may not implement some rarely-used features, e.g.
* character class intersections. Syntax is mostly similar to PCRE's regexes, which Java's are also based on.
*
* Here's all of it, as best as I can supply at 3 in the morning:
*
* - A single character, unless it is a metacharacter, represents itself. "a" will match "a".
* - Metacharacters affect regex behavior, and include:
*
* - '.' matches any single character on the same line (not a newline or carriage return), unless the DOTALL
* flag is enabled, in which it matches any character, including newlines and carriage returns.
* - '^' means the start of the searched text.
* - '$' means the end of the searched text, and has different meaning in a replacement string.
* - '*' means the previous term can be repeated 0 or more times.
* - '+' means the previous term can be repeated 1 or more times. Its meaning as a "possessive" modifier for
* repeating terms is not supported.
* - '?' means the previous term can be repeated 0 or 1 times. It can also be placed after a repeating term,
* like "*", "+", or "?", to make it "reluctant." It also has special meaning in parentheses.
* - '{' and '}' should be in matching pairs, and usually have an integer between them or two integers separated by a comma.
* One integer with no comma means the previous term should be repeated that many times exactly.
* Two integers, "{a,b}" with a comma mean the previous term should be repeated anywhere between a and b times, inclusive.
* One integer with a comma on either side acts as the two-integer case with no bound where the integer of a and b has been omitted.
* - '(' and ')' in a matching pair define a group, and groups have extensive syntax of their own.
* - '[' and ']' in a matching pair define a character class, which can match a single occurrence of one of
* multiple possible characters, possibly represented as ranges of characters. "[abc]" will match "a", "b", and
* "c". "[A-Z]" will match all upper-case English letters (it will not behave as intended if another language
* finds its way into your text). You can use shorthands defined with a backslash, listed below, in a character
* class most of the time (not \\b or other boundary matchers, nor \\Q and \\E). This means you could match all
* upper-case letters in just about any language with "[\\p{Lu}]", which is the same as without the
* brackets, but you could add additional parts to that, such as "[\\p{Lu}_]+" to match strings like
* "ABACUS_ÆTHỲŔ" (the '+' allows matching the whole string, and the extra '_' in the character class allows the
* whole thing to match). If a character class starts with '^', it negates the character class, matching any
* char that isn't one of the characters in the character class. If you need to match a
* - '\\' (the single backslash, which usually needs to be escaped in source) can be used as an escape for
* other regex-specific terms, or to match a backreference to an earlier group. Backreferences are augmented in
* RegExodus, and you can do some useful and uncommon things with them. They are documented later, with groups.
* Regex-specific terms include the common \\w to match a word character (alphanumeric or underscore, by default
* respecting Unicode), \\b to match a boundary between a word character and a non-word character or the
* start/end of the text, \\d to match a digit, \\s to match a space, and capitalized versions of the previous
* ones mean the negation of them (like \\D is any non-digit character). You can also use \\x?? where ? is a hex
* digit to produce a char between Unicode 0 and 255, \\o???? where you can have 1 or more ? that are octal
* digits to produce a char by its code point as an octal number, \\m????? where you can have between 1 and 5
* decimal digits to produce a char by its code point as a normal base-10 number, and also \\u???? with exactly
* 4 hex digits (similar to the escaping that Java will do on its own if you don't escape the backslash) and
* \\x{????} to do basically the same thing with a variable amount of ? as hex digits, but no more
* than 4 digits. In addition to the predefined character classes like \\w, there are Unicode categories,
* accessible with \\p? or \\p{??} , where the single ? is a group of categories like L for letters,
* P for punctuation, or N for numbers, and an upper-case-then-lower-case pair of ? in curly braces is an
* individual category like Ll for lower-case letters, Nd for decimal numbers, or Sc for currency symbols.
* There's a good list here, http://www.regular-expressions.info/unicode.html ; only Cased_Letter is not
* supported of the list of categories. RegExodus also supports Zh and Zv for horizontal and vertical spacing
* characters, respectively. Some other \\p... features may be supported, but not necessarily for the same
* version of Unicode that the categories support (Unicode 8.0.0 for categories here, in standard Java it isn't
* expected until Java 9). \\< and \\> can be used to match the start or end of a word (similar to \\b
* but only for one side). \\Q starts a literal escape in which metacharacters lose their special treatment and
* are matched like normal characters; this escape ends when the sequence \\E is reached. \\h, \\H, \\v, and \\V
* match horizontal whitespace, non-(horizontal whitespace), vertical whitespace, and non-(vertical whitespace),
* respectively; they behave like Java 8's handling of these escapes and not earlier versions (which matched a
* specific vertical tab character with \\v instead of all vertical whitespace). Java 8's \\R escape, which
* matches all line separators known in use, is not supported yet, but you can copy its behavior with
* "(?>\\r\\n|[\\n\\cK\\f\\r\\u0085\\u2028\\u2029])" .
*
*
*
* - Groups are, as in most regex flavors, rather complicated. You can create a group with "(something)", which
* would match "something" and also store the text that matched the parenthesized section as a numbered group, here,
* group 1. Referencing these groups is covered next; RegExodus adds some features to backreferences that aren't
* present elsewhere. You can also name a group with the syntax "({NAME}something)", which would again
* match "something" but this time would have the name "NAME", which allows you to reassign the contents of the
* remembered group's match for the purposes of later regexes. Reassignment uses the syntax
* "({=NAME}something)", and can be done even if the group called "NAME" hasn't been found yet. If an
* earlier match to a repeated group like "(({=NAME}a+) ?)+" matched the "aa" in "aa a aaa aaa", finding
* all matches (the outer group with a '+') would finally cause the group with the name "NAME" to have the value
* "aaa", the last match to the named group. Groups can have many other kinds of special syntax, usually starting
* with a question mark as a metacharacter just after the opening parenthesis:
*
* - "?:" means a "plain group"; one that only keeps its contents together as a single unit for the purposes
* of repetition and other things like it, but doesn't remember the matched text for backreferences.
* - "?=" means "positive lookahead"; it doesn't consume the text it matches but does require that text to be
* ahead for the regex to succeed.
* - "?<=" means "positive lookbehind"; it doesn't consume the text it matches but does require that text to be
* before the next part of the regex for the regex to succeed.
* - "?!" means "negative lookahead"; it doesn't consume the text it matches but does require that text to NOT
* be ahead for the regex to succeed.
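* - "?<!" means "negative lookbehind"; it doesn't consume the text it matches but does require that text to NOT
* be before the next part of the regex for the regex to succeed.
* - "?>" means an "atomic group"; it acts like a plain group in that it doesn't remember the matched text for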
* backreferences, but it also tracks backreferences independently in itself, temporarily forgetting whatever is
* outside the independent, atomic group.
* - "?#" means a comment in the regex; anything up until the closing parenthesis is ignored.
* - "?imsuxX", where all of the letters are optional, changes the mode of the regex after that point. The "i"
* makes it case-insensitive, "m" makes "^" and "$" match at the start and end of each line, instead of only
* the start and end of the whole text. "s" makes "." match all characters, even line endings. "u" turns on Unicode
* handling for escapes like "\\w" and "\\b". "x" makes whitespace ignored in the regex, which can help
* legibility sometimes. "X" (capitalized) makes XML Schema terms allowed in "\\p" categories; "X" might not
* currently be working. You can add a ":" (and some text to match) after the letters to make a plain group that
* matches that text with the specified modes enabled, only for that group.
* - "?(...)" means a "conditional group"; I actually don't know what this does since it was present in JRegex
* before I forked it to make RegExodus. It seems to be a non-standard extension.
* - "?[...]" means a "class group"; I actually don't know what this does since it was present in JRegex
* before I forked it to make RegExodus. It seems to be a non-standard extension.
*
*
* - If a group matched some text, you may want to refer to that match later in the regex, or use it during a
* replacement. In replacement strings or in the matching regex itself, you can refer to what a group matched,
* though with slightly different syntax. In replacement strings, you can refer to a numbered group with "$?" where
* ? is a base-10 number with any number of digits, starting at 1. In a regex, you can make a backreference to an
* earlier group and what it matched with "\\?", where ? is again a base-10 number and refers to the text matched by
* the specified numbered group earlier in the regex. This could be used to match the text "HEY HEY" or "WHAT WHAT"
* with the example pattern "([A-Z]+) \\1", which would match the previous two examples but not "HEY WHAT". You can
* match a named group with the longer syntax "{\\NAME}" in a pattern or "${NAME}" in a
* replacement string. This syntax also works with numbered groups (change "NAME" to "1", for example), and enables
* the special augmentations RegExodus adds:
*
* - Immediately before the name or number of the group ("NAME" in the examples), you can place one or more punctuation
* characters that (only in this context) change what the backreference is considered equivalent to.
* - An at sign, {@literal @}, before the name or number of the group makes the match between the
* backreference and matched group case-insensitive. In replacement strings, this always makes the replacement
* lower-case.
* - A slash, {@literal /}, before the name or number of the group makes the backreference match the group
* in reverse order, or reverses the group when used in a replacement string. This may have bad behavior with
* Unicode chars outside the first 65536 (the Basic Multilingual Plane), such as emoji.
* - A colon, {@literal :}, before the name or number of the group makes any opening or closing parentheses
* or bracket-like characters match their closing or opening counterpart. Thus, "(" would be replaced with ")"
* and "〖〗" would be replaced with "〗〖" (it handles practically all of the Unicode Ps and Pe categories).
*
* - Anywhere from none of these modifiers to all 3 can appear in a backreference; the order doesn't matter.
*
*
*
*
*
* I hope that answers at least some questions about the syntax extensions RegExodus makes.
*
* @see REFlags
* @see Matcher
* @see Matcher#setTarget(java.lang.CharSequence)
* @see Matcher#setTarget(java.lang.CharSequence, int, int)
* @see Matcher#setTarget(char[], int, int)
* @see Matcher#setTarget(java.io.Reader, int)
* @see MatchResult
* @see MatchResult#group(int)
* @see MatchResult#start(int)
* @see MatchResult#end(int)
* @see MatchResult#length(int)
* @see MatchResult#charAt(int, int)
* @see MatchResult#prefix()
* @see MatchResult#suffix()
*/
@JTranscInvisible
public class Pattern implements Serializable, REFlags {
private static final long serialVersionUID = -3628346657932720807L;
// the regular expression text this Pattern was last compiled from (assigned in internalCompile)
String stringRepr;
// tree entry
Term root, root0;
// required number of memory slots
int memregs;
// required number of iteration counters
int counters;
// number of lookahead groups
int lookaheads;
// group name -> numeric group id; parameterized so that groupId(String) can hand back an Integer
// (a raw HashMap yields Object from get(), which does not convert to Integer)
HashMap<String, Integer> namedGroupMap;
// true when the IGNORE_CASE bit was present in the flags of the last compilation
boolean caseless = false;
/**
 * Creates a Pattern without compiling anything; every field keeps its default value.
 *
 * @throws PatternSyntaxException declared in the signature, although this empty body never raises it itself
 */
protected Pattern() throws PatternSyntaxException {
    // intentionally empty
}
/**
 * Compiles an expression using the default flags ({@code DEFAULT}).
 *
 * @param regex the Perl5-compatible regular expression string.
 * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
 * @see Pattern#Pattern(java.lang.String, java.lang.String)
 * @see Pattern#Pattern(java.lang.String, int)
 */
public Pattern(String regex) throws PatternSyntaxException {
    // delegate to the (String, int) constructor with the default flag mask
    this(regex, DEFAULT);
}
/**
* Compiles a regular expression using Perl5-style flags.
* The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen or plus.
* The meaning of letters:
*
* - i - case insensitivity, corresponds to REFlags.IGNORE_CASE;
* - m - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag;
* - s - single line treatment('.' matches \r's and \n's),corresponds to REFlags.DOTALL;
* - x - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES.
* - u - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty.
* - X - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA.
* - - - turn off the specified flags; normally has no effect unless something adds the flags.
* - + - turn on the specified flags; normally is no different from just using the letters.
*
*
* @param regex the Perl5-compatible regular expression string.
* @param flags the Perl5-compatible flags.
* @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
* see REFlags
*/
public Pattern(String regex, String flags) throws PatternSyntaxException {
internalCompile(regex, parseFlags(flags));
}
/**
 * Compiles a regular expression using REFlags.
 * The {@code flags} parameter is a bitwise OR of the following values:
 * <ul>
 * <li>REFlags.IGNORE_CASE - case insensitivity, corresponds to the 'i' letter;</li>
 * <li>REFlags.MULTILINE - multiline treatment (BOLs and EOLs affect the '^' and '$'), corresponds to 'm';</li>
 * <li>REFlags.DOTALL - single line treatment ('.' matches \r's and \n's), corresponds to 's';</li>
 * <li>REFlags.IGNORE_SPACES - extended whitespace comments (spaces and eols in the expression are ignored),
 * corresponds to 'x';</li>
 * <li>REFlags.UNICODE - predefined classes are regarded as belonging to Unicode, corresponds to 'u';
 * this may yield some performance penalty;</li>
 * <li>REFlags.XML_SCHEMA - compatibility with XML Schema, corresponds to 'X'.</li>
 * </ul>
 *
 * @param regex the Perl5-compatible regular expression string.
 * @param flags a bitwise OR of REFlags constants.
 * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
 * @see REFlags
 */
private Pattern(String regex, int flags) throws PatternSyntaxException {
    // the mask is passed through unchanged
    internalCompile(regex, flags);
}
/**
 * Sets this Pattern's flags from the char-per-flag representation of regex flags by recompiling the
 * stored expression text. Removes flags set earlier.
 * The flag string should consist of the letters 'i','m','s','x','u','X' (case is significant)
 * and a hyphen or plus. The meaning of each character:
 * <ul>
 * <li>i - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li>
 * <li>m - multiline treatment (BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE;</li>
 * <li>s - single line treatment ('.' matches \r's and \n's), corresponds to REFlags.DOTALL;</li>
 * <li>x - extended whitespace comments (spaces and eols in the expression are ignored),
 * corresponds to REFlags.IGNORE_SPACES;</li>
 * <li>u - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE;
 * this may yield some performance penalty;</li>
 * <li>X - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA;</li>
 * <li>- - turn off the flags that follow;</li>
 * <li>+ - turn on the flags that follow (the same as just using the letters).</li>
 * </ul>
 *
 * @param flags a String that stores various flags as chars
 */
public void setFlags(String flags)
{
    // convert the letters to a bitmask, then rebuild this Pattern from its own source text
    final int flagMask = parseFlags(flags);
    internalCompile(stringRepr, flagMask);
}
/**
 * Sets this Pattern's flags from the bitmask-style int representation of regex flags by recompiling the
 * stored expression text. Removes flags set earlier.
 * Flag constants can be found in REFlags; UNICODE is enabled normally but is not automatically turned on here.
 * The {@code flags} parameter is a bitwise OR of the following values:
 * <ul>
 * <li>REFlags.IGNORE_CASE - case insensitivity, corresponds to the 'i' letter;</li>
 * <li>REFlags.MULTILINE - multiline treatment (BOLs and EOLs affect the '^' and '$'), corresponds to 'm';</li>
 * <li>REFlags.DOTALL - single line treatment ('.' matches \r's and \n's), corresponds to 's';</li>
 * <li>REFlags.IGNORE_SPACES - extended whitespace comments (spaces and eols in the expression are ignored),
 * corresponds to 'x';</li>
 * <li>REFlags.UNICODE - predefined classes are regarded as belonging to Unicode, corresponds to 'u';
 * this may yield some performance penalty;</li>
 * <li>REFlags.XML_SCHEMA - compatibility with XML Schema, corresponds to 'X'.</li>
 * </ul>
 *
 * @param flags an int that stores various flags from REFlags bitwise-OR-ed with each other
 */
public void setFlags(int flags)
{
    // rebuild this Pattern from its own source text with exactly the given mask
    final String source = stringRepr;
    internalCompile(source, flags);
}
//java.util.regex.* compatibility
/**
 * Compiles the given String into a Pattern that can be used to match text, using the default flags.
 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
 *
 * @param regex a String in normal Java regular expression format
 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
 * @throws PatternSyntaxException if the expression cannot be compiled
 */
public static Pattern compile(String regex) throws PatternSyntaxException {
    final Pattern compiled = new Pattern(regex, DEFAULT);
    return compiled;
}
//java.util.regex.* compatibility
/**
 * Compiles the given String into a Pattern that can be used to match text.
 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
 * <br>
 * This variant allows flags to be passed as an int constructed via bitwise OR from REFlags constants.
 * You may prefer the variant that takes a String for clarity.
 *
 * @param regex a String in normal Java regular expression format
 * @param flags integer flags that are constructed via bitwise OR from the flag constants in REFlags.
 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
 * @throws PatternSyntaxException if the expression cannot be compiled
 */
public static Pattern compile(String regex, int flags) throws PatternSyntaxException {
    final Pattern compiled = new Pattern(regex, flags);
    return compiled;
}
//java.util.regex.* compatibility
/**
 * Compiles the given String into a Pattern that can be used to match text.
 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
 * <br>
 * This variant allows flags to be passed as a String.
 * The flag string should consist of the letters 'i','m','s','x','u','X' (case is significant)
 * and a hyphen or plus. The meaning of each character:
 * <ul>
 * <li>i - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li>
 * <li>m - multiline treatment (BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE;</li>
 * <li>s - single line treatment ('.' matches \r's and \n's), corresponds to REFlags.DOTALL;</li>
 * <li>x - extended whitespace comments (spaces and eols in the expression are ignored),
 * corresponds to REFlags.IGNORE_SPACES;</li>
 * <li>u - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE;
 * this may yield some performance penalty;</li>
 * <li>X - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA;</li>
 * <li>- - turn off the flags that follow;</li>
 * <li>+ - turn on the flags that follow (the same as just using the letters).</li>
 * </ul>
 *
 * @param regex a String in normal Java regular expression format
 * @param flags a String of flag characters as described above, one character per flag
 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
 * @throws PatternSyntaxException if the expression cannot be compiled or a flag character is unknown
 */
public static Pattern compile(String regex, String flags) throws PatternSyntaxException {
    final Pattern compiled = new Pattern(regex, flags);
    return compiled;
}
/**
 * Records the expression text and case-insensitivity setting on this Pattern, then hands the
 * expression to {@code Term.makeTree} together with this Pattern.
 *
 * @param regex the regular expression text to compile
 * @param flags a bitwise OR of REFlags constants
 * @throws PatternSyntaxException declared for problems raised while building the term tree
 */
private void internalCompile(String regex, int flags) throws PatternSyntaxException {
    this.stringRepr = regex;
    // caseless is on only when every bit of IGNORE_CASE is present in the mask
    final int caseBits = flags & IGNORE_CASE;
    this.caseless = (caseBits == IGNORE_CASE);
    // the flags travel in a one-element array
    final int[] flagHolder = {flags};
    Term.makeTree(regex, flagHolder, this);
}
/**
 * How many capturing groups does this expression include?
 *
 * @return the value of the {@code memregs} field (the required number of memory slots)
 */
public int groupCount() {
    final int groups = this.memregs;
    return groups;
}
/**
 * Get numeric id for a group name.
 *
 * @param name the group name to look up
 * @return the numeric id registered for {@code name}, or {@code null} if no such name is found
 *         (including the case where this Pattern has no name table at all).
 * @see MatchResult#group(java.lang.String)
 * @see MatchResult#isCaptured(java.lang.String)
 */
public Integer groupId(String name) {
    // a Pattern whose name table was never assigned has no named groups; report "not found" instead of an NPE
    if (namedGroupMap == null) {
        return null;
    }
    // explicit cast: compiles whether or not the map field is declared with type parameters
    return (Integer) namedGroupMap.get(name);
}
/**
 * A shorthand for Pattern.matcher(String).matches().
 *
 * @param s the target
 * @return true if the entire target matches the pattern
 * @see Matcher#matches()
 * @see Matcher#matches(String)
 */
public boolean matches(String s) {
    final Matcher m = matcher(s);
    return m.matches();
}
/**
 * A shorthand for Pattern.matcher(String).matchesPrefix().
 *
 * @param s the target
 * @return the result of {@link Matcher#matchesPrefix()} on a matcher created for {@code s}
 * @see Matcher#matchesPrefix()
 */
public boolean startsWith(String s) {
    final Matcher m = matcher(s);
    return m.matchesPrefix();
}
/**
 * Returns a target-less matcher.
 * Don't forget to supply a target.
 *
 * @return a new Matcher bound to this Pattern
 */
public Matcher matcher() {
    final Matcher unbound = new Matcher(this);
    return unbound;
}
/**
 * Returns a matcher for a specified string.
 *
 * @param s the text to use as the matcher's target
 * @return a new Matcher bound to this Pattern with {@code s} set as its target
 */
public Matcher matcher(CharSequence s) {
    final Matcher result = new Matcher(this);
    result.setTarget(s);
    return result;
}
/**
 * Returns a matcher for a specified region.
 *
 * @param data  the char array holding the text
 * @param start forwarded unchanged as the second argument of {@code Matcher.setTarget(char[], int, int)}
 * @param end   forwarded unchanged as the third argument of {@code Matcher.setTarget(char[], int, int)}
 * @return a new Matcher bound to this Pattern with the given region set as its target
 */
public Matcher matcher(char[] data, int start, int end) {
    final Matcher result = new Matcher(this);
    result.setTarget(data, start, end);
    return result;
}
/**
 * Returns a matcher for a match result (in a performance-friendly way).
 * The {@code groupId} parameter specifies which group is a target.
 *
 * @param res     the match result whose group becomes the new target
 * @param groupId which group is a target; either positive integer(group id), or one of
 *                MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET.
 * @return a new Matcher bound to this Pattern, targeting the requested group of {@code res}
 */
public Matcher matcher(MatchResult res, int groupId) {
    final Matcher result = new Matcher(this);
    if (!(res instanceof Matcher)) {
        // generic MatchResult: address the group inside the underlying char array
        final char[] chars = res.targetChars();
        final int offset = res.start(groupId) + res.targetStart();
        final int length = res.length(groupId);
        result.setTarget(chars, offset, length);
        return result;
    }
    // a Matcher can hand over its group directly
    result.setTarget((Matcher) res, groupId);
    return result;
}
/**
 * Just like {@code matcher(MatchResult, int)}, yet with a symbolic group name. The name is resolved
 * against the pattern of {@code res}, not against this Pattern.
 *
 * @param res       the match result whose group becomes the new target
 * @param groupName the name of the group to use as a target
 * @return a new Matcher bound to this Pattern, targeting the named group of {@code res}
 * @throws IllegalArgumentException if there is no group with such name
 */
public Matcher matcher(MatchResult res, String groupName) {
    final Integer id = res.pattern().groupId(groupName);
    if (id == null) {
        throw new IllegalArgumentException("group not found:" + groupName);
    }
    return matcher(res, id.intValue());
}
/**
 * Returns a matcher taking a text stream as target.
 * Note that this is not a true POSIX-style stream matching, i.e. the whole length of the text is
 * preliminary read and stored in a char array.
 *
 * @param text   a text stream
 * @param length the length to read from a stream; if {@code length} is {@code -1}, the whole stream is read in.
 * @return a new Matcher bound to this Pattern with the stream's text set as its target
 * @throws IOException indicates an IO problem
 */
@GwtIncompatible
public Matcher matcher(Reader text, int length) throws IOException {
    final Matcher result = new Matcher(this);
    result.setTarget(text, length);
    return result;
}
/**
 * Returns a replacer of a pattern by specified perl-like expression.
 * Such replacer will substitute all occurrences of a pattern by an evaluated expression
 * ("$&amp;" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc).
 * Example:
 * <pre>
 * String text="The quick brown fox jumped over the lazy dog";
 * Pattern word=new Pattern("\\w+");
 * System.out.println(word.replacer("[$&amp;]").replace(text));
 * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]"
 * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)");
 * System.out.println(swap.replacer("$3$2$1").replace(text));
 * //prints "The quick brown dog jumped over the lazy fox"
 * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)");
 * System.out.println(scramble.replacer("$3$2$1").replace(text));
 * //prints "quick The fox brown over jumped lazy the dog"
 * </pre>
 *
 * @param expr a perl-like expression, the "$&amp;" and "${&amp;}" standing for whole match, the "$N" and "${N}"
 *             standing for group#N, and "${Foo}" standing for named group Foo.
 * @return a new Replacer built from this Pattern and {@code expr}
 * @see Replacer
 */
public Replacer replacer(String expr) {
    final Replacer result = new Replacer(this, expr);
    return result;
}
/**
 * Returns a replacer that will substitute all occurrences of a pattern
 * through applying a user-defined substitution model.
 *
 * @param model a Substitution object which is in charge for match substitution
 * @return a new Replacer built from this Pattern and {@code model}
 * @see Replacer
 */
public Replacer replacer(Substitution model) {
    final Replacer result = new Replacer(this, model);
    return result;
}
/**
 * Tokenizes a text by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as new RETokenizer(Pattern,String);
 *
 * @param text the text to tokenize
 * @return a new RETokenizer over {@code text} using this Pattern
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(regexodus.Pattern, java.lang.String)
 */
public RETokenizer tokenizer(String text) {
    final RETokenizer result = new RETokenizer(this, text);
    return result;
}
/**
 * Tokenizes a specified region by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as new RETokenizer(Pattern,char[],int,int);
 *
 * @param data the char array holding the text
 * @param off  forwarded unchanged as the third constructor argument of RETokenizer
 * @param len  forwarded unchanged as the fourth constructor argument of RETokenizer
 * @return a new RETokenizer over the given region using this Pattern
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(regexodus.Pattern, char[], int, int)
 */
public RETokenizer tokenizer(char[] data, int off, int len) {
    final RETokenizer result = new RETokenizer(this, data, off, len);
    return result;
}
/**
 * Tokenizes text taken from a stream by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as new RETokenizer(Pattern,Reader,int);
 *
 * @param in     the text stream to tokenize
 * @param length forwarded unchanged as the third constructor argument of RETokenizer
 * @return a new RETokenizer over the stream's text using this Pattern
 * @throws IOException indicates an IO problem
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(regexodus.Pattern, java.io.Reader, int)
 */
@GwtIncompatible
public RETokenizer tokenizer(Reader in, int length) throws IOException {
    final RETokenizer result = new RETokenizer(this, in, length);
    return result;
}
/**
 * Returns the regular expression text this Pattern was compiled from.
 *
 * @return the stored expression string (may be null if nothing has been compiled)
 */
public String toString() {
    final String source = this.stringRepr;
    return source;
}
/**
 * Returns a less or more readable representation of a bytecode for the pattern.
 *
 * @return whatever {@code toStringAll()} produces for the root term
 */
public String toString_d() {
    final Term entry = this.root;
    return entry.toStringAll();
}
/**
 * Converts a char-per-flag string into a flag bitmask, starting from {@code DEFAULT}.
 * A '+' switches to enabling mode and a '-' to disabling mode for the flag letters that follow;
 * enabling mode is in effect at the start.
 *
 * @param flags the flag characters
 * @return the resulting bitmask
 * @throws PatternSyntaxException if a character is neither '+', '-' nor a known flag letter
 */
private static int parseFlags(String flags) throws PatternSyntaxException {
    int mask = DEFAULT;
    boolean adding = true;
    for (char symbol : flags.toCharArray()) {
        if (symbol == '+') {
            adding = true;
        } else if (symbol == '-') {
            adding = false;
        } else {
            // any other character must be a flag letter; getFlag rejects unknown ones
            final int bit = getFlag(symbol);
            mask = adding ? (mask | bit) : (mask & ~bit);
        }
    }
    return mask;
}
/**
 * Converts a run of flag characters inside a char array into a flag bitmask, starting from {@code DEFAULT}.
 * A '+' switches to enabling mode and a '-' to disabling mode for the flag letters that follow;
 * enabling mode is in effect at the start.
 *
 * @param data  the array containing the flag characters
 * @param start index of the first flag character
 * @param len   number of characters to read
 * @return the resulting bitmask
 * @throws PatternSyntaxException if a character is neither '+', '-' nor a known flag letter
 */
static int parseFlags(char[] data, int start, int len) throws PatternSyntaxException {
    int mask = DEFAULT;
    boolean adding = true;
    int offset = 0;
    while (offset < len) {
        final char symbol = data[start + offset];
        offset++;
        if (symbol == '+') {
            adding = true;
        } else if (symbol == '-') {
            adding = false;
        } else {
            // any other character must be a flag letter; getFlag rejects unknown ones
            final int bit = getFlag(symbol);
            mask = adding ? (mask | bit) : (mask & ~bit);
        }
    }
    return mask;
}
/**
 * Maps a single flag letter to its flag constant.
 *
 * @param c one of 'i', 'm', 's', 'x', 'u', 'X'
 * @return IGNORE_CASE, MULTILINE, DOTALL, IGNORE_SPACES, UNICODE or XML_SCHEMA respectively
 * @throws PatternSyntaxException if {@code c} is not one of the known letters
 */
private static int getFlag(char c) throws PatternSyntaxException {
    if (c == 'i') {
        return IGNORE_CASE;
    }
    if (c == 'm') {
        return MULTILINE;
    }
    if (c == 's') {
        return DOTALL;
    }
    if (c == 'x') {
        return IGNORE_SPACES;
    }
    if (c == 'u') {
        return UNICODE;
    }
    if (c == 'X') {
        return XML_SCHEMA;
    }
    throw new PatternSyntaxException("unknown flag: " + c);
}
/**
 * Compares this Pattern with another object. Two Patterns are equal when they are of the same class
 * and agree on memregs, counters, lookaheads, the expression text, both term roots and the
 * named-group map; these are the same fields hashCode() combines.
 *
 * @param o the object to compare with
 * @return true if {@code o} is a Pattern of the same class with equal fields as listed above
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    Pattern pattern = (Pattern) o;
    if (memregs != pattern.memregs) {
        return false;
    }
    if (counters != pattern.counters) {
        return false;
    }
    if (lookaheads != pattern.lookaheads) {
        return false;
    }
    if (stringRepr != null ? !stringRepr.equals(pattern.stringRepr) : pattern.stringRepr != null) {
        return false;
    }
    // Each remaining field gets its own check. Chaining them as "a ? x : y && (...)" stops at the
    // first non-null field, because ?: binds more loosely than &&, so root0 and namedGroupMap
    // would never be compared whenever root is non-null.
    if (root != null ? !root.equals(pattern.root) : pattern.root != null) {
        return false;
    }
    if (root0 != null ? !root0.equals(pattern.root0) : pattern.root0 != null) {
        return false;
    }
    return namedGroupMap != null
            ? namedGroupMap.equals(pattern.namedGroupMap)
            : pattern.namedGroupMap == null;
}
/**
 * Computes a hash from the expression text, both term roots, memregs, counters, lookaheads and the
 * named-group map, combined in that order with the multiplier 31. Null fields contribute 0.
 *
 * @return the combined hash code
 */
@Override
public int hashCode() {
    final int textHash = (stringRepr == null) ? 0 : stringRepr.hashCode();
    final int rootHash = (root == null) ? 0 : root.hashCode();
    final int root0Hash = (root0 == null) ? 0 : root0.hashCode();
    final int namesHash = (namedGroupMap == null) ? 0 : namedGroupMap.hashCode();

    int hash = textHash;
    hash = hash * 31 + rootHash;
    hash = hash * 31 + root0Hash;
    hash = hash * 31 + memregs;
    hash = hash * 31 + counters;
    hash = hash * 31 + lookaheads;
    hash = hash * 31 + namesHash;
    return hash;
}
}