// info.codesaway.util.regex.Pattern (regexplus)
// TODO: verify group names consisting solely of numbers are handled correctly
/*
* TODO: need to throw correct exception for cases:
* 1) Integer values too high
* 2) Illegal integer values
*/
package info.codesaway.util.regex;
import static info.codesaway.util.regex.Matcher.getAbsoluteGroupIndex;
import static info.codesaway.util.regex.Matcher.noNamedGroup;
import static info.codesaway.util.regex.RefactorUtility.fullGroupName;
import static info.codesaway.util.regex.RefactorUtility.parseInt;
import static info.codesaway.util.regex.RegExPlusSupport.setLastMatcher;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Predicate;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
// TODO: finish documenting
// mention that all compiled patterns are now cached
/**
* A compiled representation of a regular expression.
*
* This class is an extension
* of Java's {@link java.util.regex.Pattern} class. Javadocs were copied and
* appended with the added functionality.
*
* A regular expression, specified as a string, must first be compiled into
* an instance of this class. The resulting pattern can then be used to create
* a {@link Matcher} object that can match arbitrary {@linkplain java.lang.CharSequence character sequences} against the
* regular
* expression. All of the state involved in performing a match resides in the
* matcher, so many matchers can share the same pattern.
*
*
A typical invocation sequence is thus
*
*
* Pattern p = Pattern.{@link #compile compile}("a*b");
* Matcher m = p.{@link #matcher matcher}("aaaaab");
* boolean b = m.{@link Matcher#matches matches}();
*
* A {@link #matches matches} method is defined by this class as a
* convenience for when a regular expression is used just once. This method
* compiles an expression and matches an input sequence against it in a single
* invocation. The statement
*
*
* boolean b = Pattern.matches("a*b", "aaaaab");
*
* is equivalent to the three statements above, though for repeated matches it
* is less efficient since it does not allow the compiled pattern to be reused.
*
* Instances of this class are immutable and are safe for use by multiple
* concurrent threads. Instances of the {@link Matcher} class are not safe for
* such use.
*
* Summary of regular-expression constructs
*
*
*
*
* Construct
* Matches
*
*
*
*
*
*
* Characters
*
*
*
* x
* The character x
*
*
* \\
* The backslash character
*
*
* \0n
* The character with octal value 0n
* (0 <= n <= 7)
*
*
* \0nn
* The character with octal value 0nn
* (0 <= n <= 7)
*
*
* \0mnn
* The character with octal value 0mnn
* (0 <= m <= 3, 0
* <= n <= 7)
*
*
* \xhh
* The character with
* hexadecimal value 0xhh
*
*
* \x{hhh..}
* The character with
* hexadecimal value 0xhhh..
*
*
* &#92;uhhhh
* The character with
* hexadecimal value 0xhhhh
*
*
* \t
* The tab character ('\u0009')
*
*
* \n
* The newline (line feed) character
* ('\u000A')
*
*
* \r
* The carriage-return character
* ('\u000D')
*
*
* \f
* The form-feed character ('\u000C')
*
*
* \a
* The alert (bell) character ('\u0007')
*
*
* \e
* The escape character ('\u001B')
*
*
* \cx
* The control character corresponding to x
*
*
*
*
*
*
* Character classes
*
*
*
* [abc]
* a, b, or c (simple
* class)
*
*
* [^abc]
* Any character except a, b, or
* c (negation)
*
*
* [a-zA-Z]
* a through z or A through
* Z, inclusive (range)
*
*
* [a-d[m-p]]
* a through d, or m through
* p: [a-dm-p] (union)
*
*
* [a-z&&[def]]
* d, e, or f (intersection)
*
*
* [a-z&&[^bc]]
* a through z, except for b
* and c: [ad-z] (subtraction)
*
*
* [a-z&&[^m-p]]
* a through z, and not m
* through p: [a-lq-z] (subtraction)
*
*
*
*
*
*
* Predefined character classes
*
*
*
* .
* Any character (may or may not match
* line terminators)
*
*
* \X
* Single grapheme - equivalent to
* (?>\P{M}\p{M}*)
*
*
* \d
* A digit: [0-9]
*
*
* \D
* A non-digit: [^0-9]
*
*
* \s
* A whitespace character: [ \t\n\x0B\f\r]
*
*
* \S
* A non-whitespace character: [^\s]
*
*
* \w
* A word character: [a-zA-Z_0-9]
*
*
* \W
* A non-word character: [^\w]
*
*
*
*
*
*
* POSIX character classes (US-ASCII
* only)
*
*
*
* \p{Lower}
* A lower-case alphabetic character: [a-z]
*
*
* \p{Upper}
* An upper-case alphabetic character: [A-Z]
*
*
* \p{ASCII}
* All ASCII: [\x00-\x7F]
*
*
* \p{Alpha}
* An alphabetic
* character: [\p{Lower}\p{Upper}]
*
*
* \p{Digit}
* A decimal digit: [0-9]
*
*
* \p{Alnum}
* An alphanumeric character:
* [\p{Alpha}\p{Digit}]
*
*
*
* \p{Punct}
* Punctuation: One of
* !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
*
*
*
* \p{Graph}
* A visible character: [\p{Alnum}\p{Punct}]
*
*
* \p{Print}
* A printable character: [\p{Graph}\x20]
*
*
* \p{Blank}
* A space or a tab: [ \t]
*
*
* \p{Cntrl}
* A control character: [\x00-\x1F\x7F]
*
*
* \p{XDigit}
* A hexadecimal digit: [0-9a-fA-F]
*
*
* \p{Space}
* A whitespace character: [ \t\n\x0B\f\r]
*
*
*
*
*
*
* POSIX character classes (US-ASCII
* only)
(equivalent to the
* above POSIX classes - only allowed in a character class)
*
*
*
* [:lower:]
* A lower-case alphabetic character: [a-z]
*
*
* [:upper:]
* An upper-case alphabetic character: [A-Z]
*
*
* [:ascii:]
* All ASCII: [\x00-\x7F]
*
*
* [:alpha:]
* An alphabetic
* character: [[:lower:][:upper:]]
*
*
* [:digit:]
* A decimal digit: [0-9]
*
*
* [:alnum:]
* An alphanumeric character:
* [[:alpha:][:digit:]]
*
*
*
* [:punct:]
* Punctuation: One of
* !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
*
*
*
* [:graph:]
* A visible character: [[:alnum:][:punct:]]
*
*
* [:print:]
* A printable character: [[:graph:]\x20]
*
*
* [:blank:]
* A space or a tab: [ \t]
*
*
* [:cntrl:]
* A control character: [\x00-\x1F\x7F]
*
*
* [:xdigit:]
* A hexadecimal digit: [0-9a-fA-F]
*
*
* [:space:]
* A whitespace character: [ \t\n\x0B\f\r]
*
*
* [:word:]
* A word character: [\w]
*
*
*
*
*
*
* java.lang.Character classes (simple
* java character type)
*
*
*
* \p{javaLowerCase}
* Equivalent to java.lang.Character.isLowerCase()
*
*
* \p{javaUpperCase}
* Equivalent to java.lang.Character.isUpperCase()
*
*
* \p{javaWhitespace}
* Equivalent to java.lang.Character.isWhitespace()
*
*
* \p{javaMirrored}
* Equivalent to java.lang.Character.isMirrored()
*
*
*
*
*
*
* Classes for Unicode blocks and categories
*
*
*
* \p{InGreek}
* A character in the Greek block (simple
* block)
*
*
* \p{Lu}
* An uppercase letter (simple
* category)
*
*
* \p{Sc}
* A currency symbol
*
*
* \P{InGreek}
* Any character except one in the Greek block
* (negation)
*
*
*
* [\p{L}&&[^\p{Lu}]]
* Any letter except an uppercase letter
* (subtraction)
*
*
*
*
*
*
* Boundary matchers
*
*
*
* ^
* The beginning of a line
*
*
* $
* The end of a line
*
*
* \b
* A word boundary
*
*
* \B
* A non-word boundary
*
*
* \A
* The beginning of the input
*
*
* \G
* The end of the previous match
*
*
* \Z
* The end of the input but for the final
* terminator, if any
*
*
* \z
* The end of the input
*
*
*
*
*
*
* Greedy quantifiers
*
*
*
* X?
* X, once or not at all
*
*
* X*
* X, zero or more times
*
*
* X+
* X, one or more times
*
*
*
* X{n}
* X, exactly n times
*
*
*
* X{n,}
* X, at least n times
*
*
*
* X{n,m}
* X, at least n but not more than m
* times
*
*
*
*
*
*
* Reluctant quantifiers
*
*
*
* X??
* X, once or not at all
*
*
* X*?
* X, zero or more times
*
*
* X+?
* X, one or more times
*
*
*
* X{n}?
* X, exactly n times
*
*
*
* X{n,}?
* X, at least n times
*
*
*
* X{n,m}?
* X, at least n but not more than m
* times
*
*
*
*
*
*
* Possessive quantifiers
*
*
*
* X?+
* X, once or not at all
*
*
* X*+
* X, zero or more times
*
*
* X++
* X, one or more times
*
*
*
* X{n}+
* X, exactly n times
*
*
*
* X{n,}+
* X, at least n times
*
*
*
* X{n,m}+
* X, at least n but not more than m
* times
*
*
*
*
*
*
* Logical operators
*
*
*
* XY
* X followed by Y
*
*
* X|Y
* Either X or Y
*
*
*
*
*
*
* Capturing
*
*
* (X)
* X, as a capturing
* group
*
*
*
* (?<name>X)
* X, as a named-capturing
* group
*
*
*
* (?'name'X)
* X, as a named-capturing group
*
*
*
* (?P<name>X)
* X, as a named-capturing group
*
*
*
*
*
*
* Back references
*
*
*
* \n
* Whatever the nth
* capturing group matched
*
*
* \gn
* Whatever the nth
* capturing group matched
*
*
*
* \g{n}
* Whatever the nth
* capturing group matched
*
*
*
*
*
*
* \g-n
* Relative back reference
*
*
*
* \g{-n}
* Relative back reference
*
*
*
*
*
*
*
* \k<name>
* Whatever the named-capturing group "name" matched
*
*
*
* \k'name'
* Whatever the named-capturing group
* "name" matched
*
*
*
* \g{name}
* Whatever the named-capturing group
* "name" matched
*
*
*
* \k{name}
* Whatever the named-capturing group
* "name" matched
*
*
*
* (?P=name)
* Whatever the named-capturing group
* "name" matched
*
*
*
*
*
*
* Quotation
*
*
*
* \
* Nothing, but quotes the following character
*
*
* \Q
* Nothing, but quotes all characters until
* \E
*
*
* \E
* Nothing, but ends quoting started by \Q
*
*
*
*
*
*
*
* Special constructs (non-capturing)
*
*
*
* (?:X)
* X, as a non-capturing group
*
*
*
* (?idmsuxJn-idmsuxJn)
* Nothing, but turns match flags {@link #CASE_INSENSITIVE i} {@link #UNIX_LINES d}
* {@link #MULTILINE m} {@link #DOTALL s} {@link #UNICODE_CASE u} {@link #COMMENTS x} {@link #DUPLICATE_NAMES J}
* {@link #EXPLICIT_CAPTURE n} on - off
*
*
* (?idmsuxJn-idmsuxJn:X)
* X, as a non-capturing group
* with the given flags {@link #CASE_INSENSITIVE i} {@link #UNIX_LINES d} {@link #MULTILINE m} {@link #DOTALL s}
* {@link #UNICODE_CASE u} {@link #COMMENTS x} {@link #DUPLICATE_NAMES J} {@link #EXPLICIT_CAPTURE n} on
* - off
*
*
*
* (?>X)
* X, as an independent (atomic), non-capturing
* group
*
*
*
* (?|X)
* X, as a "branch reset"
* pattern
*
*
*
*
*
*
* Assertions (non-capturing)
*
*
*
* (?=X)
* X, via zero-width positive lookahead
*
*
* (?!X)
* X, via zero-width negative lookahead
*
*
*
* (?<=X)
* X, via zero-width positive lookbehind
*
*
*
* (?<!X)
* X, via zero-width negative lookbehind
*
*
*
*
*
*
* Comment (non-capturing)
*
*
*
*
* (?x:#comment\n)
* comment (cannot contain a
* line terminator)
*
*
*
* (?xd:#comment\n)
* comment (cannot contain '\n')
*
*
*
* (?#comment)
* comment (cannot contain a close parenthesis)
*
*
*
*
*
*
* Conditional patterns (non-capturing)
* (?(condition)yes-pattern)
* (?(condition)yes-pattern|no-pattern)
*
*
*
*
*
*
*
*
* (?(n)...)
* absolute reference condition
*
*
*
* (?(-n)...)
* relative reference condition
*
*
*
*
*
*
*
*
* (?(<name>)...)
* named reference condition
*
*
*
* (?('name')...)
* named reference condition
*
*
*
* (?(name)...)
* named reference condition
*
*
*
*
*
*
* (?(assert)...)
* assert condition
*
*
*
*
*
*
* Numeric ranges (non-capturing)
*
*
*
*
* (?Z[start..end])
* matches a numeric range
* (allowing for leading zeros)
*
*
*
* (?Z16[start..end])
* matches a numeric range in
* base 16 (allowing for leading zeros)
*
*
*
*
*
*
* (?NZ[start..end])
* matches a numeric range
* (not allowing for leading zeros)
*
*
*
* (?NZ16[start..end])
* matches a numeric range in
* base 16 (not allowing for leading zeros)
*
*
*
*
*
* Backslashes, escapes, and quoting
*
* The backslash character ('\') serves to introduce escaped
* constructs, as defined in the table above, as well as to quote characters
* that otherwise would be interpreted as unescaped constructs. Thus the
* expression \\ matches a single backslash and \{ matches a
* left brace.
*
* It is an error to use a backslash prior to any alphabetic character that
* does not denote an escaped construct; these are reserved for future
* extensions to the regular-expression language. A backslash may be used
* prior to a non-alphabetic character regardless of whether that character is
* part of an unescaped construct.
*
* Backslashes within string literals in Java source code are interpreted
* as required by the Java Language
* Specification as either Unicode
* escapes or other character
* escapes. It is therefore necessary to double backslashes in string
* literals that represent regular expressions to protect them from
* interpretation by the Java bytecode compiler. The string literal
* "\b", for example, matches a single backspace character when
* interpreted as a regular expression, while "\\b" matches a
* word boundary. The string literal "\(hello\)" is illegal
* and leads to a compile-time error; in order to match the string
* (hello) the string literal "\\(hello\\)"
* must be used.
*
* Character Classes
*
* Character classes may appear within other character classes, and
* may be composed by the union operator (implicit) and the intersection
* operator (&&).
* The union operator denotes a class that contains every character that is
* in at least one of its operand classes. The intersection operator
* denotes a class that contains every character that is in both of its
* operand classes.
*
* The precedence of character-class operators is as follows, from
* highest to lowest:
*
*
* 1
* Literal escape
* \x
* 2
* Grouping
* [...]
* 3
* Range
* a-z
* 4
* Union
* [a-e][i-u]
* 5
* Intersection
* [a-z&&[aeiou]]
*
*
* Note that a different set of metacharacters are in effect inside
* a character class than outside a character class. For instance, the
* regular expression . loses its special meaning inside a
* character class, while the expression - becomes a range
* forming metacharacter.
*
* Line terminators
*
* A line terminator is a one- or two-character sequence that marks
* the end of a line of the input character sequence. The following are
* recognized as line terminators:
*
*
*
* - A newline (line feed) character ('\n'),
*
* - A carriage-return character followed immediately by a newline
* character ("\r\n"),
*
* - A standalone carriage-return character ('\r'),
*
* - A next-line character ('\u0085'),
*
* - A line-separator character ('\u2028'), or
*
* - A paragraph-separator character ('\u2029').
*
*
* If {@link #UNIX_LINES} mode is activated, then the only line terminators
* recognized are newline characters.
*
* The regular expression . matches any character except a line
* terminator unless the {@link #DOTALL} flag is specified.
*
* By default, the regular expressions ^ and $ ignore
* line terminators and only match at the beginning and the end, respectively,
* of the entire input sequence. If {@link #MULTILINE} mode is activated then
* ^ matches at the beginning of input and after any line terminator
* except at the end of input. When in {@link #MULTILINE} mode $
* matches just before a line terminator or the end of the input sequence.
*
* Groups and capturing
*
* Group number
*
* Capturing groups are numbered by counting their opening parentheses from
* left to right. In the expression ((A)(B(C))), for example, there
* are four such groups:
*
*
* 1
* ((A)(B(C)))
* 2
* (A)
* 3
* (B(C))
* 4
* (C)
*
*
* Group zero always stands for the entire expression.
*
* Capturing groups are so named because, during a match, each subsequence
* of the input sequence that matches such a group is saved. The captured
* subsequence may be used later in the expression, via a back reference, and
* may also be retrieved from the matcher once the match operation is
* complete.
*
* Note: To use .NET's numbering for capture groups (instead of
* Java's), specify the {@link #DOTNET_NUMBERING} flag when compiling a
* pattern.
*
* Group name
* A capturing group can also be assigned a "name", a named-capturing
* group,
* and then be back-referenced later by the "name". Group names are composed of
* the following characters:
*
*
* - The uppercase letters 'A' through 'Z'
* ('\u0041' through '\u005a'),
* - The lowercase letters 'a' through 'z'
* ('\u0061' through '\u007a'),
* - The digits '0' through '9'
* ('\u0030' through '\u0039'),
* - The underscore character '_'
* ('\u005f').
*
*
* A named-capturing group is still numbered as described in
* Group number.
*
* The captured input associated with a group is always the subsequence
* that the group most recently matched. If a group is evaluated a second time
* because of quantification then its previously-captured value, if any, will
* be retained if the second evaluation fails. Matching the string
* "aba" against the expression (a(b)?)+, for example, leaves
* group two set to "b". All captured input is discarded at the
* beginning of each match.
*
* Groups beginning with (? are either pure, non-capturing
* groups
* that do not capture text and do not count towards the group total, or
* named-capturing groups.
*
* Note: by default, capture group names must be unique, and if
* multiple groups
* with the same name exist,
* a {@link PatternSyntaxException} is thrown. By setting the {@link #DUPLICATE_NAMES} flag, multiple capture groups
* with
* the same name are allowed.
*
* Group
*
* A group is either the name of a named-capturing group or a string of
* the form groupName[occurrence].
*
* Use a positive occurrence (starting with 1) to refer to a specific
* occurrence of the group. A negative occurrence is a relative occurrence of
* the group. If the occurrence is omitted, or zero, the reference is to
* the
* first matched group with the specified group name. For example,
* groupName and groupName[0] both refer to the first
* matched occurrence of "groupName".
*
* This syntax allows referring to any
* capture group in the pattern - even in the case where multiple groups
* have the same name (see {@link #DUPLICATE_NAMES}), or the same
* number (see "branch reset"
* pattern).
*
* Using this syntax, to refer to
*
*
* any group
*
* - groupName
* - the group index wrapped in square brackets - a negative number
* is a relative reference
* - occurrence
* - In a "branch reset" pattern more than one
* occurrence of the group may exist.
* - example
* - group "[1]" is equivalent to group 1; group "[1][2]" is the second
* occurrence of group 1
*
*
*
* a named group
*
* - groupName
* - name of a named-capturing group
*
- occurrence
* - If the {@link Pattern#DUPLICATE_NAMES} flag is set, more than one
* occurrence of the group may exist.
* - example
* - "myGroup" refers to the first matched occurrence of the named
* group, "myGroup", and "myGroup[1]" refers to just the first occurrence.
*
*
*
*
* "Branch reset" pattern
*
* Quoted from the PCRE manual (the
* DUPLICATE SUBPATTERN NUMBERS section)
*
*
* Perl 5.10 introduced a feature where each alternative in a subpattern
* uses the same numbers for its capturing parentheses. Such a subpattern starts
* with (?| and is itself a non-capturing subpattern. This construct is
* useful when you want to capture part, but
* not all, of one of a number of alternatives.
*
* Inside a branch reset pattern, capture groups are numbered as
* usual,
* but the number is reset at the start of each branch. The numbers of any
* capturing buffers that follow the subpattern start after the highest number
* used in any branch.
*
*
* The following example is taken from the Perl documentation. The numbers
* underneath show in which buffer the captured content will be stored.
*
* # before ---------------branch-reset-----------
* after
* / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
* # 1 2
* 2 3
* 2 3
* 4
*
* As a note, nested branch reset patterns are fully supported:
*
*
* / (?| ( 1a ) ( 2a ) | ( 1b )
* (?| ( 2b1 ) | ( 2b2 ) ) ) /x
* # 1 2
* 1
* 2
* 2
*
*
* Note: if the {@link #DOTNET_NUMBERING} flag is set, named capture
* groups inside
* of a branch reset pattern will be numbered as if they were unnamed
* groups. The group remains a named group, and can still be referred by
* name.
*
* (?|(?<One>1a)(2a)|(1b)(?<Two>2b))
* # 1 2
* 1 2
*
* Unicode support
*
* This class is in conformance with Level 1 of Unicode Technical
* Standard #18: Unicode Regular Expression Guidelines, plus RL2.1
* Canonical Equivalents.
*
* Unicode escape sequences such as \u2014 in Java source code
* are processed as described in \u00A73.3
* of the Java Language Specification. Such escape sequences are also
* implemented directly by the regular-expression parser so that Unicode
* escapes can be used in expressions that are read from files or from the
* keyboard. Thus the strings "\u2014" and "\\u2014",
* while not equal, compile into the same pattern, which matches the character
* with hexadecimal value 0x2014.
*
* Unicode blocks and categories are written with the
* \p and \P constructs as in
* Perl. \p{prop} matches if the input has the
* property prop, while \P{prop} does not match
* if
* the input has that property. Blocks are specified with the prefix
* In, as in InMongolian. Categories may be specified with
* the optional prefix Is: Both \p{L} and \p{IsL}
* denote the category of Unicode letters. Blocks and categories can be used
* both inside and outside of a character class.
*
* The supported categories are those of
*
* The Unicode Standard in the version specified by the {@link java.lang.Character Character} class. The
* category names are those
* defined in the Standard, both normative and informative.
* The block names supported by Pattern are the valid block names
* accepted and defined by {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
*
* Categories that behave like the java.lang.Character
* boolean ismethodname methods (except for the deprecated ones) are
* available through the same \p{prop} syntax where
* the specified property has the name javamethodname.
*
* Comparison to Perl 5
*
* The Pattern engine performs traditional NFA-based matching
* with ordered alternation as occurs in Perl 5.
*
*
* Perl constructs not supported by this class:
*
*
*
* The conditional constructs (?{X}),
*
* The embedded code constructs (?{code})
* and (??{code}), and
*
* The preprocessing operations \l, &#92;u,
* \L, and \U.
*
*
*
* Constructs supported by this class but not by Perl:
*
*
*
* Possessive quantifiers, which greedily match as much as they can
* and do not back off, even when doing so would allow the overall match to
* succeed.
*
* Character-class union and intersection as described
* above.
*
*
*
* Notable differences from Perl:
*
*
*
* In Perl, \1 through \9 are always interpreted
* as back references; a backslash-escaped number greater than 9 is
* treated as a back reference if at least that many subexpressions exist,
* otherwise it is interpreted, if possible, as an octal escape. In this
* class octal escapes must always begin with a zero. In this class,
* \1 through \9 are always interpreted as back
* references, and a larger number is accepted as a back reference if at
* least that many subexpressions exist at that point in the regular
* expression, otherwise the parser will drop digits until the number is
* smaller or equal to the existing number of groups or it is one
* digit.
*
* Note: specify the {@link #PERL_OCTAL} flag when compiling a pattern
* to use Perl's octal syntax (as described above), instead of Java's.
*
* Perl uses the g flag to request a match that resumes
* where the last match left off. This functionality is provided implicitly
* by the {@link Matcher} class: Repeated invocations of the {@link Matcher#find find} method will resume where the last
* match left off,
* unless the matcher is reset.
*
* In Perl, embedded flags at the top level of an expression affect
* the whole expression. In this class, embedded flags always take effect
* at the point at which they appear, whether they are at the top level or
* within a group; in the latter case, flags are restored at the end of the
* group just as in Perl.
*
* Perl is forgiving about malformed matching constructs, as in the
* expression *a, as well as dangling brackets, as in the
* expression abc], and treats them as literals. This
* class also accepts dangling brackets but is strict about dangling
* metacharacters like +, ? and *, and will throw a {@link PatternSyntaxException} if it encounters them.
*
*
*
*
* For a more precise description of the behavior of regular expression
* constructs, please see
* Mastering Regular Expressions, 3rd Edition, Jeffrey E. F. Friedl,
* O'Reilly and Associates, 2006.
*
* Numeric range
*
* Regular expressions may have extensive functionality, but they are
* designed to match text, so matching a numeric range requires some extra work.
* Since matching numeric ranges is sometimes necessary, the
* Pattern class has built-in support for handling them.
*
* To allow leading zeros in a match, use the syntax
* (?Z[start..end]). In this case, the match's width (number of
* digits matched) is between the number of digits in start and the
* number of digits in end. For example, (?Z[071..9])
* matches a number between 9 and 71 with between 1 and 3 digits. As the
* previous example shows, you can specify a range as [start..end]
* or as [end..start]. As a note, a range can have a negative
* number for either its start or end, and the syntax remains the
* same.
*
* In the case that one bound is
* negative, and the other bound is positive, the match's width is as follows.
* For a negative number, the number of digits in a match must be between 1 and
* the number of digits in the negative bound. For a positive number, the number
* of digits is between 1 and the number of digits in the positive bound.
*
* To not allow leading zeros in a match, use the syntax
* (?NZ[start..end]). In this case, the match will not contain
* any leading zeros. For example, (?NZ[071..9]) will match the
* "9" in "09", but it won't match the entire "09", since leading zeros are not
* part of the match.
*
* For either format, by default, the numbers are decimal numbers (base 10).
* If you want to match a range in a different base, specify the base number
* after the "Z" or "NZ". For example, (?Z16[0..ff]) will match a
* hex number between 0 and 0xFF - for example, "aa".
*
* When working with bases above 10, letters are used as digits, for example,
* in base 16, 'A' through 'F' are used to represent digits 10 through 15. By
* default, when matching a number, both upper-case and lower-case digits are
* allowed. For example, (?Z16[0..ff]) will match both "AA" and
* "aa". By specifying an 'L' or a 'U' after the base number, you can force only
* lower or upper-case digits to match. The regex
* (?Z16U[0..ff]), for example, will match "AA", but not "aa". Note
* that regardless of this setting, in the pattern, either upper-case or
* lower-case digits may be used. For bases 10 or less, this setting
* has no effect, but, for consistency, can be specified - for example, the regex
* (?Z8U[0..377]) is equivalent to (?Z8[0..377]).
*
* @see Pattern#split(CharSequence, String, int)
* @see Pattern#split(CharSequence, String)
*/
public final class Pattern implements Serializable {
/**
* Regular expression modifier values. Instead of being passed as
* arguments, they can also be passed as inline modifiers.
* For example, the following statements have the same effect.
*
* RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
* RegExp r2 = RegExp.compile("(?im)abc", 0);
*
*
* The flags are duplicated so that the familiar Perl match flag
* names are available.
*/
/**
* Enables Unix lines mode.
*
* In this mode, only the '\n' line terminator is recognized in
* the behavior of ., ^, and $.
*
* Unix lines mode can also be enabled via the embedded flag
* expression (?d).
*
* The value is taken directly from {@link java.util.regex.Pattern#UNIX_LINES}.
*/
public static final int UNIX_LINES = java.util.regex.Pattern.UNIX_LINES;
/**
* Enables case-insensitive matching.
*
* By default, case-insensitive
* matching assumes that only characters in the US-ASCII charset are being
* matched. Unicode-aware case-insensitive matching can be enabled by
* specifying the {@link #UNICODE_CASE} flag in conjunction with this
* flag.
*
* Case-insensitive matching can also be enabled via the embedded flag
* expression (?i).
*
* Specifying this flag may impose a slight performance penalty.
*
* The value is taken directly from {@link java.util.regex.Pattern#CASE_INSENSITIVE}.
*/
public static final int CASE_INSENSITIVE = java.util.regex.Pattern.CASE_INSENSITIVE;
/**
* Permits whitespace and comments in pattern.
*
* In this mode, whitespace is ignored, and embedded comments starting
* with # are ignored until the end of a line.
*
* Comments mode can also be enabled via the embedded flag
* expression (?x).
*
* The value is taken directly from {@link java.util.regex.Pattern#COMMENTS}.
*/
public static final int COMMENTS = java.util.regex.Pattern.COMMENTS;
/**
* Enables multiline mode.
*
* In multiline mode the expressions ^ and $ match just
* after or just before, respectively, a line terminator or the end of the
* input sequence. By default these expressions only match at the
* beginning and the end of the entire input sequence.
*
* Multiline mode can also be enabled via the embedded flag
* expression (?m).
*
* The value is taken directly from {@link java.util.regex.Pattern#MULTILINE}.
*/
public static final int MULTILINE = java.util.regex.Pattern.MULTILINE;
/**
* Enables literal parsing of the pattern.
*
* When this flag is specified then the input string that specifies
* the pattern is treated as a sequence of literal characters.
* Metacharacters or escape sequences in the input sequence will be given
* no special meaning.
*
* The flags {@link #CASE_INSENSITIVE} and {@link #UNICODE_CASE} retain their
* impact on matching when used in conjunction with this flag. The other
* flags become superfluous.
*
* There is no embedded flag character for enabling literal
* parsing.
*
* The value is taken directly from {@link java.util.regex.Pattern#LITERAL}.
*/
public static final int LITERAL = java.util.regex.Pattern.LITERAL;
/**
* Enables dotall mode.
*
* In dotall mode, the expression . matches any character,
* including a line terminator. By default this expression does not match
* line terminators.
*
* Dotall mode can also be enabled via the embedded flag expression
* (?s). (The s is a mnemonic for "single-line" mode,
* which is what this is called in Perl.)
*
* The value is taken directly from {@link java.util.regex.Pattern#DOTALL}.
*/
public static final int DOTALL = java.util.regex.Pattern.DOTALL;
/**
* Enables Unicode-aware case folding.
*
* When this flag is specified then case-insensitive matching, when
* enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
* consistent with the Unicode Standard. By default, case-insensitive
* matching assumes that only characters in the US-ASCII charset are being
* matched.
*
* Unicode-aware case folding can also be enabled via the embedded flag
* expression (?u).
*
* Specifying this flag may impose a performance penalty.
*
* The value is taken directly from {@link java.util.regex.Pattern#UNICODE_CASE}.
*/
public static final int UNICODE_CASE = java.util.regex.Pattern.UNICODE_CASE;
/**
* Enables canonical equivalence.
*
* When this flag is specified then two characters will be considered to
* match if, and only if, their full canonical decompositions match. The
* expression "a\u030A", for example, will match the string
* "\u00E5" when this flag is specified. By default, matching
* does not take canonical equivalence into account.
*
* There is no embedded flag character for enabling canonical
* equivalence.
*
* Specifying this flag may impose a performance penalty.
*
* The value is taken directly from {@link java.util.regex.Pattern#CANON_EQ}.
*/
public static final int CANON_EQ = java.util.regex.Pattern.CANON_EQ;
/**
* Enables the Unicode version of Predefined character classes and
* POSIX character classes.
*
* When this flag is specified then the (US-ASCII only)
* Predefined character classes and POSIX character classes
* are in conformance with
* Unicode Technical
* Standard #18: Unicode Regular Expression
* Annex C: Compatibility Properties.
*
* The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
* flag expression (?U).
*
* The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
* folding.
*
* Specifying this flag may impose a performance penalty.
*
* The value is taken directly from
* {@link java.util.regex.Pattern#UNICODE_CHARACTER_CLASS}.
*
* NOTE(review): the comment below says the underlying flag arrived with
* Java 1.7, so the "1.2" in the since tag presumably refers to this
* library's own version - confirm.
*
* @since 1.2
*/
// Added as part of Java 1.7
public static final int UNICODE_CHARACTER_CLASS = java.util.regex.Pattern.UNICODE_CHARACTER_CLASS;
/**
* Allows duplicate capture group names in pattern.
*
* If a pattern has this flag set, multiple capture groups with the same
* name are allowed. By default, capture group names must be unique.
*
* Allowing duplicate names can also be enabled via the embedded flag
* expression (?J).
*
* Value: {@code 0x80000000} (bit 31, the sign bit, so this constant is
* negative when read as an {@code int}).
*
* NOTE(review): presumably a high bit was chosen to stay clear of the
* flag values inherited from java.util.regex.Pattern - confirm.
*/
public static final int DUPLICATE_NAMES = 0x80000000;
/**
* When compiling a pattern, verifies that all referenced groups exist.
*
* If this flag is set, a {@link PatternSyntaxException} will be thrown
* if the pattern contains a reference to a non-existent group, whereas, by
* default, no exception would be thrown.
*
* Verification of groups can also be enabled via the embedded flag
* expression {@code (?v)}.
*/
public static final int VERIFY_GROUPS = 0x40000000;
/**
* Use Perl's octal syntax (instead of Java's).
*
* That is, \n is a back reference if at least that many
* groups have
* occurred at the current point in the pattern. Otherwise, up to the first
* three (octal) digits are used to form an octal code, and any additional
* trailing digits will be treated literally.
*
* Using Perl's octal syntax can also be enabled via the embedded flag
* expression {@code (?o)}.
*/
public static final int PERL_OCTAL = 0x20000000;
/**
* Use .NET numbering for capture groups (instead of Java's).
*
* In .NET, named-capture groups are numbered like unnamed groups,
* but numbering of named groups starts after all unnamed groups have been
* counted.
*
* For example, the expression
* ((?<One>A)B)?(?<Two>C)(D)
* produces the following capturing groups by number and name.
*
*
*
*
* Number
*
* Name
* Pattern
*
*
*
* 0
* none
* ((?<One>A)B)?(?<Two>C)(D)
*
*
*
* 1
* none
* ((?<One>A)B)
*
*
*
* 2
* none
* (D)
*
*
*
* 3
* One
* (?<One>A)
*
*
*
* 4
* Two
* (?<Two>C)
*
*
*
*/
// TODO: check about having inline modifier (doubtful)
public static final int DOTNET_NUMBERING = 0x10000000;
/**
* Enables explicit capture mode.
*
* In this mode, unnamed capture groups don't capture - that is, they
* are treated like non-capture groups. However, named capture groups can
* still be used for capturing (and they acquire numbers in the usual
* way).
*
* Explicit capture mode can also be enabled via the embedded flag
* expression (?n).
*
* Note: this feature is taken from .NET.
*/
public static final int EXPLICIT_CAPTURE = 0x8000000;
/*
* Pattern has only two serialized components: The pattern string and the
* flags, which are all that is needed to recompile the pattern when it is
* deserialized.
*/
/**
* use serialVersionUID from Merlin b59 for interoperability
*
* Note: this is the same {@code serialVersionUID} used in
* the Java {@code Pattern} class.
*/
private static final long serialVersionUID = 5073258162644648461L;
/** the internal {@link java.util.regex.Pattern} object for this pattern. */
private transient java.util.regex.Pattern internalPattern;
/** The pattern */
private final String pattern;
/** The flags. */
private final int flags;
/**
* Boolean indicating this Pattern is compiled; this is necessary in order
* to lazily compile deserialized Patterns.
*/
// TODO: learn about Serialization and test Class for correctness
private transient volatile boolean compiled = false;
/**
* The number of capturing groups in this Pattern.
*
* Note: this is the number of capture groups in the inputted
* pattern, which is not necessarily the actual number of capturing groups
* in the (refactored) internal pattern.
*/
private transient int capturingGroupCount;
/**
* Indicates whether groups were added when creating the internal pattern.
*
* Note: Some refactorings add capture groups (e.g. a "branch reset" subpattern), which are invisible to
* outside users, except when using the internal pattern to create a matcher.
private transient boolean addedGroups;
/**
* A map with mappings from a "mapping name" to the actual group number in
* the internal pattern.
*
*
* Note: both named and unnamed groups are included.
*/
private transient Map groupMapping;
/**
* A map with mappings from the group name to the group count for
* the group.
*/
private transient Map groupCounts;
/**
* @since 0.2
*/
// TODO: update cache to detect common pattern which are equivalent ??
// e.g. "(?i)abc" = "abc" with Case-insensitive flag
private static final Map patternCache = new Hashtable<>();
/**
* A pattern with the RegEx being the empty string
*/
// public static final Pattern EMPTY_PATTERN = Pattern.compile("");
/**
* A compiled Java pattern with the RegEx being the empty string.
*/
static final java.util.regex.Pattern JAVA_EMPTY_PATTERN = java.util.regex.Pattern.compile("");
/**
* Whether to use lazy compiling or to compile on creation (default, how Java patterns are done)
*
* Note: changing this setting will not affect {@code Pattern}s that are already created. To
* force a {@code Pattern} to compile, call the {@link #forceCompile()} method.
*/
@SuppressFBWarnings("MS_SHOULD_BE_FINAL")
public static boolean lazyCompiling = false;
/**
* Pattern used with the {@link #naturalCompareTo(CharSequence, CharSequence)} function
* to provide a natural sort
*/
private static final java.util.regex.Pattern naturalSort = java.util.regex.Pattern
.compile("\\G(?:(\\D++)|0*(\\d++)|$)");
/** The natural comparator */
private static Comparator naturalComparator = Pattern::naturalCompareTo;
/**
* Returns a comparator which sorts using {@link #naturalCompareTo(CharSequence, CharSequence)}, to treat embedded
* numbers as numbers, instead of comparing them lexicographically.
*
* NOTE: This comparator is case-sensitive, mimicking String comparisons.
*
* @return the natural comparator
* @since 0.2
*/
public static Comparator getNaturalComparator() {
    // Hand back the comparator stored in the static field; no new object is created per call
    final Comparator comparator = naturalComparator;
    return comparator;
}
private static class PatternCacheKey {
    /** The regular expression part of the key (may be {@code null}). */
    private final String regex;
    /** The match flags part of the key. */
    private final int flags;

    /**
     * Creates a key identifying a regular expression compiled with the given flags.
     *
     * @param regex the regular expression
     * @param flags the match flags
     */
    public PatternCacheKey(final String regex, final int flags) {
        this.regex = regex;
        this.flags = flags;
    }

    /**
     * Combines the flags and the regex hash with the usual 31-multiplier scheme.
     */
    @Override
    public int hashCode() {
        final int regexHash = (this.regex != null) ? this.regex.hashCode() : 0;
        return 31 * (31 + this.flags) + regexHash;
    }

    /**
     * Two keys are equal when they are of the same class and have equal flags and equal
     * (or both {@code null}) regular expressions.
     */
    @Override
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || this.getClass() != obj.getClass()) {
            return false;
        }
        final PatternCacheKey that = (PatternCacheKey) obj;
        if (this.flags != that.flags) {
            return false;
        }
        return (this.regex == null) ? (that.regex == null) : this.regex.equals(that.regex);
    }

    /**
     * Renders the key as the flags, a colon and the regular expression.
     */
    @Override
    public String toString() {
        return this.flags + ": " + this.regex;
    }
}
/**
* Compiles the given regular expression into a pattern.
*
* @param regex
* The expression to be compiled
* @return The compiled Pattern
*
* @throws PatternSyntaxException
* If the expression's patternSyntax is invalid
*/
public static Pattern compile(final String regex) {
    // No match flags: delegate to the (String, int) overload with an empty bit mask
    final int noFlags = 0;
    return compile(regex, noFlags);
}
/**
* Compiles the given regular expression into a pattern with the given
* flags.
*
* @param regex
* The expression to be compiled
*
* @param flags
* the flags
*
* @return The compiled Pattern
*
* @throws PatternSyntaxException
* If the expression's patternSyntax is invalid
*/
public static Pattern compile(final String regex, final PatternOptions... flags) {
    // Collapse the options into their int bit mask, then use the (String, int) overload
    final int flagBits = new PatternFlags(flags).intValue();
    return compile(regex, flagBits);
}
public static Pattern compile(final String regex, final int... flags) {
    // OR every supplied flag together into a single bit mask
    int combined = 0;
    for (int i = 0; i < flags.length; i++) {
        combined = combined | flags[i];
    }
    return compile(regex, combined);
}
/**
* Compiles the given regular expression into a pattern with the given
* flags.
*
* @param regex
* The expression to be compiled
*
* @param flags
* Match flags, a bit mask that may include {@link #CASE_INSENSITIVE}, {@link #MULTILINE},
* {@link #DOTALL}, {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, {@link #LITERAL},
* {@link #COMMENTS},
*
* {@link #DUPLICATE_NAMES}, {@link #VERIFY_GROUPS}, {@link #PERL_OCTAL}, {@link #DOTNET_NUMBERING},
* and {@link #EXPLICIT_CAPTURE}
*
* @return The compiled Pattern
*
* @throws IllegalArgumentException
* If bit values other than those corresponding to the defined
* match flags are set in flags
*
* @throws PatternSyntaxException
* If the expression's patternSyntax is invalid
*/
public static Pattern compile(final String regex, final int flags) {
    // The static lazyCompiling setting decides whether compilation is deferred
    final boolean deferCompilation = lazyCompiling;
    return compile(regex, flags, deferCompilation);
}
static Pattern lazyCompile(final String regex) {
    // Lazy compilation without any match flags
    final int noFlags = 0;
    return lazyCompile(regex, noFlags);
}
private static Pattern lazyCompile(final String regex, final int flags) {
    // Always request lazy compilation, regardless of the static lazyCompiling setting
    final boolean lazy = true;
    return compile(regex, flags, lazy);
}
/**
* Compiles the given regular expression into a pattern with the given flags, reusing a cached
* pattern when one exists for the same regular expression and flags.
*
* @param regex
* The expression to be compiled
* @param flags
* Match flags, a bit mask
* @param lazyCompiling
* whether a newly created pattern may defer its compilation; when <code>false</code>, the returned
* pattern is compiled before this method returns, even if it came from the cache
*
* @return The (possibly cached) Pattern
*/
public static Pattern compile(final String regex, final int flags, final boolean lazyCompiling) {
    PatternCacheKey key = new PatternCacheKey(regex, flags);
    Pattern cachedPattern = patternCache.get(key);
    if (cachedPattern == null) {
        // Construct outside the lock so the cache is not blocked while a pattern is being created
        Pattern newPattern = new Pattern(regex, flags, lazyCompiling);
        synchronized (patternCache) {
            // Re-check under the lock: only the first pattern for a key is stored, so every
            // caller receives the same instance instead of later ones overwriting earlier ones
            cachedPattern = patternCache.get(key);
            if (cachedPattern == null) {
                patternCache.put(key, newPattern);
                return newPattern;
            }
        }
    }
    if (!lazyCompiling) {
        // The cached instance may have been created lazily (e.g. via lazyCompile); a caller asking
        // for eager compilation must still get a compiled pattern
        cachedPattern.forceCompile();
    }
    return cachedPattern;
}
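/**
* Returns the {@code Pattern} corresponding to the given {@link java.util.regex.Pattern}.
*
* The pattern cache is consulted first, keyed on the given pattern's regular expression and flags; a new
* {@code Pattern} is created (and cached) only when no entry exists.
*
* @param pattern
* the Java pattern to convert
* @return the cached or newly created {@code Pattern}
* @since 0.2
*/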
public static Pattern valueOf(final java.util.regex.Pattern pattern) {
    final PatternCacheKey cacheKey = new PatternCacheKey(pattern.pattern(), pattern.flags());
    // First look-up is done without taking the cache lock
    Pattern result = patternCache.get(cacheKey);
    if (result == null) {
        synchronized (patternCache) {
            // Look again under the lock before creating and storing a new entry
            result = patternCache.get(cacheKey);
            if (result == null) {
                result = new Pattern(pattern);
                patternCache.put(cacheKey, result);
            }
        }
    }
    return result;
}
/**
* Forces this {@code Pattern} to compile.
*
* If the pattern has already been compiled, then this method returns immediately. Otherwise, the pattern is
* compiled.
*
* Starting with version 0.2 of RegExPlus, patterns can be lazily compiled (compiled when needed) by
* setting the {@link #lazyCompiling} static field to {@code true}. Calling this method forces these lazily
* compiled patterns to compile.
*
* @since 0.2
*/
public Pattern forceCompile() {
    // Nothing to do once the compiled flag has been observed as set
    if (this.compiled) {
        return this;
    }
    synchronized (this) {
        // Check again while holding the lock so compile() is not run if the flag was set meanwhile
        if (!this.compiled) {
            this.compile();
        }
    }
    return this;
}
/**
* Gets the internal pattern
*
* @return The internal {@link java.util.regex.Pattern} used by this
* pattern.
*/
public java.util.regex.Pattern getInternalPattern() {
    // forceCompile() returns this pattern, so the field is read after compilation was forced
    return this.forceCompile().internalPattern;
}
/**
* Returns the regular expression from which the internal pattern was
* compiled.
*
* @return The source of the internal pattern
*/
public String internalPattern() {
    final java.util.regex.Pattern internal = this.getInternalPattern();
    return internal.pattern();
}
/**
* Returns the regular expression from which this pattern was compiled.
*
* @return The source of this pattern
*/
public String pattern() {
    // The regular expression exactly as it was handed to this pattern
    final String source = this.pattern;
    return source;
}
/**
* Indicates whether additional capture groups were added to the internal pattern when refactoring the compiled
* regular expression.
*
* @return
* @since 0.2
*/
public boolean addedGroups() {
    // Compilation is forced first; forceCompile() returns this pattern
    return this.forceCompile().addedGroups;
}
/**
* Returns the names of the groups in this pattern.
*
* A group name that occurs once is listed as is; a group name that occurs multiple times is
* listed once per occurrence, in the form <code>groupName[occurrence]</code> (occurrences start at 1).
*
* @return an unmodifiable list of the group names
*/
public List getGroupNames() {
    // Force compilation before reading groupCounts, as groupCount() and addedGroups() do;
    // presumably the map is only populated by compilation, so a lazily created pattern could
    // otherwise fail here
    this.forceCompile();
    List groupNames = new ArrayList<>();
    for (Entry groupEntry : this.groupCounts.entrySet()) {
        String groupName = groupEntry.getKey();
        int occurrences = groupEntry.getValue();
        if (occurrences == 1) {
            groupNames.add(groupName);
        } else {
            // Duplicate names: list every occurrence explicitly
            for (int i = 1; i <= occurrences; i++) {
                groupNames.add(groupName + "[" + i + "]");
            }
        }
    }
    return Collections.unmodifiableList(groupNames);
}
/**
* Returns the number of capturing groups in this pattern.
*
* Group zero denotes the entire pattern by convention. It is not
* included in this count.
*
* Any non-negative integer smaller than or equal to the value returned
* by this method is guaranteed to be a valid group index for this
* matcher.
*
* @return The number of capturing groups in this matcher's pattern
*/
public int groupCount() {
    // Compilation is forced first; forceCompile() returns this pattern
    return this.forceCompile().capturingGroupCount;
}
/**
* Returns the number of capturing groups (with the given group index) in
* this pattern.
*
* Note: in most cases, this return will be 1 - the only exception
* is in the case
* of a "branch reset" pattern, where there may be multiple groups with the
* same group index.
*
*
* For example,
* // Outputs 2, since there are two groups that have the group index of 1
* System.out.println(Pattern.compile("(?|(1a)|(1b))").groupCount(1));
*
*
* Group zero denotes the entire pattern by convention. It is not
* included in this count.
*
* Any non-negative integer smaller than or equal to the value returned
* by this method is guaranteed to be a valid occurrence (for a group,
* groupName[occurrence]) for this
* matcher.
*
* Note: unlike other methods, this
* method doesn't throw an exception if the specified group doesn't exist.
* Instead, zero is returned, since the number of groups with the
* (non-existent) group name is zero.
*
* @param group
* The group index for a capturing group in this matcher's
* pattern
*
* @return The number of capturing groups (with the given group index) in
* this matcher's pattern
* @since 0.2
*/
public int groupCount(final int group) {
    final String mappingName;
    try {
        // Resolve the index against the total group count, then wrap it into its mapping name
        mappingName = wrapIndex(getAbsoluteGroupIndex(group, this.groupCount()));
    } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
        // The group cannot be resolved, so there are no groups with that index
        return 0;
    }
    final Integer count = this.groupCounts.get(mappingName);
    return count == null ? 0 : count;
}
/**
* Returns the number of capturing groups (with the given group name) in
* this pattern.
*
*
* Group zero denotes the entire pattern by convention. It is not
* included in this count.
*
* Any non-negative integer smaller than or equal to the value returned
* by this method is guaranteed to be a valid occurrence (for a group,
* groupName[occurrence]) for this
* matcher.
*
* If {@code groupName} is the empty string, this method's return is
* equal to the return from {@link #groupCount()}.
*
* Note: unlike other methods, this
* method doesn't throw an exception if the specified group doesn't exist.
* Instead, zero is returned, since the number of groups with the
* (non-existent) group name is zero.
*
* @param groupName
* The group name for a capturing group in this matcher's pattern
*
* @return The number of capturing groups (with the given group name) in
* this matcher's pattern
*/
public int groupCount(String groupName) {
    this.forceCompile();
    final String normalizedName;
    try {
        normalizedName = this.normalizeGroupName(groupName);
    } catch (IllegalArgumentException e) {
        /*
         * groupName is a relative unnamed group (e.g. "[-4]"), which doesn't exist, or an
         * unnamed group whose index is not a parsable integer (e.g. "[a]").
         * Either way the group is illegal, so its count is zero.
         */
        // TODO: if not parsable int, throw exception
        return 0;
    }
    final Integer count = this.groupCounts.get(normalizedName);
    if (count == null) {
        return 0;
    }
    return count;
}
/**
* Indicates whether this pattern has any capturing groups.
*
* @return {@code true} if this pattern has at least one capturing group; otherwise, {@code false}
*
* @since 0.2
*/
public boolean hasGroup() {
    final int total = this.groupCount();
    return total > 0;
}
/**
* Indicates whether this pattern contains the specified group.
*
* @param group
* The group index for a capturing group in this pattern
* @return {@code true} if this pattern contains the specified group; otherwise, {@code false}.
*
* @since 0.2
*/
public boolean hasGroup(final int group) {
    // groupCount(int) yields zero for a group that does not exist
    final int matchingGroups = this.groupCount(group);
    return matchingGroups > 0;
}
/**
* Indicates whether this pattern contains the specified group.
*
* @param group
* A capturing group in this pattern
* @return {@code true} if this pattern contains the specified group; otherwise, {@code false}.
*
* @since 0.2
*/
public boolean hasGroup(final String group) {
    // Split the full group reference into its name (group 1) and optional occurrence (group 2)
    final java.util.regex.Matcher parts = fullGroupName.matcher(group);
    if (parts.matches()) {
        final String name = parts.group(1);
        final String occurrenceText = parts.group(2);
        // A reference without an explicit occurrence is passed on as occurrence 0
        final int occurrence;
        if (occurrenceText == null) {
            occurrence = 0;
        } else {
            occurrence = parseInt(occurrenceText);
        }
        return this.hasGroup(name, occurrence);
    }
    return false;
}
/**
* Indicates whether this pattern contains the specified group.
*
* @param groupName
* The group name for a capturing group in this pattern
*
* @param occurrence
* The occurrence of the specified group name
* @return {@code true} if this pattern contains the specified group; otherwise, {@code false}.
*
* @since 0.2
*/
@SuppressFBWarnings("RV_RETURN_VALUE_IGNORED_NO_SIDE_EFFECT")
public boolean hasGroup(final String groupName, final int occurrence) {
    final int total = this.groupCount(groupName);
    if (total != 0) {
        try {
            // Only called for its range check; the resolved index itself is not needed
            getAbsoluteGroupIndex(occurrence, total);
            return true;
        } catch (IndexOutOfBoundsException e) {
            // The occurrence is out of range: fall through and report the group as absent
        }
    }
    return false;
}
/**
* Returns the group mapping.
*
* @return The group mapping
*/
Map getGroupMapping() {
    // Returns the field itself, not a copy
    final Map mapping = this.groupMapping;
    return mapping;
}
/**
* Returns the string representation of this pattern. This is the regular
* expression from which this pattern was compiled.
*
* @return The string representation of this pattern
*/
@Override
public String toString() {
    // The string form is the regular expression this pattern was created from
    final String source = this.pattern;
    return source;
}
/**
* Creates a matcher that will match the empty string against this pattern.
*
* This is commonly used to initialize an "empty" matcher with a later call to {@link Matcher#reset(CharSequence)}.
*
* This method can also be used in Java 8 as a Supplier via pattern::matcher
* @return
* @since 1.0
*/
public Matcher matcher() {
    // Start from an empty input
    final CharSequence emptyInput = "";
    return this.matcher(emptyInput);
}
/**
* Creates a matcher that will match the given input against this
* pattern.
*
* @param input
* The character sequence to be matched
*
* @return A new matcher for this pattern
*/
public Matcher matcher(final CharSequence input) {
    this.forceCompile();
    // Wrap a matcher of the internal Java pattern together with this pattern and the input
    final java.util.regex.Matcher internalMatcher = this.internalPattern.matcher(input);
    return new Matcher(internalMatcher, this, input);
}
/**
* Indicates whether the given input partially matches this
* {@code Pattern}.
*
* For the given input to be a partial match, it must be the prefix
* of some valid match. Conversely, if this method returns {@code false},
* then appending characters to the given input will never
* yield a match.
*
* For example, given the following pattern to match a decimal number
*
*
* Pattern p = Pattern.compile("\\d+\\.\\d+");
*
* The following calls return true
*
*
* p.isPartialMatch("");
* p.isPartialMatch("1");
* p.isPartialMatch("2");
* p.isPartialMatch("9");
* p.isPartialMatch("123");
* p.isPartialMatch("123.");
* p.isPartialMatch("123.456");
* // p.matcher("123.456").matches() would also return true (see note below)
*
* Whereas these calls return false
*
*
* p.isPartialMatch("a");
* p.isPartialMatch(".");
* p.isPartialMatch(".4");
* p.isPartialMatch(".45");
* p.isPartialMatch(".456");
*
* Note: if the given input would match the pattern, this method
* returns {@code true}. That is, a match is also a partial match.
*
* @param input
* The character sequence to be matched
*
* @return true if, and only if, the given input partially
* matches
* this pattern
*/
/*
* Original Source:
*
* http://forums.sun.com/thread.jspa?messageID=4425768#4425768
*/
public boolean isPartialMatch(final CharSequence input) {
    final Matcher candidate = setLastMatcher(this.matcher(input));
    // A full match counts as a partial match; otherwise the input is a partial match
    // only if the match attempt hit the end of the input
    return candidate.matches() || candidate.hitEnd();
}
/**
* Returns this pattern's match flags.
*
* @return The match flags specified when this pattern was compiled
*/
public int flags() {
    // The flag bit mask stored in this pattern
    final int flagBits = this.flags;
    return flagBits;
}
/**
* @return
* @since 0.2
*/
public PatternFlags getFlags() {
    // Wrap the int bit mask in a new PatternFlags object
    final int flagBits = this.flags();
    return new PatternFlags(flagBits);
}
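/**
* Normalizes the given Java pattern by inlining its flags, where an inline modifier exists, into the
* regular expression.
*
* @param pattern
* the Java pattern to normalize
* @return a lazily compiled {@code Pattern} for the normalized regular expression
*/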
public static Pattern normalize(final java.util.regex.Pattern pattern) {
    int flags = pattern.flags();
    if (flags == 0) {
        // Done this way to allow lazy compiling, since normalized forms aren't used for matchers (in my code)
        // Could instead use new Pattern(pattern), same end result
        // TODO: instead make valueOf method lazily compilable
        return lazyCompile(pattern.pattern());
    }
    /* Inline modifiers */
    String unixLines = (flags & UNIX_LINES) != 0 ? "d" : "";
    String caseInsensitive = (flags & CASE_INSENSITIVE) != 0 ? "i" : "";
    String comments = (flags & COMMENTS) != 0 ? "x" : "";
    String multiline = (flags & MULTILINE) != 0 ? "m" : "";
    String dotall = (flags & DOTALL) != 0 ? "s" : "";
    String unicodeCase = (flags & UNICODE_CASE) != 0 ? "u" : "";
    String duplicateNames = (flags & DUPLICATE_NAMES) != 0 ? "J" : "";
    String explicitCapture = (flags & EXPLICIT_CAPTURE) != 0 ? "n" : "";
    String perlOctal = (flags & PERL_OCTAL) != 0 ? "o" : "";
    String verifyGroups = (flags & VERIFY_GROUPS) != 0 ? "v" : "";
    String unicodeCharacterClass = (flags & UNICODE_CHARACTER_CLASS) != 0 ? "U" : "";
    /* No inline modifier: these must be carried over as flags */
    int canonEq = (flags & CANON_EQ) != 0 ? CANON_EQ : 0;
    int dotnetNumbering = (flags & DOTNET_NUMBERING) != 0 ? DOTNET_NUMBERING : 0;
    if ((flags & LITERAL) != 0) {
        // Not sure if CANON_EQ is included for literal, docs says no, so following the docs
        String newFlags = caseInsensitive + unicodeCase;
        if (newFlags.length() != 0) {
            newFlags = "(?" + newFlags + ")";
        }
        // TODO: see if cannot add constructor that removes need to recompile pattern
        // Note that java 5 has a bug, where RegExPlus refactors quote blocks with escaped metacharacters
        // Other java versions, a quoted section does not need refactoring
        return lazyCompile(newFlags + quote(pattern.pattern()));
    }
    String newFlags = unixLines + caseInsensitive + comments + multiline + dotall + unicodeCase + duplicateNames
            + explicitCapture + perlOctal + verifyGroups + unicodeCharacterClass;
    if (newFlags.length() != 0) {
        newFlags = "(?" + newFlags + ")";
    }
    // The flags without an inline modifier are always passed along, including when nothing could
    // be inlined (e.g. a pattern compiled with only CANON_EQ); dropping them there would change
    // what the normalized pattern matches.
    // TODO: see if cannot add constructor that removes need to recompile pattern
    return lazyCompile(newFlags + pattern.pattern(), canonEq | dotnetNumbering);
}
/**
* Normalizes the pattern by inlining all possible flags.
*
* Note: the returned pattern matches the exact same inputs as this pattern.
*
* @return the normalized pattern
*/
public Pattern normalize() {
    if (this.flags() == 0) {
        return this;
    }
    final boolean literal = this.has(LITERAL);
    // Flags that have an inline modifier, paired position-by-position with their modifier letters.
    // Not sure if CANON_EQ is included for literal, docs says no, so following the docs:
    // a literal pattern only inlines the case-related modifiers.
    final int[] inlinable = literal
            ? new int[] { CASE_INSENSITIVE, UNICODE_CASE }
            : new int[] { UNIX_LINES, CASE_INSENSITIVE, COMMENTS, MULTILINE, DOTALL, UNICODE_CASE,
                    DUPLICATE_NAMES, EXPLICIT_CAPTURE, PERL_OCTAL, VERIFY_GROUPS, UNICODE_CHARACTER_CLASS };
    final String letters = literal ? "iu" : "dixmsuJnovU";
    final StringBuilder modifiers = new StringBuilder();
    for (int i = 0; i < inlinable.length; i++) {
        if (this.has(inlinable[i])) {
            modifiers.append(letters.charAt(i));
        }
    }
    if (literal) {
        final String prefix = (modifiers.length() == 0) ? "" : "(?" + modifiers + ")";
        // TODO: see if cannot add constructor that removes need to recompile pattern
        // Note that java 5 has a bug, where RegExPlus refactors quote blocks with escaped metacharacters
        // Other java versions, a quoted section does not need refactoring
        return lazyCompile(prefix + quote(this.pattern()));
    }
    if (modifiers.length() == 0) {
        // No changes are necessary, since no flags can be inlined
        return this;
    }
    // Flags without an inline modifier stay as flags on the new pattern
    int remaining = 0;
    if (this.has(CANON_EQ)) {
        remaining |= CANON_EQ;
    }
    if (this.has(DOTNET_NUMBERING)) {
        remaining |= DOTNET_NUMBERING;
    }
    // TODO: see if cannot add constructor that removes need to recompile pattern
    return lazyCompile("(?" + modifiers + ")" + this.pattern(), remaining);
}
/**
* Compiles the given regular expression and attempts to match the given
* input against it.
*
* An invocation of this convenience method of the form
*
*
* Pattern.matches(regex, input);
*
* behaves in exactly the same way as the expression
*
*
* Pattern.compile(regex).matcher(input).matches()
*
* If a pattern is to be used multiple times, compiling it once and
* reusing it will be more efficient than invoking this method each
* time.
*
* @param regex
* The expression to be compiled
*
* @param input
* The character sequence to be matched
*
* @return true if, and only if, the entire region
* sequence matches
* this matcher's pattern
*
* @throws PatternSyntaxException
* If the expression's patternSyntax is invalid
*/
public static boolean matches(final String regex, final CharSequence input) {
    // Compile, create the matcher and attempt the match in one chained expression
    return Pattern.compile(regex).matcher(input).matches();
}
/**
* Replaces the first substring of the given input sequence that
* matches the
* given regular expression with the given replacement.
*
* An invocation of this method of the form
* Pattern.replaceFirst(input,
* regex, replacement) yields exactly the
* same result as the expression
*
*
{@link Pattern}.{@link Pattern#compile
* compile}(regex).{@link Pattern#matcher(java.lang.CharSequence)
* matcher}(input).{@link Matcher#replaceFirst
* replaceFirst}(replacement)
*
* Note that backslashes (\) and dollar signs ($) in
* the replacement string may cause the results to be different than if it
* were being treated as a literal replacement string; see {@link Matcher#replaceFirst}. Use
* {@link Matcher#quoteReplacement} to
* suppress the special meaning of these characters, if desired.
*
* Note: this function serves as a substitute for {@link String#replaceFirst(String, String)}.
*
* @param input
* The character sequence to be matched
* @param regex
* The regular expression to which the input sequence is to be
* matched
* @param replacement
* The string to be substituted for the first match
*
* @return The resulting String
*
* @throws PatternSyntaxException
* If the regular expression's patternSyntax is invalid
*/
public static String replaceFirst(final CharSequence input, final String regex, final String replacement) {
    final Pattern compiled = Pattern.compile(regex);
    final Matcher inputMatcher = compiled.matcher(input);
    return inputMatcher.replaceFirst(replacement);
}
/**
* Replaces each substring of the given input sequence that matches
* the
* given regular expression with the given replacement.
*
* An invocation of this method of the form
* Pattern.replaceAll(input,
* regex,
* replacement) yields exactly the same result as the
* expression
*
*
{@link Pattern}.{@link Pattern#compile
* compile}(regex).{@link Pattern#matcher(java.lang.CharSequence)
* matcher}(input).{@link Matcher#replaceAll
* replaceAll}(replacement)
*
* Note that backslashes (\) and dollar signs ($) in
* the replacement string may cause the results to be different than if it
* were being treated as a literal replacement string; see {@link Matcher#replaceAll}. Use
* {@link Matcher#quoteReplacement} to
* suppress the special meaning of these characters, if desired.
*
* Note: this function serves as a substitute for {@link String#replaceAll(String, String)}.
*
* @param input
* The character sequence to be matched
* @param regex
* The regular expression to which the input sequence is to be
* matched
* @param replacement
* The string to be substituted for each match
*
* @return The resulting String
*
* @throws PatternSyntaxException
* If the regular expression's patternSyntax is invalid
*/
public static String replaceAll(final CharSequence input, final String regex, final String replacement) {
    final Pattern compiled = Pattern.compile(regex);
    final Matcher inputMatcher = compiled.matcher(input);
    return inputMatcher.replaceAll(replacement);
}
/**
* Splits the given input sequence around matches of the given regular
* expression.
*
*
* The array returned by this method contains each substring of the
* input
* sequence that is terminated by another substring that matches the given
* expression or is terminated by the end of the string. The substrings in
* the array are in the order in which they occur in this string. If the
* expression does not match any part of the input then the resulting
* array
* has just one element, namely the input sequence.
*
*
* The limit parameter controls the number of times the pattern is
* applied and therefore affects the length of the resulting array. If the
* limit n is greater than zero then the pattern will be applied at
* most n - 1 times, the array's length will be no greater
* than n, and the array's last entry will contain all input
* beyond
* the last matched delimiter. If n is non-positive then the pattern
* will be applied as many times as possible and the array can have any
* length. If n is zero then the pattern will be applied as many
* times as possible, the array can have any length, and trailing empty
* strings will be discarded.
*
*
* The string "boo:and:foo", for example, yields the following
* results with these parameters:
*
*
*
*
* Regex
* Limit
* Result
*
*
* :
* 2
* { "boo", "and:foo" }
*
*
* :
* 5
* { "boo", "and", "foo" }
*
*
* :
* -2
* { "boo", "and", "foo" }
*
*
* o
* 5
* { "b", "", ":and:f", "", "" }
*
*
* o
* -2
* { "b", "", ":and:f", "", "" }
*
*
* o
* 0
* { "b", "", ":and:f" }
*
*
*
*
*
* An invocation of this method of the form
* Pattern.split(input,
* regex,
* n) yields the same result as the expression
*
*
{@link Pattern}.{@link Pattern#compile
* compile}(regex).{@link Pattern#split(CharSequence, int)
* split}(input,limit)
*
* Note: this function serves as a substitute for {@link String#split(String, int)}.
*
* @param input
* The character sequence to be split
* @param regex
* The delimiting regular expression
* @param limit
* The result threshold, as described above
*
* @return The array of strings computed by splitting the input
* sequence around matches of the given regular expression
*
* @throws PatternSyntaxException
* If the regular expression's patternSyntax is invalid
*/
public static String[] split(final CharSequence input, final String regex, final int limit) {
    // Compile the delimiter expression, then split with the instance method
    final Pattern delimiter = Pattern.compile(regex);
    return delimiter.split(input, limit);
}
/**
* Splits the given input sequence around matches of the given regular expression.
*
*
* This method works as if by invoking the three-argument {@link #split(CharSequence, String, int) split} method
* with the given
* input sequence, expression and a limit argument of zero. Trailing
* empty
* strings are therefore not included in the resulting array.
*
*
*
* The string "boo:and:foo", for example, yields the following
* results with these expressions:
*
*
*
*
* Regex
* Result
*
*
* :
* { "boo", "and", "foo" }
*
*
* o
* { "b", "", ":and:f" }
*
*
*
*
* Note: this function serves as a substitute for {@link String#split(String)}.
*
* @param input
* The character sequence to be split
* @param regex
* The delimiting regular expression
*
* @return The array of strings computed by splitting the input
* sequence
* around matches of the given regular expression
*
* @throws PatternSyntaxException
* If the regular expression's patternSyntax is invalid
*/
public static String[] split(final CharSequence input, final String regex) {
// Compile the regex with Pattern.compile(String) and split with a fixed limit of 0.
return Pattern.compile(regex).split(input, 0);
}
/**
* Splits the given input sequence around matches of this pattern.
*
*
* The array returned by this method contains each substring of the
* input
* sequence that is terminated by another subsequence that matches this
* pattern or is terminated by the end of the input sequence. The
* substrings
* in the array are in the order in which they occur in the input. If
* this
* pattern does not match any subsequence of the input then the
* resulting
* array has just one element, namely the input sequence in string
* form.
*
*
* The limit parameter controls the number of times the pattern is
* applied and therefore affects the length of the resulting array. If the
* limit n is greater than zero then the pattern will be applied at
* most n - 1 times, the array's length will be no greater
* than n, and the array's last entry will contain all input
* beyond
* the last matched delimiter. If n is non-positive then the pattern
* will be applied as many times as possible and the array can have any
* length. If n is zero then the pattern will be applied as many
* times as possible, the array can have any length, and trailing empty
* strings will be discarded.
*
*
* The input "boo:and:foo", for example, yields the following
* results with these parameters:
*
*
*
*
*
*
*
* Regex
*
*
* Limit
*
*
* Result
*
*
* :
* 2
* { "boo", "and:foo" }
*
*
* :
* 5
* { "boo", "and", "foo" }
*
*
* :
* -2
* { "boo", "and", "foo" }
*
*
* o
* 5
* { "b", "", ":and:f", "", "" }
*
*
* o
* -2
* { "b", "", ":and:f", "", "" }
*
*
* o
* 0
* { "b", "", ":and:f" }
*
*
*
*
*
* @param input
* The character sequence to be split
*
* @param limit
* The result threshold, as described above
*
* @return The array of strings computed by splitting the input around
* matches of this pattern
*/
public String[] split(final CharSequence input, final int limit) {
// The split itself is carried out by the java.util.regex.Pattern returned from getInternalPattern().
return this.getInternalPattern().split(input, limit);
}
/**
* Splits the given input sequence around matches of this pattern.
*
*
* This method works as if by invoking the two-argument {@link #split(java.lang.CharSequence, int) split} method
* with the given
* input sequence and a limit argument of zero. Trailing empty strings
* are
* therefore not included in the resulting array.
*
*
*
* The input "boo:and:foo", for example, yields the following
* results with these expressions:
*
*
*
*
*
*
* Regex
*
*
* Result
*
*
* :
* { "boo", "and", "foo" }
*
*
* o
* { "b", "", ":and:f" }
*
*
*
*
*
* @param input
* The character sequence to be split
*
* @return The array of strings computed by splitting the input around
* matches of this pattern
*/
public String[] split(final CharSequence input) {
// Same as split(input, 0): delegates to the pattern returned from getInternalPattern() with a limit of 0.
return this.getInternalPattern().split(input, 0);
}
/**
* Returns a literal pattern String
for the specified
* String
.
*
*
* This method produces a String
that can be used to create a
* Pattern
that would match the string s
as if it
* were a literal pattern.
*
*
*
* Metacharacters or escape sequences in the input sequence will be
* given no
* special meaning.
*
*
* @param s
* The string to be literalized
* @return A literal string replacement
*/
public static String quote(final String s) {
// Pure delegation to the JDK's java.util.regex.Pattern.quote; nothing is added or changed here.
return java.util.regex.Pattern.quote(s);
}
/**
* Java regular expression metacharacters.
*
* \.*?+[]{}|()^$#
*
* Note: '#' is included in the escaped metacharacters in case {@link Pattern#COMMENTS COMMENTS} is
* enabled, and the escaped text is only a part of the regular expression, which occurs when the COMMENTS flag is
* enabled.
*
* Note: ']' is included in the case that the regular expression is used in a regex tool that
* requires the closing brace to be escaped (for example, Javascript). Java does not require it to be
* escaped, but escaping it does no harm.
*
* Note: '}' is included in the case that the regular expression is used in a regex tool that
* requires the closing curly brace to be escaped (for example, Android development). Java does not require it to be
* escaped, but escaping it does no harm.
*/
// Passed by literal(String) as the set of characters that get a backslash prepended.
public static final String REGEX_METACHARACTERS = "\\.*?+[]{}|()^$#";
/**
* Java regular expression metacharacters when within a character class (for example, [abc]
).
*
* ^-[]\#&
*
* Note: '#' is included in the escaped metacharacters in case {@link Pattern#COMMENTS COMMENTS} is
* enabled, and the escaped text is only a part of the regular expression, which occurs when the COMMENTS flag is
* enabled.
*/
// NOTE(review): no code visible in this part of the file reads this constant; presumably callers pass it
// as the metacharacters argument of literal(String, String) - confirm.
public static final String REGEX_CHAR_CLASS_METACHARACTERS = "^-[]\\#&";
/** Pattern to match any character. */
// private static final Pattern anyCharacterPattern = compile(".", DOTALL);
/**
* Pattern used to escape metacharacters.
*/
// static final Pattern ESCAPE_REGEX_METACHARACTERS = escapeMetacharacters(REGEX_METACHARACTERS);
/**
* Pattern used to escape metacharacters found in a character class.
*/
// static final Pattern ESCAPE_REGEX_CHAR_CLASS_METACHARS = escapeMetacharacters(REGEX_CHAR_CLASS_METACHARACTERS);
/**
* Returns a literal pattern String
for the specified
* String
.
*
* This method produces a String
that can be used to
* create a Pattern
that would match the string s
as
* if it were a literal pattern.
*
* Metacharacters or escape sequences in the input sequence will be given no special meaning.
*
* This method escapes the metacharacters specified by {@link #REGEX_METACHARACTERS}:
* \.*?+[]{}|()^$#
*
* Note: this function escapes each metacharacter individually,
* whereas {@link Pattern#quote(String)} uses a \Q..\E
block. This
* method can be used to create a regular expression to use with tools that don't support \Q..\E
* blocks.
*
* @param s
* The string to be literalized
* @return A literal string replacement
*/
public static String literal(final String s) {
// Delegate to the two-argument overload, escaping the characters listed in REGEX_METACHARACTERS.
return literal(s, REGEX_METACHARACTERS);
}
/**
* Returns a literal pattern String
for the specified
* String
.
*
* This method produces a String
that can be used to
* create a Pattern
that would match the string s
as
* if it were a literal pattern.
*
* The specified metacharacters
or escape sequences in the input sequence will be given no special
* meaning.
*
* Note: this function escapes each metacharacter individually,
* whereas {@link Pattern#quote(String)} uses a \Q..\E
block. This
* method can be used to create a regular expression to use with tools that don't support \Q..\E
* blocks.
*
* @param s
* The string to be literalized
* @param metacharacters
* the metacharacters to escape
* @return A literal string replacement
*/
public static String literal(final String s, final String metacharacters) {
    final int length = s.length();

    // The result holds at least length characters and at most twice as many
    final StringBuilder escaped = new StringBuilder(length);

    for (int index = 0; index < length; index++) {
        final char current = s.charAt(index);

        // ASCII letters and digits are always copied as-is, even when they
        // are listed in metacharacters
        final boolean asciiAlphanumeric = ('a' <= current && current <= 'z')
                || ('A' <= current && current <= 'Z')
                || ('0' <= current && current <= '9');

        if (!asciiAlphanumeric && metacharacters.indexOf(current) != -1) {
            // Listed metacharacter: prefix it with a single backslash
            escaped.append('\\');
        }

        escaped.append(current);
    }

    return escaped.toString();
}
/**
* Returns a literal pattern String
for the specified
* String
.
*
* This method produces a String
that can be used to
* create a Pattern
that would match the string s
as
* if it were a literal pattern.
*
* Metacharacters or escape sequences in the input sequence will be given no special meaning.
*
* This method escapes the following metacharacters:
*
* Note: this function escapes each metacharacter individually,
* whereas {@link Pattern#quote(String)} uses a \Q..\E
block. This
* method can be used to create a regular expression to use with tools that don't support \Q..\E
* blocks.
*
* @param s
* The string to be literalized
* @param escapeMetachars
* pattern to match the metacharacters to escape
* @return A literal string replacement
*/
// static String literal(String s, Pattern escapeMetachars)
// {
// return escapeMetachars.matcher(s).replaceAll("\\\\$0");
// }
/**
* Returns a pattern which will match any single metacharacter in the specified metacharacters.
*
* @param metacharacters
* the metacharacters
* @return a pattern which will match any single metacharacter in the specified metacharacters
*/
// private static Pattern escapeMetacharacters(String metacharacters)
// {
// return compile("[" + literal(metacharacters, anyCharacterPattern) + "]", 0, false);
// }
/**
* Recompile the Pattern instance from a stream. The original pattern string
* is read in and the object tree is recompiled from it.
*/
private void readObject(final java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException {
// Read in all fields
s.defaultReadObject();
// this.caseInsensitiveGroupNames = has(CASE_INSENSITIVE_NAMES);
// Replace the group mapping with a fresh, empty map
this.groupMapping = new HashMap<>(2);
// if length > 0, the Pattern is lazily compiled
this.compiled = false;
if (this.pattern.length() == 0) {
// The empty pattern is set up right away; initializeEmptyPattern() sets compiled back to true
this.initializeEmptyPattern();
}
// TODO: cache pattern here??
}
private void initializeForZeroGroups() {
// Resets the group bookkeeping to the state of a pattern with no capturing groups:
// only group 0 is mapped. Called from compile() for LITERAL patterns and from initializeEmptyPattern().
/*
* Expected results (with 0 groups):
*
* groupMapping : {[0][1]=0}
* -size: 1
*/
this.groupMapping = new HashMap<>(1);
// Map [i][1] -> i (e.g. [0][1]=0)
this.groupMapping.put(getMappingName(0, 1), 0);
/*
* Expected results (with 0 groups):
*
* groupCounts: {=0, [0]=1}
* -size: 2
*/
this.groupCounts = new HashMap<>(2);
// Map empty string group name to group count
this.groupCounts.put("", 0);
// Map [i] -> 1 (e.g. [0]=1)
this.groupCounts.put(wrapIndex(0), 1);
/* Initialize pattern values */
this.capturingGroupCount = 0;
this.addedGroups = false;
}
private void initializeEmptyPattern() {
// Set up the zero-group bookkeeping, compile the empty regex as the internal pattern,
// and mark this pattern as compiled so that no further compilation is attempted.
this.initializeForZeroGroups();
this.setInternalPattern("");
this.compiled = true;
}
/**
* This private constructor is used to create all Patterns (other than those upcasted from Java patterns).
* The pattern string and match flags are all that is needed to completely describe a
* Pattern.
*
* @param regex
* the expression to be compiled
* @param flags
* match flags bit mask
* @param lazyCompiling
* whether to lazily compile pattern
*/
private Pattern(final String regex, final int flags, final boolean lazyCompiling) {
    // TODO: derive a cheap check that detects regexes which need no refactoring
    // (many regexes don't), so that the refactoring step can be skipped for them
    this.pattern = regex;
    this.flags = flags;

    if (regex.isEmpty()) {
        // The empty pattern is always initialized immediately
        this.initializeEmptyPattern();
    } else if (!lazyCompiling) {
        // Eager compilation was requested for a non-empty pattern
        this.compile();
    }
    // Otherwise: non-empty and lazy, so compilation is left for later
}
/**
* Constructor to create a Pattern object from a Java Pattern.
*
* Note: no compiling or refactoring of the pattern is performed,
* since the Pattern is already compiled and valid for use by Java's Regular
* Expression engine.
*
* @param pattern
* the pattern
* @since 0.2
*/
Pattern(final java.util.regex.Pattern pattern) {
    this.pattern = pattern.pattern();
    this.flags = pattern.flags();

    // Ask a throwaway matcher for the number of capturing groups
    final int groupCount = pattern.matcher("").groupCount();
    this.capturingGroupCount = groupCount;

    // Initialize groupMapping and groupCounts
    this.groupMapping = new HashMap<>(groupCount + 1);
    this.groupCounts = new HashMap<>(groupCount + 2);

    /*
     * Expected results (with three groups):
     *
     * groupMapping : {[0][1]=0, [1][1]=1, [2][1]=2, [3][1]=3}
     * -size: groupCount + 1
     *
     * groupCounts: {=3, [0]=1, [1]=1, [2]=1, [3]=1}
     * -size: groupCount + 2
     */

    // Map empty string group name to group count
    this.groupCounts.put("", groupCount);

    for (int i = 0; i <= groupCount; i++) {
        final String groupName = wrapIndex(i);

        // Map [i][1] -> i (e.g. [0][1]=0)
        this.groupMapping.put(getMappingName(groupName, 1), i);

        // Map [i] -> 1 (e.g. [0]=1)
        this.groupCounts.put(groupName, 1);
    }

    // For Java 7, adds named groups (if any)
    try {
        // Get map of group names -> group index from the Java pattern's private field
        final Field field = pattern.getClass().getDeclaredField("namedGroups");

        // https://stackoverflow.com/a/43013209
        // (fix for SpotBugs warning)
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            field.setAccessible(true);
            return null;
        });

        // The field is read reflectively, so the element types cannot be checked at compile time
        @SuppressWarnings("unchecked")
        final Map<String, Integer> javaGroupMapping = (Map<String, Integer>) field.get(pattern);

        // Null if doesn't have any named groups
        if (javaGroupMapping != null) {
            // Add entries for each named group
            for (final Entry<String, Integer> entry : javaGroupMapping.entrySet()) {
                final String groupName = entry.getKey();
                final Integer groupNumber = entry.getValue();

                // Map groupName[1] -> groupNumber
                this.groupMapping.put(getMappingName(groupName, 1), groupNumber);

                // Map groupName -> 1
                // (Always valid, since Java doesn't allow duplicate groups)
                this.groupCounts.put(groupName, 1);
            }
        }
    } catch (RuntimeException | NoSuchFieldException | IllegalAccessException e) {
        // Deliberately ignored (best effort): if the private field is missing or cannot be
        // made accessible, the pattern is still usable through its numbered groups; only the
        // named-group entries are not registered.
    }

    this.internalPattern = pattern;
    this.compiled = true;
}
/* Methods called ONLY when initializing a Pattern */
void setCapturingGroupCount(final int capturingGroupCount) {
// Plain setter; no validation is performed on the given count.
this.capturingGroupCount = capturingGroupCount;
}
void setAddedGroups(final boolean addedGroups) {
// Plain setter for the addedGroups field.
this.addedGroups = addedGroups;
}
int getGroupCount(final String groupName) {
    final Integer count = this.groupCounts.get(groupName);

    // A name without an entry (or mapped to null) counts as zero groups
    if (count == null) {
        return 0;
    }

    return count;
}
/**
* Gets the mapping from group names to group counts
*
* For named groups, the key is the group name
* For unnamed groups, the key is the group number surrounded by '[' and ']'. For example, group 1 would be the
* key "[1]" in the returned map
*
* @return an unmodifiable map from group names to group counts
*/
public Map<String, Integer> getGroupCounts() {
    // Read-only view of the live map: callers cannot modify it through this reference
    return Collections.unmodifiableMap(this.groupCounts);
}
void setGroupCounts(final Map<String, Integer> groupCounts) {
    // The given map is stored as-is (no defensive copy)
    this.groupCounts = groupCounts;
}
/**
* Refactors the regular expression to an equivalent form usable by Java's {@link java.util.regex.Pattern} class,
* and compiles the internal
* Pattern
using the refactored regular expression.
*/
private void compile() {
// System.out.println("Compiling (" + flags + "): " + pattern);
String refactoredPattern;
Refactor refactor;
if (this.has(LITERAL)) {
// LITERAL patterns are not refactored: the pattern text is compiled as-is with zero-group bookkeeping
this.initializeForZeroGroups();
refactor = null;
refactoredPattern = this.pattern;
} else {
this.groupMapping = new HashMap<>(2);
// refactor to be used as a RegEx pattern
// Refactor refactor = new Refactor(pattern);
refactor = new Refactor(this);
refactoredPattern = refactor.toString();
}
try {
// System.out.println(refactoredPattern);
// setInternalPattern(refactor.toString());
this.setInternalPattern(refactoredPattern);
} catch (java.util.regex.PatternSyntaxException e) {
// // on error, show error using the original pattern
// // (not refactored form)
//
// System.out.println(e.getMessage());
//
// Rethrow with the original (non-refactored) pattern text and, where it can be determined,
// the error index translated back to a position in the original pattern
String desc = e.getDescription();
int index;
try {
// TODO: can sometimes be incorrect with subpatterns
// e.g. "(?+1)(?*)"
// No refactoring took place for LITERAL patterns (refactor == null), so the index is reported as -1
index = refactor == null ? -1 : refactor.changes.getOriginalIndex(e.getIndex());
} catch (IllegalArgumentException e1) {
// Unknown original index
// Can occur, for example, in the regex "(?*\\01)"
index = -1;
}
// System.out.println(refactor.result);
throw new PatternSyntaxException(desc, this.pattern, index);
//
// PatternErrorMessage errorMessage = PatternErrorMessage
// .getValue(desc);
//
// if (errorMessage != null)
// throw new PatternSyntaxException(errorMessage, pattern, index);
// else
// throw new PatternSyntaxException(desc, pattern, index);
}
// catch (info.codesaway.util.regex.PatternSyntaxException e)
// {
// int index = refactor.changes.getOriginalIndex(e.getIndex());
// PatternErrorMessage errorMessage = e.getErrorMessage();
//
// if (errorMessage != null)
// throw new PatternSyntaxException(errorMessage, pattern, index);
// else
// {
// int errorCode = e.getErrorCode();
// String desc = e.getDescription();
//
// throw new PatternSyntaxException(errorCode, desc, pattern, index);
// }
// }
// Only reached when the internal pattern compiled without error
this.compiled = true;
}
/**
* Sets the internal Pattern
to the Pattern
* returned when calling
*
* {@link java.util.regex.Pattern}. {@link java.util.regex.Pattern#compile(String, int)
* compile}(regex , flags)
*
*
* @param regex
* the expression to be compiled
* @see java.util.regex.Pattern#compile(String, int)
*/
private void setInternalPattern(final String regex) {
    // Flags introduced in this class; they are masked out before the flags are handed to Java
    final int regExPlusOnlyFlags = DUPLICATE_NAMES | VERIFY_GROUPS | DOTNET_NUMBERING | EXPLICIT_CAPTURE
            | PERL_OCTAL;

    // keep all flags except those introduced in this class
    final int javaFlags = this.flags & ~regExPlusOnlyFlags;

    this.internalPattern = java.util.regex.Pattern.compile(regex, javaFlags);
}
/**
* Indicates whether a particular flag is set or not.
*
* @param f
* the flag to test
* @return true
if the particular flag is set
*/
boolean has(final int f) {
// True when at least one bit of f is set in this pattern's flags
return (this.flags & f) != 0;
}
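/**
* Indicates whether the given flag is set for this pattern.
*
* @param flag
* the flag to test
* @return true if the given flag is set
* @since 0.2
*/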
public boolean has(final PatternFlag flag) {
// Convert the flag to its int value and test it with has(int)
return this.has(flag.intValue());
}
/**
* Compares two character sequences lexicographically,
* except that embedded numbers are treated numerically.
*
* For example, when using this method, "1.2.9.1" is less than
* "1.2.10.5", whereas a lexicographical comparison would yield the
* opposite.
*
* When comparing, leading zeros are ignored, unless the inputted
* sequences are otherwise
* equivalent. If the two inputs are identical, then 0 is
* returned. Otherwise, the left-most number
* where the number of leading zeros differs is used to determine the
* ordering. In this case, the one with more leading zeros is first.
*
* For example, the below list is sorted in increasing order:
*
*
* - 2009-1-2
* - 2009-01-05
* - 2009-01-5
* - 2009-1-05
* - 2009-1-5
*
*
* This function can be used to compare versions, dates, and other
* numeric based data. Since the comparison is done from left to right, the
* format must have the most significant part first. For example, in a date
* format, that would be year, month, and then day to sort in chronological
* order.
*
* Note that for correct sorting of numeric based data, the formats must
* be identical - otherwise, where the formats differ, the sorting is based
* on the ASCII value of the change in the format.
* For example, the date "2009-1-5" is less
* than "2009.1.2", but not chronologically before. This result is due to
* the ASCII value for '-' (U+002D) being less than the ASCII value for
* '.' (U+002E).
*
* This method can be called in the compare function of a {@link Comparator} object
* to provide sorting.
*
*
* Comparator<String> comparator = new Comparator<String>() {
*
* public int compare(String o1, String o2) {
* return naturalCompareTo(o1, o2);
* }
* };
*
* @param value1
* the first character sequence
* @param value2
* the second character sequence
* @return 0 if the two values are equal, -1 if the first value is
* "less than" the second, and 1 if the first value is
* "greater than" the second.
*
* @see String#compareTo(String)
*/
public static int naturalCompareTo(final CharSequence value1, final CharSequence value2) {
    // XXX: don't use regex - performance hit
    // Both inputs are tokenized in lock-step with the shared naturalSort pattern
    final java.util.regex.Matcher matcher1 = naturalSort.matcher(value1);
    final java.util.regex.Matcher matcher2 = naturalSort.matcher(value2);

    // left-most number with different number of leading zeros
    // (used to break ties if strings are otherwise equivalent)
    int leftMostDifference = 0;

    while (matcher1.find() && matcher2.find()) {
        final String match1 = matcher1.group();
        final String match2 = matcher2.group();

        if (match1.isEmpty()) {
            // Both empty: equivalent, so the leading-zero tie-break decides;
            // otherwise value2 is "longer"
            return match2.isEmpty() ? leftMostDifference : -1;
        }

        if (match2.isEmpty()) {
            // value1 is "longer"
            return 1;
        }

        // A token is handled as a number when group 2 of naturalSort took part in the match
        final boolean isNumber1 = matcher1.start(2) != -1;
        final boolean isNumber2 = matcher2.start(2) != -1;

        if (isNumber1 && isNumber2) {
            // both numbers - compare numerically
            // NOTE(review): group 2 is presumably the digits without leading zeros - confirm
            // against the definition of naturalSort
            final String number1 = matcher1.group(2);
            final String number2 = matcher2.group(2);

            // More digits means a larger number
            if (number1.length() != number2.length()) {
                return number1.length() < number2.length() ? -1 : 1;
            }

            // Same digit count: string order equals numeric order.
            // signum keeps the result within the documented -1 / 0 / 1
            final int numberOrder = number1.compareTo(number2);
            if (numberOrder != 0) {
                return Integer.signum(numberOrder);
            }

            // only do once - for left-most difference
            if (leftMostDifference == 0 && match1.length() != match2.length()) {
                // different number of leading zeros
                // e.g. "01" and "1"
                // more leading zeros first "01" < "1"
                leftMostDifference = match1.length() > match2.length() ? -1 : 1;
            }
        } else {
            // At least one token is not a number: plain string comparison,
            // normalized to the documented -1 / 0 / 1
            final int textOrder = match1.compareTo(match2);
            if (textOrder != 0) {
                return Integer.signum(textOrder);
            }
        }
    }

    return 0;
}
/**
* Normalizes the group name.
*
* @param groupName
* the group name
* @return the normalized group name
* @throws IllegalArgumentException
* If groupName
is a relative unnamed group (e.g.
* "[-4]"), which doesn't exist, or if groupName
is
* an unnamed group whose index is not a parsable integer (e.g.
* "[a]")
*/
String normalizeGroupName(final String groupName) {
    final boolean bracketed = groupName.startsWith("[") && groupName.endsWith("]");

    if (!bracketed) {
        // Not of the form "[...]": returned unchanged
        return groupName;
    }

    try {
        // Parse the text between the brackets and resolve it to an absolute
        // group index, using this pattern's group count
        final int parsedIndex = parseInt(groupName.substring(1, groupName.length() - 1));
        final int absoluteIndex = getAbsoluteGroupIndex(parsedIndex, this.groupCount());

        return wrapIndex(absoluteIndex);
    } catch (IndexOutOfBoundsException | IllegalArgumentException e) {
        // Both failures are reported the same way: no such group
        throw noNamedGroup(groupName);
    }
}
/* Groovy methods - makes RegExPlus groovier */
/**
* 'Case' implementation for this class, which allows
* testing a String against a number of regular expressions (in Groovy only).
* For example:
* switch( str ) {
* case +/one/ :
* // the regex 'one' matches the value of str
* }
*
*
* @param switchValue
* the switch value
* @return true
if the switchValue
is deemed to match this Pattern
* @since 0.2
*/
public boolean isCase(final Object switchValue) {
    // This pattern is never null, so a null switch value can never match
    if (switchValue == null) {
        return false;
    }

    final Matcher candidate = this.matcher(switchValue.toString());
    final boolean matched = candidate.matches();

    if (matched) {
        // Record the matcher only when the whole value matched
        RegExPlusSupport.setLastMatcher(candidate);
    }

    return matched;
}
/**
* Alias for {@link #getInternalPattern()}.
*
* @return the regular expression pattern
* @since 0.2
*/
public java.util.regex.Pattern bitwiseNegate() {
// Returns the Java pattern obtained from getInternalPattern()
return this.getInternalPattern();
}
/**
* Returns this Pattern
.
*
* Added for consistency for use in Groovy, since both +charSequence and +javaPattern are also supported.
* This method ensures that the 'positive' operator will return a RegExPlus Pattern, for all three cases:
*
*
* - Compiling a CharSequence regex:
+charSequence
* - Promoting a Java Pattern:
+javaPattern
* - When used on an existing RegExPlus Pattern:
+regexPlusPattern
*
*
* @return this Pattern
.
*/
public Pattern positive() {
// Identity operation: the same instance is returned
return this;
}
/**
* @param regex
* @return
* @since 0.2
*/
public Pattern or(final CharSequence regex) {
// Normalize this pattern, then combine it with the given regex text as an alternative.
// The regex is passed with flags 0, so the private or(...) helper throws
// IllegalArgumentException unless the normalized pattern's flags are also 0.
Pattern pattern1 = this.normalize();
return or(pattern1.pattern(), pattern1.flags(), regex, 0);
}
/**
*
* @param pattern
* @return
* @since 0.2
*/
public Pattern or(final Pattern pattern) {
// Normalize both patterns, then combine their pattern text as alternatives.
// The private or(...) helper throws IllegalArgumentException if the normalized flags differ.
Pattern pattern1 = this.normalize();
Pattern pattern2 = pattern.normalize();
return or(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags());
}
/**
*
* @param pattern
* @return
* @since 0.2
*/
public Pattern or(final java.util.regex.Pattern pattern) {
// Same as or(Pattern), except that the Java pattern is normalized via the static normalize(...)
Pattern pattern1 = this.normalize();
Pattern pattern2 = normalize(pattern);
return or(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags());
}
/**
 * Builds the pattern that matches either of two normalized regular expressions.
 *
 * @param regex1
 *        the first (normalized) regular expression
 * @param flags1
 *        the flags of the first normalized pattern
 * @param regex2
 *        the second (normalized) regular expression
 * @param flags2
 *        the flags of the second normalized pattern
 * @return a lazily compiled pattern of the form (?|(?:regex1)|(?:regex2)), compiled with the
 *         shared flags
 * @throws IllegalArgumentException
 *         If the two flag values differ
 */
private static Pattern or(final String regex1, final int flags1, final CharSequence regex2, final int flags2) {
    if (flags1 != flags2) {
        throw new IllegalArgumentException(
                "Flags in normalized patterns must be identical to 'or' them.\n"
                        + "Normalized Pattern 1:\n" + "Flags: " + new PatternFlags(flags1) + "\n" + "Pattern: "
                        + regex1 + "\n\n"
                        + "Normalized Pattern 2:\n" + "Flags: " + new PatternFlags(flags2) + "\n" + "Pattern: "
                        + regex2);
    }

    // Both alternatives share the same flags at this point; pass them on so that they are not
    // silently dropped from the combined pattern (the private plus(...) helper does the same)
    return lazyCompile("(?|(?:" + regex1 + ")|(?:" + regex2 + "))", flags1);
}
public Pattern or(final PatternFlag flag) {
// Adds the given flag to this pattern's flags by delegating to or(int) with the flag's int value
return this.or(flag.intValue());
}
public Pattern or(final Set flags) {
// Converts the set to an int with PatternFlags.intValue(...) and delegates to or(int)
return this.or(PatternFlags.intValue(flags));
}
public Pattern or(final int flags) {
// Same pattern text, with the given flags bitwise OR-ed into this pattern's flags;
// the result comes from lazyCompile, and this instance is not modified here
return lazyCompile(this.pattern(), this.flags() | flags);
}
/*
* TODO: implement 'and' using positive look-aheads.
* Works, but what would it match - the entire string?? nothing?
* Not sure how to implement to be most effective.
*/
/*
* TODO: implement a negate operation ('negative' operator) - returns a Pattern which matches everything
* that this Pattern does not.
*/
/**
*
* @param regex
* @return
* @since 0.2
*/
public Pattern plus(final CharSequence regex) {
// Appends the regex text to this pattern's text and keeps this pattern's flags.
// Unlike the Pattern overloads, this pattern is not normalized first.
return lazyCompile(this.pattern() + regex, this.flags());
}
/**
* @param pattern
* @return
* @since 0.2
*/
public Pattern plus(final Pattern pattern) {
// Normalize both patterns, then concatenate their pattern text.
// The private plus(...) helper throws IllegalArgumentException unless the second pattern's
// normalized flags are 0 or equal to the first pattern's.
Pattern pattern1 = this.normalize();
Pattern pattern2 = pattern.normalize();
return plus(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags());
}
/**
* @param pattern
* @return
* @since 0.2
*/
public Pattern plus(final java.util.regex.Pattern pattern) {
// Same as plus(Pattern), except that the Java pattern is normalized via the static normalize(...)
Pattern pattern1 = this.normalize();
Pattern pattern2 = normalize(pattern);
return plus(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags());
}
/**
 * Concatenates two regular expressions into one lazily compiled pattern that uses the first
 * pattern's flags.
 *
 * @param regex1
 *        the first regular expression
 * @param flags1
 *        the flags of the first pattern; also used for the result
 * @param regex2
 *        the regular expression to append
 * @param flags2
 *        the flags of the second pattern; must be 0 or equal to flags1
 * @return the pattern for regex1 followed by regex2
 * @throws IllegalArgumentException
 *         If flags2 is neither 0 nor equal to flags1
 */
private static Pattern plus(final String regex1, final int flags1, final CharSequence regex2, final int flags2) {
    final boolean compatibleFlags = flags2 == 0 || flags2 == flags1;

    if (compatibleFlags) {
        return lazyCompile(regex1 + regex2, flags1);
    }

    final String message = "Flags in normalized patterns must be 0 or the same as the first pattern to 'add' to existing pattern.\n"
            + "Pattern 1:\n" + "Flags: " + new PatternFlags(flags1) + "\n" + "Pattern: " + regex1
            + "\n\n"
            + "Normalized Pattern 2:\n" + "Flags: " + new PatternFlags(flags2) + "\n" + "Pattern: "
            + regex2;

    throw new IllegalArgumentException(message);
}
/**
* Returns the actual group number (in the internal pattern) for the given
* mapping name.
*
* @param groupName
* the group name
* @param occurrence
* the occurrence
* @return the mapped index
*/
Integer getMappedIndex(final String groupName, final int occurrence) {
// Build the mapping key (groupName + "[" + occurrence + "]") and look it up
String mappingName = getMappingName(groupName, occurrence);
return this.getMappedIndex(mappingName);
}
/**
* Returns the actual group number (in the internal pattern) for the given
* mapping name.
*
* @param mappingName
* the mapping name
* @return the mapped index
*/
Integer getMappedIndex(final String mappingName) {
// Direct lookup in the map returned by getGroupMapping().
// NOTE(review): presumably null when the mapping name has no entry - confirm against the map type
return this.getGroupMapping().get(mappingName);
}
/**
* Returns the (internally) used string for mapping a group and occurrence
* (in the original pattern) to its group index (in the refactored pattern).
*
* @param groupName
* the group name
* @param occurrence
* the occurrence
* @return groupName + "[" + occurrence + "]"
*/
static String getMappingName(final String groupName, final int occurrence) {
    // Key format: the group name followed by the occurrence in square brackets
    final String occurrenceSuffix = "[" + occurrence + "]";

    return groupName + occurrenceSuffix;
}
/**
* Returns the (internally) used string for mapping a group and occurrence
* (in the original pattern) to its group index (in the refactored pattern).
*
* @param groupIndex
* the group index
* @param occurrence
* the occurrence
* @return "[" groupIndex + "][" + occurrence + "]"
*/
static String getMappingName(final int groupIndex, final int occurrence) {
// Wrap the index as "[groupIndex]" and use that as the group name, e.g. (0, 1) -> "[0][1]"
return getMappingName(wrapIndex(groupIndex), occurrence);
}
/**
* Returns the group name for the given group index in a "branch reset"
* subpattern
*
* @param groupIndex
* the group number
* @return the group name for the given group index in a "branch reset"
* subpattern
*/
static String wrapIndex(final int groupIndex) {
    // Surround the decimal index with square brackets, e.g. 3 -> "[3]"
    return new StringBuilder().append('[').append(groupIndex).append(']').toString();
}
/**
* Returns the given group name, adjusting the case based on
* the {@link #CASE_INSENSITIVE_NAMES} flag.
*
* @param groupName
* the group name
* @return the group name, adjusting the case based on the {@link #CASE_INSENSITIVE_NAMES} flag
*/
// String handleCase(String groupName)
// {
// return hasCaseInsensitiveGroupNames()
// ? groupName.toLowerCase(Locale.ENGLISH) : groupName;
// }
/**
* Returns a regular expression that matches the specified numeric range.
* The returned expression is wrapped in a non-capture group to allow
* easy integration.
*
* The mode parameter has the same form as the leading part of
* a numeric range. The return from
* range(start, end,
* mode) is equivalent to the internal representation
* of a numeric range.
*
* Format for mode parameter: Mode[Base[BaseMode]]
*
* Descriptions and valid values:
*
* - Mode: either "Z" (allows leading zeros) or "NZ" (no
* leading zeros)
*
* - Base: the numeric base for start and end (valid bases, 2 -
* 36)
*
* - BaseMode: whether to allow lower ("L"), upper ("U"), or both
* upper and lower-case digits (omit BaseMode). This mode applies only when
* matching
* numbers in bases above 10. Note that this only affects matching, and that
* both upper and lower-case digits can be specified as part of the range in
* the start and end parameters, regardless of this setting.
*
*
If the result doesn't include "letter digits" or if the base is
* ten or less,
* BaseMode has no effect, but can be specified (for
* consistency).
*
*
* @param start
* the start of the range
* @param end
* the end of the range
* @param mode
* a string in the format described above that specifies the mode
* for the numeric range
* @return a regular expression that matches the specified numeric range,
* wrapped in a non-capture group for easy integration
* @throws IllegalArgumentException
* If mode is not in the correct form, as described above
*/
public static String range(final int start, final int end, final String mode) {
    final RangeMode rangeMode = new RangeMode(mode);

    // Both bounds are rendered in the base specified by the mode
    final int radix = rangeMode.base();
    final String startDigits = Integer.toString(start, radix);
    final String endDigits = Integer.toString(end, radix);

    // Wrap in a non-capture group for easy integration
    return "(?:" + PatternRange.range(startDigits, endDigits, rangeMode) + ")";
}
/**
* Returns a regular expression that matches the specified numeric range.
* The returned expression is wrapped in a non-capture group to allow
* easy integration.
*
* The mode parameter has the same form as the leading part of
* a numeric range. The return from
* range(start, end,
* mode) is equivalent to the internal representation
* of a numeric range.
*
* Format for mode parameter: Mode[Base[BaseMode]]
*
* Descriptions and valid values:
*
* - Mode: either "Z" (allows leading zeros) or "NZ" (no
* leading zeros)
*
* - Base: the numeric base for start and end (valid bases, 2 -
* 36)
*
* - BaseMode: whether to allow lower ("L"), upper ("U"), or both
* upper and lower-case digits (omit BaseMode). This mode applies only when
* matching
* numbers in bases above 10. Note that this only affects matching, and that
* both upper and lower-case digits can be specified as part of the range in
* the start and end parameters, regardless of this setting.
*
*
If the result doesn't include "letter digits" or if the base is
* ten or less,
* BaseMode has no effect, but can be specified (for
* consistency).
*
*
* @param start
* the start of the range
* @param end
* the end of the range
* @param mode
* a string in the format described above that specifies the mode
* for the numeric range
* @return a regular expression that matches the specified numeric range,
* wrapped in a non-capture group for easy integration
* @throws NullPointerException
* If either start or end is null
* @throws IllegalArgumentException
* If either start or end
* is the empty string or contains invalid digits for the
* specified base; also thrown if
* mode is not in the correct form, as described above
*/
public static String range(final String start, final String end, final String mode) {
    // Validate start completely before looking at end
    if (start == null) {
        throw new NullPointerException("Start value cannot be null");
    } else if (start.isEmpty()) {
        throw new IllegalArgumentException("Start value cannot be the empty string");
    }

    if (end == null) {
        throw new NullPointerException("End value cannot be null");
    } else if (end.isEmpty()) {
        throw new IllegalArgumentException("End value cannot be the empty string");
    }

    // The mode is only parsed after both bounds passed the checks above
    final RangeMode rangeMode = new RangeMode(mode);

    // Wrap in a non-capture group for easy integration
    return "(?:" + PatternRange.range(start, end, rangeMode) + ")";
}
/**
* Get ThreadLocal for matcher
*
* This is to help handle the fact that the Matcher is not thread-safe
* @param regex The expression to be compiled
* @return a ThreadLocal matcher for the specified regex
* @since 1.0
*/
public static ThreadLocal<Matcher> getThreadLocalMatcher(final String regex) {
    // Compile once, up front; each thread's matcher is created from this one pattern
    final Pattern pattern = Pattern.compile(regex);

    // The initial value is created on a thread's first get()
    return ThreadLocal.withInitial(pattern::matcher);
}
/**
* Creates a predicate which can be used to match a string.
*
* @return The predicate which can be used for matching on a string
* @since 1.1
*/
// Added in Java 1.8 Pattern class
public Predicate<String> asPredicate() {
    // A new matcher is created for every tested string; find() succeeds when this pattern
    // matches anywhere in it
    return input -> this.matcher(input).find();
}
/**
* Creates a predicate which can be used to match a string.
*
* Implementation note: this method uses {@link #getThreadLocalMatcher(String)} to reuse the Matcher
* @param regex The regular expression
* @return The predicate which can be used for matching on a string
* @since 1.1
*/
public static Predicate<String> asPredicate(final String regex) {
    final ThreadLocal<Matcher> matcher = getThreadLocalMatcher(regex);

    // Reuse the calling thread's matcher: reset it to the new input, then search
    return input -> matcher.get().reset(input).find();
}
}