All Downloads are FREE. Search and download functionalities are using the official Maven repository.

info.codesaway.util.regex.Pattern Maven / Gradle / Ivy

Go to download

Extends Java's regular expression syntax by adding support for additional Perl and .NET syntax.

The newest version!
// TODO: verify that group names consisting solely of digits are handled correctly
/*
 * TODO: need to throw correct exception for cases:
 * 1) Integer values too high
 * 2) Illegal integer values
 */

package info.codesaway.util.regex;

import static info.codesaway.util.regex.Matcher.getAbsoluteGroupIndex;
import static info.codesaway.util.regex.Matcher.noNamedGroup;
import static info.codesaway.util.regex.RefactorUtility.fullGroupName;
import static info.codesaway.util.regex.RefactorUtility.parseInt;
import static info.codesaway.util.regex.RegExPlusSupport.setLastMatcher;

import java.io.Serializable;
import java.lang.reflect.Field;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Predicate;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;

// TODO: finish documenting
// mention that all compiled patterns are now cached

/**
 * A compiled representation of a regular expression.
 *
 * 

This class is an extension * of Java's {@link java.util.regex.Pattern} class. Javadocs were copied and * appended with the added functionality.

* *

A regular expression, specified as a string, must first be compiled into * an instance of this class. The resulting pattern can then be used to create * a {@link Matcher} object that can match arbitrary {@linkplain java.lang.CharSequence character sequences} against the * regular * expression. All of the state involved in performing a match resides in the * matcher, so many matchers can share the same pattern. * *

A typical invocation sequence is thus

* *
 * Pattern p = Pattern.{@link #compile compile}("a*b");
 * Matcher m = p.{@link #matcher matcher}("aaaaab");
 * boolean b = m.{@link Matcher#matches matches}();
* *

A {@link #matches matches} method is defined by this class as a * convenience for when a regular expression is used just once. This method * compiles an expression and matches an input sequence against it in a single * invocation. The statement

* *
 * boolean b = Pattern.matches("a*b", "aaaaab");
* * is equivalent to the three statements above, though for repeated matches it * is less efficient since it does not allow the compiled pattern to be reused. * *

Instances of this class are immutable and are safe for use by multiple * concurrent threads. Instances of the {@link Matcher} class are not safe for * such use.

* *

Summary of regular-expression constructs

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
ConstructMatches
 
Characters
xThe character x
\\The backslash character
\0nThe character with octal value 0n * (0 <= n <= 7)
\0nnThe character with octal value 0nn * (0 <= n <= 7)
\0mnnThe character with octal value 0mnn * (0 <= m <= 3, 0  * <= n <= 7)
\xhhThe character with * hexadecimal value 0xhh
\x{hhh..}The character with * hexadecimal value 0xhhh..
\uhhhhThe character with * hexadecimal value 0xhhhh
\tThe tab character ('\u0009')
\nThe newline (line feed) character * ('\u000A')
\rThe carriage-return character * ('\u000D')
\fThe form-feed character ('\u000C')
\aThe alert (bell) character ('\u0007')
\eThe escape character ('\u001B')
\cxThe control character corresponding to x
 
Character classes
[abc]a, b, or c (simple * class)
[^abc]Any character except a, b, or * c (negation)
[a-zA-Z]a through z or A through * Z, inclusive (range)
[a-d[m-p]]a through d, or m through * p: [a-dm-p] (union)
[a-z&&[def]]d, e, or f (intersection) *
[a-z&&[^bc]]a through z, except for b * and c: [ad-z] (subtraction)
[a-z&&[^m-p]]a through z, and not m * through p: [a-lq-z] (subtraction)
 
Predefined character classes
.Any character (may or may not match * line terminators)
\XSingle grapheme - equivalent to * (?>\P{M}\p{M}*)
\dA digit: [0-9]
\DA non-digit: [^0-9]
\sA whitespace character: [ \t\n\x0B\f\r]
\SA non-whitespace character: [^\s]
\wA word character: [a-zA-Z_0-9]
\WA non-word character: [^\w]
 
POSIX character classes (US-ASCII * only)
\p{Lower}A lower-case alphabetic character: [a-z]
\p{Upper}An upper-case alphabetic character: [A-Z]
\p{ASCII}All ASCII: [\x00-\x7F]
\p{Alpha}An alphabetic * character: [\p{Lower}\p{Upper}]
\p{Digit}A decimal digit: [0-9]
\p{Alnum}An alphanumeric character: * [\p{Alpha}\p{Digit}] *
\p{Punct}Punctuation: One of * !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
\p{Graph}A visible character: [\p{Alnum}\p{Punct}]
\p{Print}A printable character: [\p{Graph}\x20]
\p{Blank}A space or a tab: [ \t]
\p{Cntrl}A control character: [\x00-\x1F\x7F]
\p{XDigit}A hexadecimal digit: [0-9a-fA-F]
\p{Space}A whitespace character: [ \t\n\x0B\f\r]
 
POSIX character classes (US-ASCII * only)

(equivalent to the * above POSIX classes - only allowed in a character class)

[:lower:]A lower-case alphabetic character: [a-z]
[:upper:]An upper-case alphabetic character: [A-Z]
[:ascii:]All ASCII: [\x00-\x7F]
[:alpha:]An alphabetic * character: [[:lower:][:upper:]]
[:digit:]A decimal digit: [0-9]
[:alnum:]An alphanumeric character: * [[:alpha:][:digit:]] *
[:punct:]Punctuation: One of * !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
[:graph:]A visible character: [[:alnum:][:punct:]]
[:print:]A printable character: [[:graph:]\x20]
[:blank:]A space or a tab: [ \t]
[:cntrl:]A control character: [\x00-\x1F\x7F]
[:xdigit:]A hexadecimal digit: [0-9a-fA-F]
[:space:]A whitespace character: [ \t\n\x0B\f\r]
[:word:]A word character: [\w]
 
java.lang.Character classes (simple * java character type)
\p{javaLowerCase}Equivalent to java.lang.Character.isLowerCase()
\p{javaUpperCase}Equivalent to java.lang.Character.isUpperCase()
\p{javaWhitespace}Equivalent to java.lang.Character.isWhitespace()
\p{javaMirrored}Equivalent to java.lang.Character.isMirrored()
 
Classes for Unicode blocks and categories
\p{InGreek}A character in the Greek block (simple * block)
\p{Lu}An uppercase letter (simple * category)
\p{Sc}A currency symbol
\P{InGreek}Any character except one in the Greek block * (negation)
* [\p{L}&&[^\p{Lu}]] Any letter except an uppercase letter * (subtraction)
 
Boundary matchers
^The beginning of a line
$The end of a line
\bA word boundary
\BA non-word boundary
\AThe beginning of the input
\GThe end of the previous match
\ZThe end of the input but for the final * terminator, if any
\zThe end of the input
 
Greedy quantifiers
X?X, once or not at all
X*X, zero or more times
X+X, one or more times
* X{n}X, exactly n times
* X{n,}X, at least n times
* X{n,m}X, at least n but not more than m * times
 
Reluctant quantifiers
X??X, once or not at all
X*?X, zero or more times
X+?X, one or more times
* X{n}?X, exactly n times
* X{n,}?X, at least n times
* X{n,m}?X, at least n but not more than m * times
 
Possessive quantifiers
X?+X, once or not at all
X*+X, zero or more times
X++X, one or more times
* X{n}+X, exactly n times
* X{n,}+X, at least n times
* X{n,m}+X, at least n but not more than m * times
 
Logical operators
XYX followed by Y
X|YEither X or Y
 
Capturing
(X)X, as a capturing * group
* (?<name>X)X, as a named-capturing * group
* (?'name'X)X, as a named-capturing group
* (?P<name>X)X, as a named-capturing group
 
Back references
\nWhatever the nth * capturing group matched
\gnWhatever the nth * capturing group matched
* \g{n}Whatever the nth * capturing group matched
  
\g-nRelative back reference
* \g{-n}Relative back reference
  
* \k<name>Whatever the named-capturing group "name" matched
* \k'name'Whatever the named-capturing group * "name" matched
* \g{name}Whatever the named-capturing group * "name" matched
* \k{name}Whatever the named-capturing group * "name" matched
* (?P=name)Whatever the named-capturing group * "name" matched
 
Quotation
\Nothing, but quotes the following character
\QNothing, but quotes all characters until * \E
\ENothing, but ends quoting started by \Q
 
Special constructs (non-capturing)
(?:X)X, as a non-capturing group
* (?idmsuxJn-idmsuxJn) Nothing, but turns match flags {@link #CASE_INSENSITIVE i} {@link #UNIX_LINES d} * {@link #MULTILINE m} {@link #DOTALL s} {@link #UNICODE_CASE u} {@link #COMMENTS x} {@link #DUPLICATE_NAMES J} * {@link #EXPLICIT_CAPTURE n} on - off
(?idmsuxJn-idmsuxJn:X)  X, as a non-capturing group * with the given flags {@link #CASE_INSENSITIVE i} {@link #UNIX_LINES d} {@link #MULTILINE m} {@link #DOTALL s} * {@link #UNICODE_CASE u} {@link #COMMENTS x} {@link #DUPLICATE_NAMES J} {@link #EXPLICIT_CAPTURE n} on * - off
* (?>X)X, as an independent (atomic), non-capturing * group
* (?|X)X, as a "branch reset" * pattern
 
Assertions (non-capturing)
(?=X)X, via zero-width positive lookahead
(?!X)X, via zero-width negative lookahead
* (?<=X)X, via zero-width positive lookbehind
* (?<!X)X, via zero-width negative lookbehind
 
Comment (non-capturing)
* (?x:#comment\n)comment (cannot contain a * line terminator)
* (?xd:#comment\n)comment (cannot contain '\n')
* (?#comment)comment (cannot contain a close parenthesis)
 
Conditional patterns (non-capturing)
* (?(condition)yes-pattern)
* (?(condition)yes-pattern|no-pattern)
*  
* (?(n)...)absolute reference condition
* (?(-n)...)relative reference condition
  
* (?(<name>)...)named reference condition
* (?('name')...)named reference condition
* (?(name)...)named reference condition
 
* (?(assert)...)assert condition
 
Numeric ranges (non-capturing)
* (?Z[start..end])matches a numeric range * (allowing for leading zeros)
* (?Z16[start..end])matches a numeric range in * base 16 (allowing for leading zeros)
 
* (?NZ[start..end])matches a numeric range * (not allowing for leading zeros)
* (?NZ16[start..end])matches a numeric range in * base 16 (not allowing for leading zeros)
* *
* *

Backslashes, escapes, and quoting

* *

The backslash character ('\') serves to introduce escaped * constructs, as defined in the table above, as well as to quote characters * that otherwise would be interpreted as unescaped constructs. Thus the * expression \\ matches a single backslash and \{ matches a * left brace.

* *

It is an error to use a backslash prior to any alphabetic character that * does not denote an escaped construct; these are reserved for future * extensions to the regular-expression language. A backslash may be used * prior to a non-alphabetic character regardless of whether that character is * part of an unescaped construct.

* *

Backslashes within string literals in Java source code are interpreted * as required by the Java Language * Specification as either Unicode * escapes or other character * escapes. It is therefore necessary to double backslashes in string * literals that represent regular expressions to protect them from * interpretation by the Java bytecode compiler. The string literal * "\b", for example, matches a single backspace character when * interpreted as a regular expression, while "\\b" matches a * word boundary. The string literal "\(hello\)" is illegal * and leads to a compile-time error; in order to match the string * (hello) the string literal "\\(hello\\)" * must be used.

* *

Character Classes

* *

Character classes may appear within other character classes, and * may be composed by the union operator (implicit) and the intersection * operator (&&). * The union operator denotes a class that contains every character that is * in at least one of its operand classes. The intersection operator * denotes a class that contains every character that is in both of its * operand classes.

* *

The precedence of character-class operators is as follows, from * highest to lowest:

* *
* * * * * * * * * * * * * * * *
1    Literal escape    \x
2    Grouping[...]
3    Rangea-z
4    Union[a-e][i-u]
5    Intersection[a-z&&[aeiou]]
* *

Note that a different set of metacharacters is in effect inside * a character class than outside a character class. For instance, the * regular expression . loses its special meaning inside a * character class, while the expression - becomes a range * forming metacharacter.

* *

Line terminators

* *

A line terminator is a one- or two-character sequence that marks * the end of a line of the input character sequence. The following are * recognized as line terminators:

* *
    * *
  • A newline (line feed) character ('\n'),
  • * *
  • A carriage-return character followed immediately by a newline * character ("\r\n"),
  • * *
  • A standalone carriage-return character ('\r'),
  • * *
  • A next-line character ('\u0085'),
  • * *
  • A line-separator character ('\u2028'), or
  • * *
  • A paragraph-separator character ('\u2029').
  • * *
*

If {@link #UNIX_LINES} mode is activated, then the only line terminators * recognized are newline characters.

* *

The regular expression . matches any character except a line * terminator unless the {@link #DOTALL} flag is specified.

* *

By default, the regular expressions ^ and $ ignore * line terminators and only match at the beginning and the end, respectively, * of the entire input sequence. If {@link #MULTILINE} mode is activated then * ^ matches at the beginning of input and after any line terminator * except at the end of input. When in {@link #MULTILINE} mode $ * matches just before a line terminator or the end of the input sequence.

* *

Groups and capturing

* *

Group number

* *

Capturing groups are numbered by counting their opening parentheses from * left to right. In the expression ((A)(B(C))), for example, there * are four such groups:

* *
* * * * * * * * *
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
* *

Group zero always stands for the entire expression.

* *

Capturing groups are so named because, during a match, each subsequence * of the input sequence that matches such a group is saved. The captured * subsequence may be used later in the expression, via a back reference, and * may also be retrieved from the matcher once the match operation is * complete.

* *

Note: To use .NET's numbering for capture groups (instead of * Java's), specify the {@link #DOTNET_NUMBERING} flag when compiling a * pattern.

* *

Group name

*

A capturing group can also be assigned a "name", a named-capturing * group, * and then be back-referenced later by the "name". Group names are composed of * the following characters: * *

    *
  • The uppercase letters 'A' through 'Z' * ('\u0041' through '\u005a'),
  • *
  • The lowercase letters 'a' through 'z' * ('\u0061' through '\u007a'),
  • *
  • The digits '0' through '9' * ('\u0030' through '\u0039'),
  • *
  • The underscore character '_' * ('\u005f'),
  • *
* *

A named-capturing group is still numbered as described in * Group number.

* *

The captured input associated with a group is always the subsequence * that the group most recently matched. If a group is evaluated a second time * because of quantification then its previously-captured value, if any, will * be retained if the second evaluation fails. Matching the string * "aba" against the expression (a(b)?)+, for example, leaves * group two set to "b". All captured input is discarded at the * beginning of each match.

* *

Groups beginning with (? are either pure, non-capturing * groups * that do not capture text and do not count towards the group total, or * named-capturing groups.

* *

Note: by default, capture group names must be unique, and if * multiple groups * with the same name exist, * a {@link PatternSyntaxException} is thrown. By setting the {@link #DUPLICATE_NAMES} flag, multiple capture groups * with * the same name are allowed.

* *

Group

* *

A group is either the name of a named-capturing group or a string of * the form groupName[occurrence].

* *

Use a positive occurrence (starting with 1) to refer to a specific * occurrence of the group. A negative occurrence is a relative occurrence of * the group. If the occurrence is omitted, or zero, the reference is to * the * first matched group with the specified group name. For example, * groupName and groupName[0] both refer to the first * matched occurrence of "groupName".

* *

This syntax allows referring to any * capture group in the pattern - even in the case where multiple groups * have the same name (see {@link #DUPLICATE_NAMES}), or the same * number (see "branch reset" * pattern).

* *

Using this syntax, to refer to

* *
    *
  • any group

    *
    *
    groupName
    *
    the group index wrapped in square brackets - a negative number * is a relative reference
    *
    occurrence
    *
    In a "branch reset" pattern more than one * occurrence of the group may exist.
    *
    example
    *
    group "[1]" is equivalent to group 1; group "[1][2]" is the second * occurrence of group 1
    *
    *
  • * *
  • a named group

    *
    *
    groupName
    *
    name of a named-capturing group *
    occurrence
    *
    If the {@link Pattern#DUPLICATE_NAMES} flag is set, more than one * occurrence of the group may exist.
    *
    example
    *
    "myGroup" refers to the first matched occurrence of the named * group, "myGroup", and "myGroup[1]" refers to just the first occurrence.
    *
    *
  • *
* *

"Branch reset" pattern

* *

Quoted from the PCRE manual (the * DUPLICATE SUBPATTERN NUMBERS section)

* *
*

Perl 5.10 introduced a feature where each alternative in a subpattern * uses the same numbers for its capturing parentheses. Such a subpattern starts * with (?| and is itself a non-capturing subpattern. This construct is * useful when you want to capture part, but * not all, of one of a number of alternatives.

* *

Inside a branch reset pattern, capture groups are numbered as * usual, * but the number is reset at the start of each branch. The numbers of any * capturing buffers that follow the subpattern start after the highest number * used in any branch.

*
* *

The following example is taken from the Perl documentation. The numbers * underneath show in which buffer the captured content will be stored.

* *
# before  ---------------branch-reset----------- * after
* / ( a )  (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
* # 1            2 *         2  3  *       2     3 *     4

* *

As a note, nested branch reset patterns are fully supported:

* *
*

(?| ( 1a ) ( 2a ) | ( 1b ) * (?| ( 2b1 ) | ( 2b2 ) ) ) /x
* #     1      2 *        1 *          2 *         2

*
* *

Note: if the {@link #DOTNET_NUMBERING} flag is set, named capture * groups inside * of a branch reset pattern will be numbered as if they were unnamed * groups. The group remains a named group, and can still be referred by * name.

* *
(?|(?<One>1a)(2a)|(1b)(?<Two>2b))
* #  1         2 *    1   2
* *

Unicode support

* *

This class is in conformance with Level 1 of Unicode Technical * Standard #18: Unicode Regular Expression Guidelines, plus RL2.1 * Canonical Equivalents.

* *

Unicode escape sequences such as \u2014 in Java source code * are processed as described in \u00A73.3 * of the Java Language Specification. Such escape sequences are also * implemented directly by the regular-expression parser so that Unicode * escapes can be used in expressions that are read from files or from the * keyboard. Thus the strings "\u2014" and "\\u2014", * while not equal, compile into the same pattern, which matches the character * with hexadecimal value 0x2014.

* *

Unicode blocks and categories are written with the * \p and \P constructs as in * Perl. \p{prop} matches if the input has the * property prop, while \P{prop} does not match * if * the input has that property. Blocks are specified with the prefix * In, as in InMongolian. Categories may be specified with * the optional prefix Is: Both \p{L} and \p{IsL} * denote the category of Unicode letters. Blocks and categories can be used * both inside and outside of a character class.

* *

The supported categories are those of * * The Unicode Standard in the version specified by the {@link java.lang.Character Character} class. The * category names are those * defined in the Standard, both normative and informative. * The block names supported by Pattern are the valid block names * accepted and defined by {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName} .

* *

Categories that behave like the java.lang.Character * boolean ismethodname methods (except for the deprecated ones) are * available through the same \p{prop} syntax where * the specified property has the name javamethodname.

* *

Comparison to Perl 5

* *

The Pattern engine performs traditional NFA-based matching * with ordered alternation as occurs in Perl 5. * *

Perl constructs not supported by this class:

* *
    * *
  • The conditional constructs (?{X}),

  • * *
  • The embedded code constructs (?{code}) * and (??{code}), and

  • * *
  • The preprocessing operations \l \u, * \L, and \U.

  • * *
* *

Constructs supported by this class but not by Perl:

* *
    * *
  • Possessive quantifiers, which greedily match as much as they can * and do not back off, even when doing so would allow the overall match to * succeed.

  • * *
  • Character-class union and intersection as described * above.

  • * *
* *

Notable differences from Perl:

* *
    * *
  • In Perl, \1 through \9 are always interpreted * as back references; a backslash-escaped number greater than 9 is * treated as a back reference if at least that many subexpressions exist, * otherwise it is interpreted, if possible, as an octal escape. In this * class octal escapes must always begin with a zero. In this class, * \1 through \9 are always interpreted as back * references, and a larger number is accepted as a back reference if at * least that many subexpressions exist at that point in the regular * expression, otherwise the parser will drop digits until the number is * smaller than or equal to the existing number of groups or it is one * digit.

  • * *
  • Note: specify the {@link #PERL_OCTAL} flag when compiling a pattern * to use Perl's octal syntax (as described above), instead of Java's.

  • * *
  • Perl uses the g flag to request a match that resumes * where the last match left off. This functionality is provided implicitly * by the {@link Matcher} class: Repeated invocations of the {@link Matcher#find find} method will resume where the last * match left off, * unless the matcher is reset.

  • * *
  • In Perl, embedded flags at the top level of an expression affect * the whole expression. In this class, embedded flags always take effect * at the point at which they appear, whether they are at the top level or * within a group; in the latter case, flags are restored at the end of the * group just as in Perl.

  • * *
  • Perl is forgiving about malformed matching constructs, as in the * expression *a, as well as dangling brackets, as in the * expression abc], and treats them as literals. This * class also accepts dangling brackets but is strict about dangling * metacharacters like +, ? and *, and will throw a {@link PatternSyntaxException} if it encounters them.

  • * *
* * *

For a more precise description of the behavior of regular expression * constructs, please see * Mastering Regular Expressions, 3rd Edition, Jeffrey E. F. Friedl, * O'Reilly and Associates, 2006.

* *

Numeric range

* *

Regular expressions may have extensive functionality, but they are * designed to match text, so matching a numeric range requires some extra work. * Since matching numeric ranges is sometimes necessary, the * Pattern class has built-in support for handling them.

* *

To allow leading zeros in a match, use the syntax * (?Z[start..end]). In this case, the match's width (number of * digits matched), is between the number of digits in start and the * number of digits in end. For example, (?Z[071..9]) * matches a number between 9 and 71 with between 1 and 3 digits. As the * previous example shows, you can specify a range as [start..end] * or as [end..start]. As a note, a range can have a negative * number for either its start or end, and the syntax remains the * same.

* *

In the case that one bound is * negative, and the other bound is positive, the match's width is as follows. * For a negative number, the number of digits in a match must be between 1 and * the number of digits in the negative bound. For a positive number, the number * of digits is between 1 and the number of digits in the positive bound.

* *

To not allow leading zeros in a match, use the syntax * (?NZ[start..end]). In this case, the match will not contain * any leading zeros. For example, (?NZ[071..9]) will match the * "9", in "09", but it won't match the entire "09", since leading zeros are not * part of the match.

* *

For either format, by default, the numbers are decimal numbers (base 10). * If you want to match a range in a different base, specify the base number * after the "Z" or "NZ". For example, (?Z16[0..ff]) will match a * hex number between 0 and 0xFF - for example, "aa".

* *

When working with bases above 10, letters are used as digits, for example, * in base 16, 'A' through 'F' are used to represent digits 10 through 15. By * default, when matching a number, both upper-case and lower-case digits are * allowed. For example, (?Z16[0..ff]) will match both "AA" and * "aa". By specifying an 'L' or a 'U' after the base number, you can force only * lower or upper-case digits to match. The regex * (?Z16U[0..ff]), for example, will match "AA", but not "aa". Note * that regardless of this setting, in the pattern, either upper-case or * lower-case digits may be used. For bases 10 or less, this setting * has no effect, but, for consistency, can be specified - for example, the regex * (?Z8U[0..377]) is equivalent to (?Z8[0..377]).

* * @see Pattern#split(CharSequence, String, int) * @see Pattern#split(CharSequence, String) */ public final class Pattern implements Serializable { /** * Regular expression modifier values. Instead of being passed as * arguments, they can also be passed as inline modifiers. * For example, the following statements have the same effect. *
	 * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
	 * RegExp r2 = RegExp.compile("(?im)abc", 0);
	 * 
* * The flags are duplicated so that the familiar Perl match flag * names are available. */ /** * Enables Unix lines mode. * *

In this mode, only the '\n' line terminator is recognized in * the behavior of ., ^, and $.

* *

Unix lines mode can also be enabled via the embedded flag * expression (?d).

*/ public static final int UNIX_LINES = java.util.regex.Pattern.UNIX_LINES; /** * Enables case-insensitive matching. * *

By default, case-insensitive * matching assumes that only characters in the US-ASCII charset are being * matched. Unicode-aware case-insensitive matching can be enabled by * specifying the {@link #UNICODE_CASE} flag in conjunction with this * flag.

* *

Case-insensitive matching can also be enabled via the embedded flag * expression (?i).

* *

Specifying this flag may impose a slight performance penalty.

*/ public static final int CASE_INSENSITIVE = java.util.regex.Pattern.CASE_INSENSITIVE; /** * Permits whitespace and comments in pattern. * *

In this mode, whitespace is ignored, and embedded comments starting * with # are ignored until the end of a line.

* *

Comments mode can also be enabled via the embedded flag * expression  (?x).

*/ public static final int COMMENTS = java.util.regex.Pattern.COMMENTS; /** * Enables multiline mode. * *

In multiline mode the expressions ^ and $ match just * after or just before, respectively, a line terminator or the end of the * input sequence. By default these expressions only match at the * beginning * and the end of the entire input sequence.

* *

Multiline mode can also be enabled via the embedded flag * expression (?m).

*/ public static final int MULTILINE = java.util.regex.Pattern.MULTILINE; /** * Enables literal parsing of the pattern. * *

When this flag is specified then the input string that specifies * the * pattern is treated as a sequence of literal characters. Metacharacters or * escape sequences in the input sequence will be given no special * meaning.

* *

The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on * matching when used in conjunction with this flag. The other flags become * superfluous.

* *

There is no embedded flag character for enabling literal * parsing.

*/ public static final int LITERAL = java.util.regex.Pattern.LITERAL; /** * Enables dotall mode. * *

In dotall mode, the expression . matches any character, * including a line terminator. By default this expression does not match * line terminators.

* *

Dotall mode can also be enabled via the embedded flag expression  * (?s). (The s is a mnemonic for "single-line" mode, * which is what this is called in Perl.)

*/ public static final int DOTALL = java.util.regex.Pattern.DOTALL; /** * Enables Unicode-aware case folding. * *

When this flag is specified then case-insensitive matching, when * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner * consistent with the Unicode Standard. By default, case-insensitive * matching assumes that only characters in the US-ASCII charset are being * matched.

* *

Unicode-aware case folding can also be enabled via the embedded flag * expression (?u).

* *

Specifying this flag may impose a performance penalty.

*/ public static final int UNICODE_CASE = java.util.regex.Pattern.UNICODE_CASE; /** * Enables canonical equivalence. * *

When this flag is specified then two characters will be considered to * match if, and only if, their full canonical decompositions match. The * expression "a\u030A", for example, will match the string * "\u00E5" when this flag is specified. By default, matching * does not take canonical equivalence into account.

* *

There is no embedded flag character for enabling canonical * equivalence.

* *

Specifying this flag may impose a performance penalty.

*/ public static final int CANON_EQ = java.util.regex.Pattern.CANON_EQ; /** * Enables the Unicode version of Predefined character classes and * POSIX character classes. * *

When this flag is specified then the (US-ASCII only) * Predefined character classes and POSIX character classes * are in conformance with * Unicode Technical * Standard #18: Unicode Regular Expression * Annex C: Compatibility Properties. *

* The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded * flag expression (?U). *

* The flag implies UNICODE_CASE, that is, it enables Unicode-aware case * folding. *

* Specifying this flag may impose a performance penalty.

* @since 1.2 */ // Added as part of Java 1.7 public static final int UNICODE_CHARACTER_CLASS = java.util.regex.Pattern.UNICODE_CHARACTER_CLASS; /** * Allows duplicate capture group names in pattern. * *

If a pattern has this flag set, multiple capture groups with the same * name are allowed. By default, capture group names must be unique.

* *

Allowing duplicate names can also be enabled via the embedded flag * expression (?J).

*/ public static final int DUPLICATE_NAMES = 0x80000000; /** * When compiling a pattern, verifies that all referenced groups exist. * *

If this flag is set, a {@link PatternSyntaxException} will be thrown * if the pattern contains a reference to a non-existent group, whereas, by * default, no exception would be thrown.

* *

Verification of groups can also be enabled via the embedded flag * expression (?v).

*/ public static final int VERIFY_GROUPS = 0x40000000; /** * Use Perl's octal syntax (instead of Java's). * *

That is, \n is a back reference if at least that many * groups have * occurred at the current point in the pattern. Otherwise, up to the first * three (octal) digits are used to form an octal code, and any additional * trailing digits will be treated literally.

* *

Using Perl's octal syntax can also be enabled via the embedded flag * expression (?o).

*/ public static final int PERL_OCTAL = 0x20000000; /** * Use .NET numbering for capture groups (instead of Java's). * *

In .NET, named-capture groups are numbered like unnamed groups, * but numbering of named groups starts after all unnamed groups have been * counted.

* *

For example, the expression * ((?<One>A)B)?(?<Two>C)(D) * produces the following capturing groups by number and name.

* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Number        * Name        Pattern
0none((?<One>A)B)?(?<Two>C)(D)
1none((?<One>A)B)
2none(D)
3One(?<One>A)
4Two(?<Two>C)
*/ // TODO: check about having inline modifier (doubtful) public static final int DOTNET_NUMBERING = 0x10000000; /** * Enables explicit capture mode. * *

In this mode, unnamed capture groups don't capture - that is, they * are treated like non-capture groups. However, named capture groups can * still be used for capturing (and they acquire numbers in the usual * way).

* *

Explicit capture mode can also be enabled via the embedded flag * expression (?n).

* *

Note: this feature is taken from .NET.

*/ public static final int EXPLICIT_CAPTURE = 0x8000000; /* * Pattern has only two serialized components: The pattern string and the * flags, which are all that is needed to recompile the pattern when it is * deserialized. */ /** *

use serialVersionUID from Merlin b59 for interoperability

* *

Note: this is the same serialVersionUID used in * the Java Pattern class.

*/ private static final long serialVersionUID = 5073258162644648461L; /** the internal {@link java.util.regex.Pattern} object for this pattern. */ private transient java.util.regex.Pattern internalPattern; /** The pattern */ private final String pattern; /** The flags. */ private final int flags; /** * Boolean indicating this Pattern is compiled; this is necessary in order * to lazily compile deserialized Patterns. */ // TODO: learn about Serialization and test Class for correctness private transient volatile boolean compiled = false; /** *

The number of capturing groups in this Pattern.

* *

Note: this is the number of capture groups in the inputted * pattern, which is not necessarily the actual number of capturing groups * in the (refactored) internal pattern.

*/
	// transient: only the pattern string and the flags are serialized
	private transient int capturingGroupCount;

	/**
	 * Indicates whether groups were added when creating the internal pattern.
	 *
	 *

Note: Some refactorings add capture groups (e.g. a "branch reset" subpattern), which are invisible to
	 * outside users, except when using the internal pattern to create a matcher.
	 */
	private transient boolean addedGroups;

	/**
	 * A map with mappings from a "mapping name" to the actual group number in
	 * the internal pattern.
	 *
	 *

Note: both named and unnamed groups are included

*/ private transient Map groupMapping; /** * A map with mappings from the group name to the group count for * the group. */ private transient Map groupCounts; /** * @since 0.2 */ // TODO: update cache to detect common pattern which are equivalent ?? // e.g. "(?i)abc" = "abc" with Case-insensitive flag private static final Map patternCache = new Hashtable<>(); /** * A pattern with the RegEx being the empty string */ // public static final Pattern EMPTY_PATTERN = Pattern.compile(""); /** * A compiled Java pattern with the RegEx being the empty string. */ static final java.util.regex.Pattern JAVA_EMPTY_PATTERN = java.util.regex.Pattern.compile(""); /** * Whether to use lazy compiling or to compile on creation (default, how Java patterns are done) * *

Note: changing this setting will not affect Patterns that are already created. To * force a Pattern to compile, call the {@link #forceCompile()} method.

*/
	@SuppressFBWarnings("MS_SHOULD_BE_FINAL")
	public static boolean lazyCompiling = false;

	/**
	 * Pattern used with the {@link #naturalCompareTo(CharSequence, CharSequence)} function
	 * to provide a natural sort
	 */
	private static final java.util.regex.Pattern naturalSort = java.util.regex.Pattern
			.compile("\\G(?:(\\D++)|0*(\\d++)|$)");

	/** The natural comparator; a single shared instance, so it is declared final. */
	private static final Comparator<CharSequence> naturalComparator = Pattern::naturalCompareTo;

	/**
	 * Returns a comparator which sorts using {@link #naturalCompareTo(CharSequence, CharSequence)}, to treat embedded
	 * numbers as numbers, instead of comparing them lexicographically.
	 *
	 *

NOTE: This comparator is case-sensitive, mimicking String comparisons.

* * @return the natural comparator * @since 0.2 */ public static Comparator getNaturalComparator() { return naturalComparator; } private static class PatternCacheKey { private final String regex; private final int flags; public PatternCacheKey(final String regex, final int flags) { this.regex = regex; this.flags = flags; } /** * {@inheritDoc} */ @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + this.flags; result = prime * result + ((this.regex == null) ? 0 : this.regex.hashCode()); return result; } /** * {@inheritDoc} */ @Override public boolean equals(final Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (this.getClass() != obj.getClass()) { return false; } PatternCacheKey other = (PatternCacheKey) obj; if (this.flags != other.flags) { return false; } if (this.regex == null) { if (other.regex != null) { return false; } } else if (!this.regex.equals(other.regex)) { return false; } return true; } @Override public String toString() { return this.flags + ": " + this.regex; } } /** * Compiles the given regular expression into a pattern. * * @param regex * The expression to be compiled * @return The compiled Pattern * * @throws PatternSyntaxException * If the expression's patternSyntax is invalid */ public static Pattern compile(final String regex) { return compile(regex, 0); } /** * Compiles the given regular expression into a pattern with the given * flags. * * @param regex * The expression to be compiled * * @param flags * the flags * * @return The compiled Pattern * * @throws PatternSyntaxException * If the expression's patternSyntax is invalid */ public static Pattern compile(final String regex, final PatternOptions... flags) { return compile(regex, new PatternFlags(flags).intValue()); } public static Pattern compile(final String regex, final int... 
flags) { int flagsTotal = 0; for (int flag : flags) { flagsTotal |= flag; } return compile(regex, flagsTotal); } /** * Compiles the given regular expression into a pattern with the given * flags. * * @param regex * The expression to be compiled * * @param flags * Match flags, a bit mask that may include {@link #CASE_INSENSITIVE}, {@link #MULTILINE},  * {@link #DOTALL}, {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, {@link #LITERAL}, * {@link #COMMENTS}, * *

{@link #DUPLICATE_NAMES}, {@link #VERIFY_GROUPS}, {@link #PERL_OCTAL}, {@link #DOTNET_NUMBERING}, * and {@link #EXPLICIT_CAPTURE}

* * @return The compiled Pattern * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in flags * * @throws PatternSyntaxException * If the expression's patternSyntax is invalid */ public static Pattern compile(final String regex, final int flags) { return compile(regex, flags, lazyCompiling); } static Pattern lazyCompile(final String regex) { return lazyCompile(regex, 0); } private static Pattern lazyCompile(final String regex, final int flags) { return compile(regex, flags, true); } public static Pattern compile(final String regex, final int flags, final boolean lazyCompiling) { PatternCacheKey key = new PatternCacheKey(regex, flags); Pattern cachedPattern = patternCache.get(key); if (cachedPattern != null) { return cachedPattern; } synchronized (patternCache) { // System.out.println("Caching (" + flags + "): " + regex); cachedPattern = patternCache.get(key); if (cachedPattern != null) { return cachedPattern; } } Pattern newPattern = new Pattern(regex, flags, lazyCompiling); synchronized (patternCache) { patternCache.put(key, newPattern); } return newPattern; } /** * * @param pattern * @return * @since 0.2 */ public static Pattern valueOf(final java.util.regex.Pattern pattern) { PatternCacheKey key = new PatternCacheKey(pattern.pattern(), pattern.flags()); Pattern cachedPattern = patternCache.get(key); if (cachedPattern != null) { return cachedPattern; } synchronized (patternCache) { cachedPattern = patternCache.get(key); if (cachedPattern != null) { return cachedPattern; } Pattern newPattern = new Pattern(pattern); patternCache.put(key, newPattern); return newPattern; } } /** * Forces this Pattern to compile. * *

If the pattern has already been compiled, then this method returns immediately. Otherwise, the pattern is * compiled.

* *

Starting with version 0.2 of RegExPlus, patterns can be lazily compiled (compiled when needed) by * setting the {@link #lazyCompiling} static field to true. Calling this method forces these lazily * compiled patterns to compile. * * @since 0.2 */ public Pattern forceCompile() { if (!this.compiled) { synchronized (this) { if (!this.compiled) { this.compile(); } } } return this; } /** * Gets the internal pattern * * @return The internal {@link java.util.regex.Pattern} used by this * pattern. */ public java.util.regex.Pattern getInternalPattern() { this.forceCompile(); return this.internalPattern; } /** * Returns the regular expression from which the internal pattern was * compiled. * * @return The source of the internal pattern */ public String internalPattern() { return this.getInternalPattern().pattern(); } /** * Returns the regular expression from which this pattern was compiled. * * @return The source of this pattern */ public String pattern() { return this.pattern; } /** * Indicates whether additional capture groups were added to the internal pattern when refactoring the compiled * regular expression. * * @return * @since 0.2 */ public boolean addedGroups() { this.forceCompile(); return this.addedGroups; } public List getGroupNames() { List groupNames = new ArrayList<>(); for (Entry groupEntry : this.groupCounts.entrySet()) { String groupName = groupEntry.getKey(); int occurrences = groupEntry.getValue(); if (occurrences == 1) { groupNames.add(groupName); } else { for (int i = 1; i <= occurrences; i++) { groupNames.add(groupName + "[" + i + "]"); } } } return Collections.unmodifiableList(groupNames); } /** * Returns the number of capturing groups in this pattern. * *

Group zero denotes the entire pattern by convention. It is not * included in this count.

* *

Any non-negative integer smaller than or equal to the value returned * by this method is guaranteed to be a valid group index for this * matcher.

*
	 * @return The number of capturing groups in this matcher's pattern
	 */
	public int groupCount() {
		// forceCompile() returns this pattern once it is compiled
		return this.forceCompile().capturingGroupCount;
	}

	/**
	 * Returns the number of capturing groups (with the given group index) in
	 * this pattern.
	 *
	 *

Note: in most cases, this return will be 1 - the only exception * is in the case * of a "branch reset" pattern, where there may be multiple groups with the * same group index. * *

For example,

*
// Outputs 2, since there are two groups that have the group index of 1
	 * System.out.println(Pattern.compile("(?|(1a)|(1b))").groupCount(1));
*
* *

Group zero denotes the entire pattern by convention. It is not * included in this count.

* *

Any non-negative integer smaller than or equal to the value returned * by this method is guaranteed to be a valid occurrence (for a group, * groupName[occurrence]) for this * matcher.

* *

Note: unlike other methods, this * method doesn't throw an exception if the specified group doesn't exist. * Instead, zero is returned, since the number of groups with the * (non-existent) group name is zero. * * @param group * The group index for a capturing group in this matcher's * pattern * * @return The number of capturing groups (with the given group index) in * this matcher's pattern * @since 0.2 */ public int groupCount(final int group) { String groupName; try { int index = getAbsoluteGroupIndex(group, this.groupCount()); groupName = wrapIndex(index); } catch (IndexOutOfBoundsException e) { return 0; } catch (IllegalArgumentException e) { return 0; } Integer groupCount = this.groupCounts.get(groupName); return groupCount != null ? groupCount : 0; // return groupCount(wrapIndex(group)); } /** * Returns the number of capturing groups (with the given group name) in * this pattern. * *

Group zero denotes the entire pattern by convention. It is not * included in this count.

* *

Any non-negative integer smaller than or equal to the value returned * by this method is guaranteed to be a valid occurrence (for a group, * groupName[occurrence]) for this * matcher.

* *

If groupName is the empty string, this method's return is * equal to the return from {@link #groupCount()}.

* *

Note: unlike other methods, this * method doesn't throw an exception if the specified group doesn't exist. * Instead, zero is returned, since the number of groups with the * (non-existent) group name is zero. * * @param groupName * The group name for a capturing group in this matcher's pattern * * @return The number of capturing groups (with the given group name) in * this matcher's pattern */ public int groupCount(String groupName) { this.forceCompile(); try { groupName = this.normalizeGroupName(groupName); } catch (IllegalArgumentException e) { /* * groupName is a relative unnamed group (e.g. * "[-4]"), which doesn't exist, or an unnamed group whose index is * not a parsable integer (e.g. "[a]") */ // Illegal group means the group count is 0 // TODO: if not parsable int, throw exception return 0; } Integer groupCount = this.groupCounts.get(groupName); return groupCount != null ? groupCount : 0; } /** * Indicates whether this pattern has any capturing groups. * * @return true if this pattern has at least one capturing group; otherwise, false * * @since 0.2 */ public boolean hasGroup() { return this.groupCount() > 0; } /** * Indicates whether this pattern contains the specified group. * * @param group * The group index for a capturing group in this pattern * @return true if this pattern contains the specified group; otherwise, false. * * @since 0.2 */ public boolean hasGroup(final int group) { return this.groupCount(group) > 0; } /** * Indicates whether this pattern contains the specified group. * * @param group * A capturing group in this pattern * @return true if this pattern contains the specified group; otherwise, false. 
* * @since 0.2 */ public boolean hasGroup(final String group) { // TODO: parse out group name and occurrence and pass to hasGroup(groupName, occurrence) java.util.regex.Matcher matcher = fullGroupName.matcher(group); if (!matcher.matches()) { return false; } String groupName = matcher.group(1); String groupOccurrence = matcher.group(2); int occurrence = groupOccurrence != null ? parseInt(groupOccurrence) : 0; return this.hasGroup(groupName, occurrence); // return containsKey(group); } /** * Indicates whether this matcher contains the specified group. * * @param groupName * The group name for a capturing group in this matcher's pattern * * @param occurrence * The occurrence of the specified group name * @return true if this matcher contains the specified group; otherwise, false. * * @since 0.2 */ @SuppressFBWarnings("RV_RETURN_VALUE_IGNORED_NO_SIDE_EFFECT") public boolean hasGroup(final String groupName, final int occurrence) { int groupCount = this.groupCount(groupName); if (groupCount == 0) { return false; } try { getAbsoluteGroupIndex(occurrence, groupCount); return true; } catch (IndexOutOfBoundsException e) { return false; } } /** * Returns the group mapping. * * @return The group mapping */ Map getGroupMapping() { return this.groupMapping; } /** * Returns the string representation of this pattern. This is the regular * expression from which this pattern was compiled. * * @return The string representation of this pattern */ @Override public String toString() { return this.pattern; } /** * Creates a matcher that will match the empty string against this pattern. * *

This is commonly used to initialize an "empty" matcher with a later call to {@link Matcher#reset(CharSequence)}.

* *

This method can also be used in Java 8 as a Supplier via pattern::matcher

* @return a new matcher for this pattern over the empty string
	 * @since 1.0
	 */
	public Matcher matcher() {
		final CharSequence emptyInput = "";

		return this.matcher(emptyInput);
	}

	/**
	 * Creates a matcher that will match the given input against this
	 * pattern.
	 *
	 * @param input
	 *            The character sequence to be matched
	 *
	 * @return A new matcher for this pattern
	 */
	public Matcher matcher(final CharSequence input) {
		// getInternalPattern() compiles this pattern if that has not happened yet
		java.util.regex.Matcher internalMatcher = this.getInternalPattern().matcher(input);

		return new Matcher(internalMatcher, this, input);
	}

	/**
	 * Indicates whether the given input partially matches this
	 * Pattern.
	 *
	 *

For the given input to be a partial match, it must be the prefix * of * some valid match. Conversely, if this method returns false, * then appending characters to the given input will never * yield a * match.

* *

For example, given the following pattern to match a decimal number * *

	 * Pattern p = Pattern.compile("\\d+\\.\\d+");
* * The following calls return true * *
	 * p.isPartialMatch("");
	 * p.isPartialMatch("1");
	 * p.isPartialMatch("2");
	 * p.isPartialMatch("9");
	 * p.isPartialMatch("123");
	 * p.isPartialMatch("123.");
	 * p.isPartialMatch("123.456");
	 * // p.matcher("123.456").matches() would also return true (see note below)
* * Whereas these calls return false * *
	 * p.isPartialMatch("a");
	 * p.isPartialMatch(".");
	 * p.isPartialMatch(".4");
	 * p.isPartialMatch(".45");
	 * p.isPartialMatch(".456");
* *

Note: if the * given input would match the pattern, this method * returns true. That is, a match is also a partial match.

* * @param input * The character sequence to be matched * * @return true if, and only if, the given input partially * matches * this pattern */ /* * Original Source: * * http://forums.sun.com/thread.jspa?messageID=4425768#4425768 */ public boolean isPartialMatch(final CharSequence input) { Matcher m = setLastMatcher(this.matcher(input)); if (m.matches()) { return true; } return m.hitEnd(); } /** * Returns this pattern's match flags. * * @return The match flags specified when this pattern was compiled */ public int flags() { return this.flags; } /** * @return * @since 0.2 */ public PatternFlags getFlags() { return new PatternFlags(this.flags()); } /** * @param pattern * @return */ public static Pattern normalize(final java.util.regex.Pattern pattern) { int flags = pattern.flags(); if (flags == 0) { // return this; // Done this way to allow lazy compiling, since normalized forms aren't used for matchers (in my code) // Could instead use new Pattern(pattern), same end result // TODO: instead make valueOf method lazily compilable return lazyCompile(pattern.pattern()); } /* Inline modifiers */ String unixLines = (flags & UNIX_LINES) != 0 ? "d" : ""; String caseInsensitive = (flags & CASE_INSENSITIVE) != 0 ? "i" : ""; String comments = (flags & COMMENTS) != 0 ? "x" : ""; String multiline = (flags & MULTILINE) != 0 ? "m" : ""; String dotall = (flags & DOTALL) != 0 ? "s" : ""; String unicodeCase = (flags & UNICODE_CASE) != 0 ? "u" : ""; String duplicateNames = (flags & DUPLICATE_NAMES) != 0 ? "J" : ""; String explicitCapture = (flags & EXPLICIT_CAPTURE) != 0 ? "n" : ""; String perlOctal = (flags & PERL_OCTAL) != 0 ? "o" : ""; String verifyGroups = (flags & VERIFY_GROUPS) != 0 ? "v" : ""; String unicodeCharacterClass = (flags & UNICODE_CHARACTER_CLASS) != 0 ? "U" : ""; /* No inline modifier */ int canonEq = (flags & CANON_EQ) != 0 ? CANON_EQ : 0; // int verifyGroups = has(VERIFY_GROUPS) ? VERIFY_GROUPS : 0; // int perlOctal = has(PERL_OCTAL) ? 
PERL_OCTAL : 0; int dotnetNumbering = (flags & DOTNET_NUMBERING) != 0 ? DOTNET_NUMBERING : 0; if ((flags & LITERAL) != 0) { // Not sure if CANON_EQ is included for literal, docs says no, so following the docs String newFlags = caseInsensitive + unicodeCase; if (newFlags.length() != 0) { newFlags = "(?" + newFlags + ")"; } // TODO: see if cannot add constructor that removes need to recompile pattern // Note that java 5 has a bug, where RegExPlus refactors quote blocks with escaped metacharacters // Other java versions, a quoted section does not need refactoring return lazyCompile(newFlags + quote(pattern.pattern())); } else { String newFlags = unixLines + caseInsensitive + comments + multiline + dotall + unicodeCase + duplicateNames + explicitCapture + perlOctal + verifyGroups + unicodeCharacterClass; if (newFlags.length() == 0) { // No changes are necessary, since no flags can be inlined // return this; return lazyCompile(pattern.pattern()); } else { newFlags = "(?" + newFlags + ")"; // TODO: see if cannot add constructor that removes need to recompile pattern return lazyCompile(newFlags + pattern.pattern(), canonEq | dotnetNumbering); // return Pattern.compile(flags + pattern(), canonEq | verifyGroups | dotnetNumbering); // return Pattern.compile(flags + pattern(), canonEq | verifyGroups | perlOctal | dotnetNumbering); } } } /** * Normalizes the pattern by inlining all possible flags. * *

Note: the returned pattern matches the exact same inputs as this pattern.

* * @return the normalized pattern */ public Pattern normalize() { if (this.flags() == 0) { return this; } /* Inline modifiers */ String unixLines = this.has(UNIX_LINES) ? "d" : ""; String caseInsensitive = this.has(CASE_INSENSITIVE) ? "i" : ""; String comments = this.has(COMMENTS) ? "x" : ""; String multiline = this.has(MULTILINE) ? "m" : ""; String dotall = this.has(DOTALL) ? "s" : ""; String unicodeCase = this.has(UNICODE_CASE) ? "u" : ""; String duplicateNames = this.has(DUPLICATE_NAMES) ? "J" : ""; String explicitCapture = this.has(EXPLICIT_CAPTURE) ? "n" : ""; String perlOctal = this.has(PERL_OCTAL) ? "o" : ""; String verifyGroups = this.has(VERIFY_GROUPS) ? "v" : ""; String unicodeCharacterClass = this.has(UNICODE_CHARACTER_CLASS) ? "U" : ""; /* No inline modifier */ int canonEq = this.has(CANON_EQ) ? CANON_EQ : 0; // int verifyGroups = has(VERIFY_GROUPS) ? VERIFY_GROUPS : 0; // int perlOctal = has(PERL_OCTAL) ? PERL_OCTAL : 0; int dotnetNumbering = this.has(DOTNET_NUMBERING) ? DOTNET_NUMBERING : 0; if (this.has(LITERAL)) { // Not sure if CANON_EQ is included for literal, docs says no, so following the docs @SuppressWarnings("hiding") String flags = caseInsensitive + unicodeCase; if (flags.length() != 0) { flags = "(?" + flags + ")"; } // TODO: see if cannot add constructor that removes need to recompile pattern // Note that java 5 has a bug, where RegExPlus refactors quote blocks with escaped metacharacters // Other java versions, a quoted section does not need refactoring return lazyCompile(flags + quote(this.pattern())); } else { @SuppressWarnings("hiding") String flags = unixLines + caseInsensitive + comments + multiline + dotall + unicodeCase + duplicateNames + explicitCapture + perlOctal + verifyGroups + unicodeCharacterClass; if (flags.length() == 0) { // No changes are necessary, since no flags can be inlined return this; } else { flags = "(?" 
+ flags + ")"; // TODO: see if cannot add constructor that removes need to recompile pattern return lazyCompile(flags + this.pattern(), canonEq | dotnetNumbering); // return Pattern.compile(flags + pattern(), canonEq | verifyGroups | dotnetNumbering); // return Pattern.compile(flags + pattern(), canonEq | verifyGroups | perlOctal | dotnetNumbering); } } } /** * Compiles the given regular expression and attempts to match the given * input against it. * *

An invocation of this convenience method of the form

* *
	 * Pattern.matches(regex, input);
* * behaves in exactly the same way as the expression * *
	 * Pattern.compile(regex).matcher(input).matches()
* *

If a pattern is to be used multiple times, compiling it once and * reusing it will be more efficient than invoking this method each * time.

*
	 * @param regex
	 *            The expression to be compiled
	 *
	 * @param input
	 *            The character sequence to be matched
	 *
	 * @return <code>true</code> if, and only if, the entire region
	 *         sequence matches this matcher's pattern
	 *
	 * @throws PatternSyntaxException
	 *             If the expression's patternSyntax is invalid
	 */
	public static boolean matches(final String regex, final CharSequence input) {
		return Pattern.compile(regex).matcher(input).matches();
	}

	/**
	 * Replaces the first substring of the given input sequence that
	 * matches the
	 * given regular expression with the given replacement.
	 *
	 *

An invocation of this method of the form  * Pattern.replaceFirst(input, * regex, replacement) yields exactly the * same result as the expression * *

{@link Pattern}.{@link Pattern#compile * compile}(regex).{@link Pattern#matcher(java.lang.CharSequence) * matcher}(input).{@link Matcher#replaceFirst * replaceFirst}(replacement)
* *

Note that backslashes (\) and dollar signs ($) in * the replacement string may cause the results to be different than if it * were being treated as a literal replacement string; see {@link Matcher#replaceFirst}. Use * {@link Matcher#quoteReplacement} to * suppress the special meaning of these characters, if desired.

* *

Note: this function serves as a substitute for {@link String#replaceFirst(String, String)}.

* * @param input * The character sequence to be matched * @param regex * The regular expression to which the input sequence is to be * matched * @param replacement * The string to be substituted for the first match * * @return The resulting String * * @throws PatternSyntaxException * If the regular expression's patternSyntax is invalid */ public static String replaceFirst(final CharSequence input, final String regex, final String replacement) { return Pattern.compile(regex).matcher(input).replaceFirst(replacement); } /** * Replaces each substring of the given input sequence that matches * the * given regular expression with the given replacement. * *

An invocation of this method of the form  * Pattern.replaceAll(input, * regex, * replacement) yields exactly the same result as the * expression * *

{@link Pattern}.{@link Pattern#compile * compile}(regex).{@link Pattern#matcher(java.lang.CharSequence) * matcher}(input).{@link Matcher#replaceAll * replaceAll}(replacement)
* *

Note that backslashes (\) and dollar signs ($) in * the replacement string may cause the results to be different than if it * were being treated as a literal replacement string; see {@link Matcher#replaceAll}. Use * {@link Matcher#quoteReplacement} to * suppress the special meaning of these characters, if desired.

* *

Note: this function serves as a substitute for {@link String#replaceAll(String, String)}.

* * @param input * The character sequence to be matched * @param regex * The regular expression to which the input sequence is to be * matched * @param replacement * The string to be substituted for each match * * @return The resulting String * * @throws PatternSyntaxException * If the regular expression's patternSyntax is invalid */ public static String replaceAll(final CharSequence input, final String regex, final String replacement) { return Pattern.compile(regex).matcher(input).replaceAll(replacement); } /** * Splits the given input sequence around matches of the given regular * expression. * *

* The array returned by this method contains each substring of the * input * sequence that is terminated by another substring that matches the given * expression or is terminated by the end of the string. The substrings in * the array are in the order in which they occur in this string. If the * expression does not match any part of the input then the resulting * array * has just one element, namely the input sequence. * *

* The limit parameter controls the number of times the pattern is * applied and therefore affects the length of the resulting array. If the * limit n is greater than zero then the pattern will be applied at * most n - 1 times, the array's length will be no greater * than n, and the array's last entry will contain all input * beyond * the last matched delimiter. If n is non-positive then the pattern * will be applied as many times as possible and the array can have any * length. If n is zero then the pattern will be applied as many * times as possible, the array can have any length, and trailing empty * strings will be discarded. * *

* The string "boo:and:foo", for example, yields the following * results with these parameters: * *

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
RegexLimitResult
:2{ "boo", "and:foo" }
:5{ "boo", "and", "foo" }
:-2{ "boo", "and", "foo" }
o5{ "b", "", ":and:f", "", "" }
o-2{ "b", "", ":and:f", "", "" }
o0{ "b", "", ":and:f" }
*
* *

* An invocation of this method of the form  * Pattern.split(input, * regex, * n) yields the same result as the expression * *

{@link Pattern}.{@link Pattern#compile * compile}(regex).{@link Pattern#split(CharSequence, int) * split}(input,limit)
* *

Note: this function serves as a substitute for {@link String#split(String, int)}.

* * @param input * The character sequence to be split * @param regex * The delimiting regular expression * @param limit * The result threshold, as described above * * @return The array of strings computed by splitting the input * sequence around matches of the given regular expression * * @throws PatternSyntaxException * If the regular expression's patternSyntax is invalid */ public static String[] split(final CharSequence input, final String regex, final int limit) { return Pattern.compile(regex).split(input, limit); } /** * Splits this string around matches of the given regular expression. * *

* This method works as if by invoking the three-argument {@link #split(CharSequence, String, int) split} method * with the given * input sequence, expression and a limit argument of zero. Trailing * empty * strings are therefore not included in the resulting array. *

* *

* The string "boo:and:foo", for example, yields the following * results with these expressions: * *

* * * * * * * * * * * * * *
RegexResult
:{ "boo", "and", "foo" }
o{ "b", "", ":and:f" }
*
* *

Note: this function serves as a substitute for {@link String#split(String)}.

*
	 * @param input
	 *            The character sequence to be split
	 * @param regex
	 *            The delimiting regular expression
	 *
	 * @return The array of strings computed by splitting the input
	 *         sequence around matches of the given regular expression
	 *
	 * @throws PatternSyntaxException
	 *             If the regular expression's patternSyntax is invalid
	 */
	public static String[] split(final CharSequence input, final String regex) {
		Pattern delimiter = Pattern.compile(regex);

		// A limit of zero discards trailing empty strings
		return delimiter.split(input, 0);
	}

	/**
	 * Splits the given input sequence around matches of this pattern.
	 *
	 *

* The array returned by this method contains each substring of the * input * sequence that is terminated by another subsequence that matches this * pattern or is terminated by the end of the input sequence. The * substrings * in the array are in the order in which they occur in the input. If * this * pattern does not match any subsequence of the input then the * resulting * array has just one element, namely the input sequence in string * form. * *

* The limit parameter controls the number of times the pattern is * applied and therefore affects the length of the resulting array. If the * limit n is greater than zero then the pattern will be applied at * most n - 1 times, the array's length will be no greater * than n, and the array's last entry will contain all input * beyond * the last matched delimiter. If n is non-positive then the pattern * will be applied as many times as possible and the array can have any * length. If n is zero then the pattern will be applied as many * times as possible, the array can have any length, and trailing empty * strings will be discarded. * *

* The input "boo:and:foo", for example, yields the following * results with these parameters: *

* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*

* Regex    

*

* Limit    

*

* Result    

:2{ "boo", "and:foo" }
:5{ "boo", "and", "foo" }
:-2{ "boo", "and", "foo" }
o5{ "b", "", ":and:f", "", "" }
o-2{ "b", "", ":and:f", "", "" }
o0{ "b", "", ":and:f" }
*
*
	 *
	 * @param input
	 *            The character sequence to be split
	 *
	 * @param limit
	 *            The result threshold, as described above
	 *
	 * @return The array of strings computed by splitting the input around
	 *         matches of this pattern
	 */
	public String[] split(final CharSequence input, final int limit) {
		// The split is delegated to the internal Java pattern
		java.util.regex.Pattern delegate = this.getInternalPattern();

		return delegate.split(input, limit);
	}

	/**
	 * Splits the given input sequence around matches of this pattern.
	 *
	 *

* This method works as if by invoking the two-argument {@link #split(java.lang.CharSequence, int) split} method * with the given * input sequence and a limit argument of zero. Trailing empty strings * are * therefore not included in the resulting array. *

* *

* The input "boo:and:foo", for example, yields the following * results with these expressions: * *

* * * * * * * * * * * * * *
*

* Regex    

*

* Result

:{ "boo", "and", "foo" }
o{ "b", "", ":and:f" }
*
*
	 *
	 * @param input
	 *            The character sequence to be split
	 *
	 * @return The array of strings computed by splitting the input around
	 *         matches of this pattern
	 */
	public String[] split(final CharSequence input) {
		// The split is delegated to the internal Java pattern; a limit of zero
		// discards trailing empty strings
		java.util.regex.Pattern delegate = this.getInternalPattern();

		return delegate.split(input, 0);
	}

	/**
	 * Returns a literal pattern <code>String</code> for the specified
	 * <code>String</code>.
	 *
	 *

* This method produces a String that can be used to create a * Pattern that would match the string s as if it * were a literal pattern. *

* *

* Metacharacters or escape sequences in the input sequence will be * given no * special meaning. *

*
* @param s
*            The string to be literalized
* @return A literal string replacement
*/
	public static String quote(final String s) {
		// Quoting is delegated to java.util.regex.Pattern.
		final String quoted = java.util.regex.Pattern.quote(s);
		return quoted;
	}

	/**
	 * Java regular expression metacharacters.
	 *
	 *

\.*?+[]{}|()^$#

* *

Note: '#' is included in the escaped metacharacters in case {@link Pattern#COMMENTS COMMENTS} is * enabled, and the escaped text is only a part of the regular expression, which occurs when the COMMENTS flag is * enabled.

* *

Note: ']' is included in the case that the regular expression is used in a regex tool that * requires the closing square bracket to be escaped (for example, JavaScript). Java does not require it to be * escaped, but escaping it does no harm.

* *

Note: '}' is included in the case that the regular expression is used in a regex tool that * requires the closing curly brace to be escaped (for example, Android development). Java does not require it to be * escaped, but escaping it does no harm.

*/
	// Default metacharacter set passed by literal(String) to literal(String, String).
	public static final String REGEX_METACHARACTERS = "\\.*?+[]{}|()^$#";

	/**
	 * Java regular expression metacharacters when within a character class (for example, [abc]).
	 *
	 *

^-[]\#&

* *

Note: '#' is included in the escaped metacharacters in case {@link Pattern#COMMENTS COMMENTS} is * enabled, and the escaped text is only a part of the regular expression, which occurs when the COMMENTS flag is * enabled.

*/
	public static final String REGEX_CHAR_CLASS_METACHARACTERS = "^-[]\\#&";

	/** Pattern to match any character. */
	// private static final Pattern anyCharacterPattern = compile(".", DOTALL);

	/**
	 * Pattern used to escape metacharacters.
	 */
	// static final Pattern ESCAPE_REGEX_METACHARACTERS = escapeMetacharacters(REGEX_METACHARACTERS);

	/**
	 * Pattern used to escape metacharacters found in a character class.
	 */
	// static final Pattern ESCAPE_REGEX_CHAR_CLASS_METACHARS = escapeMetacharacters(REGEX_CHAR_CLASS_METACHARACTERS);

	/**
	 * Returns a literal pattern String for the specified
	 * String.
	 *
	 *

This method produces a String that can be used to * create a Pattern that would match the string s as * if it were a literal pattern.

* *

Metacharacters or escape sequences in the input sequence will be given no special meaning.

* *

This method escapes the metacharacters specified by {@link #REGEX_METACHARACTERS}: * \.*?+[]{}|()^$#

* *

Note: this function escapes each metacharacter individually, * whereas {@link Pattern#quote(String)} uses a \Q..\E block. This * method can be used to create a regular expression to use with tools that don't support \Q..\E * blocks.

*
* @param s
*            The string to be literalized
* @return A literal string replacement
*/
	public static String literal(final String s) {
		// Escape the default metacharacter set, REGEX_METACHARACTERS.
		final String toEscape = REGEX_METACHARACTERS;
		return literal(s, toEscape);
	}

	/**
	 * Returns a literal pattern String for the specified
	 * String.
	 *
	 *

This method produces a String that can be used to * create a Pattern that would match the string s as * if it were a literal pattern.

* *

The specified metacharacters or escape sequences in the input sequence will be given no special * meaning.

* *

Note: this function escapes each metacharacter individually, * whereas {@link Pattern#quote(String)} uses a \Q..\E block. This * method can be used to create a regular expression to use with tools that don't support \Q..\E * blocks.

* * @param s * The string to be literalized * @param metacharacters * the metacharacters to escape * @return A literal string replacement */ public static String literal(final String s, final String metacharacters) { // literal.length() will be at least s.length, but no more than 2*s.length() StringBuilder literal = new StringBuilder(s.length()); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { literal.append(c); continue; } if (metacharacters.indexOf(c) != -1) { literal.append('\\'); } literal.append(c); } return literal.toString(); // return literal(s, escapeMetacharacters(metacharacters)); } /** * Returns a literal pattern String for the specified * String. * *

This method produces a String that can be used to * create a Pattern that would match the string s as * if it were a literal pattern.

* *

Metacharacters or escape sequences in the input sequence will be given no special meaning.

* *

This method escapes the following metacharacters:

* *

Note: this function escapes each metacharacter individually, * whereas {@link Pattern#quote(String)} uses a \Q..\E block. This * method can be used to create a regular expression to use with tools that don't support \Q..\E * blocks.

* * @param s * The string to be literalized * @param escapeMetachars * pattern to match the metacharacters to escape * @return A literal string replacement */ // static String literal(String s, Pattern escapeMetachars) // { // return escapeMetachars.matcher(s).replaceAll("\\\\$0"); // } /** * Returns a pattern which will match any single metacharacter in the specified metacharacters. * * @param metacharacters * the metacharacters * @return a pattern which will match any single metacharacter in the specified metacharacters */ // private static Pattern escapeMetacharacters(String metacharacters) // { // return compile("[" + literal(metacharacters, anyCharacterPattern) + "]", 0, false); // } /** * Recompile the Pattern instance from a stream. The original pattern string * is read in and the object tree is recompiled from it. */ private void readObject(final java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { // Read in all fields s.defaultReadObject(); // this.caseInsensitiveGroupNames = has(CASE_INSENSITIVE_NAMES); this.groupMapping = new HashMap<>(2); // if length > 0, the Pattern is lazily compiled this.compiled = false; if (this.pattern.length() == 0) { this.initializeEmptyPattern(); } // TODO: cache pattern here?? } private void initializeForZeroGroups() { /* * Expected results (with 0 groups): * * groupMapping : {[0][1]=0} * -size: 1 */ this.groupMapping = new HashMap<>(1); // Map [i][1] -> i (e.g. [0][1]=0) this.groupMapping.put(getMappingName(0, 1), 0); /* * Expected results (with 0 groups): * * groupCounts: {=0, [0]=1} * -size: 2 */ this.groupCounts = new HashMap<>(2); // Map empty string group name to group count this.groupCounts.put("", 0); // Map [i] -> 1 (e.g. 
[0]=1) this.groupCounts.put(wrapIndex(0), 1); /* Initialize pattern values */ this.capturingGroupCount = 0; this.addedGroups = false; } private void initializeEmptyPattern() { this.initializeForZeroGroups(); this.setInternalPattern(""); this.compiled = true; } /** * This private constructor is used to create all Patterns (other than those upcasted from Java patterns). * The pattern string and match flags are all that is needed to completely describe a * Pattern. * * @param regex * the expression to be compiled * @param flags * match flags bit mask * @param lazyCompiling * whether to lazily compile pattern */ private Pattern(final String regex, final int flags, final boolean lazyCompiling) { // TODO: derive method to perform a simple check on regex to see if refactoring is necessary // many regexes don't need to be refactored, try to detect some // Optimize refactoring this.pattern = regex; this.flags = flags; // this.caseInsensitiveGroupNames = has(CASE_INSENSITIVE_NAMES); if (regex.length() > 0) { if (!lazyCompiling) { this.compile(); } } else { this.initializeEmptyPattern(); } } /** * Constructor to create a Pattern object from a Java Pattern. * *

Note: no compiling or refactoring of the pattern is performed, * since the Pattern is already compiled and valid for use by Java's Regular * Expression engine.

* * @param pattern * the pattern * @since 0.2 */ Pattern(final java.util.regex.Pattern pattern) { this.pattern = pattern.pattern(); this.flags = pattern.flags(); int groupCount = pattern.matcher("").groupCount(); this.capturingGroupCount = groupCount; // Initialize groupMapping and groupCounts this.groupMapping = new HashMap<>(groupCount + 1); this.groupCounts = new HashMap<>(groupCount + 2); /* * Expected results (with three groups): * * groupMapping : {[0][1]=0, [1][1]=1, [2][1]=2, [3][1]=3} * -size: groupCount + 1 * * groupCounts: {=3, [0]=1, [1]=1, [2]=1, [3]=1} * -size: groupCount + 2 */ // Map empty string group name to group count this.groupCounts.put("", groupCount); for (int i = 0; i <= groupCount; i++) { String groupName = wrapIndex(i); // Map [i][1] -> i (e.g. [0][1]=0) this.groupMapping.put(getMappingName(groupName, 1), i); // Map [i] -> 1 (e.g. [0]=1) this.groupCounts.put(groupName, 1); } // For Java 7, adds named groups (if any) try { // Get map of group names -> group index Field field; field = pattern.getClass().getDeclaredField("namedGroups"); // https://stackoverflow.com/a/43013209 // (fix for SpotBugs warning) AccessController.doPrivileged((PrivilegedAction) () -> { field.setAccessible(true); return null; }); @SuppressWarnings("unchecked") Map javaGroupMapping = (Map) field.get(pattern); // Null if doesn't have any named groups if (javaGroupMapping != null) { // Add entries for each named group for (Entry entry : javaGroupMapping.entrySet()) { String groupName = entry.getKey(); Integer groupNumber = entry.getValue(); // Map groupName[1] -> groupNumber this.groupMapping.put(getMappingName(groupName, 1), groupNumber); // Map groupName -> 1 (e.g. 
[0]=1) // (Always valid, since Java doesn't allow duplicate groups) this.groupCounts.put(groupName, 1); } } } catch (RuntimeException | NoSuchFieldException | IllegalAccessException e) { // Do nothing } this.internalPattern = pattern; this.compiled = true; } /* Methods called ONLY when initializing a Pattern */ void setCapturingGroupCount(final int capturingGroupCount) { this.capturingGroupCount = capturingGroupCount; } void setAddedGroups(final boolean addedGroups) { this.addedGroups = addedGroups; } int getGroupCount(final String groupName) { Integer groupCount = this.groupCounts.get(groupName); return (groupCount == null ? 0 : groupCount); } /** * Gets the mapping from group names to group counts * *

For named groups, the key is the group name

*

For unnamed groups, the key is the group number surrounded by '[' and ']'. For example, group 1 would be the * key "[1]" in the returned map

* * @return an unmodifiable map from group names to group counts */ public Map getGroupCounts() { return Collections.unmodifiableMap(this.groupCounts); } void setGroupCounts(final Map groupCounts) { this.groupCounts = groupCounts; } /** * Refactors the regular expression to an equivalent form usable by Java's {@link java.util.regex.Pattern} class, * and compiles the internal * Pattern using the refactored regular expression. */ private void compile() { // System.out.println("Compiling (" + flags + "): " + pattern); String refactoredPattern; Refactor refactor; if (this.has(LITERAL)) { this.initializeForZeroGroups(); refactor = null; refactoredPattern = this.pattern; } else { this.groupMapping = new HashMap<>(2); // refactor to be used as a RegEx pattern // Refactor refactor = new Refactor(pattern); refactor = new Refactor(this); refactoredPattern = refactor.toString(); } try { // System.out.println(refactoredPattern); // setInternalPattern(refactor.toString()); this.setInternalPattern(refactoredPattern); } catch (java.util.regex.PatternSyntaxException e) { // // on error, show error using the original pattern // // (not refactored form) // // System.out.println(e.getMessage()); // String desc = e.getDescription(); int index; try { // TODO: can sometimes be incorrect with subpatterns // e.g. "(?+1)(?*)" index = refactor == null ? 
-1 : refactor.changes.getOriginalIndex(e.getIndex()); } catch (IllegalArgumentException e1) { // Unknown original index // Can occurn, for example, in the regex "(?*\\01)" index = -1; } // System.out.println(refactor.result); throw new PatternSyntaxException(desc, this.pattern, index); // // PatternErrorMessage errorMessage = PatternErrorMessage // .getValue(desc); // // if (errorMessage != null) // throw new PatternSyntaxException(errorMessage, pattern, index); // else // throw new PatternSyntaxException(desc, pattern, index); } // catch (info.codesaway.util.regex.PatternSyntaxException e) // { // int index = refactor.changes.getOriginalIndex(e.getIndex()); // PatternErrorMessage errorMessage = e.getErrorMessage(); // // if (errorMessage != null) // throw new PatternSyntaxException(errorMessage, pattern, index); // else // { // int errorCode = e.getErrorCode(); // String desc = e.getDescription(); // // throw new PatternSyntaxException(errorCode, desc, pattern, index); // } // } this.compiled = true; } /** * Sets the internal Pattern to the Pattern * returned when calling * *
{@link java.util.regex.Pattern}.{@link java.util.regex.Pattern#compile(String, int) compile}(regex, flags) *
* * @param regex * the expression to be compiled * @see java.util.regex.Pattern#compile(String, int) */ private void setInternalPattern(final String regex) { // keep all flags except those introduced in this class this.internalPattern = java.util.regex.Pattern.compile(regex, this.flags & ~(DUPLICATE_NAMES | VERIFY_GROUPS | DOTNET_NUMBERING | EXPLICIT_CAPTURE | PERL_OCTAL)); } /** * Indicates whether a particular flag is set or not. * * @param f * the flag to test * @return true if the particular flag is set */ boolean has(final int f) { return (this.flags & f) != 0; } /** * @param flag * @return * @since 0.2 */ public boolean has(final PatternFlag flag) { return this.has(flag.intValue()); } /** * Compares two character sequences lexigraphically, * except that embedded numbers are treated numerically. * *

For example, when using this method, "1.2.9.1" is less than * "1.2.10.5", whereas a lexicographical comparison would yield the * opposite.

* *

When comparing, leading zeros are ignored, unless the inputted * sequences are otherwise * equivalent. If the two inputs are identical, then 0 is * returned. Otherwise, the left-most number * where the number of leading zeros differs is used to determine the * ordering. In this case, the one with more leading zeros is first.

* *

For example, the below list is sorted in increasing order:

* *
* <ol>
* <li>2009-1-2</li>
* <li>2009-01-05</li>
* <li>2009-01-5</li>
* <li>2009-1-05</li>
* <li>2009-1-5</li>
* </ol>
* *

This function can be used to compare versions, dates, and other * numeric based data. Since the comparison is done from left to right, the * format must have the most significant part first. For example, in a date * format, that would be year, month, and then day to sort in chronological * order.

* *

Note that for correct sorting of numeric based data, the formats must * be identical - otherwise, where the formats differ, the sorting is based * on the ASCII value of the characters at which the formats differ. * For example, the date "2009-1-5" is less * than "2009.1.2", but is not chronologically before it. This result is due to * the ASCII value for '-' (0x2D) being less than the ASCII value for * '.' (0x2E).

* *

This method can be called in the compare function of a {@link Comparator} object * to provide sorting.

* *
	 * Comparator<String> comparator = new Comparator<String>() {
	 *
	 *   public int compare(String o1, String o2) {
	 *     return naturalCompareTo(o1, o2);
	 *   }
	 * };
* * @param value1 * the first character sequence * @param value2 * the second character sequence * @return 0 if the two values are equal, -1 if the first value is * "less than" the second, and 1 if the first value is * "greater than" the second. * * @see String#compareTo(String) */ public static int naturalCompareTo(final CharSequence value1, final CharSequence value2) { // XXX: don't use regex - performance hit java.util.regex.Matcher matcher1 = naturalSort.matcher(value1); java.util.regex.Matcher matcher2 = naturalSort.matcher(value2); // left-most number with different number of leading zeros // (used to break ties if strings are otherwise equivalent) int leftMostDifference = 0; while (matcher1.find() && matcher2.find()) { String match1 = matcher1.group(); String match2 = matcher2.group(); if (match1.length() == 0) { if (match2.length() == 0) { // equivalent - return leftMostDifference return leftMostDifference; } // string2 is "longer" return -1; } else if (match2.length() == 0) { // string1 is "longer" return 1; } boolean isNumber1 = matcher1.start(2) != -1; boolean isNumber2 = matcher2.start(2) != -1; if (isNumber1 && isNumber2) { // both numbers - compare numerically String number1 = matcher1.group(2); String number2 = matcher2.group(2); if (number1.length() != number2.length()) { return number1.length() < number2.length() ? -1 : 1; } int compareTo = number1.compareTo(number2); if (compareTo != 0) { return compareTo; } if (leftMostDifference == 0) { // only do once - for left-most difference if (match1.length() != match2.length()) { // different number of leading zeros // e.g. "01" and "1" // more leading zeros first "01" < "1" leftMostDifference = match1.length() > match2.length() ? -1 : 1; } } } else { int compareTo = match1.compareTo(match2); if (compareTo != 0) { return compareTo; } } } return 0; } /** * Normalizes the group name. 
* * @param groupName * the group name * @return the normalized group name * @throws IllegalArgumentException * If groupName is a relative unnamed group (e.g. * "[-4]"), which doesn't exist, or if groupName is * an unnamed group whose index is not a parsable integer (e.g. * "[a]") */ String normalizeGroupName(final String groupName) { // System.out.println(groupName); if (groupName.startsWith("[") && groupName.endsWith("]")) { try { int index = getAbsoluteGroupIndex(parseInt(groupName.substring(1, groupName.length() - 1)), this.groupCount()); return wrapIndex(index); } catch (IndexOutOfBoundsException e) { throw noNamedGroup(groupName); } catch (IllegalArgumentException e) { throw noNamedGroup(groupName); } } // else if (groupCounts.get(groupName) == null) { // try { // // Check if numbered group // int groupNumber = Integer.parseInt(groupName); // // if (groupCount(groupNumber) != 0) // groupName = wrapIndex(getAbsoluteGroupIndex(groupNumber, groupCount())); // } catch (Exception e) { // } // } return groupName; } /* Groovy methods - makes RegExPlus groovier */ /** * 'Case' implementation for this class, which allows * testing a String against a number of regular expressions (in Groovy only). * For example: *
switch( str ) {
	 * case +/one/ :
	 * // the regex 'one' matches the value of str
	 * }
	 * 
* * @param switchValue * the switch value * @return true if the switchValue is deemed to match this Pattern * @since 0.2 */ public boolean isCase(final Object switchValue) { if (switchValue == null) { // return caseValue == null; // Since this != null, always return false if switch value is null return false; } final Matcher matcher = this.matcher(switchValue.toString()); if (matcher.matches()) { RegExPlusSupport.setLastMatcher(matcher); return true; } else { return false; } } /** * Alias for {@link #getInternalPattern()}. * * @return the regular expression pattern * @since 0.2 */ public java.util.regex.Pattern bitwiseNegate() { return this.getInternalPattern(); } /** * Returns this Pattern. * *

Added for consistency for use in Groovy, since both +charSequence and +javaPattern are also supported. * This method ensures that the 'positive' operator will return a RegExPlus Pattern, for all three cases:

* *
* <ol>
* <li>Compiling a CharSequence regex: +charSequence</li>
* <li>Promoting a Java Pattern: +javaPattern</li>
* <li>When used on an existing RegExPlus Pattern: +regexPlusPattern</li>
* </ol>
* * @return this Pattern. */ public Pattern positive() { return this; } /** * @param regex * @return * @since 0.2 */ public Pattern or(final CharSequence regex) { Pattern pattern1 = this.normalize(); return or(pattern1.pattern(), pattern1.flags(), regex, 0); } /** * * @param pattern * @return * @since 0.2 */ public Pattern or(final Pattern pattern) { Pattern pattern1 = this.normalize(); Pattern pattern2 = pattern.normalize(); return or(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags()); } /** * * @param pattern * @return * @since 0.2 */ public Pattern or(final java.util.regex.Pattern pattern) { Pattern pattern1 = this.normalize(); Pattern pattern2 = normalize(pattern); return or(pattern1.pattern(), pattern1.flags(), pattern2.pattern(), pattern2.flags()); } private static Pattern or(final String regex1, final int flags1, final CharSequence regex2, final int flags2) { if (flags1 != flags2) { throw new IllegalArgumentException( "Flags in normalized patterns must be identical to 'or' them.\n" + "Normalized :\n" + "Flags: " + new PatternFlags(flags1) + "\n" + "Pattern: " + regex1 + "\n\n" + "Normalized :\n" + "Flags: " + new PatternFlags(flags2) + "\n" + "Pattern: " + regex2); } return lazyCompile("(?|(?:" + regex1 + ")|(?:" + regex2 + "))"); } public Pattern or(final PatternFlag flag) { return this.or(flag.intValue()); } public Pattern or(final Set flags) { return this.or(PatternFlags.intValue(flags)); } public Pattern or(final int flags) { return lazyCompile(this.pattern(), this.flags() | flags); } /* * TODO: implement 'and' using positive look-aheads. * Works, but what would it match - the entire string?? nothing? * Not sure how to implement to be most effective. */ /* * TODO: implement a negate operation ('negative' operator) - returns a Pattern which matches everything * that this Pattern does not. 
*/

	/**
	 * Returns a pattern for this pattern's regular expression followed by the given
	 * regular expression, using this pattern's flags.
	 *
	 * @param regex
	 *            the regular expression to append
	 * @return the pattern for the concatenated regular expression
	 * @since 0.2
	 */
	public Pattern plus(final CharSequence regex) {
		final String combined = this.pattern() + regex;
		return lazyCompile(combined, this.flags());
	}

	/**
	 * Returns a pattern for this pattern followed by the given pattern.
	 * Both patterns are normalized first.
	 *
	 * @param pattern
	 *            the pattern to append
	 * @return the pattern for the concatenated regular expression
	 * @throws IllegalArgumentException
	 *             if the normalized flags of the given pattern are neither 0 nor equal to the
	 *             normalized flags of this pattern
	 * @since 0.2
	 */
	public Pattern plus(final Pattern pattern) {
		final Pattern first = this.normalize();
		final Pattern second = pattern.normalize();

		return plus(first.pattern(), first.flags(), second.pattern(), second.flags());
	}

	/**
	 * Returns a pattern for this pattern followed by the given Java pattern.
	 * Both patterns are normalized first.
	 *
	 * @param pattern
	 *            the Java pattern to append
	 * @return the pattern for the concatenated regular expression
	 * @throws IllegalArgumentException
	 *             if the normalized flags of the given pattern are neither 0 nor equal to the
	 *             normalized flags of this pattern
	 * @since 0.2
	 */
	public Pattern plus(final java.util.regex.Pattern pattern) {
		final Pattern first = this.normalize();
		final Pattern second = normalize(pattern);

		return plus(first.pattern(), first.flags(), second.pattern(), second.flags());
	}

	/**
	 * Concatenates two normalized regular expressions, keeping the flags of the first.
	 *
	 * @param regex1
	 *            the first regular expression
	 * @param flags1
	 *            the flags of the first pattern
	 * @param regex2
	 *            the second regular expression
	 * @param flags2
	 *            the flags of the second pattern; must be 0 or equal to flags1
	 * @return the pattern for regex1 followed by regex2, with flags1
	 * @throws IllegalArgumentException
	 *             if flags2 is neither 0 nor equal to flags1
	 */
	private static Pattern plus(final String regex1, final int flags1, final CharSequence regex2, final int flags2) {
		final boolean compatibleFlags = flags2 == 0 || flags2 == flags1;

		if (!compatibleFlags) {
			throw new IllegalArgumentException(
					"Flags in normalized patterns must be 0 or the same as the first pattern to 'add' to existing pattern.\n"
							+ "Pattern 1:\n" + "Flags: " + new PatternFlags(flags1) + "\n" + "Pattern: " + regex1
							+ "\n\n" + "Normalized Pattern 2:\n" + "Flags: " + new PatternFlags(flags2) + "\n"
							+ "Pattern: " + regex2);
		}

		final String combined = regex1 + regex2;
		return lazyCompile(combined, flags1);
	}

	/**
	 * Returns the actual group number (in the internal pattern) for the given
	 * group name and occurrence.
	 *
	 * @param groupName
	 *            the group name
	 * @param occurrence
	 *            the occurrence
	 * @return the mapped index
	 */
	Integer getMappedIndex(final String groupName, final int occurrence) {
		return this.getMappedIndex(getMappingName(groupName, occurrence));
	}

	/**
	 * Returns the actual group number (in the internal pattern) for the given
	 * mapping name.
* * @param mappingName * the mapping name * @return the mapped index */ Integer getMappedIndex(final String mappingName) { return this.getGroupMapping().get(mappingName); } /** * Returns the (internally) used string for mapping a group and occurrence * (in the original pattern) to its group index (in the refactored pattern). * * @param groupName * the group name * @param occurrence * the occurrence * @return groupName + "[" + occurrence + "]" */ static String getMappingName(final String groupName, final int occurrence) { return groupName + "[" + occurrence + "]"; } /** * Returns the (internally) used string for mapping a group and occurrence * (in the original pattern) to its group index (in the refactored pattern). * * @param groupIndex * the group index * @param occurrence * the occurrence * @return "[" groupIndex + "][" + occurrence + "]" */ static String getMappingName(final int groupIndex, final int occurrence) { return getMappingName(wrapIndex(groupIndex), occurrence); } /** * Returns the group name for the given group index in a "branch reset" * subpattern * * @param groupIndex * the group number * @return the group name for the given group index in a "branch reset" * subpattern */ static String wrapIndex(final int groupIndex) { return "[" + groupIndex + "]"; } /** * Returns the given group name, adjusting the case based on * the {@link #CASE_INSENSITIVE_NAMES} flag. * * @param groupName * the group name * @return the group name, adjusting the case based on the {@link #CASE_INSENSITIVE_NAMES} flag */ // String handleCase(String groupName) // { // return hasCaseInsensitiveGroupNames() // ? groupName.toLowerCase(Locale.ENGLISH) : groupName; // } /** * Returns a regular expression that matches the specified numeric range. * The returned expression is wrapped in a non-capture group to allow * easy integration. * *

The mode parameter has the same form as the leading part of * a numeric range. The return from * range(start, end, * mode) is equivalent to the internal representation * of a numeric range.

* *

Format for mode parameter: Mode[Base[BaseMode]]

* *

Descriptions and valid values:

*
    *
  • Mode: either "Z" (allows leading zeros) or "NZ" (no * leading zeros)
  • * *
  • Base: the numeric base for start and end (valid bases, 2 - * 36)
  • * *
  • BaseMode: whether to allow lower ("L"), upper ("U"), or both * upper and lower-case digits (omit BaseMode). This mode applies only when * matching * numbers in bases above 10. Note that this only affects matching, and that * both upper and lower-case digits can be specified as part of the range in * the start and end parameters, regardless of this setting. * *

    If the result doesn't include "letter digits" or if the base is * ten or less, * BaseMode has no effect, but can be specified (for * consistency).

  • *
* * @param start * the start of the range * @param end * the end of the range * @param mode * a string in the format described above that specifies the mode * for the numeric range * @return a regular expression that matches the specified numeric range, * wrapped in a non-capture group for easy integration * @throws IllegalArgumentException * If mode is not in the correct form, as described above */ public static String range(final int start, final int end, final String mode) { // java.util.regex.Matcher rangeMode = Range.rangeModeRegEx.matcher(mode); // if (!rangeMode.matches()) // throw new IllegalArgumentException("Illegal range mode"); // int base = rangeMode.group(2) == null ? 10 : Integer.parseInt(rangeMode // .group(2)); RangeMode rangeMode = new RangeMode(mode); int base = rangeMode.base(); // return "(?:" + // Range.range(Integer.toString(start, base), Integer.toString( // end, base), mode) + ")"; return "(?:" + PatternRange.range(Integer.toString(start, base), Integer.toString(end, base), rangeMode) + ")"; } /** * Returns a regular expression that matches the specified numeric range. * The returned expression is wrapped in a non-capture group to allow * easy integration. * *

The mode parameter has the same form as the leading part of * a numeric range. The return from * range(start, end, * mode) is equivalent to the internal representation * of a numeric range.

* *

Format for mode parameter: Mode[Base[BaseMode]]

* *

Descriptions and valid values:

*
    *
  • Mode: either "Z" (allows leading zeros) or "NZ" (no * leading zeros)
  • * *
  • Base: the numeric base for start and end (valid bases, 2 - * 36)
  • * *
  • BaseMode: whether to allow lower ("L"), upper ("U"), or both * upper and lower-case digits (omit BaseMode). This mode applies only when * matching * numbers in bases above 10. Note that this only affects matching, and that * both upper and lower-case digits can be specified as part of the range in * the start and end parameters, regardless of this setting. * *

    If the result doesn't include "letter digits" or if the base is * ten or less, * BaseMode has no effect, but can be specified (for * consistency).

  • *
* * @param start * the start of the range * @param end * the end of the range * @param mode * a string in the format described above that specifies the mode * for the numeric range * @return a regular expression that matches the specified numeric range, * wrapped in a non-capture group for easy integration * @throws NullPointerException * If either start or end is null * @throws IllegalArgumentException * If either start or end * is the empty string or contains invalid digits for the * specified base; also thrown if * mode is not in the correct form, as described above */ public static String range(final String start, final String end, final String mode) { if (start == null) { throw new NullPointerException("Start value cannot be null"); } if (start.length() == 0) { throw new IllegalArgumentException("Start value cannot be the empty string"); } if (end == null) { throw new NullPointerException("End value cannot be null"); } if (end.length() == 0) { throw new IllegalArgumentException("End value cannot be the empty string"); } return "(?:" + PatternRange.range(start, end, new RangeMode(mode)) + ")"; } /** * Get ThreadLocal for matcher * *

This is to help handle the fact that the Matcher is not thread-safe

* @param regex The expression to be compiled * @return a ThreadLocal matcher for the specified regex * @since 1.0 */ public static ThreadLocal getThreadLocalMatcher(final String regex) { Pattern pattern = Pattern.compile(regex); return ThreadLocal.withInitial(pattern::matcher); } /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.1 */ // Added in Java 1.8 Pattern class public Predicate asPredicate() { return s -> this.matcher(s).find(); } /** * Creates a predicate which can be used to match a string. * *

Implementation note: this method uses {@link #getThreadLocalMatcher(String)} to reuse the Matcher

* @param regex The regular expression
* @return The predicate which can be used for matching on a string
* @since 1.1
*/
	public static Predicate<String> asPredicate(final String regex) {
		ThreadLocal<Matcher> matcher = getThreadLocalMatcher(regex);

		// Reuse the thread-local matcher: reset it to the tested string, then search.
		return s -> matcher.get().reset(s).find();
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy