java.util.regex.Pattern Maven / Gradle / Ivy
Show all versions of android-all Show documentation
/* * Copyright (C) 2014 The Android Open Source Project * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package java.util.regex; import dalvik.annotation.optimization.ReachabilitySensitive; import libcore.util.NativeAllocationRegistry; import java.util.Iterator; import java.util.ArrayList; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.function.Predicate; import java.util.stream.Stream; import java.util.stream.StreamSupport; import libcore.util.EmptyArray; // Android-changed: Add min API level of 26 for the named capaturing in javadoc /** * A compiled representation of a regular expression. * *
<p> A regular expression, specified as a string, must first be compiled into
 * an instance of this class. The resulting pattern can then be used to create
 * a {@link Matcher} object that can match arbitrary {@link
 * java.lang.CharSequence character sequences} against the regular
 * expression. All of the state involved in performing a match resides in the
 * matcher, so many matchers can share the same pattern.
 *
 *
A typical invocation sequence is thus * *
* <blockquote><pre>
 * Pattern p = Pattern.{@link #compile compile}("a*b");
 * Matcher m = p.{@link #matcher matcher}("aaaaab");
 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
 *
 * <p> A {@link #matches matches} method is defined by this class as a
 * convenience for when a regular expression is used just once. This method
 * compiles an expression and matches an input sequence against it in a single
 * invocation. The statement
 *
 * <blockquote><pre>
 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
 *
 * is equivalent to the three statements above, though for repeated matches it
 * is less efficient since it does not allow the compiled pattern to be reused.
 *
 * <p> Instances of this class are immutable and are safe for use by multiple
 * concurrent threads. Instances of the {@link Matcher} class are not safe for
 * such use.
 *
 *
 *
Summary of regular-expression constructs
* ** *
* ** * *Construct *Matches ** * * Characters * x *The character x * \\ *The backslash character * \0n *The character with octal value 0n * (0 <= n <= 7) * \0nn *The character with octal value 0nn * (0 <= n <= 7) * \0mnn *The character with octal value 0mnn * (0 <= m <= 3, * 0 <= n <= 7) * \xhh *The character with hexadecimal value 0xhh * \uhhhh *The character with hexadecimal value 0xhhhh * \x{h...h} *The character with hexadecimal value 0xh...h * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} * <= 0xh...h <=  * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT}) * \t *The tab character ('\u0009') * \n *The newline (line feed) character ('\u000A') * \r *The carriage-return character ('\u000D') * \f *The form-feed character ('\u000C') * \a *The alert (bell) character ('\u0007') * \e *The escape character ('\u001B') * * \cx *The control character corresponding to x * * * Character classes * [abc] *a, b, or c (simple class) * [^abc] *Any character except a, b, or c (negation) * [a-zA-Z] *a through z * or A through Z, inclusive (range) * [a-d[m-p]] *a through d, * or m through p: [a-dm-p] (union) * [a-z&&[def]] *d, e, or f (intersection) * [a-z&&[^bc]] *a through z, * except for b and c: [ad-z] (subtraction) * [a-z&&[^m-p]] *a through z, * and not m through p: [a-lq-z](subtraction) * * * * Predefined character classes * . 
*Any character (may or may not match line terminators) * \d *A digit: [0-9] * \D *A non-digit: [^0-9] * \s *A whitespace character: [ \t\n\x0B\f\r] * \S *A non-whitespace character: [^\s] * \w *A word character: [a-zA-Z_0-9] * * \W *A non-word character: [^\w] * * * POSIX character classes (US-ASCII only) * \p{Lower} *A lower-case alphabetic character: [a-z] * \p{Upper} *An upper-case alphabetic character:[A-Z] * \p{ASCII} *All ASCII:[\x00-\x7F] * \p{Alpha} *An alphabetic character:[\p{Lower}\p{Upper}] * \p{Digit} *A decimal digit: [0-9] * \p{Alnum} *An alphanumeric character:[\p{Alpha}\p{Digit}] * * \p{Punct} *Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ * \p{Graph} *A visible character: [\p{Alnum}\p{Punct}] * \p{Print} *A printable character: [\p{Graph}\x20] * \p{Blank} *A space or a tab: [ \t] * \p{Cntrl} *A control character: [\x00-\x1F\x7F] * \p{XDigit} *A hexadecimal digit: [0-9a-fA-F] * * \p{Space} *A whitespace character: [ \t\n\x0B\f\r] * * * java.lang.Character classes (simple java character type) * \p{javaLowerCase} *Equivalent to java.lang.Character.isLowerCase() * \p{javaUpperCase} *Equivalent to java.lang.Character.isUpperCase() * \p{javaWhitespace} *Equivalent to java.lang.Character.isWhitespace() * * \p{javaMirrored} *Equivalent to java.lang.Character.isMirrored() * * * Classes for Unicode scripts, blocks, categories and binary properties * \p{IsLatin} *A Latin script character (script) * \p{InGreek} *A character in the Greek block (block) * \p{Lu} *An uppercase letter (category) * \p{IsAlphabetic} *An alphabetic character (binary property) * \p{Sc} *A currency symbol * \P{InGreek} *Any character except one in the Greek block (negation) * * [\p{L}&&[^\p{Lu}]] *Any letter except an uppercase letter (subtraction) * * * Boundary matchers * ^ *The beginning of a line * $ *The end of a line * \b *A word boundary * \B *A non-word boundary * \A *The beginning of the input * \G *The end of the previous match * \Z *The end of the input but for the 
final * terminator, if any * * \z *The end of the input * * * Greedy quantifiers * X? *X, once or not at all * X* *X, zero or more times * X+ *X, one or more times * X{n} *X, exactly n times * X{n,} *X, at least n times * * X{n,m} *X, at least n but not more than m times * * * Reluctant quantifiers * X?? *X, once or not at all * X*? *X, zero or more times * X+? *X, one or more times * X{n}? *X, exactly n times * X{n,}? *X, at least n times * * X{n,m}? *X, at least n but not more than m times * * * Possessive quantifiers * X?+ *X, once or not at all * X*+ *X, zero or more times * X++ *X, one or more times * X{n}+ *X, exactly n times * X{n,}+ *X, at least n times * * X{n,m}+ *X, at least n but not more than m times * * * Logical operators * XY *X followed by Y * X|Y *Either X or Y * * (X) *X, as a capturing group * * * Back references * * \n *Whatever the nth * capturing group matched * * \k<name> *Whatever the * named-capturing group "name" matched. Only available for API 26 or above * * * Quotation * \ *Nothing, but quotes the following character * \Q *Nothing, but quotes all characters until \E * * * \E *Nothing, but ends quoting started by \Q * * * Special constructs (named-capturing and non-capturing) * (?<name>X) *X, as a named-capturing group. Only available for API 26 or above. * (?:X) *X, as a non-capturing group * (?idmsuxU-idmsuxU) *Nothing, but turns match flags i * d m s * u x U * on - off * (?idmsux-idmsux:X) *X, as a non-capturing group with the * given flags i d * m s u * x on - off * (?=X) *X, via zero-width positive lookahead * (?!X) *X, via zero-width negative lookahead * (?<=X) *X, via zero-width positive lookbehind * (?<!X) *X, via zero-width negative lookbehind * * (?>X) *X, as an independent, non-capturing group
* * * *Backslashes, escapes, and quoting
* *The backslash character ('\') serves to introduce escaped * constructs, as defined in the table above, as well as to quote characters * that otherwise would be interpreted as unescaped constructs. Thus the * expression \\ matches a single backslash and \{ matches a * left brace. * *
It is an error to use a backslash prior to any alphabetic character that * does not denote an escaped construct; these are reserved for future * extensions to the regular-expression language. A backslash may be used * prior to a non-alphabetic character regardless of whether that character is * part of an unescaped construct. * *
Backslashes within string literals in Java source code are interpreted * as required by * The Java™ Language Specification * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) * It is therefore necessary to double backslashes in string * literals that represent regular expressions to protect them from * interpretation by the Java bytecode compiler. The string literal * "\b", for example, matches a single backspace character when * interpreted as a regular expression, while "\\b" matches a * word boundary. The string literal "\(hello\)" is illegal * and leads to a compile-time error; in order to match the string * (hello) the string literal "\\(hello\\)" * must be used. * * *
Character Classes
* *Character classes may appear within other character classes, and * may be composed by the union operator (implicit) and the intersection * operator (&&). * The union operator denotes a class that contains every character that is * in at least one of its operand classes. The intersection operator * denotes a class that contains every character that is in both of its * operand classes. * *
The precedence of character-class operators is as follows, from * highest to lowest: * *
* **
* 1 *Literal escape *\x * 2 *Grouping *[...] * 3 *Range *a-z * 4 *Union *[a-e][i-u] * 5 *Intersection *[a-z&&[aeiou]] Note that a different set of metacharacters are in effect inside * a character class than outside a character class. For instance, the * regular expression . loses its special meaning inside a * character class, while the expression - becomes a range * forming metacharacter. * * *
Line terminators
* *A line terminator is a one- or two-character sequence that marks * the end of a line of the input character sequence. The following are * recognized as line terminators: * *
* *
*- A newline (line feed) character ('\n'), * *
- A carriage-return character followed immediately by a newline * character ("\r\n"), * *
- A standalone carriage-return character ('\r'), * *
- A next-line character ('\u0085'), * *
- A line-separator character ('\u2028'), or * *
- A paragraph-separator character ('\u2029'). * *
If {@link #UNIX_LINES} mode is activated, then the only line terminators * recognized are newline characters. * *
The regular expression . matches any character except a line * terminator unless the {@link #DOTALL} flag is specified. * *
By default, the regular expressions ^ and $ ignore * line terminators and only match at the beginning and the end, respectively, * of the entire input sequence. If {@link #MULTILINE} mode is activated then * ^ matches at the beginning of input and after any line terminator * except at the end of input. When in {@link #MULTILINE} mode $ * matches just before a line terminator or the end of the input sequence. * * *
Groups and capturing
* * *Group number
*Capturing groups are numbered by counting their opening parentheses from * left to right. In the expression ((A)(B(C))), for example, there * are four such groups:
* ** **
* 1 *((A)(B(C))) * 2 *(A) * 3 *(B(C)) * 4 *(C) Group zero always stands for the entire expression. * *
Capturing groups are so named because, during a match, each subsequence * of the input sequence that matches such a group is saved. The captured * subsequence may be used later in the expression, via a back reference, and * may also be retrieved from the matcher once the match operation is complete. * * *
Group name
*The constructs and APIs are available since API level 26. A capturing group * can also be assigned a "name", a named-capturing group, * and then be back-referenced later by the "name". Group names are composed of * the following characters. The first character must be a letter. * *
*
* *- The uppercase letters 'A' through 'Z' * ('\u0041' through '\u005a'), *
- The lowercase letters 'a' through 'z' * ('\u0061' through '\u007a'), *
- The digits '0' through '9' * ('\u0030' through '\u0039'), *
A named-capturing group is still numbered as described in * Group number. * *
The captured input associated with a group is always the subsequence * that the group most recently matched. If a group is evaluated a second time * because of quantification then its previously-captured value, if any, will * be retained if the second evaluation fails. Matching the string * "aba" against the expression (a(b)?)+, for example, leaves * group two set to "b". All captured input is discarded at the * beginning of each match. * *
Groups beginning with (? are either pure, non-capturing groups * that do not capture text and do not count towards the group total, or * named-capturing group. * *
Unicode support
* *This class is in conformance with Level 1 of Unicode Technical * Standard #18: Unicode Regular Expression, plus RL2.1 * Canonical Equivalents. *
* Unicode escape sequences such as \u2014 in Java source code * are processed as described in section 3.3 of * The Java™ Language Specification. * Such escape sequences are also implemented directly by the regular-expression * parser so that Unicode escapes can be used in expressions that are read from * files or from the keyboard. Thus the strings "\u2014" and * "\\u2014", while not equal, compile into the same pattern, which * matches the character with hexadecimal value 0x2014. *
* A Unicode character can also be represented in a regular-expression by * using its Hex notation(hexadecimal code point value) directly as described in construct * \x{...}, for example a supplementary character U+2011F * can be specified as \x{2011F}, instead of two consecutive * Unicode escape sequences of the surrogate pair * \uD840\uDD1F. *
* Unicode scripts, blocks, categories and binary properties are written with * the \p and \P constructs as in Perl. * \p{prop} matches if * the input has the property prop, while \P{prop} * does not match if the input has that property. *
* Scripts, blocks, categories and binary properties can be used both inside * and outside of a character class. * *
* Scripts are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. *
* The script names supported by
Pattern
are the valid script names * accepted and defined by * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. * ** Blocks are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. *
* The block names supported by
Pattern
are the valid block names * accepted and defined by * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. ** * Categories may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. *
* The supported categories are those of * * The Unicode Standard in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. *
* * Binary properties are specified with the prefix {@code Is}, as in * {@code IsAlphabetic}. The supported binary properties by
Pattern
* are **
*- Alphabetic *
- Ideographic *
- Letter *
- Lowercase *
- Uppercase *
- Titlecase *
- Punctuation *
- Control *
- White_Space *
- Digit *
- Hex_Digit *
- Noncharacter_Code_Point *
- Assigned *
* Predefined Character classes and POSIX character classes are in * conformance with the recommendation of Annex C: Compatibility Properties * of Unicode Regular Expression * . *
*
*
** *Classes *Matches ** \p{Lower} *A lowercase character:\p{IsLowercase} * \p{Upper} *An uppercase character:\p{IsUppercase} * \p{ASCII} *All ASCII:[\x00-\x7F] * \p{Alpha} *An alphabetic character:\p{IsAlphabetic} * \p{Digit} *A decimal digit character:p{IsDigit} * \p{Alnum} *An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}] * \p{Punct} *A punctuation character:p{IsPunctuation} * \p{Graph} *A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}] * \p{Print} *A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]] * \p{Blank} *A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]] * \p{Cntrl} *A control character: \p{gc=Cc} * \p{XDigit} *A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}] * \p{Space} *A whitespace character:\p{IsWhite_Space} * \d *A digit: \p{IsDigit} * \D *A non-digit: [^\d] * \s *A whitespace character: \p{IsWhite_Space} * \S *A non-whitespace character: [^\s] * \w *A word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}] * \W *A non-word character: [^\w] * * Categories that behave like the java.lang.Character * boolean ismethodname methods (except for the deprecated ones) are * available through the same \p{prop} syntax where * the specified property has the name javamethodname. * *
Comparison to Perl 5
* *The
Pattern
engine performs traditional NFA-based matching * with ordered alternation as occurs in Perl 5. * *Perl constructs not supported by this class:
* **
* *- * *
Predefined character classes (Unicode character) *
\h A horizontal whitespace *
\H A non horizontal whitespace *
\v A vertical whitespace *
\V A non vertical whitespace *
\R Any Unicode linebreak sequence * \u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029] *
\X Match Unicode * * extended grapheme cluster *
- * *
The backreference constructs, \g{n} for * the nthcapturing group and * \g{name} for * named-capturing group. *
- * *
The named character construct, \N{name} * for a Unicode character by its name. *
- * *
The conditional constructs * (?(condition)X) and * (?(condition)X|Y), *
- * *
The embedded code constructs (?{code}) * and (??{code}),
- * *
The embedded comment syntax (?#comment), and
- * *
The preprocessing operations \l \u, * \L, and \U.
Constructs supported by this class but not by Perl:
* ** *
* *- * *
Character-class union and intersection as described * above.
Notable differences from Perl:
* ** *
* * *- * *
In Perl, \1 through \9 are always interpreted * as back references; a backslash-escaped number greater than 9 is * treated as a back reference if at least that many subexpressions exist, * otherwise it is interpreted, if possible, as an octal escape. In this * class octal escapes must always begin with a zero. In this class, * \1 through \9 are always interpreted as back * references, and a larger number is accepted as a back reference if at * least that many subexpressions exist at that point in the regular * expression, otherwise the parser will drop digits until the number is * smaller or equal to the existing number of groups or it is one digit. *
- * *
Perl uses the g flag to request a match that resumes * where the last match left off. This functionality is provided implicitly * by the {@link Matcher} class: Repeated invocations of the {@link * Matcher#find find} method will resume where the last match left off, * unless the matcher is reset.
- * *
In Perl, embedded flags at the top level of an expression affect * the whole expression. In this class, embedded flags always take effect * at the point at which they appear, whether they are at the top level or * within a group; in the latter case, flags are restored at the end of the * group just as in Perl.
For a more precise description of the behavior of regular expression * constructs, please see * Mastering Regular Expressions, 3rd Edition, Jeffrey E. F. Friedl, * O'Reilly and Associates, 2006. *
* * @see java.lang.String#split(String, int) * @see java.lang.String#split(String) * * @author Mike McCloskey * @author Mark Reinhold * @author JSR-51 Expert Group * @since 1.4 * @spec JSR-51 */ public final class Pattern implements java.io.Serializable { /** * Regular expression modifier values. Instead of being passed as * arguments, they can also be passed as inline modifiers. * For example, the following statements have the same effect. ** RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M); * RegExp r2 = RegExp.compile("(?im)abc", 0); ** * The flags are duplicated so that the familiar Perl match flag * names are available. */ /** * Enables Unix lines mode. * *In this mode, only the '\n' line terminator is recognized * in the behavior of ., ^, and $. * *
Unix lines mode can also be enabled via the embedded flag * expression (?d). */ public static final int UNIX_LINES = 0x01; /** * Enables case-insensitive matching. * *
By default, case-insensitive matching assumes that only characters * in the US-ASCII charset are being matched. Unicode-aware * case-insensitive matching can be enabled by specifying the {@link * #UNICODE_CASE} flag in conjunction with this flag. * *
Case-insensitive matching can also be enabled via the embedded flag * expression (?i). * *
Specifying this flag may impose a slight performance penalty.
*/ public static final int CASE_INSENSITIVE = 0x02; /** * Permits whitespace and comments in pattern. * *In this mode, whitespace is ignored, and embedded comments starting * with # are ignored until the end of a line. * *
Comments mode can also be enabled via the embedded flag * expression (?x). */ public static final int COMMENTS = 0x04; /** * Enables multiline mode. * *
In multiline mode the expressions ^ and $ match * just after or just before, respectively, a line terminator or the end of * the input sequence. By default these expressions only match at the * beginning and the end of the entire input sequence. * *
Multiline mode can also be enabled via the embedded flag * expression (?m).
*/ public static final int MULTILINE = 0x08; /** * Enables literal parsing of the pattern. * *When this flag is specified then the input string that specifies * the pattern is treated as a sequence of literal characters. * Metacharacters or escape sequences in the input sequence will be * given no special meaning. * *
The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on * matching when used in conjunction with this flag. The other flags * become superfluous. * *
There is no embedded flag character for enabling literal parsing. * @since 1.5 */ public static final int LITERAL = 0x10; /** * Enables dotall mode. * *
In dotall mode, the expression . matches any character, * including a line terminator. By default this expression does not match * line terminators. * *
Dotall mode can also be enabled via the embedded flag * expression (?s). (The s is a mnemonic for * "single-line" mode, which is what this is called in Perl.)
*/ public static final int DOTALL = 0x20; /** * Enables Unicode-aware case folding. * *When this flag is specified then case-insensitive matching, when * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner * consistent with the Unicode Standard. By default, case-insensitive * matching assumes that only characters in the US-ASCII charset are being * matched. * *
Unicode-aware case folding can also be enabled via the embedded flag * expression (?u). * *
Specifying this flag may impose a performance penalty.
*/ public static final int UNICODE_CASE = 0x40; /** * Enables canonical equivalence. * *When this flag is specified then two characters will be considered * to match if, and only if, their full canonical decompositions match. * The expression "a\u030A", for example, will match the * string "\u00E5" when this flag is specified. By default, * matching does not take canonical equivalence into account. * *
There is no embedded flag character for enabling canonical * equivalence. * *
Specifying this flag may impose a performance penalty.
*/
    public static final int CANON_EQ = 0x80;

    /**
     * Enables the Unicode version of Predefined character classes and
     * POSIX character classes as defined by Unicode Technical
     * Standard #18: Unicode Regular Expression
     * Annex C: Compatibility Properties.
     *
     * This flag has no effect on Android, unicode character classes are always
     * used.
     *
     * @since 1.7
     */
    public static final int UNICODE_CHARACTER_CLASS = 0x100;

    /* Pattern has only two serialized components: The pattern string
     * and the flags, which are all that is needed to recompile the pattern
     * when it is deserialized.
     */

    /** use serialVersionUID from Merlin b59 for interoperability */
    private static final long serialVersionUID = 5073258162644648461L;

    /**
     * The original regular-expression pattern string.
     *
     * @serial
     */
    private final String pattern;

    /**
     * The original pattern flags.
     *
     * @serial
     */
    private final int flags;

    /**
     * Declared transient, so it is not part of the serialized form.
     * NOTE(review): presumably the handle of the natively compiled pattern;
     * confirm where it is assigned.
     */
    @ReachabilitySensitive
    transient long address;

    /**
     * Allocation registry built from this class's class loader,
     * {@code getNativeFinalizer()} and {@code nativeSize()}.
     * NOTE(review): presumably used to release the native peer referenced by
     * {@link #address}; confirm at the allocation site.
     */
    private static final NativeAllocationRegistry registry = new NativeAllocationRegistry(
            Pattern.class.getClassLoader(), getNativeFinalizer(), nativeSize());

    /**
     * Compiles the given regular expression into a pattern.
* * @param regex * The expression to be compiled * * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static Pattern compile(String regex) { return new Pattern(regex, 0); } /** * Compiles the given regular expression into a pattern with the given * flags. * * @param regex * The expression to be compiled * * @param flags * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} * and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in flags * * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static Pattern compile(String regex, int flags) throws PatternSyntaxException { return new Pattern(regex, flags); } /** * Returns the regular expression from which this pattern was compiled. * * * @return The source of this pattern */ public String pattern() { return pattern; } /** *Returns the string representation of this pattern. This * is the regular expression from which this pattern was * compiled.
* * @return The string representation of this pattern * @since 1.5 */ public String toString() { return pattern; } /** * Creates a matcher that will match the given input against this pattern. * * * @param input * The character sequence to be matched * * @return A new matcher for this pattern */ public Matcher matcher(CharSequence input) { Matcher m = new Matcher(this, input); return m; } /** * Returns this pattern's match flags. * * @return The match flags specified when this pattern was compiled */ public int flags() { return flags; } /** * Compiles the given regular expression and attempts to match the given * input against it. * *An invocation of this convenience method of the form * *
* <blockquote><pre>
 * Pattern.matches(regex, input);</pre></blockquote>
 *
 * behaves in exactly the same way as the expression
 *
 * <blockquote><pre>
 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
 *
 * <p> If a pattern is to be used multiple times, compiling it once and reusing
 * it will be more efficient than invoking this method each time.
*
     * @param  regex
     *         The expression to be compiled
     *
     * @param  input
     *         The character sequence to be matched
     *
     * @return  whether or not the regular expression matches on the input
     *
     * @throws  PatternSyntaxException
     *          If the expression's syntax is invalid
     */
    public static boolean matches(String regex, CharSequence input) {
        // Compile, bind to the input, and require the whole input to match.
        return Pattern.compile(regex).matcher(input).matches();
    }

    /**
     * Splits the given input sequence around matches of this pattern.
     *
     * <p> The array returned by this method contains each substring of the
     * input sequence that is terminated by another subsequence that matches
     * this pattern or is terminated by the end of the input sequence. The
     * substrings in the array are in the order in which they occur in the
     * input. If this pattern does not match any subsequence of the input then
     * the resulting array has just one element, namely the input sequence in
     * string form.
     *
     *
The limit parameter controls the number of times the * pattern is applied and therefore affects the length of the resulting * array. If the limit n is greater than zero then the pattern * will be applied at most n - 1 times, the array's * length will be no greater than n, and the array's last entry * will contain all input beyond the last matched delimiter. If n * is non-positive then the pattern will be applied as many times as * possible and the array can have any length. If n is zero then * the pattern will be applied as many times as possible, the array can * have any length, and trailing empty strings will be discarded. * *
The input "boo:and:foo", for example, yields the following * results with these parameters: * *
* * * @param input * The character sequence to be split * * @param limit * The result threshold, as described above * * @return The array of strings computed by splitting the input * around matches of this pattern */ public String[] split(CharSequence input, int limit) { String[] fast = fastSplit(pattern, input.toString(), limit); if (fast != null) { return fast; } int index = 0; boolean matchLimited = limit > 0; ArrayList*
* * Regex
* Limit
Result
* : *2 *{ "boo", "and:foo" } * : *5 *{ "boo", "and", "foo" } * : *-2 *{ "boo", "and", "foo" } * o *5 *{ "b", "", ":and:f", "", "" } * o *-2 *{ "b", "", ":and:f", "", "" } * o *0 *{ "b", "", ":and:f" } matchList = new ArrayList<>(); Matcher m = matcher(input); // Add segments before each match found while(m.find()) { if (!matchLimited || matchList.size() < limit - 1) { String match = input.subSequence(index, m.start()).toString(); matchList.add(match); index = m.end(); } else if (matchList.size() == limit - 1) { // last one String match = input.subSequence(index, input.length()).toString(); matchList.add(match); index = m.end(); } } // If no match was found, return this if (index == 0) return new String[] {input.toString()}; // Add remaining segment if (!matchLimited || matchList.size() < limit) matchList.add(input.subSequence(index, input.length()).toString()); // Construct result int resultSize = matchList.size(); if (limit == 0) while (resultSize > 0 && matchList.get(resultSize-1).equals("")) resultSize--; String[] result = new String[resultSize]; return matchList.subList(0, resultSize).toArray(result); } private static final String FASTSPLIT_METACHARACTERS = "\\?*+[](){}^$.|"; /** * Returns a result equivalent to {@code s.split(separator, limit)} if it's able * to compute it more cheaply than native impl, or null if the caller should fall back to * using native impl. * * fastpath will work if the regex is a * (1)one-char String and this character is not one of the * RegEx's meta characters ".$|()[{^?*+\\", or * (2)two-char String and the first char is the backslash and * the second is one of regEx's meta characters ".$|()[{^?*+\\". * @hide */ public static String[] fastSplit(String re, String input, int limit) { // Can we do it cheaply? int len = re.length(); if (len == 0) { return null; } char ch = re.charAt(0); if (len == 1 && FASTSPLIT_METACHARACTERS.indexOf(ch) == -1) { // We're looking for a single non-metacharacter. Easy. 
} else if (len == 2 && ch == '\\') { // We're looking for a quoted character. // Quoted metacharacters are effectively single non-metacharacters. ch = re.charAt(1); if (FASTSPLIT_METACHARACTERS.indexOf(ch) == -1) { return null; } } else { return null; } // We can do this cheaply... // Unlike Perl, which considers the result of splitting the empty string to be the empty // array, Java returns an array containing the empty string. if (input.isEmpty()) { return new String[] { "" }; } // Count separators int separatorCount = 0; int begin = 0; int end; while (separatorCount + 1 != limit && (end = input.indexOf(ch, begin)) != -1) { ++separatorCount; begin = end + 1; } int lastPartEnd = input.length(); if (limit == 0 && begin == lastPartEnd) { // Last part is empty for limit == 0, remove all trailing empty matches. if (separatorCount == lastPartEnd) { // Input contains only separators. return EmptyArray.STRING; } // Find the beginning of trailing separators. do { --begin; } while (input.charAt(begin - 1) == ch); // Reduce separatorCount and fix lastPartEnd. separatorCount -= input.length() - begin; lastPartEnd = begin; } // Collect the result parts. String[] result = new String[separatorCount + 1]; begin = 0; for (int i = 0; i != separatorCount; ++i) { end = input.indexOf(ch, begin); result[i] = input.substring(begin, end); begin = end + 1; } // Add last part. result[separatorCount] = input.substring(begin, lastPartEnd); return result; } /** * Splits the given input sequence around matches of this pattern. * * This method works as if by invoking the two-argument {@link * #split(java.lang.CharSequence, int) split} method with the given input * sequence and a limit argument of zero. Trailing empty strings are * therefore not included in the resulting array.
* *The input "boo:and:foo", for example, yields the following * results with these expressions: * *
* * * @param input * The character sequence to be split * * @return The array of strings computed by splitting the input * around matches of this pattern */ public String[] split(CharSequence input) { return split(input, 0); } /** * Returns a literal pattern*
* * Regex
Result
* : *{ "boo", "and", "foo" } * o *{ "b", "", ":and:f" } String
for the specified *String
. * *This method produces a
Metacharacters * or escape sequences in the input sequence will be given no special * meaning. * * @param s The string to be literalized * @return A literal string replacement * @since 1.5 */ public static String quote(String s) { int slashEIndex = s.indexOf("\\E"); if (slashEIndex == -1) return "\\Q" + s + "\\E"; StringBuilder sb = new StringBuilder(s.length() * 2); sb.append("\\Q"); slashEIndex = 0; int current = 0; while ((slashEIndex = s.indexOf("\\E", current)) != -1) { sb.append(s.substring(current, slashEIndex)); current = slashEIndex + 2; sb.append("\\E\\\\E\\Q"); } sb.append(s.substring(current, s.length())); sb.append("\\E"); return sb.toString(); } /** * Recompile the Pattern instance from a stream. The original pattern * string is read in and the object tree is recompiled from it. */ private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { // Read in all fields s.defaultReadObject(); compile(); } /** * This private constructor is used to create all Patterns. The pattern * string and match flags are all that is needed to completely describe * a Pattern. */ private Pattern(String p, int f) { if ((f & CANON_EQ) != 0) { throw new UnsupportedOperationException("CANON_EQ flag not supported"); } int supportedFlags = CASE_INSENSITIVE | COMMENTS | DOTALL | LITERAL | MULTILINE | UNICODE_CASE | UNIX_LINES; if ((f & ~supportedFlags) != 0) { throw new IllegalArgumentException("Unsupported flags: " + (f & ~supportedFlags)); } this.pattern = p; this.flags = f; compile(); } private void compile() throws PatternSyntaxException { if (pattern == null) { throw new NullPointerException("pattern == null"); } String icuPattern = pattern; if ((flags & LITERAL) != 0) { icuPattern = quote(pattern); } // These are the flags natively supported by ICU. // They even have the same value in native code. 
int icuFlags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES); address = compileImpl(icuPattern, icuFlags); registry.registerNativeAllocation(this, address); } private static native long compileImpl(String regex, int flags); private static native long getNativeFinalizer(); private static native int nativeSize(); /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */ public PredicateString
that can be used to * create aPattern
that would match the string *s
as if it were a literal pattern.asPredicate() { return s -> matcher(s).find(); } /** * Creates a stream from the given input sequence around matches of this * pattern. * * The stream returned by this method contains each substring of the * input sequence that is terminated by another subsequence that matches * this pattern or is terminated by the end of the input sequence. The * substrings in the stream are in the order in which they occur in the * input. Trailing empty strings will be discarded and not encountered in * the stream. * *
* <p> If this pattern does not match any subsequence of the input then
     * the resulting stream has just one element, namely the input sequence in
     * string form. In particular, an empty input sequence yields a stream
     * containing exactly one empty string, as {@link #split(CharSequence)}
     * does.
     *
     * <p> When there is a positive-width match at the beginning of the input
     * sequence then an empty leading substring is included at the beginning
     * of the stream. A zero-width match at the beginning however never produces
     * such empty leading substring.
     *
     * <p> If the input sequence is mutable, it must remain constant during the
     * execution of the terminal stream operation.  Otherwise, the result of the
     * terminal stream operation is undefined.
     *
     * @param   input
     *          The character sequence to be split
     *
     * @return  The stream of strings computed by splitting the input
     *          around matches of this pattern
     * @see     #split(CharSequence)
     * @since   1.8
     */
    public Stream<String> splitAsStream(final CharSequence input) {
        class MatcherIterator implements Iterator<String> {
            private final Matcher matcher;
            // The start position of the next sub-sequence of input
            // when current == input.length there are no more elements
            private int current;
            // null if the next element, if any, needs to obtained
            private String nextElement;
            // > 0 if there are N next empty elements
            private int emptyElementCount;

            MatcherIterator() {
                this.matcher = matcher(input);
                // For an empty input, current == input.length() from the
                // start, so hasNext() would report no elements at all. Seed
                // one pending empty element so the stream yields the input.
                this.emptyElementCount = input.length() == 0 ? 1 : 0;
            }

            @Override
            public String next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }

                if (emptyElementCount == 0) {
                    String n = nextElement;
                    nextElement = null;
                    return n;
                } else {
                    // Pending empty elements are emitted before nextElement.
                    emptyElementCount--;
                    return "";
                }
            }

            @Override
            public boolean hasNext() {
                if (nextElement != null || emptyElementCount > 0) {
                    return true;
                }

                if (current == input.length()) {
                    return false;
                }

                // Consume the next matching element
                // Count sequence of matching empty elements
                while (matcher.find()) {
                    nextElement = input.subSequence(current, matcher.start()).toString();
                    current = matcher.end();
                    if (!nextElement.isEmpty()) {
                        return true;
                    } else if (current > 0) { // no empty leading substring for zero-width
                                              // match at the beginning of the input
                        emptyElementCount++;
                    }
                }

                // Consume last matching element
                nextElement = input.subSequence(current, input.length()).toString();
                current = input.length();
                if (!nextElement.isEmpty()) {
                    return true;
                } else {
                    // Ignore a terminal sequence of matching empty elements
                    emptyElementCount = 0;
                    nextElement = null;
                    return false;
                }
            }
        }
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
                new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false);
    }
}