org.apache.xerces.impl.xpath.regex.RegularExpression Maven / Gradle / Ivy
Show all versions of xercesImpl Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.xerces.impl.xpath.regex;
import java.text.CharacterIterator;
import java.util.Locale;
import java.util.Stack;
import org.apache.xerces.util.IntStack;
/**
* A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
* This engine does not conform to the POSIX regular expression.
*
*
* How to use
*
*
* - A. Standard way
*
-
*
* RegularExpression re = new RegularExpression(regex);
* if (re.matches(text)) { ... }
*
*
* - B. Capturing groups
*
-
*
* RegularExpression re = new RegularExpression(regex);
* Match match = new Match();
* if (re.matches(text, match)) {
* ... // You can refer captured texts with methods of the Match
class.
* }
*
*
*
*
* Case-insensitive matching
*
* RegularExpression re = new RegularExpression(regex, "i");
* if (re.matches(text) >= 0) { ...}
*
*
* Options
* You can specify options to RegularExpression(
regex,
options)
* or setPattern(
regex,
options)
.
* This options parameter consists of the following characters.
*
*
* "i"
* - This option indicates case-insensitive matching.
*
"m"
* - ^ and $ consider the EOL characters within the text.
*
"s"
* - . matches any one character.
*
"u"
* - Redefines \d \D \w \W \s \S \b \B \< \> as becoming to Unicode.
*
"w"
* - By this option, \b \B \< \> are processed with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
* When "w" and "u" are specified at the same time,
* \b \B \< \> are processed for the "w" option.
*
","
* - The parser treats a comma in a character class as a range separator.
* [a,b] matches a or , or b without this option.
* [a,b] matches a or b with this option.
*
*
"X"
* -
* By this option, the engine confoms to XML Schema: Regular Expression.
* The
match()
method does not do subsring matching
* but entire string matching.
*
*
*
*
* Syntax
*
*
*
* Differences from the Perl 5 regular expression
*
* - There is 6-digit hexadecimal character representation (\u005cvHHHHHH.)
*
- Supports subtraction, union, and intersection operations for character classes.
*
- Not supported: \ooo (Octal character representations),
* \G, \C, \lc,
* \u005c uc, \L, \U,
* \E, \Q, \N{name},
* (?{code}), (??{code})
*
*
*
*
*
* Meta characters are `. * + ? { [ ( ) | \ ^ $'.
*
* - Character
*
* - . (A period)
*
- Matches any one character except the following characters.
*
- LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
* PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
*
- This expression matches one code point in Unicode. It can match a pair of surrogates.
*
- When the "s" option is specified,
* it matches any character including the above four characters.
*
*
- \e \f \n \r \t
*
- Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
* CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
*
*
- \cC
*
- Matches a control character.
* The C must be one of '@', 'A'-'Z',
* '[', '\u005c', ']', '^', '_'.
* It matches a control character of which the character code is less than
* the character code of the C by 0x0040.
*
- For example, a \cJ matches a LINE FEED (U+000A),
* and a \c[ matches an ESCAPE (U+001B).
*
*
- a non-meta character
*
- Matches the character.
*
*
- \ + a meta character
*
- Matches the meta character.
*
*
- \u005cxHH \u005cx{HHHH}
*
- Matches a character of which code point is HH (Hexadecimal) in Unicode.
* You can write just 2 digits for \u005cxHH, and
* variable length digits for \u005cx{HHHH}.
*
*
*
*
- \u005cvHHHHHH
*
- Matches a character of which code point is HHHHHH (Hexadecimal) in Unicode.
*
*
- \g
*
- Matches a grapheme.
*
- It is equivalent to (?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*
*
*
- \X
*
- Matches a combining character sequence.
* It is equivalent to (?:\PM\pM*)
*
*
*
* - Character class
*
+ * - [R1R2...Rn] (without "," option)
+ *
- [R1,R2,...,Rn] (with "," option)
*
- Positive character class. It matches a character in ranges.
*
- Rn:
*
* - A character (including \e \f \n \r \t \u005cxHH \u005cx{HHHH} \u005cvHHHHHH)
*
This range matches the character.
*
- C1-C2
*
This range matches a character which has a code point that is >= C1's code point and <= C2's code point.
+ *
- A POSIX character class: [:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:],
+ * and negative POSIX character classes in Perl like [:^alpha:]
*
...
*
- \d \D \s \S \w \W \p{name} \P{name}
*
These expressions specifies the same ranges as the following expressions.
*
* Enumerated ranges are merged (union operation).
* [a-ec-z] is equivalent to [a-z]
*
*
- [^R1R2...Rn] (without a "," option)
*
- [^R1,R2,...,Rn] (with a "," option)
*
- Negative character class. It matches a character not in ranges.
*
*
- (?[ranges]op[ranges]op[ranges] ... )
* (op is - or + or &.)
*
- Subtraction or union or intersection for character classes.
*
- For exmaple, (?[A-Z]-[CF]) is equivalent to [A-BD-EG-Z], and (?[0x00-0x7f]-[K]&[\p{Lu}]) is equivalent to [A-JL-Z].
*
- The result of this operations is a positive character class
* even if an expression includes any negative character classes.
* You have to take care on this in case-insensitive matching.
* For instance, (?[^b]) is equivalent to [\x00-ac-\x{10ffff}],
* which is equivalent to [^b] in case-sensitive matching.
* But, in case-insensitive matching, (?[^b]) matches any character because
* it includes 'B' and 'B' matches 'b'
* though [^b] is processed as [^Bb].
*
*
- [R1R2...-[RnRn+1...]] (with an "X" option)
* - Character class subtraction for the XML Schema.
* You can use this syntax when you specify an "X" option.
*
*
- \d
*
- Equivalent to [0-9].
*
- When a "u" option is set, it is equivalent to
* \p{Nd}.
*
*
- \D
*
- Equivalent to [^0-9]
*
- When a "u" option is set, it is equivalent to
* \P{Nd}.
*
*
- \s
*
- Equivalent to [ \f\n\r\t]
*
- When a "u" option is set, it is equivalent to
* [ \f\n\r\t\p{Z}].
*
*
- \S
*
- Equivalent to [^ \f\n\r\t]
*
- When a "u" option is set, it is equivalent to
* [^ \f\n\r\t\p{Z}].
*
*
- \w
*
- Equivalent to [a-zA-Z0-9_]
*
- When a "u" option is set, it is equivalent to
* [\p{Lu}\p{Ll}\p{Lo}\p{Nd}_].
*
*
- \W
*
- Equivalent to [^a-zA-Z0-9_]
*
- When a "u" option is set, it is equivalent to
* [^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_].
*
*
- \p{name}
*
- Matches one character in the specified General Category (the second field in UnicodeData.txt) or the specified Block.
* The following names are available:
*
* - Unicode General Categories:
*
-
* L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
* Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
*
*
- (Currently the Cn category includes U+10000-U+10FFFF characters)
*
- Unicode Blocks:
*
-
* Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
* IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
* Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
* Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
* Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
* Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
* Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
* Miscellaneous Technical, Control Pictures, Optical Character Recognition,
* Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
* Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
* Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
* Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
* Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
* Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
* Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
* Small Form Variants, Arabic Presentation Forms-B, Specials,
* Halfwidth and Fullwidth Forms
*
*
- Others:
*
- ALL (Equivalent to [\u005cu0000-\u005cv10FFFF])
*
- ASSGINED (\p{ASSIGNED} is equivalent to \P{Cn})
*
- UNASSGINED
* (\p{UNASSIGNED} is equivalent to \p{Cn})
*
*
* - \P{name}
*
- Matches one character not in the specified General Category or the specified Block.
*
*
*
* - Selection and Quantifier
*
* - X|Y
*
- ...
*
*
- X*
*
- Matches 0 or more X.
*
*
- X+
*
- Matches 1 or more X.
*
*
- X?
*
- Matches 0 or 1 X.
*
*
- X{number}
*
- Matches number times.
*
*
- X{min,}
*
- ...
*
*
- X{min,max}
*
- ...
*
*
- X*?
*
- X+?
*
- X??
*
- X{min,}?
*
- X{min,max}?
*
- Non-greedy matching.
*
*
*
* - Grouping, Capturing, and Back-reference
*
* - (?:X)
*
- Grouping. "foo+" matches "foo" or "foooo".
* If you want it matches "foofoo" or "foofoofoo",
* you have to write "(?:foo)+".
*
*
- (X)
*
- Grouping with capturing.
* It make a group and applications can know
* where in target text a group matched with methods of a
Match
instance
* after matches(String,Match)
.
* The 0th group means whole of this regular expression.
* The Nth gorup is the inside of the Nth left parenthesis.
*
* For instance, a regular expression is
* " *([^<:]*) +<([^>]*)> *"
* and target text is
* "From: TAMURA Kent <[email protected]>":
*
* Match.getCapturedText(0)
:
* " TAMURA Kent <[email protected]>"
* Match.getCapturedText(1)
: "TAMURA Kent"
* Match.getCapturedText(2)
: "[email protected]"
*
*
* - \1 \2 \3 \4 \5 \6 \7 \8 \9
*
-
*
*
- (?>X)
*
- Independent expression group. ................
*
*
- (?options:X)
*
- (?options-options2:X)
*
- ............................
*
- The options or the options2 consists of 'i' 'm' 's' 'w'.
* Note that it can not contain 'u'.
*
*
- (?options)
*
- (?options-options2)
*
- ......
*
- These expressions must be at the beginning of a group.
*
*
*
* - Anchor
*
* - \A
*
- Matches the beginnig of the text.
*
*
- \Z
*
- Matches the end of the text, or before an EOL character at the end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
*
*
- \z
*
- Matches the end of the text.
*
*
- ^
*
- Matches the beginning of the text. It is equivalent to \A.
*
- When a "m" option is set,
* it matches the beginning of the text, or after one of EOL characters (
* LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
* PARAGRAPH SEPARATOR (U+2029).)
*
*
- $
*
- Matches the end of the text, or before an EOL character at the end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
*
- When a "m" option is set,
* it matches the end of the text, or before an EOL character.
*
*
- \b
*
- Matches word boundary.
* (See a "w" option)
*
*
- \B
*
- Matches non word boundary.
* (See a "w" option)
*
*
- \<
*
- Matches the beginning of a word.
* (See a "w" option)
*
*
- \>
*
- Matches the end of a word.
* (See a "w" option)
*
*
* - Lookahead and lookbehind
*
* - (?=X)
*
- Lookahead.
*
*
- (?!X)
*
- Negative lookahead.
*
*
- (?<=X)
*
- Lookbehind.
*
- (Note for text capturing......)
*
*
- (?<!X)
*
- Negative lookbehind.
*
*
*
* - Misc.
*
* - (?(condition)yes-pattern|no-pattern),
*
- (?(condition)yes-pattern)
*
- ......
*
- (?#comment)
*
- Comment. A comment string consists of characters except ')'.
* You can not write comments in character classes and before quantifiers.
*
*
*
*
*
*
* BNF for the regular expression
*
* regex ::= ('(?' options ')')? term ('|' term)*
* term ::= factor+
* factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
* | '(?#' [^)]* ')'
* minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
* | '(?>' regex ')' | '(?' options ':' regex ')'
* | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
* options ::= [imsw]* ('-' [imsw]+)?
* anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
* looks ::= '(?=' regex ')' | '(?!' regex ')'
* | '(?<=' regex ')' | '(?<!' regex ')'
* char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
* category-block ::= '\' [pP] category-symbol-1
* | ('\p{' | '\P{') (category-symbol | block-name
* | other-properties) '}'
* category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
* category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
* | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
* | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
* | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
* | 'Sm' | 'Sc' | 'Sk' | 'So'
* block-name ::= (See above)
* other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
* character-1 ::= (any character except meta-characters)
*
* char-class ::= '[' ranges ']'
* | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
* ranges ::= '^'? (range ','?)+
* range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
* | range-char | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
* code-point ::= '\x' hex-char hex-char
* | '\x{' hex-char+ '}'
* | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
* hex-char ::= [0-9a-fA-F]
* character-2 ::= (any character except \[]-,)
*
*
*
* TODO
*
* - Unicode Regular Expression Guidelines
*
* - 2.4 Canonical Equivalents
*
- Level 3
*
* - Parsing performance
*
*
*
*
* @xerces.internal
*
* @author TAMURA Kent <[email protected]>
* @version $Id: RegularExpression.java 961928 2010-07-08 20:43:46Z knoaman $
*/
public class RegularExpression implements java.io.Serializable {
private static final long serialVersionUID = 6242499334195006401L;
static final boolean DEBUG = false;
/**
* Compiles a token tree into an operation flow.
*/
private synchronized void compile(Token tok) {
if (this.operations != null)
return;
this.numberOfClosures = 0;
this.operations = this.compile(tok, null, false);
}
/**
* Converts a token to an operation.
*/
private Op compile(Token tok, Op next, boolean reverse) {
Op ret;
switch (tok.type) {
case Token.DOT:
ret = Op.createDot();
ret.next = next;
break;
case Token.CHAR:
ret = Op.createChar(tok.getChar());
ret.next = next;
break;
case Token.ANCHOR:
ret = Op.createAnchor(tok.getChar());
ret.next = next;
break;
case Token.RANGE:
case Token.NRANGE:
ret = Op.createRange(tok);
ret.next = next;
break;
case Token.CONCAT:
ret = next;
if (!reverse) {
for (int i = tok.size()-1; i >= 0; i --) {
ret = compile(tok.getChild(i), ret, false);
}
} else {
for (int i = 0; i < tok.size(); i ++) {
ret = compile(tok.getChild(i), ret, true);
}
}
break;
case Token.UNION:
Op.UnionOp uni = Op.createUnion(tok.size());
for (int i = 0; i < tok.size(); i ++) {
uni.addElement(compile(tok.getChild(i), next, reverse));
}
ret = uni; // ret.next is null.
break;
case Token.CLOSURE:
case Token.NONGREEDYCLOSURE:
Token child = tok.getChild(0);
int min = tok.getMin();
int max = tok.getMax();
if (min >= 0 && min == max) { // {n}
ret = next;
for (int i = 0; i < min; i ++) {
ret = compile(child, ret, reverse);
}
break;
}
if (min > 0 && max > 0)
max -= min;
if (max > 0) {
// X{2,6} -> XX(X(X(XX?)?)?)?
ret = next;
for (int i = 0; i < max; i ++) {
Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
q.next = next;
q.setChild(compile(child, ret, reverse));
ret = q;
}
} else {
Op.ChildOp op;
if (tok.type == Token.NONGREEDYCLOSURE) {
op = Op.createNonGreedyClosure();
} else { // Token.CLOSURE
op = Op.createClosure(this.numberOfClosures++);
}
op.next = next;
op.setChild(compile(child, op, reverse));
ret = op;
}
if (min > 0) {
for (int i = 0; i < min; i ++) {
ret = compile(child, ret, reverse);
}
}
break;
case Token.EMPTY:
ret = next;
break;
case Token.STRING:
ret = Op.createString(tok.getString());
ret.next = next;
break;
case Token.BACKREFERENCE:
ret = Op.createBackReference(tok.getReferenceNumber());
ret.next = next;
break;
case Token.PAREN:
if (tok.getParenNumber() == 0) {
ret = compile(tok.getChild(0), next, reverse);
} else if (reverse) {
next = Op.createCapture(tok.getParenNumber(), next);
next = compile(tok.getChild(0), next, reverse);
ret = Op.createCapture(-tok.getParenNumber(), next);
} else {
next = Op.createCapture(-tok.getParenNumber(), next);
next = compile(tok.getChild(0), next, reverse);
ret = Op.createCapture(tok.getParenNumber(), next);
}
break;
case Token.LOOKAHEAD:
ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
break;
case Token.NEGATIVELOOKAHEAD:
ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
break;
case Token.LOOKBEHIND:
ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
break;
case Token.NEGATIVELOOKBEHIND:
ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
break;
case Token.INDEPENDENT:
ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
break;
case Token.MODIFIERGROUP:
ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
((Token.ModifierToken)tok).getOptions(),
((Token.ModifierToken)tok).getOptionsMask());
break;
case Token.CONDITION:
Token.ConditionToken ctok = (Token.ConditionToken)tok;
int ref = ctok.refNumber;
Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
Op yes = compile(ctok.yes, next, reverse);
Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
ret = Op.createCondition(next, ref, condition, yes, no);
break;
default:
throw new RuntimeException("Unknown token type: "+tok.type);
} // switch (tok.type)
return ret;
}
//Public
/**
* Checks whether the target text contains this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(char[] target) {
return this.matches(target, 0, target .length , (Match)null);
}
/**
* Checks whether the target text contains this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @return true if the target is matched to this regular expression.
*/
public boolean matches(char[] target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
/**
* Checks whether the target text contains this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in target; or -1 if not match.
*/
public boolean matches(char[] target, Match match) {
return this.matches(target, 0, target .length , match);
}
/**
* Checks whether the target text contains this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @param match A Match instance for storing matching result.
* @return Offset of the start position in target; or -1 if not match.
*/
public boolean matches(char[] target, int start, int end, Match match) {
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
//System.err.println("DEBUG: matchEnd="+matchEnd);
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this. match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target [ matchStart ] ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this. match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target [matchStart] ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this. match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
/**
* Checks whether the target text contains this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(String target) {
return this.matches(target, 0, target .length() , (Match)null);
}
/**
* Checks whether the target text contains this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @return true if the target is matched to this regular expression.
*/
public boolean matches(String target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
/**
* Checks whether the target text contains this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in target; or -1 if not match.
*/
public boolean matches(String target, Match match) {
return this.matches(target, 0, target .length() , match);
}
/**
* Checks whether the target text contains this pattern
* in specified range or not.
*
* @param start Start offset of the range.
* @param end End offset +1 of the range.
* @param match A Match instance for storing matching result.
* @return Offset of the start position in target; or -1 if not match.
*/
public boolean matches(String target, int start, int end, Match match) {
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
if (DEBUG) {
System.err.println("target string="+target);
}
int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
if (DEBUG) {
System.err.println("matchEnd="+matchEnd);
System.err.println("con.limit="+con.limit);
}
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this.match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .charAt( matchStart ) ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .charAt( matchStart ) ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
/**
* @return -1 when not match; offset of the end of matched string when match.
*/
private int match(Context con, Op op, int offset, int dx, int opts) {
final ExpressionTarget target = con.target;
final Stack opStack = new Stack();
final IntStack dataStack = new IntStack();
final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
int retValue = -1;
boolean returned = false;
for (;;) {
if (op == null || offset > con.limit || offset < con.start) {
if (op == null) {
retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
}
else {
retValue = -1;
}
returned = true;
}
else {
retValue = -1;
// dx value is either 1 or -1
switch (op.type) {
case Op.CHAR:
{
final int o1 = (dx > 0) ? offset : offset -1;
if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
returned = true;
break;
}
offset += dx;
op = op.next;
}
break;
case Op.DOT:
{
int o1 = (dx > 0) ? offset : offset - 1;
if (o1 >= con.limit || o1 < 0) {
returned = true;
break;
}
if (isSet(opts, SINGLE_LINE)) {
if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
o1 += dx;
}
}
else {
int ch = target.charAt(o1);
if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
o1 += dx;
ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
}
if (isEOLChar(ch)) {
returned = true;
break;
}
}
offset = (dx > 0) ? o1 + 1 : o1;
op = op.next;
}
break;
case Op.RANGE:
case Op.NRANGE:
{
int o1 = (dx > 0) ? offset : offset -1;
if (o1 >= con.limit || o1 < 0) {
returned = true;
break;
}
int ch = target.charAt(offset);
if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
o1 += dx;
ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
}
final RangeToken tok = op.getToken();
if (!tok.match(ch)) {
returned = true;
break;
}
offset = (dx > 0) ? o1+1 : o1;
op = op.next;
}
break;
case Op.ANCHOR:
{
if (!matchAnchor(target, op, con, offset, opts)) {
returned = true;
break;
}
op = op.next;
}
break;
case Op.BACKREFERENCE:
{
int refno = op.getData();
if (refno <= 0 || refno >= this.nofparen) {
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
}
if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
returned = true;
break;
}
int o2 = con.match.getBeginning(refno);
int literallen = con.match.getEnd(refno)-o2;
if (dx > 0) {
if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
returned = true;
break;
}
offset += literallen;
}
else {
if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
returned = true;
break;
}
offset -= literallen;
}
op = op.next;
}
break;
case Op.STRING:
{
String literal = op.getString();
int literallen = literal.length();
if (dx > 0) {
if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
returned = true;
break;
}
offset += literallen;
}
else {
if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
returned = true;
break;
}
offset -= literallen;
}
op = op.next;
}
break;
case Op.CLOSURE:
{
// Saves current position to avoid zero-width repeats.
final int id = op.getData();
if (con.closureContexts[id].contains(offset)) {
returned = true;
break;
}
con.closureContexts[id].addOffset(offset);
}
// fall through
case Op.QUESTION:
{
opStack.push(op);
dataStack.push(offset);
op = op.getChild();
}
break;
case Op.NONGREEDYCLOSURE:
case Op.NONGREEDYQUESTION:
{
opStack.push(op);
dataStack.push(offset);
op = op.next;
}
break;
case Op.UNION:
if (op.size() == 0) {
returned = true;
}
else {
opStack.push(op);
dataStack.push(0);
dataStack.push(offset);
op = op.elementAt(0);
}
break;
case Op.CAPTURE:
{
final int refno = op.getData();
if (con.match != null) {
if (refno > 0) {
dataStack.push(con.match.getBeginning(refno));
con.match.setBeginning(refno, offset);
}
else {
final int index = -refno;
dataStack.push(con.match.getEnd(index));
con.match.setEnd(index, offset);
}
opStack.push(op);
dataStack.push(offset);
}
op = op.next;
}
break;
case Op.LOOKAHEAD:
case Op.NEGATIVELOOKAHEAD:
case Op.LOOKBEHIND:
case Op.NEGATIVELOOKBEHIND:
{
opStack.push(op);
dataStack.push(dx);
dataStack.push(offset);
dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
op = op.getChild();
}
break;
case Op.INDEPENDENT:
{
opStack.push(op);
dataStack.push(offset);
op = op.getChild();
}
break;
case Op.MODIFIER:
{
int localopts = opts;
localopts |= op.getData();
localopts &= ~op.getData2();
opStack.push(op);
dataStack.push(opts);
dataStack.push(offset);
opts = localopts;
op = op.getChild();
}
break;
case Op.CONDITION:
{
Op.ConditionOp cop = (Op.ConditionOp)op;
if (cop.refNumber > 0) {
if (cop.refNumber >= this.nofparen) {
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
}
if (con.match.getBeginning(cop.refNumber) >= 0
&& con.match.getEnd(cop.refNumber) >= 0) {
op = cop.yes;
}
else if (cop.no != null) {
op = cop.no;
}
else {
op = cop.next;
}
}
else {
opStack.push(op);
dataStack.push(offset);
op = cop.condition;
}
}
break;
default:
throw new RuntimeException("Unknown operation type: " + op.type);
}
}
// handle recursive operations
while (returned) {
// exhausted all the operations
if (opStack.isEmpty()) {
return retValue;
}
op = (Op) opStack.pop();
offset = dataStack.pop();
switch (op.type) {
case Op.CLOSURE:
case Op.QUESTION:
if (retValue < 0) {
op = op.next;
returned = false;
}
break;
case Op.NONGREEDYCLOSURE:
case Op.NONGREEDYQUESTION:
if (retValue < 0) {
op = op.getChild();
returned = false;
}
break;
case Op.UNION:
{
int unionIndex = dataStack.pop();
if (DEBUG) {
System.err.println("UNION: "+unionIndex+", ret="+retValue);
}
if (retValue < 0) {
if (++unionIndex < op.size()) {
opStack.push(op);
dataStack.push(unionIndex);
dataStack.push(offset);
op = op.elementAt(unionIndex);
returned = false;
}
else {
retValue = -1;
}
}
}
break;
case Op.CAPTURE:
final int refno = op.getData();
final int saved = dataStack.pop();
if (retValue < 0) {
if (refno > 0) {
con.match.setBeginning(refno, saved);
}
else {
con.match.setEnd(-refno, saved);
}
}
break;
case Op.LOOKAHEAD:
case Op.LOOKBEHIND:
{
dx = dataStack.pop();
if (0 <= retValue) {
op = op.next;
returned = false;
}
retValue = -1;
}
break;
case Op.NEGATIVELOOKAHEAD:
case Op.NEGATIVELOOKBEHIND:
{
dx = dataStack.pop();
if (0 > retValue) {
op = op.next;
returned = false;
}
retValue = -1;
}
break;
case Op.MODIFIER:
opts = dataStack.pop();
// fall through
case Op.INDEPENDENT:
if (retValue >= 0) {
offset = retValue;
op = op.next;
returned = false;
}
break;
case Op.CONDITION:
{
final Op.ConditionOp cop = (Op.ConditionOp)op;
if (0 <= retValue) {
op = cop.yes;
}
else if (cop.no != null) {
op = cop.no;
}
else {
op = cop.next;
}
}
returned = false;
break;
default:
break;
}
}
}
}
private boolean matchChar(int ch, int other, boolean ignoreCase) {
return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
}
boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
boolean go = false;
switch (op.getData()) {
case '^':
if (isSet(opts, MULTIPLE_LINES)) {
if (!(offset == con.start
|| offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1))))
return false;
} else {
if (offset != con.start)
return false;
}
break;
case '@': // Internal use only.
// The @ always matches line beginnings.
if (!(offset == con.start
|| offset > con.start && isEOLChar(target.charAt(offset-1))))
return false;
break;
case '$':
if (isSet(opts, MULTIPLE_LINES)) {
if (!(offset == con.limit
|| offset < con.limit && isEOLChar(target.charAt(offset))))
return false;
} else {
if (!(offset == con.limit
|| offset+1 == con.limit && isEOLChar(target.charAt(offset))
|| offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
&& target.charAt(offset+1) == LINE_FEED))
return false;
}
break;
case 'A':
if (offset != con.start) return false;
break;
case 'Z':
if (!(offset == con.limit
|| offset+1 == con.limit && isEOLChar(target.charAt(offset))
|| offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN
&& target.charAt(offset+1) == LINE_FEED))
return false;
break;
case 'z':
if (offset != con.limit) return false;
break;
case 'b':
if (con.length == 0)
return false;
{
int after = getWordType(target, con.start, con.limit, offset, opts);
if (after == WT_IGNORE) return false;
int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
if (after == before) return false;
}
break;
case 'B':
if (con.length == 0)
go = true;
else {
int after = getWordType(target, con.start, con.limit, offset, opts);
go = after == WT_IGNORE
|| after == getPreviousWordType(target, con.start, con.limit, offset, opts);
}
if (!go) return false;
break;
case '<':
if (con.length == 0 || offset == con.limit) return false;
if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
return false;
break;
case '>':
if (con.length == 0 || offset == con.start) return false;
if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
return false;
break;
} // switch anchor type
return true;
}
private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
int offset, int opts) {
int ret = getWordType(target, begin, end, --offset, opts);
while (ret == WT_IGNORE)
ret = getWordType(target, begin, end, --offset, opts);
return ret;
}
private static final int getWordType(ExpressionTarget target, int begin, int end,
int offset, int opts) {
if (offset < begin || offset >= end) return WT_OTHER;
return getWordType0(target.charAt(offset) , opts);
}
/**
* Checks whether the target text contains this pattern or not.
*
* @return true if the target is matched to this regular expression.
*/
public boolean matches(CharacterIterator target) {
return this.matches(target, (Match)null);
}
/**
* Checks whether the target text contains this pattern or not.
*
* @param match A Match instance for storing matching result.
* @return Offset of the start position in target; or -1 if not match.
*/
public boolean matches(CharacterIterator target, Match match) {
int start = target.getBeginIndex();
int end = target.getEndIndex();
synchronized (this) {
if (this.operations == null)
this.prepare();
if (this.context == null)
this.context = new Context();
}
Context con = null;
synchronized (this.context) {
con = this.context.inuse ? new Context() : this.context;
con.reset(target, start, end, this.numberOfClosures);
}
if (match != null) {
match.setNumberOfGroups(this.nofparen);
match.setSource(target);
} else if (this.hasBackReferences) {
match = new Match();
match.setNumberOfGroups(this.nofparen);
// Need not to call setSource() because
// a caller can not access this match instance.
}
con.match = match;
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
//System.err.println("DEBUG: matchEnd="+matchEnd);
if (matchEnd == con.limit) {
if (con.match != null) {
con.match.setBeginning(0, con.start);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
}
return false;
}
/*
* The pattern has only fixed string.
* The engine uses Boyer-Moore.
*/
if (this.fixedStringOnly) {
//System.err.println("DEBUG: fixed-only: "+this.fixedString);
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o >= 0) {
if (con.match != null) {
con.match.setBeginning(0, o);
con.match.setEnd(0, o+this.fixedString.length());
}
con.setInUse(false);
return true;
}
con.setInUse(false);
return false;
}
/*
* The pattern contains a fixed string.
* The engine checks with Boyer-Moore whether the text contains the fixed string or not.
* If not, it return with false.
*/
if (this.fixedString != null) {
int o = this.fixedStringTable.matches(target, con.start, con.limit);
if (o < 0) {
//System.err.println("Non-match in fixed-string search.");
con.setInUse(false);
return false;
}
}
int limit = con.limit-this.minlength;
int matchStart;
int matchEnd = -1;
/*
* Checks whether the expression starts with ".*".
*/
if (this.operations != null
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this.match(con, this.operations, con.start, 1, this.options);
} else {
boolean previousIsEOL = true;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .setIndex( matchStart ) ;
if (isEOLChar(ch)) {
previousIsEOL = true;
} else {
if (previousIsEOL) {
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options)))
break;
}
previousIsEOL = false;
}
}
}
}
/*
* Optimization against the first character.
*/
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
RangeToken range = this.firstChar;
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
int ch = target .setIndex( matchStart ) ;
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
}
if (!range.match(ch)) {
continue;
}
if (0 <= (matchEnd = this.match(con, this.operations,
matchStart, 1, this.options))) {
break;
}
}
}
/*
* Straightforward matching.
*/
else {
for (matchStart = con.start; matchStart <= limit; matchStart ++) {
if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
break;
}
}
if (matchEnd >= 0) {
if (con.match != null) {
con.match.setBeginning(0, matchStart);
con.match.setEnd(0, matchEnd);
}
con.setInUse(false);
return true;
} else {
con.setInUse(false);
return false;
}
}
// ================================================================
/**
* A regular expression.
* @serial
*/
String regex;
/**
* @serial
*/
int options;
/**
* The number of parenthesis in the regular expression.
* @serial
*/
int nofparen;
/**
* Internal representation of the regular expression.
* @serial
*/
Token tokentree;
boolean hasBackReferences = false;
transient int minlength;
transient Op operations = null;
transient int numberOfClosures;
transient Context context = null;
transient RangeToken firstChar = null;
transient String fixedString = null;
transient int fixedStringOptions;
transient BMPattern fixedStringTable = null;
transient boolean fixedStringOnly = false;
static abstract class ExpressionTarget {
abstract char charAt(int index);
abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
}
static final class StringTarget extends ExpressionTarget {
private String target;
StringTarget(String target) {
this.target = target;
}
final void resetTarget(String target) {
this.target = target;
}
final char charAt(int index) {
return target.charAt(index);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (limit-offset < partlen) {
return false;
}
return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
int offset2, int partlen) {
if (limit-offset < partlen) {
return false;
}
return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
: target.regionMatches(offset, target, offset2, partlen);
}
}
static final class CharArrayTarget extends ExpressionTarget {
char[] target;
CharArrayTarget(char[] target) {
this.target = target;
}
final void resetTarget(char[] target) {
this.target = target;
}
char charAt(int index) {
return target[index];
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
: regionMatches(offset, limit, part, partlen);
}
private final boolean regionMatches(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
if (target[offset++] != part.charAt(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
final char ch1 = target[offset++] ;
final char ch2 = part.charAt(i++);
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
: regionMatches(offset, limit, offset2, partlen);
}
private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
if ( target [ offset++ ] != target [ i++ ] )
return false;
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
final char ch1 = target[offset++] ;
final char ch2 = target[i++] ;
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
}
static final class CharacterIteratorTarget extends ExpressionTarget {
CharacterIterator target;
CharacterIteratorTarget(CharacterIterator target) {
this.target = target;
}
final void resetTarget(CharacterIterator target) {
this.target = target;
}
final char charAt(int index) {
return target.setIndex(index);
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit,
String part, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
: regionMatches(offset, limit, part, partlen);
}
private final boolean regionMatches(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
if (target.setIndex(offset++) != part.charAt(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
int i = 0;
while (partlen-- > 0) {
final char ch1 = target.setIndex(offset++) ;
final char ch2 = part.charAt(i++);
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
if (offset < 0 || limit-offset < partlen) {
return false;
}
return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
: regionMatches(offset, limit, offset2, partlen);
}
private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
if (target.setIndex(offset++) != target.setIndex(i++)) {
return false;
}
}
return true;
}
private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
int i = offset2;
while (partlen-- > 0) {
final char ch1 = target.setIndex(offset++) ;
final char ch2 = target.setIndex(i++) ;
if (ch1 == ch2) {
continue;
}
final char uch1 = Character.toUpperCase(ch1);
final char uch2 = Character.toUpperCase(ch2);
if (uch1 == uch2) {
continue;
}
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
return false;
}
}
return true;
}
}
static final class ClosureContext {
int[] offsets = new int[4];
int currentIndex = 0;
boolean contains(int offset) {
for (int i=0; i= 0x10000) { // Op.CHAR
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
} else {
char[] ac = new char[1];
ac[0] = (char)this.operations.getData();
this.fixedString = new String(ac);
}
this.fixedStringOptions = this.options;
this.fixedStringTable = new BMPattern(this.fixedString, 256,
isSet(this.fixedStringOptions, IGNORE_CASE));
} else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
&& !isSet(this.options, XMLSCHEMA_MODE)) {
Token.FixedStringContainer container = new Token.FixedStringContainer();
this.tokentree.findFixedString(container, this.options);
this.fixedString = container.token == null ? null : container.token.getString();
this.fixedStringOptions = container.options;
if (this.fixedString != null && this.fixedString.length() < 2)
this.fixedString = null;
// This pattern has a fixed string of which length is more than one.
if (this.fixedString != null) {
this.fixedStringTable = new BMPattern(this.fixedString, 256,
isSet(this.fixedStringOptions, IGNORE_CASE));
if (DEBUG) {
System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
+"/" //+this.fixedString
+"/"+REUtil.createOptionString(this.fixedStringOptions));
System.err.print("String: ");
REUtil.dumpString(this.fixedString);
}
}
}
}
/**
* An option.
* If you specify this option, (X)
* captures matched text, and (:?X)
* does not capture.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
static final int MARK_PARENS = 1<<0;
*/
/**
* "i"
*/
static final int IGNORE_CASE = 1<<1;
/**
* "s"
*/
static final int SINGLE_LINE = 1<<2;
/**
* "m"
*/
static final int MULTIPLE_LINES = 1<<3;
/**
* "x"
*/
static final int EXTENDED_COMMENT = 1<<4;
/**
* This option redefines \d \D \w \W \s \S.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
* @see #UNICODE_WORD_BOUNDARY
*/
static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
/**
* An option.
* This enables to process locale-independent word boundary for \b \B \< \>.
* By default, the engine considers a position between a word character
* (\w) and a non word character
* is a word boundary.
*
By this option, the engine checks word boundaries with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
*
* @see #RegularExpression(java.lang.String,int)
* @see #setPattern(java.lang.String,int)
*/
static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
/**
* "H"
*/
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
/**
* "F"
*/
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
/**
* "X". XML Schema mode.
*/
static final int XMLSCHEMA_MODE = 1<<9;
/**
* ",".
*/
static final int SPECIAL_COMMA = 1<<10;
private static final boolean isSet(int options, int flag) {
return (options & flag) == flag;
}
/**
* Creates a new RegularExpression instance.
*
* @param regex A regular expression
* @exception org.apache.xerces.utils.regex.ParseException regex is not conforming to the syntax.
*/
public RegularExpression(String regex) throws ParseException {
this(regex, null);
}
/**
* Creates a new RegularExpression instance with options.
*
* @param regex A regular expression
* @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
* @exception org.apache.xerces.utils.regex.ParseException regex is not conforming to the syntax.
*/
public RegularExpression(String regex, String options) throws ParseException {
this.setPattern(regex, options);
}
/**
* Creates a new RegularExpression instance with options.
*
* @param regex A regular expression
* @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
* @exception org.apache.xerces.utils.regex.ParseException regex is not conforming to the syntax.
*/
public RegularExpression(String regex, String options, Locale locale) throws ParseException {
this.setPattern(regex, options, locale);
}
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
this.regex = regex;
this.tokentree = tok;
this.nofparen = parens;
this.options = options;
this.hasBackReferences = hasBackReferences;
}
/**
*
*/
public void setPattern(String newPattern) throws ParseException {
this.setPattern(newPattern, Locale.getDefault());
}
public void setPattern(String newPattern, Locale locale) throws ParseException {
this.setPattern(newPattern, this.options, locale);
}
private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
this.regex = newPattern;
this.options = options;
RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
? new ParserForXMLSchema(locale) : new RegexParser(locale);
this.tokentree = rp.parse(this.regex, this.options);
this.nofparen = rp.parennumber;
this.hasBackReferences = rp.hasBackReferences;
this.operations = null;
this.context = null;
}
/**
*
*/
public void setPattern(String newPattern, String options) throws ParseException {
this.setPattern(newPattern, options, Locale.getDefault());
}
public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
this.setPattern(newPattern, REUtil.parseOptions(options), locale);
}
/**
*
*/
public String getPattern() {
return this.regex;
}
/**
* Represents this instence in String.
*/
public String toString() {
return this.tokentree.toString(this.options);
}
/**
* Returns a option string.
* The order of letters in it may be different from a string specified
* in a constructor or setPattern()
.
*
* @see #RegularExpression(java.lang.String,java.lang.String)
* @see #setPattern(java.lang.String,java.lang.String)
*/
public String getOptions() {
return REUtil.createOptionString(this.options);
}
/**
* Return true if patterns are the same and the options are equivalent.
*/
public boolean equals(Object obj) {
if (obj == null) return false;
if (!(obj instanceof RegularExpression))
return false;
RegularExpression r = (RegularExpression)obj;
return this.regex.equals(r.regex) && this.options == r.options;
}
boolean equals(String pattern, int options) {
return this.regex.equals(pattern) && this.options == options;
}
/**
*
*/
public int hashCode() {
return (this.regex+"/"+this.getOptions()).hashCode();
}
/**
* Return the number of regular expression groups.
* This method returns 1 when the regular expression has no capturing-parenthesis.
*
*/
public int getNumberOfGroups() {
return this.nofparen;
}
// ================================================================
private static final int WT_IGNORE = 0;
private static final int WT_LETTER = 1;
private static final int WT_OTHER = 2;
private static final int getWordType0(char ch, int opts) {
if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
if (isSet(opts, USE_UNICODE_CATEGORY)) {
return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
}
return isWordChar(ch) ? WT_LETTER : WT_OTHER;
}
switch (Character.getType(ch)) {
case Character.UPPERCASE_LETTER: // L
case Character.LOWERCASE_LETTER: // L
case Character.TITLECASE_LETTER: // L
case Character.MODIFIER_LETTER: // L
case Character.OTHER_LETTER: // L
case Character.LETTER_NUMBER: // N
case Character.DECIMAL_DIGIT_NUMBER: // N
case Character.OTHER_NUMBER: // N
case Character.COMBINING_SPACING_MARK: // Mc
return WT_LETTER;
case Character.FORMAT: // Cf
case Character.NON_SPACING_MARK: // Mn
case Character.ENCLOSING_MARK: // Mc
return WT_IGNORE;
case Character.CONTROL: // Cc
switch (ch) {
case '\t':
case '\n':
case '\u000B':
case '\f':
case '\r':
return WT_OTHER;
default:
return WT_IGNORE;
}
default:
return WT_OTHER;
}
}
// ================================================================
static final int LINE_FEED = 0x000A;
static final int CARRIAGE_RETURN = 0x000D;
static final int LINE_SEPARATOR = 0x2028;
static final int PARAGRAPH_SEPARATOR = 0x2029;
private static final boolean isEOLChar(int ch) {
return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
|| ch == PARAGRAPH_SEPARATOR;
}
private static final boolean isWordChar(int ch) { // Legacy word characters
if (ch == '_') return true;
if (ch < '0') return false;
if (ch > 'z') return false;
if (ch <= '9') return true;
if (ch < 'A') return false;
if (ch <= 'Z') return true;
if (ch < 'a') return false;
return true;
}
private static final boolean matchIgnoreCase(int chardata, int ch) {
if (chardata == ch) return true;
if (chardata > 0xffff || ch > 0xffff) return false;
char uch1 = Character.toUpperCase((char)chardata);
char uch2 = Character.toUpperCase((char)ch);
if (uch1 == uch2) return true;
return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
}
}