All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.oro.text.regex.Perl5Compiler Maven / Gradle / Ivy

There is a newer version: 5.0.84
Show newest version
/*
 * $Id: Perl5Compiler.java,v 1.21 2003/11/07 20:16:25 dfs Exp $
 *
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact [email protected].
 *
 * 5. Products derived from this software may not be called "Apache" 
 *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
 *    name, without prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * .
 */


package org.apache.oro.text.regex;

import java.util.*;

/**
 * The Perl5Compiler class is used to create compiled regular expressions
 * conforming to the Perl5 regular expression syntax.  It generates
 * Perl5Pattern instances upon compilation to be used in conjunction
 * with a Perl5Matcher instance.  Please see the user's guide for more 
 * information about Perl5 regular expressions.
 * 

* Perl5Compiler and Perl5Matcher are designed with the intent that * you use a separate instance of each per thread to avoid the overhead * of both synchronization and concurrent access (e.g., a match that takes * a long time in one thread will block the progress of another thread with * a shorter match). If you want to use a single instance of each * in a concurrent program, you must appropriately protect access to * the instances with critical sections. If you want to share Perl5Pattern * instances between concurrently executing instances of Perl5Matcher, you * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}. * * @version @version@ * @since 1.0 * @see PatternCompiler * @see MalformedPatternException * @see Perl5Pattern * @see Perl5Matcher */ public final class Perl5Compiler implements PatternCompiler { private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2, __SPSTART = 0x4, __TRYAGAIN = 0x8; private static final char __CASE_INSENSITIVE = 0x0001, __GLOBAL = 0x0002, __KEEP = 0x0004, __MULTILINE = 0x0008, __SINGLELINE = 0x0010, __EXTENDED = 0x0020, __READ_ONLY = 0x8000; private static final String __HEX_DIGIT = "0123456789abcdef0123456789ABCDEFx"; private CharStringPointer __input; private boolean __sawBackreference; private char[] __modifierFlags = { 0 }; // IMPORTANT: __numParentheses starts out equal to 1 during compilation. // It is always one greater than the number of parentheses encountered // so far in the regex. That is because it refers to the number of groups // to save, and the entire match is always saved (group 0) private int __numParentheses, __programSize, __cost; // When doing the second pass and actually generating code, __programSize // keeps track of the current offset. private char[] __program; /** Lookup table for POSIX character class names */ private static final HashMap __hashPOSIX; static { __hashPOSIX = new HashMap(); __hashPOSIX.put("alnum", new Character(OpCode._ALNUMC)); __hashPOSIX.put("word", new Character(OpCode._ALNUM)); __hashPOSIX.put("alpha", new Character(OpCode._ALPHA)); __hashPOSIX.put("blank", new Character(OpCode._BLANK)); __hashPOSIX.put("cntrl", new Character(OpCode._CNTRL)); __hashPOSIX.put("digit", new Character(OpCode._DIGIT)); __hashPOSIX.put("graph", new Character(OpCode._GRAPH)); __hashPOSIX.put("lower", new Character(OpCode._LOWER)); __hashPOSIX.put("print", new Character(OpCode._PRINT)); __hashPOSIX.put("punct", new Character(OpCode._PUNCT)); __hashPOSIX.put("space", new Character(OpCode._SPACE)); __hashPOSIX.put("upper", new Character(OpCode._UPPER)); __hashPOSIX.put("xdigit", new Character(OpCode._XDIGIT)); __hashPOSIX.put("ascii", new Character(OpCode._ASCII)); } /** * The default mask for the {@link #compile compile} methods. * It is equal to 0. * The default behavior is for a regular expression to be case sensitive * and to not specify if it is multiline or singleline. When MULITLINE_MASK * and SINGLINE_MASK are not defined, the ^, $, and . * metacharacters are * interpreted according to the value of isMultiline() in Perl5Matcher. * The default behavior of Perl5Matcher is to treat the Perl5Pattern * as though MULTILINE_MASK were enabled. If isMultiline() returns false, * then the pattern is treated as though SINGLINE_MASK were set. However, * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks * will ALWAYS override whatever behavior is specified by the setMultiline() * in Perl5Matcher. */ public static final int DEFAULT_MASK = 0; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should be case insensitive. */ public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should treat input as having * multiple lines. This option affects the interpretation of * the ^ and $ metacharacters. When this mask is used, * the ^ metacharacter matches at the beginning of every line, * and the $ metacharacter matches at the end of every line. * Additionally the . metacharacter will not match newlines when * an expression is compiled with MULTILINE_MASK , which is its * default behavior. */ public static final int MULTILINE_MASK = __MULTILINE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should treat input as being * a single line. This option affects the interpretation of * the ^ and $ metacharacters. When this mask is used, * the ^ metacharacter matches at the beginning of the input, * and the $ metacharacter matches at the end of the input. * The ^ and $ metacharacters will not match at the beginning * and end of lines occurring between the begnning and end of the input. * Additionally, the . metacharacter will match newlines when * an expression is compiled with SINGLELINE_MASK , unlike its * default behavior. */ public static final int SINGLELINE_MASK = __SINGLELINE; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should be treated as a Perl5 * extended pattern (i.e., a pattern using the /x modifier). This * option tells the compiler to ignore whitespace that is not backslashed or * within a character class. It also tells the compiler to treat the * # character as a metacharacter introducing a comment as in * Perl. In other words, the # character will comment out any * text in the regular expression between it and the next newline. * The intent of this option is to allow you to divide your patterns * into more readable parts. It is provided to maintain compatibility * with Perl5 regular expressions, although it will not often * make sense to use it in Java. */ public static final int EXTENDED_MASK = __EXTENDED; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate that the resulting Perl5Pattern should be treated as a * read only data structure by Perl5Matcher, making it safe to share * a single Perl5Pattern instance among multiple threads without needing * synchronization. Without this option, Perl5Matcher reserves the right * to store heuristic or other information in Perl5Pattern that might * accelerate future matches. When you use this option, Perl5Matcher will * not store or modify any information in a Perl5Pattern. Use this option * when you want to share a Perl5Pattern instance among multiple threads * using different Perl5Matcher instances. */ public static final int READ_ONLY_MASK = __READ_ONLY; /** * Given a character string, returns a Perl5 expression that interprets * each character of the original string literally. In other words, all * special metacharacters are quoted/escaped. This method is useful for * converting user input meant for literal interpretation into a safe * regular expression representing the literal input. *

* In effect, this method is the analog of the Perl5 quotemeta() builtin * method. *

* @param expression The expression to convert. * @return A String containing a Perl5 regular expression corresponding to * a literal interpretation of the pattern. */ public static final String quotemeta(char[] expression) { int ch; StringBuffer buffer; buffer = new StringBuffer(2*expression.length); for(ch = 0; ch < expression.length; ch++) { if(!OpCode._isWordCharacter(expression[ch])) buffer.append('\\'); buffer.append(expression[ch]); } return buffer.toString(); } /** * Given a character string, returns a Perl5 expression that interprets * each character of the original string literally. In other words, all * special metacharacters are quoted/escaped. This method is useful for * converting user input meant for literal interpretation into a safe * regular expression representing the literal input. *

* In effect, this method is the analog of the Perl5 quotemeta() builtin * method. *

* @param pattern The pattern to convert. * @return A String containing a Perl5 regular expression corresponding to * a literal interpretation of the pattern. */ public static final String quotemeta(String expression) { return quotemeta(expression.toCharArray()); } private static boolean __isSimpleRepetitionOp(char ch) { return (ch == '*' || ch == '+' || ch == '?'); } private static boolean __isComplexRepetitionOp(char[] ch, int offset) { if(offset < ch.length && offset >= 0) return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?' || (ch[offset] == '{' && __parseRepetition(ch, offset))); return false; } // determines if {\d+,\d*} is the next part of the string private static boolean __parseRepetition(char[] str, int offset) { if(str[offset] != '{') return false; ++offset; if(offset >= str.length || !Character.isDigit(str[offset])) return false; while(offset < str.length && Character.isDigit(str[offset])) ++offset; if(offset < str.length && str[offset] == ',') ++offset; while(offset < str.length && Character.isDigit(str[offset])) ++offset; if(offset >= str.length || str[offset] != '}') return false; return true; } private static int __parseHex(char[] str, int offset, int maxLength, int[] scanned) { int val = 0, index; scanned[0] = 0; while(offset < str.length && maxLength-- > 0 && (index = __HEX_DIGIT.indexOf(str[offset])) != -1) { val <<= 4; val |= (index & 15); ++offset; ++scanned[0]; } return val; } private static int __parseOctal(char[] str, int offset, int maxLength, int[] scanned) { int val = 0; scanned[0] = 0; while(offset < str.length && maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') { val <<= 3; val |= (str[offset] - '0'); --maxLength; ++offset; ++scanned[0]; } return val; } private static void __setModifierFlag(char[] flags, char ch) { switch(ch) { case 'i' : flags[0] |= __CASE_INSENSITIVE; return; case 'g' : flags[0] |= __GLOBAL; return; case 'o' : flags[0] |= __KEEP; return; case 'm' : flags[0] |= __MULTILINE; return; case 's' : flags[0] |= __SINGLELINE; return; case 'x' : flags[0] |= __EXTENDED; return; } } // Emit a specific character code. private void __emitCode(char code) { if(__program != null) __program[__programSize] = code; ++__programSize; } // Emit an operator with no arguments. // Return an offset into the __program array as a pointer to node. private int __emitNode(char operator) { int offset; offset = __programSize; if(__program == null) __programSize+=2; else { __program[__programSize++] = operator; __program[__programSize++] = OpCode._NULL_POINTER; } return offset; } // Emit an operator with arguments. // Return an offset into the __programarray as a pointer to node. private int __emitArgNode(char operator, char arg) { int offset; offset = __programSize; if(__program== null) __programSize+=3; else { __program[__programSize++] = operator; __program[__programSize++] = OpCode._NULL_POINTER; __program[__programSize++] = arg; } return offset; } // Insert an operator at a given offset. private void __programInsertOperator(char operator, int operand) { int src, dest, offset; offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0); if(__program== null) { __programSize+=(2 + offset); return; } src = __programSize; __programSize+=(2 + offset); dest = __programSize; while(src > operand) { --src; --dest; __program[dest] = __program[src]; } __program[operand++] = operator; __program[operand++] = OpCode._NULL_POINTER; while(offset-- > 0) __program[operand++] = OpCode._NULL_POINTER; } private void __programAddTail(int current, int value) { int scan, temp, offset; if(__program == null || current == OpCode._NULL_OFFSET) return; scan = current; while(true) { temp = OpCode._getNext(__program, scan); if(temp == OpCode._NULL_OFFSET) break; scan = temp; } if(__program[scan] == OpCode._BACK) offset = scan - value; else offset = value - scan; __program[scan + 1] = (char)offset; } private void __programAddOperatorTail(int current, int value) { if(__program == null || current == OpCode._NULL_OFFSET || OpCode._opType[__program[current]] != OpCode._BRANCH) return; __programAddTail(OpCode._getNextOperator(current), value); } private char __getNextChar() { char ret, value; ret = __input._postIncrement(); while(true) { value = __input._getValue(); if(value == '(' && __input._getValueRelative(1) == '?' && __input._getValueRelative(2) == '#') { // Skip comments while(value != CharStringPointer._END_OF_STRING && value != ')') value = __input._increment(); __input._increment(); continue; } if((__modifierFlags[0] & __EXTENDED) != 0) { if(Character.isWhitespace(value)) { __input._increment(); continue; } else if(value == '#') { while(value != CharStringPointer._END_OF_STRING && value != '\n') value = __input._increment(); __input._increment(); continue; } } return ret; } } private int __parseAlternation(int[] retFlags) throws MalformedPatternException { int chain, offset, latest; int flags = 0; char value; retFlags[0] = __WORSTCASE; offset = __emitNode(OpCode._BRANCH); chain = OpCode._NULL_OFFSET; if(__input._getOffset() == 0) { __input._setOffset(-1); __getNextChar(); } else { __input._decrement(); __getNextChar(); } value = __input._getValue(); while(value != CharStringPointer._END_OF_STRING && value != '|' && value != ')') { flags &= ~__TRYAGAIN; latest = __parseBranch(retFlags); if(latest == OpCode._NULL_OFFSET) { if((flags & __TRYAGAIN) != 0){ value = __input._getValue(); continue; } return OpCode._NULL_OFFSET; } retFlags[0] |= (flags & __NONNULL); if(chain == OpCode._NULL_OFFSET) retFlags[0] |= (flags & __SPSTART); else { ++__cost; __programAddTail(chain, latest); } chain = latest; value = __input._getValue(); } // If loop was never entered. if(chain == OpCode._NULL_OFFSET) __emitNode(OpCode._NOTHING); return offset; } private int __parseAtom(int[] retFlags) throws MalformedPatternException { boolean doDefault; char value; int offset, flags[] = { 0 }; retFlags[0] = __WORSTCASE; doDefault = false; offset = OpCode._NULL_OFFSET; tryAgain: while(true) { value = __input._getValue(); switch(value) { case '^' : __getNextChar(); // The order here is important in order to support /ms. // /m takes precedence over /s for ^ and $, but not for . if((__modifierFlags[0] & __MULTILINE) != 0) offset = __emitNode(OpCode._MBOL); else if((__modifierFlags[0] & __SINGLELINE) != 0) offset = __emitNode(OpCode._SBOL); else offset = __emitNode(OpCode._BOL); break tryAgain; case '$': __getNextChar(); // The order here is important in order to support /ms. // /m takes precedence over /s for ^ and $, but not for . if((__modifierFlags[0] & __MULTILINE) != 0) offset = __emitNode(OpCode._MEOL); else if((__modifierFlags[0] & __SINGLELINE) != 0) offset = __emitNode(OpCode._SEOL); else offset = __emitNode(OpCode._EOL); break tryAgain; case '.': __getNextChar(); // The order here is important in order to support /ms. // /m takes precedence over /s for ^ and $, but not for . if((__modifierFlags[0] & __SINGLELINE) != 0) offset = __emitNode(OpCode._SANY); else offset = __emitNode(OpCode._ANY); ++__cost; retFlags[0] |= (__NONNULL | __SIMPLE); break tryAgain; case '[': __input._increment(); offset = __parseUnicodeClass(); retFlags[0] |= (__NONNULL | __SIMPLE); break tryAgain; case '(': __getNextChar(); offset = __parseExpression(true, flags); if(offset == OpCode._NULL_OFFSET) { if((flags[0] & __TRYAGAIN) != 0) continue tryAgain; return OpCode._NULL_OFFSET; } retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART)); break tryAgain; case '|': case ')': if((flags[0] & __TRYAGAIN) != 0) { retFlags[0] |= __TRYAGAIN; return OpCode._NULL_OFFSET; } throw new MalformedPatternException("Error in expression at " + __input._toString(__input._getOffset())); //break tryAgain; case '?': case '+': case '*': throw new MalformedPatternException( "?+* follows nothing in expression"); //break tryAgain; case '\\': value = __input._increment(); switch(value) { case 'A' : offset = __emitNode(OpCode._SBOL); retFlags[0] |= __SIMPLE; __getNextChar(); break; case 'G': offset = __emitNode(OpCode._GBOL); retFlags[0] |= __SIMPLE; __getNextChar(); break; case 'Z': offset = __emitNode(OpCode._SEOL); retFlags[0] |= __SIMPLE; __getNextChar(); break; case 'w': offset = __emitNode(OpCode._ALNUM); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'W': offset = __emitNode(OpCode._NALNUM); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'b': offset = __emitNode(OpCode._BOUND); retFlags[0] |= __SIMPLE; __getNextChar(); break; case 'B': offset = __emitNode(OpCode._NBOUND); retFlags[0] |= __SIMPLE; __getNextChar(); break; case 's': offset = __emitNode(OpCode._SPACE); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'S': offset = __emitNode(OpCode._NSPACE); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'd': offset = __emitNode(OpCode._DIGIT); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'D': offset = __emitNode(OpCode._NDIGIT); retFlags[0] |= (__NONNULL | __SIMPLE); __getNextChar(); break; case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x': case 'c': case '0': doDefault = true; break tryAgain; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': int num; StringBuffer buffer = new StringBuffer(10); num = 0; value = __input._getValueRelative(num); while(Character.isDigit(value)) { buffer.append(value); ++num; value = __input._getValueRelative(num); } try { num = Integer.parseInt(buffer.toString()); } catch(NumberFormatException e) { throw new MalformedPatternException( "Unexpected number format exception. Please report this bug." + "NumberFormatException message: " + e.getMessage()); } if(num > 9 && num >= __numParentheses) { doDefault = true; break tryAgain; } else { // A backreference may only occur AFTER its group if(num >= __numParentheses) throw new MalformedPatternException("Invalid backreference: \\" + num); __sawBackreference = true; offset = __emitArgNode(OpCode._REF, (char)num); retFlags[0] |= __NONNULL; value = __input._getValue(); while(Character.isDigit(value)) value = __input._increment(); __input._decrement(); __getNextChar(); } break; case '\0': case CharStringPointer._END_OF_STRING: if(__input._isAtEnd()) throw new MalformedPatternException("Trailing \\ in expression."); // fall through to default default: doDefault = true; break tryAgain; } break tryAgain; case '#': // skip over comments if((__modifierFlags[0] & __EXTENDED) != 0) { while(!__input._isAtEnd() && __input._getValue() != '\n') __input._increment(); if(!__input._isAtEnd()) continue tryAgain; } // fall through to default default: __input._increment(); doDefault = true; break tryAgain; }// end master switch } // end tryAgain if(doDefault) { char ender; int length, pOffset, maxOffset, lastOffset, numLength[]; offset = __emitNode(OpCode._EXACTLY); // Not sure that it's ok to use 0 to mark end. //__emitCode((char)0); __emitCode((char)CharStringPointer._END_OF_STRING); forLoop: for(length = 0, pOffset = __input._getOffset() - 1, maxOffset = __input._getLength(); length < 127 && pOffset < maxOffset; ++length) { lastOffset = pOffset; value = __input._getValue(pOffset); switch(value) { case '^': case '$': case '.': case '[': case '(': case ')': case '|': break forLoop; case '\\': value = __input._getValue(++pOffset); switch(value) { case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b': case 'B': case 's': case 'S': case 'd': case 'D': --pOffset; break forLoop; case 'n': ender = '\n'; ++pOffset; break; case 'r': ender = '\r'; ++pOffset; break; case 't': ender = '\t'; ++pOffset; break; case 'f': ender = '\f'; ++pOffset; break; case 'e': ender = '\033'; ++pOffset; break; case 'a': ender = '\007'; ++pOffset; break; case 'x': numLength = new int[1]; ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength); pOffset+=numLength[0]; break; case 'c': ++pOffset; ender = __input._getValue(pOffset++); if(Character.isLowerCase(ender)) ender = Character.toUpperCase(ender); ender ^= 64; break; case '0': case '1': case '2': case'3': case '4': case '5': case '6': case '7': case '8': case '9': boolean doOctal = false; value = __input._getValue(pOffset); if(value == '0') doOctal = true; value = __input._getValue(pOffset + 1); if(Character.isDigit(value)) { int num; StringBuffer buffer = new StringBuffer(10); num = pOffset; value = __input._getValue(num); while(Character.isDigit(value)){ buffer.append(value); ++num; value = __input._getValue(num); } try { num = Integer.parseInt(buffer.toString()); } catch(NumberFormatException e) { throw new MalformedPatternException( "Unexpected number format exception. Please report this bug." + "NumberFormatException message: " + e.getMessage()); } if(!doOctal) doOctal = (num >= __numParentheses); } if(doOctal) { numLength = new int[1]; ender = (char)__parseOctal(__input._array, pOffset, 3, numLength); pOffset+=numLength[0]; } else { --pOffset; break forLoop; } break; case CharStringPointer._END_OF_STRING: case '\0': if(pOffset >= maxOffset) throw new MalformedPatternException("Trailing \\ in expression."); // fall through to default default: ender = __input._getValue(pOffset++); break; } // end backslash switch break; case '#': if((__modifierFlags[0] & __EXTENDED) != 0) { while(pOffset < maxOffset && __input._getValue(pOffset) != '\n') ++pOffset; } // fall through to whitespace handling case ' ': case '\t': case '\n': case '\r': case '\f': case '\013': if((__modifierFlags[0] & __EXTENDED) != 0) { ++pOffset; --length; continue; } // fall through to default default: ender = __input._getValue(pOffset++); break; } // end master switch if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && Character.isUpperCase(ender)) ender = Character.toLowerCase(ender); if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) { if(length > 0) pOffset = lastOffset; else { ++length; __emitCode(ender); } break; } __emitCode(ender); } // end for loop __input._setOffset(pOffset - 1); __getNextChar(); if(length < 0) throw new MalformedPatternException( "Unexpected compilation failure. Please report this bug!"); if(length > 0) retFlags[0] |= __NONNULL; if(length == 1) retFlags[0] |= __SIMPLE; if(__program!= null) __program[OpCode._getOperand(offset)] = (char)length; //__emitCode('\0'); // debug __emitCode(CharStringPointer._END_OF_STRING); } return offset; } // These are the original 8-bit character class handling methods. // We don't want to delete them just yet only to have to dig it out // of revision control later. /* // Set the bits in a character class. Only recognizes ascii. private void __setCharacterClassBits(char[] bits, int offset, char deflt, char ch) { if(__program== null || ch >= 256) return; ch &= 0xffff; if(deflt == 0) { bits[offset + (ch >> 4)] |= (1 << (ch & 0xf)); } else { bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf)); } } private int __parseCharacterClass() throws MalformedPatternException { boolean range = false, skipTest; char clss, deflt, lastclss = Character.MAX_VALUE; int offset, bits, numLength[] = { 0 }; offset = __emitNode(OpCode._ANYOF); if(__input._getValue() == '^') { ++__cost; __input._increment(); deflt = 0; } else { deflt = 0xffff; } bits = __programSize; for(clss = 0; clss < 16; clss++) __emitCode(deflt); clss = __input._getValue(); if(clss == ']' || clss == '-') skipTest = true; else skipTest = false; while((!__input._isAtEnd() && (clss = __input._getValue()) != ']') || skipTest) { // It sucks, but we have to make this assignment every time skipTest = false; __input._increment(); if(clss == '\\') { clss = __input._postIncrement(); switch(clss){ case 'w': for(clss = 0; clss < 256; clss++) if(OpCode._isWordCharacter(clss)) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 'W': for(clss = 0; clss < 256; clss++) if(!OpCode._isWordCharacter(clss)) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 's': for(clss = 0; clss < 256; clss++) if(Character.isWhitespace(clss)) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 'S': for(clss = 0; clss < 256; clss++) if(!Character.isWhitespace(clss)) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 'd': for(clss = '0'; clss <= '9'; clss++) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 'D': for(clss = 0; clss < '0'; clss++) __setCharacterClassBits(__program, bits, deflt, clss); for(clss = (char)('9' + 1); clss < 256; clss++) __setCharacterClassBits(__program, bits, deflt, clss); lastclss = Character.MAX_VALUE; continue; case 'n': clss = '\n'; break; case 'r': clss = '\r'; break; case 't': clss = '\t'; break; case 'f': clss = '\f'; break; case 'b': clss = '\b'; break; case 'e': clss = '\033'; break; case 'a': clss = '\007'; break; case 'x': clss = (char)__parseHex(__input._array, __input._getOffset(), 2, numLength); __input._increment(numLength[0]); break; case 'c': clss = __input._postIncrement(); if(Character.isLowerCase(clss)) clss = Character.toUpperCase(clss); clss ^= 64; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': clss = (char)__parseOctal(__input._array, __input._getOffset() - 1, 3, numLength); __input._increment(numLength[0] - 1); break; } } if(range) { if(lastclss > clss) throw new MalformedPatternException( "Invalid [] range in expression."); range = false; } else { lastclss = clss; if(__input._getValue() == '-' && __input._getOffset() + 1 < __input._getLength() && __input._getValueRelative(1) != ']') { __input._increment(); range = true; continue; } } while(lastclss <= clss) { __setCharacterClassBits(__program, bits, deflt, lastclss); if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && Character.isUpperCase(lastclss)) __setCharacterClassBits(__program, bits, deflt, Character.toLowerCase(lastclss)); ++lastclss; } lastclss = clss; } if(__input._getValue() != ']') throw new MalformedPatternException("Unmatched [] in expression."); __getNextChar(); return offset; } */ private int __parseUnicodeClass() throws MalformedPatternException { boolean range = false, skipTest; char clss, lastclss = Character.MAX_VALUE; int offset, numLength[] = { 0 }; boolean negFlag[] = { false }; boolean opcodeFlag; /* clss isn't character when this flag true. */ if(__input._getValue() == '^') { offset = __emitNode(OpCode._NANYOFUN); __input._increment(); } else { offset = __emitNode(OpCode._ANYOFUN); } clss = __input._getValue(); if(clss == ']' || clss == '-') skipTest = true; else skipTest = false; while((!__input._isAtEnd() && (clss = __input._getValue()) != ']') || skipTest) { // It sucks, but we have to make this assignment every time skipTest = false; opcodeFlag = false; __input._increment(); if(clss == '\\' || clss == '[') { if(clss == '\\') { /* character is escaped */ clss = __input._postIncrement(); } else { /* try POSIX expression */ char posixOpCode = __parsePOSIX(negFlag); if(posixOpCode != 0){ opcodeFlag = true; clss = posixOpCode; } } if (opcodeFlag != true) { switch(clss){ case 'w': opcodeFlag = true; clss = OpCode._ALNUM; lastclss = Character.MAX_VALUE; break; case 'W': opcodeFlag = true; clss = OpCode._NALNUM; lastclss = Character.MAX_VALUE; break; case 's': opcodeFlag = true; clss = OpCode._SPACE; lastclss = Character.MAX_VALUE; break; case 'S': opcodeFlag = true; clss = OpCode._NSPACE; lastclss = Character.MAX_VALUE; break; case 'd': opcodeFlag = true; clss = OpCode._DIGIT; lastclss = Character.MAX_VALUE; break; case 'D': opcodeFlag = true; clss = OpCode._NDIGIT; lastclss = Character.MAX_VALUE; break; case 'n': clss = '\n'; break; case 'r': clss = '\r'; break; case 't': clss = '\t'; break; case 'f': clss = '\f'; break; case 'b': clss = '\b'; break; case 'e': clss = '\033'; break; case 'a': clss = '\007'; break; case 'x': clss = (char)__parseHex(__input._array, __input._getOffset(), 2, numLength); __input._increment(numLength[0]); break; case 'c': clss = __input._postIncrement(); if(Character.isLowerCase(clss)) clss = Character.toUpperCase(clss); clss ^= 64; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': clss = (char)__parseOctal(__input._array, __input._getOffset() - 1, 3, numLength); __input._increment(numLength[0] - 1); break; default: break; } } } if(range) { if(lastclss > clss) throw new MalformedPatternException( "Invalid [] range in expression."); range = false; } else { lastclss = clss; if(opcodeFlag == false && __input._getValue() == '-' && __input._getOffset() + 1 < __input._getLength() && __input._getValueRelative(1) != ']') { __input._increment(); range = true; continue; } } if(lastclss == clss) { if(opcodeFlag == true) { if(negFlag[0] == false) __emitCode(OpCode._OPCODE); else __emitCode(OpCode._NOPCODE); } else __emitCode(OpCode._ONECHAR); __emitCode(clss); if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){ __programSize--; __emitCode(Character.toLowerCase(clss)); } } if(lastclss < clss) { __emitCode(OpCode._RANGE); __emitCode(lastclss); __emitCode(clss); if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 && Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){ __programSize-=2; __emitCode(Character.toLowerCase(lastclss)); __emitCode(Character.toLowerCase(clss)); } lastclss = Character.MAX_VALUE; range = false; } lastclss = clss; } if(__input._getValue() != ']') throw new MalformedPatternException("Unmatched [] in expression."); __getNextChar(); __emitCode(OpCode._END); return offset; } /** * Parse POSIX epxression like [:foo:]. * * @return OpCode. return 0 when fail parsing POSIX expression. */ private char __parsePOSIX(boolean negFlag[]) throws MalformedPatternException { int offset = __input._getOffset(); int len = __input._getLength(); int pos = offset; char value = __input._getValue(pos++); StringBuffer buf; Object opcode; if( value != ':' ) return 0; if( __input._getValue(pos) == '^' ) { negFlag[0] = true; pos++; } else { negFlag[0] = false; } buf = new StringBuffer(); try { while ( (value = __input._getValue(pos++)) != ':' && pos < len) { buf.append(value); } } catch (Exception e){ return 0; } if( __input._getValue(pos++) != ']'){ return 0; } opcode = __hashPOSIX.get(buf.toString()); if( opcode == null ) return 0; __input._setOffset(pos); return ((Character)opcode).charValue(); } private int __parseBranch(int[] retFlags) throws MalformedPatternException { boolean nestCheck = false, handleRepetition = false; int offset, next, min, max, flags[] = { 0 }; char operator, value; min = 0; max = Character.MAX_VALUE; offset = __parseAtom(flags); if(offset == OpCode._NULL_OFFSET) { if((flags[0] & __TRYAGAIN) != 0) retFlags[0] |= __TRYAGAIN; return OpCode._NULL_OFFSET; } operator = __input._getValue(); if(operator == '(' && __input._getValueRelative(1) == '?' && __input._getValueRelative(2) == '#') { while(operator != CharStringPointer._END_OF_STRING && operator != ')') operator = __input._increment(); if(operator != CharStringPointer._END_OF_STRING) { __getNextChar(); operator = __input._getValue(); } } if(operator == '{' && __parseRepetition(__input._array, __input._getOffset())) { int maxOffset, pos; next = __input._getOffset() + 1; pos = maxOffset = __input._getLength(); value = __input._getValue(next); while(Character.isDigit(value) || value == ',') { if(value == ',') { if(pos != maxOffset) break; else pos = next; } ++next; value = __input._getValue(next); } if(value == '}') { int num; StringBuffer buffer = new StringBuffer(10); if(pos == maxOffset) pos = next; __input._increment(); num = __input._getOffset(); value = __input._getValue(num); while(Character.isDigit(value)) { buffer.append(value); ++num; value = __input._getValue(num); } try { min = Integer.parseInt(buffer.toString()); } catch(NumberFormatException e) { throw new MalformedPatternException( "Unexpected number format exception. Please report this bug." + "NumberFormatException message: " + e.getMessage()); } value = __input._getValue(pos); if(value == ',') ++pos; else pos = __input._getOffset(); num = pos; buffer = new StringBuffer(10); value = __input._getValue(num); while(Character.isDigit(value)){ buffer.append(value); ++num; value = __input._getValue(num); } try { if(num != pos) max = Integer.parseInt(buffer.toString()); } catch(NumberFormatException e) { throw new MalformedPatternException( "Unexpected number format exception. Please report this bug." + "NumberFormatException message: " + e.getMessage()); } if(max == 0 && __input._getValue(pos) != '0') max = Character.MAX_VALUE; __input._setOffset(next); __getNextChar(); nestCheck = true; handleRepetition = true; } } if(!nestCheck) { handleRepetition = false; if(!__isSimpleRepetitionOp(operator)) { retFlags[0] = flags[0]; return offset; } __getNextChar(); retFlags[0] = ((operator != '+') ? (__WORSTCASE | __SPSTART) : (__WORSTCASE | __NONNULL)); if(operator == '*' && ((flags[0] & __SIMPLE) != 0)) { __programInsertOperator(OpCode._STAR, offset); __cost+=4; } else if(operator == '*') { min = 0; handleRepetition = true; } else if(operator == '+' && (flags[0] & __SIMPLE) != 0) { __programInsertOperator(OpCode._PLUS, offset); __cost+=3; } else if(operator == '+') { min = 1; handleRepetition = true; } else if(operator == '?') { min = 0; max = 1; handleRepetition = true; } } if(handleRepetition) { // handle repetition if((flags[0] & __SIMPLE) != 0){ __cost+= ((2 + __cost) / 2); __programInsertOperator(OpCode._CURLY, offset); } else { __cost += (4 + __cost); __programAddTail(offset, __emitNode(OpCode._WHILEM)); __programInsertOperator(OpCode._CURLYX, offset); __programAddTail(offset, __emitNode(OpCode._NOTHING)); } if(min > 0) retFlags[0] = (__WORSTCASE | __NONNULL); if(max != 0 && max < min) throw new MalformedPatternException( "Invalid interval {" + min + "," + max + "}"); if(__program!= null) { __program[offset + 2] = (char)min; __program[offset + 3] = (char)max; } } if(__input._getValue() == '?') { __getNextChar(); __programInsertOperator(OpCode._MINMOD, offset); __programAddTail(offset, offset + 2); } if(__isComplexRepetitionOp(__input._array, __input._getOffset())) throw new MalformedPatternException( "Nested repetitions *?+ in expression"); return offset; } private int __parseExpression(boolean isParenthesized, int[] hintFlags) throws MalformedPatternException { char value, paren; char[] modifierFlags, posFlags = { 0 }, negFlags = { 0 }; int nodeOffset = OpCode._NULL_OFFSET, parenthesisNum = 0, br, ender; int[] flags = { 0 };; String modifiers = "iogmsx-"; modifierFlags = posFlags; // Initially we assume expression doesn't match null string. hintFlags[0] = __NONNULL; if (isParenthesized) { paren = 1; if(__input._getValue() == '?') { __input._increment(); paren = value = __input._postIncrement(); switch(value) { case ':' : case '=' : case '!' : break; case '#' : value = __input._getValue(); while(value != CharStringPointer._END_OF_STRING && value != ')') value = __input._increment(); if(value != ')') throw new MalformedPatternException( "Sequence (?#... not terminated"); __getNextChar(); hintFlags[0] = __TRYAGAIN; return OpCode._NULL_OFFSET; default : __input._decrement(); value = __input._getValue(); while(value != CharStringPointer._END_OF_STRING && modifiers.indexOf(value) != -1) { if(value == '-') modifierFlags = negFlags; else __setModifierFlag(modifierFlags, value); value = __input._increment(); } __modifierFlags[0] |= posFlags[0]; __modifierFlags[0] &= ~negFlags[0]; if(value != ')') throw new MalformedPatternException( "Sequence (?" + value + "...) not recognized"); __getNextChar(); hintFlags[0] = __TRYAGAIN; return OpCode._NULL_OFFSET; } } else { parenthesisNum = __numParentheses; ++__numParentheses; nodeOffset = __emitArgNode(OpCode._OPEN, (char)parenthesisNum); } } else paren = 0; br = __parseAlternation(flags); if(br == OpCode._NULL_OFFSET) return OpCode._NULL_OFFSET; if(nodeOffset != OpCode._NULL_OFFSET) __programAddTail(nodeOffset, br); else nodeOffset = br; if((flags[0] & __NONNULL) == 0) hintFlags[0] &= ~__NONNULL; hintFlags[0] |= (flags[0] & __SPSTART); while(__input._getValue() == '|') { __getNextChar(); br = __parseAlternation(flags); if(br == OpCode._NULL_OFFSET) return OpCode._NULL_OFFSET; __programAddTail(nodeOffset, br); if((flags[0] & __NONNULL) == 0) hintFlags[0] &= ~__NONNULL; hintFlags[0] |= (flags[0] & __SPSTART); } switch(paren) { case ':' : ender = __emitNode(OpCode._NOTHING); break; case 1: ender = __emitArgNode(OpCode._CLOSE, (char)parenthesisNum); break; case '=': case '!': ender = __emitNode(OpCode._SUCCEED); hintFlags[0] &= ~__NONNULL; break; case 0 : default : ender = __emitNode(OpCode._END); break; } __programAddTail(nodeOffset, ender); for(br = nodeOffset; br != OpCode._NULL_OFFSET; br = OpCode._getNext(__program, br)) __programAddOperatorTail(br, ender); if(paren == '=') { __programInsertOperator(OpCode._IFMATCH, nodeOffset); __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING)); } else if(paren == '!') { __programInsertOperator(OpCode._UNLESSM, nodeOffset); __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING)); } if(paren != 0 && (__input._isAtEnd() || __getNextChar() != ')')) { throw new MalformedPatternException("Unmatched parentheses."); } else if(paren == 0 && !__input._isAtEnd()) { if(__input._getValue() == ')') throw new MalformedPatternException("Unmatched parentheses."); else // Should never happen. throw new MalformedPatternException( "Unreached characters at end of expression. Please report this bug!"); } return nodeOffset; } /** * Compiles a Perl5 regular expression into a Perl5Pattern instance that * can be used by a Perl5Matcher object to perform pattern matching. * Please see the user's guide for more information about Perl5 regular * expressions. *

* @param pattern A Perl5 regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. The flags * are a logical OR of any number of the five MASK * constants. For example: *

   * regex =
   *   compiler.compile(pattern, Perl5Compiler.
   *                    CASE_INSENSITIVE_MASK |
   *                    Perl5Compiler.MULTILINE_MASK);
   *                 
* This says to compile the pattern so that it treats * input as consisting of multiple lines and to perform * matches in a case insensitive manner. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be a Perl5Pattern and can be reliably * casted to a Perl5Pattern. * @exception MalformedPatternException If the compiled expression * is not a valid Perl5 regular expression. */ public Pattern compile(char[] pattern, int options) throws MalformedPatternException { int[] flags = { 0 }; int caseInsensitive, scan; Perl5Pattern regexp; String mustString, startString; int first; boolean sawOpen = false, sawPlus = false; StringBuffer lastLongest, longest; int length, minLength = 0, curBack, back, backmost; __input = new CharStringPointer(pattern); caseInsensitive = options & __CASE_INSENSITIVE; __modifierFlags[0] = (char)options; __sawBackreference = false; __numParentheses = 1; __programSize = 0; __cost = 0; __program= null; __emitCode((char)0); if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) throw new MalformedPatternException("Unknown compilation error."); if(__programSize >= Character.MAX_VALUE - 1) throw new MalformedPatternException("Expression is too large."); __program= new char[__programSize]; regexp = new Perl5Pattern(); regexp._program = __program; regexp._expression = new String(pattern); __input._setOffset(0); __numParentheses = 1; __programSize = 0; __cost = 0; __emitCode((char)0); if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) throw new MalformedPatternException("Unknown compilation error."); caseInsensitive = __modifierFlags[0] & __CASE_INSENSITIVE; regexp._isExpensive = (__cost >= 10); regexp._startClassOffset = OpCode._NULL_OFFSET; regexp._anchor = 0; regexp._back = -1; regexp._options = options; regexp._startString = null; regexp._mustString = null; mustString = null; startString = null; scan = 1; if(__program[OpCode._getNext(__program, scan)] == OpCode._END){ boolean doItAgain; // bad variables names! char op; first = scan = OpCode._getNextOperator(scan); op = __program[first]; while((op == OpCode._OPEN && (sawOpen = true)) || (op == OpCode._BRANCH && __program[OpCode._getNext(__program, first)] != OpCode._BRANCH) || op == OpCode._PLUS || op == OpCode._MINMOD || (OpCode._opType[op] == OpCode._CURLY && OpCode._getArg1(__program, first) > 0)) { if(op == OpCode._PLUS) sawPlus = true; else first+=OpCode._operandLength[op]; first = OpCode._getNextOperator(first); op = __program[first]; } doItAgain = true; while(doItAgain) { doItAgain = false; op = __program[first]; if(op == OpCode._EXACTLY) { startString = new String(__program, OpCode._getOperand(first + 1), __program[OpCode._getOperand(first)]); } else if(OpCode._isInArray(op, OpCode._opLengthOne, 2)) regexp._startClassOffset = first; else if(op == OpCode._BOUND || op == OpCode._NBOUND) regexp._startClassOffset = first; else if(OpCode._opType[op] == OpCode._BOL) { if(op == OpCode._BOL) regexp._anchor = Perl5Pattern._OPT_ANCH_BOL; else if(op == OpCode._MBOL) regexp._anchor = Perl5Pattern._OPT_ANCH_MBOL; else regexp._anchor = Perl5Pattern._OPT_ANCH; first = OpCode._getNextOperator(first); doItAgain = true; continue; } else if(op == OpCode._STAR && OpCode._opType[__program[OpCode._getNextOperator(first)]] == OpCode._ANY && (regexp._anchor & Perl5Pattern._OPT_ANCH) != 0) { regexp._anchor = Perl5Pattern._OPT_ANCH | Perl5Pattern._OPT_IMPLICIT; first = OpCode._getNextOperator(first); doItAgain = true; continue; } } // end while do it again if(sawPlus && (!sawOpen || !__sawBackreference)) regexp._anchor |= Perl5Pattern._OPT_SKIP; lastLongest = new StringBuffer(); longest = new StringBuffer(); length = 0; minLength = 0; curBack = 0; back = 0; backmost = 0; while(scan > 0 && (op = __program[scan]) != OpCode._END) { if(op == OpCode._BRANCH) { if(__program[OpCode._getNext(__program, scan)] == OpCode._BRANCH) { curBack = -30000; while(__program[scan] == OpCode._BRANCH) scan = OpCode._getNext(__program, scan); } else scan = OpCode._getNextOperator(scan); continue; } if(op == OpCode._UNLESSM) { curBack = -30000; scan = OpCode._getNext(__program, scan); continue; } if(op == OpCode._EXACTLY) { int temp; first = scan; while(__program[(temp = OpCode._getNext(__program, scan))] == OpCode._CLOSE) scan = temp; minLength += __program[OpCode._getOperand(first)]; temp = __program[OpCode._getOperand(first)]; if(curBack - back == length) { lastLongest.append(new String(__program, OpCode._getOperand(first) + 1, temp)); length += temp; curBack += temp; first = OpCode._getNext(__program, scan); } else if(temp >= (length + (curBack >= 0 ? 1 : 0))) { length = temp; lastLongest = new StringBuffer(new String(__program, OpCode._getOperand(first) + 1, temp)); back = curBack; curBack += length; first = OpCode._getNext(__program, scan); } else curBack += temp; } else if(OpCode._isInArray(op, OpCode._opLengthVaries, 0)) { curBack = -30000; length = 0; if(lastLongest.length() > longest.length()) { longest = lastLongest; backmost = back; } lastLongest = new StringBuffer(); if(op == OpCode._PLUS && OpCode._isInArray(__program[OpCode._getNextOperator(scan)], OpCode._opLengthOne, 0)) ++minLength; else if(OpCode._opType[op] == OpCode._CURLY && OpCode._isInArray(__program[OpCode._getNextOperator(scan) + 2], OpCode._opLengthOne, 0)) minLength += OpCode._getArg1(__program, scan); } else if(OpCode._isInArray(op, OpCode._opLengthOne, 0)) { ++curBack; ++minLength; length = 0; if(lastLongest.length() > longest.length()) { longest = lastLongest; backmost = back; } lastLongest = new StringBuffer(); } scan = OpCode._getNext(__program, scan); } // end while if(lastLongest.length() + ((OpCode._opType[__program[first]] == OpCode._EOL) ? 1 : 0) > longest.length()) { longest = lastLongest; backmost = back; } else lastLongest = new StringBuffer(); if(longest.length() > 0 && startString == null) { mustString = longest.toString(); if(backmost < 0) backmost = -1; regexp._back = backmost; /* if(longest.length() > (((caseInsensitive & __CASE_INSENSITIVE) != 0 || OpCode._opType[__program[first]] == OpCode._EOL) ? 1 : 0)) */ } else longest = null; } // end if regexp._isCaseInsensitive = ((caseInsensitive & __CASE_INSENSITIVE) != 0); regexp._numParentheses = __numParentheses - 1; regexp._minLength = minLength; if(mustString != null) { regexp._mustString = mustString.toCharArray(); regexp._mustUtility = 100; } if(startString != null) regexp._startString = startString.toCharArray(); return regexp; } /** * Same as calling compile(pattern, Perl5Compiler.DEFAULT_MASK); *

* @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be a Perl5Pattern and can be reliably * casted to a Perl5Pattern. * @exception MalformedPatternException If the compiled expression * is not a valid Perl5 regular expression. */ public Pattern compile(char[] pattern) throws MalformedPatternException { return compile(pattern, DEFAULT_MASK); } /** * Same as calling compile(pattern, Perl5Compiler.DEFAULT_MASK); *

* @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be a Perl5Pattern and can be reliably * casted to a Perl5Pattern. * @exception MalformedPatternException If the compiled expression * is not a valid Perl5 regular expression. */ public Pattern compile(String pattern) throws MalformedPatternException { return compile(pattern.toCharArray(), DEFAULT_MASK); } /** * Compiles a Perl5 regular expression into a Perl5Pattern instance that * can be used by a Perl5Matcher object to perform pattern matching. * Please see the user's guide for more information about Perl5 regular * expressions. *

* @param pattern A Perl5 regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. The flags * are a logical OR of any number of the five MASK * constants. For example: *

   * regex =
   *   compiler.compile("^\\w+\\d+$",
   *                    Perl5Compiler.CASE_INSENSITIVE_MASK |
   *                    Perl5Compiler.MULTILINE_MASK);
   *                 
* This says to compile the pattern so that it treats * input as consisting of multiple lines and to perform * matches in a case insensitive manner. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be a Perl5Pattern and can be reliably * casted to a Perl5Pattern. * @exception MalformedPatternException If the compiled expression * is not a valid Perl5 regular expression. */ public Pattern compile(String pattern, int options) throws MalformedPatternException { return compile(pattern.toCharArray(), options); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy