org.apache.oro.text.regex.Perl5Matcher Maven / Gradle / Ivy
/*
* $Id: Perl5Matcher.java,v 1.27 2003/11/07 20:16:25 dfs Exp $
*
* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
* must not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact [email protected].
*
* 5. Products derived from this software may not be called "Apache"
* or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
* name, without prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* .
*/
package org.apache.oro.text.regex;
import java.util.*;
/**
* The Perl5Matcher class is used to match regular expressions
* (conforming to the Perl5 regular expression syntax) generated by
* Perl5Compiler.
*
* Perl5Compiler and Perl5Matcher are designed with the intent that
* you use a separate instance of each per thread to avoid the overhead
* of both synchronization and concurrent access (e.g., a match that takes
* a long time in one thread will block the progress of another thread with
* a shorter match). If you want to use a single instance of each
* in a concurrent program, you must appropriately protect access to
* the instances with critical sections. If you want to share Perl5Pattern
* instances between concurrently executing instances of Perl5Matcher, you
* must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}.
*
* @version @version@
* @since 1.0
* @see PatternMatcher
* @see Perl5Compiler
*/
public final class Perl5Matcher implements PatternMatcher {
private static final char __EOS = Character.MAX_VALUE;
private static final int __INITIAL_NUM_OFFSETS = 20;
private boolean __multiline = false, __lastSuccess = false;
private boolean __caseInsensitive = false;
private char __previousChar, __input[], __originalInput[];
private Perl5Repetition __currentRep;
private int __numParentheses, __bol, __eol, __currentOffset, __endOffset;
private char[] __program;
private int __expSize, __inputOffset, __lastParen;
private int[] __beginMatchOffsets, __endMatchOffsets;
private Stack __stack = new Stack();
private Perl5MatchResult __lastMatchResult = null;
private static boolean
__compare(char[] s1, int s1Offs, char[] s2, int s2Offs, int n)
{
int cnt;
for(cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) {
if(s1Offs >= s1.length)
return false;
if(s2Offs >= s2.length)
return false;
if(s1[s1Offs] != s2[s2Offs])
return false;
}
return true;
}
private static int __findFirst(char[] input, int current, int endOffset,
char[] mustString)
{
int count, saveCurrent;
char ch;
if(input.length == 0)
return endOffset;
ch = mustString[0];
// Find the offset of the first character of the must string
while(current < endOffset) {
if(ch == input[current]){
saveCurrent = current;
count = 0;
while(current < endOffset && count < mustString.length) {
if(mustString[count] != input[current])
break;
++count;
++current;
}
current = saveCurrent;
if(count >= mustString.length)
break;
}
++current;
}
return current;
}
private void __pushState(int parenFloor) {
int[] state;
int stateEntries, paren;
stateEntries = 3*(__expSize - parenFloor);
if(stateEntries <= 0)
state = new int[3];
else
state = new int[stateEntries + 3];
state[0] = __expSize;
state[1] = __lastParen;
state[2] = __inputOffset;
for(paren = __expSize; paren > parenFloor; --paren, stateEntries-=3) {
state[stateEntries] = __endMatchOffsets[paren];
state[stateEntries + 1] = __beginMatchOffsets[paren];
state[stateEntries + 2] = paren;
}
__stack.push(state);
}
private void __popState() {
int[] state;
int entry, paren;
state = (int[])__stack.pop();
__expSize = state[0];
__lastParen = state[1];
__inputOffset = state[2];
for(entry = 3; entry < state.length; entry+=3) {
paren = state[entry + 2];
__beginMatchOffsets[paren] = state[entry + 1];
if(paren <= __lastParen)
__endMatchOffsets[paren] = state[entry];
}
for(paren = __lastParen + 1; paren <= __numParentheses; paren++) {
if(paren > __expSize)
__beginMatchOffsets[paren] = OpCode._NULL_OFFSET;
__endMatchOffsets[paren] = OpCode._NULL_OFFSET;
}
}
// Initialize globals needed before calling __tryExpression for first time
private void __initInterpreterGlobals(Perl5Pattern expression, char[] input,
int beginOffset, int endOffset,
int currentOffset)
{
// Remove this hack after more efficient case-folding and unicode
// character classes are implemented
__caseInsensitive = expression._isCaseInsensitive;
__input = input;
__endOffset = endOffset;
__currentRep = new Perl5Repetition();
__currentRep._numInstances = 0;
__currentRep._lastRepetition = null;
__program = expression._program;
__stack.setSize(0);
// currentOffset should always be >= beginOffset and should
// always be equal to zero when beginOffset equals 0, but we
// make a weak attempt to protect against a violation of this
// precondition
if(currentOffset == beginOffset || currentOffset <= 0)
__previousChar = '\n';
else {
__previousChar = input[currentOffset - 1];
if(!__multiline && __previousChar == '\n')
__previousChar = '\0';
}
__numParentheses = expression._numParentheses;
__currentOffset = currentOffset;
__bol = beginOffset;
__eol = endOffset;
// Ok, here we're using endOffset as a temporary variable.
endOffset = __numParentheses + 1;
if(__beginMatchOffsets == null || endOffset > __beginMatchOffsets.length) {
if(endOffset < __INITIAL_NUM_OFFSETS)
endOffset = __INITIAL_NUM_OFFSETS;
__beginMatchOffsets = new int[endOffset];
__endMatchOffsets = new int[endOffset];
}
}
// Set the match result information. Only call this if we successfully
// matched.
private void __setLastMatchResult() {
int offs, maxEndOffs = 0;
//endOffset+=dontTry;
__lastMatchResult = new Perl5MatchResult(__numParentheses + 1);
// This can happen when using Perl5StreamInput
if(__endMatchOffsets[0] > __originalInput.length)
throw new ArrayIndexOutOfBoundsException();
__lastMatchResult._matchBeginOffset = __beginMatchOffsets[0];
while(__numParentheses >= 0) {
offs = __beginMatchOffsets[__numParentheses];
if(offs >= 0)
__lastMatchResult._beginGroupOffset[__numParentheses] =
offs - __lastMatchResult._matchBeginOffset;
else
__lastMatchResult._beginGroupOffset[__numParentheses] =
OpCode._NULL_OFFSET;
offs = __endMatchOffsets[__numParentheses];
if(offs >= 0) {
__lastMatchResult._endGroupOffset[__numParentheses] =
offs - __lastMatchResult._matchBeginOffset;
if(offs > maxEndOffs && offs <= __originalInput.length)
maxEndOffs = offs;
} else
__lastMatchResult._endGroupOffset[__numParentheses] =
OpCode._NULL_OFFSET;
--__numParentheses;
}
__lastMatchResult._match =
new String(__originalInput, __beginMatchOffsets[0],
maxEndOffs - __beginMatchOffsets[0]);
// Free up for garbage collection
__originalInput = null;
}
// Expects to receive a valid regular expression program. No checking
// is done to ensure validity.
// __originalInput must be set before calling this method for
// __lastMatchResult to be set correctly.
// beginOffset marks the beginning of the string
// currentOffset marks where to start the pattern search
private boolean __interpret(Perl5Pattern expression, char[] input,
int beginOffset, int endOffset,
int currentOffset)
{
boolean success;
int minLength = 0, dontTry = 0, offset;
char ch, mustString[];
__initInterpreterGlobals(expression, input, beginOffset, endOffset,
currentOffset);
success = false;
mustString = expression._mustString;
_mainLoop:
while(true) {
if(mustString != null &&
((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 ||
((__multiline ||
(expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0)
&& expression._back >= 0)))
{
__currentOffset =
__findFirst(__input, __currentOffset, endOffset, mustString);
if(__currentOffset >= endOffset) {
if((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0)
expression._mustUtility++;
success = false;
break _mainLoop;
} else if(expression._back >= 0) {
__currentOffset-=expression._back;
if(__currentOffset < currentOffset)
__currentOffset = currentOffset;
minLength = expression._back + mustString.length;
} else if(!expression._isExpensive &&
(expression._options & Perl5Compiler.READ_ONLY_MASK) == 0 &&
(--expression._mustUtility < 0)) {
// Be careful! The preceding logical expression is constructed
// so that mustUtility is only decremented if the expression is
// compiled without READ_ONLY_MASK.
mustString = expression._mustString = null;
__currentOffset = currentOffset;
} else {
__currentOffset = currentOffset;
minLength = mustString.length;
}
}
if((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) {
if(__currentOffset == beginOffset && __tryExpression(beginOffset)) {
success = true;
break _mainLoop;
} else if(__multiline
|| (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0
|| (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0)
{
if(minLength > 0)
dontTry = minLength - 1;
endOffset-=dontTry;
if(__currentOffset > currentOffset)
--__currentOffset;
while(__currentOffset < endOffset) {
if(__input[__currentOffset++] == '\n') {
if(__currentOffset < endOffset &&
__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
}
}
}
break _mainLoop;
}
if(expression._startString != null) {
mustString = expression._startString;
if((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) {
ch = mustString[0];
while(__currentOffset < endOffset) {
if(ch == __input[__currentOffset]) {
if(__tryExpression(__currentOffset)){
success = true;
break _mainLoop;
}
++__currentOffset;
while(__currentOffset < endOffset &&
__input[__currentOffset] == ch)
++__currentOffset;
}
++__currentOffset;
}
} else {
while((__currentOffset =
__findFirst(__input, __currentOffset, endOffset, mustString))
< endOffset){
if(__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
++__currentOffset;
}
}
break _mainLoop;
}
if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
boolean doEvery, tmp;
char op;
doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);
if(minLength > 0)
dontTry = minLength - 1;
endOffset -= dontTry;
tmp = true;
switch(op = __program[offset]) {
case OpCode._ANYOF:
offset = OpCode._getOperand(offset);
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(ch < 256 &&
(__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._ANYOFUN:
case OpCode._NANYOFUN:
offset = OpCode._getOperand(offset);
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(__matchUnicodeClass(ch, __program, offset, op)) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._BOUND:
if(minLength > 0) {
++dontTry;
--endOffset;
}
if(__currentOffset != beginOffset) {
ch = __input[__currentOffset - 1];
tmp = OpCode._isWordCharacter(ch);
} else
tmp = OpCode._isWordCharacter(__previousChar);
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(tmp != OpCode._isWordCharacter(ch)){
tmp = !tmp;
if(__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
}
++__currentOffset;
}
if((minLength > 0 || tmp) &&
__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
break;
case OpCode._NBOUND:
if(minLength > 0) {
++dontTry;
--endOffset;
}
if(__currentOffset != beginOffset) {
ch = __input[__currentOffset - 1];
tmp = OpCode._isWordCharacter(ch);
} else
tmp = OpCode._isWordCharacter(__previousChar);
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(tmp != OpCode._isWordCharacter(ch))
tmp = !tmp;
else if(__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
++__currentOffset;
}
if((minLength > 0 || !tmp) &&
__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
break;
case OpCode._ALNUM:
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(OpCode._isWordCharacter(ch)) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._NALNUM:
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(!OpCode._isWordCharacter(ch)) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._SPACE:
while(__currentOffset < endOffset) {
if(Character.isWhitespace(__input[__currentOffset])) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._NSPACE:
while(__currentOffset < endOffset) {
if(!Character.isWhitespace(__input[__currentOffset])) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._DIGIT:
while(__currentOffset < endOffset) {
if(Character.isDigit(__input[__currentOffset])) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
case OpCode._NDIGIT:
while(__currentOffset < endOffset) {
if(!Character.isDigit(__input[__currentOffset])) {
if(tmp && __tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
} else
tmp = doEvery;
} else
tmp = true;
++__currentOffset;
}
break;
} // end switch
} else {
if(minLength > 0)
dontTry = minLength - 1;
endOffset-=dontTry;
do {
if(__tryExpression(__currentOffset)) {
success = true;
break _mainLoop;
}
} while(__currentOffset++ < endOffset);
}
break _mainLoop;
} // end while
__lastSuccess = success;
__lastMatchResult = null;
return success;
}
private boolean __matchUnicodeClass(char code, char __program[],
int offset ,char opcode)
{
boolean isANYOF = ( opcode == OpCode._ANYOFUN );
while( __program[offset] != OpCode._END ){
if( __program[offset] == OpCode._RANGE ){
offset++;
if((code >= __program[offset]) && (code <= __program[offset+1])){
return isANYOF;
} else {
offset+=2;
}
} else if(__program[offset] == OpCode._ONECHAR) {
offset++;
if(__program[offset++] == code) return isANYOF;
} else {
isANYOF = (__program[offset] == OpCode._OPCODE)
? isANYOF : !isANYOF;
offset++;
switch ( __program[offset++] ) {
case OpCode._ALNUM:
if(OpCode._isWordCharacter(code)) return isANYOF;
break;
case OpCode._NALNUM:
if(!OpCode._isWordCharacter(code)) return isANYOF;
break;
case OpCode._SPACE:
if(Character.isWhitespace(code)) return isANYOF;
break;
case OpCode._NSPACE:
if(!Character.isWhitespace(code)) return isANYOF;
break;
case OpCode._DIGIT:
if(Character.isDigit(code)) return isANYOF;
break;
case OpCode._NDIGIT:
if(!Character.isDigit(code)) return isANYOF;
break;
case OpCode._ALNUMC:
if(Character.isLetterOrDigit(code)) return isANYOF;
break;
case OpCode._ALPHA:
if(Character.isLetter(code)) return isANYOF;
break;
case OpCode._BLANK:
if(Character.isSpaceChar(code)) return isANYOF;
break;
case OpCode._CNTRL:
if(Character.isISOControl(code)) return isANYOF;
break;
case OpCode._LOWER:
if(Character.isLowerCase(code)) return isANYOF;
// Remove this hack after more efficient case-folding and unicode
// character classes are implemented
if(__caseInsensitive && Character.isUpperCase(code))
return isANYOF;
break;
case OpCode._UPPER:
if(Character.isUpperCase(code)) return isANYOF;
// Remove this hack after more efficient case-folding and unicode
// character classes are implemented
if(__caseInsensitive && Character.isLowerCase(code))
return isANYOF;
break;
case OpCode._PRINT:
if(Character.isSpaceChar(code)) return isANYOF;
// Fall through to check if the character is alphanumeric,
// or a punctuation mark. Printable characters are either
// alphanumeric, punctuation marks, or spaces.
case OpCode._GRAPH:
if(Character.isLetterOrDigit(code)) return isANYOF;
// Fall through to check if the character is a punctuation mark.
// Graph characters are either alphanumeric or punctuation.
case OpCode._PUNCT:
switch ( Character.getType(code) ) {
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
return isANYOF;
default:
break;
}
break;
case OpCode._XDIGIT:
if( (code >= '0' && code <= '9') ||
(code >= 'a' && code <= 'f') ||
(code >= 'A' && code <= 'F')) return isANYOF;
break;
case OpCode._ASCII:
if(code < 0x80)return isANYOF;
}
}
}
return !isANYOF;
}
private boolean __tryExpression(int offset) {
int count;
__inputOffset = offset;
__lastParen = 0;
__expSize = 0;
if(__numParentheses > 0) {
for(count=0; count <= __numParentheses; count++) {
__beginMatchOffsets[count] = OpCode._NULL_OFFSET;
__endMatchOffsets[count] = OpCode._NULL_OFFSET;
}
}
if(__match(1)){
__beginMatchOffsets[0] = offset;
__endMatchOffsets[0] = __inputOffset;
return true;
}
return false;
}
private int __repeat(int offset, int max) {
int scan, eol, operand, ret;
char ch;
char op;
scan = __inputOffset;
eol = __eol;
if(max != Character.MAX_VALUE && max < eol - scan)
eol = scan + max;
operand = OpCode._getOperand(offset);
switch(op = __program[offset]) {
case OpCode._ANY:
while(scan < eol && __input[scan] != '\n')
++scan;
break;
case OpCode._SANY:
scan = eol;
break;
case OpCode._EXACTLY:
++operand;
while(scan < eol && __program[operand] == __input[scan])
++scan;
break;
case OpCode._ANYOF:
if(scan < eol && (ch = __input[scan]) < 256) {
while((ch < 256 ) && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
if(++scan < eol)
ch = __input[scan];
else
break;
}
}
break;
case OpCode._ANYOFUN:
case OpCode._NANYOFUN:
if(scan < eol) {
ch = __input[scan];
while(__matchUnicodeClass(ch, __program, operand, op)){
if(++scan < eol)
ch = __input[scan];
else
break;
}
}
break;
case OpCode._ALNUM:
while(scan < eol && OpCode._isWordCharacter(__input[scan]))
++scan;
break;
case OpCode._NALNUM:
while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
++scan;
break;
case OpCode._SPACE:
while(scan < eol && Character.isWhitespace(__input[scan]))
++scan;
break;
case OpCode._NSPACE:
while(scan < eol && !Character.isWhitespace(__input[scan]))
++scan;
break;
case OpCode._DIGIT:
while(scan < eol && Character.isDigit(__input[scan]))
++scan;
break;
case OpCode._NDIGIT:
while(scan < eol && !Character.isDigit(__input[scan]))
++scan;
break;
default:
break;
}
ret = scan - __inputOffset;
__inputOffset = scan;
return ret;
}
private boolean __match(int offset) {
char nextChar, op;
int scan, next, input, maxScan, current, line, arg;
boolean inputRemains = true, minMod = false;
Perl5Repetition rep;
input = __inputOffset;
inputRemains = (input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
scan = offset;
maxScan = __program.length;
while(scan < maxScan /*&& scan > 0*/){
next = OpCode._getNext(__program, scan);
switch(op = __program[scan]) {
case OpCode._BOL:
if(input == __bol ? __previousChar == '\n' :
(__multiline && (inputRemains || input < __eol) &&
__input[input - 1] == '\n'))
break;
return false;
case OpCode._MBOL:
if(input == __bol ? __previousChar == '\n' :
((inputRemains || input < __eol) && __input[input - 1] == '\n'))
break;
return false;
case OpCode._SBOL:
if(input == __bol && __previousChar == '\n')
break;
return false;
case OpCode._GBOL:
if(input == __bol)
break;
return true;
case OpCode._EOL :
if((inputRemains || input < __eol) && nextChar != '\n')
return false;
if(!__multiline && __eol - input > 1)
return false;
break;
case OpCode._MEOL:
if((inputRemains || input < __eol) && nextChar != '\n')
return false;
break;
case OpCode._SEOL:
if((inputRemains || input < __eol) && nextChar != '\n')
return false;
if(__eol - input > 1)
return false;
break;
case OpCode._SANY:
if(!inputRemains && input >= __eol)
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._ANY:
if((!inputRemains && input >= __eol) || nextChar == '\n')
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._EXACTLY:
current = OpCode._getOperand(scan);
line = __program[current++];
if(__program[current] != nextChar)
return false;
if(__eol - input < line)
return false;
if(line > 1 && !__compare(__program, current, __input, input, line))
return false;
input+=line;
inputRemains = (input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._ANYOF:
current = OpCode._getOperand(scan);
if(nextChar == __EOS && inputRemains)
nextChar = __input[input];
if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
(1 << (nextChar & 0xf))) != 0)
return false;
if(!inputRemains && input >= __eol)
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._ANYOFUN:
case OpCode._NANYOFUN:
current = OpCode._getOperand(scan);
if(nextChar == __EOS && inputRemains)
nextChar = __input[input];
if(!__matchUnicodeClass(nextChar, __program, current, op))
return false;
if(!inputRemains && input >= __eol)
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._ALNUM:
if(!inputRemains)
return false;
if(!OpCode._isWordCharacter(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._NALNUM:
if(!inputRemains && input >= __eol)
return false;
if(OpCode._isWordCharacter(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._NBOUND:
case OpCode._BOUND:
boolean a, b;
if(input == __bol)
a = OpCode._isWordCharacter(__previousChar);
else
a = OpCode._isWordCharacter(__input[input - 1]);
b = OpCode._isWordCharacter(nextChar);
if((a == b) == (__program[scan] == OpCode._BOUND))
return false;
break;
case OpCode._SPACE:
if(!inputRemains && input >= __eol)
return false;
if(!Character.isWhitespace(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._NSPACE:
if(!inputRemains)
return false;
if(Character.isWhitespace(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._DIGIT:
if(!Character.isDigit(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._NDIGIT:
if(!inputRemains && input >= __eol)
return false;
if(Character.isDigit(nextChar))
return false;
inputRemains = (++input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._REF:
arg = OpCode._getArg1(__program, scan);
current = __beginMatchOffsets[arg];
if(current == OpCode._NULL_OFFSET)
return false;
if(__endMatchOffsets[arg] == OpCode._NULL_OFFSET)
return false;
if(current == __endMatchOffsets[arg])
break;
if(__input[current] != nextChar)
return false;
line = __endMatchOffsets[arg] - current;
if(input + line > __eol)
return false;
if(line > 1 && !__compare(__input, current, __input, input, line))
return false;
input+=line;
inputRemains = (input < __endOffset);
nextChar = (inputRemains ? __input[input] : __EOS);
break;
case OpCode._NOTHING:
break;
case OpCode._BACK:
break;
case OpCode._OPEN:
arg = OpCode._getArg1(__program, scan);
__beginMatchOffsets[arg] = input;
if(arg > __expSize)
__expSize = arg;
break;
case OpCode._CLOSE:
arg = OpCode._getArg1(__program, scan);
__endMatchOffsets[arg] = input;
if(arg > __lastParen)
__lastParen = arg;
break;
case OpCode._CURLYX:
rep = new Perl5Repetition();
rep._lastRepetition = __currentRep;
__currentRep = rep;
rep._parenFloor = __lastParen;
rep._numInstances = -1;
rep._min = OpCode._getArg1(__program, scan);
rep._max = OpCode._getArg2(__program, scan);
rep._scan = OpCode._getNextOperator(scan) + 2;
rep._next = next;
rep._minMod = minMod;
// Must initialize to -1 because if we initialize to 0 and are
// at the beginning of the input the OpCode._WHILEM case will
// not work right.
rep._lastLocation = -1;
__inputOffset = input;
// use minMod as temporary
minMod = __match(OpCode._getPrevOperator(next));
// leave scope call not pertinent?
__currentRep = rep._lastRepetition;
return minMod;
case OpCode._WHILEM:
rep = __currentRep;
arg = rep._numInstances + 1;
__inputOffset = input;
if(input == rep._lastLocation) {
__currentRep = rep._lastRepetition;
line = __currentRep._numInstances;
if(__match(rep._next))
return true;
__currentRep._numInstances = line;
__currentRep = rep;
return false;
}
if(arg < rep._min) {
rep._numInstances = arg;
rep._lastLocation = input;
if(__match(rep._scan))
return true;
rep._numInstances = arg - 1;
return false;
}
if(rep._minMod) {
__currentRep = rep._lastRepetition;
line = __currentRep._numInstances;
if(__match(rep._next))
return true;
__currentRep._numInstances = line;
__currentRep = rep;
if(arg >= rep._max)
return false;
__inputOffset = input;
rep._numInstances = arg;
rep._lastLocation = input;
if(__match(rep._scan))
return true;
rep._numInstances = arg - 1;
return false;
}
if(arg < rep._max) {
__pushState(rep._parenFloor);
rep._numInstances = arg;
rep._lastLocation = input;
if(__match(rep._scan))
return true;
__popState();
__inputOffset = input;
}
__currentRep = rep._lastRepetition;
line = __currentRep._numInstances;
if(__match(rep._next))
return true;
rep._numInstances = line;
__currentRep = rep;
rep._numInstances = arg - 1;
return false;
case OpCode._BRANCH:
if(__program[next] != OpCode._BRANCH)
next = OpCode._getNextOperator(scan);
else {
int lastParen;
lastParen = __lastParen;
do {
__inputOffset = input;
if(__match(OpCode._getNextOperator(scan)))
return true;
for(arg = __lastParen; arg > lastParen; --arg)
//__endMatchOffsets[arg] = 0;
__endMatchOffsets[arg] = OpCode._NULL_OFFSET;
__lastParen = arg;
scan = OpCode._getNext(__program, scan);
} while(scan != OpCode._NULL_OFFSET &&
__program[scan] == OpCode._BRANCH);
return false;
}
break;
case OpCode._MINMOD:
minMod = true;
break;
case OpCode._CURLY:
case OpCode._STAR:
case OpCode._PLUS:
if(op == OpCode._CURLY) {
line = OpCode._getArg1(__program, scan);
arg = OpCode._getArg2(__program, scan);
scan = OpCode._getNextOperator(scan) + 2;
} else if(op == OpCode._STAR) {
line = 0;
arg = Character.MAX_VALUE;
scan = OpCode._getNextOperator(scan);
} else {
line = 1;
arg = Character.MAX_VALUE;
scan = OpCode._getNextOperator(scan);
}
if(__program[next] == OpCode._EXACTLY) {
nextChar = __program[OpCode._getOperand(next) + 1];
current = 0;
} else {
nextChar = __EOS;
current = -1000;
}
__inputOffset = input;
if(minMod) {
minMod = false;
if(line > 0 && __repeat(scan, line) < line)
return false;
while(arg >= line || (arg == Character.MAX_VALUE && line > 0)) {
// there may be a bug here with respect to
// __inputOffset >= __endOffset, but it seems to be right for
// now. the issue is with __inputOffset being reset later.
// is this test really supposed to happen here?
if(current == -1000 || __inputOffset >= __endOffset ||
__input[__inputOffset] == nextChar) {
if(__match(next))
return true;
}
__inputOffset = input + line;
if(__repeat(scan, 1) != 0) {
++line;
__inputOffset = input + line;
} else
return false;
}
} else {
arg = __repeat(scan, arg);
if(line < arg && OpCode._opType[__program[next]] == OpCode._EOL &&
((!__multiline && __program[next] != OpCode._MEOL) ||
__program[next] == OpCode._SEOL))
line = arg;
while(arg >= line) {
// there may be a bug here with respect to
// __inputOffset >= __endOffset, but it seems to be right for
// now. the issue is with __inputOffset being reset later.
// is this test really supposed to happen here?
if(current == -1000 || __inputOffset >= __endOffset ||
__input[__inputOffset] == nextChar) {
if(__match(next))
return true;
}
--arg;
__inputOffset = input + arg;
}
}
return false;
case OpCode._SUCCEED:
case OpCode._END:
__inputOffset = input;
// This enforces the rule that two consecutive matches cannot have
// the same end offset.
if(__inputOffset == __lastMatchInputEndOffset)
return false;
return true;
case OpCode._IFMATCH:
__inputOffset = input;
scan = OpCode._getNextOperator(scan);
if(!__match(scan))
return false;
break;
case OpCode._UNLESSM:
__inputOffset = input;
scan = OpCode._getNextOperator(scan);
if(__match(scan))
return false;
break;
default:
// todo: Need to throw an exception here.
} // end switch
//scan = (next > 0 ? next : 0);
scan = next;
} // end while scan
return false;
}
/**
* Set whether or not subsequent calls to {@link #matches matches()}
* or {@link #contains contains()} should treat the input as
* consisting of multiple lines. The default behavior is for
* input to be treated as consisting of multiple lines. This method
* should only be called if the Perl5Pattern used for a match was
* compiled without either of the Perl5Compiler.MULTILINE_MASK or
* Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the
* behavior of how the ^, $, and . metacharacters are
* interpreted on the fly. The compilation options used when compiling
* a pattern ALWAYS override the behavior specified by setMultiline(). See
* {@link Perl5Compiler} for more details.
*
* @param multiline If set to true treats the input as consisting of
* multiple lines with respect to the ^ and $
* metacharacters. If set to false treats the input as consisting
* of a single line with respect to the ^ and $
* metacharacters.
*/
public void setMultiline(boolean multiline) { __multiline = multiline; }
/**
* @return True if the matcher is treating input as consisting of multiple
* lines with respect to the ^ and $ metacharacters,
* false otherwise.
*/
public boolean isMultiline() { return __multiline; }
char[] _toLower(char[] input) {
int current;
char[] inp;
// todo:
// Certainly not the best way to do case insensitive matching.
// Must definitely change this in some way, but for now we
// do what Perl does and make a copy of the input, converting
// it all to lowercase. This is truly better handled in the
// compilation phase.
inp = new char[input.length];
System.arraycopy(input, 0, inp, 0, input.length);
input = inp;
// todo: Need to inline toLowerCase()
for(current = 0; current < input.length; current++)
if(Character.isUpperCase(input[current]))
input[current] = Character.toLowerCase(input[current]);
return input;
}
/**
* Determines if a prefix of a string (represented as a char[])
* matches a given pattern, starting from a given offset into the string.
* If a prefix of the string matches the pattern, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}.
*
* This method is useful for certain common token identification tasks
* that are made more difficult without this functionality.
*
* @param input The char[] to test for a prefix match.
* @param pattern The Pattern to be matched.
* @param offset The offset at which to start searching for the prefix.
* @return True if input matches pattern, false otherwise.
*/
public boolean matchesPrefix(char[] input, Pattern pattern, int offset) {
Perl5Pattern expression;
expression = (Perl5Pattern)pattern;
__originalInput = input;
if(expression._isCaseInsensitive)
input = _toLower(input);
__initInterpreterGlobals(expression, input, 0, input.length, offset);
__lastSuccess = __tryExpression(offset);
__lastMatchResult = null;
return __lastSuccess;
}
/**
* Determines if a prefix of a string (represented as a char[])
* matches a given pattern.
* If a prefix of the string matches the pattern, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}.
*
* This method is useful for certain common token identification tasks
* that are made more difficult without this functionality.
*
* @param input The char[] to test for a prefix match.
* @param pattern The Pattern to be matched.
* @return True if input matches pattern, false otherwise.
*/
public boolean matchesPrefix(char[] input, Pattern pattern) {
return matchesPrefix(input, pattern, 0);
}
/**
* Determines if a prefix of a string matches a given pattern.
* If a prefix of the string matches the pattern, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}.
*
* This method is useful for certain common token identification tasks
* that are made more difficult without this functionality.
*
* @param input The String to test for a prefix match.
* @param pattern The Pattern to be matched.
* @return True if input matches pattern, false otherwise.
*/
public boolean matchesPrefix(String input, Pattern pattern) {
return matchesPrefix(input.toCharArray(), pattern, 0);
}
/**
* Determines if a prefix of a PatternMatcherInput instance
* matches a given pattern. If there is a match, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}. Unlike the
* {@link #contains(PatternMatcherInput, Pattern)}
* method, the current offset of the PatternMatcherInput argument
* is not updated. However, unlike the
* {@link #matches matches(PatternMatcherInput, Pattern)} method,
* matchesPrefix() will start its search from the current offset
* rather than the begin offset of the PatternMatcherInput.
*
* This method is useful for certain common token identification tasks
* that are made more difficult without this functionality.
*
* @param input The PatternMatcherInput to test for a prefix match.
* @param pattern The Pattern to be matched.
* @return True if input matches pattern, false otherwise.
*/
public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern) {
char[] inp;
Perl5Pattern expression;
expression = (Perl5Pattern)pattern;
__originalInput = input._originalBuffer;
if(expression._isCaseInsensitive) {
if(input._toLowerBuffer == null)
input._toLowerBuffer = _toLower(__originalInput);
inp = input._toLowerBuffer;
} else
inp = __originalInput;
__initInterpreterGlobals(expression, inp, input._beginOffset,
input._endOffset, input._currentOffset);
__lastSuccess = __tryExpression(input._currentOffset);
__lastMatchResult = null;
return __lastSuccess;
}
/**
* Determines if a string (represented as a char[]) exactly
* matches a given pattern. If
* there is an exact match, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}. The pattern must be
* a Perl5Pattern instance, otherwise a ClassCastException will
* be thrown. You are not required to, and indeed should NOT try to
* (for performance reasons), catch a ClassCastException because it
* will never be thrown as long as you use a Perl5Pattern as the pattern
* parameter.
*
* Note: matches() is not the same as sticking a ^ in front of
* your expression and a $ at the end of your expression in Perl5
* and using the =~ operator, even though in many cases it will be
* equivalent. matches() literally looks for an exact match according
* to the rules of Perl5 expression matching. Therefore, if you have
* a pattern foo|foot and are matching the input foot
* it will not produce an exact match. But foot|foo will
* produce an exact match for either foot or foo.
* Remember, Perl5 regular expressions do not match the longest
* possible match. From the perlre manpage:
*
* Alternatives are tried from left to right, so the first
* alternative found for which the entire expression matches,
* is the one that is chosen. This means that alternatives
* are not necessarily greedy. For example: when matching
* foo|foot against "barefoot", only the "foo" part will
* match, as that is the first alternative tried, and it
* successfully matches the target string.
*
*
* @param input The char[] to test for an exact match.
* @param pattern The Perl5Pattern to be matched.
* @return True if input matches pattern, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean matches(char[] input, Pattern pattern) {
Perl5Pattern expression;
expression = (Perl5Pattern)pattern;
__originalInput = input;
if(expression._isCaseInsensitive)
input = _toLower(input);
__initInterpreterGlobals(expression, input, 0, input.length, 0);
__lastSuccess = (__tryExpression(0) &&
__endMatchOffsets[0] == input.length);
__lastMatchResult = null;
return __lastSuccess;
}
/**
* Determines if a string exactly matches a given pattern. If
* there is an exact match, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}. The pattern must be
* a Perl5Pattern instance, otherwise a ClassCastException will
* be thrown. You are not required to, and indeed should NOT try to
* (for performance reasons), catch a ClassCastException because it
* will never be thrown as long as you use a Perl5Pattern as the pattern
* parameter.
*
* Note: matches() is not the same as sticking a ^ in front of
* your expression and a $ at the end of your expression in Perl5
* and using the =~ operator, even though in many cases it will be
* equivalent. matches() literally looks for an exact match according
* to the rules of Perl5 expression matching. Therefore, if you have
* a pattern foo|foot and are matching the input foot
* it will not produce an exact match. But foot|foo will
* produce an exact match for either foot or foo.
* Remember, Perl5 regular expressions do not match the longest
* possible match. From the perlre manpage:
*
* Alternatives are tried from left to right, so the first
* alternative found for which the entire expression matches,
* is the one that is chosen. This means that alternatives
* are not necessarily greedy. For example: when matching
* foo|foot against "barefoot", only the "foo" part will
* match, as that is the first alternative tried, and it
* successfully matches the target string.
*
*
* @param input The String to test for an exact match.
* @param pattern The Perl5Pattern to be matched.
* @return True if input matches pattern, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean matches(String input, Pattern pattern) {
return matches(input.toCharArray(), pattern);
}
/**
* Determines if the contents of a PatternMatcherInput instance
* exactly matches a given pattern. If
* there is an exact match, a MatchResult instance
* representing the match is made accesible via
* {@link #getMatch()}. Unlike the
* {@link #contains(PatternMatcherInput, Pattern)}
* method, the current offset of the PatternMatcherInput argument
* is not updated. You should remember that the region between
* the begin (NOT the current) and end offsets of the PatternMatcherInput
* will be tested for an exact match.
*
* The pattern must be a Perl5Pattern instance, otherwise a
* ClassCastException will be thrown. You are not required to, and
* indeed should NOT try to (for performance reasons), catch a
* ClassCastException because it will never be thrown as long as you use
* a Perl5Pattern as the pattern parameter.
*
* Note: matches() is not the same as sticking a ^ in front of
* your expression and a $ at the end of your expression in Perl5
* and using the =~ operator, even though in many cases it will be
* equivalent. matches() literally looks for an exact match according
* to the rules of Perl5 expression matching. Therefore, if you have
* a pattern foo|foot and are matching the input foot
* it will not produce an exact match. But foot|foo will
* produce an exact match for either foot or foo.
* Remember, Perl5 regular expressions do not match the longest
* possible match. From the perlre manpage:
*
* Alternatives are tried from left to right, so the first
* alternative found for which the entire expression matches,
* is the one that is chosen. This means that alternatives
* are not necessarily greedy. For example: when matching
* foo|foot against "barefoot", only the "foo" part will
* match, as that is the first alternative tried, and it
* successfully matches the target string.
*
*
* @param input The PatternMatcherInput to test for a match.
* @param pattern The Perl5Pattern to be matched.
* @return True if input matches pattern, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean matches(PatternMatcherInput input, Pattern pattern) {
char[] inp;
Perl5Pattern expression;
expression = (Perl5Pattern)pattern;
__originalInput = input._originalBuffer;
if(expression._isCaseInsensitive) {
if(input._toLowerBuffer == null)
input._toLowerBuffer = _toLower(__originalInput);
inp = input._toLowerBuffer;
} else
inp = __originalInput;
__initInterpreterGlobals(expression, inp, input._beginOffset,
input._endOffset, input._beginOffset);
__lastMatchResult = null;
if(__tryExpression(input._beginOffset)) {
if(__endMatchOffsets[0] == input._endOffset ||
input.length() == 0 || input._beginOffset == input._endOffset) {
__lastSuccess = true;
return true;
}
}
__lastSuccess = false;
return false;
}
/**
* Determines if a string contains a pattern. If the pattern is
* matched by some substring of the input, a MatchResult instance
* representing the first such match is made acessible via
* {@link #getMatch()}. If you want to access
* subsequent matches you should either use a PatternMatcherInput object
* or use the offset information in the MatchResult to create a substring
* representing the remaining input. Using the MatchResult offset
* information is the recommended method of obtaining the parts of the
* string preceeding the match and following the match.
*
* The pattern must be a Perl5Pattern instance, otherwise a
* ClassCastException will be thrown. You are not required to, and
* indeed should NOT try to (for performance reasons), catch a
* ClassCastException because it will never be thrown as long as you use
* a Perl5Pattern as the pattern parameter.
*
* @param input The String to test for a match.
* @param pattern The Perl5Pattern to be matched.
* @return True if the input contains a pattern match, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean contains(String input, Pattern pattern) {
return contains(input.toCharArray(), pattern);
}
/**
* Determines if a string (represented as a char[]) contains a pattern.
* If the pattern is
* matched by some substring of the input, a MatchResult instance
* representing the first such match is made acessible via
* {@link #getMatch()}. If you want to access
* subsequent matches you should either use a PatternMatcherInput object
* or use the offset information in the MatchResult to create a substring
* representing the remaining input. Using the MatchResult offset
* information is the recommended method of obtaining the parts of the
* string preceeding the match and following the match.
*
* The pattern must be a Perl5Pattern instance, otherwise a
* ClassCastException will be thrown. You are not required to, and
* indeed should NOT try to (for performance reasons), catch a
* ClassCastException because it will never be thrown as long as you use
* a Perl5Pattern as the pattern parameter.
*
* @param input The char[] to test for a match.
* @param pattern The Perl5Pattern to be matched.
* @return True if the input contains a pattern match, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean contains(char[] input, Pattern pattern) {
Perl5Pattern expression;
expression = (Perl5Pattern)pattern;
__originalInput = input;
if(expression._isCaseInsensitive)
input = _toLower(input);
return __interpret(expression, input, 0, input.length, 0);
}
private static final int __DEFAULT_LAST_MATCH_END_OFFSET = -100;
private int __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET;
/**
* Determines if the contents of a PatternMatcherInput, starting from the
* current offset of the input contains a pattern.
* If a pattern match is found, a MatchResult
* instance representing the first such match is made acessible via
* {@link #getMatch()}. The current offset of the
* PatternMatcherInput is set to the offset corresponding to the end
* of the match, so that a subsequent call to this method will continue
* searching where the last call left off. You should remember that the
* region between the begin and end offsets of the PatternMatcherInput are
* considered the input to be searched, and that the current offset
* of the PatternMatcherInput reflects where a search will start from.
* Matches extending beyond the end offset of the PatternMatcherInput
* will not be matched. In other words, a match must occur entirely
* between the begin and end offsets of the input. See
* {@link PatternMatcherInput} for more details.
*
* As a side effect, if a match is found, the PatternMatcherInput match
* offset information is updated. See the
* {@link PatternMatcherInput#setMatchOffsets(int, int)}
* method for more details.
*
* The pattern must be a Perl5Pattern instance, otherwise a
* ClassCastException will be thrown. You are not required to, and
* indeed should NOT try to (for performance reasons), catch a
* ClassCastException because it will never be thrown as long as you use
* a Perl5Pattern as the pattern parameter.
*
* This method is usually used in a loop as follows:
*
* PatternMatcher matcher;
* PatternCompiler compiler;
* Pattern pattern;
* PatternMatcherInput input;
* MatchResult result;
*
* compiler = new Perl5Compiler();
* matcher = new Perl5Matcher();
*
* try {
* pattern = compiler.compile(somePatternString);
* } catch(MalformedPatternException e) {
* System.err.println("Bad pattern.");
* System.err.println(e.getMessage());
* return;
* }
*
* input = new PatternMatcherInput(someStringInput);
*
* while(matcher.contains(input, pattern)) {
* result = matcher.getMatch();
* // Perform whatever processing on the result you want.
* }
*
*
*
* @param input The PatternMatcherInput to test for a match.
* @param pattern The Pattern to be matched.
* @return True if the input contains a pattern match, false otherwise.
* @exception ClassCastException If a Pattern instance other than a
* Perl5Pattern is passed as the pattern parameter.
*/
public boolean contains(PatternMatcherInput input, Pattern pattern) {
char[] inp;
Perl5Pattern expression;
boolean matchFound;
//if(input.length() > 0) {
// We want to allow a null string to match at the end of the input
// which is why we don't check endOfInput. Not sure if this is a
// safe thing to do or not.
if(input._currentOffset > input._endOffset)
return false;
//}
/* else
if(input._endOfInput())
return false;
*/
expression = (Perl5Pattern)pattern;
__originalInput = input._originalBuffer;
// Todo:
// Really should only reduce to lowercase that part of the
// input that is necessary, instead of the whole thing.
// Adjust MatchResult offsets accordingly. Actually, pass an adjustment
// value to __interpret.
__originalInput = input._originalBuffer;
if(expression._isCaseInsensitive) {
if(input._toLowerBuffer == null)
input._toLowerBuffer = _toLower(__originalInput);
inp = input._toLowerBuffer;
} else
inp = __originalInput;
__lastMatchInputEndOffset = input.getMatchEndOffset();
matchFound =
__interpret(expression, inp, input._beginOffset, input._endOffset,
input._currentOffset);
if(matchFound) {
input.setCurrentOffset(__endMatchOffsets[0]);
input.setMatchOffsets(__beginMatchOffsets[0], __endMatchOffsets[0]);
} else {
input.setCurrentOffset(input._endOffset + 1);
}
// Restore so it doesn't interfere with other unrelated matches.
__lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET;
return matchFound;
}
/**
* Fetches the last match found by a call to a matches() or contains()
* method. If you plan on modifying the original search input, you
* must call this method BEFORE you modify the original search input,
* as a lazy evaluation technique is used to create the MatchResult.
* This reduces the cost of pattern matching when you don't care about
* the actual match and only care if the pattern occurs in the input.
* Otherwise, a MatchResult would be created for every match found,
* whether or not the MatchResult was later used by a call to getMatch().
*
* @return A MatchResult instance containing the pattern match found
* by the last call to any one of the matches() or contains()
* methods. If no match was found by the last call, returns
* null.
*/
public MatchResult getMatch() {
if(!__lastSuccess)
return null;
if(__lastMatchResult == null)
__setLastMatchResult();
return __lastMatchResult;
}
}