All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.oro.text.regex.Perl5Matcher Maven / Gradle / Ivy

/*
 * $Id: Perl5Matcher.java,v 1.27 2003/11/07 20:16:25 dfs Exp $
 *
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache" 
 *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
 *    name, without prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */


package org.apache.oro.text.regex;

import java.util.*;

/**
 * The Perl5Matcher class is used to match regular expressions
 * (conforming to the Perl5 regular expression syntax) generated by
 * Perl5Compiler.
 * <p>
* Perl5Compiler and Perl5Matcher are designed with the intent that * you use a separate instance of each per thread to avoid the overhead * of both synchronization and concurrent access (e.g., a match that takes * a long time in one thread will block the progress of another thread with * a shorter match). If you want to use a single instance of each * in a concurrent program, you must appropriately protect access to * the instances with critical sections. If you want to share Perl5Pattern * instances between concurrently executing instances of Perl5Matcher, you * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}. * * @version @version@ * @since 1.0 * @see PatternMatcher * @see Perl5Compiler */ public final class Perl5Matcher implements PatternMatcher { private static final char __EOS = Character.MAX_VALUE; private static final int __INITIAL_NUM_OFFSETS = 20; private boolean __multiline = false, __lastSuccess = false; private boolean __caseInsensitive = false; private char __previousChar, __input[], __originalInput[]; private Perl5Repetition __currentRep; private int __numParentheses, __bol, __eol, __currentOffset, __endOffset; private char[] __program; private int __expSize, __inputOffset, __lastParen; private int[] __beginMatchOffsets, __endMatchOffsets; private Stack __stack = new Stack(); private Perl5MatchResult __lastMatchResult = null; private static boolean __compare(char[] s1, int s1Offs, char[] s2, int s2Offs, int n) { int cnt; for(cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) { if(s1Offs >= s1.length) return false; if(s2Offs >= s2.length) return false; if(s1[s1Offs] != s2[s2Offs]) return false; } return true; } private static int __findFirst(char[] input, int current, int endOffset, char[] mustString) { int count, saveCurrent; char ch; if(input.length == 0) return endOffset; ch = mustString[0]; // Find the offset of the first character of the must string while(current < endOffset) { if(ch == input[current]){ saveCurrent = current; count = 
0; while(current < endOffset && count < mustString.length) { if(mustString[count] != input[current]) break; ++count; ++current; } current = saveCurrent; if(count >= mustString.length) break; } ++current; } return current; } private void __pushState(int parenFloor) { int[] state; int stateEntries, paren; stateEntries = 3*(__expSize - parenFloor); if(stateEntries <= 0) state = new int[3]; else state = new int[stateEntries + 3]; state[0] = __expSize; state[1] = __lastParen; state[2] = __inputOffset; for(paren = __expSize; paren > parenFloor; --paren, stateEntries-=3) { state[stateEntries] = __endMatchOffsets[paren]; state[stateEntries + 1] = __beginMatchOffsets[paren]; state[stateEntries + 2] = paren; } __stack.push(state); } private void __popState() { int[] state; int entry, paren; state = (int[])__stack.pop(); __expSize = state[0]; __lastParen = state[1]; __inputOffset = state[2]; for(entry = 3; entry < state.length; entry+=3) { paren = state[entry + 2]; __beginMatchOffsets[paren] = state[entry + 1]; if(paren <= __lastParen) __endMatchOffsets[paren] = state[entry]; } for(paren = __lastParen + 1; paren <= __numParentheses; paren++) { if(paren > __expSize) __beginMatchOffsets[paren] = OpCode._NULL_OFFSET; __endMatchOffsets[paren] = OpCode._NULL_OFFSET; } } // Initialize globals needed before calling __tryExpression for first time private void __initInterpreterGlobals(Perl5Pattern expression, char[] input, int beginOffset, int endOffset, int currentOffset) { // Remove this hack after more efficient case-folding and unicode // character classes are implemented __caseInsensitive = expression._isCaseInsensitive; __input = input; __endOffset = endOffset; __currentRep = new Perl5Repetition(); __currentRep._numInstances = 0; __currentRep._lastRepetition = null; __program = expression._program; __stack.setSize(0); // currentOffset should always be >= beginOffset and should // always be equal to zero when beginOffset equals 0, but we // make a weak attempt to protect against 
a violation of this // precondition if(currentOffset == beginOffset || currentOffset <= 0) __previousChar = '\n'; else { __previousChar = input[currentOffset - 1]; if(!__multiline && __previousChar == '\n') __previousChar = '\0'; } __numParentheses = expression._numParentheses; __currentOffset = currentOffset; __bol = beginOffset; __eol = endOffset; // Ok, here we're using endOffset as a temporary variable. endOffset = __numParentheses + 1; if(__beginMatchOffsets == null || endOffset > __beginMatchOffsets.length) { if(endOffset < __INITIAL_NUM_OFFSETS) endOffset = __INITIAL_NUM_OFFSETS; __beginMatchOffsets = new int[endOffset]; __endMatchOffsets = new int[endOffset]; } } // Set the match result information. Only call this if we successfully // matched. private void __setLastMatchResult() { int offs, maxEndOffs = 0; //endOffset+=dontTry; __lastMatchResult = new Perl5MatchResult(__numParentheses + 1); // This can happen when using Perl5StreamInput if(__endMatchOffsets[0] > __originalInput.length) throw new ArrayIndexOutOfBoundsException(); __lastMatchResult._matchBeginOffset = __beginMatchOffsets[0]; while(__numParentheses >= 0) { offs = __beginMatchOffsets[__numParentheses]; if(offs >= 0) __lastMatchResult._beginGroupOffset[__numParentheses] = offs - __lastMatchResult._matchBeginOffset; else __lastMatchResult._beginGroupOffset[__numParentheses] = OpCode._NULL_OFFSET; offs = __endMatchOffsets[__numParentheses]; if(offs >= 0) { __lastMatchResult._endGroupOffset[__numParentheses] = offs - __lastMatchResult._matchBeginOffset; if(offs > maxEndOffs && offs <= __originalInput.length) maxEndOffs = offs; } else __lastMatchResult._endGroupOffset[__numParentheses] = OpCode._NULL_OFFSET; --__numParentheses; } __lastMatchResult._match = new String(__originalInput, __beginMatchOffsets[0], maxEndOffs - __beginMatchOffsets[0]); // Free up for garbage collection __originalInput = null; } // Expects to receive a valid regular expression program. 
No checking // is done to ensure validity. // __originalInput must be set before calling this method for // __lastMatchResult to be set correctly. // beginOffset marks the beginning of the string // currentOffset marks where to start the pattern search private boolean __interpret(Perl5Pattern expression, char[] input, int beginOffset, int endOffset, int currentOffset) { boolean success; int minLength = 0, dontTry = 0, offset; char ch, mustString[]; __initInterpreterGlobals(expression, input, beginOffset, endOffset, currentOffset); success = false; mustString = expression._mustString; _mainLoop: while(true) { if(mustString != null && ((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 || ((__multiline || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0) && expression._back >= 0))) { __currentOffset = __findFirst(__input, __currentOffset, endOffset, mustString); if(__currentOffset >= endOffset) { if((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0) expression._mustUtility++; success = false; break _mainLoop; } else if(expression._back >= 0) { __currentOffset-=expression._back; if(__currentOffset < currentOffset) __currentOffset = currentOffset; minLength = expression._back + mustString.length; } else if(!expression._isExpensive && (expression._options & Perl5Compiler.READ_ONLY_MASK) == 0 && (--expression._mustUtility < 0)) { // Be careful! The preceding logical expression is constructed // so that mustUtility is only decremented if the expression is // compiled without READ_ONLY_MASK. 
mustString = expression._mustString = null; __currentOffset = currentOffset; } else { __currentOffset = currentOffset; minLength = mustString.length; } } if((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) { if(__currentOffset == beginOffset && __tryExpression(beginOffset)) { success = true; break _mainLoop; } else if(__multiline || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0 || (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0) { if(minLength > 0) dontTry = minLength - 1; endOffset-=dontTry; if(__currentOffset > currentOffset) --__currentOffset; while(__currentOffset < endOffset) { if(__input[__currentOffset++] == '\n') { if(__currentOffset < endOffset && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } } } } break _mainLoop; } if(expression._startString != null) { mustString = expression._startString; if((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) { ch = mustString[0]; while(__currentOffset < endOffset) { if(ch == __input[__currentOffset]) { if(__tryExpression(__currentOffset)){ success = true; break _mainLoop; } ++__currentOffset; while(__currentOffset < endOffset && __input[__currentOffset] == ch) ++__currentOffset; } ++__currentOffset; } } else { while((__currentOffset = __findFirst(__input, __currentOffset, endOffset, mustString)) < endOffset){ if(__tryExpression(__currentOffset)) { success = true; break _mainLoop; } ++__currentOffset; } } break _mainLoop; } if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) { boolean doEvery, tmp; char op; doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0); if(minLength > 0) dontTry = minLength - 1; endOffset -= dontTry; tmp = true; switch(op = __program[offset]) { case OpCode._ANYOF: offset = OpCode._getOperand(offset); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(ch < 256 && (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else 
tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._ANYOFUN: case OpCode._NANYOFUN: offset = OpCode._getOperand(offset); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(__matchUnicodeClass(ch, __program, offset, op)) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._BOUND: if(minLength > 0) { ++dontTry; --endOffset; } if(__currentOffset != beginOffset) { ch = __input[__currentOffset - 1]; tmp = OpCode._isWordCharacter(ch); } else tmp = OpCode._isWordCharacter(__previousChar); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(tmp != OpCode._isWordCharacter(ch)){ tmp = !tmp; if(__tryExpression(__currentOffset)) { success = true; break _mainLoop; } } ++__currentOffset; } if((minLength > 0 || tmp) && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } break; case OpCode._NBOUND: if(minLength > 0) { ++dontTry; --endOffset; } if(__currentOffset != beginOffset) { ch = __input[__currentOffset - 1]; tmp = OpCode._isWordCharacter(ch); } else tmp = OpCode._isWordCharacter(__previousChar); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(tmp != OpCode._isWordCharacter(ch)) tmp = !tmp; else if(__tryExpression(__currentOffset)) { success = true; break _mainLoop; } ++__currentOffset; } if((minLength > 0 || !tmp) && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } break; case OpCode._ALNUM: while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(OpCode._isWordCharacter(ch)) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._NALNUM: while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(!OpCode._isWordCharacter(ch)) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else 
tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._SPACE: while(__currentOffset < endOffset) { if(Character.isWhitespace(__input[__currentOffset])) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._NSPACE: while(__currentOffset < endOffset) { if(!Character.isWhitespace(__input[__currentOffset])) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._DIGIT: while(__currentOffset < endOffset) { if(Character.isDigit(__input[__currentOffset])) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._NDIGIT: while(__currentOffset < endOffset) { if(!Character.isDigit(__input[__currentOffset])) { if(tmp && __tryExpression(__currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; } // end switch } else { if(minLength > 0) dontTry = minLength - 1; endOffset-=dontTry; do { if(__tryExpression(__currentOffset)) { success = true; break _mainLoop; } } while(__currentOffset++ < endOffset); } break _mainLoop; } // end while __lastSuccess = success; __lastMatchResult = null; return success; } private boolean __matchUnicodeClass(char code, char __program[], int offset ,char opcode) { boolean isANYOF = ( opcode == OpCode._ANYOFUN ); while( __program[offset] != OpCode._END ){ if( __program[offset] == OpCode._RANGE ){ offset++; if((code >= __program[offset]) && (code <= __program[offset+1])){ return isANYOF; } else { offset+=2; } } else if(__program[offset] == OpCode._ONECHAR) { offset++; if(__program[offset++] == code) return isANYOF; } else { isANYOF = (__program[offset] == OpCode._OPCODE) ? 
isANYOF : !isANYOF; offset++; switch ( __program[offset++] ) { case OpCode._ALNUM: if(OpCode._isWordCharacter(code)) return isANYOF; break; case OpCode._NALNUM: if(!OpCode._isWordCharacter(code)) return isANYOF; break; case OpCode._SPACE: if(Character.isWhitespace(code)) return isANYOF; break; case OpCode._NSPACE: if(!Character.isWhitespace(code)) return isANYOF; break; case OpCode._DIGIT: if(Character.isDigit(code)) return isANYOF; break; case OpCode._NDIGIT: if(!Character.isDigit(code)) return isANYOF; break; case OpCode._ALNUMC: if(Character.isLetterOrDigit(code)) return isANYOF; break; case OpCode._ALPHA: if(Character.isLetter(code)) return isANYOF; break; case OpCode._BLANK: if(Character.isSpaceChar(code)) return isANYOF; break; case OpCode._CNTRL: if(Character.isISOControl(code)) return isANYOF; break; case OpCode._LOWER: if(Character.isLowerCase(code)) return isANYOF; // Remove this hack after more efficient case-folding and unicode // character classes are implemented if(__caseInsensitive && Character.isUpperCase(code)) return isANYOF; break; case OpCode._UPPER: if(Character.isUpperCase(code)) return isANYOF; // Remove this hack after more efficient case-folding and unicode // character classes are implemented if(__caseInsensitive && Character.isLowerCase(code)) return isANYOF; break; case OpCode._PRINT: if(Character.isSpaceChar(code)) return isANYOF; // Fall through to check if the character is alphanumeric, // or a punctuation mark. Printable characters are either // alphanumeric, punctuation marks, or spaces. case OpCode._GRAPH: if(Character.isLetterOrDigit(code)) return isANYOF; // Fall through to check if the character is a punctuation mark. // Graph characters are either alphanumeric or punctuation. 
case OpCode._PUNCT: switch ( Character.getType(code) ) { case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: return isANYOF; default: break; } break; case OpCode._XDIGIT: if( (code >= '0' && code <= '9') || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F')) return isANYOF; break; case OpCode._ASCII: if(code < 0x80)return isANYOF; } } } return !isANYOF; } private boolean __tryExpression(int offset) { int count; __inputOffset = offset; __lastParen = 0; __expSize = 0; if(__numParentheses > 0) { for(count=0; count <= __numParentheses; count++) { __beginMatchOffsets[count] = OpCode._NULL_OFFSET; __endMatchOffsets[count] = OpCode._NULL_OFFSET; } } if(__match(1)){ __beginMatchOffsets[0] = offset; __endMatchOffsets[0] = __inputOffset; return true; } return false; } private int __repeat(int offset, int max) { int scan, eol, operand, ret; char ch; char op; scan = __inputOffset; eol = __eol; if(max != Character.MAX_VALUE && max < eol - scan) eol = scan + max; operand = OpCode._getOperand(offset); switch(op = __program[offset]) { case OpCode._ANY: while(scan < eol && __input[scan] != '\n') ++scan; break; case OpCode._SANY: scan = eol; break; case OpCode._EXACTLY: ++operand; while(scan < eol && __program[operand] == __input[scan]) ++scan; break; case OpCode._ANYOF: if(scan < eol && (ch = __input[scan]) < 256) { while((ch < 256 ) && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(++scan < eol) ch = __input[scan]; else break; } } break; case OpCode._ANYOFUN: case OpCode._NANYOFUN: if(scan < eol) { ch = __input[scan]; while(__matchUnicodeClass(ch, __program, operand, op)){ if(++scan < eol) ch = __input[scan]; else break; } } break; case OpCode._ALNUM: while(scan < eol && OpCode._isWordCharacter(__input[scan])) ++scan; break; case OpCode._NALNUM: 
while(scan < eol && !OpCode._isWordCharacter(__input[scan])) ++scan; break; case OpCode._SPACE: while(scan < eol && Character.isWhitespace(__input[scan])) ++scan; break; case OpCode._NSPACE: while(scan < eol && !Character.isWhitespace(__input[scan])) ++scan; break; case OpCode._DIGIT: while(scan < eol && Character.isDigit(__input[scan])) ++scan; break; case OpCode._NDIGIT: while(scan < eol && !Character.isDigit(__input[scan])) ++scan; break; default: break; } ret = scan - __inputOffset; __inputOffset = scan; return ret; } private boolean __match(int offset) { char nextChar, op; int scan, next, input, maxScan, current, line, arg; boolean inputRemains = true, minMod = false; Perl5Repetition rep; input = __inputOffset; inputRemains = (input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); scan = offset; maxScan = __program.length; while(scan < maxScan /*&& scan > 0*/){ next = OpCode._getNext(__program, scan); switch(op = __program[scan]) { case OpCode._BOL: if(input == __bol ? __previousChar == '\n' : (__multiline && (inputRemains || input < __eol) && __input[input - 1] == '\n')) break; return false; case OpCode._MBOL: if(input == __bol ? __previousChar == '\n' : ((inputRemains || input < __eol) && __input[input - 1] == '\n')) break; return false; case OpCode._SBOL: if(input == __bol && __previousChar == '\n') break; return false; case OpCode._GBOL: if(input == __bol) break; return true; case OpCode._EOL : if((inputRemains || input < __eol) && nextChar != '\n') return false; if(!__multiline && __eol - input > 1) return false; break; case OpCode._MEOL: if((inputRemains || input < __eol) && nextChar != '\n') return false; break; case OpCode._SEOL: if((inputRemains || input < __eol) && nextChar != '\n') return false; if(__eol - input > 1) return false; break; case OpCode._SANY: if(!inputRemains && input >= __eol) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? 
__input[input] : __EOS); break; case OpCode._ANY: if((!inputRemains && input >= __eol) || nextChar == '\n') return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._EXACTLY: current = OpCode._getOperand(scan); line = __program[current++]; if(__program[current] != nextChar) return false; if(__eol - input < line) return false; if(line > 1 && !__compare(__program, current, __input, input, line)) return false; input+=line; inputRemains = (input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ANYOF: current = OpCode._getOperand(scan); if(nextChar == __EOS && inputRemains) nextChar = __input[input]; if(nextChar >= 256 || (__program[current + (nextChar >> 4)] & (1 << (nextChar & 0xf))) != 0) return false; if(!inputRemains && input >= __eol) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ANYOFUN: case OpCode._NANYOFUN: current = OpCode._getOperand(scan); if(nextChar == __EOS && inputRemains) nextChar = __input[input]; if(!__matchUnicodeClass(nextChar, __program, current, op)) return false; if(!inputRemains && input >= __eol) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ALNUM: if(!inputRemains) return false; if(!OpCode._isWordCharacter(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NALNUM: if(!inputRemains && input >= __eol) return false; if(OpCode._isWordCharacter(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? 
__input[input] : __EOS); break; case OpCode._NBOUND: case OpCode._BOUND: boolean a, b; if(input == __bol) a = OpCode._isWordCharacter(__previousChar); else a = OpCode._isWordCharacter(__input[input - 1]); b = OpCode._isWordCharacter(nextChar); if((a == b) == (__program[scan] == OpCode._BOUND)) return false; break; case OpCode._SPACE: if(!inputRemains && input >= __eol) return false; if(!Character.isWhitespace(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NSPACE: if(!inputRemains) return false; if(Character.isWhitespace(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._DIGIT: if(!Character.isDigit(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NDIGIT: if(!inputRemains && input >= __eol) return false; if(Character.isDigit(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._REF: arg = OpCode._getArg1(__program, scan); current = __beginMatchOffsets[arg]; if(current == OpCode._NULL_OFFSET) return false; if(__endMatchOffsets[arg] == OpCode._NULL_OFFSET) return false; if(current == __endMatchOffsets[arg]) break; if(__input[current] != nextChar) return false; line = __endMatchOffsets[arg] - current; if(input + line > __eol) return false; if(line > 1 && !__compare(__input, current, __input, input, line)) return false; input+=line; inputRemains = (input < __endOffset); nextChar = (inputRemains ? 
__input[input] : __EOS); break; case OpCode._NOTHING: break; case OpCode._BACK: break; case OpCode._OPEN: arg = OpCode._getArg1(__program, scan); __beginMatchOffsets[arg] = input; if(arg > __expSize) __expSize = arg; break; case OpCode._CLOSE: arg = OpCode._getArg1(__program, scan); __endMatchOffsets[arg] = input; if(arg > __lastParen) __lastParen = arg; break; case OpCode._CURLYX: rep = new Perl5Repetition(); rep._lastRepetition = __currentRep; __currentRep = rep; rep._parenFloor = __lastParen; rep._numInstances = -1; rep._min = OpCode._getArg1(__program, scan); rep._max = OpCode._getArg2(__program, scan); rep._scan = OpCode._getNextOperator(scan) + 2; rep._next = next; rep._minMod = minMod; // Must initialize to -1 because if we initialize to 0 and are // at the beginning of the input the OpCode._WHILEM case will // not work right. rep._lastLocation = -1; __inputOffset = input; // use minMod as temporary minMod = __match(OpCode._getPrevOperator(next)); // leave scope call not pertinent? 
__currentRep = rep._lastRepetition; return minMod; case OpCode._WHILEM: rep = __currentRep; arg = rep._numInstances + 1; __inputOffset = input; if(input == rep._lastLocation) { __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; __currentRep._numInstances = line; __currentRep = rep; return false; } if(arg < rep._min) { rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; rep._numInstances = arg - 1; return false; } if(rep._minMod) { __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; __currentRep._numInstances = line; __currentRep = rep; if(arg >= rep._max) return false; __inputOffset = input; rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; rep._numInstances = arg - 1; return false; } if(arg < rep._max) { __pushState(rep._parenFloor); rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; __popState(); __inputOffset = input; } __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; rep._numInstances = line; __currentRep = rep; rep._numInstances = arg - 1; return false; case OpCode._BRANCH: if(__program[next] != OpCode._BRANCH) next = OpCode._getNextOperator(scan); else { int lastParen; lastParen = __lastParen; do { __inputOffset = input; if(__match(OpCode._getNextOperator(scan))) return true; for(arg = __lastParen; arg > lastParen; --arg) //__endMatchOffsets[arg] = 0; __endMatchOffsets[arg] = OpCode._NULL_OFFSET; __lastParen = arg; scan = OpCode._getNext(__program, scan); } while(scan != OpCode._NULL_OFFSET && __program[scan] == OpCode._BRANCH); return false; } break; case OpCode._MINMOD: minMod = true; break; case OpCode._CURLY: case OpCode._STAR: case OpCode._PLUS: if(op == OpCode._CURLY) { line = OpCode._getArg1(__program, scan); arg = OpCode._getArg2(__program, scan); scan = 
OpCode._getNextOperator(scan) + 2; } else if(op == OpCode._STAR) { line = 0; arg = Character.MAX_VALUE; scan = OpCode._getNextOperator(scan); } else { line = 1; arg = Character.MAX_VALUE; scan = OpCode._getNextOperator(scan); } if(__program[next] == OpCode._EXACTLY) { nextChar = __program[OpCode._getOperand(next) + 1]; current = 0; } else { nextChar = __EOS; current = -1000; } __inputOffset = input; if(minMod) { minMod = false; if(line > 0 && __repeat(scan, line) < line) return false; while(arg >= line || (arg == Character.MAX_VALUE && line > 0)) { // there may be a bug here with respect to // __inputOffset >= __endOffset, but it seems to be right for // now. the issue is with __inputOffset being reset later. // is this test really supposed to happen here? if(current == -1000 || __inputOffset >= __endOffset || __input[__inputOffset] == nextChar) { if(__match(next)) return true; } __inputOffset = input + line; if(__repeat(scan, 1) != 0) { ++line; __inputOffset = input + line; } else return false; } } else { arg = __repeat(scan, arg); if(line < arg && OpCode._opType[__program[next]] == OpCode._EOL && ((!__multiline && __program[next] != OpCode._MEOL) || __program[next] == OpCode._SEOL)) line = arg; while(arg >= line) { // there may be a bug here with respect to // __inputOffset >= __endOffset, but it seems to be right for // now. the issue is with __inputOffset being reset later. // is this test really supposed to happen here? if(current == -1000 || __inputOffset >= __endOffset || __input[__inputOffset] == nextChar) { if(__match(next)) return true; } --arg; __inputOffset = input + arg; } } return false; case OpCode._SUCCEED: case OpCode._END: __inputOffset = input; // This enforces the rule that two consecutive matches cannot have // the same end offset. 
if(__inputOffset == __lastMatchInputEndOffset) return false; return true; case OpCode._IFMATCH: __inputOffset = input; scan = OpCode._getNextOperator(scan); if(!__match(scan)) return false; break; case OpCode._UNLESSM: __inputOffset = input; scan = OpCode._getNextOperator(scan); if(__match(scan)) return false; break; default: // todo: Need to throw an exception here. } // end switch //scan = (next > 0 ? next : 0); scan = next; } // end while scan return false; } /** * Set whether or not subsequent calls to {@link #matches matches()} * or {@link #contains contains()} should treat the input as * consisting of multiple lines. The default behavior is for * input to be treated as consisting of multiple lines. This method * should only be called if the Perl5Pattern used for a match was * compiled without either of the Perl5Compiler.MULTILINE_MASK or * Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the * behavior of how the ^, $, and . metacharacters are * interpreted on the fly. The compilation options used when compiling * a pattern ALWAYS override the behavior specified by setMultiline(). See * {@link Perl5Compiler} for more details. *

* @param multiline If set to true treats the input as consisting of * multiple lines with respect to the ^ and $ * metacharacters. If set to false treats the input as consisting * of a single line with respect to the ^ and $ * metacharacters. */ public void setMultiline(boolean multiline) { __multiline = multiline; } /** * @return True if the matcher is treating input as consisting of multiple * lines with respect to the ^ and $ metacharacters, * false otherwise. */ public boolean isMultiline() { return __multiline; } char[] _toLower(char[] input) { int current; char[] inp; // todo: // Certainly not the best way to do case insensitive matching. // Must definitely change this in some way, but for now we // do what Perl does and make a copy of the input, converting // it all to lowercase. This is truly better handled in the // compilation phase. inp = new char[input.length]; System.arraycopy(input, 0, inp, 0, input.length); input = inp; // todo: Need to inline toLowerCase() for(current = 0; current < input.length; current++) if(Character.isUpperCase(input[current])) input[current] = Character.toLowerCase(input[current]); return input; } /** * Determines if a prefix of a string (represented as a char[]) * matches a given pattern, starting from a given offset into the string. * If a prefix of the string matches the pattern, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The char[] to test for a prefix match.
   * @param pattern The Pattern to be matched.
   * @param offset The offset at which to start searching for the prefix.
   * @return True if input matches pattern, false otherwise.
   */
  public boolean matchesPrefix(char[] input, Pattern pattern, int offset) {
    final Perl5Pattern expression = (Perl5Pattern) pattern;

    // Remember the caller's array before any lowercased copy replaces it.
    __originalInput = input;

    // Case insensitive patterns are matched against a lowercased copy.
    final char[] subject =
      expression._isCaseInsensitive ? _toLower(input) : input;

    __initInterpreterGlobals(expression, subject, 0, subject.length, offset);

    __lastSuccess = __tryExpression(offset);
    // Discard any cached result; getMatch() rebuilds it on demand.
    __lastMatchResult = null;

    return __lastSuccess;
  }

  /**
   * Determines if a prefix of a string (represented as a char[])
   * matches a given pattern.
   * If a prefix of the string matches the pattern, a MatchResult instance
   * representing the match is made accessible via
   * {@link #getMatch()}.
   *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The char[] to test for a prefix match.
   * @param pattern The Pattern to be matched.
   * @return True if input matches pattern, false otherwise.
   */
  public boolean matchesPrefix(char[] input, Pattern pattern) {
    // Without an explicit offset the prefix is looked for at index 0.
    final int startOffset = 0;
    return matchesPrefix(input, pattern, startOffset);
  }

  /**
   * Determines if a prefix of a string matches a given pattern.
   * If a prefix of the string matches the pattern, a MatchResult instance
   * representing the match is made accessible via
   * {@link #getMatch()}.
   *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The String to test for a prefix match.
   * @param pattern The Pattern to be matched.
   * @return True if input matches pattern, false otherwise.
   */
  public boolean matchesPrefix(String input, Pattern pattern) {
    // Delegate to the char[] variant, anchoring the prefix at index 0.
    final char[] characters = input.toCharArray();
    return matchesPrefix(characters, pattern, 0);
  }

  /**
   * Determines if a prefix of a PatternMatcherInput instance
   * matches a given pattern.  If there is a match, a MatchResult instance
   * representing the match is made accessible via
   * {@link #getMatch()}.  Unlike the
   * {@link #contains(PatternMatcherInput, Pattern)}
   * method, the current offset of the PatternMatcherInput argument
   * is not updated.  However, unlike the
   * {@link #matches matches(PatternMatcherInput, Pattern)} method,
   * matchesPrefix() will start its search from the current offset
   * rather than the begin offset of the PatternMatcherInput.
   *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The PatternMatcherInput to test for a prefix match. * @param pattern The Pattern to be matched. * @return True if input matches pattern, false otherwise. */ public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; __initInterpreterGlobals(expression, inp, input._beginOffset, input._endOffset, input._currentOffset); __lastSuccess = __tryExpression(input._currentOffset); __lastMatchResult = null; return __lastSuccess; } /** * Determines if a string (represented as a char[]) exactly * matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * a Perl5Pattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use a Perl5Pattern as the pattern * parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The char[] to test for an exact match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(char[] input, Pattern pattern) { Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input; if(expression._isCaseInsensitive) input = _toLower(input); __initInterpreterGlobals(expression, input, 0, input.length, 0); __lastSuccess = (__tryExpression(0) && __endMatchOffsets[0] == input.length); __lastMatchResult = null; return __lastSuccess; } /** * Determines if a string exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * a Perl5Pattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use a Perl5Pattern as the pattern * parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The String to test for an exact match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(String input, Pattern pattern) { return matches(input.toCharArray(), pattern); } /** * Determines if the contents of a PatternMatcherInput instance * exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. Unlike the * {@link #contains(PatternMatcherInput, Pattern)} * method, the current offset of the PatternMatcherInput argument * is not updated. You should remember that the region between * the begin (NOT the current) and end offsets of the PatternMatcherInput * will be tested for an exact match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The PatternMatcherInput to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; __initInterpreterGlobals(expression, inp, input._beginOffset, input._endOffset, input._beginOffset); __lastMatchResult = null; if(__tryExpression(input._beginOffset)) { if(__endMatchOffsets[0] == input._endOffset || input.length() == 0 || input._beginOffset == input._endOffset) { __lastSuccess = true; return true; } } __lastSuccess = false; return false; } /** * Determines if a string contains a pattern. If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceeding the match and following the match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* @param input The String to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(String input, Pattern pattern) { return contains(input.toCharArray(), pattern); } /** * Determines if a string (represented as a char[]) contains a pattern. * If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceeding the match and following the match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* @param input The char[] to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(char[] input, Pattern pattern) { Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input; if(expression._isCaseInsensitive) input = _toLower(input); return __interpret(expression, input, 0, input.length, 0); } private static final int __DEFAULT_LAST_MATCH_END_OFFSET = -100; private int __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET; /** * Determines if the contents of a PatternMatcherInput, starting from the * current offset of the input contains a pattern. * If a pattern match is found, a MatchResult * instance representing the first such match is made acessible via * {@link #getMatch()}. The current offset of the * PatternMatcherInput is set to the offset corresponding to the end * of the match, so that a subsequent call to this method will continue * searching where the last call left off. You should remember that the * region between the begin and end offsets of the PatternMatcherInput are * considered the input to be searched, and that the current offset * of the PatternMatcherInput reflects where a search will start from. * Matches extending beyond the end offset of the PatternMatcherInput * will not be matched. In other words, a match must occur entirely * between the begin and end offsets of the input. See * {@link PatternMatcherInput} for more details. *

* As a side effect, if a match is found, the PatternMatcherInput match * offset information is updated. See the * {@link PatternMatcherInput#setMatchOffsets(int, int)} * method for more details. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* This method is usually used in a loop as follows: *

   * PatternMatcher matcher;
   * PatternCompiler compiler;
   * Pattern pattern;
   * PatternMatcherInput input;
   * MatchResult result;
   *
   * compiler = new Perl5Compiler();
   * matcher  = new Perl5Matcher();
   *
   * try {
   *   pattern = compiler.compile(somePatternString);
   * } catch(MalformedPatternException e) {
   *   System.err.println("Bad pattern.");
   *   System.err.println(e.getMessage());
   *   return;
   * }
   *
   * input   = new PatternMatcherInput(someStringInput);
   *
   * while(matcher.contains(input, pattern)) {
   *   result = matcher.getMatch();  
   *   // Perform whatever processing on the result you want.
   * }
   *
   * 
*

* @param input The PatternMatcherInput to test for a match. * @param pattern The Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; boolean matchFound; //if(input.length() > 0) { // We want to allow a null string to match at the end of the input // which is why we don't check endOfInput. Not sure if this is a // safe thing to do or not. if(input._currentOffset > input._endOffset) return false; //} /* else if(input._endOfInput()) return false; */ expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; // Todo: // Really should only reduce to lowercase that part of the // input that is necessary, instead of the whole thing. // Adjust MatchResult offsets accordingly. Actually, pass an adjustment // value to __interpret. __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; __lastMatchInputEndOffset = input.getMatchEndOffset(); matchFound = __interpret(expression, inp, input._beginOffset, input._endOffset, input._currentOffset); if(matchFound) { input.setCurrentOffset(__endMatchOffsets[0]); input.setMatchOffsets(__beginMatchOffsets[0], __endMatchOffsets[0]); } else { input.setCurrentOffset(input._endOffset + 1); } // Restore so it doesn't interfere with other unrelated matches. __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET; return matchFound; } /** * Fetches the last match found by a call to a matches() or contains() * method. 
If you plan on modifying the original search input, you * must call this method BEFORE you modify the original search input, * as a lazy evaluation technique is used to create the MatchResult. * This reduces the cost of pattern matching when you don't care about * the actual match and only care if the pattern occurs in the input. * Otherwise, a MatchResult would be created for every match found, * whether or not the MatchResult was later used by a call to getMatch(). *

* @return A MatchResult instance containing the pattern match found
   *         by the last call to any one of the matches() or contains()
   *         methods.  If no match was found by the last call, returns
   *         null.
   */
  public MatchResult getMatch() {
    if (__lastSuccess) {
      // The result object is built only when first asked for and is
      // then reused until a matching method clears it again.
      if (__lastMatchResult == null) {
        __setLastMatchResult();
      }
      return __lastMatchResult;
    }

    return null;
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy