// com.groupbyinc.common.jregex.Matcher Maven / Gradle / Ivy
/**
* Copyright (c) 2001, Sergey A. Samokhodkin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form
* must reproduce the above copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided with the distribution.
* - Neither the name of jregex nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @version 1.2_01
*/
package jregex;
import java.util.*;
import java.io.*;
/**
* Matcher instance is an automaton that actually performs matching. It provides the following methods:
* searching for a matching substrings : matcher.find() or matcher.findAll();
* testing whether a text matches a whole pattern : matcher.matches();
* testing whether the text matches the beginning of a pattern : matcher.matchesPrefix();
* searching with custom options : matcher.find(int options)
*
* Obtaining results
* After the search succeeded, i.e. if one of the above methods returned true,
* one may obtain information on the match:
*
* may check whether some group is captured : matcher.isCaptured(int);
* may obtain start and end positions of the match and its length : matcher.start(int),matcher.end(int),matcher.length(int);
* may obtain match contents as String : matcher.group(int).
* The match prefix and suffix information can be obtained the same way.
* The appropriate methods are grouped in MatchResult interface, which the Matcher class implements.
* Matcher objects are not thread-safe, so only one thread may use a matcher instance at a time.
* Note that Pattern objects are thread-safe (the same instance may be shared between
* multiple threads), and the typical tactic in multithreaded applications is to have one Pattern instance per expression (a singleton),
* and one Matcher object per thread.
*/
public class Matcher implements MatchResult{
/* Matching options*/
/**
* The same effect as "^" without REFlags.MULTILINE.
* @see Matcher#find(int)
*/
public static final int ANCHOR_START=1;
/**
* The same effect as "\\G".
* @see Matcher#find(int)
*/
public static final int ANCHOR_LASTMATCH=2;
/**
* The same effect as "$" without REFlags.MULTILINE.
* @see Matcher#find(int)
*/
public static final int ANCHOR_END=4;
/**
* Experimental option; if a text ends up before the end of a pattern,report a match.
* @see Matcher#find(int)
*/
public static final int ACCEPT_INCOMPLETE=8;
//root term substituted by search() when the ANCHOR_START option is set
//see search(ANCHOR_START|...)
private static Term startAnchor=new Term(Term.START);
//root term substituted by search() when the ANCHOR_LASTMATCH option is set
//see search(ANCHOR_LASTMATCH|...)
private static Term lastMatchAnchor=new Term(Term.LAST_MATCH_END);
//the pattern this matcher was created for (assigned in the constructor)
private Pattern re;
//working state handed to SearchEntry.popState() and used by search()
private int[] counters;
//bounds of capturing groups, indexed by group id; in/out are -1 while a group is not captured
private MemReg[] memregs;
private LAEntry[] lookaheads;
//sizes copied into locals at the top of search()
private int counterCount;
private int memregCount;
private int lookaheadCount;
//buffer holding the target text; the target occupies data[offset..end)
private char[] data;
//offset,end: target bounds inside data (absolute indices)
//wOffset,wEnd: bounds of the current match window (absolute indices); wEnd is -1 after init()/setPosition()
private int offset,end,wOffset,wEnd;
//true when data must not be overwritten in place (see setAll(), targetChars(), setTarget(Matcher,int))
private boolean shared;
private SearchEntry top; //stack entry
private SearchEntry first; //object pool entry
private SearchEntry defaultEntry; //called when moving the window
//set to true at the start of search(), cleared by flush()/init()/setPosition()
private boolean called;
//passed to first.reset() in flush()
private int minQueueLength;
//String copy of the target, created lazily by getString() or borrowed in setTarget(Matcher,int)
private String cache;
//cache may be longer than the actual data
//and contrariwise; so cacheOffset may have both signs.
//cacheOffset is actually -(data offset).
//NOTE(review): getString() assigns cacheOffset=offset and subtracts it from data indices,
//which reads as +(data offset) rather than its negation - confirm which sign is intended.
private int cacheOffset,cacheLength;
//reusable MemReg holders for the PREFIX, SUFFIX and TARGET pseudo-groups, created on demand in bounds()
private MemReg prefixBounds,suffixBounds,targetBounds;
Matcher(Pattern regex){
this.re=regex;
//int memregCount=(memregs=new MemReg[regex.memregs]).length;
//for(int i=0;i0){
MemReg[] memregs=new MemReg[memregCount];
for(int i=0;i0) counters=new int[counterCount];
if((lookaheadCount=regex.lookaheads)>0){
LAEntry[] lookaheads=new LAEntry[lookaheadCount];
for(int i=0;i
* Matcher m=new Pattern("\\w+").matcher(myString);
* if(m.find())m.setTarget(m,m.SUFFIX); //forget all that is not a suffix
*
* Resets current search position to zero.
* @param m - a matcher that is a source of data
* @param groupId - which group to take data from
* @see Matcher#setTarget(java.lang.String)
* @see Matcher#setTarget(java.lang.String,int,int)
* @see Matcher#setTarget(char[],int,int)
* @see Matcher#setTarget(java.io.Reader,int)
*/
public final void setTarget(Matcher m, int groupId){
    // The bounds of the requested group become the new target; a missing group is an error.
    final MemReg region=m.bounds(groupId);
    if(region==null){
        throw new IllegalArgumentException("group #"+groupId+" is not assigned");
    }
    // Borrow the source matcher's buffer and cached string instead of copying them.
    this.data=m.data;
    this.cache=m.cache;
    this.cacheOffset=m.cacheOffset;
    this.cacheLength=m.cacheLength;
    // The target is exactly the span of the group, in absolute buffer indices.
    this.offset=region.in;
    this.end=region.out;
    // When two distinct matchers reference the same array, both are flagged as sharing it.
    if(this!=m){
        m.shared=true;
        this.shared=true;
    }
    init();
}
/**
* Supplies a text to search in/match with.
* Resets current search position to zero.
* Delegates to setTarget(text,0,text.length()), so the whole string becomes the target.
* @param text - a data; must not be null, because text.length() is called on it
* @see Matcher#setTarget(jregex.Matcher,int)
* @see Matcher#setTarget(java.lang.String,int,int)
* @see Matcher#setTarget(char[],int,int)
* @see Matcher#setTarget(java.io.Reader,int)
*/
public void setTarget(String text){
setTarget(text,0,text.length());
}
/**
* Supplies a text to search in/match with, as a part of String.
* Resets current search position to zero.
* @param text - a data source
* @param start - where the target starts
* @param len - how long is the target
* @see Matcher#setTarget(jregex.Matcher,int)
* @see Matcher#setTarget(java.lang.String)
* @see Matcher#setTarget(char[],int,int)
* @see Matcher#setTarget(java.io.Reader,int)
*/
public void setTarget(String text,int start,int len){
char[] mychars=data;
if(mychars==null || shared || mychars.length* myMatcher.setTarget(myCharArray,x,y,false); //we declare that array contents is NEITHER shared NOR will be used later, so may modifications on it are permitted ** then we should expect the array contents to be changed on subsequent setTarget(..) operations. * Such method may yield some increase in perfomanse in the case of multiple setTarget() calls. * Resets current search position to zero. * @param text - a data source * @param start - where the target starts * @param len - how long is the target * @param shared - if
true: data are shared or used later, don't modify it; if false: possible modifications of the text on subsequent setTarget()
calls are perceived and allowed.
* @see Matcher#setTarget(jregex.Matcher,int)
* @see Matcher#setTarget(java.lang.String)
* @see Matcher#setTarget(java.lang.String,int,int)
* @see Matcher#setTarget(char[],int,int)
* @see Matcher#setTarget(java.io.Reader,int)
*/
public final void setTarget(char[] text,int start,int len,boolean shared){
    // Adopt the caller's array as-is (no copy); the target is text[start .. start+len).
    this.data=text;
    this.offset=start;
    this.end=start+len;
    // Remember whether the caller allows this matcher to overwrite the array later.
    this.shared=shared;
    // A string cached for a previous target no longer corresponds to the data.
    this.cache=null;
    init();
}
/**
* Supplies a text to search in/match with through a stream.
* Resets current search position to zero.
* @param in - a data stream;
* @param len - how much characters should be read; if len is -1, read the entire stream.
* @see Matcher#setTarget(jregex.Matcher,int)
* @see Matcher#setTarget(java.lang.String)
* @see Matcher#setTarget(java.lang.String,int,int)
* @see Matcher#setTarget(char[],int,int)
*/
public void setTarget(Reader in,int len)throws IOException{
if(len<0){
setAll(in);
return;
}
char[] mychars=data;
boolean shared=this.shared;
if(mychars==null || shared || mychars.length=0){
len-=c;
count+=c;
if(len==0) break;
}
setTarget(mychars,0,count,shared);
}
/**
 * Reads the whole stream into a char buffer and makes it the target.
 * The current buffer is reused when it exists, is not shared and is not empty;
 * otherwise a fresh 1024-char buffer is allocated. Whenever the buffer fills up
 * it is replaced by one three times the number of characters read so far.
 * @param in the stream to drain; read until read() reports end of stream
 * @throws IOException if reading from the stream fails
 */
private void setAll(Reader in)throws IOException{
    char[] mychars=data;
    boolean shared=this.shared;
    // A zero-length buffer must not be reused: read() with len==0 returns 0,
    // and growing by count*3 with count==0 would yield another empty buffer,
    // so the loop below would never terminate.
    if(mychars==null || shared || mychars.length==0){
        mychars=new char[1024];
        shared=false;
    }
    int free=mychars.length;
    int count=0;
    int c;
    while((c=in.read(mychars,count,free))>=0){
        free-=c;
        count+=c;
        if(free==0){
            // buffer exhausted: move the content into a larger private array
            int newsize=count*3;
            char[] newchars=new char[newsize];
            System.arraycopy(mychars,0,newchars,0,count);
            mychars=newchars;
            free=newsize-count;
            shared=false;
        }
    }
    setTarget(mychars,0,count,shared);
}
/**
 * Returns the characters data[start..end) as a String; both arguments are
 * absolute indices into the data buffer.
 * If a cached String exists, the result is cut from it (indices are shifted by cacheOffset).
 * Otherwise, when the requested span is at least a third of the target length,
 * the whole target is converted to a String, cached, and the result is cut from it;
 * shorter spans are copied directly from the buffer without caching.
 */
private final String getString(int start,int end){
    final String cached=cache;
    if(cached!=null){
        final int shift=cacheOffset;
        return cached.substring(start-shift,end-shift);
    }
    final int base=this.offset;
    final int span=this.end-base;
    final char[] chars=this.data;
    if((end-start)<(span/3)){
        // small request: not worth building a cache
        return new String(chars,start,end-start);
    }
    // it makes sense to build a cache of the whole target
    final String whole=new String(chars,base,span);
    cache=whole;
    cacheOffset=base;
    cacheLength=span;
    return whole.substring(start-base,end-base);
}
/* Matching */
/**
 * Tells whether the entire target matches the beginning of the pattern.
 * The whole pattern is also regarded as its beginning.
 * This allows a mismatch to be detected by examining only the beginning of
 * the target: if the beginning of the target doesn't match the beginning of
 * the pattern, the entire target cannot match either.
 * For example the following calls yield true:
 *   Pattern p=new Pattern("abcd");
 *   p.matcher("").matchesPrefix();
 *   p.matcher("a").matchesPrefix();
 *   p.matcher("ab").matchesPrefix();
 *   p.matcher("abc").matchesPrefix();
 *   p.matcher("abcd").matchesPrefix();
 *
 * and the following yield false:
 *   p.matcher("b").matchesPrefix();
 *   p.matcher("abcdef").matchesPrefix();
 *   p.matcher("x").matchesPrefix();
 *
 * The search position is always reset to zero first.
 * @return true if the entire target matches the beginning of the pattern
 */
public final boolean matchesPrefix(){
    setPosition(0);
    final int prefixOptions=ANCHOR_START|ANCHOR_END|ACCEPT_INCOMPLETE;
    return search(prefixOptions);
}
/**
* Just an old name for matchesPrefix().
* Retained for backwards compatibility.
* @return the result of matchesPrefix()
* @deprecated Replaced by matchesPrefix()
*/
public final boolean isStart(){
return matchesPrefix();
}
/**
 * Tells whether the current target matches the whole pattern.
 * For example the following yield true:
 *   Pattern p=new Pattern("\\w+");
 *   p.matcher("a").matches();
 *   p.matcher("ab").matches();
 *   p.matcher("abc").matches();
 *
 * and the following yield false:
 *   p.matcher("abc def").matches();
 *   p.matcher("bcd ").matches();
 *   p.matcher(" bcd").matches();
 *   p.matcher("#xyz#").matches();
 *
 * If a search has already been run since the last reset, the position is
 * rewound to zero before matching.
 * @return whether the current target matches the whole pattern.
 */
public final boolean matches(){
    if(called){
        setPosition(0);
    }
    final int wholeTarget=ANCHOR_START|ANCHOR_END;
    return search(wholeTarget);
}
/**
 * A combination of setTarget(String) and an anchored search:
 * the string becomes the new target and is tested against the whole pattern.
 * @param s the target string;
 * @return whether the specified string matches the whole pattern.
 */
public final boolean matches(String s){
    setTarget(s);
    final boolean wholeMatch=search(ANCHOR_START|ANCHOR_END);
    return wholeMatch;
}
/**
 * Sets the position the subsequent find()/find(int) will start from
 * and discards the state of any previous search.
 * @param pos the position to start from, relative to the start of the target;
 * @see Matcher#find()
 * @see Matcher#find(int)
 */
public void setPosition(int pos){
    // positions are stored as absolute indices into the data buffer
    final int absolute=offset+pos;
    called=false;
    wEnd=-1;
    wOffset=absolute;
    flush();
}
/**
 * Searches through the target for a matching substring, starting from just after the end of the last match.
 * If no search has been performed yet, starts from the current position.
 * Equivalent to find(0).
 * @return true if a match is found.
 */
public final boolean find(){
    return find(0);
}
/**
 * Searches through the target for a matching substring, starting from just after the end of the last match.
 * If no search has been performed yet, starts from the current position.
 * @param anchors a zero or a combination(bitwise OR) of ANCHOR_START,ANCHOR_END,ANCHOR_LASTMATCH,ACCEPT_INCOMPLETE
 * @return true if a match is found.
 */
public final boolean find(int anchors){
    final boolean searchedBefore=called;
    if(searchedBefore){
        // step past the previous match before looking for the next one
        skip();
    }
    final boolean found=search(anchors);
    return found;
}
/**
* The same as findAll(int), but with default behaviour (options are zero).
* @return the iterator produced by findAll(0)
*/
public MatchIterator findAll(){
return findAll(0);
}
/**
 * Returns an iterator over the matches found by successive find(options) calls.
 * The iterator drives this matcher directly: nextMatch() returns the matcher itself,
 * so a returned result is only valid until the next step of the iteration.
 * Note that the search position is not reset here; iteration continues from
 * the matcher's current state.
 * @param options the options passed to every find(int) call
 */
public MatchIterator findAll(final int options){
    return new MatchIterator(){
        // true once find(options) has been run for the upcoming element
        private boolean probed;
        // outcome of the most recent find(options) call made by probe()
        private boolean available;

        private void probe(){
            available=find(options);
            probed=true;
        }

        public boolean hasMore(){
            if(!probed){
                probe();
            }
            return available;
        }

        public MatchResult nextMatch(){
            if(!hasMore()){
                throw new NoSuchElementException();
            }
            // the current match is consumed; the next call has to search again
            probed=false;
            return Matcher.this;
        }

        public int count(){
            if(!hasMore()){
                return 0;
            }
            // one match is already found; exhaust the rest
            int total=1;
            while(find(options)){
                total++;
            }
            probed=false;
            return total;
        }
    };
}
/**
* Continues to search from where the last search left off.
* The same as proceed(0).
* @return the result of proceed(0)
* @see Matcher#proceed(int)
*/
public final boolean proceed(){
return proceed(0);
}
/**
 * Continues to search from where the last search left off, using the specified options:
 *   Matcher m=new Pattern("\\w+").matcher("abc");
 *   while(m.proceed(0)){
 *     System.out.println(m.group(0));
 *   }
 *
 * Output:
 *   abc
 *   ab
 *   a
 *   bc
 *   b
 *   c
 *
 * For example, let's find all odd numbers occurring in a text:
 *   Matcher m=new Pattern("\\d+").matcher("123");
 *   while(m.proceed(0)){
 *     String match=m.group(0);
 *     if(isOdd(Integer.parseInt(match))) System.out.println(match);
 *   }
 *
 *   static boolean isOdd(int i){
 *     return (i&1)>0;
 *   }
 *
 * This outputs:
 *   123
 *   1
 *   23
 *   3
 *
 * Note that using the find() method we would find '123' only.
 * @param options search options, some of ANCHOR_START|ANCHOR_END|ANCHOR_LASTMATCH|ACCEPT_INCOMPLETE; zero value(default) stands for usual search for substring.
 * @return true if another match is found
 */
public final boolean proceed(int options){
    // When a search has already run and no alternatives are left on the stack,
    // the window start is moved one character forward.
    if(called && top==null){
        wOffset++;
    }
    // Pass the caller's options through; a hard-coded 0 would silently ignore them.
    return search(options);
}
/**
 * Sets the current search position just after the end of the last match.
 * An empty last match (window start equals window end) needs special care:
 * if no alternatives remain on the stack the position is advanced by one
 * character and the state is cleared; if an alternative exists, the state is
 * left untouched so that the alternative is allowed to match.
 */
public final void skip(){
    final int lastEnd=wEnd;
    if(wOffset!=lastEnd){
        // regular case: restart at the end of the last match (or at 0 when there is none)
        wOffset=(lastEnd<0)? 0: lastEnd;
        //rflush() works faster on simple regexes (with a small group/branch number)
        flush();
        return;
    }
    // empty window: only step forward when no pending variant exists
    if(top==null){
        wOffset++;
        flush();
    }
}
/**
 * Rewinds the matcher to the start of the current target:
 * the window start is put at the target offset, the window end is
 * marked as unset (-1), and all search state is flushed.
 */
private final void init(){
    called=false;
    wEnd=-1;
    wOffset=offset;
    flush();
}
/**
 * Resets the internal state: drops the search stack, resets the default entry
 * and the entry pool (the pool is reset with minQueueLength),
 * marks every capturing group as not captured and clears the 'called' flag.
 */
private final void flush(){
    top=null;
    defaultEntry.reset(0);
    first.reset(minQueueLength);
    // Group 0 is skipped on purpose (i>0): it stands for the regex itself,
    // whose bounds are kept in wOffset/wEnd rather than in memregs.
    // A single pass is enough; the reset is idempotent.
    for(int i=memregs.length-1;i>0;i--){
        MemReg mr=memregs[i];
        mr.in=mr.out=-1;
    }
    called=false;
}
//reverse flush: instead of clearing every group, unwinds the search stack
//from the top entry downwards, popping each entry's saved state, and finally
//pops the state saved in the default entry.
//may work significantly faster,
//need testing
private final void rflush(){
    final MemReg[] regs=this.memregs;
    final int[] cnts=this.counters;
    SearchEntry current=top;
    top=null;
    for(;current!=null;){
        // remember the entry below before the current one is popped
        final SearchEntry below=current.sub;
        SearchEntry.popState(current,regs,cnts);
        current=below;
    }
    SearchEntry.popState(defaultEntry,regs,cnts);
}
/**
* Returns the text of the current match window, i.e. the characters of the
* data between wOffset and wEnd, as produced by getString(wOffset,wEnd).
* NOTE(review): wEnd is -1 before a successful search, in which case this
* presumably fails inside getString() - confirm the intended behaviour.
*/
public String toString(){
return getString(wOffset,wEnd);
}
/**
* Returns the Pattern instance this matcher was created with.
*/
public Pattern pattern(){
return re;
}
/**
* Returns the whole current target (the characters between the target
* offset and the target end) as a String, via getString(offset,end).
*/
public String target(){
return getString(offset,end);
}
/**
* Returns the internal character buffer itself (not a copy) and flags it as
* shared, since the caller now holds a reference to it.
* The target occupies only the part of the array between targetStart()
* and targetEnd().
*/
public char[] targetChars(){
shared=true;
return data;
}
/**
* Returns the index in the targetChars() array at which the target starts.
*/
public int targetStart(){
return offset;
}
/**
* Returns the index in the targetChars() array just past the last character of the target.
*/
public int targetEnd(){
return end;
}
public char charAt(int i){
int in=this.wOffset;
int out=this.wEnd;
if(in<0 || out(mr.out-in)) throw new StringIndexOutOfBoundsException(""+i);
return data[in+i];
}
/**
* Returns the length of the current match window (wEnd-wOffset).
*/
public final int length(){
return wEnd-wOffset;
}
/**
* Returns the start of the current match window, relative to the start of the target.
*/
public final int start(){
return wOffset-offset;
}
/**
* Returns the end of the current match window, relative to the start of the target.
*/
public final int end(){
return wEnd-offset;
}
/**
* Returns the part of the target that precedes the current match window,
* i.e. the characters from the target start up to the window start.
*/
public String prefix(){
return getString(offset,wOffset);
}
/**
* Returns the part of the target that follows the current match window,
* i.e. the characters from the window end up to the target end.
*/
public String suffix(){
return getString(wEnd,end);
}
/**
* Returns the number of group slots held by this matcher (the length of the
* memregs array, which includes slot 0).
*/
public int groupCount(){
return memregs.length;
}
/**
 * Returns the text captured by the group with the given id.
 * @param n the group id, as accepted by bounds(int)
 * @return the group contents, or null if bounds(n) reports no bounds for the group
 */
public String group(int n){
    final MemReg span=bounds(n);
    return (span==null)? null: getString(span.in,span.out);
}
/**
 * Returns the text captured by the named group.
 * The name is resolved to a numeric id through the pattern.
 * @param name the group name
 * @return the group contents, or null if the group is not captured
 * @throws IllegalArgumentException if the pattern defines no group with this name
 */
public String group(String name){
    final Integer groupId=re.groupId(name);
    if(groupId!=null){
        return group(groupId.intValue());
    }
    throw new IllegalArgumentException("<"+name+"> isn't defined");
}
/**
 * Appends the contents of the given group to a TextBuffer.
 * @param n the group id, as accepted by bounds(int)
 * @param tb the buffer to append to
 * @return true if the group has bounds and was appended, false otherwise (nothing is appended)
 */
public boolean getGroup(int n,TextBuffer tb){
    final MemReg span=bounds(n);
    if(span==null){
        return false;
    }
    final int from=span.in;
    tb.append(data,from,span.out-from);
    return true;
}
/**
 * Appends the contents of the named group to a TextBuffer.
 * @param name the group name, resolved through the pattern
 * @param tb the buffer to append to
 * @return true if the group has bounds and was appended, false otherwise
 * @throws IllegalArgumentException if the pattern defines no group with this name
 */
public boolean getGroup(String name,TextBuffer tb){
    final Integer groupId=re.groupId(name);
    if(groupId!=null){
        return getGroup(groupId.intValue(),tb);
    }
    throw new IllegalArgumentException("unknown group: \""+name+"\"");
}
/**
 * Appends the contents of the given group to a StringBuffer.
 * @param n the group id, as accepted by bounds(int)
 * @param sb the buffer to append to
 * @return true if the group has bounds and was appended, false otherwise (nothing is appended)
 */
public boolean getGroup(int n,StringBuffer sb){
    final MemReg span=bounds(n);
    if(span==null){
        return false;
    }
    final int from=span.in;
    sb.append(data,from,span.out-from);
    return true;
}
/**
 * Appends the contents of the named group to a StringBuffer.
 * @param name the group name, resolved through the pattern
 * @param sb the buffer to append to
 * @return true if the group has bounds and was appended, false otherwise
 * @throws IllegalArgumentException if the pattern defines no group with this name
 */
public boolean getGroup(String name,StringBuffer sb){
    final Integer groupId=re.groupId(name);
    if(groupId!=null){
        return getGroup(groupId.intValue(),sb);
    }
    throw new IllegalArgumentException("unknown group: \""+name+"\"");
}
/**
*/
public String[] groups(){
MemReg[] memregs=this.memregs;
String[] groups=new String[memregs.length];
int in,out;
MemReg mr;
for(int i=0;i=0){
mr=memregs[id];
}
else switch(id){
case PREFIX:
mr=prefixBounds;
if(mr==null) prefixBounds=mr=new MemReg(PREFIX);
mr.in=offset;
mr.out=wOffset;
break;
case SUFFIX:
mr=suffixBounds;
if(mr==null) suffixBounds=mr=new MemReg(SUFFIX);
mr.in=wEnd;
mr.out=end;
break;
case TARGET:
mr=targetBounds;
if(mr==null) targetBounds=mr=new MemReg(TARGET);
mr.in=offset;
mr.out=end;
break;
default:
throw new IllegalArgumentException("illegal group id: "+id+"; must either nonnegative int, or MatchResult.PREFIX, or MatchResult.SUFFIX");
}
//System.out.println(" mr=["+mr.in+","+mr.out+"]");
int in;
if((in=mr.in)<0 || mr.out=0 && wEnd>=wOffset;
}
/**
* Tells whether the group with the given id is captured, i.e. whether
* bounds(id) returns non-null bounds for it.
* @param id the group id, as accepted by bounds(int)
*/
public final boolean isCaptured(int id){
return bounds(id)!=null;
}
/**
 * Tells whether the named group is captured.
 * @param groupName the group name, resolved through the pattern
 * @return the result of isCaptured(int) for the resolved group id
 * @throws IllegalArgumentException if the pattern defines no group with this name
 */
public final boolean isCaptured(String groupName){
    final Integer groupId=re.groupId(groupName);
    if(groupId!=null){
        return isCaptured(groupId.intValue());
    }
    throw new IllegalArgumentException("unknown group: \""+groupName+"\"");
}
/**
* Returns the length of the given group (out-in of its bounds).
* NOTE(review): bounds(id) can return null (see isCaptured(int)); in that case
* this method throws a NullPointerException - check isCaptured(id) first.
* @param id the group id, as accepted by bounds(int)
*/
public final int length(int id){
MemReg mr=bounds(id);
return mr.out-mr.in;
}
/**
* Returns the start of the given group, relative to the start of the target.
* NOTE(review): bounds(id) can return null (see isCaptured(int)); in that case
* this method throws a NullPointerException - check isCaptured(id) first.
* @param id the group id, as accepted by bounds(int)
*/
public final int start(int id){
return bounds(id).in-offset;
}
/**
* Returns the end of the given group, relative to the start of the target.
* NOTE(review): bounds(id) can return null (see isCaptured(int)); in that case
* this method throws a NullPointerException - check isCaptured(id) first.
* @param id the group id, as accepted by bounds(int)
*/
public final int end(int id){
return bounds(id).out-offset;
}
private final boolean search(int anchors){
called=true;
final int end=this.end;
int offset=this.offset;
char[] data=this.data;
int wOffset=this.wOffset;
int wEnd=this.wEnd;
MemReg[] memregs=this.memregs;
int[] counters=this.counters;
LAEntry[] lookaheads=this.lookaheads;
//int memregCount=memregs.length;
//int cntCount=counters.length;
int memregCount=this.memregCount;
int cntCount=this.counterCount;
SearchEntry defaultEntry=this.defaultEntry;
SearchEntry first=this.first;
SearchEntry top=this.top;
SearchEntry actual=null;
int cnt,regLen;
int i;
final boolean matchEnd=(anchors&ANCHOR_END)>0;
final boolean allowIncomplete=(anchors&ACCEPT_INCOMPLETE)>0;
Pattern re=this.re;
Term root=re.root;
Term term;
if(top==null){
if((anchors&ANCHOR_START)>0){
term=re.root0; //raw root
root=startAnchor;
}
else if((anchors&ANCHOR_LASTMATCH)>0){
term=re.root0; //raw root
root=lastMatchAnchor;
}
else{
term=root; //optimized root
}
i=wOffset;
actual=first;
SearchEntry.popState(defaultEntry,memregs,counters);
}
else{
top=(actual=top).sub;
term=actual.term;
i=actual.index;
SearchEntry.popState(actual,memregs,counters);
}
cnt=actual.cnt;
regLen=actual.regLen;
main:
while(wOffset<=end){
matchHere:
for(;;){
/*
System.out.print("char: "+i+", term: ");
System.out.print(term.toString());
System.out.print(" // mrs:{");
for(int dbi=0;dbiend) break;
}
term=term.next;
continue matchHere;
}
case Term.VOID:
term=term.next;
continue matchHere;
case Term.CHAR:
//can only be 1-char-wide
// \/
if(i>=end || data[i]!=term.c) break;
//System.out.println("CHAR: "+data[i]+", i="+i);
i++;
term=term.next;
continue matchHere;
case Term.ANY_CHAR:
//can only be 1-char-wide
// \/
if(i>=end) break;
i++;
term=term.next;
continue matchHere;
case Term.ANY_CHAR_NE:
//can only be 1-char-wide
// \/
if(i>=end || (c=data[i])=='\r' || c=='\n') break;
i++;
term=term.next;
continue matchHere;
case Term.END:
if(i>=end){ //meets
term=term.next;
continue matchHere;
}
break;
case Term.END_EOL: //perl's $
if(i>=end){ //meets
term=term.next;
continue matchHere;
}
else{
boolean matches=
i>=end |
((i+1)==end && data[i]=='\n') |
((i+2)==end && data[i]=='\r' && data[i+1]=='\n');
if(matches){
term=term.next;
continue matchHere;
}
else break;
}
case Term.LINE_END:
if(i>=end){ //meets
term=term.next;
continue matchHere;
}
else{
/*
if(((c=data[i])=='\r' || c=='\n') &&
(c=data[i-1])!='\r' && c!='\n'){
term=term.next;
continue matchHere;
}
*/
//5 aug 2001
if((c=data[i])=='\r' || c=='\n'){
term=term.next;
continue matchHere;
}
}
break;
case Term.START: //Perl's "^"
if(i==offset){ //meets
term=term.next;
continue matchHere;
}
//break;
//changed on 27-04-2002
//due to a side effect: if ALLOW_INCOMPLETE is enabled,
//the anchorStart moves up to the end and succeeds
//(see comments at the last lines of matchHere, ~line 1830)
//Solution: if there are some entries on the stack ("^a|b$"),
//try them; otherwise it's a final 'no'
//if(top!=null) break;
//else break main;
//changed on 25-05-2002
//rationale: if the term is startAnchor,
//it's the root term by definition,
//so if it doesn't match, the entire pattern
//couldn't match too;
//otherwise we could have the following problem:
//"c|^a" against "abc" finds only "a"
if(top!=null) break;
if(term!=startAnchor) break;
else break main;
case Term.LAST_MATCH_END:
if(i==wEnd){ //meets
term=term.next;
continue matchHere;
}
break main; //return false
case Term.LINE_START:
if(i==offset){ //meets
term=term.next;
continue matchHere;
}
else if(i=end) break;
c=data[i];
if(!(c<=255 && term.bitset[c])^term.inverse) break;
i++;
term=term.next;
continue matchHere;
}
case Term.BITSET2:{
//can only be 1-char-wide
// \/
if(i>=end) break;
c=data[i];
boolean[] arr=term.bitset2[c>>8];
if(arr==null || !arr[c&255]^term.inverse) break;
i++;
term=term.next;
continue matchHere;
}
case Term.BOUNDARY:{
boolean ch1Meets=false,ch2Meets=false;
boolean[] bitset=term.bitset;
test1:{
int j=i-1;
//if(j=end) break test1;
if(j=end) break test2;
if(i>=end) break test2;
c= data[i];
ch2Meets= (c<256 && bitset[c]);
}
if(ch1Meets^ch2Meets^term.inverse){ //meets
term=term.next;
continue matchHere;
}
else break;
}
case Term.UBOUNDARY:{
boolean ch1Meets=false,ch2Meets=false;
boolean[][] bitset2=term.bitset2;
test1:{
int j=i-1;
//if(j=end) break test1;
if(j>8];
ch1Meets= bits!=null && bits[c&0xff];
}
test2:{
//if(i=end) break test2;
if(i>=end) break test2;
c= data[i];
boolean[] bits=bitset2[c>>8];
ch2Meets= bits!=null && bits[c&0xff];
}
if(ch1Meets^ch2Meets^term.inverse){ //is boundary ^ inv
term=term.next;
continue matchHere;
}
else break;
}
case Term.DIRECTION:{
boolean ch1Meets=false,ch2Meets=false;
boolean[] bitset=term.bitset;
boolean inv=term.inverse;
//System.out.println("i="+i+", inv="+inv+", bitset="+CharacterClass.stringValue0(bitset));
int j=i-1;
//if(j>=offset && j=offset){
c= data[j];
ch1Meets= c<256 && bitset[c];
//System.out.println(" ch1Meets="+ch1Meets);
}
if(ch1Meets^inv) break;
//if(i>=offset && i=offset && j=offset){
c= data[j];
boolean[] bits=bitset2[c>>8];
ch1Meets= bits!=null && bits[c&0xff];
}
if(ch1Meets^inv) break;
//if(i>=offset && i>8];
ch2Meets= bits!=null && bits[c&0xff];
}
if(!ch2Meets^inv) break;
term=term.next;
continue matchHere;
}
case Term.REG:{
MemReg mr=memregs[term.memreg];
int sampleOffset=mr.in;
int sampleOutside=mr.out;
int rLen;
if(sampleOffset<0 || (rLen=sampleOutside-sampleOffset)<0){
break;
}
else if(rLen==0){
term=term.next;
continue matchHere;
}
// don't prevent us from reaching the 'end'
if((i+rLen)>end) break;
if(compareRegions(data,sampleOffset,i,rLen,end)){
i+=rLen;
term=term.next;
continue matchHere;
}
break;
}
case Term.REG_I:{
MemReg mr=memregs[term.memreg];
int sampleOffset=mr.in;
int sampleOutside=mr.out;
int rLen;
if(sampleOffset<0 || (rLen=sampleOutside-sampleOffset)<0){
break;
}
else if(rLen==0){
term=term.next;
continue matchHere;
}
// don't prevent us from reaching the 'end'
if((i+rLen)>end) break;
if(compareRegionsI(data,sampleOffset,i,rLen,end)){
i+=rLen;
term=term.next;
continue matchHere;
}
break;
}
case Term.REPEAT_0_INF:{
//System.out.println("REPEAT, i="+i+", term.minCount="+term.minCount+", term.maxCount="+term.maxCount);
//i+=(cnt=repeat(data,i,end,term.target));
if((cnt=repeat(data,i,end,term.target))<=0){
term=term.next;
continue;
}
i+=cnt;
//branch out the backtracker (that is term.failNext, see Term.make*())
actual.cnt=cnt;
actual.term=term.failNext;
actual.index=i;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
case Term.REPEAT_MIN_INF:{
//System.out.println("REPEAT, i="+i+", term.minCount="+term.minCount+", term.maxCount="+term.maxCount);
cnt=repeat(data,i,end,term.target);
if(cnt0 && compareRegions(data,i,sampleOffset,bitset,end)){
cnt++;
i+=bitset;
countBack--;
}
if(cnt0){
cnt--;
i--;
actual.cnt=cnt;
actual.index=i;
actual.term=term;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else break;
case Term.BACKTRACK_MIN:
//System.out.println("<<");
cnt=actual.cnt;
if(cnt>term.minCount){
cnt--;
i--;
actual.cnt=cnt;
actual.index=i;
actual.term=term;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else break;
case Term.BACKTRACK_FIND_MIN:{
//System.out.print("<<<[cnt=");
cnt=actual.cnt;
//System.out.print(cnt+", minCnt=");
//System.out.print(term.minCount+", target=");
//System.out.print(term.target+"]");
int minCnt;
if(cnt>(minCnt=term.minCount)){
int start=i+term.distance;
if(start>end){
int exceed=start-end;
cnt-=exceed;
if(cnt<=minCnt) break;
i-=exceed;
start=end;
}
int back=findBack(data,i+term.distance,cnt-minCnt,term.target);
//System.out.print("[back="+back+"]");
if(back<0) break;
//cnt-=back;
//i-=back;
if((cnt-=back)<=minCnt){
i-=back;
if(term.eat)i++;
term=term.next;
continue;
}
i-=back;
actual.cnt=cnt;
actual.index=i;
if(term.eat)i++;
actual.term=term;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else break;
}
case Term.BACKTRACK_FINDREG_MIN:{
//System.out.print("<<<[cnt=");
cnt=actual.cnt;
//System.out.print(cnt+", minCnt=");
//System.out.print(term.minCount+", target=");
//System.out.print(term.target);
//System.out.print("reg=<"+memregs[term.target.memreg].in+","+memregs[term.target.memreg].out+">]");
int minCnt;
if(cnt>(minCnt=term.minCount)){
int start=i+term.distance;
if(start>end){
int exceed=start-end;
cnt-=exceed;
if(cnt<=minCnt) break;
i-=exceed;
start=end;
}
MemReg mr=memregs[term.target.memreg];
int sampleOff=mr.in;
int sampleLen=mr.out-sampleOff;
//if(sampleOff<0 || sampleLen<0) throw new Error("backreference used before definition: \\"+term.memreg);
//int back=findBackReg(data,i+term.distance,sampleOff,sampleLen,cnt-minCnt,term.target,end);
//if(back<0) break;
/*@since 1.2*/
int back;
if(sampleOff<0 || sampleLen<0){
//the group is not def., as in the case of '(\w+)\1'
//treat as usual BACKTRACK_MIN
cnt--;
i--;
actual.cnt=cnt;
actual.index=i;
actual.term=term;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else if(sampleLen==0){
back=-1;
}
else{
back=findBackReg(data,i+term.distance,sampleOff,sampleLen,cnt-minCnt,term.target,end);
//System.out.print("[back="+back+"]");
if(back<0) break;
}
cnt-=back;
i-=back;
actual.cnt=cnt;
actual.index=i;
if(term.eat)i+=sampleLen;
actual.term=term;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else break;
}
case Term.BACKTRACK_REG_MIN:
//System.out.println("<<");
cnt=actual.cnt;
if(cnt>term.minCount){
regLen=actual.regLen;
cnt--;
i-=regLen;
actual.cnt=cnt;
actual.index=i;
actual.term=term;
//actual.regLen=regLen;
actual=(top=actual).on;
if(actual==null){
actual=new SearchEntry();
top.on=actual;
actual.sub=top;
}
term=term.next;
continue;
}
else break;
case Term.GROUP_IN:{
memreg=term.memreg;
//memreg=0 is a regex itself; we don't need to handle it
//because regex bounds already are in wOffset and wEnd
if(memreg>0){
//MemReg mr=memregs[memreg];
//saveMemregState((top!=null)? top: defaultEntry,memreg,mr);
//mr.in=i;
memregs[memreg].tmp=i; //assume
}
term=term.next;
continue;
}
case Term.GROUP_OUT:
memreg=term.memreg;
//see above
if(memreg>0){
//if(term.saveState)saveMemregState((top!=null)? top: defaultEntry,memreg,memregs);
MemReg mr=memregs[memreg];
SearchEntry.saveMemregState((top!=null)? top: defaultEntry,memreg,mr);
mr.in=mr.tmp; //commit
mr.out=i;
}
term=term.next;
continue;
case Term.PLOOKBEHIND_IN:{
int tmp=i-term.distance;
if(tmp0;c--,p1--,p2--){
if(arr[p1]!=arr[p2]){
//System.out.println(" : no");
return false;
}
}
//System.out.println(" : yes");
return true;
}
/**
 * Case-insensitively compares two regions of the same array:
 * arr[off1..off1+len) against arr[off2..off2+len).
 * A pair of characters is accepted when the first one equals the lower-case,
 * upper-case or title-case form of the second one.
 * Returns false without comparing when the last index of either region is not below 'out'.
 * The comparison runs from the last character of the regions towards the first.
 */
private static final boolean compareRegionsI(char[] arr, int off1, int off2, int len,int out){
    final int last1=off1+len-1;
    final int last2=off2+len-1;
    if(last1>=out || last2>=out){
        return false;
    }
    for(int k=len-1;k>=0;k--){
        final char left=arr[off1+k];
        final char right=arr[off2+k];
        final boolean same=
            left==Character.toLowerCase(right) ||
            left==Character.toUpperCase(right) ||
            left==Character.toTitleCase(right);
        if(!same){
            return false;
        }
    }
    return true;
}
//repeat while matches
private static final int repeat(char[] data,int off,int out,Term term){
//System.out.print("off="+off+", out="+out+", term="+term);
switch(term.type){
case Term.CHAR:{
char c=term.c;
int i=off;
while(i>8];
if(arr!=null && arr[c&0xff]) break;
else i++;
}
else while(i>8];
if(arr!=null && arr[c&0xff]) i++;
else break;
}
return i-off;
}
}
throw new Error("this kind of term can't be quantified:"+term.type);
}
//repeat while doesn't match
private static final int find(char[] data,int off,int out,Term term){
//System.out.print("off="+off+", out="+out+", term="+term);
if(off>=out) return -1;
switch(term.type){
case Term.CHAR:{
char c=term.c;
int i=off;
while(i>8];
if(arr!=null && arr[c&0xff]) break;
else i++;
}
else while(i>8];
if(arr!=null && arr[c&0xff]) i++;
else break;
}
return i-off;
}
}
throw new IllegalArgumentException("can't seek this kind of term:"+term.type);
}
private static final int findReg(char[] data,int off,int regOff,int regLen,Term term,int out){
//System.out.print("off="+off+", out="+out+", term="+term);
if(off>=out) return -1;
int i=off;
if(term.type==Term.REG){
while(i255 || !arr[c]) break;
if(i<=iMin) return -1;
}
return off-i;
}
case Term.BITSET2:{
boolean[][] bitset2=term.bitset2;
int i=off;
char c;
int iMin=off-maxCount;
if(!term.inverse) for(;;){
boolean[] arr=bitset2[(c=data[--i])>>8];
if(arr!=null && arr[c&0xff]) break;
if(i<=iMin) return -1;
}
else for(;;){
boolean[] arr=bitset2[(c=data[--i])>>8];
if(arr==null || arr[c&0xff]) break;
if(i<=iMin) return -1;
}
return off-i;
}
}
throw new IllegalArgumentException("can't find this kind of term:"+term.type);
}
/**
 * Scans backwards from 'off' for the nearest occurrence of the sample region
 * data[regOff..regOff+regLen), looking at no more than maxCount positions.
 * The first character of the sample is tested directly; the remaining
 * characters are compared by compareRegions() (Term.REG) or
 * compareRegionsI() (Term.REG_I), with 'out' passed on as their bound.
 * The caller is expected to handle regLen==0 and maxCount==0 itself.
 * @return the distance from 'off' back to the found position, or -1 if there is none
 * @throws IllegalArgumentException if the term is neither REG nor REG_I
 * @since 1.2
 */
private static final int findBackReg(char[] data,int off,int regOff,int regLen,int maxCount,Term term,int out){
    final int floor=off-maxCount;
    final int tailOff=regOff+1;
    final int tailLen=regLen-1;
    int pos=off;
    switch(term.type){
        case Term.REG:{
            final char head=data[regOff];
            do{
                pos--;
                if(data[pos]==head && compareRegions(data,pos+1,tailOff,tailLen,out)){
                    return off-pos;
                }
            }while(pos>floor);
            return -1;
        }
        case Term.REG_I:{
            final char sample=data[regOff];
            final char headLower=Character.toLowerCase(sample);
            final char headUpper=Character.toUpperCase(sample);
            final char headTitle=Character.toTitleCase(sample);
            do{
                pos--;
                final char c=data[pos];
                if((c==headLower || c==headUpper || c==headTitle) && compareRegionsI(data,pos+1,tailOff,tailLen,out)){
                    return off-pos;
                }
            }while(pos>floor);
            return -1;
        }
        default:
            throw new IllegalArgumentException("wrong findBackReg() target type :"+term.type);
    }
}
public String toString_d(){
StringBuffer s=new StringBuffer();
s.append("counters: ");
s.append(counters==null? 0: counters.length);
s.append("\r\nmemregs: ");
s.append(memregs.length);
for(int i=0;i0) on.reset(restQueue-1);
else{
this.on=null;
on.sub=null;
}
}
//sub=on=null;
}
}
/**
 * Bounds of one group inside the matcher's data buffer.
 * 'in' and 'out' are -1 while the group is not captured.
 */
class MemReg{
    /** Identifier passed to the constructor. */
    int index;
    /** Start of the region, or -1 when unset. */
    int in=-1;
    /** End of the region, or -1 when unset. */
    int out=-1;
    /** Provisional start, recorded at GROUP_IN and committed to 'in' at GROUP_OUT. */
    int tmp=-1;

    MemReg(int index){
        this.index=index;
    }

    /** Marks the region as unset; 'tmp' and 'index' are left untouched. */
    void reset(){
        out=-1;
        in=-1;
    }
}
/**
 * Plain holder of an index and two SearchEntry references.
 * NOTE(review): presumably keeps the search-stack state saved for a lookahead
 * construct - verify against the lookahead handling in Matcher.search().
 */
class LAEntry{
    int index;
    SearchEntry top;
    SearchEntry actual;
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy