All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.groupbyinc.common.jregex.Matcher Maven / Gradle / Ivy

There is a newer version: 198
Show newest version
/**
 * Copyright (c) 2001, Sergey A. Samokhodkin
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification, 
 * are permitted provided that the following conditions are met:
 * 
 * - Redistributions of source code must retain the above copyright notice, 
 * this list of conditions and the following disclaimer. 
 * - Redistributions in binary form 
 * must reproduce the above copyright notice, this list of conditions and the following 
 * disclaimer in the documentation and/or other materials provided with the distribution.
 * - Neither the name of jregex nor the names of its contributors may be used 
 * to endorse or promote products derived from this software without specific prior 
 * written permission. 
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * @version 1.2_01
 */

package jregex;

import java.util.*;
import java.io.*;

/**
 * Matcher instance is an automaton that actually performs matching. It provides the following methods:
 * 
  • searching for matching substrings : matcher.find() or matcher.findAll(); *
  • testing whether a text matches a whole pattern : matcher.matches(); *
  • testing whether the text matches the beginning of a pattern : matcher.matchesPrefix(); *
  • searching with custom options : matcher.find(int options) *

    * Obtaining results
    * After the search succeeded, i.e. if one of the above methods returned true, * one may obtain information on the match: *

  • may check whether some group is captured : matcher.isCaptured(int); *
  • may obtain start and end positions of the match and its length : matcher.start(int),matcher.end(int),matcher.length(int); *
  • may obtain match contents as String : matcher.group(int).
    * The match prefix and suffix information can be obtained the same way. * The appropriate methods are grouped in the MatchResult interface, which the Matcher class implements.
    * Matcher objects are not thread-safe, so only one thread may use a matcher instance at a time. * Note, that Pattern objects are thread-safe(the same instanse may be shared between * multiple threads), and the typical tactics in multithreaded applications is to have one Pattern instance per expression(a singleton), * and one Matcher object per thread. */ public class Matcher implements MatchResult{ /* Matching options*/ /** * The same effect as "^" without REFlags.MULTILINE. * @see Matcher#find(int) */ public static final int ANCHOR_START=1; /** * The same effect as "\\G". * @see Matcher#find(int) */ public static final int ANCHOR_LASTMATCH=2; /** * The same effect as "$" without REFlags.MULTILINE. * @see Matcher#find(int) */ public static final int ANCHOR_END=4; /** * Experimental option; if a text ends up before the end of a pattern,report a match. * @see Matcher#find(int) */ public static final int ACCEPT_INCOMPLETE=8; //see search(ANCHOR_START|...) private static Term startAnchor=new Term(Term.START); //see search(ANCHOR_LASTMATCH|...) private static Term lastMatchAnchor=new Term(Term.LAST_MATCH_END); private Pattern re; private int[] counters; private MemReg[] memregs; private LAEntry[] lookaheads; private int counterCount; private int memregCount; private int lookaheadCount; private char[] data; private int offset,end,wOffset,wEnd; private boolean shared; private SearchEntry top; //stack entry private SearchEntry first; //object pool entry private SearchEntry defaultEntry; //called when moving the window private boolean called; private int minQueueLength; private String cache; //cache may be longer than the actual data //and contrariwise; so cacheOffset may have both signs. //cacheOffset is actually -(data offset). 
private int cacheOffset,cacheLength; private MemReg prefixBounds,suffixBounds,targetBounds; Matcher(Pattern regex){ this.re=regex; //int memregCount=(memregs=new MemReg[regex.memregs]).length; //for(int i=0;i0){ MemReg[] memregs=new MemReg[memregCount]; for(int i=0;i0) counters=new int[counterCount]; if((lookaheadCount=regex.lookaheads)>0){ LAEntry[] lookaheads=new LAEntry[lookaheadCount]; for(int i=0;i * Matcher m=new Pattern("\\w+").matcher(myString); * if(m.find())m.setTarget(m,m.SUFFIX); //forget all that is not a suffix *
  • * Resets current search position to zero. * @param m - a matcher that is a source of data * @param groupId - which group to take data from * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) * @see Matcher#setTarget(java.io.Reader,int) */ public final void setTarget(Matcher m, int groupId){ MemReg mr=m.bounds(groupId); //System.out.println("setTarget("+m+","+groupId+")"); //System.out.println(" in="+mr.in); //System.out.println(" out="+mr.out); if(mr==null) throw new IllegalArgumentException("group #"+groupId+" is not assigned"); data=m.data; offset=mr.in; end=mr.out; cache=m.cache; cacheLength=m.cacheLength; cacheOffset=m.cacheOffset; if(m!=this){ shared=true; m.shared=true; } init(); } /** * Supplies a text to search in/match with. * Resets current search position to zero. * @param text - a data * @see Matcher#setTarget(jregex.Matcher,int) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) * @see Matcher#setTarget(java.io.Reader,int) */ public void setTarget(String text){ setTarget(text,0,text.length()); } /** * Supplies a text to search in/match with, as a part of String. * Resets current search position to zero. * @param text - a data source * @param start - where the target starts * @param len - how long is the target * @see Matcher#setTarget(jregex.Matcher,int) * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(char[],int,int) * @see Matcher#setTarget(java.io.Reader,int) */ public void setTarget(String text,int start,int len){ char[] mychars=data; if(mychars==null || shared || mychars.lengthshared=false:
       *   myMatcher.setTarget(myCharArray,x,y,false); //we declare that the array contents are NEITHER shared NOR will be used later, so modifications to them are permitted
       * 
    * then we should expect the array contents to be changed on subsequent setTarget(..) operations. * Such method may yield some increase in perfomanse in the case of multiple setTarget() calls. * Resets current search position to zero. * @param text - a data source * @param start - where the target starts * @param len - how long is the target * @param shared - if true: data are shared or used later, don't modify it; if false: possible modifications of the text on subsequent setTarget() calls are perceived and allowed. * @see Matcher#setTarget(jregex.Matcher,int) * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) * @see Matcher#setTarget(java.io.Reader,int) */ public final void setTarget(char[] text,int start,int len,boolean shared){ cache=null; data=text; offset=start; end=start+len; this.shared=shared; init(); } /** * Supplies a text to search in/match with through a stream. * Resets current search position to zero. * @param in - a data stream; * @param len - how much characters should be read; if len is -1, read the entire stream. 
* @see Matcher#setTarget(jregex.Matcher,int) * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) */ public void setTarget(Reader in,int len)throws IOException{ if(len<0){ setAll(in); return; } char[] mychars=data; boolean shared=this.shared; if(mychars==null || shared || mychars.length=0){ len-=c; count+=c; if(len==0) break; } setTarget(mychars,0,count,shared); } private void setAll(Reader in)throws IOException{ char[] mychars=data; int free; boolean shared=this.shared; if(mychars==null || shared){ mychars=new char[free=1024]; shared=false; } else free=mychars.length; int count=0; int c; while((c=in.read(mychars,count,free))>=0){ free-=c; count+=c; if(free==0){ int newsize=count*3; char[] newchars=new char[newsize]; System.arraycopy(mychars,0,newchars,0,count); mychars=newchars; free=newsize-count; shared=false; } } setTarget(mychars,0,count,shared); } private final String getString(int start,int end){ String src=cache; if(src!=null){ int co=cacheOffset; return src.substring(start-co,end-co); } int tOffset,tEnd,tLen=(tEnd=this.end)-(tOffset=this.offset); char[] data=this.data; if((end-start)>=(tLen/3)){ //it makes sence to make a cache cache=src=new String(data,tOffset,tLen); cacheOffset=tOffset; cacheLength=tLen; return src.substring(start-tOffset,end-tOffset); } return new String(data,start,end-start); } /* Matching */ /** * Tells whether the entire target matches the beginning of the pattern. * The whole pattern is also regarded as its beginning.
    * This feature makes it possible to detect a mismatch by examining only the beginning part of * the target (if the beginning of the target doesn't match the beginning of the pattern, then the entire target * can't match either).
    * For example the following assertions yield true:
       *   Pattern p=new Pattern("abcd"); 
       *   p.matcher("").matchesPrefix();
       *   p.matcher("a").matchesPrefix();
       *   p.matcher("ab").matchesPrefix();
       *   p.matcher("abc").matchesPrefix();
       *   p.matcher("abcd").matchesPrefix();
       * 
    * and the following yield false:
       *   p.matcher("b").matchesPrefix();
       *   p.matcher("abcdef").matchesPrefix();
       *   p.matcher("x").matchesPrefix();
       * 
    * @return true if the entire target matches the beginning of the pattern */ public final boolean matchesPrefix(){ setPosition(0); return search(ANCHOR_START|ACCEPT_INCOMPLETE|ANCHOR_END); } /** * Just an old name for isPrefix().
    * Retained for backwards compatibility. * @deprecated Replaced by isPrefix() */ public final boolean isStart(){ return matchesPrefix(); } /** * Tells whether a current target matches the whole pattern. * For example the following yields the true:
       *   Pattern p=new Pattern("\\w+"); 
       *   p.matcher("a").matches();
       *   p.matcher("ab").matches();
       *   p.matcher("abc").matches();
       * 
    * and the following yields the false:
       *   p.matcher("abc def").matches();
       *   p.matcher("bcd ").matches();
       *   p.matcher(" bcd").matches();
       *   p.matcher("#xyz#").matches();
       * 
    * @return whether a current target matches the whole pattern. */ public final boolean matches(){ if(called) setPosition(0); return search(ANCHOR_START|ANCHOR_END); } /** * Just a combination of setTarget(String) and matches(). * @param s the target string; * @return whether the specified string matches the whole pattern. */ public final boolean matches(String s){ setTarget(s); return search(ANCHOR_START|ANCHOR_END); } /** * Allows to set a position the subsequent find()/find(int) will start from. * @param pos the position to start from; * @see Matcher#find() * @see Matcher#find(int) */ public void setPosition(int pos){ wOffset=offset+pos; wEnd=-1; called=false; flush(); } /** * Searches through a target for a matching substring, starting from just after the end of last match. * If there wasn't any search performed, starts from zero. * @return true if a match found. */ public final boolean find(){ if(called) skip(); return search(0); } /** * Searches through a target for a matching substring, starting from just after the end of last match. * If there wasn't any search performed, starts from zero. * @param anchors a zero or a combination(bitwise OR) of ANCHOR_START,ANCHOR_END,ANCHOR_LASTMATCH,ACCEPT_INCOMPLETE * @return true if a match found. */ public final boolean find(int anchors){ if(called) skip(); return search(anchors); } /** * The same as findAll(int), but with default behaviour; */ public MatchIterator findAll(){ return findAll(0); } /** * Returns an iterator over the matches found by subsequently calling find(options), the search starts from the zero position. 
*/ public MatchIterator findAll(final int options){ //setPosition(0); return new MatchIterator(){ private boolean checked=false; private boolean hasMore=false; public boolean hasMore(){ if(!checked) check(); return hasMore; } public MatchResult nextMatch(){ if(!checked) check(); if(!hasMore) throw new NoSuchElementException(); checked=false; return Matcher.this; } private final void check(){ hasMore=find(options); checked=true; } public int count(){ if(!checked) check(); if(!hasMore) return 0; int c=1; while(find(options))c++; checked=false; return c; } }; } /** * Continues to search from where the last search left off. * The same as proceed(0). * @see Matcher#proceed(int) */ public final boolean proceed(){ return proceed(0); } /** * Continues to search from where the last search left off using specified options:
       * Matcher m=new Pattern("\\w+").matcher("abc");
       * while(m.proceed(0)){
       *    System.out.println(m.group(0));
       * }
       * 
    * Output:
       * abc
       * ab
       * a
       * bc
       * b
       * c
       * 
    For example, let's find all odd numbers occurring in a text:
       *    Matcher m=new Pattern("\\d+").matcher("123");
       *    while(m.proceed(0)){
       *       String match=m.group(0);
       *       if(isOdd(Integer.parseInt(match))) System.out.println(match);
       *    }
       *    
       *    static boolean isOdd(int i){
       *       return (i&1)>0;
       *    }
       * 
    * This outputs:
       * 123
       * 1
       * 23
       * 3
       * 
    * Note that using find() method we would find '123' only. * @param options search options, some of ANCHOR_START|ANCHOR_END|ANCHOR_LASTMATCH|ACCEPT_INCOMPLETE; zero value(default) stands for usual search for substring. */ public final boolean proceed(int options){ //System.out.println("next() : top="+top); if(called){ if(top==null){ wOffset++; } } return search(0); } /** * Sets the current search position just after the end of last match. */ public final void skip(){ int we=wEnd; if(wOffset==we){ //requires special handling //if no variants at 'wOutside',advance pointer and clear if(top==null){ wOffset++; flush(); } //otherwise, if there exist a variant, //don't clear(), i.e. allow it to match return; } else{ if(we<0) wOffset=0; else wOffset=we; } //rflush(); //rflush() works faster on simple regexes (with a small group/branch number) flush(); } private final void init(){ //wOffset=-1; //System.out.println("init(): offset="+offset+", end="+end); wOffset=offset; wEnd=-1; called=false; flush(); } /** * Resets the internal state. 
*/ private final void flush(){ top=null; defaultEntry.reset(0); /* int c=0; SearchEntry se=first; while(se!=null){ c++; se=se.on; } System.out.println("queue: allocated="+c+", truncating to "+minQueueLength); new Exception().printStackTrace(); */ first.reset(minQueueLength); //first.reset(0); for(int i=memregs.length-1;i>0;i--){ MemReg mr=memregs[i]; mr.in=mr.out=-1; } for(int i=memregs.length-1;i>0;i--){ MemReg mr=memregs[i]; mr.in=mr.out=-1; } called=false; } //reverse flush //may work significantly faster, //need testing private final void rflush(){ SearchEntry entry=top; top=null; MemReg[] memregs=this.memregs; int[] counters=this.counters; while(entry!=null){ SearchEntry next=entry.sub; SearchEntry.popState(entry,memregs,counters); entry=next; } SearchEntry.popState(defaultEntry,memregs,counters); } /** */ public String toString(){ return getString(wOffset,wEnd); } public Pattern pattern(){ return re; } public String target(){ return getString(offset,end); } /** */ public char[] targetChars(){ shared=true; return data; } /** */ public int targetStart(){ return offset; } /** */ public int targetEnd(){ return end; } public char charAt(int i){ int in=this.wOffset; int out=this.wEnd; if(in<0 || out(mr.out-in)) throw new StringIndexOutOfBoundsException(""+i); return data[in+i]; } public final int length(){ return wEnd-wOffset; } /** */ public final int start(){ return wOffset-offset; } /** */ public final int end(){ return wEnd-offset; } /** */ public String prefix(){ return getString(offset,wOffset); } /** */ public String suffix(){ return getString(wEnd,end); } /** */ public int groupCount(){ return memregs.length; } /** */ public String group(int n){ MemReg mr=bounds(n); if(mr==null) return null; return getString(mr.in,mr.out); } /** */ public String group(String name){ Integer id=re.groupId(name); if(id==null) throw new IllegalArgumentException("<"+name+"> isn't defined"); return group(id.intValue()); } /** */ public boolean getGroup(int n,TextBuffer tb){ 
MemReg mr=bounds(n); if(mr==null) return false; int in; tb.append(data,in=mr.in,mr.out-in); return true; } /** */ public boolean getGroup(String name,TextBuffer tb){ Integer id=re.groupId(name); if(id==null) throw new IllegalArgumentException("unknown group: \""+name+"\""); return getGroup(id.intValue(),tb); } /** */ public boolean getGroup(int n,StringBuffer sb){ MemReg mr=bounds(n); if(mr==null) return false; int in; sb.append(data,in=mr.in,mr.out-in); return true; } /** */ public boolean getGroup(String name,StringBuffer sb){ Integer id=re.groupId(name); if(id==null) throw new IllegalArgumentException("unknown group: \""+name+"\""); return getGroup(id.intValue(),sb); } /** */ public String[] groups(){ MemReg[] memregs=this.memregs; String[] groups=new String[memregs.length]; int in,out; MemReg mr; for(int i=0;i=0){ mr=memregs[id]; } else switch(id){ case PREFIX: mr=prefixBounds; if(mr==null) prefixBounds=mr=new MemReg(PREFIX); mr.in=offset; mr.out=wOffset; break; case SUFFIX: mr=suffixBounds; if(mr==null) suffixBounds=mr=new MemReg(SUFFIX); mr.in=wEnd; mr.out=end; break; case TARGET: mr=targetBounds; if(mr==null) targetBounds=mr=new MemReg(TARGET); mr.in=offset; mr.out=end; break; default: throw new IllegalArgumentException("illegal group id: "+id+"; must either nonnegative int, or MatchResult.PREFIX, or MatchResult.SUFFIX"); } //System.out.println(" mr=["+mr.in+","+mr.out+"]"); int in; if((in=mr.in)<0 || mr.out=0 && wEnd>=wOffset; } /** */ public final boolean isCaptured(int id){ return bounds(id)!=null; } /** */ public final boolean isCaptured(String groupName){ Integer id=re.groupId(groupName); if(id==null) throw new IllegalArgumentException("unknown group: \""+groupName+"\""); return isCaptured(id.intValue()); } /** */ public final int length(int id){ MemReg mr=bounds(id); return mr.out-mr.in; } /** */ public final int start(int id){ return bounds(id).in-offset; } /** */ public final int end(int id){ return bounds(id).out-offset; } private final boolean 
search(int anchors){ called=true; final int end=this.end; int offset=this.offset; char[] data=this.data; int wOffset=this.wOffset; int wEnd=this.wEnd; MemReg[] memregs=this.memregs; int[] counters=this.counters; LAEntry[] lookaheads=this.lookaheads; //int memregCount=memregs.length; //int cntCount=counters.length; int memregCount=this.memregCount; int cntCount=this.counterCount; SearchEntry defaultEntry=this.defaultEntry; SearchEntry first=this.first; SearchEntry top=this.top; SearchEntry actual=null; int cnt,regLen; int i; final boolean matchEnd=(anchors&ANCHOR_END)>0; final boolean allowIncomplete=(anchors&ACCEPT_INCOMPLETE)>0; Pattern re=this.re; Term root=re.root; Term term; if(top==null){ if((anchors&ANCHOR_START)>0){ term=re.root0; //raw root root=startAnchor; } else if((anchors&ANCHOR_LASTMATCH)>0){ term=re.root0; //raw root root=lastMatchAnchor; } else{ term=root; //optimized root } i=wOffset; actual=first; SearchEntry.popState(defaultEntry,memregs,counters); } else{ top=(actual=top).sub; term=actual.term; i=actual.index; SearchEntry.popState(actual,memregs,counters); } cnt=actual.cnt; regLen=actual.regLen; main: while(wOffset<=end){ matchHere: for(;;){ /* System.out.print("char: "+i+", term: "); System.out.print(term.toString()); System.out.print(" // mrs:{"); for(int dbi=0;dbiend) break; } term=term.next; continue matchHere; } case Term.VOID: term=term.next; continue matchHere; case Term.CHAR: //can only be 1-char-wide // \/ if(i>=end || data[i]!=term.c) break; //System.out.println("CHAR: "+data[i]+", i="+i); i++; term=term.next; continue matchHere; case Term.ANY_CHAR: //can only be 1-char-wide // \/ if(i>=end) break; i++; term=term.next; continue matchHere; case Term.ANY_CHAR_NE: //can only be 1-char-wide // \/ if(i>=end || (c=data[i])=='\r' || c=='\n') break; i++; term=term.next; continue matchHere; case Term.END: if(i>=end){ //meets term=term.next; continue matchHere; } break; case Term.END_EOL: //perl's $ if(i>=end){ //meets term=term.next; continue 
matchHere; } else{ boolean matches= i>=end | ((i+1)==end && data[i]=='\n') | ((i+2)==end && data[i]=='\r' && data[i+1]=='\n'); if(matches){ term=term.next; continue matchHere; } else break; } case Term.LINE_END: if(i>=end){ //meets term=term.next; continue matchHere; } else{ /* if(((c=data[i])=='\r' || c=='\n') && (c=data[i-1])!='\r' && c!='\n'){ term=term.next; continue matchHere; } */ //5 aug 2001 if((c=data[i])=='\r' || c=='\n'){ term=term.next; continue matchHere; } } break; case Term.START: //Perl's "^" if(i==offset){ //meets term=term.next; continue matchHere; } //break; //changed on 27-04-2002 //due to a side effect: if ALLOW_INCOMPLETE is enabled, //the anchorStart moves up to the end and succeeds //(see comments at the last lines of matchHere, ~line 1830) //Solution: if there are some entries on the stack ("^a|b$"), //try them; otherwise it's a final 'no' //if(top!=null) break; //else break main; //changed on 25-05-2002 //rationale: if the term is startAnchor, //it's the root term by definition, //so if it doesn't match, the entire pattern //couldn't match too; //otherwise we could have the following problem: //"c|^a" against "abc" finds only "a" if(top!=null) break; if(term!=startAnchor) break; else break main; case Term.LAST_MATCH_END: if(i==wEnd){ //meets term=term.next; continue matchHere; } break main; //return false case Term.LINE_START: if(i==offset){ //meets term=term.next; continue matchHere; } else if(i=end) break; c=data[i]; if(!(c<=255 && term.bitset[c])^term.inverse) break; i++; term=term.next; continue matchHere; } case Term.BITSET2:{ //can only be 1-char-wide // \/ if(i>=end) break; c=data[i]; boolean[] arr=term.bitset2[c>>8]; if(arr==null || !arr[c&255]^term.inverse) break; i++; term=term.next; continue matchHere; } case Term.BOUNDARY:{ boolean ch1Meets=false,ch2Meets=false; boolean[] bitset=term.bitset; test1:{ int j=i-1; //if(j=end) break test1; if(j=end) break test2; if(i>=end) break test2; c= data[i]; ch2Meets= (c<256 && bitset[c]); } 
if(ch1Meets^ch2Meets^term.inverse){ //meets term=term.next; continue matchHere; } else break; } case Term.UBOUNDARY:{ boolean ch1Meets=false,ch2Meets=false; boolean[][] bitset2=term.bitset2; test1:{ int j=i-1; //if(j=end) break test1; if(j>8]; ch1Meets= bits!=null && bits[c&0xff]; } test2:{ //if(i=end) break test2; if(i>=end) break test2; c= data[i]; boolean[] bits=bitset2[c>>8]; ch2Meets= bits!=null && bits[c&0xff]; } if(ch1Meets^ch2Meets^term.inverse){ //is boundary ^ inv term=term.next; continue matchHere; } else break; } case Term.DIRECTION:{ boolean ch1Meets=false,ch2Meets=false; boolean[] bitset=term.bitset; boolean inv=term.inverse; //System.out.println("i="+i+", inv="+inv+", bitset="+CharacterClass.stringValue0(bitset)); int j=i-1; //if(j>=offset && j=offset){ c= data[j]; ch1Meets= c<256 && bitset[c]; //System.out.println(" ch1Meets="+ch1Meets); } if(ch1Meets^inv) break; //if(i>=offset && i=offset && j=offset){ c= data[j]; boolean[] bits=bitset2[c>>8]; ch1Meets= bits!=null && bits[c&0xff]; } if(ch1Meets^inv) break; //if(i>=offset && i>8]; ch2Meets= bits!=null && bits[c&0xff]; } if(!ch2Meets^inv) break; term=term.next; continue matchHere; } case Term.REG:{ MemReg mr=memregs[term.memreg]; int sampleOffset=mr.in; int sampleOutside=mr.out; int rLen; if(sampleOffset<0 || (rLen=sampleOutside-sampleOffset)<0){ break; } else if(rLen==0){ term=term.next; continue matchHere; } // don't prevent us from reaching the 'end' if((i+rLen)>end) break; if(compareRegions(data,sampleOffset,i,rLen,end)){ i+=rLen; term=term.next; continue matchHere; } break; } case Term.REG_I:{ MemReg mr=memregs[term.memreg]; int sampleOffset=mr.in; int sampleOutside=mr.out; int rLen; if(sampleOffset<0 || (rLen=sampleOutside-sampleOffset)<0){ break; } else if(rLen==0){ term=term.next; continue matchHere; } // don't prevent us from reaching the 'end' if((i+rLen)>end) break; if(compareRegionsI(data,sampleOffset,i,rLen,end)){ i+=rLen; term=term.next; continue matchHere; } break; } case 
Term.REPEAT_0_INF:{ //System.out.println("REPEAT, i="+i+", term.minCount="+term.minCount+", term.maxCount="+term.maxCount); //i+=(cnt=repeat(data,i,end,term.target)); if((cnt=repeat(data,i,end,term.target))<=0){ term=term.next; continue; } i+=cnt; //branch out the backtracker (that is term.failNext, see Term.make*()) actual.cnt=cnt; actual.term=term.failNext; actual.index=i; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } case Term.REPEAT_MIN_INF:{ //System.out.println("REPEAT, i="+i+", term.minCount="+term.minCount+", term.maxCount="+term.maxCount); cnt=repeat(data,i,end,term.target); if(cnt0 && compareRegions(data,i,sampleOffset,bitset,end)){ cnt++; i+=bitset; countBack--; } if(cnt0){ cnt--; i--; actual.cnt=cnt; actual.index=i; actual.term=term; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else break; case Term.BACKTRACK_MIN: //System.out.println("<<"); cnt=actual.cnt; if(cnt>term.minCount){ cnt--; i--; actual.cnt=cnt; actual.index=i; actual.term=term; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else break; case Term.BACKTRACK_FIND_MIN:{ //System.out.print("<<<[cnt="); cnt=actual.cnt; //System.out.print(cnt+", minCnt="); //System.out.print(term.minCount+", target="); //System.out.print(term.target+"]"); int minCnt; if(cnt>(minCnt=term.minCount)){ int start=i+term.distance; if(start>end){ int exceed=start-end; cnt-=exceed; if(cnt<=minCnt) break; i-=exceed; start=end; } int back=findBack(data,i+term.distance,cnt-minCnt,term.target); //System.out.print("[back="+back+"]"); if(back<0) break; //cnt-=back; //i-=back; if((cnt-=back)<=minCnt){ i-=back; if(term.eat)i++; term=term.next; continue; } i-=back; actual.cnt=cnt; actual.index=i; if(term.eat)i++; actual.term=term; actual=(top=actual).on; if(actual==null){ actual=new 
SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else break; } case Term.BACKTRACK_FINDREG_MIN:{ //System.out.print("<<<[cnt="); cnt=actual.cnt; //System.out.print(cnt+", minCnt="); //System.out.print(term.minCount+", target="); //System.out.print(term.target); //System.out.print("reg=<"+memregs[term.target.memreg].in+","+memregs[term.target.memreg].out+">]"); int minCnt; if(cnt>(minCnt=term.minCount)){ int start=i+term.distance; if(start>end){ int exceed=start-end; cnt-=exceed; if(cnt<=minCnt) break; i-=exceed; start=end; } MemReg mr=memregs[term.target.memreg]; int sampleOff=mr.in; int sampleLen=mr.out-sampleOff; //if(sampleOff<0 || sampleLen<0) throw new Error("backreference used before definition: \\"+term.memreg); //int back=findBackReg(data,i+term.distance,sampleOff,sampleLen,cnt-minCnt,term.target,end); //if(back<0) break; /*@since 1.2*/ int back; if(sampleOff<0 || sampleLen<0){ //the group is not def., as in the case of '(\w+)\1' //treat as usual BACKTRACK_MIN cnt--; i--; actual.cnt=cnt; actual.index=i; actual.term=term; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else if(sampleLen==0){ back=-1; } else{ back=findBackReg(data,i+term.distance,sampleOff,sampleLen,cnt-minCnt,term.target,end); //System.out.print("[back="+back+"]"); if(back<0) break; } cnt-=back; i-=back; actual.cnt=cnt; actual.index=i; if(term.eat)i+=sampleLen; actual.term=term; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else break; } case Term.BACKTRACK_REG_MIN: //System.out.println("<<"); cnt=actual.cnt; if(cnt>term.minCount){ regLen=actual.regLen; cnt--; i-=regLen; actual.cnt=cnt; actual.index=i; actual.term=term; //actual.regLen=regLen; actual=(top=actual).on; if(actual==null){ actual=new SearchEntry(); top.on=actual; actual.sub=top; } term=term.next; continue; } else break; case Term.GROUP_IN:{ 
memreg=term.memreg; //memreg=0 is a regex itself; we don't need to handle it //because regex bounds already are in wOffset and wEnd if(memreg>0){ //MemReg mr=memregs[memreg]; //saveMemregState((top!=null)? top: defaultEntry,memreg,mr); //mr.in=i; memregs[memreg].tmp=i; //assume } term=term.next; continue; } case Term.GROUP_OUT: memreg=term.memreg; //see above if(memreg>0){ //if(term.saveState)saveMemregState((top!=null)? top: defaultEntry,memreg,memregs); MemReg mr=memregs[memreg]; SearchEntry.saveMemregState((top!=null)? top: defaultEntry,memreg,mr); mr.in=mr.tmp; //commit mr.out=i; } term=term.next; continue; case Term.PLOOKBEHIND_IN:{ int tmp=i-term.distance; if(tmp0;c--,p1--,p2--){ if(arr[p1]!=arr[p2]){ //System.out.println(" : no"); return false; } } //System.out.println(" : yes"); return true; } private static final boolean compareRegionsI(char[] arr, int off1, int off2, int len,int out){ int p1=off1+len-1; int p2=off2+len-1; if(p1>=out || p2>=out){ return false; } char c1,c2; for(int c=len;c>0;c--,p1--,p2--){ if((c1=arr[p1])!=Character.toLowerCase(c2=arr[p2]) && c1!=Character.toUpperCase(c2) && c1!=Character.toTitleCase(c2)) return false; } return true; } //repeat while matches private static final int repeat(char[] data,int off,int out,Term term){ //System.out.print("off="+off+", out="+out+", term="+term); switch(term.type){ case Term.CHAR:{ char c=term.c; int i=off; while(i>8]; if(arr!=null && arr[c&0xff]) break; else i++; } else while(i>8]; if(arr!=null && arr[c&0xff]) i++; else break; } return i-off; } } throw new Error("this kind of term can't be quantified:"+term.type); } //repeat while doesn't match private static final int find(char[] data,int off,int out,Term term){ //System.out.print("off="+off+", out="+out+", term="+term); if(off>=out) return -1; switch(term.type){ case Term.CHAR:{ char c=term.c; int i=off; while(i>8]; if(arr!=null && arr[c&0xff]) break; else i++; } else while(i>8]; if(arr!=null && arr[c&0xff]) i++; else break; } return i-off; } } 
throw new IllegalArgumentException("can't seek this kind of term:"+term.type); } private static final int findReg(char[] data,int off,int regOff,int regLen,Term term,int out){ //System.out.print("off="+off+", out="+out+", term="+term); if(off>=out) return -1; int i=off; if(term.type==Term.REG){ while(i255 || !arr[c]) break; if(i<=iMin) return -1; } return off-i; } case Term.BITSET2:{ boolean[][] bitset2=term.bitset2; int i=off; char c; int iMin=off-maxCount; if(!term.inverse) for(;;){ boolean[] arr=bitset2[(c=data[--i])>>8]; if(arr!=null && arr[c&0xff]) break; if(i<=iMin) return -1; } else for(;;){ boolean[] arr=bitset2[(c=data[--i])>>8]; if(arr==null || arr[c&0xff]) break; if(i<=iMin) return -1; } return off-i; } } throw new IllegalArgumentException("can't find this kind of term:"+term.type); } private static final int findBackReg(char[] data,int off,int regOff,int regLen,int maxCount,Term term,int out){ //assume that the cases when regLen==0 or maxCount==0 are handled by caller int i=off; int iMin=off-maxCount; if(term.type==Term.REG){ /*@since 1.2*/ char first=data[regOff]; regOff++; regLen--; for(;;){ i--; if(data[i]==first && compareRegions(data,i+1,regOff,regLen,out)) break; if(i<=iMin) return -1; } } else if(term.type==Term.REG_I){ /*@since 1.2*/ char c=data[regOff]; char firstLower=Character.toLowerCase(c); char firstUpper=Character.toUpperCase(c); char firstTitle=Character.toTitleCase(c); regOff++; regLen--; for(;;){ i--; if(((c=data[i])==firstLower || c==firstUpper || c==firstTitle) && compareRegionsI(data,i+1,regOff,regLen,out)) break; if(i<=iMin) return -1; } return off-i; } else throw new IllegalArgumentException("wrong findBackReg() target type :"+term.type); return off-i; } public String toString_d(){ StringBuffer s=new StringBuffer(); s.append("counters: "); s.append(counters==null? 
0: counters.length); s.append("\r\nmemregs: "); s.append(memregs.length); for(int i=0;i0) on.reset(restQueue-1); else{ this.on=null; on.sub=null; } } //sub=on=null; } } class MemReg{ int index; int in=-1,out=-1; int tmp=-1; //for assuming at GROUP_IN MemReg(int index){ this.index=index; } void reset(){ in=out=-1; } } class LAEntry{ int index; SearchEntry top,actual; }




    © 2015 - 2024 Weber Informatics LLC | Privacy Policy