com.groupbyinc.common.jregex.Term Maven / Gradle / Ivy
/**
* Copyright (c) 2001, Sergey A. Samokhodkin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form
* must reproduce the above copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided with the distribution.
* - Neither the name of jregex nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @version 1.2_01
*/
package jregex;
import java.util.*;
class Term implements REFlags{
// Runtime term types: the values stored in Term.type.
// The value 3 is not assigned to any type in this list.
static final int CHAR = 0;
static final int BITSET = 1;
static final int BITSET2 = 2;
static final int ANY_CHAR = 4;
static final int ANY_CHAR_NE = 5;
static final int REG = 6;
static final int REG_I = 7;
static final int FIND = 8;
static final int FINDREG = 9;
static final int SUCCESS = 10;
/* optimization-transparent types: the contiguous range
   FIRST_TRANSPARENT..LAST_TRANSPARENT (11..31) defined below */
static final int BOUNDARY = 11;
static final int DIRECTION = 12;
static final int UBOUNDARY = 13;
static final int UDIRECTION = 14;
static final int GROUP_IN = 15;
static final int GROUP_OUT = 16;
static final int VOID = 17;
static final int START = 18;
static final int END = 19;
static final int END_EOL = 20;
static final int LINE_START = 21;
static final int LINE_END = 22;
static final int LAST_MATCH_END = 23;
static final int CNT_SET_0 = 24;
static final int CNT_INC = 25;
static final int CNT_GT_EQ = 26;
static final int READ_CNT_LT = 27;
static final int CRSTORE_CRINC = 28; //store on 'actual' search entry
static final int CR_SET_0 = 29;
static final int CR_LT = 30;
static final int CR_GT_EQ = 31;
/* optimization-nontransparent types (everything from BRANCH upwards) */
static final int BRANCH = 32;
static final int BRANCH_STORE_CNT = 33;
static final int BRANCH_STORE_CNT_AUX1 = 34;
static final int PLOOKAHEAD_IN = 35;
static final int PLOOKAHEAD_OUT = 36;
static final int NLOOKAHEAD_IN = 37;
static final int NLOOKAHEAD_OUT = 38;
static final int PLOOKBEHIND_IN = 39;
static final int PLOOKBEHIND_OUT = 40;
static final int NLOOKBEHIND_IN = 41;
static final int NLOOKBEHIND_OUT = 42;
static final int INDEPENDENT_IN = 43; //functionally the same as NLOOKAHEAD_IN
static final int INDEPENDENT_OUT = 44;
static final int REPEAT_0_INF = 45;
static final int REPEAT_MIN_INF = 46;
static final int REPEAT_MIN_MAX = 47;
static final int REPEAT_REG_MIN_INF = 48;
static final int REPEAT_REG_MIN_MAX = 49;
static final int BACKTRACK_0 = 50;
static final int BACKTRACK_MIN = 51;
static final int BACKTRACK_FIND_MIN = 52;
static final int BACKTRACK_FINDREG_MIN = 53;
static final int BACKTRACK_REG_MIN = 54;
static final int MEMREG_CONDITION = 55;
static final int LOOKAHEAD_CONDITION_IN = 56;
static final int LOOKAHEAD_CONDITION_OUT = 57;
static final int LOOKBEHIND_CONDITION_IN = 58;
static final int LOOKBEHIND_CONDITION_OUT = 59;
// optimization: inclusive bounds of the "transparent" type range declared above
static final int FIRST_TRANSPARENT = BOUNDARY;
static final int LAST_TRANSPARENT = CR_GT_EQ;
// compile time: required length of the vars[] array that makeTree() threads
// through the recursive parse (the private makeTree() rejects any other length)
static final int VARS_LENGTH=4;
// compile-time variable indices into vars[]:
private static final int MEMREG_COUNT=0; //next free memory register (group) index; makeTree() starts it at 1
private static final int CNTREG_COUNT=1; //number of counter registers allocated so far
private static final int DEPTH=2; //current group nesting depth: (((depth=3)))
private static final int LOOKAHEAD_COUNT=3; //next free lookahead/lookbehind/independent-group id
// parseLimits() works on an int[LIMITS_LENGTH]: [0] and [1] are read back as the
// min and max by the make*Limits() helpers, [LIMITS_PARSE_RESULT_INDEX] holds
// LIMITS_OK or LIMITS_FAILURE
private static final int LIMITS_LENGTH=3;
private static final int LIMITS_PARSE_RESULT_INDEX=2;
private static final int LIMITS_OK=1;
private static final int LIMITS_FAILURE=2;
//static CustomParser[] customParsers=new CustomParser[256];
// **** CONTROL FLOW ****
// next-to-execute and next-if-failed commands;
Term next,failNext;
// **** TYPES ****
// one of the type constants above; a freshly constructed term is VOID
int type=VOID;
boolean inverse;
// used with type=CHAR
char c;
// used with type=FIND
int distance;
boolean eat;
// used with type=BITSET(2);
boolean[] bitset;
boolean[][] bitset2;
boolean[] categoryBitset; //types(unicode categories)
// used with type=BALANCE;
// NOTE(review): no BALANCE type constant is declared in this class - confirm whether this field is still used
char[] brackets;
// used for optimization with type=BITSET,BITSET2
int weight;
// **** MEMORISATION ****
// memory slot, used with type=REG,GROUP_IN,GROUP_OUT
int memreg=-1;
// **** COUNTERS ****
// max|min number of iterations
// used with CNT_GT_EQ ,REPEAT_* etc.;
int minCount,maxCount;
// used with REPEAT_*,REPEAT_REG_*;
Term target;
// a counter slot to increment & compare with maxCount (CNT_INC etc.);
int cntreg=0;
// lookahead group id;
int lookaheadId;
// **** COMPILE HELPERS ****
// in/out are initialised to this by the constructor; out, out1 and branchOut are
// the loose ends that linkd() connects to the following term
protected Term prev,in,out,out1,first,current;
//new!!
protected Term branchOut;
//protected boolean newBranch=false,closed=false;
//protected boolean newBranch=false;
//for debugging: global count of constructed terms; each term records its own number
static int instances;
int instanceNum;
/**
 * Creates a term of the default type ({@code VOID}) whose entry and exit
 * points are the term itself.
 */
Term(){
    // debugging aid: terms are numbered in creation order
    instanceNum=instances++;
    in=this;
    out=this;
}
/**
 * Creates a term of the given runtime type.
 *
 * @param type one of the type constants declared in this class
 */
Term(int type){
    this();
    this.type=type;
}
/**
 * Compiles the whole of {@code s} into {@code re}; a convenience overload
 * that delegates to the {@code char[]} version.
 *
 * @param s     pattern text
 * @param flags initial compilation flags
 * @param re    pattern object that receives the compiled program
 * @throws PatternSyntaxException if the pattern text is malformed
 */
static void makeTree(String s, int flags,Pattern re) throws PatternSyntaxException{
    final char[] chars=s.toCharArray();
    final int length=chars.length;
    makeTree(chars,0,length,flags,re);
}
/**
 * Compiles {@code data[offset..end)} and stores the result in {@code re}:
 * the optimized and the plain entry terms, the register/counter/lookahead
 * totals and the map of named groups.
 *
 * @param data   pattern characters
 * @param offset index of the first pattern character
 * @param end    index just past the last pattern character
 * @param flags  initial compilation flags
 * @param re     pattern object that receives the compiled program
 * @throws PatternSyntaxException if the pattern text is malformed
 */
static void makeTree(char[] data,int offset,int end,
        int flags,Pattern re) throws PatternSyntaxException{
    // compile-time state: memreg, counter, depth, lookahead.
    // The memreg count starts at 1, so index 0 is never handed out to a group.
    int[] vars=new int[VARS_LENGTH];
    vars[MEMREG_COUNT]=1;

    // iterators are collected during parsing and optimized afterwards
    Vector iterators=new Vector();
    Hashtable groupNames=new Hashtable();
    Pretokenizer t=new Pretokenizer(data,offset,end);

    Term whole=makeTree(t,data,vars,flags,new Group(),iterators,groupNames);
    // the closing bracket of the outermost group becomes the success term
    whole.out.type=SUCCESS;
    // the opening bracket of the outermost group is dropped
    Term first=whole.next;

    Optimizer opt=Optimizer.find(first);
    Term root= (opt==null)? first: opt.makeFirst(first);

    for(int k=0;k<iterators.size();k++){
        Iterator it=(Iterator)iterators.elementAt(k);
        it.optimize();
    }

    re.root=root;
    re.root0=first;
    re.memregs=vars[MEMREG_COUNT];
    re.counters=vars[CNTREG_COUNT];
    re.lookaheads=vars[LOOKAHEAD_COUNT];
    re.namedGroupMap=groupNames;
}
/**
 * Recursive worker of the compiler: pulls tokens from {@code t} and appends
 * them to {@code term} until the group is closed by ')' or the input ends,
 * recursing for every nested group.
 *
 * @param t          token source, shared by all recursion levels
 * @param data       pattern characters
 * @param vars       compile-time state, indexed by MEMREG_COUNT, CNTREG_COUNT,
 *                   DEPTH and LOOKAHEAD_COUNT; must have length VARS_LENGTH
 * @param flags      flags in effect at the start of this group
 * @param term       the group (or lookaround/conditional) term being filled
 * @param iterators  collects iterator terms for later optimization
 * @param groupNames maps group name to Integer memreg id
 * @return {@code term}, after {@code close()} has been called on it
 * @throws PatternSyntaxException on unbalanced parentheses, group
 *         redeclaration, malformed conditions or unknown tokens
 */
private static Term makeTree(Pretokenizer t,char[] data,int[] vars,
int flags,Term term,Vector iterators,Hashtable groupNames) throws PatternSyntaxException{
//System.out.println("Term.makeTree(): flags="+flags);
if(vars.length!=VARS_LENGTH) throw new IllegalArgumentException("vars.length should be "+VARS_LENGTH+", not "+vars.length);
//Term term=new Term(isMemReg? vars[MEMREG_COUNT]: -1);
// use memreg 0 as unsignificant
//Term term=new Group(isMemReg? vars[MEMREG_COUNT]: 0);
while(true){
t.next();
// append the span data[tOffset..tOutside) reported by the tokenizer,
// then act on the token type itself
term.append(t.tOffset,t.tOutside,data,vars,flags,iterators,groupNames);
switch(t.ttype){
case Pretokenizer.FLAGS:
// flags change for the remainder of the current group
flags=t.flags(flags);
continue;
case Pretokenizer.CLASS_GROUP:
// the next token span is handed to CharacterClass.parseGroup()
t.next();
Term clg=new Term();
CharacterClass.parseGroup(data,t.tOffset,t.tOutside,clg,
(flags&IGNORE_CASE)>0, (flags&IGNORE_SPACES)>0,
(flags&UNICODE)>0, (flags&XML_SCHEMA)>0);
term.append(clg);
continue;
case Pretokenizer.PLAIN_GROUP:
// group built with the no-arg Group constructor; may carry its own flags
vars[DEPTH]++;
//System.out.println("PLAIN_GROUP, t.tOffset="+t.tOffset+", t.tOutside="+t.tOutside+", t.flags("+flags+")="+t.flags(flags));
term.append(makeTree(t,data,vars,t.flags(flags),new Group(),iterators,groupNames));
break;
case Pretokenizer.NAMED_GROUP:
String gname=t.groupName;
int id;
if(Character.isDigit(gname.charAt(0))){
// a numeric name is used directly as the memreg id
try{
id=Integer.parseInt(gname);
}
catch(NumberFormatException e){
throw new PatternSyntaxException("group name starts with digit but is not a number");
}
// Hashtable.contains() tests the VALUES, i.e. the ids already given to named groups
if(groupNames.contains(new Integer(id))){
if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration: "+gname+"; use ({=id}...) for multiple group assignments");
}
// keep the memreg count above every explicitly numbered group;
// note that a numeric name is not entered into groupNames
if(vars[MEMREG_COUNT]<=id)vars[MEMREG_COUNT]=id+1;
}
else{
Integer no=(Integer)groupNames.get(gname);
if(no==null){
// first use of this name: allocate the next memreg
id=vars[MEMREG_COUNT]++;
groupNames.put(t.groupName,new Integer(id));
}
else{
// known name: only a reassignment (not a new declaration) is allowed
if(t.groupDeclared) throw new PatternSyntaxException("group redeclaration "+gname+"; use ({=name}...) for group reassignments");
id=no.intValue();
}
}
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Group(id),iterators,groupNames));
break;
case '(':
// ordinary capturing group: takes the next free memreg
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Group(vars[MEMREG_COUNT]++),iterators,groupNames));
break;
case Pretokenizer.POS_LOOKAHEAD:
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
break;
case Pretokenizer.NEG_LOOKAHEAD:
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Lookahead(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
break;
case Pretokenizer.POS_LOOKBEHIND:
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,true),iterators,groupNames));
break;
case Pretokenizer.NEG_LOOKBEHIND:
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new Lookbehind(vars[LOOKAHEAD_COUNT]++,false),iterators,groupNames));
break;
case Pretokenizer.INDEPENDENT_REGEX:
// independent groups draw their ids from the lookahead counter
vars[DEPTH]++;
term.append(makeTree(t,data,vars,flags,new IndependentGroup(vars[LOOKAHEAD_COUNT]++),iterators,groupNames));
break;
case Pretokenizer.CONDITIONAL_GROUP:
// depth is raised once for the conditional group itself ...
vars[DEPTH]++;
t.next();
Term fork=null;
boolean positive=true;
switch(t.ttype){
case Pretokenizer.NEG_LOOKAHEAD:
positive=false;
// deliberate fall-through: shares the lookahead handling
case Pretokenizer.POS_LOOKAHEAD:
// ... and once more for a lookaround condition, which is parsed by a
// recursive call that returns on (and decrements depth for) its own ')'
vars[DEPTH]++;
Lookahead la=new Lookahead(vars[LOOKAHEAD_COUNT]++,positive);
makeTree(t,data,vars,flags,la,iterators,groupNames);
fork=new ConditionalExpr(la);
break;
case Pretokenizer.NEG_LOOKBEHIND:
positive=false;
// deliberate fall-through: shares the lookbehind handling
case Pretokenizer.POS_LOOKBEHIND:
vars[DEPTH]++;
Lookbehind lb=new Lookbehind(vars[LOOKAHEAD_COUNT]++,positive);
makeTree(t,data,vars,flags,lb,iterators,groupNames);
fork=new ConditionalExpr(lb);
break;
case '(':
// condition on a group: "(number)" or "(name)"; the span up to the
// next ')' token is the group reference
t.next();
if(t.ttype!=')') throw new PatternSyntaxException("malformed condition");
int memregNo;
if(Character.isDigit(data[t.tOffset])) memregNo=makeNumber(t.tOffset,t.tOutside,data);
else{
String gn=new String(data,t.tOffset,t.tOutside-t.tOffset);
Integer gno=(Integer)groupNames.get(gn);
if(gno==null) throw new PatternSyntaxException("unknown group name in conditional expr.: "+gn);
memregNo=gno.intValue();
}
fork=new ConditionalExpr(memregNo);
break;
default:
throw new PatternSyntaxException("malformed conditional expression: "+t.ttype+" '"+(char)t.ttype+"'");
}
// the branches of the conditional are parsed into the ConditionalExpr term
term.append(makeTree(t,data,vars,flags,fork,iterators,groupNames));
break;
case '|':
term.newBranch();
break;
case Pretokenizer.END:
// end of input is only legal at nesting depth 0
if(vars[DEPTH]>0) throw new PatternSyntaxException("unbalanced parenthesis");
term.close();
return term;
case ')':
// closes the group opened by the caller's recursion level
if(vars[DEPTH]<=0) throw new PatternSyntaxException("unbalanced parenthesis");
term.close();
vars[DEPTH]--;
return term;
case Pretokenizer.COMMENT:
// skip tokens until one of type ')' is reached
while(t.ttype!=')') t.next();
continue;
default:
throw new PatternSyntaxException("unknown token type: "+t.ttype);
}
}
}
static int makeNumber(int off, int out, char[] data){
int n=0;
for(int i=off;i9) return -1;
n*=10;
n+=d;
}
return n;
}
protected void append(int offset,int end,char[] data,
int[] vars,int flags,Vector iterators,Hashtable gmap) throws PatternSyntaxException{
//System.out.println("append("+new String(data,offset,end-offset)+")");
//System.out.println("current="+this.current);
int[] limits=new int[3];
int i=offset;
Term tmp,current=this.current;
while(i0);
i=parseGroupId(data,p,end,br,gmap);
current=append(br);
continue;
}
else{
Term t=new Term();
i=CharacterClass.parseName(data,i,end,t,false,(flags&IGNORE_SPACES)>0);
current=append(t);
continue;
}
}
case ' ':
case '\t':
case '\r':
case '\n':
if((flags&IGNORE_SPACES)>0){
i++;
continue;
}
//else go on as default
//symbolic items
default:
tmp=new Term();
i=parseTerm(data,i,end,tmp,flags);
if(tmp.type==END && i");
}
//"\A"
//if(tmp.type==START && i>(offset+1)){
// throw new PatternSyntaxException("'^' is not a first term in the group: <"+new String(data,offset,end-offset)+">");
//}
current=append(tmp);
break;
}
//System.out.println("next term: "+next);
//System.out.println(" next.out="+next.out);
//System.out.println(" next.out1="+next.out1);
//System.out.println(" next.branchOut="+next.branchOut);
}
//System.out.println(in.toStringAll());
//System.out.println("current="+current);
//System.out.println();
}
/**
 * Parses a group reference of the form {@code id}} or {@code name}} starting
 * at {@code data[i]} (whitespace is allowed before the closing brace) and
 * stores the resolved group number in {@code term.memreg}.
 *
 * @param data pattern characters
 * @param i    index of the first character of the id/name
 * @param end  index just past the last usable character
 * @param term term that receives the memreg number
 * @param gmap maps group name to Integer memreg id
 * @return index just past the closing '}'
 * @throws PatternSyntaxException if the reference is cut short, names an
 *         unknown group, or is not terminated by '}'
 */
private static int parseGroupId(char[] data, int i, int end, Term term, Hashtable gmap) throws PatternSyntaxException{
    // guard the first read: without it an id starting at the very end of the
    // input surfaces as ArrayIndexOutOfBoundsException (or reads past 'end')
    if(i>=end) throw new PatternSyntaxException("group_id expected");
    int id;
    int nstart=i;
    if(Character.isDigit(data[i])){
        // numeric reference
        while(Character.isDigit(data[i])){
            i++;
            if(i==end) throw new PatternSyntaxException("group_id expected");
        }
        id=makeNumber(nstart,i,data);
    }
    else{
        // named reference: must already be present in gmap
        while(Character.isJavaIdentifierPart(data[i])){
            i++;
            if(i==end) throw new PatternSyntaxException("group_id expected");
        }
        String s=new String(data,nstart,i-nstart);
        Integer no=(Integer)gmap.get(s);
        if(no==null)throw new PatternSyntaxException("backreference to unknown group: "+s);
        id=no.intValue();
    }
    while(Character.isWhitespace(data[i])){
        i++;
        if(i==end) throw new PatternSyntaxException("'}' expected");
    }
    int c=data[i++];
    if(c!='}') throw new PatternSyntaxException("'}' expected");
    term.memreg=id;
    return i;
}
/**
 * Adds {@code term} at the end of the current branch and makes it the new
 * current term.
 *
 * @param term the term to add
 * @return {@code term}
 */
protected Term append(Term term) throws PatternSyntaxException{
    final Term last=this.current;
    if(last!=null){
        // chain after the previous term of this branch
        link(last,term);
    }
    else{
        // first term of the branch: hook it directly onto the entry point
        in.next=term;
        term.prev=in;
    }
    this.current=term;
    return term;
}
/**
 * Substitutes {@code term} for the current term: whatever preceded the
 * current term is re-linked to {@code term}, which then becomes current.
 *
 * @param term the replacement
 * @return {@code term}
 */
protected Term replaceCurrent(Term term) throws PatternSyntaxException{
    final Term before=current.prev;
    if(before!=null){
        if(before!=this.in){
            link(before,term);
        }
        else{
            // the predecessor is this group's entry point: wire it by hand
            before.next=term.in;
            term.in.prev=before;
        }
    }
    this.current=term;
    return term;
}
/**
 * Handles an alternation bar: finishes the branch built so far, then opens
 * a new one.
 */
protected void newBranch() throws PatternSyntaxException{
    // order matters: the finished branch must be tied to 'out' first
    close();
    startNewBranch();
}
/**
 * Ties the end of the current branch to this term's {@code out}; an empty
 * branch connects {@code in} straight to {@code out}.
 */
protected void close() throws PatternSyntaxException{
    final Term last=this.current;
    if(last==null){
        in.next=out;
        return;
    }
    linkd(last,out);
}
/**
 * Connects the loose ends of {@code term} to the entry point of
 * {@code next} and records {@code term} as the predecessor of {@code next}.
 */
private final static void link(Term term,Term next){
    final Term entry=next.in;
    linkd(term,entry);
    next.prev=term;
}
/**
 * Connects every loose end of {@code term} directly to {@code next}:
 * {@code out} and {@code out1} continue at {@code next} on success,
 * {@code branchOut} continues at {@code next} on failure.
 */
private final static void linkd(Term term,Term next){
    if(term.out!=null){
        term.out.next=next;
    }
    if(term.out1!=null){
        term.out1.next=next;
    }
    if(term.branchOut!=null){
        term.branchOut.failNext=next;
    }
}
/**
 * Opens a new alternative: a fresh {@code Branch} is inserted right after
 * {@code in}, with the branches built so far as its {@code next}. The
 * Branch becomes the current term and its only loose end is its own
 * {@code failNext}, which the next appended term will receive.
 */
protected void startNewBranch() throws PatternSyntaxException{
    final Term oldHead=in.next;
    final Term fork=new Branch();
    fork.next=oldHead;
    in.next=fork;
    // no entry point and no success exits; the fail exit is the fork itself
    fork.in=null;
    fork.out=null;
    fork.out1=null;
    fork.branchOut=fork;
    current=fork;
}
/**
 * Builds the term for a greedy "zero or more" repetition of {@code term}.
 * A group becomes a loop around a {@code Branch}; anything else is wrapped
 * in an {@code Iterator} with limits 0..unbounded.
 *
 * @param vars      compile-time state (not used here)
 * @param term      the term to repeat
 * @param iterators collection the new iterator registers itself in
 * @return the entry term of the repetition
 */
private final static Term makeGreedyStar(int[] vars,Term term,Vector iterators) throws PatternSyntaxException{
    if(term.type!=GROUP_IN){
        return new Iterator(term,0,-1,iterators);
    }
    Term loop=new Branch();
    // try the group body first; the group exit returns to the branch
    loop.next=term.in;
    term.out.next=loop;
    loop.in=loop;
    loop.out=null;
    loop.out1=null;
    // leaving the loop happens through the branch's failure path
    loop.branchOut=loop;
    return loop;
}
/**
 * Builds the term for a lazy "zero or more" repetition of {@code term}:
 * a {@code Branch} whose success path leaves the loop and whose failure
 * path runs the body once more.
 *
 * @param vars compile-time state (not used here)
 * @param term the term to repeat
 * @return the entry term of the repetition
 */
private final static Term makeLazyStar(int[] vars,Term term){
    final boolean isGroup= term.type==GROUP_IN;
    Term loop=new Branch();
    if(isGroup){
        loop.failNext=term.in;
        term.out.next=loop;
    }
    else{
        loop.failNext=term;
        term.next=loop;
    }
    loop.in=loop;
    loop.out=loop;
    loop.out1=null;
    loop.branchOut=null;
    return loop;
}
/**
 * Builds the term for a greedy "one or more" repetition of {@code term}.
 * Groups and independent groups are entered first and then looped through
 * a {@code Branch}; anything else is wrapped in an {@code Iterator} with
 * limits 1..unbounded.
 *
 * @param vars      compile-time state (not used here)
 * @param term      the term to repeat
 * @param iterators collection the new iterator registers itself in
 * @return the branch term (whose {@code in} is the group entry) or the iterator
 */
private final static Term makeGreedyPlus(int[] vars,Term term,Vector iterators) throws PatternSyntaxException{
    final int kind=term.type;
    if(kind!=INDEPENDENT_IN && kind!=GROUP_IN){
        return new Iterator(term,1,-1,iterators);
    }
    Term loop=new Branch();
    loop.next=term.in;
    term.out.next=loop;
    // unlike the star form, the construct is entered through the body
    loop.in=term.in;
    loop.out=null;
    loop.out1=null;
    loop.branchOut=loop;
    return loop;
}
/**
 * Builds the term for a lazy "one or more" repetition of {@code term}: the
 * body is entered first, then a {@code Branch} whose success path leaves
 * the loop and whose failure path repeats the body.
 *
 * @param vars compile-time state (not used here)
 * @param term the term to repeat
 * @return the branch term, whose {@code in} is the body entry
 */
private final static Term makeLazyPlus(int[] vars,Term term){
    final boolean isGroup= term.type==GROUP_IN;
    Term loop=new Branch();
    if(isGroup){
        term.out.next=loop;
        loop.failNext=term.in;
        loop.in=term.in;
    }
    else{
        term.next=loop;
        loop.failNext=term;
        loop.in=term;
    }
    loop.out=loop;
    loop.out1=null;
    loop.branchOut=null;
    return loop;
}
/**
 * Builds the term for a greedy "zero or one" occurrence of {@code term}: a
 * {@code Branch} that tries the body first and skips it on failure.
 *
 * @param vars compile-time state (not used here)
 * @param term the optional term
 * @return the branch term
 */
private final static Term makeGreedyQMark(int[] vars,Term term){
    final boolean isGroup= term.type==GROUP_IN;
    Term fork=new Branch();
    if(isGroup){
        fork.next=term.in;
        fork.out=term.out;
    }
    else{
        fork.next=term;
        fork.out=term;
    }
    fork.in=fork;
    fork.out1=null;
    // the skip path is the branch's failure exit
    fork.branchOut=fork;
    return fork;
}
/**
 * Builds the term for a lazy "zero or one" occurrence of {@code term}: a
 * {@code Branch} that skips the body first and tries it only on failure.
 *
 * @param vars compile-time state (not used here)
 * @param term the optional term
 * @return the branch term
 */
private final static Term makeLazyQMark(int[] vars,Term term){
    final boolean isGroup= term.type==GROUP_IN;
    Term fork=new Branch();
    if(isGroup){
        fork.failNext=term.in;
        fork.out1=term.out;
    }
    else{
        fork.failNext=term;
        fork.out1=term;
    }
    fork.in=fork;
    // both the skip path (out) and the body exit (out1) continue afterwards
    fork.out=fork;
    fork.branchOut=null;
    return fork;
}
/**
 * Builds the term for a greedy bounded repetition of {@code term}. A group
 * gets a counter register and a counted loop; anything else is wrapped in
 * an {@code Iterator}.
 *
 * @param vars      compile-time state; a counter register is allocated from
 *                  {@code vars[CNTREG_COUNT]} for groups
 * @param term      the term to repeat
 * @param limits    {@code limits[0]} is the minimum and {@code limits[1]} the
 *                  maximum; a negative value disables the respective check
 * @param iterators collection the new iterator registers itself in
 * @return the entry term of the repetition
 */
private final static Term makeGreedyLimits(int[] vars,Term term,int[] limits,Vector iterators) throws PatternSyntaxException{
    final int min=limits[0];
    final int max=limits[1];
    if(term.type!=GROUP_IN){
        return new Iterator(term,min,max,iterators);
    }

    final int reg=vars[CNTREG_COUNT]++;
    Term reset=new Term(CR_SET_0);
    reset.cntreg=reg;
    Term fork=new Term(BRANCH);
    Term inc=new Term(CRSTORE_CRINC);
    inc.cntreg=reg;
    reset.next=fork;

    // success path of the fork: (optional upper-bound check, then) the body
    if(max<0){
        fork.next=term.in;
    }
    else{
        Term lt=new Term(CR_LT);
        lt.cntreg=reg;
        lt.maxCount=max;
        fork.next=lt;
        lt.next=term.in;
    }
    // the body exit increments the counter and returns to the fork
    term.out.next=inc;
    inc.next=fork;

    reset.in=reset;
    reset.out1=null;
    if(min<0){
        // no lower bound: the construct is left through the fork's fail path
        reset.out=null;
        reset.branchOut=fork;
    }
    else{
        // fail path of the fork: the lower-bound check, which is also the exit
        Term gt=new Term(CR_GT_EQ);
        gt.cntreg=reg;
        gt.maxCount=min;
        fork.failNext=gt;
        reset.out=gt;
        reset.branchOut=null;
    }
    return reset;
}
/**
 * Builds the term for a lazy bounded repetition of {@code term}: a counted
 * loop around a branch whose success path leaves the loop (after the
 * optional lower-bound check) and whose failure path runs the body again
 * (after the optional upper-bound check).
 *
 * Groups use a counter register ({@code CR_*} terms); all other terms use
 * the {@code CNT_*} terms with a {@code BRANCH_STORE_CNT} branch.
 *
 * @param vars   compile-time state; a counter register is allocated from
 *               {@code vars[CNTREG_COUNT]} for groups
 * @param term   the term to repeat
 * @param limits {@code limits[0]} is the minimum and {@code limits[1]} the
 *               maximum; a negative value disables the respective check
 * @return the entry term of the repetition
 */
private final static Term makeLazyLimits(int[] vars,Term term,int[] limits){
    int m=limits[0];
    int n=limits[1];
    switch(term.type){
        case GROUP_IN:{
            int cntreg=vars[CNTREG_COUNT]++;
            Term reset=new Term(CR_SET_0);
            reset.cntreg=cntreg;
            Term b=new Term(BRANCH);
            Term inc=new Term(CRSTORE_CRINC);
            inc.cntreg=cntreg;
            reset.next=b;
            // fail path of the branch: (optional upper-bound check, then) the body
            if(n>=0){
                Term lt=new Term(CR_LT);
                lt.cntreg=cntreg;
                lt.maxCount=n;
                b.failNext=lt;
                lt.next=term.in;
            }
            else{
                b.failNext=term.in;
            }
            // the body exit increments the counter and returns to the branch
            term.out.next=inc;
            inc.next=b;
            reset.in=reset;
            reset.out1=null;
            reset.branchOut=null;
            if(m>=0){
                // success path of the branch: the lower-bound check, which is the exit
                Term gt=new Term(CR_GT_EQ);
                gt.cntreg=cntreg;
                gt.maxCount=m;
                b.next=gt;
                reset.out=gt;
            }
            else{
                reset.out=b;
            }
            return reset;
        }
        case REG:
        default:{
            Term reset=new Term(CNT_SET_0);
            Term b=new Branch(BRANCH_STORE_CNT);
            Term inc=new Term(CNT_INC);
            reset.next=b;
            if(n>=0){
                Term lt=new Term(READ_CNT_LT);
                lt.maxCount=n;
                b.failNext=lt;
                lt.next=term;
            }
            else{
                // Unbounded case. This used to set b.next=term and inc.next=term;
                // b.next is always overwritten below (by the lower-bound check or
                // by the linker through reset.out), which left the body unreachable
                // and the branch without a fail path. Wire it like the bounded
                // case, minus the upper-bound check.
                b.failNext=term;
            }
            term.next=inc;
            inc.next=b;
            reset.in=reset;
            reset.out1=null;
            reset.branchOut=null;
            if(m>=0){
                Term gt=new Term(CNT_GT_EQ);
                gt.maxCount=m;
                b.next=gt;
                reset.out=gt;
            }
            else{
                reset.out=b;
            }
            return reset;
        }
    }
}
private final int parseTerm(char[] data, int i, int out, Term term,
int flags) throws PatternSyntaxException{
char c=data[i++];
boolean inv=false;
switch(c){
case '[':
return CharacterClass.parseClass(data,i,out,term,(flags&IGNORE_CASE)>0,(flags&IGNORE_SPACES)>0,(flags&UNICODE)>0,(flags&XML_SCHEMA)>0);
case '.':
term.type=(flags&DOTALL)>0? ANY_CHAR: ANY_CHAR_NE;
break;
case '$':
//term.type=mods[MULTILINE_IND]? LINE_END: END; //??
term.type=(flags&MULTILINE)>0? LINE_END: END_EOL;
break;
case '^':
term.type=(flags&MULTILINE)>0? LINE_START: START;
break;
case '\\':
if(i>=out) throw new PatternSyntaxException("Escape without a character");
c=data[i++];
esc: switch(c){
case 'f':
c='\f'; // form feed
break;
case 'n':
c='\n'; // new line
break;
case 'r':
c='\r'; // carriage return
break;
case 't':
c='\t'; // tab
break;
case 'u':
c=(char)((CharacterClass.toHexDigit(data[i++])<<12)+
(CharacterClass.toHexDigit(data[i++])<<8)+
(CharacterClass.toHexDigit(data[i++])<<4)+
CharacterClass.toHexDigit(data[i++]));
break;
case 'v':
c=(char)((CharacterClass.toHexDigit(data[i++])<<24)+
(CharacterClass.toHexDigit(data[i++])<<16)+
(CharacterClass.toHexDigit(data[i++])<<12)+
(CharacterClass.toHexDigit(data[i++])<<8)+
(CharacterClass.toHexDigit(data[i++])<<4)+
CharacterClass.toHexDigit(data[i++]));
break;
case 'x':{ // hex 2-digit number -> char
int hex=0;
char d;
if((d=data[i++])=='{'){
while((d=data[i++])!='}'){
hex=(hex<<4)+CharacterClass.toHexDigit(d);
if(hex>0xffff) throw new PatternSyntaxException("\\x{}");
}
}
else{
hex=(CharacterClass.toHexDigit(d)<<4)+
CharacterClass.toHexDigit(data[i++]);
}
c=(char)hex;
break;
}
case '0':
case 'o': // oct 2- or 3-digit number -> char
int oct=0;
for(;;){
char d=data[i++];
if(d>='0' && d<='7'){
oct*=8;
oct+=d-'0';
if(oct>0xffff) break;
}
else break;
}
c=(char)oct;
break;
case 'm': // decimal number -> char
int dec=0;
for(;;){
char d=data[i++];
if(d>='0' && d<='9'){
dec*=10;
dec+=d-'0';
if(dec>0xffff) break;
}
else break;
}
c=(char)dec;
break;
case 'c': // ctrl-char
c=(char)(data[i++]&0x1f);
break;
case 'D': // non-digit
inv=true;
// go on
case 'd': // digit
CharacterClass.makeDigit(term,inv,(flags&UNICODE)>0);
return i;
case 'S': // non-space
inv=true;
// go on
case 's': // space
CharacterClass.makeSpace(term,inv,(flags&UNICODE)>0);
return i;
case 'W': // non-letter
inv=true;
// go on
case 'w': // letter
CharacterClass.makeWordChar(term,inv,(flags&UNICODE)>0);
return i;
case 'B': // non-(word boundary)
inv=true;
// go on
case 'b': // word boundary
CharacterClass.makeWordBoundary(term,inv,(flags&UNICODE)>0);
return i;
case '<': // non-(word boundary)
CharacterClass.makeWordStart(term,(flags&UNICODE)>0);
return i;
case '>': // word boundary
CharacterClass.makeWordEnd(term,(flags&UNICODE)>0);
return i;
case 'A': // text beginning
term.type=START;
return i;
case 'Z': // text end
term.type=END_EOL;
return i;
case 'z': // text end
term.type=END;
return i;
case 'G': // end of last match
term.type=LAST_MATCH_END;
return i;
case 'P': // \\P{..}
inv=true;
case 'p': // \\p{..}
i=CharacterClass.parseName(data,i,out,term,inv,(flags&IGNORE_SPACES)>0);
return i;
default:
if(c>='1' && c<='9'){
int n=c-'0';
while((i='0' && c<='9'){
n=(n*10)+c-'0';
i++;
}
term.type=(flags&IGNORE_CASE)>0? REG_I: REG;
term.memreg=n;
return i;
}
/*
if(c<256){
CustomParser termp=customParsers[c];
if(termp!=null){
i=termp.parse(i,data,term);
return i;
}
}
*/
}
term.type=CHAR;
term.c=c;
break;
default:
if((flags&IGNORE_CASE)==0){
term.type=CHAR;
term.c=c;
}
else{
CharacterClass.makeICase(term,c);
}
break;
}
return i;
}
// one of {n},{n,},{,n},{n1,n2}
protected static final int parseLimits(int i,int end,char[] data,int[] limits) throws PatternSyntaxException{
if(limits.length!=LIMITS_LENGTH) throw new IllegalArgumentException("maxTimess.length="+limits.length+", should be 2");
limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_OK;
int ind=0;
int v=0;
char c;
while(i0) throw new PatternSyntaxException("illegal construction: {.. , , ..}");
limits[ind++]=v;
v=-1;
continue;
case '}':
limits[ind]=v;
if(ind==0) limits[1]=v;
return i;
default:
if(c>'9' || c<'0'){
//throw new PatternSyntaxException("illegal symbol in iterator: '{"+c+"}'");
limits[LIMITS_PARSE_RESULT_INDEX]=LIMITS_FAILURE;
return i;
}
if(v<0) v=0;
v= v*10 + (c-'0');
}
}
throw new PatternSyntaxException("malformed quantifier");
}
public String toString(){
StringBuffer b=new StringBuffer(100);
b.append(instanceNum);
b.append(": ");
if(inverse) b.append('^');
switch(type){
case VOID:
b.append("[]");
b.append(" , ");
break;
case CHAR:
b.append(CharacterClass.stringValue(c));
b.append(" , ");
break;
case ANY_CHAR:
b.append("dotall, ");
break;
case ANY_CHAR_NE:
b.append("dot-eols, ");
break;
case BITSET:
b.append('[');
b.append(CharacterClass.stringValue0(bitset));
b.append(']');
b.append(" , weight=");
b.append(weight);
b.append(" , ");
break;
case BITSET2:
b.append('[');
b.append(CharacterClass.stringValue2(bitset2));
b.append(']');
b.append(" , weight=");
b.append(weight);
b.append(" , ");
break;
case START:
b.append("abs.start");
break;
case END:
b.append("abs.end");
break;
case END_EOL:
b.append("abs.end-eol");
break;
case LINE_START:
b.append("line start");
break;
case LINE_END:
b.append("line end");
break;
case LAST_MATCH_END:
if(inverse)b.append("non-");
b.append("BOUNDARY");
break;
case BOUNDARY:
if(inverse)b.append("non-");
b.append("BOUNDARY");
break;
case UBOUNDARY:
if(inverse)b.append("non-");
b.append("UBOUNDARY");
break;
case DIRECTION:
b.append("DIRECTION");
break;
case UDIRECTION:
b.append("UDIRECTION");
break;
case FIND:
b.append(">>>{");
b.append(target);
b.append("}, <<");
b.append(distance);
if(eat){
b.append(",eat");
}
b.append(", ");
break;
case REPEAT_0_INF:
b.append("rpt{");
b.append(target);
b.append(",0,inf}");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case REPEAT_MIN_INF:
b.append("rpt{");
b.append(target);
b.append(",");
b.append(minCount);
b.append(",inf}");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case REPEAT_MIN_MAX:
b.append("rpt{");
b.append(target);
b.append(",");
b.append(minCount);
b.append(",");
b.append(maxCount);
b.append("}");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case REPEAT_REG_MIN_INF:
b.append("rpt{$");
b.append(memreg);
b.append(',');
b.append(minCount);
b.append(",inf}");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case REPEAT_REG_MIN_MAX:
b.append("rpt{$");
b.append(memreg);
b.append(',');
b.append(minCount);
b.append(',');
b.append(maxCount);
b.append("}");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case BACKTRACK_0:
b.append("back(0)");
break;
case BACKTRACK_MIN:
b.append("back(");
b.append(minCount);
b.append(")");
break;
case BACKTRACK_REG_MIN:
b.append("back");
b.append("_$");
b.append(memreg);
b.append("(");
b.append(minCount);
b.append(")");
break;
case GROUP_IN:
b.append('(');
if(memreg>0)b.append(memreg);
b.append('-');
b.append(" , ");
break;
case GROUP_OUT:
b.append('-');
if(memreg>0)b.append(memreg);
b.append(')');
b.append(" , ");
break;
case PLOOKAHEAD_IN:
b.append('(');
b.append("=");
b.append(lookaheadId);
b.append(" , ");
break;
case PLOOKAHEAD_OUT:
b.append('=');
b.append(lookaheadId);
b.append(')');
b.append(" , ");
break;
case NLOOKAHEAD_IN:
b.append("(!");
b.append(lookaheadId);
b.append(" , ");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case NLOOKAHEAD_OUT:
b.append('!');
b.append(lookaheadId);
b.append(')');
b.append(" , ");
break;
case PLOOKBEHIND_IN:
b.append('(');
b.append("<=");
b.append(lookaheadId);
b.append(" , dist=");
b.append(distance);
b.append(" , ");
break;
case PLOOKBEHIND_OUT:
b.append("<=");
b.append(lookaheadId);
b.append(')');
b.append(" , ");
break;
case NLOOKBEHIND_IN:
b.append("(");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case NLOOKBEHIND_OUT:
b.append("");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case LOOKAHEAD_CONDITION_IN:
b.append("(cond");
b.append(lookaheadId);
b.append(((Lookahead)this).isPositive? '=': '!');
b.append(" , ");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case LOOKAHEAD_CONDITION_OUT:
b.append("cond");
b.append(lookaheadId);
b.append(")");
if(failNext!=null){
b.append(", =>");
b.append(failNext.instanceNum);
b.append(", ");
}
break;
case REG:
b.append("$");
b.append(memreg);
b.append(", ");
break;
case SUCCESS:
b.append("END");
break;
case BRANCH_STORE_CNT_AUX1:
b.append("(aux1)");
case BRANCH_STORE_CNT:
b.append("(cnt)");
case BRANCH:
b.append("=>");
if(failNext!=null) b.append(failNext.instanceNum);
else b.append("null");
b.append(" , ");
break;
default:
b.append('[');
switch(type){
case CNT_SET_0:
b.append("cnt=0");
break;
case CNT_INC:
b.append("cnt++");
break;
case CNT_GT_EQ:
b.append("cnt>="+maxCount);
break;
case READ_CNT_LT:
b.append("->cnt<"+maxCount);
break;
case CRSTORE_CRINC:
b.append("M("+memreg+")->,Cr("+cntreg+")->,Cr("+cntreg+")++");
break;
case CR_SET_0:
b.append("Cr("+cntreg+")=0");
break;
case CR_LT:
b.append("Cr("+cntreg+")<"+maxCount);
break;
case CR_GT_EQ:
b.append("Cr("+cntreg+")>="+maxCount);
break;
default:
b.append("unknown type: "+type);
}
b.append("] , ");
}
if(next!=null){
b.append("->");
b.append(next.instanceNum);
b.append(", ");
}
//b.append("\r\n");
return b.toString();
}
/**
 * Returns a textual dump of this term and of every term reachable from it.
 *
 * @return the descriptions of the reachable terms, separated by "\r\n"
 */
public String toStringAll(){
    Vector visited=new Vector();
    return toStringAll(visited);
}
/**
 * Dumps this term, then recursively its {@code next} and {@code failNext}
 * successors, skipping terms whose instance number is already in {@code v}.
 *
 * @param v instance numbers (as Integer) of the terms dumped so far; this
 *          term's number is added to it
 * @return the descriptions of the newly visited terms, separated by "\r\n"
 */
public String toStringAll(Vector v){
    v.addElement(new Integer(instanceNum));
    StringBuffer text=new StringBuffer(toString());
    if(next!=null && !v.contains(new Integer(next.instanceNum))){
        text.append("\r\n");
        text.append(next.toStringAll(v));
    }
    // checked only after the 'next' chain was dumped, so shared terms appear once
    if(failNext!=null && !v.contains(new Integer(failNext.instanceNum))){
        text.append("\r\n");
        text.append(failNext.toStringAll(v));
    }
    return text.toString();
}
}
class Pretokenizer{
// Token types reported in ttype. Term.makeTree() also compares ttype with
// plain characters such as '(' , ')' and '|'.
private static final int START=1; // initial value of ttype, before the first next()
static final int END=2;
static final int PLAIN_GROUP=3;
static final int POS_LOOKAHEAD=4;
static final int NEG_LOOKAHEAD=5;
static final int POS_LOOKBEHIND=6;
static final int NEG_LOOKBEHIND=7;
static final int INDEPENDENT_REGEX=8;
static final int COMMENT=9;
static final int CONDITIONAL_GROUP=10;
static final int FLAGS=11;
static final int CLASS_GROUP=12;
static final int NAMED_GROUP=13;
// tOffset/tOutside: bounds of the span handed to Term.append() for the current
// token; next() starts scanning at tOutside+skip
int tOffset,tOutside,skip;
// bounds of the pattern text within data, as passed to the constructor
int offset,end;
int c;
int ttype=START;
char[] data;
//results
// flags parsed from the current token; only meaningful while flagsChanged is set (see flags(int))
private int flags;
private boolean flagsChanged;
char[] brackets;
// name of the current NAMED_GROUP token and whether it declares the group
// (as opposed to reassigning it), as consumed by Term.makeTree()
String groupName;
boolean groupDeclared;
Pretokenizer(char[] data,int offset,int end){
if(offset<0 || end>data.length) throw new IndexOutOfBoundsException("offset="+offset+", end="+end+", length="+data.length);
this.offset=offset;
this.end=end;
this.tOffset=offset;
this.tOutside=offset;
this.data=data;
}
/**
 * Returns the flags in effect after the current token.
 *
 * @param def the flags in effect before the token
 * @return the flags carried by the token if it changed them, otherwise {@code def}
 */
int flags(int def){
    if(flagsChanged){
        return flags;
    }
    return def;
}
void next() throws PatternSyntaxException{
int tOffset=this.tOutside;
int skip=this.skip;
tOffset+=skip;
flagsChanged=false;
int end=this.end;
char[] data=this.data;
boolean esc=false;
for(int i=tOffset;i':
ttype=INDEPENDENT_REGEX;
skip=3; // "(?>"
break;
case '#':
ttype=COMMENT;
skip=3; // ="(?#".length, the makeTree() skips the rest by itself
break;
case '(':
ttype=CONDITIONAL_GROUP;
skip=2; //"(?"+"(..." - skip "(?" (2 chars) and parse condition as a group
break;
case '[':
ttype=CLASS_GROUP;
skip=2; // "(?"+"[..]+...-...&...)" - skip 2 chars and parse a class group
break;
default:
int mOff,mLen;
mLoop:
for(int p=i+2;p0){
flags=Pattern.parseFlags(data,mOff,mLen);
flagsChanged=true;
}
ttype=PLAIN_GROUP;
skip=mLen+3; // "(?imsx:" mLen=4; skip= "(?".len + ":".len + mLen = 2+1+4=7
break mLoop;
case ')':
flags=Pattern.parseFlags(data,mOff=(i+2),mLen=(p-mOff));
flagsChanged=true;
ttype=FLAGS;
skip=mLen+3; // "(?imsx)" mLen=4, skip="(?".len+")".len+mLen=2+1+4=7
break mLoop;
default:
throw new PatternSyntaxException("wrong char after \"(?\": "+c2);
}
}
break;
}
}
else if(((i+2)=0){
if(distance!=pd) throw new PatternSyntaxException("non-equal branch lengths within a lookbehind assertion");
}
super.close();
}
}
/**
 * A repetition of a single-character term or of a back reference. The
 * iterator itself is the REPEAT_* term; its {@code failNext} is the
 * matching BACKTRACK_* term, which is also exposed as {@code out1}.
 */
class Iterator extends Term{
    /**
     * @param term       the term to repeat: CHAR, ANY_CHAR, ANY_CHAR_NE, BITSET,
     *                   BITSET2 or REG
     * @param min        minimum number of repetitions
     * @param max        maximum number of repetitions; negative means unbounded
     * @param collection receives this iterator, for later optimization
     * @throws PatternSyntaxException if {@code term} is of any other type
     */
    Iterator(Term term,int min,int max,Vector collection) throws PatternSyntaxException{
        // registered up front, before the term type is validated
        collection.addElement(this);
        final boolean unbounded= max<0;
        Term back;
        switch(term.type){
            case CHAR:
            case ANY_CHAR:
            case ANY_CHAR_NE:
            case BITSET:
            case BITSET2:
                target=term;
                back=new Term();
                if(unbounded && min<=0){
                    type=REPEAT_0_INF;
                    back.type=BACKTRACK_0;
                }
                else{
                    type= unbounded? REPEAT_MIN_INF: REPEAT_MIN_MAX;
                    back.type=BACKTRACK_MIN;
                    minCount=back.minCount=min;
                    if(!unbounded) maxCount=max;
                }
                break;
            case REG:
                target=term;
                memreg=term.memreg;
                back=new Term();
                type= unbounded? REPEAT_REG_MIN_INF: REPEAT_REG_MIN_MAX;
                back.type=BACKTRACK_REG_MIN;
                minCount=back.minCount=min;
                if(!unbounded) maxCount=max;
                break;
            default:
                throw new PatternSyntaxException("can't iterate this type: "+term.type);
        }
        // common wiring: on failure go to the backtrack term; both the iterator
        // and the backtrack term continue with whatever is linked after us
        failNext=back;
        in=this;
        out=this;
        out1=back;
        branchOut=null;
    }
    /**
     * Replaces the backtrack term with an optimized one when an Optimizer is
     * found for the term that follows it; otherwise leaves it unchanged.
     */
    void optimize(){
        final Term back=failNext;
        final Optimizer opt=Optimizer.find(back.next);
        if(opt!=null){
            failNext=opt.makeBacktrack(back);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy