All downloads are free. The search and download features use the official Maven repository.

com.groupbyinc.common.jregex.CharacterClass Maven / Gradle / Ivy

There is a newer version: 198
Show newest version
/**
 * Copyright (c) 2001, Sergey A. Samokhodkin
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification, 
 * are permitted provided that the following conditions are met:
 * 
 * - Redistributions of source code must retain the above copyright notice, 
 * this list of conditions and the following disclaimer. 
 * - Redistributions in binary form 
 * must reproduce the above copyright notice, this list of conditions and the following 
 * disclaimer in the documentation and/or other materials provided with the distribution.
 * - Neither the name of jregex nor the names of its contributors may be used 
 * to endorse or promote products derived from this software without specific prior 
 * written permission. 
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * @version 1.2_01
 */

package jregex;

import java.util.*;

class CharacterClass extends Term implements UnicodeConstants{
   // Predefined character classes. All twelve are empty here and are
   // configured once by the static initializer of this class.
   // ASCII variants: configured with setDigit/setWordChar/setSpace(false).
   static final Bitset DIGIT=new Bitset();
   static final Bitset WORDCHAR=new Bitset();
   static final Bitset SPACE=new Bitset();
   
   // Unicode variants: configured with setDigit/setWordChar/setSpace(true).
   static final Bitset UDIGIT=new Bitset();
   static final Bitset UWORDCHAR=new Bitset();
   static final Bitset USPACE=new Bitset();
   
   // Negated ASCII variants: same setup as above, then setPositive(false).
   static final Bitset NONDIGIT=new Bitset();
   static final Bitset NONWORDCHAR=new Bitset();
   static final Bitset NONSPACE=new Bitset();
   
   // Negated Unicode variants: same setup as above, then setPositive(false).
   static final Bitset UNONDIGIT=new Bitset();
   static final Bitset UNONWORDCHAR=new Bitset();
   static final Bitset UNONSPACE=new Bitset();
   
   // Set to true at the end of initNames(); main() checks it before calling initNames().
   private static boolean namesInitialized=false;
   
   // Class name -> Bitset; filled by registerClass().
   static final Hashtable namedClasses=new Hashtable();
   // Names of the registered classes, one Vector per "realm"; registerClass()
   // appends a name to the realm it is given if it is not already there.
   // NOTE(review): unicodeBlocks is presumably filled by initNamedBlock() - confirm.
   static final Vector unicodeBlocks=new Vector();
   static final Vector posixClasses=new Vector();
   static final Vector unicodeCategories=new Vector();
   
   // Modes; used in parseGroup().
   // NOTE(review): no use of these constants is visible in this part of the
   // file - confirm they are still referenced.
   private final static int ADD=1;
   private final static int SUBTRACT=2;
   private final static int INTERSECT=3;
   
   // Table of named Unicode blocks, consumed by initNames().
   // Format: entries separated by ';', each entry being
   //    <first code, hex>..<last code, hex>:<block name>
   // initNames() tokenizes this string on the characters ".,:;" so every
   // entry yields exactly three tokens (first, last, name); block names
   // therefore must not contain any of those characters.
   // NOTE(review): "InSpecials" appears twice (FEFF..FEFF and FFF0..FFFD).
   // registerClass() stores classes in a Hashtable keyed by name, so if each
   // entry is registered separately the later range would replace the
   // earlier one - confirm against initNamedBlock().
   private static final String blockData=
     "0000..007F:InBasicLatin;0080..00FF:InLatin-1Supplement;0100..017F:InLatinExtended-A;"
    +"0180..024F:InLatinExtended-B;0250..02AF:InIPAExtensions;02B0..02FF:InSpacingModifierLetters;"
    +"0300..036F:InCombiningDiacriticalMarks;0370..03FF:InGreek;0400..04FF:InCyrillic;0530..058F:InArmenian;"
    +"0590..05FF:InHebrew;0600..06FF:InArabic;0700..074F:InSyriac;0780..07BF:InThaana;0900..097F:InDevanagari;"
    +"0980..09FF:InBengali;0A00..0A7F:InGurmukhi;0A80..0AFF:InGujarati;0B00..0B7F:InOriya;0B80..0BFF:InTamil;"
    +"0C00..0C7F:InTelugu;0C80..0CFF:InKannada;0D00..0D7F:InMalayalam;0D80..0DFF:InSinhala;0E00..0E7F:InThai;"
    +"0E80..0EFF:InLao;0F00..0FFF:InTibetan;1000..109F:InMyanmar;10A0..10FF:InGeorgian;1100..11FF:InHangulJamo;"
    +"1200..137F:InEthiopic;13A0..13FF:InCherokee;1400..167F:InUnifiedCanadianAboriginalSyllabics;"
    +"1680..169F:InOgham;16A0..16FF:InRunic;1780..17FF:InKhmer;1800..18AF:InMongolian;"
    +"1E00..1EFF:InLatinExtendedAdditional;1F00..1FFF:InGreekExtended;2000..206F:InGeneralPunctuation;"
    +"2070..209F:InSuperscriptsAndSubscripts;20A0..20CF:InCurrencySymbols;"
    +"20D0..20FF:InCombiningMarksForSymbols;2100..214F:InLetterLikeSymbols;2150..218F:InNumberForms;"
    +"2190..21FF:InArrows;2200..22FF:InMathematicalOperators;2300..23FF:InMiscellaneousTechnical;"
    +"2400..243F:InControlPictures;2440..245F:InOpticalCharacterRecognition;"
    +"2460..24FF:InEnclosedAlphanumerics;2500..257F:InBoxDrawing;2580..259F:InBlockElements;"
    +"25A0..25FF:InGeometricShapes;2600..26FF:InMiscellaneousSymbols;2700..27BF:InDingbats;"
    +"2800..28FF:InBraillePatterns;2E80..2EFF:InCJKRadicalsSupplement;2F00..2FDF:InKangxiRadicals;"
    +"2FF0..2FFF:InIdeographicDescriptionCharacters;3000..303F:InCJKSymbolsAndPunctuation;"
    +"3040..309F:InHiragana;30A0..30FF:InKatakana;3100..312F:InBopomofo;3130..318F:InHangulCompatibilityJamo;"
    +"3190..319F:InKanbun;31A0..31BF:InBopomofoExtended;3200..32FF:InEnclosedCJKLettersAndMonths;"
    +"3300..33FF:InCJKCompatibility;3400..4DB5:InCJKUnifiedIdeographsExtensionA;"
    +"4E00..9FFF:InCJKUnifiedIdeographs;A000..A48F:InYiSyllables;A490..A4CF:InYiRadicals;"
    +"AC00..D7A3:InHangulSyllables;D800..DB7F:InHighSurrogates;DB80..DBFF:InHighPrivateUseSurrogates;"
    +"DC00..DFFF:InLowSurrogates;E000..F8FF:InPrivateUse;F900..FAFF:InCJKCompatibilityIdeographs;"
    +"FB00..FB4F:InAlphabeticPresentationForms;FB50..FDFF:InArabicPresentationForms-A;"
    +"FE20..FE2F:InCombiningHalfMarks;FE30..FE4F:InCJKCompatibilityForms;FE50..FE6F:InSmallFormVariants;"
    +"FE70..FEFE:InArabicPresentationForms-B;FEFF..FEFF:InSpecials;FF00..FFEF:InHalfWidthAndFullWidthForms;"
    +"FFF0..FFFD:InSpecials";
   
   static{
      //*
      DIGIT.setDigit(false);
      WORDCHAR.setWordChar(false);
      SPACE.setSpace(false);
      
      UDIGIT.setDigit(true);
      UWORDCHAR.setWordChar(true);
      USPACE.setSpace(true);
      
      NONDIGIT.setDigit(false); NONDIGIT.setPositive(false); 
      NONWORDCHAR.setWordChar(false); NONWORDCHAR.setPositive(false); 
      NONSPACE.setSpace(false); NONSPACE.setPositive(false); 
      
      UNONDIGIT.setDigit(true); UNONDIGIT.setPositive(false);
      UNONWORDCHAR.setWordChar(true); UNONWORDCHAR.setPositive(false);
      UNONSPACE.setSpace(true); UNONSPACE.setPositive(false);
      
      initPosixClasses();
   }
   
   /**
    * Stores a named class and records its name in the given realm.
    * The class is always (re)bound to the name in namedClasses; the name is
    * appended to the realm only when the realm does not contain it yet.
    *
    * @param name  the name under which the class is looked up
    * @param cls   the character set bound to that name
    * @param realm the list of names the class belongs to (posix classes,
    *              unicode categories, ...)
    */
   private static void registerClass(String name,Bitset cls,Vector realm){
      namedClasses.put(name,cls);
      boolean alreadyListed=realm.contains(name);
      if(alreadyListed) return;
      realm.addElement(name);
   }
   
   /**
    * Builds the POSIX-style named classes (ASCII only) and registers each of
    * them under its name in the posixClasses realm. The registration order
    * is the order in which the names end up in posixClasses.
    *
    * Registered names: Lower, Upper, ASCII, Alpha, Digit, Alnum, Punct,
    * Graph, Print, Blank, Cntrl, XDigit, Space.
    */
   private static void initPosixClasses(){
      Bitset lower=new Bitset();
         lower.setRange('a','z');
         registerClass("Lower",lower,posixClasses);
      Bitset upper=new Bitset();
         upper.setRange('A','Z');
         registerClass("Upper",upper,posixClasses);
      Bitset ascii=new Bitset();
         ascii.setRange((char)0,(char)0x7f);
         registerClass("ASCII",ascii,posixClasses);
      // Alpha = Lower + Upper
      Bitset alpha=new Bitset();
         alpha.add(lower);
         alpha.add(upper);
         registerClass("Alpha",alpha,posixClasses);
      Bitset digit=new Bitset();
         digit.setRange('0','9');
         registerClass("Digit",digit,posixClasses);
      // Alnum = Alpha + Digit
      Bitset alnum=new Bitset();
         alnum.add(alpha);
         alnum.add(digit);
         registerClass("Alnum",alnum,posixClasses);
      Bitset punct=new Bitset();
         punct.setChars("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
         registerClass("Punct",punct,posixClasses);
      // Graph = Alnum + Punct (visible characters, no space)
      Bitset graph=new Bitset();
         graph.add(alnum);
         graph.add(punct);
         registerClass("Graph",graph,posixClasses);
      // Print = Graph + the space character. It used to be registered as the
      // very same set as Graph, so \p{Print} did not match ' '.
      Bitset print=new Bitset();
         print.add(graph);
         print.setChar(' ');
         registerClass("Print",print,posixClasses);
      Bitset blank=new Bitset();
         blank.setChars(" \t");
         registerClass("Blank",blank,posixClasses);
      // Cntrl = 0x00..0x1f plus DEL (0x7f)
      Bitset cntrl=new Bitset();
         cntrl.setRange((char)0,(char)0x1f);
         cntrl.setChar((char)0x7f);
         registerClass("Cntrl",cntrl,posixClasses);
      Bitset xdigit=new Bitset();
         xdigit.setRange('0','9');
         xdigit.setRange('a','f');
         xdigit.setRange('A','F');
         registerClass("XDigit",xdigit,posixClasses);
      // Space = space, tab, LF, CR, FF and vertical tab (0x0b)
      Bitset space=new Bitset();
         space.setChars(" \t\n\r\f\u000b");
         registerClass("Space",space,posixClasses);
   }
   
   /**
    * Registers the Unicode-related named classes and marks the names as
    * initialized. In order, it:
    *  - calls initNamedCategory() for every general category group
    *    (C, L, M, N, Z, P, S) and for each of their two-letter members;
    *  - registers UNASSIGNED (category Cn) and ASSIGNED (its negation)
    *    in the unicodeCategories realm;
    *  - parses blockData and calls initNamedBlock() for every entry;
    *  - calls initNamedBlock() for "ALL", covering 0..0xffff;
    *  - sets namesInitialized to true.
    * The call order is significant: it is the order in which names are
    * appended to the realm vectors.
    */
   private static void initNames(){
      // "Other" categories
      initNamedCategory("C",new int[]{Cn,Cc,Cf,Co,Cs}); 
      initNamedCategory("Cn",Cn); 
      initNamedCategory("Cc",Cc);
      initNamedCategory("Cf",Cf);
      initNamedCategory("Co",Co);
      initNamedCategory("Cs",Cs);
      
      // Letters
      initNamedCategory("L",new int[]{Lu,Ll,Lt,Lm,Lo}); 
      initNamedCategory("Lu",Lu);
      initNamedCategory("Ll",Ll);
      initNamedCategory("Lt",Lt);
      initNamedCategory("Lm",Lm);
      initNamedCategory("Lo",Lo);
      
      // Marks
      initNamedCategory("M",new int[]{Mn,Me,Mc}); 
      initNamedCategory("Mn",Mn);
      initNamedCategory("Me",Me);
      initNamedCategory("Mc",Mc);
      
      // Numbers
      initNamedCategory("N",new int[]{Nd,Nl,No}); 
      initNamedCategory("Nd",Nd);
      initNamedCategory("Nl",Nl);
      initNamedCategory("No",No);
      
      // Separators
      initNamedCategory("Z",new int[]{Zs,Zl,Zp}); 
      initNamedCategory("Zs",Zs);
      initNamedCategory("Zl",Zl);
      initNamedCategory("Zp",Zp);
      
      // Punctuation
      initNamedCategory("P",new int[]{Pd,Ps,Pi,Pe,Pf,Pc,Po}); 
      initNamedCategory("Pd",Pd);
      initNamedCategory("Ps",Ps);
      initNamedCategory("Pi",Pi);
      initNamedCategory("Pe",Pe);
      initNamedCategory("Pf",Pf);
      initNamedCategory("Pc",Pc);
      initNamedCategory("Po",Po);
      
      // Symbols
      initNamedCategory("S",new int[]{Sm,Sc,Sk,So}); 
      initNamedCategory("Sm",Sm);
      initNamedCategory("Sc",Sc);
      initNamedCategory("Sk",Sk);
      initNamedCategory("So",So);
      
      // UNASSIGNED is category Cn; ASSIGNED is the same set, negated.
      Bitset bs=new Bitset();
      bs.setCategory(Cn);
      registerClass("UNASSIGNED",bs,unicodeCategories);
      bs=new Bitset();
      bs.setCategory(Cn);
      bs.setPositive(false);
      registerClass("ASSIGNED",bs,unicodeCategories);
      
      // Each blockData entry "XXXX..YYYY:Name;" splits into three tokens on
      // the delimiters ".,:;": first code (hex), last code (hex), block name.
      StringTokenizer st=new StringTokenizer(blockData,".,:;");
      while(st.hasMoreTokens()){
         try{
            int first=Integer.parseInt(st.nextToken(),16);
            int last=Integer.parseInt(st.nextToken(),16);
            String name=st.nextToken();
            initNamedBlock(name,first,last);
         }
         catch(Exception e){
            // A bad entry is reported and the loop goes on. Note that a
            // failure part-way through an entry leaves its remaining tokens
            // unread, so the following iterations start mid-entry.
            e.printStackTrace();
         }
      }
      
      // Catch-all block covering every char value.
      initNamedBlock("ALL",0,0xffff);
      
      namesInitialized=true;
      //*/
   }
   
   private static void initNamedBlock(String name,int first,int last){
      if(firstCharacter.MAX_VALUE) throw new IllegalArgumentException("wrong start code ("+first+") in block "+name);
      if(lastCharacter.MAX_VALUE) throw new IllegalArgumentException("wrong end code ("+last+") in block "+name);
      if(last=0){
                  char c1=(char)prev;
                  if(icase){
                     bs.setChar(Character.toLowerCase(c1));
                     bs.setChar(Character.toUpperCase(c1));
                     bs.setChar(Character.toTitleCase(c1));
                  }
                  else bs.setChar(c1);
               }
               return i;
               
            case '-':
               if(isFirst) break;
               //if(isFirst) throw new PatternSyntaxException("[-...] is illegal");
               if(inRange) break;
               //if(inRange) throw new PatternSyntaxException("[...--...] is illegal");
               inRange=true;
               continue;
               
            case '[':
               if(inRange && xml){ //[..-[..]]
                  if(prev>=0) bs.setChar((char)prev);
                  if(bs1==null) bs1=new Bitset();
                  else bs1.reset();
                  i=parseClass(data,i,out,bs1,icase,skipspaces,unicode,xml);
//System.out.println("     i="+i);
                  bs.subtract(bs1);
                  inRange=false;
                  prev=-1;
                  continue;
               }
               else break handle_special;
               
            case '^':
               //if(!isFirst) throw new PatternSyntaxException("'^' isn't a first char in a class def");
               //bs.setPositive(false);
               //setFirst=true;
               //continue;
               if(isFirst){
                  bs.setPositive(false);
                  setFirst=true;
                  continue;
               }
               //treat as normal char
               break;
               
            case ' ':
            case '\r':
            case '\n':
            case '\t':
            case '\f':
               if(skipspaces) continue;
               else break handle_special;
            case '\\':
               Bitset negatigeClass=null;
               boolean inv=false;
               handle_escape: switch(c=data[i++]){
                  case 'r':
                     c='\r';
                     break handle_special;
                     
                  case 'n':
                     c='\n';
                     break handle_special;
                     
                  case 't':
                     c='\t';
                     break handle_special;
                     
                  case 'f':
                     c='\f';
                     break handle_special;
                     
                  case 'u':
                     if(i>=out-4) throw  new PatternSyntaxException("incomplete escape sequence \\uXXXX");
                     c=(char)((toHexDigit(c)<<12)
                             +(toHexDigit(data[i++])<<8)
                             +(toHexDigit(data[i++])<<4)
                             +toHexDigit(data[i++]));
                     break handle_special;
                     
                  case 'v':
                     c=(char)((toHexDigit(c)<<24)+
                              (toHexDigit(data[i++])<<16)+
                              (toHexDigit(data[i++])<<12)+
                              (toHexDigit(data[i++])<<8)+
                              (toHexDigit(data[i++])<<4)+
                               toHexDigit(data[i++]));
                     break handle_special;
                     
                  case 'b':
                     c=8; // backspace
                     break handle_special;

                  case 'x':{   // hex 2-digit number
                     int hex=0;
                     char d;
                     if((d=data[i++])=='{'){
                        while((d=data[i++])!='}'){
                           hex=(hex<<4)+toHexDigit(d);
                        }
                        if(hex>0xffff) throw new PatternSyntaxException("\\x{}");
                     }
                     else{
                        hex=(toHexDigit(d)<<4)+toHexDigit(data[i++]);
                     }
                     c=(char)hex;
                     break handle_special;
                  }
                  case 'o':   // oct 2- or 3-digit number
                     int oct=0;
                     for(;;){
                        char d=data[i++];
                        if(d>='0' && d<='7'){
                           oct*=8;
                           oct+=d-'0';
                           if(oct>0xffff) break;
                        }
                        else break;
                     }
                     c=(char)oct;
                     break handle_special;
                     
                  case 'm':   // decimal number -> char
                     int dec=0;
                     for(;;){
                        char d=data[i++];
                        if(d>='0' && d<='9'){
                           dec*=10;
                           dec+=d-'0';
                           if(dec>0xffff) break;
                        }
                        else break;
                     }
                     c=(char)dec;
                     break handle_special;
                     
                  case 'c':   // ctrl-char
                     c=(char)(data[i++]&0x1f);
                     break handle_special;
                  
                  //classes;
                  //
                  case 'D':   // non-digit
                     negatigeClass=unicode? UNONDIGIT: NONDIGIT;
                     break handle_escape;
                     
                  case 'S':   // space
                     negatigeClass=unicode? UNONSPACE: NONSPACE;
                     break handle_escape;
                     
                  case 'W':   // space
                     negatigeClass=unicode? UNONWORDCHAR: NONWORDCHAR;
                     break handle_escape;
                     
                  case 'd':   // digit
                     if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\d...]");
                     bs.setDigit(unicode);
                     continue;
                     
                  case 's':   // digit
                     if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\s...]");
                     bs.setSpace(unicode);
                     continue;
                     
                  case 'w':   // digit
                     if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\w...]");
                     bs.setWordChar(unicode);
                     continue;
                     
                  case 'P':   // \\P{..}
                     inv=true;
                  case 'p':   // \\p{..}
                     if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\w...]");
                     if(sb==null) sb=new StringBuffer();
                     else sb.setLength(0);
                     i=parseName(data,i,out,sb,skipspaces);
                     Bitset nc=getNamedClass(sb.toString());
                     if(nc==null) throw new PatternSyntaxException("unknown named class: {"+sb+"}");
                     bs.add(nc,inv);
                     continue;
                     
                  default:
                     //other escaped treat as normal
                     break handle_special;
               }
               //negatigeClass;
               //\S,\D,\W
               if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\"+c+"...]");
               bs.add(negatigeClass);
               continue;
               
            case '{':   //
               if(inRange) throw new PatternSyntaxException("illegal range: [..."+prev+"-\\w...]");
               if(sb==null) sb=new StringBuffer();
               else sb.setLength(0);
               i=parseName(data,i-1,out,sb,skipspaces);
               Bitset nc=getNamedClass(sb.toString());
               if(nc==null) throw new PatternSyntaxException("unknown named class: {"+sb+"}");
               bs.add(nc,false);
               continue;
                     
            default:
         }
         //c is a normal char
//System.out.println("      normal c="+c+", inRange="+inRange+", prev="+(char)prev);
         if(prev<0){
            prev=c;
            inRange=false;
            continue;
         }
         if(!inRange){
            char c1=(char)prev;
            if(icase){
               bs.setChar(Character.toLowerCase(c1));
               bs.setChar(Character.toUpperCase(c1));
               bs.setChar(Character.toTitleCase(c1));
            }
            else bs.setChar(c1);
            prev=c;
         }
         else{
            if(prev>c) throw new PatternSyntaxException("illegal range: "+prev+">"+c);
            char c0=(char)prev;
            inRange=false;
            prev=-1;
            if(icase){
               bs.setRange(Character.toLowerCase(c0),Character.toLowerCase(c));
               bs.setRange(Character.toUpperCase(c0),Character.toUpperCase(c));
               bs.setRange(Character.toTitleCase(c0),Character.toTitleCase(c));
            }
            else bs.setRange(c0,c);
         }
      }
      throw new PatternSyntaxException("unbalanced brackets in a class def");
   }
   
      
   final static int parseName(char[] data,int i,int out,StringBuffer sb,
                              boolean skipspaces) throws PatternSyntaxException{
      char c;
      int start=-1;
      while(i=0xff) break loop;
         }      
         int first=c;
         while(arr[c]){
//System.out.println(c+": "+arr[c]);
            c++;
            if(c>0xff) break;
         }     
         int last=c-1;
         if(last==first) b.append(stringValue(last));
         else{
            b.append(stringValue(first));
            b.append('-');
            b.append(stringValue(last));
         }
         if(c>0xff) break;
      }
      return b.toString();
   }
   
   /* Mmm.. what is it? 
   static String stringValueC(boolean[] categories){
      StringBuffer sb=new StringBuffer();
      for(int i=0;i>8];
            if(marks!=null && marks[c&255]) break;
            c++;
            if(c>0xffff) break loop;
         }
         int first=c;
         for(;c<=0xffff;){
            boolean[] marks=arr[c>>8];
            if(marks==null || !marks[c&255]) break;
            c++;
         }
         int last=c-1;
         if(last==first) b.append(stringValue(last));
         else{
            b.append(stringValue(first));
            b.append('-');
            b.append(stringValue(last));
         }
         if(c>0xffff) break;
      }
      return b.toString();
   }
   
   /**
    * Renders a single character code as text.
    * <ul>
    *  <li>codes 256 and above: "\x" followed by the code in lower-case hex;</li>
    *  <li>codes 32..255: the character itself;</li>
    *  <li>CR, LF, TAB and FF: the escapes \r, \n, \t and \f;</li>
    *  <li>any other code below 32: the decimal code in parentheses.</li>
    * </ul>
    *
    * @param c the character code
    * @return the textual form of the code
    */
   static String stringValue(int c){
      if(c>=256){
         return "\\x"+Integer.toHexString(c);
      }
      if(c>=32){
         return String.valueOf((char)c);
      }
      switch(c){
         case '\r': return "\\r";
         case '\n': return "\\n";
         case '\t': return "\\t";
         case '\f': return "\\f";
         default:   return "("+c+")";
      }
   }
   
   /**
    * Converts one hexadecimal digit character to its numeric value.
    *
    * @param d a character in '0'..'9', 'a'..'f' or 'A'..'F'
    * @return the value of the digit, 0..15
    * @throws PatternSyntaxException if d is not a hexadecimal digit
    */
   static int toHexDigit(char d) throws PatternSyntaxException{
      if('0'<=d && d<='9') return d-'0';
      if('a'<=d && d<='f') return 10+d-'a';
      if('A'<=d && d<='F') return 10+d-'A';
      throw new PatternSyntaxException("hexadecimal digit expected: "+d);
   }
   
   public static void main(String[] args){
      if(!namesInitialized)initNames();
      if(args.length==0){
         System.out.println("Class usage: \\p{Class},\\P{Class}");
         printRealm(posixClasses,"Posix classes");
         printRealm(unicodeCategories,"Unicode categories");
         printRealm(unicodeBlocks,"Unicode blocks");
      }
      else{
         for(int i=0;i>8)&0xff;
         if(data[cat][b]==0){
            data[cat][b]=1;
            data[cat][BLOCK_SIZE+1]++;
         }
      }
      for(int i=0;i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy