All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.groupbyinc.common.jregex.Pattern Maven / Gradle / Ivy

There is a newer version: 198
Show newest version
/**
 * Copyright (c) 2001, Sergey A. Samokhodkin
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification, 
 * are permitted provided that the following conditions are met:
 * 
 * - Redistributions of source code must retain the above copyright notice, 
 * this list of conditions and the following disclaimer. 
 * - Redistributions in binary form 
 * must reproduce the above copyright notice, this list of conditions and the following 
 * disclaimer in the documentation and/or other materials provided with the distribution.
 * - Neither the name of jregex nor the names of its contributors may be used 
 * to endorse or promote products derived from this software without specific prior 
 * written permission. 
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * @version 1.2_01
 */

package jregex;

import java.io.*;
import java.util.*;

/**
 * A handle for a precompiled regular expression.
* To match a regular expression myExpr against a text myString one should first create a Pattern object:
 * Pattern p=new Pattern(myExpr);
 * 
* then obtain a Matcher object:
 * Matcher matcher=p.matcher(myText);
 * 
* The latter is an automaton that actually performs a search. It provides the following methods: *
  • search for matching substrings : matcher.find() or matcher.findAll(); *
  • test whether the text matches the whole pattern : matcher.matches(); *
  • test whether the text matches the beginning of the pattern : matcher.matchesPrefix(); *
  • search with custom options : matcher.find(int options) *

    * Flags
    * Flags (see the REFlags interface) change the meaning of some regular expression elements at compile time. * These flags may be passed either as a string (see Pattern(String,String)) or as a bitwise OR of: *

  • REFlags.IGNORE_CASE - enables case insensitivity *
  • REFlags.MULTILINE - forces "^" and "$" to match both at the start and the end of line; *
  • REFlags.DOTALL - forces "." to match eols('\r' and '\n' in ASCII); *
  • REFlags.IGNORE_SPACES - literal spaces in expression are ignored for better readability; *
  • REFlags.UNICODE - the predefined classes('\w','\d',etc) are referenced to Unicode; *
  • REFlags.XML_SCHEMA - permits XML Schema regular expressions syntax extensions. *

    * Multithreading
    * Pattern instances are thread-safe, i.e. the same Pattern object may be used * by any number of threads simultaniously. On the other hand, the Matcher objects * are NOT thread safe, so, given a Pattern instance, each thread must obtain * and use its own Matcher. * * @see REFlags * @see Matcher * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) * @see Matcher#setTarget(java.io.Reader,int) * @see MatchResult * @see MatchResult#group(int) * @see MatchResult#start(int) * @see MatchResult#end(int) * @see MatchResult#length(int) * @see MatchResult#charAt(int,int) * @see MatchResult#prefix() * @see MatchResult#suffix() */ public class Pattern implements Serializable,REFlags{ String stringRepr; // tree entry Term root,root0; // required number of memory slots int memregs; // required number of iteration counters int counters; // number of lookahead groups int lookaheads; Hashtable namedGroupMap; protected Pattern() throws PatternSyntaxException{} /** * Compiles an expression with default flags. * @param regex the Perl5-compatible regular expression string. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * @see Pattern#Pattern(java.lang.String,java.lang.String) * @see Pattern#Pattern(java.lang.String,int) */ public Pattern(String regex) throws PatternSyntaxException{ this(regex,DEFAULT); } /** * Compiles a regular expression using Perl5-style flags. * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen. * The meaning of letters: *

      *
    • i - case insensitivity, corresponds to REFlags.IGNORE_CASE; *
    • m - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag; *
    • s - single line treatment('.' matches \r's and \n's), corresponds to REFlags.DOTALL; *
    • x - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES. *
    • u - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty. *
    • X - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA. *
    * @param regex the Perl5-compatible regular expression string. * @param flags the Perl5-compatible flags. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * see REFlags */ public Pattern(String regex,String flags) throws PatternSyntaxException{ stringRepr=regex; compile(regex,parseFlags(flags)); } /** * Compiles a regular expression using REFlags. * The flags parameter is a bitwise OR of the folloing values: *
      *
    • REFlags.IGNORE_CASE - case insensitivity, corresponds to 'i' letter; *
    • REFlags.MULTILINE - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to 'm'; *
    • REFlags.DOTALL - single line treatment('.' matches \r's and \n's), corresponds to 's'; *
    • REFlags.IGNORE_SPACES - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to 'x'. *
    • REFlags.UNICODE - predefined classes are regarded as belonging to Unicode, corresponds to 'u'; this may yield some performance penalty. *
    • REFlags.XML_SCHEMA - compatibility with XML Schema, corresponds to 'X'. *
    * @param regex the Perl5-compatible regular expression string. * @param flags the Perl5-compatible flags. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * see REFlags */ public Pattern(String regex, int flags) throws PatternSyntaxException{ compile(regex,flags); } /* //java.util.regex.* compatibility public static Pattern compile(String regex,int flags) throws PatternSyntaxException{ Pattern p=new Pattern(); p.compile(regex,flags); return flags; } */ protected void compile(String regex,int flags) throws PatternSyntaxException{ stringRepr=regex; Term.makeTree(regex,flags,this); } /** * How many capturing groups this expression includes? */ public int groupCount(){ return memregs; } /** * Get numeric id for a group name. * @return null if no such name found. * @see MatchResult#group(java.lang.String) * @see MatchResult#isCaptured(java.lang.String) */ public Integer groupId(String name){ return ((Integer)namedGroupMap.get(name)); } /** * A shorthand for Pattern.matcher(String).matches().
    * @param s the target * @return true if the entire target matches the pattern * @see Matcher#matches() * @see Matcher#matches(String) */ public boolean matches(String s){ return matcher(s).matches(); } /** * A shorthand for Pattern.matcher(String).matchesPrefix().
    * @param s the target * @return true if the entire target matches the beginning of the pattern * @see Matcher#matchesPrefix() */ public boolean startsWith(String s){ return matcher(s).matchesPrefix(); } /** * Returns a targetless matcher. * Don't forget to supply a target. */ public Matcher matcher(){ return new Matcher(this); } /** * Returns a matcher for a specified string. */ public Matcher matcher(String s){ Matcher m=new Matcher(this); m.setTarget(s); return m; } /** * Returns a matcher for a specified region. */ public Matcher matcher(char[] data,int start,int end){ Matcher m=new Matcher(this); m.setTarget(data,start,end); return m; } /** * Returns a matcher for a match result (in a performance-friendly way). * groupId parameter specifies which group is a target. * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET. */ public Matcher matcher(MatchResult res,int groupId){ Matcher m=new Matcher(this); if(res instanceof Matcher){ m.setTarget((Matcher)res,groupId); } else{ m.setTarget(res.targetChars(),res.start(groupId)+res.targetStart(),res.length(groupId)); } return m; } /** * Just as above, yet with symbolic group name. * @exception NullPointerException if there is no group with such name */ public Matcher matcher(MatchResult res,String groupName){ Integer id=res.pattern().groupId(groupName); if(id==null) throw new IllegalArgumentException("group not found:"+groupName); int group=id.intValue(); return matcher(res,group); } /** * Returns a matcher taking a text stream as target. * Note that this is not a true POSIX-style stream matching, i.e. the whole length of the text is preliminary read and stored in a char array. * @param text a text stream * @param len the length to read from a stream; if len is -1, the whole stream is read in. 
* @exception IOException indicates an IO problem * @exception OutOfMemoryException if a stream is too lengthy */ public Matcher matcher(Reader text,int length)throws IOException{ Matcher m=new Matcher(this); m.setTarget(text,length); return m; } /** * Returns a replacer of a pattern by specified perl-like expression. * Such replacer will substitute all occurences of a pattern by an evaluated expression * ("$&" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc). * Example:
       * String text="The quick brown fox jumped over the lazy dog";
       * Pattern word=new Pattern("\\w+");
       * System.out.println(word.replacer("[$&]").replace(text));
       * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]"
       * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)");
       * System.out.println(swap.replacer("$3$2$1").replace(text));
       * //prints "The quick brown dog jumped over the lazy fox"
       * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)");
       * System.out.println(scramble.replacer("$3$2$1").replace(text));
       * //prints "quick The fox brown over jumped lazy the dog"
       * 
    * @param expr a perl-like expression, the "$&" and "${&}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo. * @see Replacer */ public Replacer replacer(String expr){ return new Replacer(this,expr); } /** * Returns a replacer will substitute all occurences of a pattern * through applying a user-defined substitution model. * @param model a Substitution object which is in charge for match substitution * @see Replacer */ public Replacer replacer(Substitution model){ return new Replacer(this,model); } /** * Tokenizes a text by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,String); * @see RETokenizer * @see RETokenizer#RETokenizer(jregex.Pattern,java.lang.String) * */ public RETokenizer tokenizer(String text){ return new RETokenizer(this,text); } /** * Tokenizes a specified region by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,char[],int,int); * @see RETokenizer * @see RETokenizer#RETokenizer(jregex.Pattern,char[],int,int) */ public RETokenizer tokenizer(char[] data,int off,int len){ return new RETokenizer(this,data,off,len); } /** * Tokenizes a specified region by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,Reader,int); * @see RETokenizer * @see RETokenizer#RETokenizer(jregex.Pattern,java.io.Reader,int) */ public RETokenizer tokenizer(Reader in,int length) throws IOException{ return new RETokenizer(this,in,length); } public String toString(){ return stringRepr; } /** * Returns a less or more readable representation of a bytecode for the pattern. 
*/ public String toString_d(){ return root.toStringAll(); } static int parseFlags(String flags)throws PatternSyntaxException{ boolean enable=true; int len=flags.length(); int result=DEFAULT; for(int i=0;i




  • © 2015 - 2025 Weber Informatics LLC | Privacy Policy