// com.groupbyinc.common.jregex.Pattern Maven / Gradle / Ivy
/**
* Copyright (c) 2001, Sergey A. Samokhodkin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form
* must reproduce the above copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided with the distribution.
* - Neither the name of jregex nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @version 1.2_01
*/
package jregex;
import java.io.*;
import java.util.*;
/**
* A handle for a precompiled regular expression.
* To match a regular expression myExpr against a text myText
* one should first create a Pattern object:
* Pattern p=new Pattern(myExpr);
*
* then obtain a Matcher object:
* Matcher matcher=p.matcher(myText);
*
* The latter is an automaton that actually performs a search. It provides the following methods:
* search for matching substrings : matcher.find() or matcher.findAll();
* test whether the text matches the whole pattern : matcher.matches();
* test whether the text matches the beginning of the pattern : matcher.matchesPrefix();
* search with custom options : matcher.find(int options)
*
* Flags
* Flags (see the REFlags interface) change the meaning of some regular expression elements at compile time.
* These flags may be passed either as a string (see Pattern(String,String)) or as a bitwise OR of:
*
* REFlags.IGNORE_CASE - enables case insensitivity;
* REFlags.MULTILINE - forces "^" and "$" to match both at the start and the end of line;
* REFlags.DOTALL - forces "." to match eols ('\r' and '\n' in ASCII);
* REFlags.IGNORE_SPACES - literal spaces in expression are ignored for better readability;
* REFlags.UNICODE - the predefined classes ('\w', '\d', etc.) are referenced to Unicode;
* REFlags.XML_SCHEMA - permits XML Schema regular expression syntax extensions.
*
* Multithreading
* Pattern instances are thread-safe, i.e. the same Pattern object may be used
* by any number of threads simultaneously. On the other hand, the Matcher objects
* are NOT thread-safe, so, given a Pattern instance, each thread must obtain
* and use its own Matcher.
*
* @see REFlags
* @see Matcher
* @see Matcher#setTarget(java.lang.String)
* @see Matcher#setTarget(java.lang.String,int,int)
* @see Matcher#setTarget(char[],int,int)
* @see Matcher#setTarget(java.io.Reader,int)
* @see MatchResult
* @see MatchResult#group(int)
* @see MatchResult#start(int)
* @see MatchResult#end(int)
* @see MatchResult#length(int)
* @see MatchResult#charAt(int,int)
* @see MatchResult#prefix()
* @see MatchResult#suffix()
*/
public class Pattern implements Serializable,REFlags{
// the expression text this pattern was built from; assigned in the
// (String,String) constructor and in compile(), returned verbatim by toString()
String stringRepr;
// tree entry; root is the node whose toStringAll() output toString_d() returns
// NOTE(review): the role of root0 is not visible in this part of the file - confirm in Term
Term root,root0;
// required number of memory slots; this is the value reported by groupCount()
int memregs;
// required number of iteration counters
int counters;
// number of lookahead groups
int lookaheads;
// lookup table used by groupId(): group name -> Integer id
// NOTE(review): presumably filled in by Term.makeTree() during compile() - verify there
Hashtable namedGroupMap;
/**
 * Creates an instance without compiling any expression.
 * The body is empty, so every field keeps its default value until
 * {@link #compile(String,int)} (or other code in this package) fills it in.
 * @exception PatternSyntaxException declared in the signature; nothing in the empty body throws it.
 */
protected Pattern() throws PatternSyntaxException{
}
/**
 * Compiles an expression with default flags.
 * Simply delegates to {@link #Pattern(java.lang.String,int)} passing <code>DEFAULT</code> as the flags value.
 * @param regex the Perl5-compatible regular expression string.
 * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
 * @see Pattern#Pattern(java.lang.String,java.lang.String)
 * @see Pattern#Pattern(java.lang.String,int)
 */
public Pattern(String regex) throws PatternSyntaxException{
    this(regex, DEFAULT);
}
/**
* Compiles a regular expression using Perl5-style flags.
* The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen.
* The meaning of letters:
*
* - i - case insensitivity, corresponds to REFLlags.IGNORE_CASE;
*
- m - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFLlags.MULTILINE flag;
*
- s - single line treatment('.' matches \r's and \n's),corresponds to REFLlags.DOTALL;
*
- x - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFLlags.IGNORE_SPACES.
*
- u - predefined classes are regarded as belonging to Unicode, corresponds to REFLlags.UNICODE; this may yield some performance penalty.
*
- X - compatibility with XML Schema, corresponds to REFLlags.XML_SCHEMA.
*
* @param regex
the Perl5-compatible regular expression string.
* @param flags
the Perl5-compatible flags.
* @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
* see REFlags
*/
public Pattern(String regex,String flags) throws PatternSyntaxException{
stringRepr=regex;
compile(regex,parseFlags(flags));
}
/**
 * Compiles a regular expression using REFlags.
 * The <code>flags</code> parameter is a bitwise OR of the following values:
 * <ul>
 * <li>REFlags.IGNORE_CASE - case insensitivity, corresponds to the 'i' letter;</li>
 * <li>REFlags.MULTILINE - multiline treatment (BOLs and EOLs affect the '^' and '$'), corresponds to 'm';</li>
 * <li>REFlags.DOTALL - single line treatment ('.' matches \r's and \n's), corresponds to 's';</li>
 * <li>REFlags.IGNORE_SPACES - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to 'x';</li>
 * <li>REFlags.UNICODE - predefined classes are regarded as belonging to Unicode, corresponds to 'u'; this may yield some performance penalty;</li>
 * <li>REFlags.XML_SCHEMA - compatibility with XML Schema, corresponds to 'X'.</li>
 * </ul>
 * The work is done by {@link #compile(String,int)}, which receives both arguments unchanged.
 * @param regex the Perl5-compatible regular expression string.
 * @param flags a bitwise OR of the REFlags constants listed above.
 * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
 * @see REFlags
 */
public Pattern(String regex, int flags) throws PatternSyntaxException{
    compile(regex, flags);
}
/*
//java.util.regex.* compatibility
public static Pattern compile(String regex,int flags) throws PatternSyntaxException{
Pattern p=new Pattern();
p.compile(regex,flags);
return p;
}
*/
/**
 * Builds this pattern from an expression.
 * Stores <code>regex</code> as the text later returned by {@link #toString()} and then hands
 * the expression, the flags and this instance over to <code>Term.makeTree</code>.
 * @param regex the expression text
 * @param flags the flags value forwarded to <code>Term.makeTree</code>
 * @exception PatternSyntaxException declared by this method;
 *            NOTE(review): presumably raised by Term.makeTree on malformed input - confirm in Term
 */
protected void compile(String regex,int flags) throws PatternSyntaxException{
    this.stringRepr=regex;
    Term.makeTree(regex, flags, this);
}
/**
 * Tells how many capturing groups this expression includes.
 * @return the current value of the <code>memregs</code> field
 */
public int groupCount(){
    return this.memregs;
}
/**
 * Looks up the numeric id of a named group.
 * The name is used as a key into <code>namedGroupMap</code>.
 * @param name the group name to look up
 * @return the id stored for the name, or <code>null</code> if no such name is found.
 * @see MatchResult#group(java.lang.String)
 * @see MatchResult#isCaptured(java.lang.String)
 */
public Integer groupId(String name){
    Object id=namedGroupMap.get(name);
    return (Integer)id;
}
/**
 * A shorthand for <code>Pattern.matcher(String).matches()</code>.
 * A new matcher is created for every call.
 * @param s the target
 * @return true if the entire target matches the pattern
 * @see Matcher#matches()
 * @see Matcher#matches(String)
 */
public boolean matches(String s){
    Matcher whole=matcher(s);
    return whole.matches();
}
/**
 * A shorthand for <code>Pattern.matcher(String).matchesPrefix()</code>.
 * A new matcher is created for every call.
 * @param s the target
 * @return the result of <code>matchesPrefix()</code>: true if the entire target matches the beginning of the pattern
 * @see Matcher#matchesPrefix()
 */
public boolean startsWith(String s){
    Matcher prefix=matcher(s);
    return prefix.matchesPrefix();
}
/**
 * Returns a targetless matcher bound to this pattern.
 * No target is set here, so don't forget to supply one before searching.
 * @return a new Matcher created from this pattern
 */
public Matcher matcher(){
    Matcher targetless=new Matcher(this);
    return targetless;
}
/**
 * Returns a matcher for a specified string.
 * @param s the text handed to <code>Matcher.setTarget(String)</code>
 * @return a new Matcher for this pattern with <code>s</code> set as its target
 */
public Matcher matcher(String s){
    Matcher result=new Matcher(this);
    result.setTarget(s);
    return result;
}
/**
 * Returns a matcher for a specified region of a char array.
 * All three arguments are forwarded unchanged to <code>Matcher.setTarget(char[],int,int)</code>.
 * @param data the characters containing the region
 * @param start the second argument of <code>setTarget</code>
 * @param end the third argument of <code>setTarget</code>.
 *        NOTE(review): matcher(MatchResult,int) passes a length in this same position,
 *        so this may be a length rather than an end offset - confirm against Matcher.
 * @return a new Matcher for this pattern with the region set as its target
 */
public Matcher matcher(char[] data,int start,int end){
    Matcher result=new Matcher(this);
    result.setTarget(data, start, end);
    return result;
}
/**
 * Returns a matcher whose target is a group of an existing match result.
 * If <code>res</code> is itself a Matcher, the target is taken over through
 * <code>Matcher.setTarget(Matcher,int)</code>; for any other MatchResult the target is the
 * region of <code>res.targetChars()</code> that starts at
 * <code>res.start(groupId)+res.targetStart()</code> and spans <code>res.length(groupId)</code>.
 * @param res the match result that supplies the target
 * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET.
 * @return a new Matcher for this pattern
 */
public Matcher matcher(MatchResult res,int groupId){
    Matcher result=new Matcher(this);
    if(!(res instanceof Matcher)){
        // generic result: address the group's region inside the underlying char array
        char[] chars=res.targetChars();
        int from=res.start(groupId)+res.targetStart();
        int len=res.length(groupId);
        result.setTarget(chars, from, len);
        return result;
    }
    result.setTarget((Matcher)res, groupId);
    return result;
}
/**
 * Returns a matcher whose target is a named group of an existing match result.
 * The name is resolved through <code>res.pattern().groupId(groupName)</code>, i.e. against the
 * pattern that produced <code>res</code>, not against this pattern; the call is then delegated
 * to {@link #matcher(MatchResult,int)}.
 * @param res the match result that supplies the target
 * @param groupName the symbolic name of the group to use as a target
 * @return a new Matcher for this pattern
 * @exception IllegalArgumentException if there is no group with such name
 */
public Matcher matcher(MatchResult res,String groupName){
    Integer id=res.pattern().groupId(groupName);
    if(id==null){
        throw new IllegalArgumentException("group not found:"+groupName);
    }
    return matcher(res, id.intValue());
}
/**
 * Returns a matcher taking a text stream as target.
 * Note that this is not a true POSIX-style stream matching, i.e. the whole length of the text
 * is preliminary read and stored in a char array.
 * Both arguments are forwarded unchanged to <code>Matcher.setTarget(Reader,int)</code>.
 * @param text a text stream
 * @param length the length to read from a stream; if <code>length</code> is <code>-1</code>, the whole stream is read in.
 * @return a new Matcher for this pattern
 * @exception IOException indicates an IO problem
 * @exception OutOfMemoryError if a stream is too lengthy
 */
public Matcher matcher(Reader text,int length)throws IOException{
    Matcher result=new Matcher(this);
    result.setTarget(text, length);
    return result;
}
/**
 * Returns a replacer of a pattern by specified perl-like expression.
 * Such replacer will substitute all occurrences of a pattern by an evaluated expression
 * ("$&amp;" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc).
 * Example:
 * <pre>
 * String text="The quick brown fox jumped over the lazy dog";
 * Pattern word=new Pattern("\\w+");
 * System.out.println(word.replacer("[$&amp;]").replace(text));
 * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]"
 * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)");
 * System.out.println(swap.replacer("$3$2$1").replace(text));
 * //prints "The quick brown dog jumped over the lazy fox"
 * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)");
 * System.out.println(scramble.replacer("$3$2$1").replace(text));
 * //prints "quick The fox brown over jumped lazy the dog"
 * </pre>
 * @param expr a perl-like expression, the "$&amp;" and "${&amp;}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo.
 * @return a new Replacer built from this pattern and <code>expr</code>
 * @see Replacer
 */
public Replacer replacer(String expr){
    Replacer result=new Replacer(this, expr);
    return result;
}
/**
 * Returns a replacer that will substitute all occurrences of a pattern
 * through applying a user-defined substitution model.
 * @param model a Substitution object which is in charge of match substitution
 * @return a new Replacer built from this pattern and <code>model</code>
 * @see Replacer
 */
public Replacer replacer(Substitution model){
    Replacer result=new Replacer(this, model);
    return result;
}
/**
 * Tokenizes a text by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as <code>new RETokenizer(Pattern,String)</code>.
 * @param text the text passed to the tokenizer
 * @return a new RETokenizer built from this pattern and <code>text</code>
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(jregex.Pattern,java.lang.String)
 */
public RETokenizer tokenizer(String text){
    RETokenizer result=new RETokenizer(this, text);
    return result;
}
/**
 * Tokenizes a specified region by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as <code>new RETokenizer(Pattern,char[],int,int)</code>; the three region
 * arguments are forwarded unchanged.
 * @param data the characters containing the region
 * @param off the third constructor argument of RETokenizer
 * @param len the fourth constructor argument of RETokenizer
 * @return a new RETokenizer built from this pattern and the given region
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(jregex.Pattern,char[],int,int)
 */
public RETokenizer tokenizer(char[] data,int off,int len){
    RETokenizer result=new RETokenizer(this, data, off, len);
    return result;
}
/**
 * Tokenizes the content of a text stream by occurrences of the pattern.
 * Note that a series of adjacent matches are regarded as a single separator.
 * The same as <code>new RETokenizer(Pattern,Reader,int)</code>; both stream arguments
 * are forwarded unchanged.
 * @param in the text stream passed to the tokenizer
 * @param length the third constructor argument of RETokenizer
 * @return a new RETokenizer built from this pattern and the stream
 * @exception IOException declared by this method; propagated from the RETokenizer constructor
 * @see RETokenizer
 * @see RETokenizer#RETokenizer(jregex.Pattern,java.io.Reader,int)
 */
public RETokenizer tokenizer(Reader in,int length) throws IOException{
    RETokenizer result=new RETokenizer(this, in, length);
    return result;
}
/**
 * Returns the expression text stored in <code>stringRepr</code>,
 * i.e. the string this pattern was built from.
 */
public String toString(){
    return this.stringRepr;
}
/**
 * Returns a more or less readable representation of a bytecode for the pattern.
 * @return whatever <code>toStringAll()</code> yields for the <code>root</code> term
 */
public String toString_d(){
    Term entry=root;
    return entry.toStringAll();
}
static int parseFlags(String flags)throws PatternSyntaxException{
boolean enable=true;
int len=flags.length();
int result=DEFAULT;
for(int i=0;i