All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.oro.text.awk.AwkCompiler Maven / Gradle / Ivy

There is a newer version: 1.2
Show newest version
/*
 * $Id: AwkCompiler.java,v 1.10 2003/11/07 20:16:24 dfs Exp $
 *
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache" 
 *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
 *    name, without prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */


package org.apache.oro.text.awk;

import org.apache.oro.text.regex.*;

/**
 * The AwkCompiler class is used to create compiled regular expressions
 * conforming to the Awk regular expression syntax.  It generates
 * AwkPattern instances upon compilation to be used in conjunction
 * with an AwkMatcher instance.  AwkMatcher finds true leftmost-longest
 * matches, so you must take care with how you formulate your regular
 * expression to avoid matching more than you really want.
 * 

* The supported regular expression syntax is a superset of traditional AWK, * but NOT to be confused with GNU AWK or other AWK variants. Additionally, * this AWK implementation is DFA-based and only supports 8-bit ASCII. * Consequently, these classes can perform very fast pattern matches in * most cases. *

* This is the traditional Awk syntax that is supported: *

    *
  • Alternatives separated by | *
  • Quantified atoms *
    *
    *
    Match 0 or more times. *
    +
    Match 1 or more times. *
    ?
    Match 0 or 1 times. *
    *
  • Atoms *
      *
    • regular expression within parentheses *
    • a . matches everything including newline *
    • a ^ is a null token matching the beginning of a string * but has no relation to newlines (and is only valid at the * beginning of a regex; this differs from traditional awk * for the sake of efficiency in Java). *
    • a $ is a null token matching the end of a string but has * no relation to newlines (and is only valid at the * end of a regex; this differs from traditional awk for the * sake of efficiency in Java). *
    • Character classes (e.g., [abcd]) and ranges (e.g. [a-z]) *
        *
      • Special backslashed characters work within a character class *
      *
    • Special backslashed characters *
      *
      \b
      backspace *
      \n
      newline *
      \r
      carriage return *
      \t
      tab *
      \f
      formfeed *
      \xnn
      hexadecimal representation of character *
      \nn or \nnn
      octal representation of character *
      Any other backslashed character matches itself *
      *
*

* This is the extended syntax that is supported: *

    *
  • Quantified atoms *
    *
    {n,m}
    Match at least n but not more than m times. *
    {n,}
    Match at least n times. *
    {n}
    Match exactly n times. *
    *
  • Atoms *
      *
    • Special backslashed characters *
      *
      \d
      digit [0-9] *
      \D
      non-digit [^0-9] *
      \w
      word character [0-9a-z_A-Z] *
      \W
      a non-word character [^0-9a-z_A-Z] *
      \s
      a whitespace character [ \t\n\r\f] *
      \S
      a non-whitespace character [^ \t\n\r\f] *
      \cD
      matches the corresponding control character *
      \0
      matches null character *
      *
* * @version @version@ * @since 1.0 * @see org.apache.oro.text.regex.PatternCompiler * @see org.apache.oro.text.regex.MalformedPatternException * @see AwkPattern * @see AwkMatcher */ public final class AwkCompiler implements PatternCompiler { /** * The default mask for the {@link #compile compile} methods. * It is equal to 0 and indicates no special options are active. */ public static final int DEFAULT_MASK = 0; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should be case insensitive. */ public static final int CASE_INSENSITIVE_MASK = 0x0001; /** * A mask passed as an option to the {@link #compile compile} methods * to indicate a compiled regular expression should treat input as having * multiple lines. This option affects the interpretation of * the . metacharacters. When this mask is used, * the . metacharacter will not match newlines. The default * behavior is for . to match newlines. */ public static final int MULTILINE_MASK = 0x0002; static final char _END_OF_INPUT = '\uFFFF'; // All of these are initialized by the compile() and _parse() methods // so there is no need or use in initializing them in the constructor // although this may change in the future. private boolean __inCharacterClass, __caseSensitive, __multiline; private boolean __beginAnchor, __endAnchor; private char __lookahead; private int __position, __bytesRead, __expressionLength; private char[] __regularExpression; private int __openParen, __closeParen; // We do not currently need to initialize any state, but keep this // commented out as a reminder that we may have to at some point. //public AwkCompiler() { } private static boolean __isMetachar(char token) { return (token == '*' || token == '?' 
|| token == '+' || token == '[' || token == ']' || token == '(' || token == ')' || token == '|' || /* token == '^' || token == '$' || */ token == '.'); } static boolean _isWordCharacter(char token) { return ((token >= 'a' && token <= 'z') || (token >= 'A' && token <= 'Z') || (token >= '0' && token <= '9') || (token == '_')); } static boolean _isLowerCase(char token){ return (token >= 'a' && token <= 'z'); } static boolean _isUpperCase(char token){ return (token >= 'A' && token <= 'Z'); } static char _toggleCase(char token){ if(_isUpperCase(token)) return (char)(token + 32); else if(_isLowerCase(token)) return (char)(token - 32); return token; } private void __match(char token) throws MalformedPatternException { if(token == __lookahead){ if(__bytesRead < __expressionLength) __lookahead = __regularExpression[__bytesRead++]; else __lookahead = _END_OF_INPUT; } else throw new MalformedPatternException("token: " + token + " does not match lookahead: " + __lookahead + " at position: " + __bytesRead); } private void __putback() { if(__lookahead != _END_OF_INPUT) --__bytesRead; __lookahead = __regularExpression[__bytesRead - 1]; } private SyntaxNode __regex() throws MalformedPatternException { SyntaxNode left; left = __branch(); if(__lookahead == '|') { __match('|'); return (new OrNode(left, __regex())); } return left; } private SyntaxNode __branch() throws MalformedPatternException { CatNode current; SyntaxNode left, root; left = __piece(); if(__lookahead == ')'){ if(__openParen > __closeParen) return left; else throw new MalformedPatternException("Parse error: close parenthesis" + " without matching open parenthesis at position " + __bytesRead); } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT) return left; root = current = new CatNode(); current._left = left; while(true) { left = __piece(); if(__lookahead == ')'){ if(__openParen > __closeParen){ current._right = left; break; } else throw new MalformedPatternException("Parse error: close parenthesis" + " 
without matching open parenthesis at position " + __bytesRead); } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT){ current._right = left; break; } current._right = new CatNode(); current = (CatNode)current._right; current._left = left; } return root; } private SyntaxNode __piece() throws MalformedPatternException { SyntaxNode left; left = __atom(); switch(__lookahead){ case '+' : __match('+'); return (new PlusNode(left)); case '?' : __match('?'); return (new QuestionNode(left)); case '*' : __match('*'); return (new StarNode(left)); case '{' : return __repetition(left); } return left; } // if numChars is 0, this means match as many as you want private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits) throws MalformedPatternException { int num, digits = 0; StringBuffer buf; // We don't expect huge numbers, so an initial buffer of 4 is fine. buf = new StringBuffer(4); while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){ buf.append((char)__lookahead); __match(__lookahead); ++digits; } if(digits < minDigits || digits > maxDigits) throw new MalformedPatternException( "Parse error: unexpected number of digits at position " + __bytesRead); try { num = Integer.parseInt(buf.toString(), radix); } catch(NumberFormatException e) { throw new MalformedPatternException("Parse error: numeric value at " + "position " + __bytesRead + " is invalid"); } return num; } private SyntaxNode __repetition(SyntaxNode atom) throws MalformedPatternException { int min, max, startPosition[]; SyntaxNode root = null; CatNode catNode; __match('{'); min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE); startPosition = new int[1]; startPosition[0] = __position; if(__lookahead == '}'){ // Match exactly min times. Concatenate the atom min times. __match('}'); if(min == 0) throw new MalformedPatternException( "Parse error: Superfluous interval specified at position " + __bytesRead + ". 
Number of occurences was set to zero."); if(min == 1) return atom; root = catNode = new CatNode(); catNode._left = atom; while(--min > 1) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = atom._clone(startPosition); } else if(__lookahead == ','){ __match(','); if(__lookahead == '}') { // match at least min times __match('}'); if(min == 0) return new StarNode(atom); if(min == 1) return new PlusNode(atom); root = catNode = new CatNode(); catNode._left = atom; while(--min > 0) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = new StarNode(atom._clone(startPosition)); } else { // match at least min times and at most max times max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE); __match('}'); if(max < min) throw new MalformedPatternException("Parse error: invalid interval; " + max + " is less than " + min + " at position " + __bytesRead); if(max == 0) throw new MalformedPatternException( "Parse error: Superfluous interval specified at position " + __bytesRead + ". 
Number of occurences was set to zero."); if(min == 0) { if(max == 1) return new QuestionNode(atom); root = catNode = new CatNode(); atom = new QuestionNode(atom); catNode._left = atom; while(--max > 1) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = atom._clone(startPosition); } else if(min == max) { if(min == 1) return atom; root = catNode = new CatNode(); catNode._left = atom; while(--min > 1) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = atom._clone(startPosition); } else { int count; root = catNode = new CatNode(); catNode._left = atom; for(count=1; count < min; count++) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } atom = new QuestionNode(atom._clone(startPosition)); count = max-min; if(count == 1) catNode._right = atom; else { catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; while(--count > 1) { atom = atom._clone(startPosition); catNode._right = new CatNode(); catNode = (CatNode)catNode._right; catNode._left = atom; } catNode._right = atom._clone(startPosition); } } } } else throw new MalformedPatternException("Parse error: unexpected character " + __lookahead + " in interval at position " + __bytesRead); __position = startPosition[0]; return root; } private SyntaxNode __backslashToken() throws MalformedPatternException { SyntaxNode current; char token; int number; __match('\\'); if(__lookahead == 'x'){ __match('x'); // Parse a hexadecimal number current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2), __position++); } else if(__lookahead == 'c') { __match('c'); // Create a control character token = Character.toUpperCase(__lookahead); token = (char)(token > 63 ? 
token - 64 : token + 64); current = new TokenNode(token, __position++); __match(__lookahead); } else if(__lookahead >= '0' && __lookahead <= '9') { __match(__lookahead); if(__lookahead >= '0' && __lookahead <= '9'){ // We have an octal character or a multi-digit backreference. // Assume octal character for now. __putback(); number = __parseUnsignedInteger(10, 2, 3); number = Integer.parseInt(Integer.toString(number), 8); current = _newTokenNode((char)number, __position++); } else { // We have either \0, an escaped digit, or a backreference. __putback(); if(__lookahead == '0'){ // \0 matches the null character __match('0'); current = new TokenNode('\0', __position++); } else { // Either an escaped digit or backreference. number = Character.digit(__lookahead, 10); current = _newTokenNode(__lookahead, __position++); __match(__lookahead); } } } else if(__lookahead == 'b') { // Inside of a character class the \b means backspace, otherwise // it means a word boundary //if(__inCharacterClass) // \b always means backspace current = new TokenNode('\b', __position++); /* else current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN, position++); */ __match('b'); } /*else if(__lookahead == 'B' && !__inCharacterClass){ current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN, position++); __match('B'); } */ else { CharacterClassNode characterSet; token = __lookahead; switch(__lookahead){ case 'n' : token = '\n'; break; case 'r' : token = '\r'; break; case 't' : token = '\t'; break; case 'f' : token = '\f'; break; } switch(token) { case 'd' : characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'D' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); current = characterSet; break; case 'w' : characterSet = new CharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); 
characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 'W' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addTokenRange('0', '9'); characterSet._addTokenRange('a', 'z'); characterSet._addTokenRange('A', 'Z'); characterSet._addToken('_'); current = characterSet; break; case 's' : characterSet = new CharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; case 'S' : characterSet = new NegativeCharacterClassNode(__position++); characterSet._addToken(' '); characterSet._addToken('\f'); characterSet._addToken('\n'); characterSet._addToken('\r'); characterSet._addToken('\t'); current = characterSet; break; default : current = _newTokenNode(token, __position++); break; } __match(__lookahead); } return current; } private SyntaxNode __atom() throws MalformedPatternException { SyntaxNode current; if(__lookahead == '(') { __match('('); ++__openParen; current = __regex(); __match(')'); ++__closeParen; } else if(__lookahead == '[') current = __characterClass(); else if(__lookahead == '.') { CharacterClassNode characterSet; __match('.'); characterSet = new NegativeCharacterClassNode(__position++); if(__multiline) characterSet._addToken('\n'); current = characterSet; } else if(__lookahead == '\\') { current = __backslashToken(); } /*else if(__lookahead == '^') { current = new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++); __match('^'); } else if(__lookahead == '$') { current = new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++); __match('$'); } */ else if(!__isMetachar(__lookahead)) { current = _newTokenNode(__lookahead, __position++); __match(__lookahead); } else throw new MalformedPatternException("Parse error: unexpected character " + __lookahead + " at position " + __bytesRead); return current; } private 
SyntaxNode __characterClass() throws MalformedPatternException { char lastToken, token; SyntaxNode node; CharacterClassNode current; __match('['); __inCharacterClass = true; if(__lookahead == '^'){ __match('^'); current = new NegativeCharacterClassNode(__position++); } else current = new CharacterClassNode(__position++); while(__lookahead != ']' && __lookahead != _END_OF_INPUT) { if(__lookahead == '\\'){ node = __backslashToken(); --__position; // __backslashToken() (actually newTokenNode()) does not take care of // case insensitivity when __inCharacterClass is true. if(node instanceof TokenNode){ lastToken = ((TokenNode)node)._token; current._addToken(lastToken); if(!__caseSensitive) current._addToken(_toggleCase(lastToken)); } else { CharacterClassNode slash; slash = (CharacterClassNode)node; // This could be made more efficient by manipulating the // characterSet elements of the CharacterClassNodes but // for the moment, this is more clear. for(token=0; token < LeafNode._NUM_TOKENS; token++){ if(slash._matches(token)) current._addToken(token); } // A byproduct of this act is that when a '-' occurs after // a \d, \w, etc. it is not interpreted as a range and no // parse exception is thrown. // This is considered a feature and not a bug for now. continue; } } else { lastToken = __lookahead; current._addToken(__lookahead); if(!__caseSensitive) current._addToken(_toggleCase(__lookahead)); __match(__lookahead); } // In Perl, a - is a token if it occurs at the beginning // or end of the character class. Anywhere else, it indicates // a range. // A byproduct of this implementation is that if a '-' occurs // after the end of a range, it is interpreted as a '-' and no // exception is thrown. e.g., the second dash in [a-z-x] // This is considered a feature and not a bug for now. 
if(__lookahead == '-'){ __match('-'); if(__lookahead == ']'){ current._addToken('-'); break; } else if(__lookahead == '\\') { node = __backslashToken(); --__position; if(node instanceof TokenNode) token = ((TokenNode)node)._token; else throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); } else { token = __lookahead; __match(__lookahead); } if(token < lastToken) throw new MalformedPatternException( "Parse error: invalid range specified at position " + __bytesRead); current._addTokenRange(lastToken + 1, token); if(!__caseSensitive) current._addTokenRange(_toggleCase((char)(lastToken + 1)), _toggleCase(token)); } } __match(']'); __inCharacterClass = false; return current; } SyntaxNode _newTokenNode(char token, int position){ if(!__inCharacterClass && !__caseSensitive && (_isUpperCase(token) || _isLowerCase(token))){ CharacterClassNode node = new CharacterClassNode(position); node._addToken(token); node._addToken(_toggleCase(token)); return node; } return new TokenNode(token, position); } SyntaxTree _parse(char[] expression) throws MalformedPatternException { SyntaxTree tree; __openParen = __closeParen = 0; __regularExpression = expression; __bytesRead = 0; __expressionLength = expression.length; __inCharacterClass = false; __position = 0; __match(__lookahead); // Call match to read first input. 
if(__lookahead == '^') { __beginAnchor = true; __match(__lookahead); } if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') { --__expressionLength; __endAnchor = true; } if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) { CatNode root; root = new CatNode(); root._left = __regex(); // end marker root._right = new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++); tree = new SyntaxTree(root, __position); } else tree = new SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1); tree._computeFollowPositions(); return tree; } /** * Compiles an Awk regular expression into an AwkPattern instance that * can be used by an AwkMatcher object to perform pattern matching. *

* @param pattern An Awk regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. Currently the * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(char[] pattern, int options) throws MalformedPatternException { SyntaxTree tree; AwkPattern regexp; __beginAnchor = __endAnchor = false; __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); __multiline = ((options & MULTILINE_MASK) != 0); tree = _parse(pattern); regexp = new AwkPattern(new String(pattern), tree); regexp._options = options; regexp._hasBeginAnchor = __beginAnchor; regexp._hasEndAnchor = __endAnchor; return regexp; } /** * Compiles an Awk regular expression into an AwkPattern instance that * can be used by an AwkMatcher object to perform pattern matching. *

* @param pattern An Awk regular expression to compile. * @param options A set of flags giving the compiler instructions on * how to treat the regular expression. Currently the * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(String pattern, int options) throws MalformedPatternException { SyntaxTree tree; AwkPattern regexp; __beginAnchor = __endAnchor = false; __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0); __multiline = ((options & MULTILINE_MASK) != 0); tree = _parse(pattern.toCharArray()); regexp = new AwkPattern(pattern, tree); regexp._options = options; regexp._hasBeginAnchor = __beginAnchor; regexp._hasEndAnchor = __endAnchor; return regexp; } /** * Same as calling compile(pattern, AwkCompiler.DEFAULT_MASK); *

* @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(char[] pattern) throws MalformedPatternException { return compile(pattern, DEFAULT_MASK); } /** * Same as calling compile(pattern, AwkCompiler.DEFAULT_MASK); *

* @param pattern A regular expression to compile. * @return A Pattern instance constituting the compiled regular expression. * This instance will always be an AwkPattern and can be reliably * be casted to an AwkPattern. * @exception MalformedPatternException If the compiled expression * is not a valid Awk regular expression. */ public Pattern compile(String pattern) throws MalformedPatternException { return compile(pattern, DEFAULT_MASK); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy