All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.eBus.util.regex.RegexLexer Maven / Gradle / Ivy

There is a newer version: 7.4.0
Show newest version
//
// Copyright 2001 - 2010 Charles W. Rapp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package net.sf.eBus.util.regex;

import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Logger;
import net.sf.eBus.util.TernarySearchTree;

/**
 * LIKE conditions use a separate parser for their regular
 * expression values. Regular expressions are parsed into
 * their constituent components. This parser supports the
 * SQL regular expression syntax which is limited to:
 * 
    *
  • * Literals: The argument string's current location must * match exactly this literal character. *
  • *
  • * []: The argument string's current character must match one * of the characters in this set. The set's form may be * either [abcd] or [a-d] but not a combination of the forms. *
  • *
  • * [^]: The argument string's current character must * not match any character in this set. *
  • *
  • * .: Matches the argument string's current character. Fails * only if at the argument string's end. *
  • *
  • * *: Matches zero or more of the argument string's * current characters. This is a reluctant quantifier. This * means the regular expression "%abc%" matches "12abc3". * A greedy * quantifier would not because the first "%" * in the regular expression would greedily consume the * entire argument string. *
  • *
  • * +: Matches one or more the argument string's * current characters. This is a reluctant quantifier. *
  • *
  • * {m, n}: *
  • *
* The reason for parsing the regular expression are two-fold: *
    *
  1. * To convert the SQL syntax to the java.util.regex.Pattern * syntax. *
  2. *
  3. * To support {@link TernarySearchTree#entrySet(Pattern)}. *
  4. *
* * @author Charles Rapp */ @SuppressWarnings("unchecked") /* package */ final class RegexLexer { //--------------------------------------------------------------- // Member data. // //----------------------------------------------------------- // Constants. // // There are four token types: not set, done failed, // done success and regex component. /* package */ static final int TOKEN_NOT_SET = 0; /* package */ static final int DONE_FAILED = 1; /* package */ static final int DONE_SUCCESS = 2; /* package */ static final int REGEX_COMPONENT = 3; /* package */ static final int TOKEN_COUNT = REGEX_COMPONENT + 1; /* package */ static final int SIZE_NOT_SET = -2; private static final int MIN_ASCII_CHAR = 0; private static final int MAX_ASCII_CHAR = 128; private static final int MAX_OCTAL_CHAR = 255; // Predefined character sets. /* package */ static final int DIGIT_FIRST = '0'; /* package */ static final int DIGIT_LAST = '9'; /* package */ static final int LC_ALPHA_FIRST = 'a'; /* package */ static final int LC_ALPHA_LAST = 'z'; /* package */ static final int UC_ALPHA_FIRST = 'A'; /* package */ static final int UC_ALPHA_LAST = 'Z'; /* package */ static final int[] WHITESPACE = {' ', '\t', '\n', 0x0B, '\f', '\r'}; //----------------------------------------------------------- // Statics. // /** * Logging subsystem interface. */ private static final Logger sLogger = Logger.getLogger(RegexLexer.class.getName()); private static String[] sTokenTypeNames; private static Method[] sTransMethod; static { String transName = ""; sTokenTypeNames = new String[TOKEN_COUNT]; sTokenTypeNames[RegexLexer.TOKEN_NOT_SET] = "TOKEN_NOT_SET"; sTokenTypeNames[RegexLexer.DONE_FAILED] = "DONE_FAILED"; sTokenTypeNames[RegexLexer.DONE_SUCCESS] = "DONE_SUCCESS"; sTokenTypeNames[RegexLexer.REGEX_COMPONENT] = "REGEX_COMPONENT"; sTransMethod = new Method[RegexLexer.MAX_ASCII_CHAR + 1]; try { final Class fsmClass = RegexLexerContext.class; final Class[] paramTypes = new Class[1]; int i; Method alpha; Method digit; paramTypes[0] = int.class; transName = "alpha"; alpha = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "digit"; digit = fsmClass.getDeclaredMethod( transName, paramTypes); // Set all transitions to alpha. This will be // overriden later. for (i = MIN_ASCII_CHAR; i < MAX_ASCII_CHAR; ++i) { sTransMethod[i] = alpha; } // Now set digits. for (i = '0'; i <= '9'; ++i) { sTransMethod[i] = digit; } // Now set the special characters. transName = "asterisk"; sTransMethod['*'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "plus"; sTransMethod['+'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "question_mark"; sTransMethod['?'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "period"; sTransMethod['.'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "left_bracket"; sTransMethod['['] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "right_bracket"; sTransMethod[']'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "up_arrow"; sTransMethod['^'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "dash"; sTransMethod['-'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "left_brace"; sTransMethod['{'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "right_brace"; sTransMethod['}'] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "comma"; sTransMethod[','] = fsmClass.getDeclaredMethod( transName, paramTypes); transName = "backslash"; sTransMethod['\\'] = fsmClass.getDeclaredMethod( transName, paramTypes); } catch (NoSuchMethodException | SecurityException jex) { sLogger.severe( String.format( "INITIALIZATION ERROR! No such method as net.sf.eBus.util.regex.LexerContext.%s(int).%n", transName)); } } // end of static //----------------------------------------------------------- // Locals. // // The lexer's state map. private final RegexLexerContext mFsm; // Place the string to be parsed into this reader and // then read in characters from this reader. private final StringReader mInput; // We are now at this character in the input. private int mIndex; // Use only one token object. private final Token mToken; // Count the number of tokens we generate. private int mTokenCount; // Collect the current token string in this buffer before // generating a literal or character set token from it. private final StringBuilder mTokenBuffer; // This flag is set to true when a token has been found or // an error occurs. private boolean mStopFlag; //----------------------------------------------------------- // The following data members are used to store information // for the various component types before instantiating the // Component subclass. // // The target component type: LITERAL, CHARACTER_SET or // ANY_CHAR. private int mcComponentType; // Literal: store the literal character. private int mcLiteral; // Character Set: set true each character's bit and whether // the set is negated or not. Alse store the *potentially* // first character in a set range. private final SortedSet mcCharacterSet; private boolean mcNegatedFlag; private int mcFirstCharacter; // Range: store incoming digits in the buffer and the // minimum and maximum range values. private final StringBuilder mcNumber; private int mcMinimumSize; private int mcMaximumSize; // // Component data. //----------------------------------------------------------- //--------------------------------------------------------------- // Member methods. // //----------------------------------------------------------- // Constructors. // /** * Creates a regular expression lexer which passes tokens * to the specified parser. The lexer drives the parsing. * @param s find tokens in this string. */ /* package */ RegexLexer(final String s) { mFsm = new RegexLexerContext(this); mInput = new StringReader(s); mIndex = -1; mToken = new Token(); mTokenCount = 0; mTokenBuffer = new StringBuilder(); mStopFlag = false; mcComponentType = 0; mcLiteral = -1; mcCharacterSet = new TreeSet<>(); mcNegatedFlag = false; mcFirstCharacter = -1; mcNumber = new StringBuilder(); mcMinimumSize = SIZE_NOT_SET; mcMaximumSize = SIZE_NOT_SET; // _fsm.setDebugFlag(true); } // end of RegexLexer(String) // // end of Constructors. //----------------------------------------------------------- /** * Finds tokens in the input string and passes the tokens to * the regular expression parser. * @return {@code true} if {@code re} was * successfully tokenized and {@code false} otherwise. */ /* package */ Token nextToken() { // Keep reading until told to stop or the end-of-string // is reached. try { int c; mStopFlag = false; while (!mStopFlag) { // If this is the end of the string, then we // have successfully completed. if ((c = readChar()) < 0) { mFsm.EOS(); } else if (c >= MAX_ASCII_CHAR) { // Place this character into the token buffer // before issuing the transition. mTokenBuffer.append((char) c); mFsm.alpha(c); } else { // Place this character into the token buffer // before issuing the transition. mTokenBuffer.append((char) c); sTransMethod[c].invoke(mFsm, c); } } } catch (IOException | IllegalAccessException jex) { badToken(jex); } catch (InvocationTargetException inex) { badToken(inex.getCause()); } return (mToken); } // end of nextToken() // Returns the next character from the string. /* package */ int readChar() throws IOException { // Mark the current position in case we have to // unread. mInput.mark(1); ++mIndex; return (mInput.read()); } // end of readChar() // Unread the current character by resetting to the marked // position. /* package */ void unreadChar() { try { final int bufferSize = mTokenBuffer.length(); mInput.reset(); --mIndex; // Remove this character from the end of the buffer. if (bufferSize > 0) { mTokenBuffer.deleteCharAt(bufferSize - 1); } } catch (IOException ioex) { // Oh, please! } } // end of unreadChar() // Closes the string reader to prevent further reading. /* package */ void closeInput() { mInput.close(); } // end of closeInput() //----------------------------------------------------------- // State Machine Actions // /** * Returns the current token count. * @return the current token count. */ /* package */ int tokenCount() { return (mTokenCount); } // end of tokenCount() // Clears out all token data and sets the new token type // and component type. /* package */ void startToken(final int componentType) { mToken.clear(); mToken.type(REGEX_COMPONENT); mcComponentType = componentType; mTokenBuffer.delete(0, mTokenBuffer.length()); mcLiteral = -1; mcCharacterSet.clear(); mcNegatedFlag = false; mcFirstCharacter = -1; mcNumber.delete(0, mcNumber.length()); mcMinimumSize = SIZE_NOT_SET; mcMaximumSize = SIZE_NOT_SET; } // end of startToken(int) // Creates a regular expression component token. /* package */ void endToken(final int minSize, final int maxSize) { mToken.value(mTokenBuffer.toString()); // Instantiates the appropriate regular expression // component. switch (mcComponentType) { case Component.LITERAL: mToken.regexComponent( new Literal((char) mcLiteral, minSize, maxSize, mTokenCount)); break; case Component.CHARACTER_SET: final SortedSet setCopy = new TreeSet<>(mcCharacterSet); mToken.regexComponent( new CharacterSet( setCopy, mcNegatedFlag, minSize, maxSize, mTokenCount)); break; default: mToken.regexComponent( new AnyChar(minSize, maxSize, mTokenCount)); break; } ++mTokenCount; // Stop. We have found a token. mStopFlag = true; } // end of endToken(int, int) // Returns an error token which should cause the parsing // to terminate. /* package */ void badToken(final String message) { mToken.type(DONE_FAILED); mToken.value(mTokenBuffer.toString()); mToken.errorMessage(message); mToken.index(mIndex); mStopFlag = true; } // end of badToken(String) // Returns an error token with the exception set. /* package */ void badToken(final Throwable cause) { mToken.type(DONE_FAILED); mToken.value(mTokenBuffer.toString()); mToken.errorMessage(cause.getMessage()); mToken.cause(cause); mToken.index(mIndex); mStopFlag = true; } // end of badToken(Throwable) // The parsing has successfully completed. /* package */ void done() { mToken.type(DONE_SUCCESS); mToken.regexComponent(null); mToken.value(null); mToken.errorMessage(null); mToken.cause(null); mStopFlag = true; } // end of done() // Stores the literal component's value. /* package */ void literal(final int c) { mcLiteral = c; } // end of literal(int) // Stores the character set negation flag. /* package */ void negatedFlag(final boolean flag) { mcNegatedFlag = flag; } // end of negatedFlag(boolean) // Returns the character set size. /* package */ int characterSetSize() { return (mcCharacterSet.size()); } // end of characterSetSize() // Sets the single character's bit in the character bit set. /* package */ void addToSet(final int c) { mcCharacterSet.add((char) c); } // end of addToSet(int) // Sets a range of characters. /* package */ void addToSet(final int first, final int last) { int index; for (index = first; index <= last; ++index) { mcCharacterSet.add((char) index); } } // end of addToSet(int, int) // Adds multiple characters. /* package */ void addToSet(final int[] cs) { for (int c: cs) { mcCharacterSet.add((char) c); } } // end of addToSet(int[]) /* package */ int firstCharacter() { return (mcFirstCharacter); } // end of firstCharacter() /* package */ void firstCharacter(final int c) { mcFirstCharacter = c; } // end of firstCharacter(int) /* package */ int numberLength() { return (mcNumber.length()); } // end of numberLength() /* package */ String number() { return (mcNumber.toString()); } // end of number() /* package */ void appendNumber(final int c) { mcNumber.append((char) c); } // end of appendNumber(int) /* package */ void clearNumber() { mcNumber.delete(0, mcNumber.length()); } // end of clearNumber() /* package */ int minimumSize() { return (mcMinimumSize); } // end of minimumSize() /* package */ void minimumSize(final int size) { mcMinimumSize = size; } // end of mimimumSize(int) /* package */ boolean minimumSize(final String s) { boolean retcode = true; try { mcMinimumSize = Integer.parseInt(s); } catch (NumberFormatException formex) { retcode = false; mcMinimumSize = SIZE_NOT_SET; } return (retcode); } // end of minimumSize(String) /* package */ int maximumSize() { return (mcMaximumSize); } // end of maximumSize() /* package */ boolean maximumSize(final String s) { boolean retcode = true; try { mcMaximumSize = Integer.parseInt(s); } catch (NumberFormatException formex) { retcode = false; mcMaximumSize = SIZE_NOT_SET; } return (retcode); } // end of maximumSize(String) /* package */ boolean isOctal() { final int i = Integer.parseInt(mTokenBuffer.substring(2), 8); return (i <= MAX_OCTAL_CHAR); } // end of isOctal() /* package */ char octalChar() { char retval; try { retval = (char) Integer.parseInt( mTokenBuffer.substring(2), 8); } catch (NumberFormatException jex) { retval = Character.MIN_VALUE; } return (retval); } // end of octalChar() /* package */ char hexChar() { char retval; try { retval = (char) Integer.parseInt( mTokenBuffer.substring(2), 16); } catch (NumberFormatException jex) { retval = Character.MIN_VALUE; } return (retval); } // end of hexChar() // // end of State Machine Actions //----------------------------------------------------------- //--------------------------------------------------------------- // Inner classes. // // Token information is placed in this class and the Token // instance passed to the parser. public static final class Token { //----------------------------------------------------------- // Member data. // //------------------------------------------------------- // Locals. // private int mType; private Component mRegexComponent; private String mValue; private String mErrorMessage; private Throwable mCause; private int mIndex; //----------------------------------------------------------- // Member methods. // //------------------------------------------------------- // Constructors. // public Token() { mType = TOKEN_NOT_SET; mRegexComponent = null; mValue = null; mErrorMessage = null; mCause = null; mIndex = -1; } // end of Token() // // end of Constructors. //------------------------------------------------------- //------------------------------------------------------- // Get methods. // public int type() { return (mType); } // end of type() public Component regexComponent() { return (mRegexComponent); } // end of regexComponent() public String value() { return (mValue); } // end of value() public String errorMessage() { return (mErrorMessage); } // end of errorMessage() public Throwable cause() { return (mCause); } // end of cause() public int index() { return (mIndex); } // end of index() // // end of Get methods. //------------------------------------------------------- //------------------------------------------------------- // Set methods. // public void type(final int type) { mType = type; } // end fo type(int) public void regexComponent(final Component component) { mRegexComponent = component; } // end of regexComponent(Component) public void value(final String value) { mValue = value; } // end of value(String) public void errorMessage(final String message) { mErrorMessage = message; } // end of errorMessage(String) public void cause(final Throwable cause) { mCause = cause; } // end of cause(Throwable) public void index(final int index) { mIndex = index; } // end of index(int) // // end of Set methods. //------------------------------------------------------- public void clear() { mType = TOKEN_NOT_SET; mRegexComponent = null; mValue = null; mErrorMessage = null; mCause = null; mIndex = -1; } // end of clear() } // end of class Token } // end of class RegexLexer




© 2015 - 2024 Weber Informatics LLC | Privacy Policy