net.sf.eBus.util.regex.RegexLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of util Show documentation
There is a newer version: 7.4.0
//
// Copyright 2001 - 2010 Charles W. Rapp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package net.sf.eBus.util.regex;

import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Logger;
import net.sf.eBus.util.TernarySearchTree;

/**
 * LIKE conditions use a separate parser for their regular
 * expression values. Regular expressions are parsed into
 * their constituent components. This parser supports the
 * SQL regular expression syntax which is limited to:
 * <ul>
 *   <li>
 *     Literals: The argument string's current location must
 *     match exactly this literal character.
 *   </li>
 *   <li>
 *     {@code []}: The argument string's current character must
 *     match one of the characters in this set. The set's form
 *     may be either {@code [abcd]} or {@code [a-d]} but not a
 *     combination of the forms.
 *   </li>
 *   <li>
 *     {@code [^]}: The argument string's current character must
 *     not match any character in this set.
 *   </li>
 *   <li>
 *     {@code .}: Matches the argument string's current
 *     character. Fails only if at the argument string's end.
 *   </li>
 *   <li>
 *     {@code *}: Matches zero or more of the argument string's
 *     current characters. This is a reluctant quantifier. This
 *     means the regular expression "%abc%" matches "12abc3".
 *     A greedy * quantifier would not because the first "%"
 *     in the regular expression would greedily consume the
 *     entire argument string.
 *   </li>
 *   <li>
 *     {@code +}: Matches one or more of the argument string's
 *     current characters. This is a reluctant quantifier.
 *   </li>
 *   <li>
 *     {@code {m, n}}: Matches at least <i>m</i> and at most
 *     <i>n</i> of the argument string's current characters.
 *   </li>
 * </ul>
 * The reasons for parsing the regular expression are two-fold:
 * <ol>
 *   <li>
 *     To convert the SQL syntax to the java.util.regex.Pattern
 *     syntax.
 *   </li>
 *   <li>
 *     To support {@link TernarySearchTree#entrySet(Pattern)}.
 *   </li>
 * </ol>
 *
 * @author Charles Rapp
 */

@SuppressWarnings("unchecked")
/* package */ final class RegexLexer
{
//---------------------------------------------------------------
// Member data.
//

    //-----------------------------------------------------------
    // Constants.
    //

    // There are four token types: not set, done failed,
    // done success and regex component. The values are
    // consecutive, zero-based integers so they can be used as
    // array indices.
    /* package */ static final int TOKEN_NOT_SET   = 0;
    /* package */ static final int DONE_FAILED     = 1;
    /* package */ static final int DONE_SUCCESS    = 2;
    /* package */ static final int REGEX_COMPONENT = 3;
    // Number of token types; sizes the token type name table.
    /* package */ static final int TOKEN_COUNT =
        REGEX_COMPONENT + 1;
    // Initial value of the minimum and maximum range sizes
    // before a size has been parsed.
    /* package */ static final int SIZE_NOT_SET    = -2;

    // Character code bounds. Note that MAX_ASCII_CHAR is used
    // as an *exclusive* upper bound: character codes at or
    // above it bypass the transition table and are treated as
    // "alpha" characters.
    private static final int MIN_ASCII_CHAR = 0;
    private static final int MAX_ASCII_CHAR = 128;
    // Largest value an octal escape may encode.
    private static final int MAX_OCTAL_CHAR = 255;

    // Predefined character sets.
    /* package */ static final int DIGIT_FIRST = '0';
    /* package */ static final int DIGIT_LAST = '9';
    /* package */ static final int LC_ALPHA_FIRST = 'a';
    /* package */ static final int LC_ALPHA_LAST = 'z';
    /* package */ static final int UC_ALPHA_FIRST = 'A';
    /* package */ static final int UC_ALPHA_LAST = 'Z';
    // Space, tab, newline, vertical tab (0x0B), form feed and
    // carriage return.
    /* package */ static final int[] WHITESPACE =
    {' ', '\t', '\n', 0x0B, '\f', '\r'};

    //-----------------------------------------------------------
    // Statics.
    //

    /**
     * Logging subsystem interface.
     */
    private static final Logger sLogger =
        Logger.getLogger(RegexLexer.class.getName());

    // Human-readable name of each token type, indexed by the
    // token type constant.
    private static final String[] sTokenTypeNames;

    // Maps a character code below MAX_ASCII_CHAR to the
    // RegexLexerContext transition method issued when that
    // character is read. Entries stay null if the reflective
    // lookup in the static initializer fails.
    private static final Method[] sTransMethod;

    static
    {
        sTokenTypeNames = new String[TOKEN_COUNT];
        sTokenTypeNames[TOKEN_NOT_SET] = "TOKEN_NOT_SET";
        sTokenTypeNames[DONE_FAILED] = "DONE_FAILED";
        sTokenTypeNames[DONE_SUCCESS] = "DONE_SUCCESS";
        sTokenTypeNames[REGEX_COMPONENT] = "REGEX_COMPONENT";

        sTransMethod = new Method[MAX_ASCII_CHAR + 1];

        final Class<?> fsmClass = RegexLexerContext.class;

        // The special characters and, at the same position, the
        // name of the transition each one triggers.
        final char[] specialChars =
        {
            '*', '+', '?', '.', '[', ']',
            '^', '-', '{', '}', ',', '\\'
        };
        final String[] specialNames =
        {
            "asterisk", "plus", "question_mark", "period",
            "left_bracket", "right_bracket", "up_arrow", "dash",
            "left_brace", "right_brace", "comma", "backslash"
        };

        // Tracks the method currently being looked up so that a
        // failure can report which transition is missing.
        String transName = "";

        try
        {
            transName = "alpha";
            final Method alpha =
                fsmClass.getDeclaredMethod(transName, int.class);

            transName = "digit";
            final Method digit =
                fsmClass.getDeclaredMethod(transName, int.class);

            // Default every character to the alpha transition.
            // Digits and special characters are overridden
            // below.
            for (int c = MIN_ASCII_CHAR; c < MAX_ASCII_CHAR; ++c)
            {
                sTransMethod[c] = alpha;
            }

            // Now set the digits.
            for (int c = DIGIT_FIRST; c <= DIGIT_LAST; ++c)
            {
                sTransMethod[c] = digit;
            }

            // Now set the special characters.
            for (int i = 0; i < specialChars.length; ++i)
            {
                transName = specialNames[i];
                sTransMethod[specialChars[i]] =
                    fsmClass.getDeclaredMethod(
                        transName, int.class);
            }
        }
        catch (NoSuchMethodException | SecurityException jex)
        {
            // Report the real context class name and keep the
            // exception so the failure can be diagnosed.
            sLogger.log(
                java.util.logging.Level.SEVERE,
                String.format(
                    "INITIALIZATION ERROR! No such method as %s.%s(int).",
                    fsmClass.getName(),
                    transName),
                jex);
        }
    } // end of static

    //-----------------------------------------------------------
    // Locals.
    //

    // The lexer's state map.
    private final RegexLexerContext mFsm;

    // Place the string to be parsed into this reader and
    // then read in characters from this reader.
    private final StringReader mInput;

    // We are now at this character in the input.
    private int mIndex;

    // Use only one token object.
    private final Token mToken;

    // Count the number of tokens we generate.
    private int mTokenCount;

    // Collect the current token string in this buffer before
    // generating a literal or character set token from it.
    private final StringBuilder mTokenBuffer;

    // This flag is set to true when a token has been found or
    // an error occurs.
    private boolean mStopFlag;

    //-----------------------------------------------------------
    // The following data members are used to store information
    // for the various component types before instantiating the
    // Component subclass.
    //

    // The target component type: LITERAL, CHARACTER_SET or
    // ANY_CHAR.
    private int mcComponentType;

    // Literal: store the literal character.
    private int mcLiteral;

    // Character Set: the characters collected so far and
    // whether the set is negated or not. Also store the
    // *potentially* first character in a set range.
    private final SortedSet<Character> mcCharacterSet;
    private boolean mcNegatedFlag;
    private int mcFirstCharacter;

    // Range: store incoming digits in the buffer and the
    // minimum and maximum range values.
    private final StringBuilder mcNumber;
    private int mcMinimumSize;
    private int mcMaximumSize;

    //
    // Component data.
    //-----------------------------------------------------------

//---------------------------------------------------------------
// Member methods.
//

    //-----------------------------------------------------------
    // Constructors.
    //

    /**
     * Creates a regular expression lexer over the given text.
     * Tokens are then retrieved one at a time by calling
     * {@link #nextToken()}.
     * @param s find tokens in this string.
     */
    /* package */ RegexLexer(final String s)
    {
        // State machine and character source.
        this.mFsm = new RegexLexerContext(this);
        this.mInput = new StringReader(s);

        // Token bookkeeping: nothing has been read yet.
        this.mToken = new Token();
        this.mTokenBuffer = new StringBuilder();
        this.mTokenCount = 0;
        this.mIndex = -1;
        this.mStopFlag = false;

        // Scratch data for the component under construction.
        this.mcCharacterSet = new TreeSet<>();
        this.mcNumber = new StringBuilder();
        this.mcComponentType = 0;
        this.mcLiteral = -1;
        this.mcFirstCharacter = -1;
        this.mcNegatedFlag = false;
        this.mcMinimumSize = SIZE_NOT_SET;
        this.mcMaximumSize = SIZE_NOT_SET;
    } // end of RegexLexer(String)

    //
    // end of Constructors.
    //-----------------------------------------------------------

    /**
     * Reads characters from the input and feeds them to the
     * state machine until a token has been found, the input is
     * exhausted or an error occurs.
     * @return the lexer's single {@link Token} instance. The
     * same object is returned by every call, so its contents
     * are overwritten by the next call. Check
     * {@link Token#type()} to tell a regular expression
     * component from the end-of-input and failure tokens.
     */
    /* package */ Token nextToken()
    {
        try
        {
            mStopFlag = false;

            // Keep reading until told to stop.
            while (!mStopFlag)
            {
                final int c = readChar();

                if (c < 0)
                {
                    // End of the string.
                    mFsm.EOS();
                }
                else
                {
                    // Place this character into the token buffer
                    // before issuing the transition.
                    mTokenBuffer.append((char) c);

                    if (c >= MAX_ASCII_CHAR)
                    {
                        // Outside the transition table: treat
                        // as an ordinary character.
                        mFsm.alpha(c);
                    }
                    else if (sTransMethod[c] == null)
                    {
                        // The table is only left unfilled when
                        // static initialization failed. Report
                        // an error token rather than letting a
                        // NullPointerException escape.
                        badToken(
                            "no transition defined for character code " +
                            c);
                    }
                    else
                    {
                        sTransMethod[c].invoke(mFsm, c);
                    }
                }
            }
        }
        catch (IOException | IllegalAccessException jex)
        {
            badToken(jex);
        }
        catch (InvocationTargetException inex)
        {
            // Report what the transition itself threw, not the
            // reflection wrapper.
            badToken(inex.getCause());
        }

        return (mToken);
    } // end of nextToken()

    // Reads and returns the next character from the input, or
    // a negative value at the end of the string. The position
    // before the read is marked so that unreadChar() can step
    // back by one character.
    /* package */ int readChar()
        throws IOException
    {
        mInput.mark(1);
        mIndex += 1;

        final int next = mInput.read();

        return next;
    } // end of readChar()

    // Unread the current character by resetting the input to
    // the marked position, stepping the index back and dropping
    // the character from the end of the token buffer. If the
    // input cannot be reset, nothing is changed and the failure
    // is logged.
    /* package */ void unreadChar()
    {
        try
        {
            mInput.reset();
        }
        catch (IOException ioex)
        {
            // Presumably the reader was already closed. Leave
            // the index and buffer alone, as before, but do not
            // hide the failure.
            sLogger.log(
                java.util.logging.Level.WARNING,
                "unable to unread character at index " + mIndex,
                ioex);

            return;
        }

        --mIndex;

        // Remove this character from the end of the buffer.
        final int bufferSize = mTokenBuffer.length();

        if (bufferSize > 0)
        {
            mTokenBuffer.deleteCharAt(bufferSize - 1);
        }
    } // end of unreadChar()

    // Shuts the string reader so that no more characters can
    // be read from it.
    /* package */ void closeInput()
    {
        this.mInput.close();
    } // end of closeInput()

    //-----------------------------------------------------------
    // State Machine Actions
    //

    /**
     * Returns the number of regular expression component
     * tokens generated so far.
     * @return the current token count.
     */
    /* package */ int tokenCount()
    {
        return this.mTokenCount;
    } // end of tokenCount()

    // Begins a new regular expression component token: wipes
    // the shared token and all per-component scratch data, then
    // records the component type to be built.
    /* package */ void startToken(final int componentType)
    {
        // Reset the shared token and mark it as a component.
        mToken.clear();
        mToken.type(REGEX_COMPONENT);
        mcComponentType = componentType;

        // Empty the collection buffers.
        mTokenBuffer.setLength(0);
        mcNumber.setLength(0);
        mcCharacterSet.clear();

        // Return the scalar scratch values to "not set".
        mcLiteral = -1;
        mcFirstCharacter = -1;
        mcNegatedFlag = false;
        mcMinimumSize = SIZE_NOT_SET;
        mcMaximumSize = SIZE_NOT_SET;
    } // end of startToken(int)

    // Completes the current token: stores the collected text,
    // builds the component matching the type given to
    // startToken() with the given size limits, and stops the
    // read loop.
    /* package */ void endToken(final int minSize,
                                final int maxSize)
    {
        final Component component;

        mToken.value(mTokenBuffer.toString());

        // Instantiate the appropriate regular expression
        // component.
        if (mcComponentType == Component.LITERAL)
        {
            component =
                new Literal(
                    (char) mcLiteral, minSize, maxSize, mTokenCount);
        }
        else if (mcComponentType == Component.CHARACTER_SET)
        {
            // The component gets its own copy because the
            // lexer's set is cleared for the next token.
            final SortedSet setCopy =
                new TreeSet<>(mcCharacterSet);

            component =
                new CharacterSet(
                    setCopy,
                    mcNegatedFlag,
                    minSize,
                    maxSize,
                    mTokenCount);
        }
        else
        {
            // Every other type is an "any character" component.
            component =
                new AnyChar(minSize, maxSize, mTokenCount);
        }

        mToken.regexComponent(component);
        mTokenCount += 1;

        // Stop. We have found a token.
        mStopFlag = true;
    } // end of endToken(int, int)

    // Turns the shared token into an error token carrying the
    // given message, which should cause the parsing to
    // terminate. Any cause left over from an earlier failure is
    // cleared.
    /* package */ void badToken(final String message)
    {
        failToken(message, null);
    } // end of badToken(String)

    // Turns the shared token into an error token carrying the
    // given exception and its message. A null cause is
    // tolerated and yields a null message.
    /* package */ void badToken(final Throwable cause)
    {
        failToken(
            (cause == null ? null : cause.getMessage()), cause);
    } // end of badToken(Throwable)

    // Fills in the failure token (type, text collected so far,
    // message, cause and input index) and stops the read loop.
    private void failToken(final String message,
                           final Throwable cause)
    {
        mToken.type(DONE_FAILED);
        mToken.value(mTokenBuffer.toString());
        mToken.errorMessage(message);
        mToken.cause(cause);
        mToken.index(mIndex);

        mStopFlag = true;
    } // end of failToken(String, Throwable)

    // Marks the shared token as the successful end of parsing:
    // no component, text, error message or cause is attached.
    // Also stops the read loop.
    /* package */ void done()
    {
        mToken.cause(null);
        mToken.errorMessage(null);
        mToken.value(null);
        mToken.regexComponent(null);
        mToken.type(DONE_SUCCESS);

        mStopFlag = true;
    } // end of done()

    // Remembers the character for the literal component being
    // built.
    /* package */ void literal(final int c)
    {
        this.mcLiteral = c;
    } // end of literal(int)

    // Remembers whether the character set being built is
    // negated.
    /* package */ void negatedFlag(final boolean flag)
    {
        this.mcNegatedFlag = flag;
    } // end of negatedFlag(boolean)

    // Reports how many distinct characters have been collected
    // for the character set being built.
    /* package */ int characterSetSize()
    {
        return this.mcCharacterSet.size();
    } // end of characterSetSize()

    // Adds one character to the character set being built.
    /* package */ void addToSet(final int c)
    {
        final char member = (char) c;

        mcCharacterSet.add(member);
    } // end of addToSet(int)

    // Adds every character from first through last, inclusive.
    // Nothing is added when first is greater than last.
    /* package */ void addToSet(final int first, final int last)
    {
        int next = first;

        while (next <= last)
        {
            mcCharacterSet.add((char) next);
            ++next;
        }
    } // end of addToSet(int, int)

    // Adds each of the given characters.
    /* package */ void addToSet(final int[] cs)
    {
        for (int i = 0; i < cs.length; ++i)
        {
            mcCharacterSet.add((char) cs[i]);
        }
    } // end of addToSet(int[])

    // Returns the stored candidate for the first character of
    // a set range, or -1 when none is stored.
    /* package */ int firstCharacter()
    {
        return this.mcFirstCharacter;
    } // end of firstCharacter()

    // Stores the candidate for the first character of a set
    // range.
    /* package */ void firstCharacter(final int c)
    {
        this.mcFirstCharacter = c;
    } // end of firstCharacter(int)

    // Returns how many characters are in the number buffer.
    /* package */ int numberLength()
    {
        return this.mcNumber.length();
    } // end of numberLength()

    // Returns the number buffer's current contents as text.
    /* package */ String number()
    {
        final String digits = mcNumber.toString();

        return digits;
    } // end of number()

    // Appends the given character to the number buffer.
    /* package */ void appendNumber(final int c)
    {
        final char digit = (char) c;

        mcNumber.append(digit);
    } // end of appendNumber(int)

    // Empties the number buffer.
    /* package */ void clearNumber()
    {
        mcNumber.setLength(0);
    } // end of clearNumber()

    // Returns the minimum range value, or SIZE_NOT_SET when
    // none has been stored.
    /* package */ int minimumSize()
    {
        return this.mcMinimumSize;
    } // end of minimumSize()

    // Stores the minimum range value.
    /* package */ void minimumSize(final int size)
    {
        this.mcMinimumSize = size;
    } // end of minimumSize(int)

    // Parses the text as a decimal integer and stores it as the
    // minimum range value. Returns true on success. On a parse
    // failure the minimum is reset to SIZE_NOT_SET and false is
    // returned.
    /* package */ boolean minimumSize(final String s)
    {
        try
        {
            mcMinimumSize = Integer.parseInt(s);

            return true;
        }
        catch (NumberFormatException badNumber)
        {
            mcMinimumSize = SIZE_NOT_SET;

            return false;
        }
    } // end of minimumSize(String)

    // Returns the maximum range value, or SIZE_NOT_SET when
    // none has been stored.
    /* package */ int maximumSize()
    {
        return this.mcMaximumSize;
    } // end of maximumSize()

    // Parses the text as a decimal integer and stores it as the
    // maximum range value. Returns true on success. On a parse
    // failure the maximum is reset to SIZE_NOT_SET and false is
    // returned.
    /* package */ boolean maximumSize(final String s)
    {
        try
        {
            mcMaximumSize = Integer.parseInt(s);

            return true;
        }
        catch (NumberFormatException badNumber)
        {
            mcMaximumSize = SIZE_NOT_SET;

            return false;
        }
    } // end of maximumSize(String)

    // Returns true if the token buffer, after its first two
    // characters (presumably the escape prefix), holds an octal
    // number between 0 and MAX_OCTAL_CHAR. Returns false, rather
    // than throwing, when there is nothing after those two
    // characters or the text is not a valid octal number. This
    // matches how octalChar() and hexChar() tolerate bad input.
    /* package */ boolean isOctal()
    {
        boolean retcode = false;

        // There must be at least one character to parse after
        // the two that are skipped.
        if (mTokenBuffer.length() > 2)
        {
            try
            {
                final int i =
                    Integer.parseInt(
                        mTokenBuffer.substring(2), 8);

                retcode = (i >= 0 && i <= MAX_OCTAL_CHAR);
            }
            catch (NumberFormatException formex)
            {
                // Not an octal number: retcode stays false.
            }
        }

        return (retcode);
    } // end of isOctal()

    // Interprets the token buffer, after its first two
    // characters, as an octal number and returns it as a
    // character. Returns Character.MIN_VALUE if the text is not
    // a valid octal number.
    /* package */ char octalChar()
    {
        final String digits = mTokenBuffer.substring(2);

        try
        {
            return (char) Integer.parseInt(digits, 8);
        }
        catch (NumberFormatException badDigits)
        {
            return Character.MIN_VALUE;
        }
    } // end of octalChar()

    // Interprets the token buffer, after its first two
    // characters, as a hexadecimal number and returns it as a
    // character. Returns Character.MIN_VALUE if the text is not
    // a valid hexadecimal number.
    /* package */ char hexChar()
    {
        final String digits = mTokenBuffer.substring(2);

        try
        {
            return (char) Integer.parseInt(digits, 16);
        }
        catch (NumberFormatException badDigits)
        {
            return Character.MIN_VALUE;
        }
    } // end of hexChar()

    //
    // end of State Machine Actions
    //-----------------------------------------------------------

//---------------------------------------------------------------
// Inner classes.
//

    // Carries the result of one lexing step to the parser: the
    // token type plus, depending on that type, the regular
    // expression component, the token text or the error details.
    public static final class Token
    {
    //-----------------------------------------------------------
    // Member data.
    //

        //-------------------------------------------------------
        // Locals.
        //

        // One of the RegexLexer token type constants.
        private int mType;

        // The component for a REGEX_COMPONENT token.
        private Component mRegexComponent;

        // The input text collected for this token.
        private String mValue;

        // Failure description and, if any, the exception behind
        // it.
        private String mErrorMessage;
        private Throwable mCause;

        // Input index reported with a failure; -1 when not set.
        private int mIndex;

    //-----------------------------------------------------------
    // Member methods.
    //

        //-------------------------------------------------------
        // Constructors.
        //

        // Creates an empty token whose type is TOKEN_NOT_SET.
        public Token()
        {
            clear();
        } // end of Token()

        //
        // end of Constructors.
        //-------------------------------------------------------

        //-------------------------------------------------------
        // Get methods.
        //

        public int type()
        {
            return this.mType;
        } // end of type()

        public Component regexComponent()
        {
            return this.mRegexComponent;
        } // end of regexComponent()

        public String value()
        {
            return this.mValue;
        } // end of value()

        public String errorMessage()
        {
            return this.mErrorMessage;
        } // end of errorMessage()

        public Throwable cause()
        {
            return this.mCause;
        } // end of cause()

        public int index()
        {
            return this.mIndex;
        } // end of index()

        //
        // end of Get methods.
        //-------------------------------------------------------

        //-------------------------------------------------------
        // Set methods.
        //

        public void type(final int type)
        {
            this.mType = type;
        } // end of type(int)

        public void regexComponent(final Component component)
        {
            this.mRegexComponent = component;
        } // end of regexComponent(Component)

        public void value(final String value)
        {
            this.mValue = value;
        } // end of value(String)

        public void errorMessage(final String message)
        {
            this.mErrorMessage = message;
        } // end of errorMessage(String)

        public void cause(final Throwable cause)
        {
            this.mCause = cause;
        } // end of cause(Throwable)

        public void index(final int index)
        {
            this.mIndex = index;
        } // end of index(int)

        //
        // end of Set methods.
        //-------------------------------------------------------

        // Returns every field to its initial, "not set" value.
        public void clear()
        {
            this.mIndex = -1;
            this.mCause = null;
            this.mErrorMessage = null;
            this.mValue = null;
            this.mRegexComponent = null;
            this.mType = TOKEN_NOT_SET;
        } // end of clear()
    } // end of class Token
} // end of class RegexLexer