net.sf.saxon.expr.Tokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of saxon9 Show documentation
Provides a basic XSLT 2.0 and XQuery 1.0 processor (W3C Recommendations, January 2007). Command line interfaces and implementations of several Java APIs (DOM, XPath, s9api) are also included.
The newest version!
package net.sf.saxon.expr;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.Whitespace;

import java.util.ArrayList;
import java.util.List;

/**
 * Tokenizer for expressions and inputs.
 *
 * This code was originally derived from James Clark's xt, though it has been greatly modified since.
 * See copyright notice at end of file.
 */


//@SuppressWarnings({"StringEquality"})
public final class Tokenizer {


    private int state = DEFAULT_STATE;
        // we may need to make this a stack at some time

    /**
     * Initial default state of the Tokenizer
     */
    public static final int DEFAULT_STATE = 0;

    /**
     * State in which a name is NOT to be merged with what comes next, for example "("
     */
    public static final int BARE_NAME_STATE = 1;

    /**
     * State in which the next thing to be read is a SequenceType
     */
    public static final int SEQUENCE_TYPE_STATE = 2;
    /**
     * State in which the next thing to be read is an operator
     */

    public static final int OPERATOR_STATE = 3;

    /**
     * The starting line number (for XPath in XSLT, the line number in the stylesheet)
     */
    public int startLineNumber;
    /**
     * The number identifying the most recently read token
     */
    public int currentToken = Token.EOF;
    /**
     * The string value of the most recently read token
     */
    public String currentTokenValue = null;
    /**
     * The position in the input expression where the current token starts
     */
    public int currentTokenStartOffset = 0;
    /**
     * The number of the next token to be returned
     */
    private int nextToken = Token.EOF;
    /**
     * The string value of the next token to be returned
     */
    private String nextTokenValue = null;
    /**
     * The position in the expression of the start of the next token
     */
    private int nextTokenStartOffset = 0;
    /**
     * The string being parsed
     */
    public String input;
    /**
     * The current position within the input string
     */
    public int inputOffset = 0;
    /**
     * The length of the input string
     */
    private int inputLength;
    /**
     * The line number (within the expression) of the current token
     */
    private int lineNumber = 1;
    /**
     * The line number (within the expression) of the next token
     */
    private int nextLineNumber = 1;

    /**
     * List containing the positions (offsets in the input string) at which newline characters
     * occur
     */

    private List newlineOffsets = null;

    /**
     * The token number of the token that preceded the current token
     */
    private int precedingToken = Token.UNKNOWN;

    /**
     * Get the current tokenizer state
     * @return the current state
     */

    public int getState() {
        return state;
    }

    /**
     * Set the tokenizer into a special state
     * @param state the new state
     */

    public void setState(int state) {
        this.state = state;
        if (state==DEFAULT_STATE) {
            // force the followsOperator() test to return true
            precedingToken = Token.UNKNOWN;
            currentToken = Token.UNKNOWN;
        } else if (state==OPERATOR_STATE) {
            precedingToken = Token.RPAR;
            currentToken = Token.RPAR;
        }
    }


    //
    // Lexical analyser for expressions, queries, and XSLT patterns
    //

    /**
     * Prepare a string for tokenization.
     * The actual tokens are obtained by calls on next()
     *
     * @param input the string to be tokenized
     * @param start start point within the string
     * @param end end point within the string (last character not read):
     * -1 means end of string
     * @param lineNumber the linenumber in the source where the expression appears
     * @throws XPathException if a lexical error occurs, e.g. unmatched
     *     string quotes
     */
    public void tokenize(String input, int start, int end, int lineNumber) throws XPathException {
        nextToken = Token.EOF;
        nextTokenValue = null;
        nextTokenStartOffset = 0;
        inputOffset = start;
        this.input = input;
        startLineNumber = lineNumber;
        this.lineNumber = lineNumber;
        nextLineNumber = lineNumber;
        if (end==-1) {
            inputLength = input.length();
        } else {
            inputLength = end;
        }

        // The tokenizer actually reads one token ahead. The raw lexical analysis performed by
        // the lookAhead() method does not (in general) distinguish names used as QNames from names
        // used for operators, axes, and functions. The next() routine further refines names into the
        // correct category, by looking at the following token. In addition, it combines compound tokens
        // such as "instance of" and "cast as".

        lookAhead();
        next();
    }

    //diagnostic version of next(): change real version to realnext()
    //
    //public void next() throws XPathException {
    //    realnext();
    //    System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
    //}

    /**
     * Get the next token from the input expression. The type of token is returned in the
     * currentToken variable, the string value of the token in currentTokenValue.
     *
     * @throws XPathException if a lexical error is detected
     */

    public void next() throws XPathException {
        precedingToken = currentToken;
        currentToken = nextToken;
        currentTokenValue = nextTokenValue;
        if (currentTokenValue==null) {
            currentTokenValue="";
        }
        currentTokenStartOffset = nextTokenStartOffset;
        lineNumber = nextLineNumber;

        // disambiguate the current token based on the tokenizer state

        switch (currentToken) {
            case Token.NAME:
                int optype = getBinaryOp(currentTokenValue);
                if (optype!=Token.UNKNOWN && !followsOperator(precedingToken)) {
                    currentToken = optype;
                }
                break;
            case Token.LT:
                if (followsOperator(precedingToken)) {
                    currentToken = Token.TAG;
                }
                break;
            case Token.STAR:
                if (!followsOperator(precedingToken)) {
                    currentToken = Token.MULT;
                }
                break;
        }

        if (currentToken == Token.TAG || currentToken == Token.RCURLY) {
            // No lookahead after encountering "<" at the start of an XML-like tag.
            // After an RCURLY, the parser must do an explicit lookahead() to continue
            // tokenizing; otherwise it can continue with direct character reading
            return;
        }

        int oldPrecedingToken = precedingToken;
        lookAhead();

        if (currentToken == Token.NAME) {
            if (state == BARE_NAME_STATE) {
                return;
            }
            switch (nextToken) {
                case Token.LPAR:
                    int op = getBinaryOp(currentTokenValue);
                    // the test on followsOperator() is to cater for an operator being used as a function name,
                    // e.g. is(): see XQTS test K-FunctionProlog-66
                    if (op == Token.UNKNOWN || followsOperator(oldPrecedingToken)) {
	                    currentToken = getFunctionType(currentTokenValue);
	                    lookAhead();    // swallow the "("
                    } else {
                        currentToken = op;
                    }
                    break;

                case Token.LCURLY:
                    if (!(state == SEQUENCE_TYPE_STATE)) {
                        currentToken = Token.KEYWORD_CURLY;
                        lookAhead();        // swallow the "{"
                    }
                    break;

                case Token.COLONCOLON:
                    lookAhead();
                    currentToken = Token.AXIS;
                    break;

                case Token.COLONSTAR:
                    lookAhead();
                    currentToken = Token.PREFIX;
                    break;

                case Token.DOLLAR:
                    if (currentTokenValue=="for") {
                        currentToken = Token.FOR;
                    } else if (currentTokenValue=="some") {
                        currentToken = Token.SOME;
                    } else if (currentTokenValue=="every") {
                        currentToken = Token.EVERY;
                    } else if (currentTokenValue=="let") {
                        currentToken = Token.LET;
                    }
                    break;

                case Token.NAME:
                    int candidate = -1;
                    if (currentTokenValue.equals("element")) {
                        candidate = Token.ELEMENT_QNAME;
                    } else if (currentTokenValue.equals("attribute")) {
                        candidate = Token.ATTRIBUTE_QNAME;
                    } else if (currentTokenValue.equals("processing-instruction")) {
                        candidate = Token.PI_QNAME;
                    }
                    if (candidate != -1) {
                        // <'element' QName '{'> constructor
                        // <'attribute' QName '{'> constructor
                        // <'processing-instruction' QName '{'> constructor

                        String qname = nextTokenValue;
                        String saveTokenValue = currentTokenValue;
                        int savePosition = inputOffset;
                        lookAhead();
                        if (nextToken == Token.LCURLY) {
                            currentToken = candidate;
                            currentTokenValue = qname;
                            lookAhead();
                            return;
                        } else {
                            // backtrack (we don't have 2-token lookahead; this is the
                            // only case where it's needed. So we backtrack instead.)
                            currentToken = Token.NAME;
                            currentTokenValue = saveTokenValue;
                            inputOffset = savePosition;
                            nextToken = Token.NAME;
                            nextTokenValue = qname;
                        }

                    }
                    String composite = currentTokenValue + ' ' + nextTokenValue;
                    Integer val = (Integer)Token.doubleKeywords.get(composite);
                    if (val==null) {
                        break;
                    } else {
                        currentToken = val.intValue();
                        currentTokenValue = composite;
                        lookAhead();
                        return;
                    }
                default:
                    // no action needed
            }
        }
    }

    /**
     * Force the current token to be treated as an operator if possible
     */

    public void treatCurrentAsOperator() {
        switch (currentToken) {
            case Token.NAME:
                int optype = getBinaryOp(currentTokenValue);
                if (optype!=Token.UNKNOWN) {
                    currentToken = optype;
                }
                break;
            case Token.STAR:
                currentToken = Token.MULT;
                break;
        }
    }

    /**
     * Look ahead by one token. This method does the real tokenization work.
     * The method is normally called internally, but the XQuery parser also
     * calls it to resume normal tokenization after dealing with pseudo-XML
     * syntax.
     * @throws XPathException if a lexical error occurs
     */
    public void lookAhead() throws XPathException {
        precedingToken = nextToken;
        nextTokenValue = null;
        nextTokenStartOffset = inputOffset;
        for (;;) {
            if (inputOffset >= inputLength) {
	            nextToken = Token.EOF;
	            return;
            }
            char c = input.charAt(inputOffset++);
            switch (c) {
            case '/':
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '/') {
	                inputOffset++;
	                nextToken = Token.SLSL;
	                return;
	            }
	            nextToken = Token.SLASH;
	            return;
            case ':':
	            if (inputOffset < inputLength) {
	                if (input.charAt(inputOffset) == ':') {
	                    inputOffset++;
	                    nextToken = Token.COLONCOLON;
	                    return;
	                } else if (input.charAt(inputOffset) == '=') {
                        nextToken = Token.ASSIGN;
                        inputOffset++;
                        return;
                    }
	            }
	            throw new XPathException("Unexpected colon at start of token");
            case '@':
	            nextToken = Token.AT;
	            return;
	        case '?':
	            nextToken = Token.QMARK;
	            return;
            case '[':
	            nextToken = Token.LSQB;
	            return;
            case ']':
	            nextToken = Token.RSQB;
	            return;
            case '{':
	            nextToken = Token.LCURLY;
	            return;
            case '}':
	            nextToken = Token.RCURLY;
	            return;
            case ';':
                nextToken = Token.SEMICOLON;
                state = DEFAULT_STATE;
                return;
            case '(':
                if (inputOffset < inputLength && input.charAt(inputOffset) == '#') {
	                inputOffset++;
                    int pragmaStart = inputOffset;
                    int nestingDepth = 1;
                    while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
                        if (input.charAt(inputOffset) == '\n') {
                            incrementLineNumber();
                        } else if (input.charAt(inputOffset) == '#' &&
                               input.charAt(inputOffset+1) == ')') {
                            nestingDepth--;
                            inputOffset++;
                        } else if (input.charAt(inputOffset) == '(' &&
                               input.charAt(inputOffset+1) == '#') {
                            nestingDepth++;
                            inputOffset++;
                        }
                        inputOffset++;
                    }
                    if (nestingDepth > 0) {
                        throw new XPathException("Unclosed XQuery pragma");
                    }
	                nextToken = Token.PRAGMA;
                    nextTokenValue = input.substring(pragmaStart, inputOffset-2 );
	                return;
	            }
	            if (inputOffset < inputLength && input.charAt(inputOffset) == ':') {
                    // XPath comment syntax is (: .... :)
                    // Comments may be nested, and may now be empty
                    inputOffset++;
                    int nestingDepth = 1;
                    while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
                        if (input.charAt(inputOffset) == '\n') {
                            incrementLineNumber();
                        } else if (input.charAt(inputOffset) == ':' &&
                                input.charAt(inputOffset+1) == ')') {
                            nestingDepth--;
                            inputOffset++;
                        } else if (input.charAt(inputOffset) == '(' &&
                               input.charAt(inputOffset+1) == ':') {
                            nestingDepth++;
                            inputOffset++;
                        }
                        inputOffset++;
                    }
                    if (nestingDepth > 0) {
                        throw new XPathException("Unclosed XPath comment");
                    }
                    lookAhead();
                } else {
	                nextToken = Token.LPAR;
	            }
	            return;
            case ')':
	            nextToken = Token.RPAR;
	            return;
            case '+':
	            nextToken = Token.PLUS;
	            return;
            case '-':
	            nextToken = Token.MINUS;   // not detected if part of a name
	            return;
            case '=':
	            nextToken = Token.EQUALS;
	            return;
            case '!':
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '=') {
	                inputOffset++;
	                nextToken = Token.NE;
	                return;
	            }
	            throw new XPathException("'!' without '='");
            case '*':
                // disambiguation of MULT and STAR is now done later
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == ':') {
                    inputOffset++;
                    nextToken = Token.SUFFIX;
                    // we leave the parser to get the following name as a separate
                    // token, but first check there's no intervening white space or comments
                    if (inputOffset < inputLength) {
                        char ahead = input.charAt(inputOffset);
                        if (" \r\t\n(".indexOf(ahead) >= 0) {
                            throw new XPathException("Whitespace and comments are not allowed after '*:'");
                        }
                    }
                    return;
                }
                nextToken = Token.STAR;
	            return;
            case ',':
	            nextToken = Token.COMMA;
	            return;
            case '$':
	            nextToken = Token.DOLLAR;
	            return;
            case '|':
	            nextToken = Token.UNION;
	            return;
            case '<':
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '=') {
	                inputOffset++;
	                nextToken = Token.LE;
	                return;
	            }
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '<') {
	                inputOffset++;
	                nextToken = Token.PRECEDES;
	                return;
	            }
	            nextToken = Token.LT;
	            return;
            case '>':
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '=') {
	                inputOffset++;
	                nextToken = Token.GE;
	                return;
	            }
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '>') {
	                inputOffset++;
	                nextToken = Token.FOLLOWS;
	                return;
	            }
	            nextToken = Token.GT;
	            return;
            case '.':
	            if (inputOffset < inputLength
	                    && input.charAt(inputOffset) == '.') {
	                inputOffset++;
	                nextToken = Token.DOTDOT;
	                return;
	            }
	            if (inputOffset == inputLength
	                    || input.charAt(inputOffset) < '0'
	                    || input.charAt(inputOffset) > '9') {
	                nextToken = Token.DOT;
	                return;
	            }
                // otherwise drop through: we have a number starting with a decimal point
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // The logic here can return some tokens that are not legitimate numbers,
                // for example "23e" or "1.0e+". However, this will only happen if the XPath
                // expression as a whole is syntactically incorrect.
                // These errors will be caught by the numeric constructor.
                boolean allowE = true;
                boolean allowSign = false;
                boolean allowDot = true;
                boolean endOfNum = false;
            numloop:
                while (!endOfNum) {
	                switch (c) {
                        case '0': case '1': case '2': case '3': case '4':
                        case '5': case '6': case '7': case '8': case '9':
                            allowSign = false;
                            break;
                        case '.':
                            if (allowDot) {
                                allowDot = false;
                                allowSign = false;
                            } else {
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        case 'E': case 'e':
                            if (allowE) {
                                allowSign = true;
                                allowE = false;
                            } else {
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        case '+': case '-':
                            if (allowSign) {
                                allowSign = false;
                            } else {
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        default:
                            if (('a' <= c && c <= 'z') || c>127) {
                                // this prevents the famous "10div 3"
                                throw new XPathException("Separator needed after numeric literal");
                            }
                            inputOffset--;
                            break numloop;
                    }
                    if (inputOffset >= inputLength) break;
                    c = input.charAt(inputOffset++);
	            }
	            nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
	            nextToken = Token.NUMBER;
	            return;
            case '"':
            case '\'':
                nextTokenValue = "";
                while (true) {
    	            inputOffset = input.indexOf(c, inputOffset);
    	            if (inputOffset < 0) {
    	                inputOffset = nextTokenStartOffset + 1;
    	                throw new XPathException("Unmatched quote in expression");
    	            }
    	            nextTokenValue += input.substring(nextTokenStartOffset + 1, inputOffset++);
    		        // look for doubled delimiters
    			    if (inputOffset < inputLength && input.charAt(inputOffset) == c) {
	                    nextTokenValue += c;
	                    nextTokenStartOffset = inputOffset;
	                    inputOffset++;
	                } else {
	                    break;
	                }
	            }

                // maintain line number if there are newlines in the string
                if (nextTokenValue.indexOf('\n') >= 0) {
                    for (int i = 0; i inputLength) {
            inputOffset = inputLength;
        }
        if (inputOffset < 34) {
            return input.substring(0, inputOffset);
        } else {
            return Whitespace.collapseWhitespace(
                    "..." + input.substring(inputOffset-30, inputOffset)).toString();
        }
    }

    /**
     * Get the line number of the current token
     * @return the line number
     */

    public int getLineNumber() {
        return lineNumber;
    }

    /**
     * Get the column number of the current token
     * @return the column number
     */

    public int getColumnNumber() {
        return (int)(getLineAndColumn(currentTokenStartOffset)&0x7fffffff);
    }

    /**
     * Get the line and column number corresponding to a given offset in the input expression,
     * as a long value with the line number in the top half
     * and the column number in the lower half
     * @param offset the byte offset in the expression
     * @return the line and column number, packed together
     */

    public long getLineAndColumn(int offset) {
        if (newlineOffsets==null) {
            return ((long)startLineNumber) << 32 | (long)offset;
        }
        for (int line=newlineOffsets.size()-1; line>=0; line--) {
            int nloffset = ((Integer)newlineOffsets.get(line)).intValue();
            if (offset > nloffset) {
                return ((long)(line+startLineNumber+1)<<32) | ((long)(offset - nloffset));
            }
        }
        return ((long)startLineNumber) << 32 | (long)(offset+1);
    }

    /**
     * Return the line number corresponding to a given offset in the expression
     * @param offset the byte offset in the expression
     * @return the line number
     */

    public int getLineNumber(int offset) {
        return (int)((getLineAndColumn(offset))>>32);
    }

    /**
     * Return the column number corresponding to a given offset in the expression
     * @param offset the byte offset in the expression
     * @return the column number
     */

    public int getColumnNumber(int offset) {
        return (int)((getLineAndColumn(offset))&0x7fffffff);
    }

}

/*

The following copyright notice is copied from the licence for xt, from which the
original version of this module was derived:
--------------------------------------------------------------------------------
Copyright (c) 1998, 1999 James Clark

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

Except as contained in this notice, the name of James Clark shall
not be used in advertising or otherwise to promote the sale, use or
other dealings in this Software without prior written authorization
from James Clark.
---------------------------------------------------------------------------
*/

//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file, other than the parts developed by James Clark as part of xt.
//
// The Initial Developer of the Original Code is Michael H. Kay.
//
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
//
// Contributor(s): none.
//