org.stringtemplate.v4.compiler.STLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of antlr-complete Show documentation
Complete distribution for ANTLR 3
There is a newer version: 3.5.3
/*
 * [The "BSD license"]
 *  Copyright (c) 2011 Terence Parr
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.stringtemplate.v4.compiler;

import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.misc.ErrorManager;
import org.stringtemplate.v4.misc.Misc;

import java.util.ArrayList;
import java.util.List;

/**
 * This class represents the tokenizer for templates. It operates in two modes:
 * inside and outside of expressions. It implements the {@link TokenSource}
 * interface so it can be used with ANTLR parsers. Outside of expressions, we
 * can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
 * (start of expression), {@link #RCURLY} (end of subtemplate), and
 * {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
 * tokens needed by {@link STParser}. From the parser's point of view, it can
 * treat a template as a simple stream of elements.
 * 
 * This class defines the token types and communicates these values to
 * {@code STParser.g} via {@code STLexer.tokens} file (which must remain
 * consistent).
 */
public class STLexer implements TokenSource {
    public static final char EOF = (char)-1;            // EOF char
    public static final int EOF_TYPE = CharStream.EOF;  // EOF token type

    /** We build {@code STToken} tokens instead of relying on {@link CommonToken}
	 *  so we can override {@link #toString()}. It just converts token types to
     *  token names like 23 to {@code "LDELIM"}.
     */
    public static class STToken extends CommonToken {
        public STToken(CharStream input, int type, int start, int stop) {
            super(input, type, DEFAULT_CHANNEL, start, stop);
        }
        public STToken(int type, String text) { super(type, text); }

		@Override
        public String toString() {
            String channelStr = "";
            if ( channel>0 ) {
                channelStr=",channel="+channel;
            }
            String txt = getText();
            if ( txt!=null ) txt = Misc.replaceEscapes(txt);
            else txt = "";
			String tokenName = null;
			if ( type==EOF_TYPE ) tokenName = "EOF";
			else tokenName = STParser.tokenNames[type];
			return "[@"+getTokenIndex()+","+start+":"+stop+"='"+txt+"',<"+ tokenName +">"+channelStr+","+line+":"+getCharPositionInLine()+"]";
        }
    }

    public static final Token SKIP = new STToken(-1, "");

    // must follow STLexer.tokens file that STParser.g loads
    public static final int RBRACK=17;
    public static final int LBRACK=16;
    public static final int ELSE=5;
    public static final int ELLIPSIS=11;
    public static final int LCURLY=20;
    public static final int BANG=10;
    public static final int EQUALS=12;
    public static final int TEXT=22;
    public static final int ID=25;
    public static final int SEMI=9;
    public static final int LPAREN=14;
    public static final int IF=4;
    public static final int ELSEIF=6;
    public static final int COLON=13;
    public static final int RPAREN=15;
    public static final int COMMA=18;
    public static final int RCURLY=21;
    public static final int ENDIF=7;
    public static final int RDELIM=24;
    public static final int SUPER=8;
    public static final int DOT=19;
    public static final int LDELIM=23;
    public static final int STRING=26;
	public static final int PIPE=28;
	public static final int OR=29;
	public static final int AND=30;
	public static final int INDENT=31;
    public static final int NEWLINE=32;
    public static final int AT=33;
    public static final int REGION_END=34;
	public static final int TRUE=35;
	public static final int FALSE=36;
	public static final int COMMENT=37;


    /** The char which delimits the start of an expression. */
    char delimiterStartChar = '<';
    /** The char which delimits the end of an expression. */
    char delimiterStopChar = '>';

	/**
	 * This keeps track of the current mode of the lexer. Are we inside or
	 * outside an ST expression?
	 */
    boolean scanningInsideExpr = false;

    /** To be able to properly track the inside/outside mode, we need to
     *  track how deeply nested we are in some templates. Otherwise, we
     *  know whether a '}' and the outermost subtemplate to send this
	 *  back to outside mode.
     */
	public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate

	ErrorManager errMgr;

	/** template embedded in a group file? this is the template */
	Token templateToken;

    CharStream input;
	/** current character */
    char c;

    /** When we started token, track initial coordinates so we can properly
     *  build token objects.
     */
    int startCharIndex;
    int startLine;
    int startCharPositionInLine;

    /** Our lexer routines might have to emit more than a single token. We
     *  buffer everything through this list.
     */
    List tokens = new ArrayList();

	public STLexer(CharStream input) { this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>'); }

    public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) {
		this(errMgr, input, templateToken, '<', '>');
	}

	public STLexer(ErrorManager errMgr,
				   CharStream input,
				   Token templateToken,
				   char delimiterStartChar,
				   char delimiterStopChar)
	{
		this.errMgr = errMgr;
		this.input = input;
		c = (char)input.LA(1); // prime lookahead
		this.templateToken = templateToken;
		this.delimiterStartChar = delimiterStartChar;
		this.delimiterStopChar = delimiterStopChar;
	}

	@Override
	public Token nextToken() {
		Token t;
		if ( tokens.size()>0 ) { t = tokens.remove(0); }
		else t = _nextToken();
//		System.out.println(t);
		return t;
	}

    /** Consume if {@code x} is next character on the input stream.
	 */
    public void match(char x) {
        if ( c != x ) {
			NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "expecting '"+x+"', found '"+str(c)+"'", templateToken, e);
		}
		consume();
    }

    protected void consume() {
        input.consume();
        c = (char)input.LA(1);
    }

    public void emit(Token token) { tokens.add(token); }

    public Token _nextToken() {
		//System.out.println("nextToken: c="+(char)c+"@"+input.index());
        while ( true ) { // lets us avoid recursion when skipping stuff
            startCharIndex = input.index();
            startLine = input.getLine();
            startCharPositionInLine = input.getCharPositionInLine();

            if ( c==EOF ) return newToken(EOF_TYPE);
            Token t;
            if ( scanningInsideExpr ) t = inside();
            else t = outside();
            if ( t!=SKIP ) return t;
        }
    }

    protected Token outside() {
        if ( input.getCharPositionInLine()==0 && (c==' '||c=='\t') ) {
            while ( c==' ' || c=='\t' ) consume(); // scarf indent
            if ( c!=EOF ) return newToken(INDENT);
            return newToken(TEXT);
        }
        if ( c==delimiterStartChar ) {
            consume();
            if ( c=='!' ) return COMMENT();
            if ( c=='\\' ) return ESCAPE(); // <\\> <\uFFFF> <\n> etc...
            scanningInsideExpr = true;
            return newToken(LDELIM);
        }
        if ( c=='\r' ) { consume(); consume(); return newToken(NEWLINE); } // \r\n -> \n
        if ( c=='\n') {	consume(); return newToken(NEWLINE); }
        if ( c=='}' && subtemplateDepth>0 ) {
            scanningInsideExpr = true;
            subtemplateDepth--;
            consume();
            return newTokenFromPreviousChar(RCURLY);
        }
        return mTEXT();
    }

    protected Token inside() {
        while ( true ) {
            switch ( c ) {
                case ' ': case '\t': case '\n': case '\r':
					consume();
					return SKIP;
                case '.' :
					consume();
					if ( input.LA(1)=='.' && input.LA(2)=='.' ) {
						consume();
						match('.');
						return newToken(ELLIPSIS);
					}
					return newToken(DOT);
                case ',' : consume(); return newToken(COMMA);
				case ':' : consume(); return newToken(COLON);
				case ';' : consume(); return newToken(SEMI);
                case '(' : consume(); return newToken(LPAREN);
                case ')' : consume(); return newToken(RPAREN);
                case '[' : consume(); return newToken(LBRACK);
                case ']' : consume(); return newToken(RBRACK);
				case '=' : consume(); return newToken(EQUALS);
                case '!' : consume(); return newToken(BANG);
                case '@' :
                    consume();
                    if ( c=='e' && input.LA(2)=='n' && input.LA(3)=='d' ) {
                        consume(); consume(); consume();
                        return newToken(REGION_END);
                    }
                    return newToken(AT);
                case '"' : return mSTRING();
                case '&' : consume(); match('&'); return newToken(AND); // &&
                case '|' : consume(); match('|'); return newToken(OR); // ||
				case '{' : return subTemplate();
				default:
					if ( c==delimiterStopChar ) {
						consume();
						scanningInsideExpr =false;
						return newToken(RDELIM);
					}
                    if ( isIDStartLetter(c) ) {
						Token id = mID();
						String name = id.getText();
						if ( name.equals("if") ) return newToken(IF);
						else if ( name.equals("endif") ) return newToken(ENDIF);
						else if ( name.equals("else") ) return newToken(ELSE);
						else if ( name.equals("elseif") ) return newToken(ELSEIF);
						else if ( name.equals("super") ) return newToken(SUPER);
						else if ( name.equals("true") ) return newToken(TRUE);
						else if ( name.equals("false") ) return newToken(FALSE);
						return id;
					}
					RecognitionException re =
						new NoViableAltException("",0,0,input);
                    re.line = startLine;
                    re.charPositionInLine = startCharPositionInLine;
					errMgr.lexerError(input.getSourceName(), "invalid character '"+str(c)+"'", templateToken, re);
					if (c==EOF) {
						return newToken(EOF_TYPE);
					}
					consume();
            }
        }
    }

    Token subTemplate() {
        // look for "{ args ID (',' ID)* '|' ..."
		subtemplateDepth++;
        int m = input.mark();
        int curlyStartChar = startCharIndex;
        int curlyLine = startLine;
        int curlyPos = startCharPositionInLine;
        List argTokens = new ArrayList();
        consume();
		Token curly = newTokenFromPreviousChar(LCURLY);
        WS();
        argTokens.add( mID() );
        WS();
        while ( c==',' ) {
			consume();
            argTokens.add( newTokenFromPreviousChar(COMMA) );
            WS();
            argTokens.add( mID() );
            WS();
        }
        WS();
        if ( c=='|' ) {
			consume();
            argTokens.add( newTokenFromPreviousChar(PIPE) );
            if ( isWS(c) ) consume(); // ignore a single whitespace after |
            //System.out.println("matched args: "+argTokens);
            for (Token t : argTokens) emit(t);
			input.release(m);
			scanningInsideExpr = false;
			startCharIndex = curlyStartChar; // reset state
			startLine = curlyLine;
			startCharPositionInLine = curlyPos;
			return curly;
		}
		input.rewind(m);
		startCharIndex = curlyStartChar; // reset state
		startLine = curlyLine;
        startCharPositionInLine = curlyPos;
		consume();
		scanningInsideExpr = false;
        return curly;
    }

    Token ESCAPE() {
		startCharIndex = input.index();
		startCharPositionInLine = input.getCharPositionInLine();
		consume(); // kill \\
		if ( c=='u') return UNICODE();
		String text = null;
        switch ( c ) {
            case '\\' : LINEBREAK(); return SKIP;
			case 'n'  : text = "\n"; break;
			case 't'  : text = "\t"; break;
			case ' '  : text = " "; break;
            default :
                NoViableAltException e = new NoViableAltException("",0,0,input);
                errMgr.lexerError(input.getSourceName(), "invalid escaped char: '"+str(c)+"'", templateToken, e);
				consume();
				match(delimiterStopChar);
				return SKIP;
        }
        consume();
		Token t = newToken(TEXT, text, input.getCharPositionInLine()-2);
        match(delimiterStopChar);
        return t;
    }

    Token UNICODE() {
        consume();
        char[] chars = new char[4];
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
            errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[0] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[1] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[2] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[3] = c;
        // ESCAPE kills >
        char uc = (char)Integer.parseInt(new String(chars), 16);
        Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine()-6);
		consume();
		match(delimiterStopChar);
		return t;
    }

    Token mTEXT() {
		boolean modifiedText = false;
        StringBuilder buf = new StringBuilder();
        while ( c != EOF && c != delimiterStartChar ) {
			if ( c=='\r' || c=='\n') break;
			if ( c=='}' && subtemplateDepth>0 ) break;
            if ( c=='\\' ) {
                if ( input.LA(2)=='\\' ) { // convert \\ to \
                    consume(); consume(); buf.append('\\');
                    modifiedText = true;
                    continue;
                }
                if ( input.LA(2)==delimiterStartChar ||
					 input.LA(2)=='}' )
				{
                    modifiedText = true;
                    consume(); // toss out \ char
                    buf.append(c); consume();
                }
                else {
                    buf.append(c);
                    consume();
                }
                continue;
            }
            buf.append(c);
            consume();
        }
        if ( modifiedText )	return newToken(TEXT, buf.toString());
        else return newToken(TEXT);
    }

    /** 	 *  ID  : ('a'..'z'|'A'..'Z'|'_'|'/')
	 *        ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
	 *      ;
	 *  
	 */
    Token mID() {
        // called from subTemplate; so keep resetting position during speculation
        startCharIndex = input.index();
        startLine = input.getLine();
        startCharPositionInLine = input.getCharPositionInLine();
        consume();
        while ( isIDLetter(c) ) {
            consume();
        }
        return newToken(ID);
    }

    /** 	 *  STRING : '"'
	 *           (   '\\' '"'
	 *           |   '\\' ~'"'
	 *           |   ~('\\'|'"')
	 *           )*
	 *           '"'
	 *         ;
	 * 
	 */
    Token mSTRING() {
    	//{setText(getText().substring(1, getText().length()-1));}
        boolean sawEscape = false;
        StringBuilder buf = new StringBuilder();
        buf.append(c); consume();
        while ( c != '"' ) {
            if ( c=='\\' ) {
                sawEscape = true;
                consume();
				switch ( c ) {
					case 'n' : buf.append('\n'); break;
					case 'r' : buf.append('\r'); break;
					case 't' : buf.append('\t'); break;
                	default : buf.append(c); break;
				}
				consume();
                continue;
            }
            buf.append(c);
            consume();
			if ( c==EOF ) {
				RecognitionException re =
					new MismatchedTokenException((int)'"', input);
				re.line = input.getLine();
				re.charPositionInLine = input.getCharPositionInLine();
				errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re);
				break;
			}
        }
        buf.append(c);
        consume();
        if ( sawEscape ) return newToken(STRING, buf.toString());
        else return newToken(STRING);
    }

    void WS() {
        while ( c==' ' || c=='\t' || c=='\n' || c=='\r' ) consume();
    }

    Token COMMENT() {
        match('!');
        while ( !(c=='!' && input.LA(2)==delimiterStopChar) ) {
			if (c==EOF) {
				RecognitionException re =
					new MismatchedTokenException((int)'!', input);
				re.line = input.getLine();
				re.charPositionInLine = input.getCharPositionInLine();
				errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " +
					startLine+":"+startCharPositionInLine+": '!"+
					delimiterStopChar+"' missing", templateToken, re);
				break;
			}
			consume();
		}
        consume(); consume(); // grab !>
		return newToken(COMMENT);
    }

    void LINEBREAK() {
        match('\\'); // only kill 2nd \ as ESCAPE() kills first one
        match(delimiterStopChar);
        while ( c==' ' || c=='\t' ) consume(); // scarf WS after <\\>
		if ( c==EOF ) {
			RecognitionException re = new RecognitionException(input);
			re.line = input.getLine();
			re.charPositionInLine = input.getCharPositionInLine();
			errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>",
				              templateToken, re);
			return;
		}
		if ( c=='\r' ) consume();
        match('\n');
        while ( c==' ' || c=='\t' ) consume(); // scarf any indent
    }

    public static boolean isIDStartLetter(char c) { return isIDLetter(c); }
	public static boolean isIDLetter(char c) { return c>='a'&&c<='z' || c>='A'&&c<='Z' || c>='0'&&c<='9' || c=='_' || c=='/'; }
    public static boolean isWS(char c) { return c==' ' || c=='\t' || c=='\n' || c=='\r'; }
    public static boolean isUnicodeLetter(char c) { return c>='a'&&c<='f' || c>='A'&&c<='F' || c>='0'&&c<='9'; }

    public Token newToken(int ttype) {
        STToken t = new STToken(input, ttype, startCharIndex, input.index()-1);
        t.setLine(startLine);
        t.setCharPositionInLine(startCharPositionInLine);
		return t;
	}

    public Token newTokenFromPreviousChar(int ttype) {
        STToken t = new STToken(input, ttype, input.index()-1, input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(input.getCharPositionInLine()-1);
        return t;
    }

    public Token newToken(int ttype, String text, int pos) {
        STToken t = new STToken(ttype, text);
		t.setStartIndex(startCharIndex);
		t.setStopIndex(input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(pos);
        return t;
    }

	public Token newToken(int ttype, String text) {
		STToken t = new STToken(ttype, text);
        t.setStartIndex(startCharIndex);
        t.setStopIndex(input.index()-1);
		t.setLine(startLine);
		t.setCharPositionInLine(startCharPositionInLine);
		return t;
	}

//    public String getErrorHeader() {
//        return startLine+":"+startCharPositionInLine;
//    }
//
	@Override
    public String getSourceName() {
        return "no idea";
    }

	public static String str(int c) {
		if ( c==EOF ) return "";
		return String.valueOf((char)c);
	}
}