// All downloads are free. Search and download functionality uses the official Maven repository.

// org.stringtemplate.v4.compiler.STLexer (Maven / Gradle / Ivy)

/*
 * [The "BSD license"]
 *  Copyright (c) 2011 Terence Parr
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.stringtemplate.v4.compiler;

import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.misc.ErrorManager;
import org.stringtemplate.v4.misc.Misc;

import java.util.ArrayList;
import java.util.List;

/**
 * This class represents the tokenizer for templates. It operates in two modes:
 * inside and outside of expressions. It implements the {@link TokenSource}
 * interface so it can be used with ANTLR parsers. Outside of expressions, we
 * can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
 * (start of expression), {@link #RCURLY} (end of subtemplate), and
 * {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
 * tokens needed by {@link STParser}. From the parser's point of view, it can
 * treat a template as a simple stream of elements.
 * 

* This class defines the token types and communicates these values to * {@code STParser.g} via {@code STLexer.tokens} file (which must remain * consistent).

*/ public class STLexer implements TokenSource { public static final char EOF = (char)-1; // EOF char public static final int EOF_TYPE = CharStream.EOF; // EOF token type /** We build {@code STToken} tokens instead of relying on {@link CommonToken} * so we can override {@link #toString()}. It just converts token types to * token names like 23 to {@code "LDELIM"}. */ public static class STToken extends CommonToken { public STToken(CharStream input, int type, int start, int stop) { super(input, type, DEFAULT_CHANNEL, start, stop); } public STToken(int type, String text) { super(type, text); } @Override public String toString() { String channelStr = ""; if ( channel>0 ) { channelStr=",channel="+channel; } String txt = getText(); if ( txt!=null ) txt = Misc.replaceEscapes(txt); else txt = ""; String tokenName = null; if ( type==EOF_TYPE ) tokenName = "EOF"; else tokenName = STParser.tokenNames[type]; return "[@"+getTokenIndex()+","+start+":"+stop+"='"+txt+"',<"+ tokenName +">"+channelStr+","+line+":"+getCharPositionInLine()+"]"; } } public static final Token SKIP = new STToken(-1, ""); // must follow STLexer.tokens file that STParser.g loads public static final int RBRACK=17; public static final int LBRACK=16; public static final int ELSE=5; public static final int ELLIPSIS=11; public static final int LCURLY=20; public static final int BANG=10; public static final int EQUALS=12; public static final int TEXT=22; public static final int ID=25; public static final int SEMI=9; public static final int LPAREN=14; public static final int IF=4; public static final int ELSEIF=6; public static final int COLON=13; public static final int RPAREN=15; public static final int COMMA=18; public static final int RCURLY=21; public static final int ENDIF=7; public static final int RDELIM=24; public static final int SUPER=8; public static final int DOT=19; public static final int LDELIM=23; public static final int STRING=26; public static final int PIPE=28; public static final int OR=29; 
public static final int AND=30; public static final int INDENT=31; public static final int NEWLINE=32; public static final int AT=33; public static final int REGION_END=34; public static final int TRUE=35; public static final int FALSE=36; public static final int COMMENT=37; /** The char which delimits the start of an expression. */ char delimiterStartChar = '<'; /** The char which delimits the end of an expression. */ char delimiterStopChar = '>'; /** * This keeps track of the current mode of the lexer. Are we inside or * outside an ST expression? */ boolean scanningInsideExpr = false; /** To be able to properly track the inside/outside mode, we need to * track how deeply nested we are in some templates. Otherwise, we * know whether a '}' and the outermost subtemplate to send this * back to outside mode. */ public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate ErrorManager errMgr; /** template embedded in a group file? this is the template */ Token templateToken; CharStream input; /** current character */ char c; /** When we started token, track initial coordinates so we can properly * build token objects. */ int startCharIndex; int startLine; int startCharPositionInLine; /** Our lexer routines might have to emit more than a single token. We * buffer everything through this list. 
*/ List tokens = new ArrayList(); public STLexer(CharStream input) { this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>'); } public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) { this(errMgr, input, templateToken, '<', '>'); } public STLexer(ErrorManager errMgr, CharStream input, Token templateToken, char delimiterStartChar, char delimiterStopChar) { this.errMgr = errMgr; this.input = input; c = (char)input.LA(1); // prime lookahead this.templateToken = templateToken; this.delimiterStartChar = delimiterStartChar; this.delimiterStopChar = delimiterStopChar; } @Override public Token nextToken() { Token t; if ( tokens.size()>0 ) { t = tokens.remove(0); } else t = _nextToken(); // System.out.println(t); return t; } /** Consume if {@code x} is next character on the input stream. */ public void match(char x) { if ( c != x ) { NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "expecting '"+x+"', found '"+str(c)+"'", templateToken, e); } consume(); } protected void consume() { input.consume(); c = (char)input.LA(1); } public void emit(Token token) { tokens.add(token); } public Token _nextToken() { //System.out.println("nextToken: c="+(char)c+"@"+input.index()); while ( true ) { // lets us avoid recursion when skipping stuff startCharIndex = input.index(); startLine = input.getLine(); startCharPositionInLine = input.getCharPositionInLine(); if ( c==EOF ) return newToken(EOF_TYPE); Token t; if ( scanningInsideExpr ) t = inside(); else t = outside(); if ( t!=SKIP ) return t; } } protected Token outside() { if ( input.getCharPositionInLine()==0 && (c==' '||c=='\t') ) { while ( c==' ' || c=='\t' ) consume(); // scarf indent if ( c!=EOF ) return newToken(INDENT); return newToken(TEXT); } if ( c==delimiterStartChar ) { consume(); if ( c=='!' ) return COMMENT(); if ( c=='\\' ) return ESCAPE(); // <\\> <\uFFFF> <\n> etc... 
scanningInsideExpr = true; return newToken(LDELIM); } if ( c=='\r' ) { consume(); consume(); return newToken(NEWLINE); } // \r\n -> \n if ( c=='\n') { consume(); return newToken(NEWLINE); } if ( c=='}' && subtemplateDepth>0 ) { scanningInsideExpr = true; subtemplateDepth--; consume(); return newTokenFromPreviousChar(RCURLY); } return mTEXT(); } protected Token inside() { while ( true ) { switch ( c ) { case ' ': case '\t': case '\n': case '\r': consume(); return SKIP; case '.' : consume(); if ( input.LA(1)=='.' && input.LA(2)=='.' ) { consume(); match('.'); return newToken(ELLIPSIS); } return newToken(DOT); case ',' : consume(); return newToken(COMMA); case ':' : consume(); return newToken(COLON); case ';' : consume(); return newToken(SEMI); case '(' : consume(); return newToken(LPAREN); case ')' : consume(); return newToken(RPAREN); case '[' : consume(); return newToken(LBRACK); case ']' : consume(); return newToken(RBRACK); case '=' : consume(); return newToken(EQUALS); case '!' : consume(); return newToken(BANG); case '@' : consume(); if ( c=='e' && input.LA(2)=='n' && input.LA(3)=='d' ) { consume(); consume(); consume(); return newToken(REGION_END); } return newToken(AT); case '"' : return mSTRING(); case '&' : consume(); match('&'); return newToken(AND); // && case '|' : consume(); match('|'); return newToken(OR); // || case '{' : return subTemplate(); default: if ( c==delimiterStopChar ) { consume(); scanningInsideExpr =false; return newToken(RDELIM); } if ( isIDStartLetter(c) ) { Token id = mID(); String name = id.getText(); if ( name.equals("if") ) return newToken(IF); else if ( name.equals("endif") ) return newToken(ENDIF); else if ( name.equals("else") ) return newToken(ELSE); else if ( name.equals("elseif") ) return newToken(ELSEIF); else if ( name.equals("super") ) return newToken(SUPER); else if ( name.equals("true") ) return newToken(TRUE); else if ( name.equals("false") ) return newToken(FALSE); return id; } RecognitionException re = new 
NoViableAltException("",0,0,input); re.line = startLine; re.charPositionInLine = startCharPositionInLine; errMgr.lexerError(input.getSourceName(), "invalid character '"+str(c)+"'", templateToken, re); if (c==EOF) { return newToken(EOF_TYPE); } consume(); } } } Token subTemplate() { // look for "{ args ID (',' ID)* '|' ..." subtemplateDepth++; int m = input.mark(); int curlyStartChar = startCharIndex; int curlyLine = startLine; int curlyPos = startCharPositionInLine; List argTokens = new ArrayList(); consume(); Token curly = newTokenFromPreviousChar(LCURLY); WS(); argTokens.add( mID() ); WS(); while ( c==',' ) { consume(); argTokens.add( newTokenFromPreviousChar(COMMA) ); WS(); argTokens.add( mID() ); WS(); } WS(); if ( c=='|' ) { consume(); argTokens.add( newTokenFromPreviousChar(PIPE) ); if ( isWS(c) ) consume(); // ignore a single whitespace after | //System.out.println("matched args: "+argTokens); for (Token t : argTokens) emit(t); input.release(m); scanningInsideExpr = false; startCharIndex = curlyStartChar; // reset state startLine = curlyLine; startCharPositionInLine = curlyPos; return curly; } input.rewind(m); startCharIndex = curlyStartChar; // reset state startLine = curlyLine; startCharPositionInLine = curlyPos; consume(); scanningInsideExpr = false; return curly; } Token ESCAPE() { startCharIndex = input.index(); startCharPositionInLine = input.getCharPositionInLine(); consume(); // kill \\ if ( c=='u') return UNICODE(); String text = null; switch ( c ) { case '\\' : LINEBREAK(); return SKIP; case 'n' : text = "\n"; break; case 't' : text = "\t"; break; case ' ' : text = " "; break; default : NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "invalid escaped char: '"+str(c)+"'", templateToken, e); consume(); match(delimiterStopChar); return SKIP; } consume(); Token t = newToken(TEXT, text, input.getCharPositionInLine()-2); match(delimiterStopChar); return t; } Token UNICODE() { consume(); char[] 
chars = new char[4]; if ( !isUnicodeLetter(c) ) { NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e); } chars[0] = c; consume(); if ( !isUnicodeLetter(c) ) { NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e); } chars[1] = c; consume(); if ( !isUnicodeLetter(c) ) { NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e); } chars[2] = c; consume(); if ( !isUnicodeLetter(c) ) { NoViableAltException e = new NoViableAltException("",0,0,input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e); } chars[3] = c; // ESCAPE kills > char uc = (char)Integer.parseInt(new String(chars), 16); Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine()-6); consume(); match(delimiterStopChar); return t; } Token mTEXT() { boolean modifiedText = false; StringBuilder buf = new StringBuilder(); while ( c != EOF && c != delimiterStartChar ) { if ( c=='\r' || c=='\n') break; if ( c=='}' && subtemplateDepth>0 ) break; if ( c=='\\' ) { if ( input.LA(2)=='\\' ) { // convert \\ to \ consume(); consume(); buf.append('\\'); modifiedText = true; continue; } if ( input.LA(2)==delimiterStartChar || input.LA(2)=='}' ) { modifiedText = true; consume(); // toss out \ char buf.append(c); consume(); } else { buf.append(c); consume(); } continue; } buf.append(c); consume(); } if ( modifiedText ) return newToken(TEXT, buf.toString()); else return newToken(TEXT); } /**
	 *  ID  : ('a'..'z'|'A'..'Z'|'_'|'/')
	 *        ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
	 *      ;
	 *  
*/ Token mID() { // called from subTemplate; so keep resetting position during speculation startCharIndex = input.index(); startLine = input.getLine(); startCharPositionInLine = input.getCharPositionInLine(); consume(); while ( isIDLetter(c) ) { consume(); } return newToken(ID); } /**
	 *  STRING : '"'
	 *           (   '\\' '"'
	 *           |   '\\' ~'"'
	 *           |   ~('\\'|'"')
	 *           )*
	 *           '"'
	 *         ;
	 * 
*/ Token mSTRING() { //{setText(getText().substring(1, getText().length()-1));} boolean sawEscape = false; StringBuilder buf = new StringBuilder(); buf.append(c); consume(); while ( c != '"' ) { if ( c=='\\' ) { sawEscape = true; consume(); switch ( c ) { case 'n' : buf.append('\n'); break; case 'r' : buf.append('\r'); break; case 't' : buf.append('\t'); break; default : buf.append(c); break; } consume(); continue; } buf.append(c); consume(); if ( c==EOF ) { RecognitionException re = new MismatchedTokenException((int)'"', input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re); break; } } buf.append(c); consume(); if ( sawEscape ) return newToken(STRING, buf.toString()); else return newToken(STRING); } void WS() { while ( c==' ' || c=='\t' || c=='\n' || c=='\r' ) consume(); } Token COMMENT() { match('!'); while ( !(c=='!' && input.LA(2)==delimiterStopChar) ) { if (c==EOF) { RecognitionException re = new MismatchedTokenException((int)'!', input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " + startLine+":"+startCharPositionInLine+": '!"+ delimiterStopChar+"' missing", templateToken, re); break; } consume(); } consume(); consume(); // grab !> return newToken(COMMENT); } void LINEBREAK() { match('\\'); // only kill 2nd \ as ESCAPE() kills first one match(delimiterStopChar); while ( c==' ' || c=='\t' ) consume(); // scarf WS after <\\> if ( c==EOF ) { RecognitionException re = new RecognitionException(input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>", templateToken, re); return; } if ( c=='\r' ) consume(); match('\n'); while ( c==' ' || c=='\t' ) consume(); // scarf any indent } public static boolean isIDStartLetter(char 
c) { return isIDLetter(c); } public static boolean isIDLetter(char c) { return c>='a'&&c<='z' || c>='A'&&c<='Z' || c>='0'&&c<='9' || c=='_' || c=='/'; } public static boolean isWS(char c) { return c==' ' || c=='\t' || c=='\n' || c=='\r'; } public static boolean isUnicodeLetter(char c) { return c>='a'&&c<='f' || c>='A'&&c<='F' || c>='0'&&c<='9'; } public Token newToken(int ttype) { STToken t = new STToken(input, ttype, startCharIndex, input.index()-1); t.setLine(startLine); t.setCharPositionInLine(startCharPositionInLine); return t; } public Token newTokenFromPreviousChar(int ttype) { STToken t = new STToken(input, ttype, input.index()-1, input.index()-1); t.setLine(input.getLine()); t.setCharPositionInLine(input.getCharPositionInLine()-1); return t; } public Token newToken(int ttype, String text, int pos) { STToken t = new STToken(ttype, text); t.setStartIndex(startCharIndex); t.setStopIndex(input.index()-1); t.setLine(input.getLine()); t.setCharPositionInLine(pos); return t; } public Token newToken(int ttype, String text) { STToken t = new STToken(ttype, text); t.setStartIndex(startCharIndex); t.setStopIndex(input.index()-1); t.setLine(startLine); t.setCharPositionInLine(startCharPositionInLine); return t; } // public String getErrorHeader() { // return startLine+":"+startCharPositionInLine; // } // @Override public String getSourceName() { return "no idea"; } public static String str(int c) { if ( c==EOF ) return ""; return String.valueOf((char)c); } }




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy