org.stringtemplate.v4.compiler.STLexer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of virtdata-lib-realer Show documentation
Show all versions of virtdata-lib-realer Show documentation
With inspiration from other libraries
/*
* [The "BSD license"]
* Copyright (c) 2011 Terence Parr
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.stringtemplate.v4.compiler;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.misc.ErrorManager;
import org.stringtemplate.v4.misc.Misc;
import java.util.ArrayList;
import java.util.List;
/**
* This class represents the tokenizer for templates. It operates in two modes:
* inside and outside of expressions. It implements the {@link TokenSource}
* interface so it can be used with ANTLR parsers. Outside of expressions, we
* can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
* (start of expression), {@link #RCURLY} (end of subtemplate), and
* {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
* tokens needed by {@link STParser}. From the parser's point of view, it can
* treat a template as a simple stream of elements.
*
* This class defines the token types and communicates these values to
* {@code STParser.g} via {@code STLexer.tokens} file (which must remain
* consistent).
*/
public class STLexer implements TokenSource {
/** Value held by the current-character field {@code c} once the input is
 *  exhausted. The lexer stores {@code (char)input.LA(1)} in {@code c}, so
 *  this is -1 narrowed to {@code char}.
 *  NOTE(review): assumes the stream's {@code LA(1)} yields -1 at end of input - confirm.
 */
public static final char EOF = (char)-1; // EOF char
/** Token type given to the end-of-input token; an alias for {@code CharStream.EOF}. */
public static final int EOF_TYPE = CharStream.EOF; // EOF token type
/** We build {@code STToken} tokens instead of relying on {@link CommonToken}
* so we can override {@link #toString()}. It just converts token types to
* token names like 23 to {@code "LDELIM"}.
*/
public static class STToken extends CommonToken {
public STToken(CharStream input, int type, int start, int stop) {
super(input, type, DEFAULT_CHANNEL, start, stop);
}
public STToken(int type, String text) { super(type, text); }
@Override
public String toString() {
String channelStr = "";
if ( channel>0 ) {
channelStr=",channel="+channel;
}
String txt = getText();
if ( txt!=null ) txt = Misc.replaceEscapes(txt);
else txt = "";
String tokenName = null;
if ( type==EOF_TYPE ) tokenName = "EOF";
else tokenName = STParser.tokenNames[type];
return "[@"+getTokenIndex()+","+start+":"+stop+"='"+txt+"',<"+ tokenName +">"+channelStr+","+line+":"+getCharPositionInLine()+"]";
}
}
/** Sentinel returned by lexer routines that consumed input without producing
 *  a token. It is only ever compared by identity; the text exists purely so
 *  the token is recognizable when printed.
 */
public static final Token SKIP = new STToken(-1, "<skip>");
// Token types, listed in ascending numeric order.
// The values must follow the STLexer.tokens file that STParser.g loads.

// keywords recognized inside expressions
public static final int IF=4;
public static final int ELSE=5;
public static final int ELSEIF=6;
public static final int ENDIF=7;
public static final int SUPER=8;

// punctuation and operators
public static final int SEMI=9;
public static final int BANG=10;
public static final int ELLIPSIS=11;
public static final int EQUALS=12;
public static final int COLON=13;
public static final int LPAREN=14;
public static final int RPAREN=15;
public static final int LBRACK=16;
public static final int RBRACK=17;
public static final int COMMA=18;
public static final int DOT=19;
public static final int LCURLY=20;
public static final int RCURLY=21;

// text, delimiters, identifiers and literals
public static final int TEXT=22;
public static final int LDELIM=23;
public static final int RDELIM=24;
public static final int ID=25;
public static final int STRING=26;

// remaining types
public static final int PIPE=28;
public static final int OR=29;
public static final int AND=30;
public static final int INDENT=31;
public static final int NEWLINE=32;
public static final int AT=33;
public static final int REGION_END=34;
public static final int TRUE=35;
public static final int FALSE=36;
public static final int COMMENT=37;
/** The char which delimits the start of an expression. */
char delimiterStartChar = '<';
/** The char which delimits the end of an expression. */
char delimiterStopChar = '>';
/**
 * This keeps track of the current mode of the lexer. Are we inside or
 * outside an ST expression?
 */
boolean scanningInsideExpr = false;
/** To be able to properly track the inside/outside mode, we need to
 * track how deeply nested we are in subtemplates. Without this count we
 * could not tell whether a '}' seen outside an expression is plain text
 * or the end of a subtemplate, which must send the lexer back to inside
 * mode.
 */
public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate
/** Receives every lexical error this lexer reports. */
ErrorManager errMgr;
/** template embedded in a group file? this is the template */
Token templateToken;
/** The character stream being tokenized. */
CharStream input;
/** current character */
char c;
/** When we started token, track initial coordinates so we can properly
 * build token objects.
 */
int startCharIndex;
int startLine;
int startCharPositionInLine;
/** Our lexer routines might have to emit more than a single token. We
 *  buffer everything through this list. It is typed as a list of
 *  {@link Token} so buffered tokens can be handed back without a cast.
 */
List<Token> tokens = new ArrayList<Token>();
/** Create a lexer over {@code input} that reports to the default error
 *  manager, has no enclosing template token, and uses the default
 *  {@code '<'} and {@code '>'} delimiters.
 */
public STLexer(CharStream input) {
    this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>');
}

/** Create a lexer with the default {@code '<'} and {@code '>'} delimiters. */
public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) {
    this(errMgr, input, templateToken, '<', '>');
}

/** Create a fully configured lexer and load the first lookahead character.
 *
 *  @param errMgr             receives lexical errors
 *  @param input              the characters to tokenize
 *  @param templateToken      passed along with every reported error; may be {@code null}
 *  @param delimiterStartChar char that opens an expression
 *  @param delimiterStopChar  char that closes an expression
 */
public STLexer(ErrorManager errMgr,
               CharStream input,
               Token templateToken,
               char delimiterStartChar,
               char delimiterStopChar)
{
    this.errMgr = errMgr;
    this.templateToken = templateToken;
    this.delimiterStartChar = delimiterStartChar;
    this.delimiterStopChar = delimiterStopChar;
    this.input = input;
    this.c = (char)input.LA(1); // prime lookahead
}
/** Return the oldest buffered token if any are pending; otherwise scan a
 *  fresh one from the input.
 */
@Override
public Token nextToken() {
    if ( tokens.isEmpty() ) return _nextToken();
    return tokens.remove(0);
}
/** Expect {@code x} as the current character. A mismatch is reported to the
 *  error manager, but the current character is consumed either way.
 */
public void match(char x) {
    boolean asExpected = c == x;
    if ( !asExpected ) {
        NoViableAltException nvae = new NoViableAltException("",0,0,input);
        String msg = "expecting '"+x+"', found '"+str(c)+"'";
        errMgr.lexerError(input.getSourceName(), msg, templateToken, nvae);
    }
    consume();
}
/** Advance the input by one character and reload the current character
 *  {@code c} from the new lookahead.
 */
protected void consume() {
    input.consume();
    int lookahead = input.LA(1);
    c = (char)lookahead;
}
/** Append {@code token} to the buffer of pending tokens. */
public void emit(Token token) {
    tokens.add(token);
}
/** Scan the next real token straight from the input, bypassing the token
 *  buffer. Results equal to {@link #SKIP} are dropped and scanning restarts;
 *  the loop avoids recursion when skipping.
 */
public Token _nextToken() {
    for (;;) {
        // remember where this token begins so the token builders can use it
        startCharIndex = input.index();
        startLine = input.getLine();
        startCharPositionInLine = input.getCharPositionInLine();
        if ( c==EOF ) return newToken(EOF_TYPE);
        Token scanned = scanningInsideExpr ? inside() : outside();
        if ( scanned==SKIP ) continue;
        return scanned;
    }
}
/** Scan one token while outside of any expression.
 *
 *  Depending on the current character this yields an INDENT (or TEXT when
 *  the leading whitespace runs to end of input), a comment or escape result,
 *  an LDELIM that switches to inside mode, a NEWLINE, an RCURLY that closes a
 *  subtemplate and switches back to inside mode, or a run of TEXT.
 *
 *  @return the scanned token, possibly {@link #SKIP}
 */
protected Token outside() {
    if ( input.getCharPositionInLine()==0 && (c==' '||c=='\t') ) {
        while ( c==' ' || c=='\t' ) consume(); // scarf indent
        if ( c!=EOF ) return newToken(INDENT);
        return newToken(TEXT);
    }
    if ( c==delimiterStartChar ) {
        consume();
        if ( c=='!' ) return COMMENT();
        if ( c=='\\' ) return ESCAPE(); // escaped chars and the line-continuation escape
        scanningInsideExpr = true;
        return newToken(LDELIM);
    }
    if ( c=='\r' ) {
        consume();
        // Only swallow a '\n' that really follows; a lone '\r' must not eat
        // the first character of the next line.
        if ( c=='\n' ) consume();
        return newToken(NEWLINE);
    }
    if ( c=='\n') { consume(); return newToken(NEWLINE); }
    if ( c=='}' && subtemplateDepth>0 ) {
        scanningInsideExpr = true;
        subtemplateDepth--;
        consume();
        return newTokenFromPreviousChar(RCURLY);
    }
    return mTEXT();
}
/** Scan one token while inside an expression.
 *
 *  Whitespace yields {@link #SKIP}. Punctuation, strings, subtemplates, the
 *  closing delimiter, identifiers and keywords each yield their token. Any
 *  other character is reported as an error and dropped, after which scanning
 *  continues; at end of input an EOF token is returned.
 */
protected Token inside() {
    for (;;) {
        switch ( c ) {
            case ' ': case '\t': case '\n': case '\r':
                consume();
                return SKIP;
            case '.' : {
                consume();
                boolean twoMoreDots = input.LA(1)=='.' && input.LA(2)=='.';
                if ( !twoMoreDots ) return newToken(DOT);
                consume();
                match('.');
                return newToken(ELLIPSIS);
            }
            case ',' : consume(); return newToken(COMMA);
            case ':' : consume(); return newToken(COLON);
            case ';' : consume(); return newToken(SEMI);
            case '(' : consume(); return newToken(LPAREN);
            case ')' : consume(); return newToken(RPAREN);
            case '[' : consume(); return newToken(LBRACK);
            case ']' : consume(); return newToken(RBRACK);
            case '=' : consume(); return newToken(EQUALS);
            case '!' : consume(); return newToken(BANG);
            case '@' : {
                consume();
                boolean regionEnd = c=='e' && input.LA(2)=='n' && input.LA(3)=='d';
                if ( regionEnd ) {
                    consume(); consume(); consume();
                    return newToken(REGION_END);
                }
                return newToken(AT);
            }
            case '"' : return mSTRING();
            case '&' : consume(); match('&'); return newToken(AND); // &&
            case '|' : consume(); match('|'); return newToken(OR); // ||
            case '{' : return subTemplate();
            default :
                break; // handled below; every other case has already returned
        }

        if ( c==delimiterStopChar ) {
            consume();
            scanningInsideExpr = false;
            return newToken(RDELIM);
        }

        if ( isIDStartLetter(c) ) {
            Token id = mID();
            String name = id.getText();
            int keywordType;
            if ( name.equals("if") ) keywordType = IF;
            else if ( name.equals("endif") ) keywordType = ENDIF;
            else if ( name.equals("else") ) keywordType = ELSE;
            else if ( name.equals("elseif") ) keywordType = ELSEIF;
            else if ( name.equals("super") ) keywordType = SUPER;
            else if ( name.equals("true") ) keywordType = TRUE;
            else if ( name.equals("false") ) keywordType = FALSE;
            else return id; // an ordinary identifier
            return newToken(keywordType);
        }

        // nothing matched: report it, then drop the character and try again
        RecognitionException re = new NoViableAltException("",0,0,input);
        re.line = startLine;
        re.charPositionInLine = startCharPositionInLine;
        errMgr.lexerError(input.getSourceName(), "invalid character '"+str(c)+"'", templateToken, re);
        if ( c==EOF ) return newToken(EOF_TYPE);
        consume();
    }
}
/** Scan the opening of a subtemplate, speculatively looking for an argument
 *  list of the form <code>{ ID (',' ID)* '|' ...</code>.
 *
 *  If the argument list is present, its ID, COMMA and PIPE tokens are buffered
 *  with {@link #emit} so they follow the returned LCURLY token. Otherwise the
 *  input is rewound to the curly and only the curly is consumed. Either way
 *  the subtemplate depth is incremented and the lexer switches to outside mode.
 *
 *  @return the LCURLY token
 */
Token subTemplate() {
    // look for "{ args ID (',' ID)* '|' ..."
    subtemplateDepth++;
    int m = input.mark();
    // speculation overwrites the start-of-token state, so save it
    int curlyStartChar = startCharIndex;
    int curlyLine = startLine;
    int curlyPos = startCharPositionInLine;
    List<Token> argTokens = new ArrayList<Token>();
    consume();
    Token curly = newTokenFromPreviousChar(LCURLY);
    WS();
    argTokens.add( mID() );
    WS();
    while ( c==',' ) {
        consume();
        argTokens.add( newTokenFromPreviousChar(COMMA) );
        WS();
        argTokens.add( mID() );
        WS();
    }
    if ( c=='|' ) {
        // the guess was right: keep what we scanned
        consume();
        argTokens.add( newTokenFromPreviousChar(PIPE) );
        if ( isWS(c) ) consume(); // ignore a single whitespace after |
        for (Token t : argTokens) emit(t);
        input.release(m);
    }
    else {
        // no argument list: back up and consume just the curly
        input.rewind(m);
        consume();
    }
    scanningInsideExpr = false;
    startCharIndex = curlyStartChar; // reset state
    startLine = curlyLine;
    startCharPositionInLine = curlyPos;
    return curly;
}
/** Scan an escape that follows the start delimiter; the current character
 *  is the backslash. A {@code u} hands off to {@link #UNICODE()}, a second
 *  backslash to {@link #LINEBREAK()}, and {@code n}, {@code t} or a space
 *  produce a TEXT token holding newline, tab or space. Anything else is
 *  reported as an error and skipped.
 */
Token ESCAPE() {
    startCharIndex = input.index();
    startCharPositionInLine = input.getCharPositionInLine();
    consume(); // drop the backslash
    if ( c=='u' ) return UNICODE();
    if ( c=='\\' ) {
        LINEBREAK();
        return SKIP;
    }

    final String text;
    if ( c=='n' ) text = "\n";
    else if ( c=='t' ) text = "\t";
    else if ( c==' ' ) text = " ";
    else {
        NoViableAltException e = new NoViableAltException("",0,0,input);
        errMgr.lexerError(input.getSourceName(), "invalid escaped char: '"+str(c)+"'", templateToken, e);
        consume();
        match(delimiterStopChar);
        return SKIP;
    }

    consume(); // the escaped letter
    Token escaped = newToken(TEXT, text, input.getCharPositionInLine()-2);
    match(delimiterStopChar);
    return escaped;
}
/** Scan the four hex digits of a unicode escape; the current character is
 *  the {@code u}. Each digit that is not a hex digit is reported to the
 *  error manager.
 *
 *  @return a TEXT token holding the decoded character, or {@link #SKIP} when
 *          any digit was invalid (the escape is still consumed up to and
 *          including the stop delimiter, so lexing can continue)
 */
Token UNICODE() {
    consume(); // skip the 'u'
    char[] chars = new char[4];
    boolean allHex = true;
    for (int i = 0; i < chars.length; i++) {
        if ( !isUnicodeLetter(c) ) {
            allHex = false;
            NoViableAltException e = new NoViableAltException("",0,0,input);
            errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[i] = c;
        // leave the last digit unconsumed so the token coordinates computed
        // below are the same as they have always been
        if ( i < chars.length-1 ) consume();
    }
    if ( !allHex ) {
        // the error is already reported; parsing the digits would only throw
        consume();
        match(delimiterStopChar);
        return SKIP;
    }
    char uc = (char)Integer.parseInt(new String(chars), 16);
    Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine()-6);
    consume(); // the fourth digit
    match(delimiterStopChar);
    return t;
}
/** Scan a run of plain text. The run ends at end of input, at the start
 *  delimiter, at a carriage return or newline, or at a '}' while inside a
 *  subtemplate. A backslash followed by a backslash, by the start delimiter
 *  or by '}' is dropped and the following character taken literally; any
 *  other backslash is kept as is.
 *
 *  @return a TEXT token; it carries its own text only when an escape was
 *          rewritten
 */
Token mTEXT() {
    boolean rewroteEscape = false;
    StringBuilder text = new StringBuilder();
    while ( c != EOF && c != delimiterStartChar ) {
        boolean endOfLine = c=='\r' || c=='\n';
        boolean closesSubtemplate = c=='}' && subtemplateDepth>0;
        if ( endOfLine || closesSubtemplate ) break;

        if ( c=='\\' ) {
            int next = input.LA(2);
            if ( next=='\\' ) { // convert \\ to \
                consume();
                consume();
                text.append('\\');
                rewroteEscape = true;
                continue;
            }
            if ( next==delimiterStartChar || next=='}' ) {
                rewroteEscape = true;
                consume(); // toss out \ char; the escaped char is appended below
            }
        }
        text.append(c);
        consume();
    }
    return rewroteEscape ? newToken(TEXT, text.toString()) : newToken(TEXT);
}
/**
 * Scan an identifier and return it as an ID token.
 *
 * ID : ('a'..'z'|'A'..'Z'|'_'|'/')
 * ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
 * ;
 *
 * The current character is consumed without being checked; after that,
 * characters are consumed for as long as {@link #isIDLetter} accepts them.
 */
Token mID() {
    // called from subTemplate; so keep resetting position during speculation
    startCharPositionInLine = input.getCharPositionInLine();
    startLine = input.getLine();
    startCharIndex = input.index();
    do {
        consume();
    } while ( isIDLetter(c) );
    return newToken(ID);
}
/**
 * Scan a double-quoted string; the current character is the opening quote.
 *
 * STRING : '"'
 * ( '\\' '"'
 * | '\\' ~'"'
 * | ~('\\'|'"')
 * )*
 * '"'
 * ;
 *
 * The escapes backslash-n, backslash-r and backslash-t become newline,
 * carriage return and tab; any other escaped character stands for itself.
 * Reaching end of input before the closing quote is reported as an error.
 * The returned STRING token carries its own text, quotes included, only
 * when an escape was seen.
 */
Token mSTRING() {
    boolean sawEscape = false;
    StringBuilder text = new StringBuilder();
    text.append(c); // opening quote
    consume();
    while ( c != '"' ) {
        if ( c != '\\' ) {
            text.append(c);
            consume();
            if ( c==EOF ) {
                RecognitionException re =
                    new MismatchedTokenException((int)'"', input);
                re.line = input.getLine();
                re.charPositionInLine = input.getCharPositionInLine();
                errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re);
                break;
            }
            continue;
        }
        sawEscape = true;
        consume(); // the backslash
        char unescaped = c=='n' ? '\n'
                       : c=='r' ? '\r'
                       : c=='t' ? '\t'
                       : c;
        text.append(unescaped);
        consume();
    }
    text.append(c); // closing quote
    consume();
    return sawEscape ? newToken(STRING, text.toString()) : newToken(STRING);
}
/** Consume spaces, tabs, newlines and carriage returns until some other character is current. */
void WS() {
    while ( isWS(c) ) {
        consume();
    }
}
/** Scan a comment; the current character is the '!' that follows the start
 *  delimiter. Everything up to and including the closing '!' plus stop
 *  delimiter is consumed. Reaching end of input first is reported as an
 *  error.
 *
 *  @return a COMMENT token
 */
Token COMMENT() {
    match('!');
    for (;;) {
        boolean atTerminator = c=='!' && input.LA(2)==delimiterStopChar;
        if ( atTerminator ) break;
        if ( c==EOF ) {
            RecognitionException re =
                new MismatchedTokenException((int)'!', input);
            re.line = input.getLine();
            re.charPositionInLine = input.getCharPositionInLine();
            String msg = "Nonterminated comment starting at " +
                startLine+":"+startCharPositionInLine+": '!"+
                delimiterStopChar+"' missing";
            errMgr.lexerError(input.getSourceName(), msg, templateToken, re);
            break;
        }
        consume();
    }
    consume(); // grab !
    consume(); // grab the stop delimiter
    return newToken(COMMENT);
}
/** Finish a line-continuation escape: match the second backslash and the
 *  stop delimiter, then swallow trailing spaces/tabs, the line ending, and
 *  the indentation of the following line. A missing line ending at end of
 *  input is reported as an error.
 */
void LINEBREAK() {
    match('\\'); // only kill 2nd \ as ESCAPE() kills first one
    match(delimiterStopChar);
    while ( c==' ' || c=='\t' ) consume(); // scarf WS after the escape
    if ( c!=EOF ) {
        if ( c=='\r' ) consume();
        match('\n');
        while ( c==' ' || c=='\t' ) consume(); // scarf any indent
        return;
    }
    RecognitionException re = new RecognitionException(input);
    re.line = input.getLine();
    re.charPositionInLine = input.getCharPositionInLine();
    errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>",
                      templateToken, re);
}
/** Whether {@code c} may begin an identifier; currently the same set as {@link #isIDLetter}. */
public static boolean isIDStartLetter(char c) {
    return isIDLetter(c);
}

/** Whether {@code c} is an ASCII letter, an ASCII digit, an underscore or a slash. */
public static boolean isIDLetter(char c) {
    if ( c=='_' || c=='/' ) return true;
    if ( '0'<=c && c<='9' ) return true;
    boolean lower = 'a'<=c && c<='z';
    boolean upper = 'A'<=c && c<='Z';
    return lower || upper;
}
/** Whether {@code c} is a space, tab, newline or carriage return. */
public static boolean isWS(char c) {
    switch ( c ) {
        case ' ' :
        case '\t' :
        case '\n' :
        case '\r' :
            return true;
        default :
            return false;
    }
}
/** Whether {@code c} is an ASCII hexadecimal digit: 0-9, a-f or A-F. */
public static boolean isUnicodeLetter(char c) {
    final String hexDigits = "0123456789abcdefABCDEF";
    return hexDigits.indexOf(c) >= 0;
}
/** Build a token of type {@code ttype} covering the input from the recorded
 *  token start up to the character just before the current position, and
 *  stamped with the recorded start line and column.
 */
public Token newToken(int ttype) {
    int stopIndex = input.index()-1;
    STToken tok = new STToken(input, ttype, startCharIndex, stopIndex);
    tok.setCharPositionInLine(startCharPositionInLine);
    tok.setLine(startLine);
    return tok;
}
/** Build a one-character token of type {@code ttype} for the character just
 *  consumed, using the input's current line and the column before the
 *  current one.
 */
public Token newTokenFromPreviousChar(int ttype) {
    int prevIndex = input.index()-1;
    STToken tok = new STToken(input, ttype, prevIndex, prevIndex);
    tok.setCharPositionInLine(input.getCharPositionInLine()-1);
    tok.setLine(input.getLine());
    return tok;
}
/** Build a token of type {@code ttype} that carries {@code text} as its own
 *  text. It spans from the recorded token start to the character before the
 *  current position, takes the input's current line, and uses {@code pos}
 *  as its column.
 */
public Token newToken(int ttype, String text, int pos) {
    STToken tok = new STToken(ttype, text);
    int stopIndex = input.index()-1;
    tok.setStartIndex(startCharIndex);
    tok.setStopIndex(stopIndex);
    tok.setCharPositionInLine(pos);
    tok.setLine(input.getLine());
    return tok;
}
/** Build a token of type {@code ttype} that carries {@code text} as its own
 *  text. It spans from the recorded token start to the character before the
 *  current position and is stamped with the recorded start line and column.
 */
public Token newToken(int ttype, String text) {
    STToken tok = new STToken(ttype, text);
    int stopIndex = input.index()-1;
    tok.setStartIndex(startCharIndex);
    tok.setStopIndex(stopIndex);
    tok.setCharPositionInLine(startCharPositionInLine);
    tok.setLine(startLine);
    return tok;
}
// public String getErrorHeader() {
// return startLine+":"+startCharPositionInLine;
// }
//
/** Return a fixed placeholder; this lexer does not report a real source name here. */
@Override
public String getSourceName() {
    final String placeholder = "no idea";
    return placeholder;
}
/** Render a character for use in an error message.
 *
 *  @param c a character value, either the lexer's {@code char} {@link #EOF}
 *           sentinel or the stream-level end-of-input value
 *  @return {@code "<EOF>"} for either end-of-input value, otherwise the
 *          one-character string for {@code c}
 */
public static String str(int c) {
    // accept both the char sentinel and the raw int end-of-input value
    if ( c==EOF || c==CharStream.EOF ) return "<EOF>";
    return String.valueOf((char)c);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy