org.antlr.v4.runtime.Lexer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of antlr4-runtime Show documentation
Show all versions of antlr4-runtime Show documentation
The ANTLR 4 Runtime (Optimized)
/*
* Copyright (c) 2012 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD-3-Clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import org.antlr.v4.runtime.atn.LexerATNSimulator;
import org.antlr.v4.runtime.misc.IntegerStack;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Tuple;
import org.antlr.v4.runtime.misc.Tuple2;
import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;
/** A lexer is recognizer that draws input symbols from a character stream.
* lexer grammars result in a subclass of this object. A Lexer object
* uses simplified match() and error recovery mechanisms in the interest
* of speed.
*/
public abstract class Lexer extends Recognizer
implements TokenSource
{
public static final int DEFAULT_MODE = 0;
public static final int MORE = -2;
public static final int SKIP = -3;
public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
public static final int HIDDEN = Token.HIDDEN_CHANNEL;
public static final int MIN_CHAR_VALUE = '\u0000';
public static final int MAX_CHAR_VALUE = '\uFFFE';
public CharStream _input;
protected Tuple2 extends TokenSource, CharStream> _tokenFactorySourcePair;
/** How to create token objects */
protected TokenFactory _factory = CommonTokenFactory.DEFAULT;
/** The goal of all lexer rules/methods is to create a token object.
* This is an instance variable as multiple rules may collaborate to
* create a single token. nextToken will return this object after
* matching lexer rule(s). If you subclass to allow multiple token
* emissions, then set this to the last token to be matched or
* something nonnull so that the auto token emit mechanism will not
* emit another token.
*/
public Token _token;
/** What character index in the stream did the current token start at?
* Needed, for example, to get the text for current token. Set at
* the start of nextToken.
*/
public int _tokenStartCharIndex = -1;
/** The line on which the first character of the token resides */
public int _tokenStartLine;
/** The character position of first character within the line */
public int _tokenStartCharPositionInLine;
/** Once we see EOF on char stream, next token will be EOF.
* If you have DONE : EOF ; then you see DONE EOF.
*/
public boolean _hitEOF;
/** The channel number for the current token */
public int _channel;
/** The token type for the current token */
public int _type;
public final IntegerStack _modeStack = new IntegerStack();
public int _mode = Lexer.DEFAULT_MODE;
/** You can set the text for the current token to override what is in
* the input char buffer. Use setText() or can set this instance var.
*/
public String _text;
public Lexer(CharStream input) {
this._input = input;
this._tokenFactorySourcePair = Tuple.create(this, input);
}
public void reset() {
// wack Lexer state variables
if ( _input !=null ) {
_input.seek(0); // rewind the input
}
_token = null;
_type = Token.INVALID_TYPE;
_channel = Token.DEFAULT_CHANNEL;
_tokenStartCharIndex = -1;
_tokenStartCharPositionInLine = -1;
_tokenStartLine = -1;
_text = null;
_hitEOF = false;
_mode = Lexer.DEFAULT_MODE;
_modeStack.clear();
getInterpreter().reset();
}
/** Return a token from this source; i.e., match a token on the char
* stream.
*/
@Override
public Token nextToken() {
if (_input == null) {
throw new IllegalStateException("nextToken requires a non-null input stream.");
}
// Mark start location in char stream so unbuffered streams are
// guaranteed at least have text of current token
int tokenStartMarker = _input.mark();
try{
outer:
while (true) {
if (_hitEOF) {
emitEOF();
return _token;
}
_token = null;
_channel = Token.DEFAULT_CHANNEL;
_tokenStartCharIndex = _input.index();
_tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine();
_tokenStartLine = getInterpreter().getLine();
_text = null;
do {
_type = Token.INVALID_TYPE;
// System.out.println("nextToken line "+tokenStartLine+" at "+((char)input.LA(1))+
// " in mode "+mode+
// " at index "+input.index());
int ttype;
try {
ttype = getInterpreter().match(_input, _mode);
}
catch (LexerNoViableAltException e) {
notifyListeners(e); // report error
recover(e);
ttype = SKIP;
}
if ( _input.LA(1)==IntStream.EOF ) {
_hitEOF = true;
}
if ( _type == Token.INVALID_TYPE ) _type = ttype;
if ( _type ==SKIP ) {
continue outer;
}
} while ( _type ==MORE );
if ( _token == null ) emit();
return _token;
}
}
finally {
// make sure we release marker after match or
// unbuffered char stream will keep buffering
_input.release(tokenStartMarker);
}
}
/** Instruct the lexer to skip creating a token for current lexer rule
* and look for another token. nextToken() knows to keep looking when
* a lexer rule finishes with token set to SKIP_TOKEN. Recall that
* if token==null at end of any token rule, it creates one for you
* and emits it.
*/
public void skip() {
_type = SKIP;
}
public void more() {
_type = MORE;
}
public void mode(int m) {
_mode = m;
}
public void pushMode(int m) {
if ( LexerATNSimulator.debug ) System.out.println("pushMode "+m);
_modeStack.push(_mode);
mode(m);
}
public int popMode() {
if ( _modeStack.isEmpty() ) throw new EmptyStackException();
if ( LexerATNSimulator.debug ) System.out.println("popMode back to "+ _modeStack.peek());
mode( _modeStack.pop() );
return _mode;
}
@Override
public TokenFactory getTokenFactory() {
return _factory;
}
@Override
public void setTokenFactory(TokenFactory factory) {
this._factory = factory;
}
/** Set the char stream and reset the lexer */
public void setInputStream(CharStream input) {
this._input = null;
this._tokenFactorySourcePair = Tuple.create(this, _input);
reset();
this._input = input;
this._tokenFactorySourcePair = Tuple.create(this, _input);
}
@Override
public String getSourceName() {
return _input.getSourceName();
}
@Override
public CharStream getInputStream() {
return _input;
}
/** By default does not support multiple emits per nextToken invocation
* for efficiency reasons. Subclass and override this method, nextToken,
* and getToken (to push tokens into a list and pull from that list
* rather than a single variable as this implementation does).
*/
public void emit(Token token) {
//System.err.println("emit "+token);
this._token = token;
}
/** The standard method called to automatically emit a token at the
* outermost lexical rule. The token object should point into the
* char buffer start..stop. If there is a text override in 'text',
* use that to set the token's text. Override this method to emit
* custom Token objects or provide a new factory.
*/
public Token emit() {
Token t = _factory.create(_tokenFactorySourcePair, _type, _text, _channel, _tokenStartCharIndex, getCharIndex()-1,
_tokenStartLine, _tokenStartCharPositionInLine);
emit(t);
return t;
}
public Token emitEOF() {
int cpos = getCharPositionInLine();
int line = getLine();
Token eof = _factory.create(_tokenFactorySourcePair, Token.EOF, null, Token.DEFAULT_CHANNEL, _input.index(), _input.index()-1,
line, cpos);
emit(eof);
return eof;
}
@Override
public int getLine() {
return getInterpreter().getLine();
}
@Override
public int getCharPositionInLine() {
return getInterpreter().getCharPositionInLine();
}
public void setLine(int line) {
getInterpreter().setLine(line);
}
public void setCharPositionInLine(int charPositionInLine) {
getInterpreter().setCharPositionInLine(charPositionInLine);
}
/** What is the index of the current character of lookahead? */
public int getCharIndex() {
return _input.index();
}
/** Return the text matched so far for the current token or any
* text override.
*/
public String getText() {
if ( _text !=null ) {
return _text;
}
return getInterpreter().getText(_input);
}
/** Set the complete text of this token; it wipes any previous
* changes to the text.
*/
public void setText(String text) {
this._text = text;
}
/** Override if emitting multiple tokens. */
public Token getToken() { return _token; }
public void setToken(Token _token) {
this._token = _token;
}
public void setType(int ttype) {
_type = ttype;
}
public int getType() {
return _type;
}
public void setChannel(int channel) {
_channel = channel;
}
public int getChannel() {
return _channel;
}
public String[] getModeNames() {
return null;
}
/** Used to print out token names like ID during debugging and
* error reporting. The generated parsers implement a method
* that overrides this to point to their String[] tokenNames.
*/
@Override
@Deprecated
public String[] getTokenNames() {
return null;
}
/** Return a list of all Token objects in input char stream.
* Forces load of all tokens. Does not include EOF token.
*/
public List extends Token> getAllTokens() {
List tokens = new ArrayList();
Token t = nextToken();
while ( t.getType()!=Token.EOF ) {
tokens.add(t);
t = nextToken();
}
return tokens;
}
public void recover(LexerNoViableAltException e) {
if (_input.LA(1) != IntStream.EOF) {
// skip a char and try again
getInterpreter().consume(_input);
}
}
public void notifyListeners(LexerNoViableAltException e) {
String text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
String msg = "token recognition error at: '"+ getErrorDisplay(text) + "'";
ANTLRErrorListener super Integer> listener = getErrorListenerDispatch();
listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
}
public String getErrorDisplay(String s) {
StringBuilder buf = new StringBuilder();
for (char c : s.toCharArray()) {
buf.append(getErrorDisplay(c));
}
return buf.toString();
}
public String getErrorDisplay(int c) {
String s = String.valueOf((char)c);
switch ( c ) {
case Token.EOF :
s = "";
break;
case '\n' :
s = "\\n";
break;
case '\t' :
s = "\\t";
break;
case '\r' :
s = "\\r";
break;
}
return s;
}
public String getCharErrorDisplay(int c) {
String s = getErrorDisplay(c);
return "'"+s+"'";
}
/** Lexers can normally match any char in it's vocabulary after matching
* a token, so do the easy thing and just kill a character and hope
* it all works out. You can instead use the rule invocation stack
* to do sophisticated error recovery if you are in a fragment rule.
*/
public void recover(RecognitionException re) {
//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
//re.printStackTrace();
// TODO: Do we lose character or line position information?
_input.consume();
}
}