commonMain.org.antlr.v4.kotlinruntime.Lexer.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of antlr-kotlin-runtime-jvm Show documentation
Show all versions of antlr-kotlin-runtime-jvm Show documentation
Kotlin multiplatform port of ANTLR
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.kotlinruntime
import com.strumenta.kotlinmultiplatform.asCharArray
import com.strumenta.kotlinmultiplatform.outMessage
import org.antlr.v4.kotlinruntime.atn.LexerATNSimulator
import org.antlr.v4.kotlinruntime.misc.IntegerStack
import org.antlr.v4.kotlinruntime.misc.Interval
/** A lexer is recognizer that draws input symbols from a character stream.
* lexer grammars result in a subclass of this object. A Lexer object
* uses simplified match() and error recovery mechanisms in the interest
* of speed.
*/
abstract class Lexer : Recognizer, TokenSource {
// /** Set the char stream and reset the lexer */
// fun setInputStream(input: IntStream) {
// this.inputStream = null
// this._tokenFactorySourcePair = Pair(this, inputStream)
// reset()
// this.inputStream = input as CharStream
// this._tokenFactorySourcePair = Pair(this, readInputStream())
// }
override fun assignInputStream(newValue: IntStream?) {
assignInputStream(newValue as CharStream?)
}
open fun assignInputStream(input: CharStream?) {
this.inputStream = null
this._tokenFactorySourcePair = Pair(this, null)
reset()
this.inputStream = input as CharStream
this._tokenFactorySourcePair = Pair(this, readInputStream())
}
override fun readInputStream(): CharStream? {
return this.inputStream as CharStream?
}
protected var _tokenFactorySourcePair: Pair? = null
/** How to create token objects */
override var tokenFactory: TokenFactory = CommonTokenFactory.DEFAULT
/** The goal of all lexer rules/methods is to create a token object.
* This is an instance variable as multiple rules may collaborate to
* create a single token. nextToken will return this object after
* matching lexer rule(s). If you subclass to allow multiple token
* emissions, then set this to the last token to be matched or
* something nonnull so that the auto token emit mechanism will not
* emit another token.
*/
/** Override if emitting multiple tokens. */
var token: Token? = null
/** What character index in the stream did the current token start at?
* Needed, for example, to get the text for current token. Set at
* the start of nextToken.
*/
var _tokenStartCharIndex = -1
/** The line on which the first character of the token resides */
var _tokenStartLine: Int = 0
/** The character position of first character within the line */
var _tokenStartCharPositionInLine: Int = 0
/** Once we see EOF on char stream, next token will be EOF.
* If you have DONE : EOF ; then you see DONE EOF.
*/
var _hitEOF: Boolean = false
/** The channel number for the current token */
var channel: Int = 0
/** The token type for the current token */
var type: Int = 0
val _modeStack = IntegerStack()
var _mode = Lexer.DEFAULT_MODE
/** You can set the text for the current token to override what is in
* the input char buffer. Use setText() or can set this instance var.
*/
var _text: String? = null
override val sourceName: String
get() = inputStream!!.sourceName!!
override var line: Int
get() = interpreter!!.line
set(line) {
interpreter!!.line = line
}
override var charPositionInLine: Int
get() = interpreter!!.charPositionInLine
set(charPositionInLine) {
interpreter!!.charPositionInLine = charPositionInLine
}
/** What is the index of the current character of lookahead? */
val charIndex: Int
get() = inputStream!!.index()
/** Return the text matched so far for the current token or any
* text override.
*/
/** Set the complete text of this token; it wipes any previous
* changes to the text.
*/
var text: String
get() = if (_text != null) {
_text!!
} else interpreter!!.getText(readInputStream()!!)
set(text) {
this._text = text
}
open val channelNames: Array?
get() = null
open val modeNames: Array?
get() = null
/** Used to print out token names like ID during debugging and
* error reporting. The generated parsers implement a method
* that overrides this to point to their String[] tokenNames.
*/
override val tokenNames: Array?
@Deprecated("")
get() = null
/** Return a list of all Token objects in input char stream.
* Forces load of all tokens. Does not include EOF token.
*/
val allTokens: List
get() {
val tokens = ArrayList()
var t = nextToken()
while (t!!.type != Token.EOF) {
tokens.add(t)
t = nextToken()
}
return tokens
}
constructor() {}
constructor(input: CharStream) {
this.inputStream = input
this._tokenFactorySourcePair = Pair(this, input)
}
open fun reset() {
// wack Lexer state variables
if (inputStream != null) {
inputStream!!.seek(0) // rewind the input
}
token = null
type = Token.INVALID_TYPE
channel = Token.DEFAULT_CHANNEL
_tokenStartCharIndex = -1
_tokenStartCharPositionInLine = -1
_tokenStartLine = -1
_text = null
_hitEOF = false
_mode = Lexer.DEFAULT_MODE
_modeStack.clear()
interpreter!!.reset()
}
/** Return a token from this source; i.e., match a token on the char
* stream.
*/
override fun nextToken(): Token {
if (inputStream == null) {
throw IllegalStateException("nextToken requires a non-null input stream.")
}
// Mark start location in char stream so unbuffered streams are
// guaranteed at least have text of current token
val tokenStartMarker = inputStream!!.mark()
try {
outer@ while (true) {
if (_hitEOF) {
emitEOF()
return token!!
}
token = null
channel = Token.DEFAULT_CHANNEL
_tokenStartCharIndex = inputStream!!.index()
_tokenStartCharPositionInLine = interpreter!!.charPositionInLine
_tokenStartLine = interpreter!!.line
_text = null
do {
type = Token.INVALID_TYPE
// println("nextToken line "+_tokenStartLine+" at "+inputStream!!.LA(1)+
// " in mode "+_mode+
// " at index "+ inputStream!!.index())
var ttype: Int
try {
ttype = interpreter!!.match(readInputStream()!!, _mode)
} catch (e: LexerNoViableAltException) {
notifyListeners(e) // report error
recover(e)
ttype = SKIP
}
if (inputStream!!.LA(1) == IntStream.EOF) {
_hitEOF = true
}
if (type == Token.INVALID_TYPE) type = ttype
if (type == SKIP) {
continue@outer
}
} while (type == MORE)
if (token == null) emit()
return token!!
}
} finally {
// make sure we release marker after match or
// unbuffered char stream will keep buffering
inputStream!!.release(tokenStartMarker)
}
}
/** Instruct the lexer to skip creating a token for current lexer rule
* and look for another token. nextToken() knows to keep looking when
* a lexer rule finishes with token set to SKIP_TOKEN. Recall that
* if token==null at end of any token rule, it creates one for you
* and emits it.
*/
open fun skip() {
type = SKIP
}
open fun more() {
type = MORE
}
open fun mode(m: Int) {
_mode = m
}
open fun pushMode(m: Int) {
if (LexerATNSimulator.debug) println("pushMode " + m)
_modeStack.push(_mode)
mode(m)
}
open fun popMode(): Int {
if (_modeStack.isEmpty) throw RuntimeException()
if (LexerATNSimulator.debug) outMessage("popMode back to " + _modeStack.peek())
mode(_modeStack.pop())
return _mode
}
/** By default does not support multiple emits per nextToken invocation
* for efficiency reasons. Subclass and override this method, nextToken,
* and getToken (to push tokens into a list and pull from that list
* rather than a single variable as this implementation does).
*/
open fun emit(token: Token) {
//System.err.println("emit "+token);
this.token = token
}
/** The standard method called to automatically emit a token at the
* outermost lexical rule. The token object should point into the
* char buffer start..stop. If there is a text override in 'text',
* use that to set the token's text. Override this method to emit
* custom Token objects or provide a new factory.
*/
open fun emit(): Token {
val t = tokenFactory.create(_tokenFactorySourcePair!!, type, _text, channel, _tokenStartCharIndex, charIndex - 1,
_tokenStartLine, _tokenStartCharPositionInLine)
emit(t)
return t
}
open fun emitEOF(): Token {
val cpos = charPositionInLine
val line = line
val eof = tokenFactory.create(_tokenFactorySourcePair!!, Token.EOF, null, Token.DEFAULT_CHANNEL, inputStream!!.index(), inputStream!!.index() - 1,
line, cpos)
emit(eof)
return eof
}
open fun recover(e: LexerNoViableAltException) {
if (inputStream!!.LA(1) != IntStream.EOF) {
// skip a char and try again
interpreter!!.consume(readInputStream()!!)
}
}
open fun notifyListeners(e: LexerNoViableAltException) {
val text = readInputStream()!!.getText(Interval.of(_tokenStartCharIndex, inputStream!!.index()))
val msg = "token recognition error at: '" + getErrorDisplay(text) + "'"
val listener = errorListenerDispatch
listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e)
}
open fun getErrorDisplay(s: String): String {
val buf = StringBuilder()
for (c in s.asCharArray()) {
buf.append(getErrorDisplay(c.toInt()))
}
return buf.toString()
}
open fun getErrorDisplay(c: Int): String {
var s = c.toChar().toString()
when (c) {
Token.EOF -> s = ""
'\n'.toInt() -> s = "\\n"
'\t'.toInt() -> s = "\\t"
'\r'.toInt() -> s = "\\r"
}
return s
}
open fun getCharErrorDisplay(c: Int): String {
val s = getErrorDisplay(c)
return "'$s'"
}
/** Lexers can normally match any char in it's vocabulary after matching
* a token, so do the easy thing and just kill a character and hope
* it all works out. You can instead use the rule invocation stack
* to do sophisticated error recovery if you are in a fragment rule.
*/
open fun recover(re: RecognitionException) {
//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
//re.printStackTrace();
// TODO: Do we lose character or line position information?
inputStream!!.consume()
}
companion object {
val DEFAULT_MODE = 0
val MORE = -2
val SKIP = -3
val DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL
val HIDDEN = Token.HIDDEN_CHANNEL
val MIN_CHAR_VALUE = 0x0000
val MAX_CHAR_VALUE = 0x10FFFF
}
}