![JAR search and dependency download from the Maven repository](/logo.png)
org.antlr.v4.parse.ANTLRLexer.g Maven / Gradle / Ivy
/*
* Copyright (c) 2012 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD-3-Clause license that
* can be found in the LICENSE.txt file in the project root.
*/
// File : A3Lexer.g
// Author : Jim Idle ([email protected])
// Copyright : Free BSD - See @header clause below
// Version : First implemented as part of ANTLR 3.2 this is the self
// hosting ANTLR 3 Lexer.
//
// Description
// -----------
// This is the definitive lexer grammar for parsing ANTLR V3.x.x grammars. All other
// gramnmars are derived from this grammar via source code control integration (perforce)
// or by the gdiff tool.
//
// This grammar and its associated grmmmars A3Parser.g and A3Walker.g exhibit the following
// traits, which are recommended for all production quality grammars:
//
// 1) They are separate grammars, not composite grammars;
// 2) They implement all supporting methods in a superclass (at least this is recommended
// for language targets that support inheritence;
// 3) All errors are pushed as far down the parsing chain as possible, which means
// that the lexer tries to defer error reporting to the parser, and the parser
// tries to defer error reporting to a semantic phase consisting of a single
// walk of the AST. The reason for this is that the error messages produced
// from later phases of the parse will generally have better context and so
// be more useful to the end user. Consider the message: "Syntax error at 'options'"
// vs: "You cannot specify two options{} sections in a single grammar file".
// 4) The lexer is 'programmed' to catch common mistakes such as unterminated literals
// and report them specifically and not just issue confusing lexer mismatch errors.
//
/** Read in an ANTLR grammar and build an AST. Try not to do
* any actions, just build the tree.
*
* The phases are:
*
* A3Lexer.g (this file)
* A3Parser.g
* A3Verify.g (derived from A3Walker.g)
* assign.types.g
* define.g
* buildnfa.g
* antlr.print.g (optional)
* codegen.g
*
* Terence Parr
* University of San Francisco
* 2005
* Jim Idle (this v3 grammar)
* Temporal Wave LLC
* 2009
*/
lexer grammar ANTLRLexer;
// ==============================================================================
// Note that while this grammar does not care about order of constructs
// that don't really matter, such as options before @header etc, it must first
// be parsed by the original v2 parser, before it replaces it. That parser does
// care about order of structures. Hence we are constrained by the v2 parser
// for at least the first bootstrap release that causes this parser to replace
// the v2 version.
// ==============================================================================
// -------
// Options
//
// V3 option directives to tell the tool what we are asking of it for this
// grammar.
//
options {
// Target language is Java, which is the default but being specific
// here as this grammar is also meant as a good example grammar for
// for users.
//
language = Java;
// The super class that this lexer should expect to inherit from, and
// which contains any and all support routines for the lexer. This is
// commented out in this baseline (definitive or normative grammar)
// - see the ANTLR tool implementation for hints on how to use the super
// class
//
//superclass = AbstractA3Lexer;
}
tokens { SEMPRED; TOKEN_REF; RULE_REF; LEXER_CHAR_SET; ARG_ACTION; }
// Include the copyright in this source and also the generated source
//
@lexer::header {
/*
* Copyright (c) 2012 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD-3-Clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.parse;
import org.antlr.v4.tool.*;
import org.antlr.v4.runtime.misc.Interval;
}
@members {
public static final int COMMENTS_CHANNEL = 2;
public CommonTokenStream tokens; // track stream we push to; need for context info
public boolean isLexerRule = false;
public void grammarError(ErrorType etype, org.antlr.runtime.Token token, Object... args) { }
/** scan backwards from current point in this.tokens list
* looking for the start of the rule or subrule.
* Return token or null if for some reason we can't find the start.
*/
public Token getRuleOrSubruleStartToken() {
if ( tokens==null ) return null;
int i = tokens.index();
int n = tokens.size();
if ( i>=n ) i = n-1; // seems index == n as we lex
while ( i>=0 && i ' $ANTLR' SRC
| ~(NLCHARS)*
)
| // Multi-line comment, which may be a documentation comment
// if it starts /** (note that we protect against accidentaly
// recognizing a comment /**/ as a documentation comment
//
'*' (
{ input.LA(2) != '/'}?=> '*' { $type = DOC_COMMENT; }
| { true }?=> // Required to cover all alts with predicates
)
// Should we support embedded multiline comments here?
//
(
// Pick out end of multiline comment and exit the loop
// if we find it.
//
{ !(input.LA(1) == '*' && input.LA(2) == '/') }?
// Anything else other than the non-greedy match of
// the comment close sequence
//
.
)*
(
// Look for the comment terminator, but if it is accidentally
// unterminated, then we will hit EOF, which will trigger the
// epsilon alt and hence we can issue an error message relative
// to the start of the unterminated multi-line comment
//
'*/'
| // Unterminated comment!
//
{
// ErrorManager.msg(Msg.UNTERMINATED_DOC_COMMENT, startLine, offset, $pos, startLine, offset, $pos, (Object)null);
}
)
| // There was nothing that made sense following the opening '/' and so
// we issue an error regarding the malformed comment
//
{
// TODO: Insert error message relative to comment start
//
}
)
{
// We do not wish to pass the comments in to the parser. If you are
// writing a formatter then you will want to preserve the comments off
// channel, but could just skip and save token space if not.
//
$channel=COMMENTS_CHANNEL;
}
;
ARG_OR_CHARSET
options {k=1;}
: {isLexerRule}?=> LEXER_CHAR_SET {$type=LEXER_CHAR_SET;}
| {!isLexerRule}?=> ARG_ACTION
{
$type=ARG_ACTION;
// Set the token text to our gathered string minus outer [ ]
String t = $text;
t = t.substring(1,t.length()-1);
setText(t);
}
;
fragment
LEXER_CHAR_SET
: '['
( '\\' ~('\r'|'\n')
| ~('\r'|'\n'|'\\'|']')
)*
']'
;
// --------------
// Argument specs
//
// Certain argument lists, such as those specifying call parameters
// to a rule invocation, or input parameters to a rule specification
// are contained within square brackets. In the lexer we consume them
// all at once and sort them out later in the grammar analysis.
//
fragment
ARG_ACTION
: '['
(
ARG_ACTION
| ('"')=>ACTION_STRING_LITERAL
| ('\'')=>ACTION_CHAR_LITERAL
| ~('['|']')
)*
']'
;
// -------
// Actions
//
// Other than making sure to distinguish between { and } embedded
// within what we have assumed to be literals in the action code, the
// job of the lexer is merely to gather the code within the action
// (delimited by {}) and pass it to the parser as a single token.
// We know that this token will be asked for its text somewhere
// in the upcoming parse, so setting the text here to exclude
// the delimiting {} is no additional overhead.
//
ACTION
: NESTED_ACTION
( '?' {$type = SEMPRED;}
( (WSNLCHARS* '=>') => WSNLCHARS* '=>' // v3 gated sempred
{
Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
t.setLine(state.tokenStartLine);
t.setText(state.text);
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
grammarError(ErrorType.V3_GATED_SEMPRED, t);
}
)?
)?
;
// ----------------
// Action structure
//
// Many language targets use {} as block delimiters and so we
// must recursively match {} delimited blocks to balance the
// braces. Additionally, we must make some assumptions about
// literal string representation in the target language. We assume
// that they are delimited by ' or " and so consume these
// in their own alts so as not to inadvertantly match {}.
// This rule calls itself on matching a {
//
fragment
NESTED_ACTION
@init {
// Record the start line and offsets as if we need to report an
// unterminated block, then we want to show the start of the comment
// we think is broken, not the end, where people will have to try and work
// it out themselves.
//
int startLine = getLine();
int offset = getCharPositionInLine();
}
: // Action and other blocks start with opening {
//
'{'
(
// And now we can match one of a number of embedded
// elements within the action until we find a
// } that balances the opening {. If we do not find
// the balanced } then we will hit EOF and can issue
// an error message about the brace that we belive to
// be mismatched. This won't be foolproof but we will
// be able to at least report an error against the
// opening brace that we feel is in error and this will
// guide the user to the correction as best we can.
//
// An embedded {} block
//
NESTED_ACTION
| // What appears to be a literal
//
ACTION_CHAR_LITERAL
| // We have assumed that the target language has C/Java
// type comments.
//
COMMENT
| // What appears to be a literal
//
ACTION_STRING_LITERAL
| // What appears to be an escape sequence
//
ACTION_ESC
| // Some other single character that is not
// handled above
//
~('\\'|'"'|'\''|'/'|'{'|'}')
)*
(
// Correctly balanced closing brace
//
'}'
| // Looks like have an imblanced {} block, report
// with respect to the opening brace.
//
{
// TODO: Report imbalanced {}
System.out.println("Block starting at line " + startLine + " offset " + (offset+1) + " contains imbalanced {} or is missing a }");
}
)
;
// Keywords
// --------
// keywords used to specify ANTLR v3 grammars. Keywords may not be used as
// labels for rules or in any other context where they would be ambiguous
// with the keyword vs some other identifier
// OPTIONS, TOKENS, and CHANNELS must also consume the opening brace that captures
// their option block, as this is the easiest way to parse it separate
// to an ACTION block, despite it using the same {} delimiters.
//
OPTIONS : 'options' WSNLCHARS* '{' ;
TOKENS_SPEC : 'tokens' WSNLCHARS* '{' ;
CHANNELS : 'channels' WSNLCHARS* '{' ;
IMPORT : 'import' ;
FRAGMENT : 'fragment' ;
LEXER : 'lexer' ;
PARSER : 'parser' ;
GRAMMAR : 'grammar' ;
TREE_GRAMMAR : 'tree' WSNLCHARS* 'grammar' ;
PROTECTED : 'protected' ;
PUBLIC : 'public' ;
PRIVATE : 'private' ;
RETURNS : 'returns' ;
LOCALS : 'locals' ;
THROWS : 'throws' ;
CATCH : 'catch' ;
FINALLY : 'finally' ;
MODE : 'mode' ;
// -----------
// Punctuation
//
// Character sequences used as separators, delimters, operators, etc
//
COLON : ':'
{
// scan backwards, looking for a RULE_REF or TOKEN_REF.
// which would indicate the start of a rule definition.
// If we see a LPAREN, then it's the start of the subrule.
// this.tokens is the token string we are pushing into, so
// just loop backwards looking for a rule definition. Then
// we set isLexerRule.
Token t = getRuleOrSubruleStartToken();
if ( t!=null ) {
if ( t.getType()==RULE_REF ) isLexerRule = false;
else if ( t.getType()==TOKEN_REF ) isLexerRule = true;
// else must be subrule; don't alter context
}
}
;
COLONCOLON : '::' ;
COMMA : ',' ;
SEMI : ';' ;
LPAREN : '(' ;
RPAREN : ')' ;
RARROW : '->' ;
LT : '<' ;
GT : '>' ;
ASSIGN : '=' ;
QUESTION : '?' ;
SYNPRED : '=>'
{
Token t = new CommonToken(input, state.type, state.channel,
state.tokenStartCharIndex, getCharIndex()-1);
t.setLine(state.tokenStartLine);
t.setText(state.text);
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
grammarError(ErrorType.V3_SYNPRED, t);
$channel=HIDDEN;
}
;
STAR : '*' ;
PLUS : '+' ;
PLUS_ASSIGN : '+=' ;
OR : '|' ;
DOLLAR : '$' ;
DOT : '.' ; // can be WILDCARD or DOT in qid or imported rule ref
RANGE : '..' ;
AT : '@' ;
POUND : '#' ;
NOT : '~' ;
RBRACE : '}' ;
/** Allow unicode rule/token names */
ID : a=NameStartChar NameChar*
{
if ( Grammar.isTokenName($a.text) ) $type = TOKEN_REF;
else $type = RULE_REF;
}
;
fragment
NameChar : NameStartChar
| '0'..'9'
| '_'
| '\u00B7'
| '\u0300'..'\u036F'
| '\u203F'..'\u2040'
;
fragment
NameStartChar
: 'A'..'Z' | 'a'..'z'
| '\u00C0'..'\u00D6'
| '\u00D8'..'\u00F6'
| '\u00F8'..'\u02FF'
| '\u0370'..'\u037D'
| '\u037F'..'\u1FFF'
| '\u200C'..'\u200D'
| '\u2070'..'\u218F'
| '\u2C00'..'\u2FEF'
| '\u3001'..'\uD7FF'
| '\uF900'..'\uFDCF'
| '\uFDF0'..'\uFEFE'
| '\uFF00'..'\uFFFD'
; // ignores | ['\u10000-'\uEFFFF] ;
// ----------------------------
// Literals embedded in actions
//
// Note that we have made the assumption that the language used within
// actions uses the fairly standard " and ' delimiters for literals and
// that within these literals, characters are escaped using the \ character.
// There are some languages which do not conform to this in all cases, such
// as by using /string/ and so on. We will have to deal with such cases if
// if they come up in targets.
//
// Within actions, or other structures that are not part of the ANTLR
// syntax, we may encounter literal characters. Within these, we do
// not want to inadvertantly match things like '}' and so we eat them
// specifically. While this rule is called CHAR it allows for the fact that
// some languages may use/allow ' as the string delimiter.
//
fragment
ACTION_CHAR_LITERAL
: '\'' (('\\')=>ACTION_ESC | ~'\'' )* '\''
;
// Within actions, or other structures that are not part of the ANTLR
// syntax, we may encounter literal strings. Within these, we do
// not want to inadvertantly match things like '}' and so we eat them
// specifically.
//
fragment
ACTION_STRING_LITERAL
: '"' (('\\')=>ACTION_ESC | ~'"')* '"'
;
// Within literal strings and characters that are not part of the ANTLR
// syntax, we must allow for escaped character sequences so that we do not
// inadvertantly recognize the end of a string or character when the terminating
// delimiter has been esacped.
//
fragment
ACTION_ESC
: '\\' .
;
// -------
// Integer
//
// Obviously (I hope) match an aribtrary long sequence of digits.
//
INT : ('0'..'9')+
;
// -----------
// Source spec
//
// A fragment rule for picking up information about an origrinating
// file from which the grammar we are parsing has been generated. This allows
// ANTLR to report errors against the originating file and not the generated
// file.
//
fragment
SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT
{
// TODO: Add target specific code to change the source file name and current line number
//
}
;
// --------------
// Literal string
//
// ANTLR makes no disticintion between a single character literal and a
// multi-character string. All literals are single quote delimited and
// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx},
// where x is a valid hexadecimal number.
STRING_LITERAL
: '\'' ( ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') ) )*
( '\''
| // Unterminated string literal
{
Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
t.setLine(state.tokenStartLine);
t.setText(state.text);
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
grammarError(ErrorType.UNTERMINATED_STRING_LITERAL, t);
}
)
;
// A valid hex digit specification
//
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
// Any kind of escaped character that we can embed within ANTLR
// literal strings.
//
fragment
ESC_SEQ
: '\\'
(
// The standard escaped character set such as tab, newline, etc...
'b'|'t'|'n'|'f'|'r'|'\''|'\\'
| // A Java style Unicode escape sequence
UNICODE_ESC
| // A Swift/Hack style Unicode escape sequence
UNICODE_EXTENDED_ESC
| // An illegal escape seqeunce
~('b'|'t'|'n'|'f'|'r'|'\''|'\\'|'u') // \x for any invalid x (make sure to match char here)
{
Token t = new CommonToken(input, state.type, state.channel, getCharIndex()-2, getCharIndex()-1);
t.setText(t.getText());
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-2);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(getCharIndex()-2,getCharIndex()-1));
}
)
;
fragment
UNICODE_ESC
@init {
// Flag to tell us whether we have a valid number of
// hex digits in the escape sequence
//
int hCount = 0;
}
: 'u' // Leadin for unicode escape sequence
// We now require 4 hex digits. Note though
// that we accept any number of characters
// and issue an error if we do not get 4. We cannot
// use an inifinite count such as + because this
// might consume too many, so we lay out the lexical
// options and issue an error at the invalid paths.
//
(
(
HEX_DIGIT { hCount++; }
(
HEX_DIGIT { hCount++; }
(
HEX_DIGIT { hCount++; }
(
// Four valid hex digits, we are good
//
HEX_DIGIT { hCount++; }
| // Three valid digits
)
| // Two valid digits
)
| // One valid digit
)
)
| // No valid hex digits at all
)
// Now check the digit count and issue an error if we need to
//
{
if (hCount < 4) {
Interval badRange = Interval.of(getCharIndex()-2-hCount, getCharIndex());
String lastChar = input.substring(badRange.b, badRange.b);
if ( lastChar.codePointAt(0)=='\'' ) {
badRange = new Interval(badRange.a, badRange.b - 1);
}
String bad = input.substring(badRange.a, badRange.b);
Token t = new CommonToken(input, state.type, state.channel, badRange.a, badRange.b);
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-hCount-2);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, bad);
}
}
;
fragment
UNICODE_EXTENDED_ESC
: 'u{' // Leadin for unicode extended escape sequence
HEX_DIGIT+ // One or more hexadecimal digits
'}' // Leadout for unicode extended escape sequence
// Now check the digit count and issue an error if we need to
{
int numDigits = getCharIndex()-state.tokenStartCharIndex-6;
if (numDigits > 6) {
Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
t.setText(t.getText());
t.setLine(input.getLine());
t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(state.tokenStartCharIndex,getCharIndex()-1));
}
}
;
// ----------
// Whitespace
//
// Characters and character constructs that are of no import
// to the parser and are used to make the grammar easier to read
// for humans.
//
WS
: (
' '
| '\t'
| '\r'
| '\n'
| '\f'
)+
{$channel=HIDDEN;}
;
// A fragment rule for use in recognizing end of line in
// rules like COMMENT.
//
fragment
NLCHARS
: '\n' | '\r'
;
// A fragment rule for recognizing traditional whitespace
// characters within lexer rules.
//
fragment
WSCHARS
: ' ' | '\t' | '\f'
;
// A fragment rule for recognizing both traditional whitespace and
// end of line markers, when we don't care to distinguish but don't
// want any action code going on.
//
fragment
WSNLCHARS
: ' ' | '\t' | '\f' | '\n' | '\r'
;
// This rule allows ANTLR 4 to parse grammars using the UTF-8 encoding with a
// byte order mark. Since this Unicode character doesn't appear as a token
// anywhere else in the grammar, we can simply skip all instances of it without
// problem. This rule will not break usage of \uFEFF inside a LEXER_CHAR_SET or
// STRING_LITERAL.
UnicodeBOM
: '\uFEFF' {skip();}
;
// -----------------
// Illegal Character
//
// This is an illegal character trap which is always the last rule in the
// lexer specification. It matches a single character of any value and being
// the last rule in the file will match when no other rule knows what to do
// about the character. It is reported as an error but is not passed on to the
// parser. This means that the parser to deal with the grammar file anyway
// but we will not try to analyse or code generate from a file with lexical
// errors.
//
ERRCHAR
: .
{
Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
t.setLine(state.tokenStartLine);
t.setText(state.text);
t.setCharPositionInLine(state.tokenStartCharPositionInLine);
String msg = getTokenErrorDisplay(t) + " came as a complete surprise to me";
grammarError(ErrorType.SYNTAX_ERROR, t, msg);
state.syntaxErrors++;
skip();
}
;
© 2015 - 2025 Weber Informatics LLC | Privacy Policy