org.antlr.v4.parse.ANTLRLexer.g Maven / Gradle / Ivy

Go to download
/*
 * Copyright (c) 2012 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD-3-Clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

// File      : A3Lexer.g
// Author    : Jim Idle ([email protected])
// Copyright : Free BSD - See @header clause below
// Version   : First implemented as part of ANTLR 3.2 this is the self
//             hosting ANTLR 3 Lexer.
//
// Description
// -----------
// This is the definitive lexer grammar for parsing ANTLR V3.x.x grammars. All other
// grammars are derived from this grammar via source code control integration (perforce)
// or by the gdiff tool.
//
// This grammar and its associated grammars A3Parser.g and A3Walker.g exhibit the following
// traits, which are recommended for all production quality grammars:
//
// 1) They are separate grammars, not composite grammars;
// 2) They implement all supporting methods in a superclass (at least this is recommended
//    for language targets that support inheritance);
// 3) All errors are pushed as far down the parsing chain as possible, which means
//    that the lexer tries to defer error reporting to the parser, and the parser
//    tries to defer error reporting to a semantic phase consisting of a single
//    walk of the AST. The reason for this is that the error messages produced
//    from later phases of the parse will generally have better context and so
//    be more useful to the end user. Consider the message: "Syntax error at 'options'"
//    vs: "You cannot specify two options{} sections in a single grammar file".
// 4) The lexer is 'programmed' to catch common mistakes such as unterminated literals
//    and report them specifically and not just issue confusing lexer mismatch errors.
//

/** Read in an ANTLR grammar and build an AST.  Try not to do
 *  any actions, just build the tree.
 *
 *  The phases are:
 *
 *		A3Lexer.g (this file)
 *              A3Parser.g
 *              A3Verify.g (derived from A3Walker.g)
 *		assign.types.g
 *		define.g
 *		buildnfa.g
 *		antlr.print.g (optional)
 *		codegen.g
 *
 *  Terence Parr
 *  University of San Francisco
 *  2005
 *  Jim Idle (this v3 grammar)
 *  Temporal Wave LLC
 *  2009
 */
lexer grammar ANTLRLexer;

// ==============================================================================
// Note that while this grammar does not care about order of constructs
// that don't really matter, such as options before @header etc, it must first
// be parsed by the original v2 parser, before it replaces it. That parser does
// care about order of structures. Hence we are constrained by the v2 parser
// for at least the first bootstrap release that causes this parser to replace
// the v2 version.
// ==============================================================================

// -------
// Options
//
// V3 option directives to tell the tool what we are asking of it for this
// grammar.
//
options {

	// Target language is Java, which is the default, but we are specific
	// here as this grammar is also meant as a good example grammar
	// for users.
	//
	language      = Java;

	// The super class that this lexer should expect to inherit from, and
	// which contains any and all support routines for the lexer. This is
	// commented out in this baseline (definitive or normative grammar)
	// - see the ANTLR tool implementation for hints on how to use the super
	//   class
	//
	//superclass    = AbstractA3Lexer;
}

// Token types that are assigned explicitly by rule actions further down:
// SEMPRED in ACTION, TOKEN_REF and RULE_REF in ID, and LEXER_CHAR_SET and
// ARG_ACTION in ARG_OR_CHARSET.
tokens { SEMPRED; TOKEN_REF; RULE_REF; LEXER_CHAR_SET; ARG_ACTION; }

// Include the copyright in this source and also the generated source
//
@lexer::header {
/*
 * Copyright (c) 2012 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD-3-Clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
package org.antlr.v4.parse;
import org.antlr.v4.tool.*;
import org.antlr.v4.runtime.misc.Interval;
}


@members {
	public static final int COMMENTS_CHANNEL = 2;

    public CommonTokenStream tokens; // track stream we push to; need for context info
    public boolean isLexerRule = false;

	public void grammarError(ErrorType etype, org.antlr.runtime.Token token, Object... args) { }

	/** scan backwards from current point in this.tokens list
	 *  looking for the start of the rule or subrule.
	 *  Return token or null if for some reason we can't find the start.
	 */
	public Token getRuleOrSubruleStartToken() {
	    if ( tokens==null ) return null;
		int i = tokens.index();
        int n = tokens.size();
        if ( i>=n ) i = n-1; // seems index == n as we lex
		while ( i>=0 && i ' $ANTLR' SRC
              | ~(NLCHARS)*
            )

         | // Multi-line comment, which may be a documentation comment
           // if it starts /** (note that we protect against accidentally
           // recognizing a comment /**/ as a documentation comment)
           //
           '*' (
           			  { input.LA(2) != '/'}?=> '*' { $type = DOC_COMMENT; }
           			| { true }?=>  // Required to cover all alts with predicates
           	   )

               // Should we support embedded multiline comments here?
               //
               (
                   // Pick out end of multiline comment and exit the loop
                   // if we find it.
                   //
               	   {    !(input.LA(1) == '*' && input.LA(2) == '/') }?

              	      // Anything else other than the non-greedy match of
              	      // the comment close sequence
              	      //
              	      .
               )*
            (
                 // Look for the comment terminator, but if it is accidentally
                 // unterminated, then we will hit EOF, which will trigger the
                 // epsilon alt and hence we can issue an error message relative
                 // to the start of the unterminated multi-line comment
                 //
                 '*/'

               | // Unterminated comment!
                 //
                 {
                   // ErrorManager.msg(Msg.UNTERMINATED_DOC_COMMENT, startLine, offset, $pos, startLine, offset, $pos, (Object)null);
                 }
             )

         | // There was nothing that made sense following the opening '/' and so
           // we issue an error regarding the malformed comment
           //
           {
           	 // TODO: Insert error message relative to comment start
             //
           }
       )
       {
         // We do not wish to pass the comments in to the parser. If you are
         // writing a formatter then you will want to preserve the comments off
         // channel, but could just skip and save token space if not.
         //
         $channel=COMMENTS_CHANNEL;
       }
    ;

// Square-bracketed text means different things depending on where it appears.
// The isLexerRule flag (maintained by the COLON rule) gates the two
// alternatives: inside a lexer rule the text is a character set, otherwise
// it is an argument action.
ARG_OR_CHARSET
options {k=1;}
    :   {isLexerRule}?=> LEXER_CHAR_SET { $type = LEXER_CHAR_SET; }
    |   {!isLexerRule}?=> ARG_ACTION
        {
        $type = ARG_ACTION;
        // Drop the enclosing [ and ] so the token text is only the contents
        String matched = $text;
        setText(matched.substring(1, matched.length()-1));
        }
    ;

// A lexer character set: everything from [ up to the first unescaped ].
// A backslash escapes any following character except a line break, and an
// unescaped line break is never part of the set.
fragment
LEXER_CHAR_SET
	:	'[' ( '\\' ~('\r'|'\n') | ~('\r'|'\n'|'\\'|']') )* ']'
	;

// --------------
// Argument specs
//
// Certain argument lists, such as those specifying call parameters
// to a rule invocation, or input parameters to a rule specification
// are contained within square brackets. In the lexer we consume them
// all at once and sort them out later in the grammar analysis.
//
fragment
ARG_ACTION
	: '['
	  ( ARG_ACTION                        // nested [...] is matched recursively
	  | ('"')=>ACTION_STRING_LITERAL      // double-quoted literal, may hide brackets
	  | ('\'')=>ACTION_CHAR_LITERAL       // single-quoted literal, may hide brackets
	  | ~('['|']')                        // any other single character
	  )*
	  ']'
	;

// -------
// Actions
//
// Other than making sure to distinguish between { and } embedded
// within what we have assumed to be literals in the action code, the
// job of the lexer is merely to gather the code within the action
// (delimited by {}) and pass it to the parser as a single token.
// We know that this token will be asked for its text somewhere
// in the upcoming parse, so setting the text here to exclude
// the delimiting {} is no additional overhead.
//
ACTION
	:	NESTED_ACTION
		(	// A trailing ? turns the action into a semantic predicate
			'?' { $type = SEMPRED; }
			(	(WSNLCHARS* '=>') => WSNLCHARS* '=>' // v3 gated sempred
				{
				// Gated predicates are v3-only syntax; report against the whole token
				Token gated = new CommonToken(input, state.type, state.channel,
				                              state.tokenStartCharIndex, getCharIndex()-1);
				gated.setText(state.text);
				gated.setLine(state.tokenStartLine);
				gated.setCharPositionInLine(state.tokenStartCharPositionInLine);
				grammarError(ErrorType.V3_GATED_SEMPRED, gated);
				}
			)?
		)?
	;

// ----------------
// Action structure
//
// Many language targets use {} as block delimiters and so we
// must recursively match {} delimited blocks to balance the
// braces. Additionally, we must make some assumptions about
// literal string representation in the target language. We assume
// that they are delimited by ' or " and so consume these
// in their own alts so as not to inadvertently match {}.
// This rule calls itself on matching a {
//
fragment
NESTED_ACTION
@init {

	// Record the start line and offset so that, if we need to report an
	// unterminated block, we can point at the start of the block we think
	// is broken and not at the end, where people would have to work it out
	// for themselves.
	//
	int startLine = getLine();
	int offset    = getCharPositionInLine();
}

    : // Action and other blocks start with opening {
      //
	  '{'
      (
	    // And now we can match one of a number of embedded
	    // elements within the action until we find a
	    // } that balances the opening {. If we do not find
	    // the balanced } then we will hit EOF and can issue
	    // an error message about the brace that we believe to
	    // be mismatched. This won't be foolproof but we will
	    // be able to at least report an error against the
	    // opening brace that we feel is in error and this will
	    // guide the user to the correction as best we can.
	    //


          // An embedded {} block
	      //
	      NESTED_ACTION

        | // What appears to be a character literal
          //
          ACTION_CHAR_LITERAL

        | // We have assumed that the target language has C/Java
          // type comments.
          //
          COMMENT

        | // What appears to be a string literal
          //
          ACTION_STRING_LITERAL

	    | // What appears to be an escape sequence
	      //
	      ACTION_ESC

	    | // Some other single character that is not
	      // handled above
	      //
	      ~('\\'|'"'|'\''|'/'|'{'|'}')

      )*

	(
	    // Correctly balanced closing brace
	    //
	    '}'

	  | // Looks like we have an imbalanced {} block, report
	    // with respect to the opening brace.
	    //
	    {
	        // TODO: Report imbalanced {}
	        // For now the problem is only printed to standard output; it is
	        // not routed through grammarError like the other lexical errors.
	        System.out.println("Block starting  at line " + startLine + " offset " + (offset+1) + " contains imbalanced {} or is missing a }");
	    }
	)
   ;


// Keywords
// --------
// keywords used to specify ANTLR v3 grammars. Keywords may not be used as
// labels for rules or in any other context where they would be ambiguous
// with the keyword vs some other identifier
// OPTIONS, TOKENS, and CHANNELS must also consume the opening brace that captures
// their option block, as this is the easiest way to parse it separate
// to an ACTION block, despite it using the same {} delimiters.
//
// Block introducers: the keyword, any amount of whitespace or newlines, and
// the opening brace are matched together as a single token.
OPTIONS      : 'options'  WSNLCHARS* '{'  ;
TOKENS_SPEC  : 'tokens'   WSNLCHARS* '{'  ;
CHANNELS     : 'channels' WSNLCHARS* '{'  ;

// Plain keywords, each matched as the literal text shown.
IMPORT       : 'import'               ;
FRAGMENT     : 'fragment'             ;
LEXER        : 'lexer'                ;
PARSER       : 'parser'               ;
GRAMMAR      : 'grammar'              ;
TREE_GRAMMAR : 'tree' WSNLCHARS* 'grammar' ; // both words, with optional whitespace between, form one token
PROTECTED    : 'protected'            ;
PUBLIC       : 'public'               ;
PRIVATE      : 'private'              ;
RETURNS      : 'returns'              ;
LOCALS       : 'locals'               ;
THROWS       : 'throws'               ;
CATCH        : 'catch'                ;
FINALLY      : 'finally'              ;
MODE         : 'mode'                 ;

// -----------
// Punctuation
//
// Character sequences used as separators, delimiters, operators, etc
//
COLON        : ':'
               {
               // A colon follows the name of the rule being defined (or opens
               // a subrule). Look back through this.tokens, the stream we are
               // pushing into, for the token that started the construct and
               // update isLexerRule from its type.
               Token start = getRuleOrSubruleStartToken();
               if ( start!=null ) {
                    switch ( start.getType() ) {
                        case RULE_REF :
                            isLexerRule = false;
                            break;
                        case TOKEN_REF :
                            isLexerRule = true;
                            break;
                        default :
                            // must be a subrule; leave the context untouched
                            break;
                    }
               }
               }
             ;
// Simple punctuation tokens; each matches exactly the literal shown and
// carries no action.
COLONCOLON   : '::'                   ;
COMMA        : ','                    ;
SEMI         : ';'                    ;
LPAREN       : '('                    ;
RPAREN       : ')'                    ;
RARROW       : '->'                   ;
LT           : '<'                    ;
GT           : '>'                    ;
ASSIGN       : '='                    ;
QUESTION     : '?'                    ;
SYNPRED      : '=>'
			   {
				// The v3 syntactic predicate arrow is reported as an error
				// and then hidden from the parser.
				Token arrow = new CommonToken(input, state.type, state.channel,
				                              state.tokenStartCharIndex, getCharIndex()-1);
				arrow.setText(state.text);
				arrow.setLine(state.tokenStartLine);
				arrow.setCharPositionInLine(state.tokenStartCharPositionInLine);
				grammarError(ErrorType.V3_SYNPRED, arrow);
				$channel = HIDDEN;
				}
             ;
// Remaining operator and punctuation tokens; each matches exactly the
// literal shown and carries no action.
STAR         : '*'                    ;
PLUS         : '+'                    ;
PLUS_ASSIGN  : '+='                   ;
OR           : '|'                    ;
DOLLAR       : '$'                    ;
DOT		     : '.'                    ; // can be WILDCARD or DOT in qid or imported rule ref
RANGE        : '..'                   ;
AT           : '@'                    ;
POUND        : '#'                    ;
NOT          : '~'                    ;
// NOTE(review): there is no matching LBRACE token here; presumably RBRACE closes
// the blocks opened by OPTIONS, TOKENS_SPEC and CHANNELS, which consume their
// own opening brace - confirm against the parser.
RBRACE       : '}'                    ;

/** Allow unicode rule/token names */
/** Allow unicode rule/token names. The first character alone decides,
 *  via Grammar.isTokenName, whether the name is a TOKEN_REF or a RULE_REF.
 */
ID			:	a=NameStartChar NameChar*
				{
				boolean startsLikeToken = Grammar.isTokenName($a.text);
				$type = startsLikeToken ? TOKEN_REF : RULE_REF;
				}
			;

// Characters allowed after the first character of a name: everything that
// may start a name, plus ASCII digits, underscore and the extra Unicode
// characters and ranges listed below.
fragment
NameChar    :   NameStartChar
            |   '0'..'9'
            |   '_'
            |   '\u00B7'
            |   '\u0300'..'\u036F'
            |   '\u203F'..'\u2040'
            ;

// Characters allowed as the first character of a name: ASCII letters plus
// the Unicode ranges listed below. Note that underscore and digits are not
// included here (they are only in NameChar), and that the last two ranges
// skip over \uFEFF, which is matched by the UnicodeBOM rule instead.
fragment
NameStartChar
            :   'A'..'Z' | 'a'..'z'
            |   '\u00C0'..'\u00D6'
            |   '\u00D8'..'\u00F6'
            |   '\u00F8'..'\u02FF'
            |   '\u0370'..'\u037D'
            |   '\u037F'..'\u1FFF'
            |   '\u200C'..'\u200D'
            |   '\u2070'..'\u218F'
            |   '\u2C00'..'\u2FEF'
            |   '\u3001'..'\uD7FF'
            |   '\uF900'..'\uFDCF'
            |   '\uFDF0'..'\uFEFE'
            |   '\uFF00'..'\uFFFD'
            ; // code points above \uFFFF (U+10000..U+EFFFF) are not covered

// ----------------------------
// Literals embedded in actions
//
// Note that we have made the assumption that the language used within
// actions uses the fairly standard " and ' delimiters for literals and
// that within these literals, characters are escaped using the \ character.
// There are some languages which do not conform to this in all cases, such
// as by using /string/ and so on. We will have to deal with such cases
// if they come up in targets.
//

// Within actions, or other structures that are not part of the ANTLR
// syntax, we may encounter literal characters. Within these, we do
// not want to inadvertently match things like '}' and so we eat them
// specifically. While this rule is called CHAR it allows for the fact that
// some languages may use/allow ' as the string delimiter.
//
fragment
ACTION_CHAR_LITERAL
	:	'\''
		(	('\\')=>ACTION_ESC   // a backslash always starts an escape pair
		|	~'\''                // anything else up to the closing quote
		)*
		'\''
	;

// Within actions, or other structures that are not part of the ANTLR
// syntax, we may encounter literal strings. Within these, we do
// not want to inadvertently match things like '}' and so we eat them
// specifically.
//
fragment
ACTION_STRING_LITERAL
	:	'"'
		(	('\\')=>ACTION_ESC   // a backslash always starts an escape pair
		|	~'"'                 // anything else up to the closing quote
		)*
		'"'
	;

// Within literal strings and characters that are not part of the ANTLR
// syntax, we must allow for escaped character sequences so that we do not
// inadvertently recognize the end of a string or character when the terminating
// delimiter has been escaped.
//
// A backslash followed by any single character; the pair is consumed as a
// unit so that an escaped delimiter does not end the enclosing literal.
fragment
ACTION_ESC
	: '\\' .
	;

// -------
// Integer
//
// Obviously (I hope) match an arbitrarily long sequence of digits.
//
// One or more ASCII decimal digits; no sign, no radix prefix.
INT : ('0'..'9')+
    ;

// -----------
// Source spec
//
// A fragment rule for picking up information about an originating
// file from which the grammar we are parsing has been generated. This allows
// ANTLR to report errors against the originating file and not the generated
// file.
//
// Matches: src, whitespace, a double-quoted file name (labelled file),
// whitespace, and a line number (labelled line). The labels are captured
// but not yet used: the action below is still a placeholder.
fragment
SRC : 'src' WSCHARS+ file=ACTION_STRING_LITERAL WSCHARS+ line=INT
      {
         // TODO: Add target specific code to change the source file name and current line number
         //
      }
    ;

// --------------
// Literal string
//
// ANTLR makes no distinction between a single character literal and a
// multi-character string. All literals are single quote delimited and
// may contain unicode escape sequences of the form \uxxxx or \u{xxxxxx},
// where x is a valid hexadecimal number.
STRING_LITERAL
    :  '\''
       ( ESC_SEQ | ~('\\'|'\''|'\r'|'\n') )*
       (    '\''
       |    // Unterminated string literal: report it against the whole token
            {
            Token unterminated = new CommonToken(input, state.type, state.channel,
                                                 state.tokenStartCharIndex, getCharIndex()-1);
            unterminated.setText(state.text);
            unterminated.setLine(state.tokenStartLine);
            unterminated.setCharPositionInLine(state.tokenStartCharPositionInLine);
            grammarError(ErrorType.UNTERMINATED_STRING_LITERAL, unterminated);
            }
       )
    ;

// A valid hex digit specification
//
// A single hexadecimal digit, upper or lower case.
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;

// Any kind of escaped character that we can embed within ANTLR
// literal strings.
//
fragment
ESC_SEQ
    : '\\'
        (
              // The standard escaped character set such as tab, newline, etc...
              'b'|'t'|'n'|'f'|'r'|'\''|'\\'

            | // A Java style Unicode escape sequence
              UNICODE_ESC

            | // A Swift/Hack style Unicode escape sequence
              UNICODE_EXTENDED_ESC

            | // An illegal escape sequence: \x for any invalid x. The character
              // must be matched here so that it is consumed with the backslash.
              ~('b'|'t'|'n'|'f'|'r'|'\''|'\\'|'u')
              {
                // The two characters just consumed are the backslash and the bad character
                int escStart = getCharIndex()-2;
                int escStop  = getCharIndex()-1;
                Token badEsc = new CommonToken(input, state.type, state.channel, escStart, escStop);
                badEsc.setText(badEsc.getText());
                badEsc.setLine(input.getLine());
                badEsc.setCharPositionInLine(input.getCharPositionInLine()-2);
                grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, badEsc, input.substring(escStart, escStop));
              }
        )
    ;

fragment
UNICODE_ESC
@init {

	// Number of hex digits actually matched after the u. Fewer than
	// four means the escape sequence is invalid.
	//
	int	hCount = 0;
}
    :   'u' // Leadin for unicode escape sequence

        // We now require 4 hex digits. Note though
        // that we accept any number of characters
        // and issue an error if we do not get 4. We cannot
        // use an infinite count such as + because this
        // might consume too many, so we lay out the lexical
        // options and issue an error at the invalid paths.
        //
    	(
    	   (
    	      HEX_DIGIT  { hCount++; }
    	        (
    	             HEX_DIGIT  { hCount++; }
    		         (
    		              HEX_DIGIT  { hCount++; }
    		              (
    		                  // Four valid hex digits, we are good
    		                  //
    		                  HEX_DIGIT { hCount++; }

    		                | // Three valid digits
    		              )

    		            | // Two valid digits
    		          )

    		        | // One valid digit
    		    )
            )
          | // No valid hex digits at all
    	)

    	// Now check the digit count and issue an error if we need to
    	//
    	{
    		if (hCount < 4) {
				// getCharIndex() is the index of the next unconsumed character,
				// normally the one that cut the escape short. When the escape is
				// cut short by end of input that index is one past the last
				// character, so clamp it to keep the substring calls in range.
				int lastIndex = Math.min(getCharIndex(), input.size()-1);
				Interval badRange = Interval.of(getCharIndex()-2-hCount, lastIndex);
				String lastChar = input.substring(badRange.b, badRange.b);
				// Do not include the closing quote of the literal in the reported text
				if ( lastChar.codePointAt(0)=='\'' ) {
					badRange = new Interval(badRange.a, badRange.b - 1);
				}
				String bad = input.substring(badRange.a, badRange.b);
				Token t = new CommonToken(input, state.type, state.channel, badRange.a, badRange.b);
				t.setLine(input.getLine());
				// Column of the backslash: back over the digits, the u and the backslash
				t.setCharPositionInLine(input.getCharPositionInLine()-hCount-2);
				grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, bad);
    		}
    	}
    ;

fragment
UNICODE_EXTENDED_ESC
@init {

	// Number of hex digits matched between the braces. Counted directly
	// as each digit is matched, so the result does not depend on how much
	// of the enclosing literal precedes this escape.
	//
	int numDigits = 0;
}
    :   'u{' // Leadin for unicode extended escape sequence

        ( HEX_DIGIT { numDigits++; } )+ // One or more hexadecimal digits

        '}' // Leadout for unicode extended escape sequence

        // Now check the digit count and issue an error if we need to:
        // at most six hex digits are allowed between the braces.
        {
            if (numDigits > 6) {
                // NOTE(review): the reported token and text still run from the
                // start of the enclosing token to the closing brace, as before;
                // only the digit count itself has been corrected.
                Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
                t.setText(t.getText());
                t.setLine(input.getLine());
                t.setCharPositionInLine(input.getCharPositionInLine()-numDigits);
                grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, t, input.substring(state.tokenStartCharIndex,getCharIndex()-1));
			}
        }
    ;

// ----------
// Whitespace
//
// Characters and character constructs that are of no import
// to the parser and are used to make the grammar easier to read
// for humans.
//
// A run of blanks, tabs, carriage returns, newlines and form feeds is
// collected into one token and sent to the hidden channel.
WS
    : ( ' ' | '\t' | '\r' | '\n' | '\f' )+
      { $channel = HIDDEN; }
    ;

// A fragment rule for use in recognizing end of line in
// rules like COMMENT.
//
// Exactly one end-of-line character: line feed or carriage return.
fragment
NLCHARS
    : '\n' | '\r'
    ;

// A fragment rule for recognizing traditional whitespace
// characters within lexer rules. Matches exactly one of space,
// tab or form feed; line breaks are deliberately not included.
//
fragment
WSCHARS
    : ' ' | '\t' | '\f'
    ;

// A fragment rule for recognizing both traditional whitespace and
// end of line markers, when we don't care to distinguish but don't
// want any action code going on. Matches exactly one character: the
// union of the WSCHARS and NLCHARS sets.
//
fragment
WSNLCHARS
    : ' ' | '\t' | '\f' | '\n' | '\r'
    ;

// This rule allows ANTLR 4 to parse grammars using the UTF-8 encoding with a
// byte order mark. Since this Unicode character doesn't appear as a token
// anywhere else in the grammar, we can simply skip all instances of it without
// problem. This rule will not break usage of \uFEFF inside a LEXER_CHAR_SET or
// STRING_LITERAL.
// Matches a single \uFEFF character and discards it via skip(), so no
// token is produced for it.
UnicodeBOM
    :   '\uFEFF' {skip();}
    ;

// -----------------
// Illegal Character
//
// This is an illegal character trap which is always the last rule in the
// lexer specification. It matches a single character of any value and being
// the last rule in the file will match when no other rule knows what to do
// about the character. It is reported as an error but is not passed on to the
// parser. This allows the parser to deal with the grammar file anyway,
// but we will not try to analyse or code generate from a file with lexical
// errors.
//
ERRCHAR
    : .
      {
         // Report the stray character as a syntax error, count it, and
         // drop it so that the parser never sees it.
         Token stray = new CommonToken(input, state.type, state.channel,
                                       state.tokenStartCharIndex, getCharIndex()-1);
         stray.setText(state.text);
         stray.setLine(state.tokenStartLine);
         stray.setCharPositionInLine(state.tokenStartCharPositionInLine);
         grammarError(ErrorType.SYNTAX_ERROR, stray,
                      getTokenErrorDisplay(stray) + " came as a complete surprise to me");
         state.syntaxErrors++;
         skip();
      }
    ;