bparser.2.13.5.source-code.PreParser.scc Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bparser Show documentation
Part of the ProB Parser library
The newest version!
Package de.be4.classicalb.core.preparser;

/*******************************************************************
 * Helpers                                                         *
 *******************************************************************/
Helpers
  // Helpers are named character classes / regular-expression fragments.
  // They do not produce tokens themselves; the Tokens section builds on them.

  // 39 is the code point of the apostrophe ('). Three in a row open and close
  // a multiline string (see multiline_string_start / multiline_string_end).
  triple_quote = 39 39 39;
  single_quote = 39;

  // Line terminators as decimal code points:
  // 10 = LINE FEED, 13 = CARRIAGE RETURN,
  // 8232 = U+2028 LINE SEPARATOR, 8233 = U+2029 PARAGRAPH SEPARATOR.
  lf = 10;
  cr = 13;
  unicode_line_seperator = 8232;
  unicode_paragraph_seperator = 8233;

  // ASCII-only letters and decimal digits.
  letter = ['a' .. 'z'] | ['A' .. 'Z'];
  digit = ['0' .. '9'];

  // Set of code points that may start (and continue) a regular identifier.
  // Each line is a union (+) of inclusive code point ranges; the lines are combined with |.
  // The first line contains the ASCII letters A-Z and a-z, so unicode_letter is a superset of letter.
  // unicode_letter definition from java-1.7.sablecc by Etienne M. Gagnon, http://sablecc.org/java1.7/
  // but removed unicode lambda 0x03bb from [0x03a3..0x03ce]
  // (the range is split into [0x03a3..0x03ba] and [0x03bc..0x03ce]; 0x03bb is part of unicode_lambda instead).
  unicode_letter =
    [[[0x0041..0x005a] + [0x0061..0x007a]] + [[0x00aa..0x00aa] + [0x00b5..0x00b5]]] |
    [[[0x00ba..0x00ba] + [0x00c0..0x00d6]] + [[0x00d8..0x00f6] + [0x00f8..0x01f5]]] |
    [[[0x01fa..0x0217] + [0x0250..0x02a8]] + [[0x02b0..0x02b8] + [0x02bb..0x02c1]]] |
    [[[0x02d0..0x02d1] + [0x02e0..0x02e4]] + [[0x037a..0x037a] + [0x0386..0x0386]]] |
    [[[0x0388..0x038a] + [0x038c..0x038c]] + [[[0x038e..0x03a1] + [0x03a3..0x03ba]] + [0x03bc..0x03ce]]] |
    //[[[0x0388..0x038a] + [0x038c..0x038c]] + [[0x038e..0x03a1] + [0x03a3..0x03ce]]] | // removed unicode_lambda 0x03bb
    [[[0x03d0..0x03d6] + [0x03da..0x03da]] + [[0x03dc..0x03dc] + [0x03de..0x03de]]] |
    [[[0x03e0..0x03e0] + [0x03e2..0x03f3]] + [[0x0401..0x040c] + [0x040e..0x044f]]] |
    [[[0x0451..0x045c] + [0x045e..0x0481]] + [[0x0490..0x04c4] + [0x04c7..0x04c8]]] |
    [[[0x04cb..0x04cc] + [0x04d0..0x04eb]] + [[0x04ee..0x04f5] + [0x04f8..0x04f9]]] |
    [[[0x0531..0x0556] + [0x0559..0x0559]] + [[0x0561..0x0587] + [0x05d0..0x05ea]]] |
    [[[0x05f0..0x05f2] + [0x0621..0x063a]] + [[0x0640..0x064a] + [0x0671..0x06b7]]] |
    [[[0x06ba..0x06be] + [0x06c0..0x06ce]] + [[0x06d0..0x06d3] + [0x06d5..0x06d5]]] |
    [[[0x06e5..0x06e6] + [0x0905..0x0939]] + [[0x093d..0x093d] + [0x0958..0x0961]]] |
    [[[0x0985..0x098c] + [0x098f..0x0990]] + [[0x0993..0x09a8] + [0x09aa..0x09b0]]] |
    [[[0x09b2..0x09b2] + [0x09b6..0x09b9]] + [[0x09dc..0x09dd] + [0x09df..0x09e1]]] |
    [[[0x09f0..0x09f1] + [0x0a05..0x0a0a]] + [[0x0a0f..0x0a10] + [0x0a13..0x0a28]]] |
    [[[0x0a2a..0x0a30] + [0x0a32..0x0a33]] + [[0x0a35..0x0a36] + [0x0a38..0x0a39]]] |
    [[[0x0a59..0x0a5c] + [0x0a5e..0x0a5e]] + [[0x0a72..0x0a74] + [0x0a85..0x0a8b]]] |
    [[[0x0a8d..0x0a8d] + [0x0a8f..0x0a91]] + [[0x0a93..0x0aa8] + [0x0aaa..0x0ab0]]] |
    [[[0x0ab2..0x0ab3] + [0x0ab5..0x0ab9]] + [[0x0abd..0x0abd] + [0x0ae0..0x0ae0]]] |
    [[[0x0b05..0x0b0c] + [0x0b0f..0x0b10]] + [[0x0b13..0x0b28] + [0x0b2a..0x0b30]]] |
    [[[0x0b32..0x0b33] + [0x0b36..0x0b39]] + [[0x0b3d..0x0b3d] + [0x0b5c..0x0b5d]]] |
    [[[0x0b5f..0x0b61] + [0x0b85..0x0b8a]] + [[0x0b8e..0x0b90] + [0x0b92..0x0b95]]] |
    [[[0x0b99..0x0b9a] + [0x0b9c..0x0b9c]] + [[0x0b9e..0x0b9f] + [0x0ba3..0x0ba4]]] |
    [[[0x0ba8..0x0baa] + [0x0bae..0x0bb5]] + [[0x0bb7..0x0bb9] + [0x0c05..0x0c0c]]] |
    [[[0x0c0e..0x0c10] + [0x0c12..0x0c28]] + [[0x0c2a..0x0c33] + [0x0c35..0x0c39]]] |
    [[[0x0c60..0x0c61] + [0x0c85..0x0c8c]] + [[0x0c8e..0x0c90] + [0x0c92..0x0ca8]]] |
    [[[0x0caa..0x0cb3] + [0x0cb5..0x0cb9]] + [[0x0cde..0x0cde] + [0x0ce0..0x0ce1]]] |
    [[[0x0d05..0x0d0c] + [0x0d0e..0x0d10]] + [[0x0d12..0x0d28] + [0x0d2a..0x0d39]]] |
    [[[0x0d60..0x0d61] + [0x0e01..0x0e2e]] + [[0x0e30..0x0e30] + [0x0e32..0x0e33]]] |
    [[[0x0e40..0x0e46] + [0x0e81..0x0e82]] + [[0x0e84..0x0e84] + [0x0e87..0x0e88]]] |
    [[[0x0e8a..0x0e8a] + [0x0e8d..0x0e8d]] + [[0x0e94..0x0e97] + [0x0e99..0x0e9f]]] |
    [[[0x0ea1..0x0ea3] + [0x0ea5..0x0ea5]] + [[0x0ea7..0x0ea7] + [0x0eaa..0x0eab]]] |
    [[[0x0ead..0x0eae] + [0x0eb0..0x0eb0]] + [[0x0eb2..0x0eb3] + [0x0ebd..0x0ebd]]] |
    [[[0x0ec0..0x0ec4] + [0x0ec6..0x0ec6]] + [[0x0edc..0x0edd] + [0x0f40..0x0f47]]] |
    [[[0x0f49..0x0f69] + [0x10a0..0x10c5]] + [[0x10d0..0x10f6] + [0x1100..0x1159]]] |
    [[[0x115f..0x11a2] + [0x11a8..0x11f9]] + [[0x1e00..0x1e9b] + [0x1ea0..0x1ef9]]] |
    [[[0x1f00..0x1f15] + [0x1f18..0x1f1d]] + [[0x1f20..0x1f45] + [0x1f48..0x1f4d]]] |
    [[[0x1f50..0x1f57] + [0x1f59..0x1f59]] + [[0x1f5b..0x1f5b] + [0x1f5d..0x1f5d]]] |
    [[[0x1f5f..0x1f7d] + [0x1f80..0x1fb4]] + [[0x1fb6..0x1fbc] + [0x1fbe..0x1fbe]]] |
    [[[0x1fc2..0x1fc4] + [0x1fc6..0x1fcc]] + [[0x1fd0..0x1fd3] + [0x1fd6..0x1fdb]]] |
    [[[0x1fe0..0x1fec] + [0x1ff2..0x1ff4]] + [[0x1ff6..0x1ffc] + [0x207f..0x207f]]] |
    [[[0x2102..0x2102] + [0x2107..0x2107]] + [[0x210a..0x2113] + [0x2115..0x2115]]] |
    [[[0x2118..0x211d] + [0x2124..0x2124]] + [[0x2126..0x2126] + [0x2128..0x2128]]] |
    [[[0x212a..0x2131] + [0x2133..0x2138]] + [[0x3005..0x3005] + [0x3031..0x3035]]] |
    [[[0x3041..0x3094] + [0x309b..0x309e]] + [[0x30a1..0x30fa] + [0x30fc..0x30fe]]] |
    [[[0x3105..0x312c] + [0x3131..0x318e]] + [[0x4e00..0x9fa5] + [0xac00..0xd7a3]]] |
    [[[0xf900..0xfa2d] + [0xfb00..0xfb06]] + [[0xfb13..0xfb17] + [0xfb1f..0xfb28]]] |
    [[[0xfb2a..0xfb36] + [0xfb38..0xfb3c]] + [[0xfb3e..0xfb3e] + [0xfb40..0xfb41]]] |
    [[[0xfb43..0xfb44] + [0xfb46..0xfbb1]] + [[0xfbd3..0xfd3d] + [0xfd50..0xfd8f]]] |
    [[[0xfd92..0xfdc7] + [0xfdf0..0xfdfb]] + [[0xfe70..0xfe72] + [0xfe74..0xfe74]]] |
    [[[0xfe76..0xfefc] + [0xff21..0xff3a]] + [[0xff41..0xff5a] + [0xff66..0xffbe]]] |
    [[[0xffc2..0xffc7] + [0xffca..0xffcf]] + [[0xffd2..0xffd7] + [0xffda..0xffdc]]];

  // 0x03bb = GREEK SMALL LETTER LAMDA, 0x1d706 = MATHEMATICAL ITALIC SMALL LAMDA.
  // Lambda is kept out of unicode_letter, so it may continue an identifier but not start one
  // (see regular_identifier_literal below).
  // NOTE(review): 0x1d706 lies above the 0xffff upper bound used by all_chars below;
  // confirm that the lexer generator in use handles code points beyond 0xffff as intended.
  unicode_lambda = 0x03bb | 0x1d706;
  // U+2080..U+2089: subscript digits zero to nine.
  unicode_subscripts = [0x2080..0x2089];
  // 0x2032 = PRIME; 8216..8217 = U+2018/U+2019 (left/right single quotation mark).
  unicode_prime = [0x2032 + [8216..8217]]; // ' primes; we could also add 8219

  all_chars = [0 .. 0xffff];
  line_break = [[[lf + cr] + unicode_line_seperator] + unicode_paragraph_seperator];
  // 9 = TAB, 32 = SPACE, 160 = NO-BREAK SPACE.
  layout_char = [[9 + 32] + 160];

  // Any character that may appear inside a single-line string, i.e. everything but a line break.
  single_line_string_char = [all_chars - line_break];
  // Double-quoted string on a single line. A backslash escapes the following character
  // (any non-line-break character, including '"' and '\').
  string_literal = '"' ([single_line_string_char - ['"' + '\']] | '\' single_line_string_char)* '"';
  // backquote to generate identifiers with special symbols or which clash with keywords
  // Same shape as string_literal, but delimited by backquotes.
  quoted_identifier_literal = '`' ([single_line_string_char - ['`' + '\']] | '\' single_line_string_char)* '`';
  // One unicode_letter, then any mix of letters, lambdas, digits and underscores,
  // then optional trailing subscript digits, then optional trailing primes (in that order).
  regular_identifier_literal = (unicode_letter) (unicode_letter | unicode_lambda | digit | '_')* (unicode_subscripts)* (unicode_prime)*;

  // Punctuation that the `something` token may absorb after its first character.
  // The set deliberately contains no letter, no '/', no '"', no apostrophe and no backquote.
  safe_to_skip = ',' | '{' | '}' | '-' | '>'  | '+' | ':' |
                 '<' | '.' | '|' | '(' | ')' | '[' | ']' | '%'; // safe to skip in normal mode


/*******************************************************************
 * States                                                          *
 *******************************************************************/
States
  // Lexer states. Token definitions below are prefixed with the states in which they are
  // recognised; `a -> b` means: recognised in state a, then switch to state b.
  start, // special initial state in which we accept #PREDICATE, ...
  normal, // machine text outside of a DEFINITIONS clause
  definitions, // entered by the 'DEFINITIONS' keyword; left-hand sides of definitions are lexed here
  definitions_rhs, // entered by '==' (begin_def_body); the body of one definition is lexed here
  no_definitions, // entered after a #EXPRESSION/#PREDICATE/#SUBSTITUTION prefix; the rest of the input is one token
  // Multiline constructs (block comments, multiline strings) have their own lexer states
  // to provide better errors when the closing delimiter is missing.
  // If e. g. block comments were parsed as a single token,
  // then a long unclosed block comment would cause a pushback buffer overflow:
  // the lexer first tries to lex the entire remaining text as a block comment,
  // but once EOF is reached,
  // it tries to backtrack all the way to the start of the comment.
  // This is not an issue for line comments,
  // because EOF is a valid line comment terminator.
  // None of the tokens below declares a transition into or out of these three states;
  // according to the comments on those tokens, PreLexer.java performs the switches.
  block_comment,
  multiline_string,
  multiline_template;

/*******************************************************************
 * Tokens                                                          *
 *******************************************************************/

Tokens
  // Block comments are lexed as a sequence of small tokens instead of one big token
  // (see the explanation in the States section).
  // Opening delimiter; not recognised in no_definitions or inside multiline strings/templates.
  {start->normal, normal, definitions, definitions_rhs} comment = '/*'; // switch to block_comment state is done in PreLexer.java
  {block_comment} comment_end = '*/'; // switch back to previous state is done in PreLexer.java
  // A run of characters that contains no '*'.
  {block_comment} comment_body = [all_chars - '*']*;
  // A '*' that is not followed by '/' (otherwise the longer comment_end matches).
  {block_comment} star = '*';

  // Multiline strings are delimited by three apostrophes.
  // The opening delimiter is recognised in start, normal and definitions_rhs, but not in definitions.
  {start->normal, normal, definitions_rhs} multiline_string_start = triple_quote;
  {multiline_string} multiline_string_end = triple_quote; // switch back to previous state is done in PreLexer.java
  // One or two single quotes followed by a different character don't terminate the string.
  // Each repetition is either
  //  - zero, one or two apostrophes followed by a character that is neither an apostrophe nor a backslash, or
  //  - a backslash followed by any character (so an escaped apostrophe never counts towards the delimiter).
  {multiline_string} multiline_string_content = ((single_quote single_quote?)? [all_chars - [single_quote + '\']] | '\' all_chars)*;

  // Multiline templates mirror multiline strings, with backquotes as the delimiter character.
  {start->normal, normal, definitions_rhs} multiline_template_start = '```';
  {multiline_template} multiline_template_end = '```'; // switch back to previous state is done in PreLexer.java
  // One or two backquotes followed by a different character don't terminate the string.
  {multiline_template} multiline_template_content = (('`' '`'?)? [all_chars - ['`' + '\']] | '\' all_chars)*;

  // '//' up to and including the next line break; the line break is optional so that
  // a comment on the last line of the input (terminated by EOF) is still matched.
  {start -> normal, normal, definitions, definitions_rhs} line_comment = '//' [all_chars - line_break]* line_break?;

  // The keyword that opens the DEFINITIONS clause; switches the lexer into the definitions state.
  {start -> definitions, normal -> definitions, definitions -> definitions} pre_parser_definitions = 'DEFINITIONS';
   // definitions -> definitions is an error, which will be caught later,
   // generating Clause 'DEFINITIONS' is used more than once error message

  // Any other clause keyword. Seen in the definitions state it ends the DEFINITIONS clause
  // (transition back to normal); in definitions_rhs no transition is declared here.
  {start -> normal, normal, definitions -> normal, definitions_rhs} other_clause_begin =
    'MACHINE' | 'MODEL' | 'SYSTEM' | 'RULES_MACHINE' |
    'ABSTRACT_CONSTANTS' | 'ABSTRACT_VARIABLES' | 'ASSERTIONS' | 'CONCRETE_CONSTANTS' |
    'CONCRETE_VARIABLES' | 'CONSTANTS' | 'CONSTRAINTS' | 'IMPLEMENTATION' |
    'IMPORTS' | 'INCLUDES' | 'INITIALISATION' | 'INITIALIZATION' | 'INVARIANT' | 'LOCAL_OPERATIONS' |
    'OPERATIONS' | 'EVENTS' | 'PROMOTES' | 'PROPERTIES' | 'REFINES' | 'REFINEMENT' | 'EXTENDS' |
    'FREETYPES' |
    'SEES' | 'SETS' | 'USES' | 'VALUES' | 'VARIABLES';

  // 'END' outside of a definition body. In definitions_rhs the same text is lexed as end_nesting instead.
  {start -> normal, normal, definitions -> normal} end_machine = 'END';

  // Formula prefixes are only accepted in the initial state. After one of them the whole
  // remaining input is consumed as a single no_def_something token.
  {start -> no_definitions} kw_prefix = '#EXPRESSION' | '#PREDICATE' | '#SUBSTITUTION';
  {no_definitions} no_def_something = all_chars*;

  // Tokens for the left-hand side of a definition (state definitions)
  // and for its body (state definitions_rhs).
  {definitions} pre_parser_identifier = regular_identifier_literal | quoted_identifier_literal;
  {definitions, definitions_rhs} left_par = '(';
  {definitions, definitions_rhs} right_par = ')';
  {definitions, definitions_rhs} slash = '/';
  {definitions} comma = ',';

  {definitions -> definitions_rhs} begin_def_body = '==' | 0x225c; // Unicode DELTA EQUAL TO
  // A single body character: anything except layout characters, ';', '(', ')', '/' and '"',
  // each of which has its own token in this state.
  // NOTE(review): line_break characters are not subtracted here, so a line break matches both
  // rhs_body and white_space with the same length; presumably the earlier declaration (rhs_body) wins - confirm.
  {definitions_rhs} rhs_body = [[[[[[all_chars - layout_char] - ';'] - '('] - ')'] - '/'] - '"'];
  // Keywords that open a construct closed by 'END', and the closing 'END' itself.
  // NOTE(review): presumably PreLexer.java counts these to tell a nested ';' from the end of a definition - confirm.
  {definitions_rhs} begin_nesting =
    'BEGIN' | 'PRE' | 'ASSERT' | 'CHOICE' | 'IF' | 'WITNESS' |
    'SELECT' | 'CASE' | 'ANY' | 'VAR' | 'WHILE' | 'LET';
  {definitions_rhs} end_nesting = 'END';

  // Lex identifiers as complete words to ignore keywords within identifiers.
  // This only needs letter and not unicode_letter, because none of the keywords contain Unicode letters.
  // Declared after begin_nesting/end_nesting so that an exact keyword is not shadowed by this rule;
  // a longer word such as BEGINNING is matched here because the longest match is preferred.
  {definitions_rhs} rhs_identifier = letter (letter | digit | '_')*;

  {definitions, definitions_rhs} semicolon = ';'; // returning to state definitions is done by PreLexer

  {definitions, definitions_rhs} pre_parser_string = string_literal;

  // Either exactly one line break character or a run of layout characters.
  {definitions, definitions_rhs} white_space = line_break | layout_char+;

  // An ASCII word in normal mode. It is declared after the keyword tokens above, so an exact
  // keyword such as DEFINITIONS or END is not lexed as some_value.
  {start -> normal, normal} some_value = // could actually be renamed to some_identifier
    (letter (letter | digit | '_')*)
     // if we allow whitespace after identifiers, maximal munch rule means Lexer will miss DEFINITIONS token
   ;

  // we want to detect things like { "abc" |-> 10, "ef" |-> 20, ... } as a single chunk

  // other transition rules in normal mode we need to be careful about: line_comment
  // something could in principle match more than line_comment, but as we match only safe characters
  // we will not miss start of DEFINITIONS or '/' or '"' or ''' or ` (backquote) or machine clause
  // we are thus safe to skip over any char that is not a letter, a / or a " or '
  // Shape of the token: one leading element (any single character other than '"', or a complete
  // string literal, or a complete quoted identifier) followed by any number of layout characters,
  // digits and safe_to_skip punctuation.
  {start -> normal, normal} something =
     //[[all_chars - '"'] - layout_char] // could be backslash !
     ( [all_chars - '"'] // could be backslash !
       | string_literal
       | quoted_identifier_literal
      )
      (layout_char |
       digit |
       safe_to_skip
       // string_literal |  // otherwise testStringLiteralNotClosedLongString fails with Pushback buffer overflow
       ) *;  // make token as large as possible to avoid creating too many TSomething objects


/*******************************************************************
 * Ignored Tokens                                                  *
 *******************************************************************/
Ignored Tokens
  // Tokens listed here are dropped by the lexer before they reach the parser,
  // so the Productions section never has to mention them.

  // layout and clause keywords outside of definitions
  white_space,
  other_clause_begin,
  end_machine,

  // arbitrary machine text in normal mode
  some_value,
  something,

  // block and line comments
  comment,
  comment_body,
  star,
  comment_end,
  line_comment,

  // multiline strings (three apostrophes)
  multiline_string_start,
  multiline_string_end,
  multiline_string_content,

  // multiline templates (three backquotes)
  multiline_template_start,
  multiline_template_end,
  multiline_template_content;

/*******************************************************************
 * Productions                                                     *
 *******************************************************************/
Productions

// Entry point of the pre-parser.
// Produces a pre_parse_unit that holds the definitions of the DEFINITIONS clause,
// or an empty pre_parse_unit when the input is a prefixed formula
// (kw_prefix followed by no_def_something) or yields no non-ignored tokens at all.
parse_unit {-> pre_parse_unit} =
    {defs_clause} [clause]:def_clause
        {-> New pre_parse_unit([clause.pre_parser_definition])}
  | {something} kw_prefix no_def_something
        {-> New pre_parse_unit([])}
  | {nothing}
        {-> New pre_parse_unit([])};

// The DEFINITIONS keyword, a semicolon-separated list of definitions,
// and an optional trailing semicolon. Only the definitions are kept.
def_clause {-> pre_parser_definition*} =
  pre_parser_definitions [defs]:definition_list semicolon?
      {-> [defs.pre_parser_definition]};

// One or more definitions separated by semicolons (left-recursive);
// the resulting list keeps the definitions in source order.
definition_list {-> pre_parser_definition*} =
    {single} [only]:definition
        {-> [only.pre_parser_definition]}
  | {multi} [init]:definition_list semicolon [last]:definition
        {-> [init.pre_parser_definition, last.pre_parser_definition]};

// A single entry of the DEFINITIONS clause:
//  - name, optional parenthesised parameter list, begin_def_body, and the body token, or
//  - {file}: a bare string literal.
definition {-> pre_parser_definition} =
    [lhs]:pre_parser_identifier [params]:def_parameters? begin_def_body [body]:rhs_body
        {-> New pre_parser_definition(lhs, [params.pre_parser_identifier], body)}
  | {file} [path]:pre_parser_string
        {-> New pre_parser_definition.file(path)};

// Parenthesised, non-empty parameter list; the parentheses are dropped from the result.
def_parameters {-> pre_parser_identifier*} =
  left_par [ids]:identifier_list right_par
      {-> [ids.pre_parser_identifier]};

// One or more identifiers separated by commas (left-recursive), kept in source order.
identifier_list {-> pre_parser_identifier*} =
    {single} [ident]:pre_parser_identifier
        {-> [ident]}
  | {multi} [init]:identifier_list comma [ident]:pre_parser_identifier
        {-> [init.pre_parser_identifier, ident]};

/*******************************************************************
 * Abstract Syntax Tree                                            *
 *******************************************************************/
Abstract Syntax Tree

// Shape of the tree built by the `{-> ...}` transformations in the Productions section.

// Root node: the (possibly empty) list of definitions found in the input.
pre_parse_unit = pre_parser_definition*;

// One definition. Two alternatives:
//  - unnamed alternative: the definition name, zero or more parameter identifiers,
//    and the rhs_body token of the definition body;
//  - {file}: a single string token (NOTE(review): presumably the name of a definition file
//    to be loaded - confirm against the code that consumes this node).
pre_parser_definition =
  [def_name]:pre_parser_identifier [parameters]:pre_parser_identifier* [rhs]:rhs_body |
  {file} [filename]:pre_parser_string;