org.apache.sysml.parser.pydml.Pydml.g4 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
grammar Pydml;
@header
{
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
}
// This antlr grammar is based on Python 3.3 language reference: https://docs.python.org/3.3/reference/grammar.html
// INDENT and DEDENT have no lexer rule of their own: they are synthesized by the embedded
// lexer code (the NEWLINE rule action and the nextToken() override) and are used by the
// parser rules to delimit indented bodies.
tokens { INDENT, DEDENT }
@lexer::members {
    // When true, the indentation handling prints a trace of its decisions to stdout.
    private boolean debugIndentRules = false;

    // Indentation level stack: one entry (a width computed by getIndentationCount)
    // for every INDENT that has been emitted and not yet closed by a DEDENT.
    private java.util.Stack<Integer> indents = new java.util.Stack<Integer>();

    // Extra tokens queue (see the NEWLINE rule): every emitted token is buffered
    // here and handed out by nextToken() in FIFO order.
    private java.util.Queue<Token> tokens = new java.util.LinkedList<Token>();

    // Number of currently opened brackets and parentheses.
    private int opened = 0;

    // Last token seen on the default channel; only used to set the line number for DEDENT.
    private Token lastToken = null;

    // Records the token as the current lexer token and appends it to the buffer,
    // so that a single lexer rule can produce several tokens.
    @Override
    public void emit(Token t) {
        if (debugIndentRules) {
            System.out.println("Emitted token:" + t);
        }
        super.setToken(t);
        tokens.offer(t);
    }

    // Returns the next buffered token. When the end of input is reached while
    // indentation levels are still open, a closing NEWLINE, one DEDENT per open
    // level and a final EOF are inserted first.
    @Override
    public Token nextToken() {
        if (_input.LA(1) == EOF && !this.indents.isEmpty()) {
            if (debugIndentRules) {
                System.out.println("EOF reached and expecting some DEDENTS, so emitting them");
            }
            // Remove only EOF tokens that are already buffered. Polling the head of the
            // queue unconditionally would discard a pending real token whenever the
            // buffer does not start with EOF (for example when the input ends inside
            // an indented block without a trailing line break).
            for (java.util.Iterator<Token> it = tokens.iterator(); it.hasNext();) {
                if (it.next().getType() == EOF) {
                    it.remove();
                }
            }
            // First emit an extra line break that serves as the end of the statement.
            this.emit(commonToken(PydmlParser.NEWLINE, "\n"));
            // Now emit as many DEDENT tokens as needed.
            while (!indents.isEmpty()) {
                if (debugIndentRules) {
                    System.out.println("Emitting (inserted) DEDENTS");
                }
                this.emit(createDedent());
                indents.pop();
            }
            // Put the EOF back on the token stream.
            this.emit(commonToken(PydmlParser.EOF, ""));
        }
        Token next = super.nextToken();
        if (next.getChannel() == Token.DEFAULT_CHANNEL) {
            // Keep track of the last token on the default channel.
            this.lastToken = next;
        }
        Token retVal = tokens.isEmpty() ? next : tokens.poll();
        if (debugIndentRules) {
            System.out.println("Returning nextToken: [" + retVal + "]<<" + tokens.isEmpty());
        }
        return retVal;
    }

    // Creates a DEDENT token carrying the line number of the last default-channel token.
    private Token createDedent() {
        CommonToken dedent = commonToken(PydmlParser.DEDENT, "");
        if (this.lastToken != null) {
            dedent.setLine(this.lastToken.getLine());
        }
        return dedent;
    }

    // Creates a synthetic token of the given type and text that is not tied to a
    // position in the input stream.
    private CommonToken commonToken(int type, String text) {
        // Main change compared to the upstream Python grammar: the position-based
        // construction below emitted 3 additional characters, so it is disabled for now.
        // int start = this.getCharIndex();
        // int stop = start + text.length();
        // return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
        return new CommonToken(type, text); // Main change
    }

    // Calculates the indentation level from the spaces:
    // "Tabs are replaced (from left to right) by one to eight spaces
    // such that the total number of characters up to and including
    // the replacement is a multiple of eight [...]"
    // https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
    static int getIndentationCount(String spaces) {
        int count = 0;
        for (char ch : spaces.toCharArray()) {
            switch (ch) {
                case '\t':
                    // Advance to the next multiple of eight.
                    count += 8 - (count % 8);
                    break;
                default:
                    // A normal space char.
                    count++;
            }
        }
        return count;
    }
}
// 2. Modify this g4 by comparing it with Java:
// - https://pythonconquerstheuniverse.wordpress.com/2009/10/03/python-java-a-side-by-side-comparison/
// - http://www.cs.gordon.edu/courses/cps122/handouts-2014/From%20Python%20to%20Java%20Lecture/A%20Comparison%20of%20the%20Syntax%20of%20Python%20and%20Java.pdf
// - http://cs.joensuu.fi/~pviktor/python/slides/cheatsheet.pdf
// - http://www.interfaceware.com/manual/chameleon/scripts/quickreference.pdf
// Entry rule: a PyDML program is a sequence of statements and function definitions,
// optionally followed by trailing NEWLINE tokens, and must be matched up to EOF.
// For now, we only allow global function definitions (not nested or inside a while block):
// functionStatement is referenced from this rule only, not from the statement rule.
programroot: (blocks+=statement | functionBlocks+=functionStatement)* NEWLINE* EOF;
// A single PyDML statement. Every alternative is labeled. Simple statements end with a
// NEWLINE token; compound statements (if / for / parfor / while) take their bodies as
// NEWLINE INDENT statement+ DEDENT. The rule returns a StatementInfo that is created
// empty in the init action below.
statement returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action runs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
// ------------------------------------------
// ImportStatement: source("file") as namespace
'source' OPEN_PAREN filePath = STRING CLOSE_PAREN 'as' namespace=ID NEWLINE # ImportStatement
// PathStatement: setwd("path")
| 'setwd' OPEN_PAREN pathValue = STRING CLOSE_PAREN NEWLINE # PathStatement
// ------------------------------------------
// AssignmentStatement with ifdef(commandLineParam, defaultExpression) on the right-hand side
| targetList=dataIdentifier '=' 'ifdef' OPEN_PAREN commandLineParam=dataIdentifier ',' source=expression CLOSE_PAREN NEWLINE # IfdefAssignmentStatement
// ------------------------------------------
// Treat function call as AssignmentStatement or MultiAssignmentStatement
// For backward compatibility and also since the behavior of foo() * A + foo() ... where foo returns A
// Convert FunctionCallIdentifier(paramExprs, ..) -> source
| // TODO: Throw an informative error if the user does not provide the optional assignment
( targetList=dataIdentifier '=' )? name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN NEWLINE # FunctionCallAssignmentStatement
// Multi-assignment form: [a, b, ...] = f(...)
| OPEN_BRACK targetList+=dataIdentifier (',' targetList+=dataIdentifier)* CLOSE_BRACK '=' name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN NEWLINE # FunctionCallMultiAssignmentStatement
// {notifyErrorListeners("Too many parentheses");}
// We don't support block statement
// | '{' body+=expression ';'* ( body+=expression ';'* )* '}' # BlockStatement
// ------------------------------------------
// Plain assignment: target = expression
| targetList=dataIdentifier '=' source=expression NEWLINE # AssignmentStatement
// IfStatement: the predicate may be parenthesized or bare; zero or more elif branches
// and an optional else branch follow the if body
// | 'if' OPEN_PAREN predicate=expression CLOSE_PAREN (ifBody+=statement ';'* | NEWLINE INDENT (ifBody+=statement)+ DEDENT ) ('else' (elseBody+=statement ';'* | '{' (elseBody+=statement ';'*)* '}'))? # IfStatement
| 'if' (OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression) ':' NEWLINE INDENT (ifBody+=statement)+ DEDENT (elifBranches += elifBranch)* ('else' ':' NEWLINE INDENT (elseBody+=statement)+ DEDENT )? # IfStatement
// ------------------------------------------
// ForStatement & ParForStatement: the loop header may be parenthesized or bare and may
// carry extra name=value parameters after the iterable predicate
| 'for' (OPEN_PAREN iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* CLOSE_PAREN | iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # ForStatement
// Convert strictParameterizedExpression to HashMap for parForParams
| 'parfor' (OPEN_PAREN iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* CLOSE_PAREN | iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # ParForStatement
// WhileStatement: the predicate may be parenthesized or bare
| 'while' ( OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # WhileStatement
// ------------------------------------------
// A lone NEWLINE token is accepted as an (empty) statement
| NEWLINE #IgnoreNewLine
;
// One 'elif' branch of an if statement: a parenthesized or bare predicate, a colon and
// an indented body of one or more statements. Returns a StatementInfo created empty below.
elifBranch returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action runs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
'elif' (OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression) ':' NEWLINE INDENT (elifBody+=statement)+ DEDENT
;
// Iteration range used in for / parfor headers. Two forms are accepted:
//   from:to
//   ID(from, to) or ID(from, to, increment)
// The grammar does not restrict which ID is used in the second form
// (presumably a sequence-generating builtin; to be checked by later validation).
iterablePredicate returns [ org.apache.sysml.parser.common.ExpressionInfo info ]
@init {
// This action runs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.ExpressionInfo();
} :
from=expression ':' to=expression #IterablePredicateColonExpression
| ID OPEN_PAREN from=expression ',' to=expression (',' increment=expression)? CLOSE_PAREN #IterablePredicateSeqExpression
;
// Function definitions. Two forms are accepted:
//   def name(typed inputs) -> (typed outputs): followed by an indented body
//   defExternal name(typed inputs) -> (typed outputs) implemented in (key="value", ...)
// In both forms the '-> (...)' output list is optional.
functionStatement returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action runs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
// ------------------------------------------
// FunctionStatement & ExternalFunctionStatement
// small change: only allow typed arguments here ... instead of data identifier
'def' name=ID OPEN_PAREN ( inputParams+=typedArgNoAssign (',' inputParams+=typedArgNoAssign)* )? CLOSE_PAREN ( '->' OPEN_PAREN ( outputParams+=typedArgNoAssign (',' outputParams+=typedArgNoAssign)* )? CLOSE_PAREN )? ':' NEWLINE INDENT (body+=statement)+ DEDENT # InternalFunctionDefExpression
| 'defExternal' name=ID OPEN_PAREN ( inputParams+=typedArgNoAssign (',' inputParams+=typedArgNoAssign)* )? CLOSE_PAREN ( '->' OPEN_PAREN ( outputParams+=typedArgNoAssign (',' outputParams+=typedArgNoAssign)* )? CLOSE_PAREN )? 'implemented' 'in' OPEN_PAREN ( otherParams+=strictParameterizedKeyValueString (',' otherParams+=strictParameterizedKeyValueString)* )? CLOSE_PAREN NEWLINE # ExternalFunctionDefExpression
// ------------------------------------------
;
// Other data identifiers are typedArgNoAssign, parameterizedExpression and strictParameterizedExpression
// A reference to data: an indexed identifier, a plain identifier, or a command-line
// parameter given by name or by position. Returns an ExpressionInfo created empty below.
dataIdentifier returns [ org.apache.sysml.parser.common.ExpressionInfo dataInfo ]
@init {
// This action runs regardless of which alternative of this rule is matched
$dataInfo = new org.apache.sysml.parser.common.ExpressionInfo();
// $dataInfo.expr = new org.apache.sysml.parser.DataIdentifier();
} :
// ------------------------------------------
// IndexedIdentifier -- allows implicit lower and upper bounds:
// every bound is optional, and the row and column parts may each use a ':' slice
name=ID OPEN_BRACK (rowLower=expression)? (rowImplicitSlice=':' (rowUpper=expression)?)? (',' (colLower=expression)? (colImplicitSlice=':' (colUpper=expression)?)?)? CLOSE_BRACK # IndexedExpression
// ------------------------------------------
| ID # SimpleDataIdentifierExpression
| COMMANDLINE_NAMED_ID # CommandlineParamExpression
| COMMANDLINE_POSITION_ID # CommandlinePositionExpression
;
// Expressions. This is a left-recursive ANTLR4 rule, so operator precedence follows the
// order of the alternatives: alternatives listed earlier bind tighter. Binary operators
// are left-associative unless marked otherwise; the power operator is explicitly marked
// right-associative so that a ** b ** c is parsed as a ** (b ** c), as in Python.
// Returns an ExpressionInfo that is created empty in the init action below.
expression returns [ org.apache.sysml.parser.common.ExpressionInfo info ]
@init {
// This action runs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.ExpressionInfo();
// $info.expr = new org.apache.sysml.parser.BinaryExpression(org.apache.sysml.parser.Expression.BinaryOp.INVALID);
} :
    // ------------------------------------------
    // BinaryExpression
    // power (right-associative)
    <assoc=right> left=expression op='**' right=expression # PowerExpression
    // unary plus and minus
    | op=('-'|'+') left=expression # UnaryExpression
    // sequence - since we are only using this into for loop => Array not supported
    //| left=expression op=':' right=expression # SequenceExpression
    // matrix multiply
    // | left=expression op='*' right=expression # MatrixMulExpression
    // modulus and integer division
    | left=expression op=('//' | '%' ) right=expression # ModIntDivExpression
    // arithmetic multiply and divide
    | left=expression op=('*'|'/') right=expression # MultDivExpression
    // arithmetic addition and subtraction
    | left=expression op=('+'|'-') right=expression # AddSubExpression
    // ------------------------------------------
    // RelationalExpression
    | left=expression op=('>'|'>='|'<'|'<='|'=='|'!=') right=expression # RelationalExpression
    // ------------------------------------------
    // BooleanExpression
    // boolean not
    | op='!' left=expression # BooleanNotExpression
    // boolean and
    | left=expression op=('&'|'and') right=expression # BooleanAndExpression
    // boolean or
    | left=expression op=('|'|'or') right=expression # BooleanOrExpression
    // ---------------------------------
    // only applicable for builtin function expressions
    // Add following additional functions and check number of parameters:
    // power, full, matrix, reshape, dot
    // Also take care of calls such as y.transpose(): the part before the '.' of an ID
    // is sometimes a matrix object and sometimes a namespace
    | name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN ';'* # BuiltinFunctionExpression
    // 4. Atomic
    | OPEN_PAREN left=expression CLOSE_PAREN # AtomicExpression
    // Should you allow indexed expression here ?
    // | OPEN_BRACK targetList+=expression (',' targetList+=expression)* CLOSE_BRACK # MultiIdExpression
    // | BOOLEAN # ConstBooleanIdExpression
    | 'True' # ConstTrueExpression
    | 'False' # ConstFalseExpression
    | INT # ConstIntIdExpression
    | DOUBLE # ConstDoubleIdExpression
    | STRING # ConstStringIdExpression
    | dataIdentifier # DataIdExpression
    // Special
    // | 'NULL' | 'NA' | 'Inf' | 'NaN'
    ;
// Typed formal argument of a function definition: name ':' type, without a default value.
typedArgNoAssign
    : paramName=ID ':' paramType=ml_type
    ;
// Call argument: an expression, optionally preceded by 'name ='.
parameterizedExpression
    : (paramName=ID '=')? paramVal=expression
    ;
// Named argument whose name is mandatory: name '=' expression.
strictParameterizedExpression
    : paramName=ID '=' paramVal=expression
    ;
// Named argument whose value must be a string literal: name '=' STRING.
strictParameterizedKeyValueString
    : paramName=ID '=' paramVal=STRING
    ;
// Identifier token: a name made of a letter followed by letters, digits or underscores,
// optionally prefixed by another such name and a single '.'.
// The prefix before the '.' is sometimes a matrix object and sometimes a namespace.
// Names cannot start with an underscore or a digit.
ID : (ALPHABET (ALPHABET|DIGIT|'_')* '.')? ALPHABET (ALPHABET|DIGIT|'_')*
// Special ID cases:
// | 'matrix' // --> This is a special case which causes lot of headache
// | 'scalar' | 'float' | 'int' | 'bool' // corresponds to as.scalar, as.double, as.integer and as.logical
// The two literals below are listed explicitly as identifiers.
| 'index.return' | 'empty.return'
;
// Unfortunately, we have a datatype name clashing with a builtin function name: matrix :(
// Therefore, as an ugly workaround, type names are matched as plain IDs here.
// A type is either a value type alone, or a data type followed by a bracketed value type.
ml_type : valueType | dataType OPEN_BRACK valueType CLOSE_BRACK;
// Note: to reduce the number of keywords, these are case-sensitive.
// To allow case-insensitive matching, 'int' would become: ('i' | 'I') ('n' | 'N') ('t' | 'T')
// The grammar accepts any ID as a value type; the commented list below shows the
// intended names.
valueType:
ID # ValueDataTypeCheck
// 'int' | 'str' | 'bool' | 'float'
;
// Data type name. The grammar accepts any ID here; the check against "matrix" shown in
// the trailing comment of the alternative is disabled.
dataType:
// 'scalar' # ScalarDataTypeDummyCheck
// |
ID # MatrixDataTypeCheck //{ if($ID.text.compareTo("matrix") != 0) { notifyErrorListeners("incorrect datatype"); } }
//| 'matrix' //---> See ID, this causes lot of headache
;
// Integer literal: one or more digits with an optional 'L' or 'l' suffix.
INT : DIGIT+ [Ll]?;
// BOOLEAN : 'TRUE' | 'FALSE';
// Floating-point literal in one of three forms, each with an optional exponent and an
// optional 'L' or 'l' suffix:
//   digits '.' optional digits
//   digits only (without an exponent this matches the same text as INT, which is
//   declared earlier in this grammar)
//   '.' digits
DOUBLE: DIGIT+ '.' DIGIT* EXP? [Ll]?
| DIGIT+ EXP? [Ll]?
| '.' DIGIT+ EXP? [Ll]?
;
// Character classes used as building blocks by ID, INT, DOUBLE and the command-line
// identifier rules.
// NOTE(review): DIGIT and ALPHABET are not declared as fragments, so they are also
// token types of their own - confirm whether that is intended.
DIGIT: '0'..'9';
ALPHABET : [a-zA-Z] ;
// Exponent part of a floating-point literal: 'E' or 'e', an optional sign, then an INT.
fragment EXP : ('E' | 'e') ('+' | '-')? INT ;
// Command-line parameter referenced by name: '$' followed by a letter and then letters,
// digits or underscores.
COMMANDLINE_NAMED_ID: '$' ALPHABET (ALPHABET|DIGIT|'_')*;
// Command-line parameter referenced by position: '$' followed by one or more digits.
COMMANDLINE_POSITION_ID: '$' DIGIT+;
// String literal: supports single- and double-quoted strings with escape characters.
// Inside the quotes, a backslash is only accepted as part of an ESC sequence.
STRING: '"' ( ESC | ~[\\"] )*? '"' | '\'' ( ESC | ~[\\'] )*? '\'';
// Escape sequence: a backslash followed by one of b t n f r " ' or another backslash.
fragment ESC : '\\' [btnfr"'\\] ;
// Comments, whitespaces and new line
// LINE_COMMENT : '#' .*? '\r'? '\n' -> skip ;
// MULTILINE_BLOCK_COMMENT : '/*' .*? '*/' -> skip ;
// WHITESPACE : (' ' | '\r' | '\n')+ -> skip ;
// Bracket and parenthesis tokens. Each opening token increments and each closing token
// decrements the lexer counter 'opened'; the NEWLINE rule skips line breaks while that
// counter is greater than zero.
OPEN_BRACK : '[' {opened++;};
CLOSE_BRACK : ']' {opened--;};
OPEN_PAREN : '(' {opened++;};
CLOSE_PAREN : ')' {opened--;};
// OPEN_BRACE : '{' {opened++;};
// CLOSE_BRACE : '}' {opened--;};
// One or more blanks or tabs.
fragment SPACES : [ \t]+ ;
// A comment: '#' up to, but not including, the end of the line.
fragment COMMENT : '#' ~[\r\n]* ;
// Explicit line continuation: a backslash, optional blanks, then a line break.
fragment LINE_JOINING : '\\' SPACES? ( '\r'? '\n' | '\r' ) ;
// Matches a line break together with the leading blanks/tabs of the following line and,
// through the action below, turns indentation changes into NEWLINE, INDENT and DEDENT
// tokens (buffered by the overridden emit method of the lexer).
NEWLINE : ( '\r'? '\n' | '\r' ) SPACES?
{
// Split the matched text into its line-break part and its indentation part.
String newLine = getText().replaceAll("[^\r\n]+", "");
String spaces = getText().replaceAll("[\r\n]+", "");
// First character that follows the matched indentation.
int next = _input.LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '#') {
// If we are inside brackets or parentheses, on a blank line, or on a line that
// starts with a comment, ignore all indents, dedents and line breaks.
skip();
if(debugIndentRules) {
if(next == '\r' || next == '\n') {
System.out.println("4.1 Skipping (blank lines)");
}
else if(next == '#') {
System.out.println("4.2 Skipping (comment)");
}
else {
System.out.println("4.2 Skipping something else");
}
}
}
else {
// The NEWLINE token itself is always emitted for a significant line break.
emit(commonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
// Indentation of the enclosing block; 0 when no INDENT is open.
int previous = indents.isEmpty() ? 0 : indents.peek();
if (indent == previous) {
if(debugIndentRules)
System.out.println("3. Skipping identation as of same size:" + next);
// skip indents of the same size as the present indent-size
// (the NEWLINE token emitted above stays in the buffer)
skip();
}
else if (indent > previous) {
if(debugIndentRules)
System.out.println("1. Indent:" + next);
// Deeper indentation: open a new level.
indents.push(indent);
emit(commonToken(PydmlParser.INDENT, spaces));
}
else {
// Possibly emit more than 1 DEDENT token: one for every open level that is
// deeper than the new indentation.
while(!indents.isEmpty() && indents.peek() > indent) {
if(debugIndentRules)
System.out.println("2. Dedent:" + next);
this.emit(createDedent());
indents.pop();
}
}
}
}
;
// Discards blanks/tabs, '#' comments and backslash line continuations.
SKIP_WS : ( SPACES | COMMENT | LINE_JOINING ) -> skip ;