All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.pig.parser.AstValidator.g Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
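 * Grammar file for the Pig tree parser that validates the AST: it rejects
 * duplicated schema field names, references to undefined aliases, duplicated
 * command options, and join/group key lists whose arity does not match.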
 *
 * NOTE: THIS FILE IS BASED ON QueryParser.g, SO IF YOU CHANGE THAT FILE, YOU WILL
 *       PROBABLY NEED TO MAKE CORRESPONDING CHANGES TO THIS FILE AS WELL.
 */

tree grammar AstValidator;

// Generator options for this tree grammar.
options {
    // Token types are taken from the QueryParser grammar.
    tokenVocab=QueryParser;
    // Tree nodes are typed as CommonTree in actions and generated code.
    ASTLabelType=CommonTree;
    // The walker also produces an AST as its output.
    output=AST;
    // Let the generated walker backtrack between alternatives it cannot
    // distinguish with fixed lookahead.
    backtrack=true;
}

@header {
package org.apache.pig.parser;

import org.apache.pig.data.DataType;
import org.apache.pig.impl.util.NumValCarrier;

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
}

@members {

// Logger for this validator; final so the shared reference cannot be reassigned.
private static final Log log = LogFactory.getLog( AstValidator.class );

/**
 * Overrides single-token mismatch recovery so that no recovery is attempted:
 * the mismatch is reported at once as a MismatchedTokenException.
 *
 * @param input stream in which the mismatch was found
 * @param ttype token type that was expected
 * @param follow follow set (unused here)
 * @throws RecognitionException always, as a MismatchedTokenException
 */
@Override
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
throws RecognitionException {
    final MismatchedTokenException mismatch = new MismatchedTokenException( ttype, input );
    throw mismatch;
}

/**
 * Overrides set-mismatch recovery so that no recovery is attempted: the
 * exception passed in is rethrown unchanged.
 *
 * @param input stream in which the mismatch was found (unused here)
 * @param e the recognition error being handled
 * @param follow follow set (unused here)
 * @throws RecognitionException always; the same instance as e
 */
@Override
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)
throws RecognitionException {
    throw e;
}

/**
 * Records a schema field name and rejects a name that was already recorded.
 *
 * @param fieldNames names seen so far in the current field definition list; updated in place
 * @param node tree node used to report the source location of a duplicate
 * @param name field name to record
 * @throws DuplicatedSchemaAliasException if name is already contained in fieldNames
 */
private void validateSchemaAliasName(Set<String> fieldNames, CommonTree node, String name)
throws DuplicatedSchemaAliasException {
    // Set.add returns false when the element is already present, so a single
    // call both checks for the duplicate and records a new name.
    if( !fieldNames.add( name ) ) {
        throw new DuplicatedSchemaAliasException( input,
            new SourceLocation( (PigParserNode)node ), name );
    }
}

/**
 * Checks that a referenced relation alias has been defined.
 *
 * @param aliases aliases registered so far
 * @param node tree node used to report the source location of the reference
 * @param alias alias being referenced
 * @throws UndefinedAliasException if alias is not contained in aliases
 */
private void validateAliasRef(Set<String> aliases, CommonTree node, String alias)
throws UndefinedAliasException {
    if( !aliases.contains( alias ) ) {
        throw new UndefinedAliasException( input, new SourceLocation( (PigParserNode)node ), alias );
    }
}

/**
 * Fails when a command option has been seen more than once.
 *
 * @param count number of times the option has been seen, including this occurrence
 * @param node tree node used to report the source location
 * @throws ParserValidationException if count is greater than one
 */
private void checkDuplication(int count, CommonTree node) throws ParserValidationException {
    if( count <= 1 ) {
        return;
    }
    final SourceLocation where = new SourceLocation( (PigParserNode)node );
    throw new ParserValidationException( input, where, "Duplicated command option" );
}

// Alias most recently added to the alias set; null until the first alias is added.
private String lastRel;

/**
 * Resolves the "@" shorthand to the most recently registered alias.
 *
 * @param node tree node used to report the source location on failure
 * @return the most recently registered alias
 * @throws UndefinedAliasException (reported for "@") if no alias has been registered yet
 */
private String getLastRel(CommonTree node) throws UndefinedAliasException {
    if (lastRel == null) {
        throw new UndefinedAliasException( input, new SourceLocation((PigParserNode)node), "@");
    }
    return lastRel;
}

// Relation aliases defined so far. The anonymous subclass also remembers the
// alias passed to the latest add() call in lastRel, which getLastRel() returns.
// The type arguments are required: without them add(String) would not override
// HashSet.add(Object) and the @Override annotation would not compile.
private Set<String> aliases = new HashSet<String>() {
    @Override
    public boolean add(String e) {
        // Updated on every call, even when e is already in the set.
        lastRel = e;
        return super.add(e);
    }
};

} // End of @members

// Catch clause used for the generated rule methods: a recognition error is
// rethrown unchanged rather than handled inside the rule.
@rulecatch {
catch(RecognitionException re) {
    throw re;
}
}

// Root of the tree: a QUERY node with zero or more statement children.
query : ^( QUERY statement* )
;

// A statement is exactly one of the five forms below.
statement : general_statement
          | split_statement
          | realias_statement
          | register_statement
          | assert_statement
;

// A SPLIT clause used as a statement of its own.
split_statement : split_clause
;

// A REALIAS clause used as a statement of its own.
realias_statement : realias_clause
;

// REGISTER node: a quoted string, optionally followed by
// USING <identifier> AS <identifier>.
register_statement : ^( REGISTER QUOTEDSTRING (USING IDENTIFIER AS IDENTIFIER)? )
;

// An ASSERT clause used as a statement of its own.
assert_statement : assert_clause
;

// STATEMENT node: an optional target alias, an operator clause and an optional
// PARALLEL clause. The alias is added to the alias set as soon as it is
// matched, i.e. before op_clause is walked.
general_statement : ^( STATEMENT ( alias { aliases.add( $alias.name ); } )? op_clause parallel_clause? )
;

// REALIAS node: an alias followed by an IDENTIFIER. Only the alias (first
// child) is added to the alias set.
realias_clause : ^(REALIAS alias IDENTIFIER)
   {
       aliases.add( $alias.name );
   }
;

// PARALLEL node with a single INTEGER child.
parallel_clause : ^( PARALLEL INTEGER )
;

// An alias is a single IDENTIFIER; returns its text and its tree node.
alias returns[String name, CommonTree node]
 : IDENTIFIER
   {
       $name = $IDENTIFIER.text;
       $node = $IDENTIFIER;
   }
;

// The ARROBA token stands for the previously registered relation. The name is
// resolved through getLastRel, which throws UndefinedAliasException when no
// alias has been registered yet.
previous_rel returns[String name, CommonTree node]
 : ARROBA
   {
       $name = getLastRel($ARROBA);
       $node = $ARROBA;
   }
;

// Any single operator clause. Also reachable from rel, so operator clauses can
// be nested as inputs of other operators.
op_clause : define_clause
          | load_clause
          | group_clause
          | store_clause
          | filter_clause
          | distinct_clause
          | limit_clause
          | sample_clause
          | order_clause
          | rank_clause
          | cross_clause
          | join_clause
          | union_clause
          | stream_clause
          | mr_clause
          | split_clause
          | foreach_clause
          | cube_clause
          | assert_clause
;

// DEFINE node: an alias bound to either a command or a function clause.
define_clause : ^( DEFINE alias ( cmd | func_clause ) )
;

// EXECCOMMAND node with any number of option clauses in any order. One counter
// per option kind is incremented on each occurrence; checkDuplication throws
// ParserValidationException when a counter exceeds one, so each kind of option
// may appear at most once.
cmd
@init {
    int ship = 0;
    int cache = 0;
    int in = 0;
    int out = 0;
    int error = 0;
}
 : ^( EXECCOMMAND ( ship_clause { checkDuplication( ++ship, $ship_clause.start ); }
                  | cache_clause { checkDuplication( ++cache, $cache_clause.start ); }
                  | input_clause { checkDuplication( ++in, $input_clause.start ); }
                  | output_clause { checkDuplication( ++out, $output_clause.start ); }
                  | error_clause { checkDuplication( ++error, $error_clause.start ); }
                  )*
   )
;

// SHIP node with an optional list of paths.
ship_clause : ^( SHIP path_list? )
;

// One or more quoted strings.
path_list : QUOTEDSTRING+
;

// CACHE node; unlike SHIP, the path list is mandatory.
cache_clause : ^( CACHE path_list )
;

// INPUT node with one or more stream commands.
input_clause : ^( INPUT stream_cmd+ )
;

// A stream endpoint (STDIN, STDOUT or a quoted string) with an optional
// function clause as its child.
stream_cmd : ^( STDIN func_clause? )
           | ^( STDOUT func_clause? )
           | ^( QUOTEDSTRING func_clause? )
;

// OUTPUT node with one or more stream commands.
output_clause : ^( OUTPUT stream_cmd+ )
;

// STDERROR node: optionally a quoted string, which may be followed by an INTEGER.
error_clause : ^( STDERROR  ( QUOTEDSTRING INTEGER? )? )
;

// LOAD node: a file name, then an optional function clause and an optional AS clause.
load_clause : ^( LOAD filename func_clause? as_clause? )
;

// A file name is a single quoted string.
filename : QUOTEDSTRING
;

// AS node wrapping a list of field definitions.
as_clause: ^( AS field_def_list )
;

// One field definition. Either form registers a name in fieldNames through
// validateSchemaAliasName, which throws on a duplicate:
//  - FIELD_DEF: the IDENTIFIER text is the name; the type is optional.
//  - FIELD_DEF_WITHOUT_IDENTIFIER: the name is produced by
//    nvc.makeNameFromDataType from the matched type.
field_def[Set fieldNames, NumValCarrier nvc] throws DuplicatedSchemaAliasException
 : ^( FIELD_DEF IDENTIFIER { validateSchemaAliasName( fieldNames, $IDENTIFIER, $IDENTIFIER.text ); } type? )
 | ^( FIELD_DEF_WITHOUT_IDENTIFIER type { validateSchemaAliasName ( fieldNames, $FIELD_DEF_WITHOUT_IDENTIFIER, $nvc.makeNameFromDataType ( $type.typev ) ); } )
;

// One or more field definitions that share a single name set, so a field name
// may appear only once per list. A fresh name set and a fresh NumValCarrier are
// created on every entry to this rule. The set is typed Set<String> so that
// only String names can be stored in it.
field_def_list throws DuplicatedSchemaAliasException
scope{
    Set<String> fieldNames;
    NumValCarrier nvc;
}
@init {
    $field_def_list::fieldNames = new HashSet<String>();
    $field_def_list::nvc = new NumValCarrier();
}
 : ( field_def[$field_def_list::fieldNames, $field_def_list::nvc] )+
;

// Any type. Returns the DataType code: the simple type's own code, or the
// TUPLE / BAG / MAP constant for the complex types.
type returns [byte typev]
  : simple_type { $typev = $simple_type.typev; }
  | tuple_type { $typev = DataType.TUPLE; }
  | bag_type { $typev = DataType.BAG; }
  | map_type { $typev = DataType.MAP; }
;

// Maps each simple type token to the corresponding DataType constant.
simple_type returns [byte typev]
  : BOOLEAN { $typev = DataType.BOOLEAN; }
  | INT { $typev = DataType.INTEGER; }
  | LONG { $typev = DataType.LONG; }
  | FLOAT { $typev = DataType.FLOAT; }
  | DOUBLE { $typev = DataType.DOUBLE; }
  | BIGINTEGER { $typev = DataType.BIGINTEGER; }
  | BIGDECIMAL { $typev = DataType.BIGDECIMAL; }
  | DATETIME { $typev = DataType.DATETIME; }
  | CHARARRAY { $typev = DataType.CHARARRAY; }
  | BYTEARRAY { $typev = DataType.BYTEARRAY; }
;

// TUPLE_TYPE node with an optional list of field definitions.
tuple_type : ^( TUPLE_TYPE field_def_list? )
;

// BAG_TYPE node: an optional IDENTIFIER followed by an optional tuple type.
bag_type : ^( BAG_TYPE IDENTIFIER? tuple_type? )
;

// MAP_TYPE node: an optional IDENTIFIER followed by an optional type.
map_type : ^( MAP_TYPE IDENTIFIER? type? )
;

// A function clause: FUNC_REF with just a name, or FUNC with a name and
// optional string arguments.
func_clause : ^( FUNC_REF func_name )
            | ^( FUNC func_name func_args? )
;

// A function name: extended identifiers separated by PERIOD or DOLLAR.
func_name : eid ( ( PERIOD | DOLLAR ) eid )*
;

// A single function argument: a quoted string or a multi-line quoted string.
func_args_string : QUOTEDSTRING | MULTILINE_QUOTEDSTRING
;

// One or more function argument strings.
func_args : func_args_string+
;

// CUBE node wrapping a single cube item.
cube_clause
 : ^( CUBE cube_item )
;

// The input relation followed by its BY clause.
cube_item
 : rel ( cube_by_clause )
;

// BY node wrapping the cube/rollup lists.
cube_by_clause
 : ^( BY cube_or_rollup )
;

// One or more CUBE or ROLLUP lists.
cube_or_rollup
 : cube_rollup_list+
;

// A CUBE or ROLLUP node with its expression list.
cube_rollup_list
 : ^( ( CUBE | ROLLUP ) cube_by_expr_list )
;

// One or more BY expressions.
cube_by_expr_list
 : cube_by_expr+
;

// A BY expression: a column range, an expression, or STAR.
cube_by_expr
 : col_range | expr | STAR
;

// GROUP or COGROUP node: one or more inputs, then an optional group type and an
// optional partition clause. The scoped `arity` starts at 0 on entry and is
// used by group_item to compare the number of BY expressions across inputs.
group_clause
scope {
    int arity;
}
@init {
    $group_clause::arity = 0;
}
 : ^( ( GROUP | COGROUP ) group_item+ group_type? partition_clause? )
;

// The group type is a single quoted string.
group_type : QUOTEDSTRING
;

// One grouping input: a relation, its keys (a BY clause, ALL or ANY) and an
// optional INNER / OUTER marker. While the scoped arity is still 0 it is set
// from this item's expression count; after that, a differing count raises
// ParserValidationException.
// NOTE(review): when ALL or ANY is matched there is no BY clause, so the value
// read from $join_group_by_clause.exprCount depends on the generated code's
// default for an unmatched rule reference - confirm.
group_item
 : rel ( join_group_by_clause | ALL | ANY ) ( INNER | OUTER )?
   {
       if( $group_clause::arity == 0 ) {
           // For the first input
           $group_clause::arity = $join_group_by_clause.exprCount;
       } else if( $join_group_by_clause.exprCount != $group_clause::arity ) {
           throw new ParserValidationException( input, new SourceLocation( (PigParserNode)$group_item.start ),
               "The arity of the group by columns do not match." );
       }
   }
;

// A relation used as operator input:
//  - an alias, which must already be in the alias set;
//  - the previous-relation shorthand, whose resolved name is checked the same way;
//  - an inline operator clause with an optional PARALLEL clause.
rel : alias {  validateAliasRef( aliases, $alias.node, $alias.name ); }
    | previous_rel { validateAliasRef( aliases, $previous_rel.node, $previous_rel.name ); }
    | op_clause parallel_clause?
;

// One generated item (FLATTEN clause, column range, expression or STAR),
// optionally followed by a list of field definitions.
flatten_generated_item : ( flatten_clause | col_range | expr | STAR ) field_def_list?
;

// FLATTEN node wrapping one expression.
flatten_clause : ^( FLATTEN expr )
;

// STORE node: the relation, the file name and an optional function clause.
store_clause : ^( STORE rel filename func_clause? )
;

// ASSERT node: the relation, the condition and an optional comment.
assert_clause : ^( ASSERT rel cond comment? )
; 

// The assertion comment is a single quoted string.
comment : QUOTEDSTRING
;

// FILTER node: the relation and the filter condition.
filter_clause : ^( FILTER rel cond )
;

// A boolean condition: OR / AND / NOT combinations, a NULL test (with an
// optional NOT child), a relational comparison of two expressions, an IN
// evaluation, a function evaluation, or an expression wrapped in BOOL_COND.
cond : ^( OR cond cond )
     | ^( AND cond cond )
     | ^( NOT cond )
     | ^( NULL expr NOT? )
     | ^( rel_op expr expr )
     | in_eval
     | func_eval
     | ^( BOOL_COND expr )
;

// IN node holding one or more IN_LHS / IN_RHS pairs, each wrapping one expression.
in_eval: ^( IN ( ^( IN_LHS expr ) ^( IN_RHS expr ) )+ )
;

// A function call: FUNC_EVAL with a name and arguments, or INVOKER_FUNC_EVAL,
// which has an additional IDENTIFIER between the name and the arguments.
func_eval: ^( FUNC_EVAL func_name real_arg* ) | ^( INVOKER_FUNC_EVAL func_name IDENTIFIER real_arg* )
;

// A call argument: an expression, STAR, or a column range.
real_arg : expr | STAR | col_range
;

// An expression: the binary arithmetic nodes, casts, constants, variable
// expressions, negation, or a parenthesised expression. CAST_EXPR appears
// twice, once with `type` and once with `type_cast` as the target type.
expr : ^( PLUS expr expr )
     | ^( MINUS expr expr )
     | ^( STAR expr expr )
     | ^( DIV expr expr )
     | ^( PERCENT expr expr )
     | ^( CAST_EXPR type expr )
     | const_expr
     | var_expr
     | ^( NEG expr )
     | ^( CAST_EXPR type_cast expr )
     | ^( EXPR_IN_PAREN expr )
;

// Target type of a cast: a simple type, a map type, or a tuple / bag cast type.
type_cast : simple_type | map_type | tuple_type_cast | bag_type_cast
;

// TUPLE_TYPE_CAST node with zero or more nested cast types.
tuple_type_cast : ^( TUPLE_TYPE_CAST type_cast* )
;

// BAG_TYPE_CAST node with an optional tuple cast type.
bag_type_cast : ^( BAG_TYPE_CAST tuple_type_cast? )
;

// A projectable expression followed by any number of dot or pound projections.
var_expr : projectable_expr ( dot_proj | pound_proj )*
;

// The expressions that can be projected from.
projectable_expr: func_eval | col_ref | bin_expr | case_expr | case_cond
;

// PERIOD node listing one or more columns, each by alias or by index.
dot_proj : ^( PERIOD col_alias_or_index+ )
;

// A column named by alias or by positional index.
col_alias_or_index : col_alias | col_index
;

// A column alias; the GROUP and CUBE tokens are accepted besides IDENTIFIER.
col_alias : GROUP | CUBE | IDENTIFIER
;

// A positional column reference.
col_index : DOLLARVAR
;

// COL_RANGE node: DOUBLE_PERIOD with an optional column reference on each side.
col_range :  ^(COL_RANGE col_ref? DOUBLE_PERIOD col_ref?)
;


// POUND node whose key is a quoted string or NULL.
pound_proj : ^( POUND ( QUOTEDSTRING | NULL ) )
;

// BIN_EXPR node: a condition followed by two expressions.
bin_expr : ^( BIN_EXPR cond expr expr )
;

// CASE_EXPR node: one or more groups, each a CASE_EXPR_LHS expression followed
// by one or more CASE_EXPR_RHS expressions.
case_expr: ^( CASE_EXPR ( ^( CASE_EXPR_LHS expr ) ( ^( CASE_EXPR_RHS expr) )+ )+ )
;

// CASE_COND node: a WHEN node with one or more conditions and a THEN node with
// one or more expressions.
case_cond: ^( CASE_COND ^( WHEN cond+ ) ^( THEN expr+ ) )
;

// LIMIT node: the relation and a limit given as INTEGER, LONGINTEGER or an expression.
limit_clause : ^( LIMIT rel ( INTEGER | LONGINTEGER | expr ) )
;

// SAMPLE node: the relation and a DOUBLENUMBER or an expression.
sample_clause : ^( SAMPLE rel ( DOUBLENUMBER | expr ) )
;

// RANK node: the relation and an optional BY statement.
rank_clause : ^( RANK rel ( rank_by_statement )? )
;

// BY node: the ranking columns, optionally followed by DENSE.
rank_by_statement : ^( BY rank_by_clause ( DENSE )? )
;

// Ranking keys: STAR with an optional direction, or one or more rank columns.
rank_by_clause : STAR ( ASC | DESC )?
               | rank_col+
;

// One ranking key: a column range or a column reference, each with an optional
// ASC / DESC direction.
rank_col : col_range (ASC | DESC)?
         | col_ref ( ASC | DESC )?
;

// ORDER node: the relation, the ordering keys and an optional function clause.
order_clause : ^( ORDER rel order_by_clause func_clause? )
;

// Ordering keys: STAR with an optional direction, or one or more order columns.
order_by_clause : STAR ( ASC | DESC )?
                | order_col+
;

// One ordering key: a column range or a column reference, each with an optional
// ASC / DESC direction.
order_col : col_range (ASC | DESC)?
          | col_ref ( ASC | DESC )?
;

// DISTINCT node: the relation and an optional partition clause.
distinct_clause : ^( DISTINCT rel partition_clause? )
;

// PARTITION node naming a function.
partition_clause : ^( PARTITION func_name )
;

// CROSS node: a list of relations and an optional partition clause.
cross_clause : ^( CROSS rel_list partition_clause? )
;

// One or more relations.
rel_list : rel+
;

// JOIN node: the join inputs, then an optional join type and an optional
// partition clause. The scoped `arity` starts at 0 on entry and is used by
// join_item to compare the number of BY expressions across inputs.
join_clause
scope {
    int arity;
}
@init {
    $join_clause::arity = 0;
}
 : ^( JOIN join_sub_clause join_type? partition_clause? )
;

// The join type is a single quoted string.
join_type : QUOTEDSTRING
;

// Join inputs: two items separated by LEFT / RIGHT / FULL (with an optional
// OUTER), or a plain sequence of one or more items.
join_sub_clause
 : join_item ( LEFT | RIGHT | FULL ) OUTER? join_item
 | join_item+
;

// JOIN_ITEM node: a relation and its BY clause. While the scoped arity is still
// 0 it is set from this item's expression count; after that, a differing count
// raises ParserValidationException.
join_item
 : ^( JOIN_ITEM rel join_group_by_clause )
   {
       if( $join_clause::arity == 0 ) {
           // For the first input
           $join_clause::arity = $join_group_by_clause.exprCount;
       } else if( $join_group_by_clause.exprCount != $join_clause::arity ) {
           throw new ParserValidationException( input, new SourceLocation( (PigParserNode)$join_item.start ),
               "The arity of the join columns do not match." );
       }
   }
;

// BY node with one or more key expressions; returns how many were matched.
join_group_by_clause returns[int exprCount]
@init {
    $exprCount = 0;
}
 : ^( BY ( join_group_by_expr { $exprCount++; } )+ )
;

// A key expression: a column range, an expression, or STAR.
join_group_by_expr : col_range  | expr | STAR
;

// UNION node: an optional ONSCHEMA marker and the list of relations.
union_clause : ^( UNION ONSCHEMA? rel_list )
;

// FOREACH node: the relation and its plan.
foreach_clause : ^( FOREACH rel foreach_plan )
;

// A simple plan holds just a GENERATE clause; a complex plan holds a nested block.
foreach_plan : ^( FOREACH_PLAN_SIMPLE generate_clause )
             | ^( FOREACH_PLAN_COMPLEX nested_blk )
;

// Nested FOREACH block: zero or more nested commands followed by the GENERATE
// clause. The scoped `ids` set is created fresh on every entry and receives the
// identifiers assigned by nested_command. It is typed Set<String> so that only
// String identifiers can be stored in it.
nested_blk
scope { Set<String> ids; }
@init{ $nested_blk::ids = new HashSet<String>(); }
 : nested_command* generate_clause
;

// GENERATE node with one or more generated items.
generate_clause : ^( GENERATE flatten_generated_item+ )
;

// A command inside a nested block: NESTED_CMD binds an identifier to a nested
// operator, NESTED_CMD_ASSI binds it to an expression. In both cases the
// identifier text is added to the enclosing nested_blk's `ids` set.
nested_command
 : ^( NESTED_CMD IDENTIFIER nested_op )
   {
       $nested_blk::ids.add( $IDENTIFIER.text );
   }
 | ^( NESTED_CMD_ASSI IDENTIFIER expr )
   {
       $nested_blk::ids.add( $IDENTIFIER.text );
   }
;

// The operators allowed inside a nested block.
nested_op : nested_proj
          | nested_filter
          | nested_sort
          | nested_distinct
          | nested_limit
          | nested_cross
          | nested_foreach
;

// NESTED_PROJ node: a column reference followed by one or more column references.
nested_proj : ^( NESTED_PROJ col_ref col_ref+ )
;

// Nested FILTER: the input and the condition.
nested_filter
 : ^( FILTER nested_op_input cond )
;

// Nested ORDER: the input, the ordering keys and an optional function clause.
nested_sort : ^( ORDER nested_op_input  order_by_clause func_clause? )
;

// Nested DISTINCT over a single input.
nested_distinct : ^( DISTINCT nested_op_input )
;

// Nested LIMIT: the input and a limit given as INTEGER or an expression.
nested_limit : ^( LIMIT nested_op_input ( INTEGER | expr ) )
;

// Nested CROSS over a list of inputs.
nested_cross : ^( CROSS nested_op_input_list )
;

// Nested FOREACH: the input and a GENERATE clause.
nested_foreach : ^( FOREACH nested_op_input generate_clause )
;

// Input of a nested operator: a column reference or a nested projection.
nested_op_input : col_ref | nested_proj
;

// One or more nested operator inputs.
nested_op_input_list : nested_op_input+
;

// STREAM node: the relation, a command given as EXECCOMMAND or IDENTIFIER, and
// an optional AS clause.
stream_clause : ^( STREAM rel ( EXECCOMMAND | IDENTIFIER ) as_clause? )
;

// MAPREDUCE node: a quoted string, an optional path list, a STORE clause, a
// LOAD clause and an optional EXECCOMMAND.
mr_clause : ^( MAPREDUCE QUOTEDSTRING path_list? store_clause load_clause EXECCOMMAND? )
;

// SPLIT node: the relation, one or more branches and an optional OTHERWISE branch.
split_clause : ^( SPLIT rel split_branch+ split_otherwise? )
;

// SPLIT_BRANCH node: a target alias and its condition. The alias is added to
// the alias set after the branch has been matched.
split_branch
 : ^( SPLIT_BRANCH alias cond )
   {
       aliases.add( $alias.name );
   }
;

// OTHERWISE node: a target alias, which is added to the alias set.
split_otherwise : ^( OTHERWISE alias )
   {
       aliases.add( $alias.name );
   }
;

// A column reference, by alias or by position.
col_ref : alias_col_ref | dollar_col_ref
;

// A column referenced by name; the GROUP and CUBE tokens are accepted besides IDENTIFIER.
alias_col_ref : GROUP | CUBE | IDENTIFIER
;

// A column referenced by position.
dollar_col_ref : DOLLARVAR
;

// A constant expression is a literal.
const_expr : literal
;

// A literal: a scalar, a map, a bag, or a tuple.
literal : scalar | map | bag | tuple
;

// A scalar: a number, a quoted string, NULL, TRUE, or FALSE.
scalar : num_scalar | QUOTEDSTRING | NULL | TRUE | FALSE
;

// A numeric literal of any of the numeric token kinds, with an optional leading MINUS.
num_scalar : MINUS? ( INTEGER | LONGINTEGER | FLOATNUMBER | DOUBLENUMBER | BIGINTEGERNUMBER | BIGDECIMALNUMBER )
;

// MAP_VAL node with zero or more key/value pairs.
map : ^( MAP_VAL keyvalue* )
;

// KEY_VAL_PAIR node: a key followed by a constant expression.
keyvalue : ^( KEY_VAL_PAIR map_key const_expr )
;

// A map key is a single quoted string.
map_key : QUOTEDSTRING
;

// BAG_VAL node with zero or more tuples.
bag : ^( BAG_VAL tuple* )
;

// TUPLE_VAL node with zero or more literals.
tuple : ^( TUPLE_VAL literal* )
;

// Extended identifier: accepts a plain IDENTIFIER, any of the keyword tokens
// listed below, or a string comparison operator (rel_str_op), to resolve
// conflicts between keywords and identifiers. Used by func_name. Ugly, but
// there is no other choice.
eid : rel_str_op
    | IMPORT
    | RETURNS
    | DEFINE
    | LOAD
    | FILTER
    | FOREACH
    | CUBE
    | ROLLUP
    | MATCHES
    | ORDER
    | RANK
    | DISTINCT
    | COGROUP
    | JOIN
    | CROSS
    | UNION
    | SPLIT
    | INTO
    | IF
    | ALL
    | AS
    | BY
    | USING
    | INNER
    | OUTER
    | PARALLEL
    | PARTITION
    | GROUP
    | AND
    | OR
    | NOT
    | GENERATE
    | FLATTEN
    | EVAL
    | ASC
    | DESC
    | BOOLEAN
    | INT
    | LONG
    | FLOAT
    | DOUBLE
    | BIGINTEGER
    | BIGDECIMAL
    | DATETIME
    | CHARARRAY
    | BYTEARRAY
    | BAG
    | TUPLE
    | MAP
    | IS
    | NULL
    | TRUE
    | FALSE
    | STREAM
    | THROUGH
    | STORE
    | MAPREDUCE
    | SHIP
    | CACHE
    | INPUT
    | OUTPUT
    | STDERROR
    | STDIN
    | STDOUT
    | LIMIT
    | SAMPLE
    | LEFT
    | RIGHT
    | FULL
    | IDENTIFIER
    | TOBAG
    | TOMAP
    | TOTUPLE
    | ASSERT
;

// Relational operator: one of the six comparison families below, or the
// string MATCHES operator.
rel_op : rel_op_eq
       | rel_op_ne
       | rel_op_gt
       | rel_op_gte
       | rel_op_lt
       | rel_op_lte
       | STR_OP_MATCHES
;

// Each comparison family accepts both its STR_OP_* and its NUM_OP_* token.
rel_op_eq : STR_OP_EQ | NUM_OP_EQ
;

rel_op_ne : STR_OP_NE | NUM_OP_NE
;

rel_op_gt : STR_OP_GT | NUM_OP_GT
;

rel_op_gte : STR_OP_GTE | NUM_OP_GTE
;

rel_op_lt : STR_OP_LT | NUM_OP_LT
;

rel_op_lte : STR_OP_LTE | NUM_OP_LTE
;

// The STR_OP_* operator tokens only; referenced by eid so that these tokens
// can appear where an identifier is expected.
rel_str_op : STR_OP_EQ
           | STR_OP_NE
           | STR_OP_GT
           | STR_OP_LT
           | STR_OP_GTE
           | STR_OP_LTE
           | STR_OP_MATCHES
;




© 2015 - 2024 Weber Informatics LLC | Privacy Policy