// beaver.Parser -- Maven / Gradle / Ivy
// Show all versions of soot -- Show documentation
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* This file is part of Beaver Parser Generator. *
* Copyright (C) 2003,2004 Alexander Demenchuk . *
* All rights reserved. *
* See the file "LICENSE" for the terms and conditions for copying, *
* distribution and modification of Beaver. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
package beaver;
import java.io.IOException;
/**
* Almost complete implementation of a LALR parser. Two components that it lacks to parse a concrete
* grammar -- rule actions and parsing tables -- are provided by a generated subclass.
*/
public abstract class Parser
{
/**
 * Signals that the parser ran into a syntax error it could not recover from.
 */
public static class Exception extends java.lang.Exception
{
    /**
     * Creates an exception carrying a description of the failure.
     * <p>
     * The constructor is public so that parsers living in other packages, which may
     * override {@code recoverFromError}, are able to raise this exception themselves.
     *
     * @param msg description of the failure
     */
    public Exception(String msg)
    {
        super(msg);
    }
}
/**
 * Default sink for the events a parser reports while it works: scanner errors, syntax
 * errors and every kind of recovery. Each callback writes one line to {@code System.err};
 * subclasses may override any of them to route the reports elsewhere.
 */
static public class Events
{
    /**
     * Reports a token the scanner failed to recognize.
     *
     * @param e exception thrown by the scanner; its position is printed only when
     *          {@code e.line} is positive
     */
    public void scannerError(Scanner.Exception e)
    {
        System.err.print("Scanner Error:");
        boolean has_position = e.line > 0;
        if (has_position)
        {
            System.err.print(e.line);
            System.err.print(',');
            System.err.print(e.column);
            System.err.print(':');
        }
        System.err.print(' ');
        System.err.println(e.getMessage());
    }

    /**
     * Reports a token that the parser cannot act upon.
     *
     * @param token the unexpected lookahead token
     */
    public void syntaxError(Symbol token)
    {
        printRange(token);
        System.err.print(": Syntax Error: unexpected token ");
        printlnToken(token);
    }

    /**
     * Reports a recovery made by dropping the unexpected token.
     *
     * @param token the token that was dropped
     */
    public void unexpectedTokenRemoved(Symbol token)
    {
        printRange(token);
        System.err.print(": Recovered: removed unexpected token ");
        printlnToken(token);
    }

    /**
     * Reports a recovery made by inserting a token in front of the unexpected one.
     *
     * @param token the token that was inserted
     */
    public void missingTokenInserted(Symbol token)
    {
        printRange(token);
        System.err.print(": Recovered: inserted missing token ");
        printlnToken(token);
    }

    /**
     * Reports a recovery made by substituting the unexpected token.
     *
     * @param token the replacement token
     */
    public void misspelledTokenReplaced(Symbol token)
    {
        printRange(token);
        System.err.print(": Recovered: replaced unexpected token with ");
        printlnToken(token);
    }

    /**
     * Reports a recovery made by discarding a whole error phrase.
     *
     * @param error the symbol that spans the discarded phrase
     */
    public void errorPhraseRemoved(Symbol error)
    {
        printRange(error);
        System.err.println(": Recovered: removed error phrase");
    }

    /** Prints the ":line,column-line,column" prefix describing where a symbol is located. */
    private static void printRange(Symbol sym)
    {
        System.err.print(':');
        System.err.print(Symbol.getLine(sym.start));
        System.err.print(',');
        System.err.print(Symbol.getColumn(sym.start));
        System.err.print('-');
        System.err.print(Symbol.getLine(sym.end));
        System.err.print(',');
        System.err.print(Symbol.getColumn(sym.end));
    }

    /** Ends the report line with the token's quoted value or, when it has none, with "#" and its ID. */
    private static void printlnToken(Symbol token)
    {
        if (token.value == null)
        {
            System.err.print('#');
            System.err.println(token.id);
            return;
        }
        System.err.print('"');
        System.err.print(token.value);
        System.err.println('"');
    }
}
/**
 * A thin layer over a Scanner that can also accumulate tokens for a parsing simulation.
 * While no accumulator is allocated every request goes straight to the scanner. Once
 * {@link #alloc(int)} has been called, tokens are remembered as they are read so that
 * they can be replayed, after {@link #rewind()}, to another simulation run or to the
 * recovered parser.
 */
public class TokenStream
{
    private Scanner scanner;
    private Symbol[] buffer;
    private int n_marked;
    private int n_read;
    private int n_written;

    /**
     * Creates a stream that reads directly from a scanner.
     *
     * @param scanner source of tokens
     */
    public TokenStream(Scanner scanner)
    {
        this.scanner = scanner;
    }

    /**
     * Creates a stream that delivers an extra symbol before anything the scanner produces.
     *
     * @param scanner source of tokens
     * @param first_symbol symbol to deliver first
     */
    public TokenStream(Scanner scanner, Symbol first_symbol)
    {
        this(scanner);
        alloc(1);
        buffer[0] = first_symbol;
        n_written = 1; // alloc() has just reset the counter to 0
    }

    /**
     * Delivers the next token: an accumulated one if any is still unread, otherwise a fresh
     * one from the scanner. Fresh tokens are accumulated until the requested number has been
     * stored; after that the accumulator is dropped.
     *
     * @return next token
     * @throws IOException as thrown by the scanner
     */
    public Symbol nextToken() throws IOException
    {
        if (buffer == null)
            return readToken();

        if (n_read < n_written)
            return buffer[n_read++];

        if (n_written >= n_marked)
        {
            // everything accumulated has been replayed and no more needs to be stored
            buffer = null;
            return readToken();
        }

        // counters advance before the scanner is consulted
        n_read++;
        final Symbol[] accumulator = buffer;
        final int slot = n_written++;
        final Symbol token = readToken();
        accumulator[slot] = token;
        return token;
    }

    /**
     * Prepare a stream to accumulate tokens.
     *
     * @param size number of shifted tokens to accumulate
     */
    public void alloc(int size)
    {
        n_marked = size;
        buffer = new Symbol[size + 1]; // one spare slot on top of the requested size
        n_written = 0;
        n_read = 0;
    }

    /**
     * Makes the accumulated tokens available again, from the first one, to the next
     * simulation run or to the recovered parser.
     */
    public void rewind()
    {
        n_read = 0;
    }

    /**
     * Insert two tokens at the beginning of a stream.
     *
     * @param t0 first token to be inserted
     * @param t1 second token to be inserted
     * @throws IllegalStateException if fewer than two free slots are left in the accumulator
     */
    public void insert(Symbol t0, Symbol t1)
    {
        int free_slots = buffer.length - n_written;
        if (free_slots < 2)
            throw new IllegalStateException ("not enough space in the buffer");
        // move what is already accumulated two slots to the right
        System.arraycopy(buffer, 0, buffer, 2, n_written);
        buffer[0] = t0;
        buffer[1] = t1;
        n_written += 2;
    }

    /**
     * Removes a token from the accumulator, closing the gap it leaves.
     *
     * @param i index of a token in the accumulator.
     * @return removed token
     */
    public Symbol remove(int i)
    {
        final Symbol removed = buffer[i];
        final int last = n_written - 1;
        if (i < last)
        {
            System.arraycopy(buffer, i + 1, buffer, i, last - i);
        }
        n_written = last;
        return removed;
    }

    /**
     * Checks whether a simulation filled the token accumulator.
     *
     * @return true if accumulator is full
     */
    boolean isFull()
    {
        return n_read == n_marked;
    }

    /**
     * Reads next recognized token from the scanner. If scanner fails to recognize a token and
     * throws an exception, the failure is reported via the parser's {@code report.scannerError()}
     * and the read is retried.
     *
     * It is expected that scanner is capable of returning at least an EOF token after the
     * exception.
     *
     * @return next recognized token
     * @throws IOException
     *             as thrown by a scanner
     */
    private Symbol readToken() throws IOException
    {
        for (;;)
        {
            try
            {
                return scanner.nextToken();
            }
            catch (Scanner.Exception e)
            {
                report.scannerError(e);
            }
        }
    }
}
/**
 * A parser stripped of its action code that is used to parse ahead of a syntax error.
 * It works on a private copy of the states stack, so the real parser is left untouched.
 * A run is successful when the token stream reports that its accumulator has been filled,
 * or when the input is accepted. A run fails as soon as the tables yield an error action.
 *
 * Note: Without a special "error" rule present in a grammar, which a parser will try to shift
 * at the beginning of an error recovery, simulation continues without removing anything from
 * the original states stack. This often will lead to cases when no parsing ahead will recover
 * the parser from a syntax error.
 */
public class Simulator
{
    private short[] states;
    private int top, min_top;

    /**
     * Runs one simulation over the tokens delivered by a stream.
     *
     * @param in stream to take tokens from
     * @return true if the simulated parse met no error
     * @throws IOException as thrown by the token stream
     */
    public boolean parse(TokenStream in) throws IOException
    {
        initStack();
        do
        {
            Symbol token = in.nextToken();
            for (;;)
            {
                short action = tables.findParserAction(states[top], token.id);
                if (action > 0)
                {
                    shift(action);
                    break; // token consumed - fetch the next one
                }
                if (action == accept_action_id)
                    return true;
                if (action >= 0) // i.e. action == 0, which marks an error
                    return false;

                // a reduce action; the state to consult is the one exposed by the reduction
                short nt_id = reduce(~action);
                short goto_state = tables.findNextState(states[top], nt_id);
                if (goto_state <= 0)
                    return goto_state == accept_action_id;
                shift(goto_state);
            }
        }
        while (!in.isFull());
        return true;
    }

    /**
     * Loads the simulator's stack with a copy of the parser's states stack, (re)allocating
     * it when it is missing or shorter than the parser's.
     */
    private void initStack()
    {
        short[] parser_states = Parser.this.states;
        if (states == null || states.length < parser_states.length)
        {
            states = new short[parser_states.length];
            min_top = 0;
        }
        top = Parser.this.top;
        // min_top never rises above 0 in this class, so the copy starts at the stack's bottom
        System.arraycopy(parser_states, min_top, states, min_top, top + 1);
    }

    /** Doubles the capacity of the simulator's stack, keeping its content. */
    private void increaseStackCapacity()
    {
        short[] grown = new short[states.length * 2];
        System.arraycopy(states, 0, grown, 0, states.length);
        states = grown;
    }

    /** Pushes a state, growing the stack first when it is full. */
    private void shift(short state)
    {
        top++;
        if (top == states.length)
            increaseStackCapacity();
        states[top] = state;
    }

    /**
     * Pops the states of a rule's right-hand side.
     *
     * @param rule_id number of the production by which to reduce
     * @return ID of the nonterminal on the rule's left-hand side
     */
    private short reduce(int rule_id)
    {
        final int rule_info = tables.rule_infos[rule_id];
        // low 16 bits hold the size of the right-hand side, high 16 bits the nonterminal ID
        top -= rule_info & 0xFFFF;
        min_top = Math.min(min_top, top);
        return (short) (rule_info >>> 16);
    }
}
/** The automaton tables; they also supply the rule descriptors used when reducing. */
protected final ParsingTables tables;
/** Cached ID of the ACCEPT action; computed once, in the constructor, from the number of rules. */
protected final short accept_action_id;
/** The parser's stack of automaton states. */
protected short[] states;
/** Index of the top element of the stacks, i.e. it would be -1 for an empty stack. */
protected int top;
/** The stack of shifted symbols; entries are written at the same index as {@link #states}. */
protected Symbol[] _symbols;
/** Parsing events notification "gateway"; a plain {@link Events} is installed when a parse starts with none set. */
protected Events report;
protected Parser(ParsingTables tables)
{
this.tables = tables;
this.accept_action_id = (short) ~tables.rule_infos.length;
this.states = new short[256];
}
/**
 * Parses the tokens delivered by a scanner.
 *
 * @param source of tokens - a Scanner
 * @return semantic value of the accepted nonterminal
 * @throws IOException propagated from the scanner
 * @throws Parser.Exception if the parser cannot recover from a syntax error
 */
public Object parse(Scanner source) throws IOException, Parser.Exception
{
    init();
    TokenStream in = new TokenStream(source);
    return parse(in);
}
/**
 * Parses the tokens delivered by a scanner after first feeding the parser a marker
 * symbol, which indicates that an alternative goal should be matched.
 *
 * @param source of tokens - a Scanner
 * @param alt_goal_marker_id ID of a token like symbol that will be used as a marker
 * @return semantic value of the accepted nonterminal
 * @throws IOException propagated from the scanner
 * @throws Parser.Exception if the parser cannot recover from a syntax error
 */
public Object parse(Scanner source, short alt_goal_marker_id) throws IOException, Parser.Exception
{
    init();
    Symbol goal_marker = new Symbol(alt_goal_marker_id);
    return parse(new TokenStream(source, goal_marker));
}
/**
 * The parsing loop proper: fetches tokens one by one and lets the tables decide whether
 * to shift, reduce, accept or start error recovery.
 *
 * @param in stream of tokens to parse
 * @return semantic value of the accepted nonterminal
 * @throws IOException propagated from the token stream
 * @throws Parser.Exception if error recovery fails
 */
private Object parse(TokenStream in) throws IOException, Parser.Exception
{
    for (;;)
    {
        final Symbol token = in.nextToken();
        boolean need_next_token = false;
        while (!need_next_token)
        {
            final short action = tables.findParserAction(states[top], token.id);
            if (action > 0)
            {
                shift(token, action);
                need_next_token = true;
            }
            else if (action == accept_action_id)
            {
                Symbol goal = _symbols[top];
                _symbols = null; // drop this stack to prevent loitering
                return goal.value;
            }
            else if (action < 0)
            {
                Symbol nt = reduce(~action);
                final short goto_state = tables.findNextState(states[top], nt.id);
                if (goto_state <= 0)
                {
                    if (goto_state != accept_action_id)
                        throw new IllegalStateException("Cannot shift a nonterminal");
                    _symbols = null; // no loitering
                    return nt.value;
                }
                shift(nt, goto_state);
                // the same token is looked at again in the new state
            }
            else // action == 0, i.e. this is an error
            {
                report.syntaxError(token);
                recoverFromError(token, in);
                // error recovery altered token stream - parser needs to refetch the next token
                need_next_token = true;
            }
        }
    }
}
/**
 * Invokes the actual reduce action routine of a rule.
 * The method must be implemented by a generated parser.
 * <p>
 * It is called after the parser has already lowered its stack top by the size of the
 * rule's right-hand side, and that new top is what is passed as {@code offset}.
 *
 * @param rule_num ID of a reduce action routine to invoke
 * @param offset index of the stack entry just before the first action routine argument
 * @return reduced nonterminal; the caller fills in its ID and its start and end positions
 */
protected abstract Symbol invokeReduceAction(int rule_num, int offset);
/**
 * Gets the parser ready for a new run: installs the default events reporter if none has
 * been set, allocates a fresh symbols stack sized like the states stack, and puts the
 * initial entry at the bottom of both stacks.
 */
private void init()
{
    if (report == null)
    {
        report = new Events();
    }
    _symbols = new Symbol[states.length];
    // the stacks are not empty: the bottom entry is occupied from the start
    top = 0;
    // need a symbol here for a default reduce on the very first erroneous token
    _symbols[top] = new Symbol("none");
    // initial/first state
    states[top] = 1;
}
/**
 * Doubles the capacity of the states stack and resizes the symbols stack to the same
 * capacity, keeping the content of both.
 */
private void increaseStackCapacity()
{
    final int capacity = states.length * 2;

    short[] grown_states = new short[capacity];
    System.arraycopy(states, 0, grown_states, 0, states.length);
    states = grown_states;

    Symbol[] grown_symbols = new Symbol[capacity];
    System.arraycopy(_symbols, 0, grown_symbols, 0, _symbols.length);
    _symbols = grown_symbols;
}
/**
 * Pushes a symbol together with the state the parser goes to, growing the stacks
 * first when they are full.
 *
 * @param sym
 *            symbol that will be shifted
 * @param goto_state
 *            state to switch to
 */
private void shift(Symbol sym, short goto_state)
{
    top++;
    if (top == states.length)
    {
        increaseStackCapacity();
    }
    states[top] = goto_state;
    _symbols[top] = sym;
}
/**
 * Reduces by a rule: pops the rule's right-hand side, runs the rule's action routine and
 * stamps the nonterminal it returns with its ID and with the positions of the text it covers.
 *
 * @param rule_id
 *            Number of the production by which to reduce
 * @return nonterminal created by a reduction
 */
private Symbol reduce(int rule_id)
{
    // low 16 bits hold the size of the right-hand side, high 16 bits the nonterminal ID
    final int rule_info = tables.rule_infos[rule_id];
    final int rhs_size = rule_info & 0xFFFF;
    final short lhs_id = (short) (rule_info >>> 16);

    top -= rhs_size;
    final Symbol lhs_sym = invokeReduceAction(rule_id, top);
    lhs_sym.id = lhs_id;
    if (rhs_size > 0)
    {
        // spans from the first to the last symbol of the right-hand side
        lhs_sym.start = _symbols[top + 1].start;
        lhs_sym.end = _symbols[top + rhs_size].end;
    }
    else
    {
        // an empty rule: place the nonterminal right where the previous symbol ends
        lhs_sym.start = lhs_sym.end = _symbols[top].end;
    }
    return lhs_sym;
}
/**
 * Implements parsing error recovery. Tries several simple approaches first: inserting one of the
 * terminals expected in the current state, replacing the "bad" token with such a terminal, and
 * deleting the "bad" token. Every attempt is validated by a parsing simulation. If simple methods
 * did not work tries to do error phrase recovery.
 *
 * It is expected that normally descendant parsers do not need to alter this method. In some cases though
 * they may want to override it if they need a different error recovery strategy.
 *
 * @param token a lookahead terminal symbol that messed parsing
 * @param in token stream
 * @throws IOException propagated from a scanner if it has issues with the source
 * @throws Parser.Exception if Parser cannot recover
 */
protected void recoverFromError(Symbol token, TokenStream in) throws IOException, Parser.Exception
{
    if (token.id == 0) // end of input
        throw new Parser.Exception("Cannot recover from the syntax error");

    Simulator sim = new Simulator();
    in.alloc(3);
    short current_state = states[top];
    if (!tables.compressed) // then we can try "insert missing" and "replace unexpected" recoveries
    {
        short first_term_id = tables.findFirstTerminal(current_state);
        if (first_term_id >= 0)
        {
            Symbol term = new Symbol(first_term_id, _symbols[top].end, token.start);
            in.insert(term, token); // insert expected terminal before unexpected one
            in.rewind();
            if (sim.parse(in))
            {
                in.rewind();
                report.missingTokenInserted(term);
                return;
            }
            int offset = tables.actn_offsets[current_state];
            if (simulateWithCandidates(sim, in, term, (short) (first_term_id + 1), offset))
            {
                report.missingTokenInserted(term);
                return;
            }

            in.remove(1); // unexpected token, i.e. alter stream as if we replaced
                          // the unexpected token to an expected terminal
            term.start = token.start;
            term.end = token.end;
            if (simulateWithCandidates(sim, in, term, first_term_id, offset))
            {
                report.misspelledTokenReplaced(term);
                return;
            }
            in.remove(0); // simple recoveries failed - remove all stream changes
        }
    }

    // finally try parsing without unexpected token (as if it was "deleted").
    // The failed attempts above leave the stream's read position wherever their last
    // simulation stopped; without rewinding, this simulation would skip tokens that are
    // already accumulated and could be declared successful after fewer shifted tokens.
    in.rewind();
    if (sim.parse(in))
    {
        in.rewind();
        report.unexpectedTokenRemoved(token);
        return;
    }

    // Simple recoveries failed or are not applicable. Next step is an error phrase recovery.
    /*
     * Find a state where parser can shift "error" symbol. Discard already reduced (and shifted)
     * productions, which are part of a phrase where unexpected terminal is found. (Note that if
     * "error" symbol was not used by a grammar, in the end the entire input becomes an error phrase,
     * and ... parser won't recover from it :)
     */
    Symbol first_sym = token, last_sym = token;
    short goto_state;
    while ((goto_state = tables.findNextState(states[top], tables.error_symbol_id)) <= 0)
    {
        // parser cannot shift "error" in this state, so use the top symbol
        // as the leftmost symbol of an error phrase
        first_sym = _symbols[top];
        // and go to the previous state
        if (--top < 0)
            throw new Parser.Exception("Cannot recover from the syntax error");
    }
    Symbol error = new Symbol(tables.error_symbol_id, first_sym.start, last_sym.end); // the end is temporary
    shift(error, goto_state);

    // keep dropping the leading token until the parser can proceed from what is left
    in.rewind();
    while (!sim.parse(in))
    {
        last_sym = in.remove(0);
        if (last_sym.id == 0) // EOF
            throw new Parser.Exception("Cannot recover from the syntax error");
        in.rewind();
    }
    error.end = last_sym.end;
    in.rewind();
    report.errorPhraseRemoved(error);
}

/**
 * Gives the candidate symbol, in turn, the ID of every terminal starting at from_term_id that the
 * lookaheads table lists for the current state, and simulates parsing after each change. On success
 * the stream is left rewound and the candidate keeps the ID that worked.
 */
private boolean simulateWithCandidates(Simulator sim, TokenStream in, Symbol term, short from_term_id, int offset) throws IOException
{
    for (short term_id = from_term_id; term_id < tables.n_term; term_id++)
    {
        int index = offset + term_id;
        if (index >= tables.lookaheads.length)
            break;
        if (tables.lookaheads[index] != term_id)
            continue; // this terminal is not expected in the current state

        term.id = term_id;
        in.rewind();
        if (sim.parse(in))
        {
            in.rewind();
            return true;
        }
    }
    return false;
}
}