beaver.Parser Maven / Gradle / Ivy

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * This file is part of Beaver Parser Generator.                       *
 * Copyright (C) 2003,2004 Alexander Demenchuk .  *
 * All rights reserved.                                                *
 * See the file "LICENSE" for the terms and conditions for copying,    *
 * distribution and modification of Beaver.                            *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

package beaver;

import java.io.IOException;

/**
 * Almost complete implementation of a LALR parser. Two components that it lacks to parse a concrete
 * grammar -- rule actions and parsing tables -- are provided by a generated subclass.
 */
public abstract class Parser
{
	static public class Exception extends java.lang.Exception
	{
		Exception(String msg)
		{
			super(msg);
		}
	}
	
	/**
	 * This class "lists" reportable events that might happen during parsing.
	 */
	static public class Events
	{
		public void scannerError(Scanner.Exception e)
		{
			System.err.print("Scanner Error:");
			if (e.line > 0)
			{
				System.err.print(e.line);
				System.err.print(',');
				System.err.print(e.column);
				System.err.print(':');
			}
			System.err.print(' ');
			System.err.println(e.getMessage());
		}
		public void syntaxError(Symbol token)
		{
			System.err.print(':');
			System.err.print(Symbol.getLine(token.start));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.start));
			System.err.print('-');
			System.err.print(Symbol.getLine(token.end));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.end));
			System.err.print(": Syntax Error: unexpected token ");
			if (token.value != null)
			{
				System.err.print('"');
				System.err.print(token.value);
				System.err.println('"');
			}
			else
			{
				System.err.print('#');
				System.err.println(token.id);
			}
		}
		public void unexpectedTokenRemoved(Symbol token)
		{
			System.err.print(':');
			System.err.print(Symbol.getLine(token.start));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.start));
			System.err.print('-');
			System.err.print(Symbol.getLine(token.end));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.end));
			System.err.print(": Recovered: removed unexpected token ");
			if (token.value != null)
			{
				System.err.print('"');
				System.err.print(token.value);
				System.err.println('"');
			}
			else
			{
				System.err.print('#');
				System.err.println(token.id);
			}
		}
		public void missingTokenInserted(Symbol token)
		{
			System.err.print(':');
			System.err.print(Symbol.getLine(token.start));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.start));
			System.err.print('-');
			System.err.print(Symbol.getLine(token.end));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.end));
			System.err.print(": Recovered: inserted missing token ");
			if (token.value != null)
			{
				System.err.print('"');
				System.err.print(token.value);
				System.err.println('"');
			}
			else
			{
				System.err.print('#');
				System.err.println(token.id);
			}
		}
		public void misspelledTokenReplaced(Symbol token)
		{
			System.err.print(':');
			System.err.print(Symbol.getLine(token.start));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.start));
			System.err.print('-');
			System.err.print(Symbol.getLine(token.end));
			System.err.print(',');
			System.err.print(Symbol.getColumn(token.end));
			System.err.print(": Recovered: replaced unexpected token with ");
			if (token.value != null)
			{
				System.err.print('"');
				System.err.print(token.value);
				System.err.println('"');
			}
			else
			{
				System.err.print('#');
				System.err.println(token.id);
			}
		}
		public void errorPhraseRemoved(Symbol error)
		{
			System.err.print(':');
			System.err.print(Symbol.getLine(error.start));
			System.err.print(',');
			System.err.print(Symbol.getColumn(error.start));
			System.err.print('-');
			System.err.print(Symbol.getLine(error.end));
			System.err.print(',');
			System.err.print(Symbol.getColumn(error.end));
			System.err.println(": Recovered: removed error phrase");
		}
	}
	
	/**
	 * This class wraps a Scanner and provides a token "accumulator" for a parsing simulation.
	 * If a source that is being parsed does not have syntax errors this wrapper only adds 
	 * one indirection while it delivers the next token. However when parser needs to recover
	 * from a syntax error this wrapper accumulates tokens shifted by a forward parsing simulation
	 * and later feeds them to the recovered parser.
	 */
	public class TokenStream
	{
		private Scanner  scanner;
		private Symbol[] buffer;
		private int      n_marked;
		private int      n_read;
		private int      n_written;
		
		public TokenStream(Scanner scanner)
		{
			this.scanner = scanner;
		}

        public TokenStream(Scanner scanner, Symbol first_symbol)
        {
            this(scanner);
            alloc(1);
            buffer[0] = first_symbol;
            n_written++;
        }
        
		public Symbol nextToken() throws IOException
		{
			if (buffer != null)
			{				
				if (n_read < n_written)
					return buffer[n_read++];
				
				if (n_written < n_marked)
				{
					n_read++;
					return buffer[n_written++] = readToken();
				}
				buffer = null;
			}
			return readToken();
		}

		/**
		 * Prepare a stream to accumulate tokens.
		 * 
		 * @param size number of shifted tokens to accumulate
		 */
		public void alloc(int size)
		{
			buffer = new Symbol[(n_marked = size) + 1];
			n_read = n_written = 0;
		}
		
		/**
		 * Prepare accumulated tokens to be reread by a next simulation run
		 * or by a recovered parser.
		 */
		public void rewind()
		{
			n_read = 0;
		}
        
		/**
		 * Insert two tokens at the beginning of a stream.
		 * 
		 * @param t0 first token to be inserted
		 * @param t1 second token to be inserted
		 */
		public void insert(Symbol t0, Symbol t1)
		{
		    if (buffer.length - n_written < 2)
		        throw new IllegalStateException ("not enough space in the buffer");
			System.arraycopy(buffer, 0, buffer, 2, n_written);
			buffer[0] = t0;
			buffer[1] = t1;
			n_written += 2;
		}
		
		/**
		 * Removes a token from the accumulator.
		 * 
		 * @param i index of a token in the accumulator.
		 * @return removed token
		 */
		public Symbol remove(int i)
		{
			Symbol token = buffer[i];
			int last = n_written - 1;
			while (i < last)
			{
				buffer[i] = buffer[++i];
			}
			n_written = last;
			return token;
		}

		/**
		 * Checks whether a simulation filled the token accumulator. 
		 * 
		 * @return true if accumulator is full
		 */
		boolean isFull()
		{
			return n_read == n_marked;
		}
		
		/**
		 * Reads next recognized token from the scanner. If scanner fails to recognize a token and
		 * throws an exception it will be reported via Parser.scannerError().
		 * 
It is expected that scanner is capable of returning at least an EOF token after the
		 * exception.
		 * 
		 * @return next recognized token
		 * @throws IOException
		 *             as thrown by a scanner
		 */
		private Symbol readToken() throws IOException
		{
			while (true)
			{
				try
				{
					return scanner.nextToken();
				}
				catch (Scanner.Exception e)
				{
					report.scannerError(e);
				}
			}
		}
	}

	/**
	 * Simulator is a stripped (of action code) version of a parser that will try to parse ahead
	 * token stream after a syntax error. The simulation is considered successful if 3 tokens were
	 * shifted successfully. If during simulation this parser encounters an error it drops the first
	 * token it tried to use and restarts the simulated parsing.
	 * 
	 * Note: Without a special "error" rule present in a grammar, which a parser will try to shift
	 * at the beginning of an error recovery, simulation continues without removing anything from
	 * the original states stack. This often will lead to cases when no parsing ahead will recover
	 * the parser from a syntax error.
	 * 
	 */
	public class Simulator
	{
		private short[] states;
		private int top, min_top;

		public boolean parse(TokenStream in) throws IOException
		{
			initStack();
			do {
				Symbol token = in.nextToken();
				while (true)
				{
					short act = tables.findParserAction(states[top], token.id);
					if (act > 0)
					{
						shift(act);
						break;
					}
					else if (act == accept_action_id)
					{
						return true;
					}
					else if (act < 0)
					{
						short nt_id = reduce(~act);

						act = tables.findNextState(states[top], nt_id);
						if (act > 0)
							shift(act);
						else
							return act == accept_action_id;
					}
					else // act == 0, i.e. this is an error
					{
						return false;
					}
				}
			}
			while (!in.isFull());
			return true;
		}

		private void initStack() throws IOException
		{
			if (states == null || states.length < Parser.this.states.length)
			{
				states = new short[Parser.this.states.length];
				min_top = 0;
			}
			System.arraycopy(Parser.this.states, min_top, states, min_top, (top = Parser.this.top) + 1);
		}

		private void increaseStackCapacity()
		{
			short[] new_states = new short[states.length * 2];
			System.arraycopy(states, 0, new_states, 0, states.length);
			states = new_states;
		}

		private void shift(short state)
		{
			if (++top == states.length)
				increaseStackCapacity();
			states[top] = state;
		}

		private short reduce(int rule_id)
		{
			int rule_info = tables.rule_infos[rule_id];
			int rhs_size = rule_info & 0xFFFF;
			top -= rhs_size;
			min_top = Math.min(min_top, top);
			return (short) (rule_info >>> 16);
		}
	}

	/** The automaton tables. */
	protected final ParsingTables tables;

	/** Cached ID of the ACCEPT action. */
	protected final short accept_action_id;

	/** The parser's stack. */
	protected short[] states;

	/** Index of the stack's top element, i.e. it's = -1 when the stack is empty; */
	protected int top;

	/** The stack of shifted symbols. */
	protected Symbol[] _symbols;

	/** Parsing events notification "gateway" */
	protected Events report;
	

	protected Parser(ParsingTables tables)
	{
		this.tables = tables;
		this.accept_action_id = (short) ~tables.rule_infos.length;
		this.states = new short[256];
	}

    /**
     * Parses a source and returns a semantic value of the accepted nonterminal
     * 
     * @param source of tokens - a Scanner
     * @return semantic value of the accepted nonterminal
     */
	public Object parse(Scanner source) throws IOException, Parser.Exception
	{
		init();
		return parse(new TokenStream(source));
	}
    
    /**
     * Parses a source and returns a semantic value of the accepted nonterminal.
     * Before parsing starts injects alternative goal marker into the source to
     * indicate that an alternative goal should be matched.
     * 
     * @param source of tokens - a Scanner
     * @param alt_goal_marker_id ID of a token like symbol that will be used as a marker
     * @return semantic value of the accepted nonterminal
     */
    public Object parse(Scanner source, short alt_goal_marker_id) throws IOException, Parser.Exception
    {
        init();
        TokenStream in = new TokenStream(source, new Symbol(alt_goal_marker_id));
        return parse(in);
    }
    
    private Object parse(TokenStream in) throws IOException, Parser.Exception
    {
        while (true)
        {
            Symbol token = in.nextToken();
            while (true)
            {
                short act = tables.findParserAction(states[top], token.id);
                if (act > 0)
                {
                    shift(token, act);
                    break;
                }
                else if (act == accept_action_id)
                {
                    Symbol goal = _symbols[top];
                    _symbols = null; // drop this stack to prevent loitering
                    return goal.value;
                }
                else if (act < 0)
                {
                    Symbol nt = reduce(~act);
                    act = tables.findNextState(states[top], nt.id);
                    if (act > 0)
                    {
                        shift(nt, act);
                    }
                    else if (act == accept_action_id)
                    {
                        _symbols = null; // no loitering
                        return nt.value;
                    }
                    else
                    {
                        throw new IllegalStateException("Cannot shift a nonterminal");
                    }
                }
                else // act == 0, i.e. this is an error
                {
                    report.syntaxError(token);
                    recoverFromError(token, in);
                    break; // because error recovery altered token stream - parser needs to refetch the next token
                }
            }
        }
    }

	/**
	 * Invoke actual reduce action routine.
	 * Method must be implemented by a generated parser
	 * 
	 * @param rule_num ID of a reduce action routine to invoke
	 * @param offset to the symbol before first action routine argument
	 * @return reduced nonterminal
	 */
	protected abstract Symbol invokeReduceAction(int rule_num, int offset);

	/**
	 * Performs stacks and, if not initialized yet, reduce actions array initialization.
	 */
	private void init()
	{
		if (report == null) report = new Events();
		
		_symbols = new Symbol[states.length];
		top = 0; // i.e. it's not empty
		_symbols[top] = new Symbol("none"); // need a symbol here for a default reduce on the very first erroneous token  
		states[top] = 1; // initial/first state
	}

	/**
	 * Increases the stack capacity if it has no room for new entries.
	 */
	private void increaseStackCapacity()
	{
		short[] new_states = new short[states.length * 2];
		System.arraycopy(states, 0, new_states, 0, states.length);
		states = new_states;

		Symbol[] new_stack = new Symbol[states.length];
		System.arraycopy(_symbols, 0, new_stack, 0, _symbols.length);
		_symbols = new_stack;
	}

	/**
	 * Shift a symbol to stack and go to a new state 
	 * 
	 * @param sym
	 *            symbol that will be shifted
	 * @param goto_state
	 *            to switch to
	 */
	private void shift(Symbol sym, short goto_state)
	{
		if (++top == states.length)
			increaseStackCapacity();
		_symbols[top] = sym;
		states[top] = goto_state;
	}

	/**
	 * Perform a reduce action.
	 * 
	 * @param rule_id
	 *            Number of the production by which to reduce
	 * @return nonterminal created by a reduction
	 */
	private Symbol reduce(int rule_id)
	{
		int rule_info = tables.rule_infos[rule_id];
		int rhs_size = rule_info & 0xFFFF;

		top -= rhs_size;
		Symbol lhs_sym = invokeReduceAction(rule_id, top);
		lhs_sym.id = (short) (rule_info >>> 16);
		if (rhs_size == 0)
		{
			lhs_sym.start = lhs_sym.end = _symbols[top].end;
		}
		else
		{
			lhs_sym.start = _symbols[top + 1].start;
			lhs_sym.end = _symbols[top + rhs_size].end;
		}
		return lhs_sym;
	}

    /**
     * Implements parsing error recovery. Tries several simple approaches first, like deleting "bad" token
     * or replacing the latter with one of the expected in his state (if possible). If simple methods did
     * not work tries to do error phrase recovery.
     * 
     * It is expected that normally descendant parsers do not need to alter this method. In same cases though
     * they may want to override it if they need a different error recovery strategy. 
     * 
     * @param token a lookahead terminal symbol that messed parsing 
     * @param in token stream
     * @throws IOException propagated from a scanner if it has issues with the source
     * @throws Parser.Exception if Parser cannot recover
     */
	protected void recoverFromError(Symbol token, TokenStream in) throws IOException, Parser.Exception
	{
		if (token.id == 0) // end of input
			throw new Parser.Exception("Cannot recover from the syntax error");
		
		Simulator sim = new Simulator();
		in.alloc(3);
		short current_state = states[top];
		if (!tables.compressed) // then we can try "insert missing" and "replace unexpected" recoveries
		{
			short first_term_id = tables.findFirstTerminal(current_state);
			if (first_term_id >= 0)
			{
				Symbol term = new Symbol(first_term_id, _symbols[top].end, token.start);
				in.insert(term, token); // insert expected terminal before unexpected one
				in.rewind();
				if (sim.parse(in))
				{
					in.rewind();
					report.missingTokenInserted(term);
					return;
				}
				
				int offset = tables.actn_offsets[current_state];
				
				for (short term_id = (short) (first_term_id + 1); term_id < tables.n_term; term_id++)
				{
					int index = offset + term_id;
					if (index >= tables.lookaheads.length)
						break;
					if (tables.lookaheads[index] == term_id)
					{
						term.id = term_id;
						in.rewind();
						if (sim.parse(in))
						{
							in.rewind();
							report.missingTokenInserted(term);
							return;
						}
					}
				}
				in.remove(1); // unexpected token, i.e. alter stream as if we replaced 
				              // the unexpected token to an expected terminal
				term.start = token.start;
				term.end = token.end;
				
				for (short term_id = first_term_id; term_id < tables.n_term; term_id++)
				{
					int index = offset + term_id;
					if (index >= tables.lookaheads.length)
						break;
					if (tables.lookaheads[index] == term_id)
					{
						term.id = term_id;
						in.rewind();
						if (sim.parse(in))
						{
							in.rewind();
							report.misspelledTokenReplaced(term);
							return;
						}
					}
				}
				in.remove(0); // simple recoveries failed - remove all stream changes 
			}
		}
		// finally try parsing without unexpected token (as if it was "deleted")
        if (sim.parse(in)) 
        {
            in.rewind();
            report.unexpectedTokenRemoved(token);
            return;
        }
		
		// Simple recoveries failed or are not applicable. Next step is an error phrase recovery.
		/*
		 * Find a state where parser can shift "error" symbol. Discard already reduced (and shifted)
		 * productions, which are part of a phrase where unexpected terminal is found. (Note that if
		 * "error" symbol was not used by a grammar, in the end the entire input becomes an error phrase,
		 * and ... parser won't recover from it :)
		 */
		Symbol first_sym = token, last_sym = token;
		short goto_state;
		while ((goto_state = tables.findNextState(states[top], tables.error_symbol_id)) <= 0)
		{
			// parser cannot shift "error" in this state, so use the top symbol
			// as the leftmost symbol of an error phrase
			first_sym = _symbols[top];
			// and go to the previous state
			if (--top < 0)
				throw new Parser.Exception("Cannot recover from the syntax error");
		}
		Symbol error = new Symbol(tables.error_symbol_id, first_sym.start, last_sym.end); // the end is temporary
		shift(error, goto_state);

		in.rewind();
		while (!sim.parse(in))
		{
			last_sym = in.remove(0);
			if (last_sym.id == 0) // EOF
				throw new Parser.Exception("Cannot recover from the syntax error");
			in.rewind();
		}
		error.end = last_sym.end;
		in.rewind();
		report.errorPhraseRemoved(error);
	}
}