All downloads are free. Search and download functionality uses the official Maven repository.

persistence.antlr.CodeGenerator Maven / Gradle / Ivy

There is a newer version: 2.1-60f
Show newest version
package persistence.antlr;

/* ANTLR Translator Generator
 * Project led by Terence Parr at http://www.jGuru.com
 * Software rights: http://www.antlr.org/license.html
 *
 */

import java.io.PrintWriter;
import java.io.IOException;
import java.io.FileWriter;

import persistence.antlr.collections.impl.Vector;
import persistence.antlr.collections.impl.BitSet;

/**A generic ANTLR code generator.  All code generators
 * Derive from this class.
 *
 * 

* A CodeGenerator knows about a Grammar data structure and * a grammar analyzer. The Grammar is walked to generate the * appropriate code for both a parser and lexer (if present). * This interface may change slightly so that the lexer is * itself living inside of a Grammar object (in which case, * this class generates only one recognizer). The main method * to call is gen(), which initiates all code gen. * *

* The interaction of the code generator with the analyzer is * simple: each subrule block calls deterministic() before generating * code for the block. Method deterministic() sets lookahead caches * in each Alternative object. Technically, a code generator * doesn't need the grammar analyzer if all lookahead analysis * is done at runtime, but this would result in a slower parser. * *

* This class provides a set of support utilities to handle argument * list parsing and so on. * * @author Terence Parr, John Lilley * @version 2.00a * @see persistence.antlr.JavaCodeGenerator * @see persistence.antlr.DiagnosticCodeGenerator * @see persistence.antlr.LLkAnalyzer * @see persistence.antlr.Grammar * @see persistence.antlr.AlternativeElement * @see persistence.antlr.Lookahead */ public abstract class CodeGenerator { protected persistence.antlr.Tool antlrTool; /** Current tab indentation for code output */ protected int tabs = 0; /** Current output Stream */ transient protected PrintWriter currentOutput; // SAS: for proper text i/o /** The grammar for which we generate code */ protected Grammar grammar = null; /** List of all bitsets that must be dumped. These are Vectors of BitSet. */ protected Vector bitsetsUsed; /** The grammar behavior */ protected DefineGrammarSymbols behavior; /** The LLk analyzer */ protected LLkGrammarAnalyzer analyzer; /** Object used to format characters in the target language. * subclass must initialize this to the language-specific formatter */ protected CharFormatter charFormatter; /** Use option "codeGenDebug" to generate debugging output */ protected boolean DEBUG_CODE_GENERATOR = false; /** Default values for code-generation thresholds */ protected static final int DEFAULT_MAKE_SWITCH_THRESHOLD = 2; protected static final int DEFAULT_BITSET_TEST_THRESHOLD = 4; /** If there are more than 8 long words to init in a bitset, * try to optimize it; e.g., detect runs of -1L and 0L. */ protected static final int BITSET_OPTIMIZE_INIT_THRESHOLD = 8; /** This is a hint for the language-specific code generator. * A switch() or language-specific equivalent will be generated instead * of a series of if/else statements for blocks with number of alternates * greater than or equal to this number of non-predicated LL(1) alternates. 
* This is modified by the grammar option "codeGenMakeSwitchThreshold" */ protected int makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD; /** This is a hint for the language-specific code generator. * A bitset membership test will be generated instead of an * ORed series of LA(k) comparisions for lookahead sets with * degree greater than or equal to this value. * This is modified by the grammar option "codeGenBitsetTestThreshold" */ protected int bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD; private static boolean OLD_ACTION_TRANSLATOR = true; public static String TokenTypesFileSuffix = "TokenTypes"; public static String TokenTypesFileExt = ".txt"; /** Construct code generator base class */ public CodeGenerator() { } /** Output a String to the currentOutput stream. * Ignored if string is null. * @param s The string to output */ protected void _print(String s) { if (s != null) { currentOutput.print(s); } } /** Print an action without leading tabs, attempting to * preserve the current indentation level for multi-line actions * Ignored if string is null. * @param s The action string to output */ protected void _printAction(String s) { if (s == null) { return; } // Skip leading newlines, tabs and spaces int start = 0; while (start < s.length() && Character.isSpaceChar(s.charAt(start))) { start++; } // Skip leading newlines, tabs and spaces int end = s.length() - 1; while (end > start && Character.isSpaceChar(s.charAt(end))) { end--; } char c = 0; for (int i = start; i <= end;) { c = s.charAt(i); i++; boolean newline = false; switch (c) { case '\n': newline = true; break; case '\r': if (i <= end && s.charAt(i) == '\n') { i++; } newline = true; break; default: currentOutput.print(c); break; } if (newline) { currentOutput.println(); printTabs(); // Absorb leading whitespace while (i <= end && Character.isSpaceChar(s.charAt(i))) { i++; } newline = false; } } currentOutput.println(); } /** Output a String followed by newline, to the currentOutput stream. 
* Ignored if string is null. * @param s The string to output */ protected void _println(String s) { if (s != null) { currentOutput.println(s); } } /** Test if a set element array represents a contiguous range. * @param elems The array of elements representing the set, usually from BitSet.toArray(). * @return true if the elements are a contiguous range (with two or more). */ public static boolean elementsAreRange(int[] elems) { if (elems.length == 0) { return false; } int begin = elems[0]; int end = elems[elems.length - 1]; if (elems.length <= 2) { // Not enough elements for a range expression return false; } if (end - begin + 1 > elems.length) { // The set does not represent a contiguous range return false; } int v = begin + 1; for (int i = 1; i < elems.length - 1; i++) { if (v != elems[i]) { // The set does not represent a contiguous range return false; } v++; } return true; } /** Get the identifier portion of an argument-action token. * The ID of an action is assumed to be a trailing identifier. * Specific code-generators may want to override this * if the language has unusual declaration syntax. * @param t The action token * @return A string containing the text of the identifier */ protected String extractIdOfAction(Token t) { return extractIdOfAction(t.getText(), t.getLine(), t.getColumn()); } /** Get the identifier portion of an argument-action. * The ID of an action is assumed to be a trailing identifier. * Specific code-generators may want to override this * if the language has unusual declaration syntax. * @param s The action text * @param line Line used for error reporting. * @param column Line used for error reporting. * @return A string containing the text of the identifier */ protected String extractIdOfAction(String s, int line, int column) { s = removeAssignmentFromDeclaration(s); // Search back from the end for a non alphanumeric. 
That marks the // beginning of the identifier for (int i = s.length() - 2; i >= 0; i--) { // TODO: make this work for language-independent identifiers? if (!Character.isLetterOrDigit(s.charAt(i)) && s.charAt(i) != '_') { // Found end of type part return s.substring(i + 1); } } // Something is bogus, but we cannot parse the language-specific // actions any better. The compiler will have to catch the problem. antlrTool.warning("Ill-formed action", grammar.getFilename(), line, column); return ""; } /** Get the type string out of an argument-action token. * The type of an action is assumed to precede a trailing identifier * Specific code-generators may want to override this * if the language has unusual declaration syntax. * @param t The action token * @return A string containing the text of the type */ protected String extractTypeOfAction(Token t) { return extractTypeOfAction(t.getText(), t.getLine(), t.getColumn()); } /** Get the type portion of an argument-action. * The type of an action is assumed to precede a trailing identifier * Specific code-generators may want to override this * if the language has unusual declaration syntax. * @param s The action text * @param line Line used for error reporting. * @return A string containing the text of the type */ protected String extractTypeOfAction(String s, int line, int column) { s = removeAssignmentFromDeclaration(s); // Search back from the end for a non alphanumeric. That marks the // beginning of the identifier for (int i = s.length() - 2; i >= 0; i--) { // TODO: make this work for language-independent identifiers? if (!Character.isLetterOrDigit(s.charAt(i)) && s.charAt(i) != '_') { // Found end of type part return s.substring(0, i + 1); } } // Something is bogus, but we cannot parse the language-specific // actions any better. The compiler will have to catch the problem. 
antlrTool.warning("Ill-formed action", grammar.getFilename(), line, column); return ""; } /** Generate the code for all grammars */ public abstract void gen(); /** Generate code for the given grammar element. * @param action The {...} action to generate */ public abstract void gen(ActionElement action); /** Generate code for the given grammar element. * @param blk The "x|y|z|..." block to generate */ public abstract void gen(AlternativeBlock blk); /** Generate code for the given grammar element. * @param end The block-end element to generate. Block-end * elements are synthesized by the grammar parser to represent * the end of a block. */ public abstract void gen(BlockEndElement end); /** Generate code for the given grammar element. * @param atom The character literal reference to generate */ public abstract void gen(CharLiteralElement atom); /** Generate code for the given grammar element. * @param r The character-range reference to generate */ public abstract void gen(CharRangeElement r); /** Generate the code for a parser */ public abstract void gen(LexerGrammar g) throws IOException; /** Generate code for the given grammar element. * @param blk The (...)+ block to generate */ public abstract void gen(OneOrMoreBlock blk); /** Generate the code for a parser */ public abstract void gen(ParserGrammar g) throws IOException; /** Generate code for the given grammar element. * @param rr The rule-reference to generate */ public abstract void gen(RuleRefElement rr); /** Generate code for the given grammar element. * @param atom The string-literal reference to generate */ public abstract void gen(StringLiteralElement atom); /** Generate code for the given grammar element. * @param r The token-range reference to generate */ public abstract void gen(TokenRangeElement r); /** Generate code for the given grammar element. * @param atom The token-reference to generate */ public abstract void gen(TokenRefElement atom); /** Generate code for the given grammar element. 
* @param blk The tree to generate code for. */ public abstract void gen(TreeElement t); /** Generate the code for a parser */ public abstract void gen(TreeWalkerGrammar g) throws IOException; /** Generate code for the given grammar element. * @param wc The wildcard element to generate */ public abstract void gen(WildcardElement wc); /** Generate code for the given grammar element. * @param blk The (...)* block to generate */ public abstract void gen(ZeroOrMoreBlock blk); /** Generate the token types as a text file for persistence across shared lexer/parser */ protected void genTokenInterchange(TokenManager tm) throws IOException { // Open the token output Java file and set the currentOutput stream String fName = tm.getName() + TokenTypesFileSuffix + TokenTypesFileExt; currentOutput = antlrTool.openOutputFile(fName); println("// $ANTLR " + antlrTool.version + ": " + antlrTool.fileMinusPath(antlrTool.grammarFile) + " -> " + fName + "$"); tabs = 0; // Header println(tm.getName() + " // output token vocab name"); // Generate a definition for each token type Vector v = tm.getVocabulary(); for (int i = Token.MIN_USER_TYPE; i < v.size(); i++) { String s = (String)v.elementAt(i); if (DEBUG_CODE_GENERATOR) { System.out.println("gen persistence file entry for: " + s); } if (s != null && !s.startsWith("<")) { // if literal, find label if (s.startsWith("\"")) { StringLiteralSymbol sl = (StringLiteralSymbol)tm.getTokenSymbol(s); if (sl != null && sl.label != null) { print(sl.label + "="); } println(s + "=" + i); } else { print(s); // check for a paraphrase TokenSymbol ts = (TokenSymbol)tm.getTokenSymbol(s); if (ts == null) { antlrTool.warning("undefined token symbol: " + s); } else { if (ts.getParaphrase() != null) { print("(" + ts.getParaphrase() + ")"); } } println("=" + i); } } } // Close the tokens output file currentOutput.close(); currentOutput = null; } /** Process a string for an simple expression for use in xx/action.g * it is used to cast simple tokens/references to 
the right type for * the generated language. * @param str A String. */ public String processStringForASTConstructor(String str) { return str; } /** Get a string for an expression to generate creation of an AST subtree. * @param v A Vector of String, where each element is an expression in the target language yielding an AST node. */ public abstract String getASTCreateString(Vector v); /** Get a string for an expression to generate creating of an AST node * @param str The text of the arguments to the AST construction */ public abstract String getASTCreateString(GrammarAtom atom, String str); /** Given the index of a bitset in the bitset list, generate a unique name. * Specific code-generators may want to override this * if the language does not allow '_' or numerals in identifiers. * @param index The index of the bitset in the bitset list. */ protected String getBitsetName(int index) { return "_tokenSet_" + index; } public static String encodeLexerRuleName(String id) { return "m" + id; } public static String decodeLexerRuleName(String id) { if ( id==null ) { return null; } return id.substring(1,id.length()); } /** Map an identifier to it's corresponding tree-node variable. * This is context-sensitive, depending on the rule and alternative * being generated * @param id The identifier name to map * @param forInput true if the input tree node variable is to be returned, otherwise the output variable is returned. * @return The mapped id (which may be the same as the input), or null if the mapping is invalid due to duplicates */ public abstract String mapTreeId(String id, ActionTransInfo tInfo); /** Add a bitset to the list of bitsets to be generated. * if the bitset is already in the list, ignore the request. * Always adds the bitset to the end of the list, so the * caller can rely on the position of bitsets in the list. * The returned position can be used to format the bitset * name, since it is invariant. 
* @param p Bit set to mark for code generation * @param forParser true if the bitset is used for the parser, false for the lexer * @return The position of the bitset in the list. */ protected int markBitsetForGen(BitSet p) { // Is the bitset (or an identical one) already marked for gen? for (int i = 0; i < bitsetsUsed.size(); i++) { BitSet set = (BitSet)bitsetsUsed.elementAt(i); if (p.equals(set)) { // Use the identical one already stored return i; } } // Add the new bitset bitsetsUsed.appendElement(p.clone()); return bitsetsUsed.size() - 1; } /** Output tab indent followed by a String, to the currentOutput stream. * Ignored if string is null. * @param s The string to output. */ protected void print(String s) { if (s != null) { printTabs(); currentOutput.print(s); } } /** Print an action with leading tabs, attempting to * preserve the current indentation level for multi-line actions * Ignored if string is null. * @param s The action string to output */ protected void printAction(String s) { if (s != null) { printTabs(); _printAction(s); } } /** Output tab indent followed by a String followed by newline, * to the currentOutput stream. Ignored if string is null. * @param s The string to output */ protected void println(String s) { if (s != null) { printTabs(); currentOutput.println(s); } } /** Output the current tab indentation. This outputs the number of tabs * indicated by the "tabs" variable to the currentOutput stream. */ protected void printTabs() { for (int i = 1; i <= tabs; i++) { currentOutput.print("\t"); } } /** Lexically process $ and # references within the action. * This will replace #id and #(...) with the appropriate * function calls and/or variables etc... 
*/ protected abstract String processActionForSpecialSymbols(String actionStr, int line, RuleBlock currentRule, ActionTransInfo tInfo); public String getFOLLOWBitSet(String ruleName, int k) { GrammarSymbol rs = grammar.getSymbol(ruleName); if ( !(rs instanceof RuleSymbol) ) { return null; } RuleBlock blk = ((RuleSymbol)rs).getBlock(); Lookahead follow = grammar.theLLkAnalyzer.FOLLOW(k, blk.endNode); String followSetName = getBitsetName(markBitsetForGen(follow.fset)); return followSetName; } public String getFIRSTBitSet(String ruleName, int k) { GrammarSymbol rs = grammar.getSymbol(ruleName); if ( !(rs instanceof RuleSymbol) ) { return null; } RuleBlock blk = ((RuleSymbol)rs).getBlock(); Lookahead first = grammar.theLLkAnalyzer.look(k, blk); String firstSetName = getBitsetName(markBitsetForGen(first.fset)); return firstSetName; } /** * Remove the assignment portion of a declaration, if any. * @param d the declaration * @return the declaration without any assignment portion */ protected String removeAssignmentFromDeclaration(String d) { // If d contains an equal sign, then it's a declaration // with an initialization. Strip off the initialization part. 
if (d.indexOf('=') >= 0) d = d.substring(0, d.indexOf('=')).trim(); return d; } /** Set all fields back like one just created */ private void reset() { tabs = 0; // Allocate list of bitsets tagged for code generation bitsetsUsed = new Vector(); currentOutput = null; grammar = null; DEBUG_CODE_GENERATOR = false; makeSwitchThreshold = DEFAULT_MAKE_SWITCH_THRESHOLD; bitsetTestThreshold = DEFAULT_BITSET_TEST_THRESHOLD; } public static String reverseLexerRuleName(String id) { return id.substring(1, id.length()); } public void setAnalyzer(LLkGrammarAnalyzer analyzer_) { analyzer = analyzer_; } public void setBehavior(DefineGrammarSymbols behavior_) { behavior = behavior_; } /** Set a grammar for the code generator to use */ protected void setGrammar(Grammar g) { reset(); grammar = g; // Lookup make-switch threshold in the grammar generic options if (grammar.hasOption("codeGenMakeSwitchThreshold")) { try { makeSwitchThreshold = grammar.getIntegerOption("codeGenMakeSwitchThreshold"); //System.out.println("setting codeGenMakeSwitchThreshold to " + makeSwitchThreshold); } catch (NumberFormatException e) { Token tok = grammar.getOption("codeGenMakeSwitchThreshold"); antlrTool.error( "option 'codeGenMakeSwitchThreshold' must be an integer", grammar.getClassName(), tok.getLine(), tok.getColumn() ); } } // Lookup bitset-test threshold in the grammar generic options if (grammar.hasOption("codeGenBitsetTestThreshold")) { try { bitsetTestThreshold = grammar.getIntegerOption("codeGenBitsetTestThreshold"); //System.out.println("setting codeGenBitsetTestThreshold to " + bitsetTestThreshold); } catch (NumberFormatException e) { Token tok = grammar.getOption("codeGenBitsetTestThreshold"); antlrTool.error( "option 'codeGenBitsetTestThreshold' must be an integer", grammar.getClassName(), tok.getLine(), tok.getColumn() ); } } // Lookup debug code-gen in the grammar generic options if (grammar.hasOption("codeGenDebug")) { Token t = grammar.getOption("codeGenDebug"); if 
(t.getText().equals("true")) { //System.out.println("setting code-generation debug ON"); DEBUG_CODE_GENERATOR = true; } else if (t.getText().equals("false")) { //System.out.println("setting code-generation debug OFF"); DEBUG_CODE_GENERATOR = false; } else { antlrTool.error("option 'codeGenDebug' must be true or false", grammar.getClassName(), t.getLine(), t.getColumn()); } } } public void setTool(Tool tool) { antlrTool = tool; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy