All downloads are free. Search and download functionality uses the official Maven repository.

com.mitchellbosecke.pebble.lexer.LexerImpl Maven / Gradle / Ivy

There is a newer version: 2.4.0
Show newest version
 * This file is part of Pebble.

* Copyright (c) 2014 by Mitchell Bösecke *

* For the full copyright and license information, please view the LICENSE * file that was distributed with this source code. ******************************************************************************/ package com.mitchellbosecke.pebble.lexer; import com.mitchellbosecke.pebble.error.ParserException; import com.mitchellbosecke.pebble.lexer.Token.Type; import com.mitchellbosecke.pebble.operator.BinaryOperator; import com.mitchellbosecke.pebble.operator.UnaryOperator; import com.mitchellbosecke.pebble.utils.Pair; import com.mitchellbosecke.pebble.utils.StringLengthComparator; import com.mitchellbosecke.pebble.utils.StringUtils; import; import; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * This class reads the template input and builds single items out of it. *

* This class is not thread safe. */ public final class LexerImpl implements Lexer { /** * Syntax */ private final Syntax syntax; /** * Unary operators */ private final Collection unaryOperators; /** * Binary operators */ private final Collection binaryOperators; /** * As we progress through the source we maintain a string which is the text * that has yet to be tokenized. */ private TemplateSource source; /** * The list of tokens that we find and use to create a TokenStream */ private ArrayList tokens; /** * Make sure every opening bracket has a closing bracket. */ private LinkedList> brackets; /** * The state of the lexer is important so that we know what to expect next * and to help discover errors in the template (ex. unclosed comments). */ private State state; private LinkedList states; private enum State { DATA, EXECUTE, PRINT, COMMENT } /** * If we encountered an END delimiter that was preceded with a whitespace * trim character (ex. {{ foo -}}) then this boolean is toggled to "true" * which tells the lexData() method to trim leading whitespace from the next * text token. */ private boolean trimLeadingWhitespaceFromNextData = false; /** * Static regular expressions for names, numbers, and punctuation. */ private static final Pattern REGEX_NAME = Pattern.compile("^[a-zA-Z_][a-zA-Z0-9_]*"); private static final Pattern REGEX_NUMBER = Pattern.compile("^[0-9]+(\\.[0-9]+)?"); // the negative lookbehind assertion is used to ignore escaped quotation // marks private static final Pattern REGEX_STRING = Pattern .compile("((\").*?(? unaryOperators, Collection binaryOperators) { this.syntax = syntax; this.unaryOperators = unaryOperators; this.binaryOperators = binaryOperators; } /** * This is the main method used to tokenize the raw contents of a template. 
* * @param reader The reader provided from the Loader * @param name The name of the template (used for meaningful error messages) * @throws ParserException Thrown from the Reader object */ @Override public TokenStream tokenize(Reader reader, String name) throws ParserException { // operator regex buildOperatorRegex(); // standardize the character used for line breaks try { this.source = new TemplateSource(reader, name); } catch (IOException e) { throw new ParserException(e, "Can not convert template Reader into a String", 0, name); } /* * Start in a DATA state. This state basically means that we are NOT in * between a pair of meaningful delimiters. */ this.state = State.DATA; this.tokens = new ArrayList<>(); this.states = new LinkedList<>(); this.brackets = new LinkedList<>(); /* * loop through the entire source and apply different lexing methods * depending on what kind of state we are in at the time. * * This will always start on lexData(); */ while (this.source.length() > 0) { switch (this.state) { case DATA: lexData(); break; case EXECUTE: lexExecute(); break; case PRINT: lexPrint(); break; case COMMENT: lexComment(); break; default: break; } } // end of file token pushToken(Token.Type.EOF); // make sure that all brackets have been closed, else throw an error if (!this.brackets.isEmpty()) { String expected = brackets.pop().getLeft(); throw new ParserException(null, String.format("Unclosed \"%s\"", expected), source.getLineNumber(), source.getFilename()); } return new TokenStream(tokens, source.getFilename()); } /** * The DATA state assumes that we are current NOT in between any pair of * meaningful delimiters. We are currently looking for the next "open" or * "start" delimiter, ex. the opening comment delimiter, or the opening * variable delimiter. 
* * @throws ParserException */ private void lexData() throws ParserException { // find the next start delimiter Matcher matcher = this.syntax.getRegexStartDelimiters().matcher(source); boolean match = matcher.find(); String text; String startDelimiterToken = null; // if we didn't find another start delimiter, the text // token goes all the way to the end of the template. if (!match) { text = source.toString(); source.advance(source.length()); } else { text = source.substring(matcher.start()); startDelimiterToken = source.substring(matcher.start(), matcher.end()); // advance to after the start delimiter source.advance(matcher.end()); } // trim leading whitespace from this text if we previously // encountered the appropriate whitespace trim character if (trimLeadingWhitespaceFromNextData) { text = StringUtils.ltrim(text); trimLeadingWhitespaceFromNextData = false; } Token textToken = pushToken(Type.TEXT, text); if (match) { checkForLeadingWhitespaceTrim(textToken); if (this.syntax.getCommentOpenDelimiter().equals(startDelimiterToken)) { // we don't actually push any tokens for comments pushState(State.COMMENT); } else if (this.syntax.getPrintOpenDelimiter().equals(startDelimiterToken)) { pushToken(Token.Type.PRINT_START); pushState(State.PRINT); } else if ((this.syntax.getExecuteOpenDelimiter().equals(startDelimiterToken))) { // check for verbatim tag Matcher verbatimStartMatcher = this.syntax.getRegexVerbatimStart().matcher(source); if (verbatimStartMatcher.lookingAt()) { lexVerbatimData(verbatimStartMatcher); pushState(State.DATA); } else { pushToken(Token.Type.EXECUTE_START); pushState(State.EXECUTE); } } } } /** * Tokenizes between execute delimiters. 
* * @throws ParserException */ private void lexExecute() throws ParserException { // check for the trailing whitespace trim character checkForTrailingWhitespaceTrim(); Matcher matcher = this.syntax.getRegexExecuteClose().matcher(source); // check if we are at the execute closing delimiter if (brackets.isEmpty() && matcher.lookingAt()) { pushToken(Token.Type.EXECUTE_END, this.syntax.getExecuteCloseDelimiter()); source.advance(matcher.end()); popState(); } else { lexExpression(); } } /** * Tokenizes between print delimiters. * * @throws ParserException */ private void lexPrint() throws ParserException { // check for the trailing whitespace trim character checkForTrailingWhitespaceTrim(); Matcher matcher = this.syntax.getRegexPrintClose().matcher(source); // check if we are at the print closing delimiter if (brackets.isEmpty() && matcher.lookingAt()) { pushToken(Token.Type.PRINT_END, this.syntax.getPrintCloseDelimiter()); source.advance(matcher.end()); popState(); } else { lexExpression(); } } /** * Tokenizes between comment delimiters. *

* Simply find the closing delimiter for the comment and move the cursor to * that point. * * @throws ParserException */ private void lexComment() throws ParserException { // all we need to do is find the end of the comment. Matcher matcher = this.syntax.getRegexCommentClose().matcher(source); boolean match = matcher.find(0); if (!match) { throw new ParserException(null, "Unclosed comment.", source.getLineNumber(), source.getFilename()); } /* * check if the commented ended with the whitespace trim character by * reversing the comment and performing a regular forward regex search. */ String comment = source.substring(matcher.start()); String reversedComment = new StringBuilder(comment).reverse().toString(); Matcher whitespaceTrimMatcher = this.syntax.getRegexLeadingWhitespaceTrim().matcher(reversedComment); if (whitespaceTrimMatcher.lookingAt()) { this.trimLeadingWhitespaceFromNextData = true; } // move cursor to end of comment (and closing delimiter) source.advance(matcher.end()); popState(); } /** * Tokenizing an expression which can be found within both execute and print * regions. 
* * @throws ParserException */ private void lexExpression() throws ParserException { String token; // whitespace source.advanceThroughWhitespace(); /* * Matcher matcher = REGEX_WHITESPACE.matcher(source); if * (matcher.lookingAt()) { source.advance(matcher.end()); } */ // operators Matcher matcher = regexOperators.matcher(source); if (matcher.lookingAt()) { token = source.substring(matcher.end()); pushToken(Token.Type.OPERATOR, token); source.advance(matcher.end()); return; } // names matcher = REGEX_NAME.matcher(source); if (matcher.lookingAt()) { token = source.substring(matcher.end()); pushToken(Token.Type.NAME, token); source.advance(matcher.end()); return; } // numbers matcher = REGEX_NUMBER.matcher(source); if (matcher.lookingAt()) { token = source.substring(matcher.end()); pushToken(Token.Type.NUMBER, token); source.advance(matcher.end()); return; } // punctuation if (PUNCTUATION.indexOf(source.charAt(0)) >= 0) { String character = String.valueOf(source.charAt(0)); // opening bracket if ("([{".indexOf(character) >= 0) { brackets.push(new Pair(character, source.getLineNumber())); } // closing bracket else if (")]}".indexOf(character) >= 0) { if (brackets.isEmpty()) throw new ParserException(null, "Unexpected \"" + character + "\"", source.getLineNumber(), source.getFilename()); else { HashMap validPairs = new HashMap<>(); validPairs.put("(", ")"); validPairs.put("[", "]"); validPairs.put("{", "}"); String lastBracket = brackets.pop().getLeft(); String expected = validPairs.get(lastBracket); if (!expected.equals(character)) { throw new ParserException(null, "Unclosed \"" + expected + "\"", source.getLineNumber(), source.getFilename()); } } } pushToken(Token.Type.PUNCTUATION, character); source.advance(1); return; } // strings matcher = REGEX_STRING.matcher(source); if (matcher.lookingAt()) { token = source.substring(matcher.end()); source.advance(matcher.end()); char quotationType = token.charAt(0); // remove first and last quotation marks token = 
token.substring(1, token.length() - 1); // remove backslashes used to escape inner quotation marks if (quotationType == '\'') { token = token.replaceAll("\\\\(')", "$1"); } else if (quotationType == '"') { token = token.replaceAll("\\\\(\")", "$1"); } pushToken(Token.Type.STRING, token); return; } // we should have found something and returned by this point throw new ParserException(null, String.format("Unexpected character [%s]", source.charAt(0)), source.getLineNumber(), source.getFilename()); } private void checkForLeadingWhitespaceTrim(Token leadingToken) { Matcher whitespaceTrimMatcher = this.syntax.getRegexLeadingWhitespaceTrim().matcher(source); if (whitespaceTrimMatcher.lookingAt()) { if (leadingToken != null) { leadingToken.setValue(StringUtils.rtrim(leadingToken.getValue())); } source.advance(whitespaceTrimMatcher.end()); } } private void checkForTrailingWhitespaceTrim() { Matcher whitespaceTrimMatcher = this.syntax.getRegexTrailingWhitespaceTrim().matcher(source); if (whitespaceTrimMatcher.lookingAt()) { this.trimLeadingWhitespaceFromNextData = true; } } /** * Implementation of the "verbatim" tag * * @throws ParserException */ private void lexVerbatimData(Matcher verbatimStartMatcher) throws ParserException { // move cursor past the opening verbatim tag source.advance(verbatimStartMatcher.end()); // look for the "endverbatim" tag and storing everything between // now and then into a TEXT node Matcher verbatimEndMatcher = this.syntax.getRegexVerbatimEnd().matcher(source); // check for EOF if (!verbatimEndMatcher.find()) { throw new ParserException(null, "Unclosed verbatim tag.", source.getLineNumber(), source.getFilename()); } String verbatimText = source.substring(verbatimEndMatcher.start()); // check if the verbatim start tag has a trailing whitespace trim if ( != null) { verbatimText = StringUtils.ltrim(verbatimText); } // check if the verbatim end tag had a leading whitespace trim if ( != null) { verbatimText = StringUtils.rtrim(verbatimText); } // 
check if the verbatim end tag had a trailing whitespace trim if ( != null) { trimLeadingWhitespaceFromNextData = true; } // move cursor past the verbatim text and end delimiter source.advance(verbatimEndMatcher.end()); pushToken(Type.TEXT, verbatimText); } /** * Create a Token of a certain type but has no particular value. This will * pass control to the overloaded method that will push this token into a * list of tokens that we are maintaining. * * @param type The type of Token we are creating */ private Token pushToken(Token.Type type) { return pushToken(type, null); } /** * Create a Token of a certain type and value and push it into the list of * tokens that we are maintaining. ` * * @param type The type of token we are creating * @param value The value of the new token */ private Token pushToken(Token.Type type, String value) { // ignore empty text tokens if (type.equals(Token.Type.TEXT) && (value == null || "".equals(value))) { return null; } Token result = new Token(type, value, source.getLineNumber()); this.tokens.add(result); return result; } /** * Pushes the current state onto the stack and then updates the current * state to the new state. * * @param state The new state to use as the current state */ private void pushState(State state) { this.states.push(this.state); this.state = state; } /** * Pop state from the stack */ private void popState() { this.state = this.states.pop(); } /** * Retrieves the operators (both unary and binary) from the PebbleEngine and * then dynamically creates one giant regular expression to detect for the * existence of one of these operators. 
* * @return Pattern The regular expression used to find an operator */ private void buildOperatorRegex() { List operators = new ArrayList<>(); for (UnaryOperator operator : unaryOperators) { operators.add(operator.getSymbol()); } for (BinaryOperator operator : binaryOperators) { operators.add(operator.getSymbol()); } /* * Since java's matcher doesn't conform with the posix standard of * matching the longest alternative (it matches the first alternative), * we must first sort all of the operators by length before creating the * regex. This is to help match "is not" over "is". */ Collections.sort(operators, new StringLengthComparator()); StringBuilder regex = new StringBuilder("^"); boolean isFirst = true; for (String operator : operators) { if (isFirst) { isFirst = false; } else { regex.append("|"); } regex.append(Pattern.quote(operator)); /* * If the operator ends in an alpha character we use a negative * lookahead assertion to make sure the next character in the stream * is NOT an alpha character. This ensures user can type * "organization" without the "or" being parsed as an operator. */ char nextChar = operator.charAt(operator.length() - 1); if (Character.isLetter(nextChar) || Character.getType(nextChar) == Character.LETTER_NUMBER) { regex.append("(?![a-zA-Z])"); } } this.regexOperators = Pattern.compile(regex.toString()); } }

© 2015 - 2024 Weber Informatics LLC | Privacy Policy