All Downloads are FREE. Search and download functionalities are using the official Maven repository.

se.natusoft.doc.markdown.parser.MarkdownParser.groovy Maven / Gradle / Ivy

/* 
 * 
 * PROJECT
 *     Name
 *         MarkdownDoc Library
 *     
 *     Code Version
 *         1.2.9
 *     
 *     Description
 *         Parses markdown and generates HTML and PDF.
 *         
 * COPYRIGHTS
 *     Copyright (C) 2012 by Natusoft AB All rights reserved.
 *     
 * LICENSE
 *     Apache 2.0 (Open Source)
 *     
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *     
 *       http://www.apache.org/licenses/LICENSE-2.0
 *     
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *     
 * AUTHORS
 *     Tommy Svensson ([email protected])
 *         Changes:
 *         2012-11-04: Created!
 *         
 */
package se.natusoft.doc.markdown.parser

import se.natusoft.doc.markdown.api.Parser
import se.natusoft.doc.markdown.exception.ParseException
import se.natusoft.doc.markdown.io.Line
import se.natusoft.doc.markdown.io.LineReader
import se.natusoft.doc.markdown.model.*
import se.natusoft.doc.markdown.parser.markdown.io.MDLine
import se.natusoft.doc.markdown.parser.markdown.io.MDLineReader
import se.natusoft.doc.markdown.parser.markdown.model.MDImage
import se.natusoft.doc.markdown.parser.markdown.model.MDLink
import se.natusoft.doc.markdown.parser.markdown.model.MDList

/**
 * A parser that parses Markdown.
 * 

* This implements Markdown as documented on http://daringfireball.net/projects/markdown/syntax * with the following exceptions: *

    *
  • No entity encoding of email addresses.
  • *
  • No multiple block quote levels.
  • *
  • '\' will treat the next char as text no matter what it is.
  • *
* <, >, and & is not handled by this parser but by the HTMLGenerator instead since * this tool also can generate PDF such HTML specifics should not be in the parsed text. */ public class MarkdownParser implements Parser { // // Private Members // /** Holds parsed links. Links can be built in 2 different places. */ private Map links = new HashMap() /** The file we are parsing. We save this to pass to ParseException. */ private File file; // // Methods // /** * Parses a markdown file and adds its document structure to the passed Doc. * * @param doc The parsed result is added to this. * @param parseFile The file whose content to parse. * * @throws IOException on failure. * @throws ParseException on parse failures. */ @Override public void parse(Doc doc, File parseFile, Properties parserOptions) throws IOException, ParseException { this.file = parseFile parse(doc, new FileInputStream(parseFile), parserOptions); } /** * Parses a markdown stream and adds its document structure to the passed Doc. * * @param doc The parsed result is added to this. * @param parseStream The stream whose content to parse. * * @throws IOException on failure. * @throws ParseException on parse failures. */ @Override public void parse(Doc doc, InputStream parseStream, Properties parserOptions) throws IOException, ParseException { LineReader lineReader = null try { lineReader = new MDLineReader(new InputStreamReader(parseStream)) DocItem prevDocItem = null Stack hierarchyStack = new Stack(); while (lineReader.hasLine()) { MDLine line = (MDLine)lineReader.readLine() if (!line.empty) { DocItem docItem = null switch (line) { case { it.commentStart } : docItem = parseComment (line, lineReader); break case { it.header } : docItem = parseHeader (line, lineReader); break case { it.list && (it.leadingSpaces < 4 || (prevDocItem != null && prevDocItem.isHierarchy)) } : docItem = parseList (line, lineReader); break case { it.codeBlock } : docItem = parseCodeBlock (line, lineReader); break case { it.blockQuote } : docItem = parseBlockQuote (line, lineReader); break case { it.horizRuler } : docItem = new HorizontalRule(); break case { it.isLinkURLSpec(this.links)} : parseLinkUrlSpec(line); break // The annoying underline header format. case { lineReader.hasLine() && (lineReader.peekNextLine().contains("----") || lineReader.peekNextLine().contains("====")) } : docItem = parseHeader(line, lineReader) break default: Paragraph paragraph = new Paragraph() parseParagraph(paragraph, line, lineReader) docItem = paragraph } // Handle specific DocItem subclass behavior. if (docItem != null && docItem.keepConsecutiveTogether) { if (prevDocItem != null && prevDocItem.isSameType(docItem)) { boolean addItem = true if (docItem.isHierarchy) { if (docItem.isHierarchyDown(prevDocItem)) { prevDocItem.addItem(docItem) addItem = false hierarchyStack.push(prevDocItem) prevDocItem = docItem } else if (docItem.isHierarchyUp(prevDocItem)) { while (docItem.isHierarchyUp(prevDocItem)) { prevDocItem = hierarchyStack.pop() } } } if (prevDocItem.addBetweenKeepTogether != null) { prevDocItem.addItem(prevDocItem.addBetweenKeepTogether) } if (addItem) { for (DocItem content : docItem.items) { prevDocItem.addItem(content) } } docItem = null } } if (docItem != null) { setParseFileOnDocItems(docItem, this.file != null ? this.file : null) doc.addItem(docItem) prevDocItem = docItem } } } } catch (ParseException pe) { throw pe; } catch (Exception e) { throw new ParseException(file: this.file, line: lineReader.getLastReadLine(), lineNo: lineReader.getLineNo(), message: "Unknown error", cause: e) } finally { if (lineReader != null) { lineReader.close() } } } /** * This will provide the parse file to each DocItem created by this parser run. * * @param docItems * @param parseFile */ private void setParseFileOnDocItems(DocItem docItem, File parseFile) { docItem.parseFile = parseFile if (docItem.hasSubItems()) { for (DocItem subDocItem : docItem.items) { setParseFileOnDocItems(subDocItem, parseFile) } } } /** * Returns true if the extension of the specified file is a valid extension for this parser. * * @param fileName The file to check extension of. */ @Override boolean validFileExtension(String fileName) { return fileName.endsWith(".md") || fileName.endsWith(".markdown") || fileName.endsWith(".mdpart") } /** * Parses a html comment * * @param line The current line. * @param lineReader To read more lines from. * * @return A Comment. */ private DocItem parseComment(Line line, LineReader lineReader) { MDLine mdline = (MDLine)line StringBuilder sb = new StringBuilder() if (!mdline.commentEnd) { while (lineReader.hasLine() && !((MDLine)lineReader.peekNextLine()).commentEnd) { line = lineReader.readLine() sb.append("\n") sb.append(line.toString()) } lineReader.readLine() // We have to remove the last "-->" } else { String cmLine = line.toString().substring(5) cmLine = cmLine.substring(0, cmLine.length() - 4) sb.append(cmLine) } return new Comment(text: sb.toString()) } /** * Parses a header. * * @param line The current line. * @param lineReader To read more lines from. * * @return A Header. * * @throws IOException on failure to read input. * @throws ParseException On bad format being parsed. */ private DocItem parseHeader(Line line, LineReader lineReader) throws IOException, ParseException { String text = line.toString() Header.Level level = null; switch (text) { case { it.startsWith("######") } : level = Header.Level.H6; break case { it.startsWith("#####") } : level = Header.Level.H5; break case { it.startsWith("####") } : level = Header.Level.H4; break case { it.startsWith("###") } : level = Header.Level.H3; break case { it.startsWith("##") } : level = Header.Level.H2; break case { it.startsWith("#") } : level = Header.Level.H1; break case { lineReader.hasLine() && lineReader.peekNextLine().contains("===") } : level = Header.Level.H1 lineReader.readLine() break; case { lineReader.hasLine() && lineReader.peekNextLine().contains("---") } : level = Header.Level.H2 lineReader.readLine() break default: throw new ParseException(file: this.file != null ? this.file.toString() : "stream", lineNo: lineReader.lineNo, message: "Bad header found in line: '" + line.toString() + "'!") } Header header = new Header(level: level) header.addItem(line.removeAll("#").removeLeadingSpaces()) // I comment this part out for now since other MD tools seems only allow one line for heading. //while (lineReader.hasLine() && !lineReader.peekNextLine().empty) { // line = lineReader.readLine() // header.addItem(" ") // header.addItem(line.removeAll("#").removeLeadingSpaces().toString()) //} return header } /** * Parses a code block. * * @param line The current line. * @param lineReader To read more lines from. * * @return A CodeBlock. * * @throws IOException on input failure. */ private DocItem parseCodeBlock(Line line, LineReader lineReader) throws IOException { CodeBlock codeBlock = new CodeBlock() codeBlock.addItem(line) while (lineReader.hasLine() && ((MDLine)lineReader.peekNextLine()).codeBlock) { line = lineReader.readLine() codeBlock.addItem(line) } return codeBlock } /** * Parses a block quote. * * @param line The current line. * @param lineReader To read more lines from. * * @return A BlockQuote. * * @throws IOException on input failure. */ private DocItem parseBlockQuote(Line line, LineReader lineReader) throws IOException { BlockQuote blockQuote = new BlockQuote() parseParagraph(blockQuote, line.removeFirstWord(), lineReader, ">") return blockQuote } /** * Parses a list entry. * * @param line The current line. * @param lineReader To read more lines from. * * @return A ListItem. * * @throws IOException on input failure. */ private DocItem parseList(MDLine line, LineReader lineReader) throws IOException { MDList list = new MDList(ordered: line.orderedList, indentLevel: line.leadingSpaces) boolean isList = true int indent = line.leadingSpaces >= 3 ? line.leadingSpaces : 3 ListItem listItem = new ListItem() Paragraph liParagraph = new Paragraph() parseParagraph(liParagraph, line.removeFirstWord(), lineReader, isList) listItem.addItem(liParagraph) MDLine peekLine = (MDLine)lineReader.peekNextLine() while (peekLine != null && peekLine.leadingSpaces >= indent && !peekLine.isList()) { line = (MDLine)lineReader.readLine() liParagraph = new Paragraph() parseParagraph(liParagraph, line, lineReader, isList) listItem.addItem(liParagraph) peekLine = (MDLine)lineReader.peekNextLine() } list.addItem(listItem) return list } /** * Adds urls to already parsed links. * * @param line The current line */ private void parseLinkUrlSpec(Line line) throws ParseException { line = line.removeAll("\\[").removeAll("\\]:") if (line.numberOfWords < 2) { throw new ParseException(file: this.file, lineNo: line.lineNumber, message: "Bad link url specification: '" + line.toString() + "'!") } Link link = this.links[line.getWord(0)] if (link == null) { throw new ParseException(file: this.file, lineNo: line.lineNumber, message: "The specified link is undefined! '" + line.toString() + "'") } link.url = line.getWord(1) if (line.numberOfWords > 2) { line.currentWordPosition = 1 String space = "" link.title = "" while (line.hasMoreWords()) { link.title = link.title + space + line.nextWord space = " " } link.title = link.title.substring(1, link.title.length() - 1) } } /** * Parses a paragraph of text. * * @param paragraph The paragraph to parse. * @param line The current line. * @param lineReader To read more lines from. * * @throws IOException * @throws ParseException */ private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader) throws IOException, ParseException { parseParagraph(paragraph, line, lineReader, null) } /** * Parses a paragraph of text. * * @param paragraph The paragraph to parse. * @param line The current line. * @param lineReader To read more lines from. * * @throws IOException * @throws ParseException */ private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, boolean isList) throws IOException, ParseException { parseParagraph(paragraph, line, lineReader, null, isList) } /** * Parses a paragraph of text. * * @param paragraph The paragraph to parse. * @param line The current line. * @param lineReader To read more lines from. * @param removeBeginningWord if non null and this matches the first word of a line that word is removed. * * @throws IOException * @throws ParseException */ private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, String removeBeginningWord) throws IOException, ParseException { parseParagraph(paragraph, line, lineReader, removeBeginningWord, false) } /** * Parses a paragraph of text. * * @param paragraph The paragraph to parse. * @param line The current line. * @param lineReader To read more lines from. * @param removeBeginningWord if non null and this matches the first word of a line that word is removed. * * @throws IOException * @throws ParseException */ private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, String removeBeginningWord, boolean isList) throws IOException, ParseException { StringBuilder sb = new StringBuilder(); boolean done = false; String space = "" while (!done) { if (removeBeginningWord != null && line.numberOfWords > 0 && line.getWord(0).equals(removeBeginningWord)) { line = line.removeFirstWord() } line.eachWord { sb << space sb << it space = " " } line = lineReader.readLine() done = line == null || line.isEmpty() if (!done && line != null && (isList ? !((MDLine)line).isPartOfListParagraph(links) : !((MDLine)line).isPartOfParagraph(links))) { done = true lineReader.pushBackLine(line) } } DocItem current = new PlainText(renderPrefixedSpace: false) LinkedList itemStack = new LinkedList() char p = 0 boolean escapeChar = false for (int i = 0 ; i < sb.length(); i++) { int j = (i + 1) < sb.length() ? i + 1 : -1 char c = sb.charAt(i); char n = j > 0 ? sb.charAt(j) : 0 if (escapeChar) { current << c escapeChar = false } else { switch (c) { case '\\': escapeChar = true; break case { it == '.' && !(current instanceof Link)}: case { it == ',' && !(current instanceof Link)}: case { it == '!' && n != '[' && !(current instanceof Link)} : case { it == '?' && !(current instanceof Link)}: if (n == ' ') { paragraph.addItem(current) current = current.createNewWithSameConfig() paragraph.addItem(new PlainText(text: c, renderPrefixedSpace: false)) } else { current << c } break case { it == '_' && !(current instanceof Link) && ( ( (current instanceof Strong) || (current instanceof Emphasis) ) || ( (i+2) < sb.length() && sb.substring(i+2).contains("_") ) ) }: case { it == '*' && !(current instanceof Link) && ( ( (current instanceof Strong) || (current instanceof Emphasis) ) || ( (i+2) < sb.length() && sb.substring(i+2).contains("*") ) ) }: paragraph.addItem(current) if (n == '_' || n == '*') { ++i if (current instanceof Strong) { current = itemStack.pop().createNewWithSameConfig() } else { itemStack.push(current) current = new Strong(renderPrefixedSpace: false) } } else { if (current instanceof Emphasis) { current = itemStack.pop().createNewWithSameConfig() } else { itemStack.push(current) current = new Emphasis(renderPrefixedSpace: false) } } break case {it == "&" && n == "n"} : paragraph.addItem(new Space()) i = i + 5 break; case '`': paragraph.addItem(current) if (current instanceof Code) { current = itemStack.pop().createNewWithSameConfig() } else { itemStack.push(current) current = new Code(renderPrefixedSpace: false) } break case { it == '[' && p != '!' && !(current instanceof Link)}: paragraph.addItem(current) itemStack.push(current) current = new MDLink(renderPrefixedSpace: false) break case { it == '!' && n == '[' && !(current instanceof Link)}: paragraph.addItem(current) itemStack.push(current) current = new MDImage(renderPrefixedSpace: false) ++i break case ']': if (current instanceof Link) { if (n != '(') { paragraph.addItem(current) this.links.put(((Link)current).text, current) current = itemStack.pop().createNewWithSameConfig() } } else { current << c } break; case ')': if (current instanceof Link) { paragraph.addItem(current) this.links.put(((Link)current).text, current) current = itemStack.pop().createNewWithSameConfig() } else { current << c } break case { it == '<' && (current.class == PlainText.class)} : paragraph.addItem(current) itemStack.push(current) current = new AutoLink(renderPrefixedSpace: false) break; case { it == '>' && (current.class == AutoLink.class)}: paragraph.addItem(current) current = itemStack.pop().createNewWithSameConfig() break default: current << c } } p = c } paragraph.addItem(current) } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy