se.natusoft.doc.markdown.parser.MarkdownParser.groovy Maven / Gradle / Ivy
/*
*
* PROJECT
* Name
* MarkdownDoc Library
*
* Code Version
* 1.2.9
*
* Description
* Parses markdown and generates HTML and PDF.
*
* COPYRIGHTS
* Copyright (C) 2012 by Natusoft AB All rights reserved.
*
* LICENSE
* Apache 2.0 (Open Source)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* AUTHORS
* Tommy Svensson ([email protected])
* Changes:
* 2012-11-04: Created!
*
*/
package se.natusoft.doc.markdown.parser
import se.natusoft.doc.markdown.api.Parser
import se.natusoft.doc.markdown.exception.ParseException
import se.natusoft.doc.markdown.io.Line
import se.natusoft.doc.markdown.io.LineReader
import se.natusoft.doc.markdown.model.*
import se.natusoft.doc.markdown.parser.markdown.io.MDLine
import se.natusoft.doc.markdown.parser.markdown.io.MDLineReader
import se.natusoft.doc.markdown.parser.markdown.model.MDImage
import se.natusoft.doc.markdown.parser.markdown.model.MDLink
import se.natusoft.doc.markdown.parser.markdown.model.MDList
/**
* A parser that parses Markdown.
*
* This implements Markdown as documented on http://daringfireball.net/projects/markdown/syntax
* with the following exceptions:
*
* - No entity encoding of email addresses.
* - No multiple block quote levels.
* - '\' will treat the next char as text no matter what it is.
*
* <, >, and & is not handled by this parser but by the HTMLGenerator instead since
* this tool also can generate PDF such HTML specifics should not be in the parsed text.
*/
public class MarkdownParser implements Parser {
//
// Private Members
//
/** Holds parsed links. Links can be built in 2 different places. */
private Map links = new HashMap()
/** The file we are parsing. We save this to pass to ParseException. */
private File file;
//
// Methods
//
/**
* Parses a markdown file and adds its document structure to the passed Doc.
*
* @param doc The parsed result is added to this.
* @param parseFile The file whose content to parse.
*
* @throws IOException on failure.
* @throws ParseException on parse failures.
*/
@Override
public void parse(Doc doc, File parseFile, Properties parserOptions) throws IOException, ParseException {
this.file = parseFile
parse(doc, new FileInputStream(parseFile), parserOptions);
}
/**
* Parses a markdown stream and adds its document structure to the passed Doc.
*
* @param doc The parsed result is added to this.
* @param parseStream The stream whose content to parse.
*
* @throws IOException on failure.
* @throws ParseException on parse failures.
*/
@Override
public void parse(Doc doc, InputStream parseStream, Properties parserOptions) throws IOException, ParseException {
LineReader lineReader = null
try {
lineReader = new MDLineReader(new InputStreamReader(parseStream))
DocItem prevDocItem = null
Stack hierarchyStack = new Stack();
while (lineReader.hasLine()) {
MDLine line = (MDLine)lineReader.readLine()
if (!line.empty) {
DocItem docItem = null
switch (line) {
case { it.commentStart } : docItem = parseComment (line, lineReader); break
case { it.header } : docItem = parseHeader (line, lineReader); break
case { it.list && (it.leadingSpaces < 4 || (prevDocItem != null && prevDocItem.isHierarchy)) } :
docItem = parseList (line, lineReader); break
case { it.codeBlock } : docItem = parseCodeBlock (line, lineReader); break
case { it.blockQuote } : docItem = parseBlockQuote (line, lineReader); break
case { it.horizRuler } : docItem = new HorizontalRule(); break
case { it.isLinkURLSpec(this.links)} : parseLinkUrlSpec(line); break
// The annoying underline header format.
case { lineReader.hasLine() && (lineReader.peekNextLine().contains("----") ||
lineReader.peekNextLine().contains("====")) } :
docItem = parseHeader(line, lineReader)
break
default:
Paragraph paragraph = new Paragraph()
parseParagraph(paragraph, line, lineReader)
docItem = paragraph
}
// Handle specific DocItem subclass behavior.
if (docItem != null && docItem.keepConsecutiveTogether) {
if (prevDocItem != null && prevDocItem.isSameType(docItem)) {
boolean addItem = true
if (docItem.isHierarchy) {
if (docItem.isHierarchyDown(prevDocItem)) {
prevDocItem.addItem(docItem)
addItem = false
hierarchyStack.push(prevDocItem)
prevDocItem = docItem
}
else if (docItem.isHierarchyUp(prevDocItem)) {
while (docItem.isHierarchyUp(prevDocItem)) {
prevDocItem = hierarchyStack.pop()
}
}
}
if (prevDocItem.addBetweenKeepTogether != null) {
prevDocItem.addItem(prevDocItem.addBetweenKeepTogether)
}
if (addItem) {
for (DocItem content : docItem.items) {
prevDocItem.addItem(content)
}
}
docItem = null
}
}
if (docItem != null) {
setParseFileOnDocItems(docItem, this.file != null ? this.file : null)
doc.addItem(docItem)
prevDocItem = docItem
}
}
}
}
catch (ParseException pe) {
throw pe;
}
catch (Exception e) {
throw new ParseException(file: this.file, line: lineReader.getLastReadLine(), lineNo: lineReader.getLineNo(), message: "Unknown error", cause: e)
}
finally {
if (lineReader != null) {
lineReader.close()
}
}
}
/**
* This will provide the parse file to each DocItem created by this parser run.
*
* @param docItems
* @param parseFile
*/
private void setParseFileOnDocItems(DocItem docItem, File parseFile) {
docItem.parseFile = parseFile
if (docItem.hasSubItems()) {
for (DocItem subDocItem : docItem.items) {
setParseFileOnDocItems(subDocItem, parseFile)
}
}
}
/**
* Returns true if the extension of the specified file is a valid extension for this parser.
*
* @param fileName The file to check extension of.
*/
@Override
boolean validFileExtension(String fileName) {
return fileName.endsWith(".md") || fileName.endsWith(".markdown") || fileName.endsWith(".mdpart")
}
/**
* Parses a html comment
*
* @param line The current line.
* @param lineReader To read more lines from.
*
* @return A Comment.
*/
private DocItem parseComment(Line line, LineReader lineReader) {
MDLine mdline = (MDLine)line
StringBuilder sb = new StringBuilder()
if (!mdline.commentEnd) {
while (lineReader.hasLine() && !((MDLine)lineReader.peekNextLine()).commentEnd) {
line = lineReader.readLine()
sb.append("\n")
sb.append(line.toString())
}
lineReader.readLine() // We have to remove the last "-->"
}
else {
String cmLine = line.toString().substring(5)
cmLine = cmLine.substring(0, cmLine.length() - 4)
sb.append(cmLine)
}
return new Comment(text: sb.toString())
}
/**
* Parses a header.
*
* @param line The current line.
* @param lineReader To read more lines from.
*
* @return A Header.
*
* @throws IOException on failure to read input.
* @throws ParseException On bad format being parsed.
*/
private DocItem parseHeader(Line line, LineReader lineReader) throws IOException, ParseException {
String text = line.toString()
Header.Level level = null;
switch (text) {
case { it.startsWith("######") } : level = Header.Level.H6; break
case { it.startsWith("#####") } : level = Header.Level.H5; break
case { it.startsWith("####") } : level = Header.Level.H4; break
case { it.startsWith("###") } : level = Header.Level.H3; break
case { it.startsWith("##") } : level = Header.Level.H2; break
case { it.startsWith("#") } : level = Header.Level.H1; break
case { lineReader.hasLine() && lineReader.peekNextLine().contains("===") } :
level = Header.Level.H1
lineReader.readLine()
break;
case { lineReader.hasLine() && lineReader.peekNextLine().contains("---") } :
level = Header.Level.H2
lineReader.readLine()
break
default: throw new ParseException(file: this.file != null ? this.file.toString() : "stream", lineNo: lineReader.lineNo,
message: "Bad header found in line: '" + line.toString() + "'!")
}
Header header = new Header(level: level)
header.addItem(line.removeAll("#").removeLeadingSpaces())
// I comment this part out for now since other MD tools seems only allow one line for heading.
//while (lineReader.hasLine() && !lineReader.peekNextLine().empty) {
// line = lineReader.readLine()
// header.addItem(" ")
// header.addItem(line.removeAll("#").removeLeadingSpaces().toString())
//}
return header
}
/**
* Parses a code block.
*
* @param line The current line.
* @param lineReader To read more lines from.
*
* @return A CodeBlock.
*
* @throws IOException on input failure.
*/
private DocItem parseCodeBlock(Line line, LineReader lineReader) throws IOException {
CodeBlock codeBlock = new CodeBlock()
codeBlock.addItem(line)
while (lineReader.hasLine() && ((MDLine)lineReader.peekNextLine()).codeBlock) {
line = lineReader.readLine()
codeBlock.addItem(line)
}
return codeBlock
}
/**
* Parses a block quote.
*
* @param line The current line.
* @param lineReader To read more lines from.
*
* @return A BlockQuote.
*
* @throws IOException on input failure.
*/
private DocItem parseBlockQuote(Line line, LineReader lineReader) throws IOException {
BlockQuote blockQuote = new BlockQuote()
parseParagraph(blockQuote, line.removeFirstWord(), lineReader, ">")
return blockQuote
}
/**
* Parses a list entry.
*
* @param line The current line.
* @param lineReader To read more lines from.
*
* @return A ListItem.
*
* @throws IOException on input failure.
*/
private DocItem parseList(MDLine line, LineReader lineReader) throws IOException {
MDList list = new MDList(ordered: line.orderedList, indentLevel: line.leadingSpaces)
boolean isList = true
int indent = line.leadingSpaces >= 3 ? line.leadingSpaces : 3
ListItem listItem = new ListItem()
Paragraph liParagraph = new Paragraph()
parseParagraph(liParagraph, line.removeFirstWord(), lineReader, isList)
listItem.addItem(liParagraph)
MDLine peekLine = (MDLine)lineReader.peekNextLine()
while (peekLine != null && peekLine.leadingSpaces >= indent && !peekLine.isList()) {
line = (MDLine)lineReader.readLine()
liParagraph = new Paragraph()
parseParagraph(liParagraph, line, lineReader, isList)
listItem.addItem(liParagraph)
peekLine = (MDLine)lineReader.peekNextLine()
}
list.addItem(listItem)
return list
}
/**
* Adds urls to already parsed links.
*
* @param line The current line
*/
private void parseLinkUrlSpec(Line line) throws ParseException {
line = line.removeAll("\\[").removeAll("\\]:")
if (line.numberOfWords < 2) {
throw new ParseException(file: this.file, lineNo: line.lineNumber, message: "Bad link url specification: '" +
line.toString() + "'!")
}
Link link = this.links[line.getWord(0)]
if (link == null) {
throw new ParseException(file: this.file, lineNo: line.lineNumber, message: "The specified link is undefined! '" + line.toString() + "'")
}
link.url = line.getWord(1)
if (line.numberOfWords > 2) {
line.currentWordPosition = 1
String space = ""
link.title = ""
while (line.hasMoreWords()) {
link.title = link.title + space + line.nextWord
space = " "
}
link.title = link.title.substring(1, link.title.length() - 1)
}
}
/**
* Parses a paragraph of text.
*
* @param paragraph The paragraph to parse.
* @param line The current line.
* @param lineReader To read more lines from.
*
* @throws IOException
* @throws ParseException
*/
private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader) throws IOException, ParseException {
parseParagraph(paragraph, line, lineReader, null)
}
/**
* Parses a paragraph of text.
*
* @param paragraph The paragraph to parse.
* @param line The current line.
* @param lineReader To read more lines from.
*
* @throws IOException
* @throws ParseException
*/
private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, boolean isList) throws IOException, ParseException {
parseParagraph(paragraph, line, lineReader, null, isList)
}
/**
* Parses a paragraph of text.
*
* @param paragraph The paragraph to parse.
* @param line The current line.
* @param lineReader To read more lines from.
* @param removeBeginningWord if non null and this matches the first word of a line that word is removed.
*
* @throws IOException
* @throws ParseException
*/
private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, String removeBeginningWord) throws IOException, ParseException {
parseParagraph(paragraph, line, lineReader, removeBeginningWord, false)
}
/**
* Parses a paragraph of text.
*
* @param paragraph The paragraph to parse.
* @param line The current line.
* @param lineReader To read more lines from.
* @param removeBeginningWord if non null and this matches the first word of a line that word is removed.
*
* @throws IOException
* @throws ParseException
*/
private void parseParagraph(Paragraph paragraph, Line line, LineReader lineReader, String removeBeginningWord, boolean isList) throws IOException, ParseException {
StringBuilder sb = new StringBuilder();
boolean done = false;
String space = ""
while (!done) {
if (removeBeginningWord != null && line.numberOfWords > 0 && line.getWord(0).equals(removeBeginningWord)) {
line = line.removeFirstWord()
}
line.eachWord {
sb << space
sb << it
space = " "
}
line = lineReader.readLine()
done = line == null || line.isEmpty()
if (!done && line != null && (isList ? !((MDLine)line).isPartOfListParagraph(links) : !((MDLine)line).isPartOfParagraph(links))) {
done = true
lineReader.pushBackLine(line)
}
}
DocItem current = new PlainText(renderPrefixedSpace: false)
LinkedList itemStack = new LinkedList()
char p = 0
boolean escapeChar = false
for (int i = 0 ; i < sb.length(); i++) {
int j = (i + 1) < sb.length() ? i + 1 : -1
char c = sb.charAt(i);
char n = j > 0 ? sb.charAt(j) : 0
if (escapeChar) {
current << c
escapeChar = false
}
else {
switch (c) {
case '\\':
escapeChar = true;
break
case { it == '.' && !(current instanceof Link)}:
case { it == ',' && !(current instanceof Link)}:
case { it == '!' && n != '[' && !(current instanceof Link)} :
case { it == '?' && !(current instanceof Link)}:
if (n == ' ') {
paragraph.addItem(current)
current = current.createNewWithSameConfig()
paragraph.addItem(new PlainText(text: c, renderPrefixedSpace: false))
}
else {
current << c
}
break
case {
it == '_' &&
!(current instanceof Link) &&
(
(
(current instanceof Strong) ||
(current instanceof Emphasis)
) ||
(
(i+2) < sb.length() &&
sb.substring(i+2).contains("_")
)
)
}:
case {
it == '*' &&
!(current instanceof Link) &&
(
(
(current instanceof Strong) ||
(current instanceof Emphasis)
) ||
(
(i+2) < sb.length() &&
sb.substring(i+2).contains("*")
)
)
}:
paragraph.addItem(current)
if (n == '_' || n == '*') {
++i
if (current instanceof Strong) {
current = itemStack.pop().createNewWithSameConfig()
}
else {
itemStack.push(current)
current = new Strong(renderPrefixedSpace: false)
}
}
else {
if (current instanceof Emphasis) {
current = itemStack.pop().createNewWithSameConfig()
}
else {
itemStack.push(current)
current = new Emphasis(renderPrefixedSpace: false)
}
}
break
case {it == "&" && n == "n"} :
paragraph.addItem(new Space())
i = i + 5
break;
case '`':
paragraph.addItem(current)
if (current instanceof Code) {
current = itemStack.pop().createNewWithSameConfig()
}
else {
itemStack.push(current)
current = new Code(renderPrefixedSpace: false)
}
break
case { it == '[' && p != '!' && !(current instanceof Link)}:
paragraph.addItem(current)
itemStack.push(current)
current = new MDLink(renderPrefixedSpace: false)
break
case { it == '!' && n == '[' && !(current instanceof Link)}:
paragraph.addItem(current)
itemStack.push(current)
current = new MDImage(renderPrefixedSpace: false)
++i
break
case ']':
if (current instanceof Link) {
if (n != '(') {
paragraph.addItem(current)
this.links.put(((Link)current).text, current)
current = itemStack.pop().createNewWithSameConfig()
}
}
else {
current << c
}
break;
case ')':
if (current instanceof Link) {
paragraph.addItem(current)
this.links.put(((Link)current).text, current)
current = itemStack.pop().createNewWithSameConfig()
}
else {
current << c
}
break
case { it == '<' && (current.class == PlainText.class)} :
paragraph.addItem(current)
itemStack.push(current)
current = new AutoLink(renderPrefixedSpace: false)
break;
case { it == '>' && (current.class == AutoLink.class)}:
paragraph.addItem(current)
current = itemStack.pop().createNewWithSameConfig()
break
default:
current << c
}
}
p = c
}
paragraph.addItem(current)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy