All downloads are free. The search and download functionality uses the official Maven repository.

de.viaboxx.markdown.Confluence2MD.groovy Maven / Gradle / Ivy

There is a newer version: 2.1
Show newest version
package de.viaboxx.markdown

import groovy.json.JsonSlurper
import groovy.util.slurpersupport.Node
import net.sourceforge.plantuml.SourceStringReader
import org.apache.commons.io.FileUtils
import org.apache.commons.lang.StringUtils

import javax.xml.bind.DatatypeConverter

import static de.viaboxx.markdown.Confluence2MD.Mode.*

/**
 * Description: convert a confluence Json page(s)/space to a markdown for pandoc
 *
 * Date: 15.09.14
 *
 * The converter reads a Confluence page as JSON (from {@link #input}), optionally
 * follows its child pages through the Confluence REST API, and writes pandoc-flavoured
 * markdown to {@link #out}. Attachments and REST responses are stored in
 * {@link #downloadFolder}.
 *
 * @see ArgsParser#usage
 */
@SuppressWarnings("GroovyFallthrough")
class Confluence2MD implements Walker {
    protected static final String PLANT_UML = "PlantUML"
    protected static final String PLANTUML_PREFIX1 = PLANT_UML + " - "
    protected static final String PLANTUML_PREFIX2 = PLANT_UML + ": "

    /** Textual replacements for the confluence emoticon names handled by this converter. */
    protected static final Map EMOTICONS = [
            'minus'      : " (-) ",
            'smile'      : " :-) ",
            'sad'        : " :-( ",
            'cheeky'     : " :-P ",
            'laugh'      : " :-D ",
            'wink'       : " ;-) ",
            'thumbs-up'  : " (^.^) ",
            'thumbs-down': " (:-[) ",
            'tick'       : " (ok) ",
            'cross'      : " (x) ",
            'warning'    : " (!) ",
            'question'   : " (?) "
    ]

    /** When true, {@link #log(String)} prints progress messages. */
    boolean verbose
    /**
     * InputStream, URL or File (or something a JsonSlurper can parse)
     */
    def input
    /** Target of the generated markdown. */
    PrintStream out = System.out
    /** Credentials sent as HTTP basic auth when set (base64-encoded as-is). */
    String userPassword

    enum Mode {
        Default, Panel, BlockQuote, CodeBlock, Table
    }

    protected int listIndent = 0, blockIndent = 0
    /** Next number inside an ordered list; null inside unordered lists / outside lists. */
    protected Integer itemNumber
    protected Mode mode = Default
    protected Table table
    protected Map pageIdCache = [:] // key = page.title, value = page.id
    protected Map attachmentCache = [:] // key = page.id, value = attachments (jsonSlurper)
    String wikiServerUrl = "https://viaboxx.atlassian.net/wiki"
    String rootPageId
    /** Maximum page depth to descend; negative means unlimited (see goDeeper()). */
    int maxDepth = -1
    protected int depth = 0
    protected def currentPage
    /** Last text written by walkThrough, or 1 when a SPAN ended without a trailing space. */
    protected def written = null
    File downloadFolder = new File("attachments")
    int maxHeaderDepth = 5 // docx does not support more!

    // REST paths (relative to wikiServerUrl)
    def GET_PAGE_BODY = { pageId -> "/rest/api/content/${pageId}?expand=body.storage" }
    def GET_CHILD_PAGES = { pageId -> "/rest/api/content/${pageId}/child/page?limit=300" }
    def GET_CHILD_ATTACHMENTS = { pageId -> "/rest/api/content/${pageId}/child/attachment?limit=300" }
    def QUERY_PAGE_BY_TITLE = { title -> "/rest/api/content?title=${URLEncoder.encode(title, 'UTF-8')}" }
    def QUERY_PAGE_BY_TITLE_AND_SPACE = { title, space ->
        "/rest/api/content?title=${URLEncoder.encode(title, 'UTF-8')}&spaceKey=${URLEncoder.encode(space, 'UTF-8')}"
    }

    /** Number of PlantUML images generated for the current page (part of the image file name). */
    protected int imageCounter = 0
    boolean runPlantUml = true
    boolean docHierarchy = true, newLine = true, blankLine = true, strong = false
    boolean titleTransformation = true, titleRootPage = true
    File outFile

    ArgsParser getArgsParser() {
        return new ArgsParser()
    }

    static void main(String[] args) {
        def inst = new Confluence2MD()
        if (inst.argsParser.parseArgs(args, inst)) {
            inst.run()
        }
    }

    /**
     * Parses {@link #input}, collects the page ids in scope and then generates the markdown.
     * Input and output are closed afterwards, even when the conversion fails.
     */
    void run() {
        try {
            def page = new JsonSlurper().parse(input)
            fillPageIdCache(page)
            log("Generating markdown")
            parsePages(page)
        } finally {
            close()
        }
        log("Done")
    }

    /**
     * Converts the given page and (depending on rootPageId/maxDepth) its child pages.
     * Pages that come without a body are re-requested with their storage body expanded.
     */
    void parsePages(page) {
        withPages(page) { each ->
            currentPage = each
            imageCounter = 0
            if (!each.body) {
                def childInput = openInput(wikiServerUrl + GET_PAGE_BODY(each.id), each.id + ".body")
                try {
                    parsePage(new JsonSlurper().parse(childInput))
                } finally {
                    close(childInput)
                }
            } else {
                parsePage(each)
            }
        }
    }

    /**
     * Calls the processor for the page and then, when a rootPageId is set and the
     * depth limit allows it, recursively for every child page.
     */
    protected void withPages(def page, Closure processor) {
        processor(page)
        if (rootPageId && goDeeper()) {
            def queryChildren = openInput(wikiServerUrl + GET_CHILD_PAGES(page.id), page.id + ".children")
            try {
                def children = new JsonSlurper().parse(queryChildren)
                depth++
                children.results.each { child ->
                    withPages(child, processor)
                }
                depth--
            } finally {
                close(queryChildren)
            }
        }
    }

    /** Writes the page title as header (with anchor PAGE_&lt;id&gt;) followed by the converted body. */
    void parsePage(page) {
        log("${depth} - Processing page $page.id '$page.title'")
        if (depth > 0 || titleRootPage) {
            writeHeader(1, { write("${pageTitle(page.title)}") }, "PAGE_${page.id}")
        }
        parseBody(page.body.storage.value)
    }

    /**
     * @return the title to print: with titleTransformation enabled, everything after
     *         the first ' - ' of the real title; otherwise the real title
     */
    protected String pageTitle(String realTitle) {
        if (!titleTransformation) return realTitle
        String tok = ' - '
        int pos = realTitle.indexOf(tok)
        if (pos < 0) return realTitle
        return realTitle.substring(pos + tok.length())
    }

    /**
     * Walks the page hierarchy once and remembers title -> id for every page whose
     * header level does not exceed maxHeaderDepth, so that links between pages can
     * be resolved to anchors later.
     */
    protected void fillPageIdCache(def page) {
        withPages(page) { each ->
            int level = effectiveLevel(1)
            if (level <= maxHeaderDepth) {
                log("${depth} - Found page $each.id '$each.title'")
                pageIdCache.put(each.title, each.id)
            } else {
                log("${depth} - Found page $each.id '${each.title}', but header depth $level is too deep in the hierachy")
            }
        }
    }

    protected boolean goDeeper() {
        return (maxDepth < 0 || maxDepth > depth)
    }

    /** Parses the storage-format body as HTML and converts every node below BODY. */
    void parseBody(String body) {
        def xmlSlurper = new XmlSlurper(HTMLParser.build())
        // NOTE(review): the empty prefix/suffix look like wrapper markup that got lost; confirm against upstream.
        def html = xmlSlurper.parseText("" + body + "")
        html.BODY.childNodes().each { Node node ->
            if (!handle(node)) {
                walkThrough(node)
            }
        }
    }

    /** Runs the closure with the output redirected into a buffer and returns what was written. */
    String intoString(Closure process) {
        def baos = new ByteArrayOutputStream()
        def out = new PrintStream(baos)
        withOut(out, process)
        return baos.toString()
    }

    /** Runs the closure with {@link #out} temporarily replaced; the previous stream is always restored. */
    void withOut(final PrintStream out, Closure process) {
        def old = this.out
        this.out = out
        try {
            process()
        } finally {
            this.out = old
        }
    }

    protected String INTERPUNKTION = ".,;:\"'"

    /**
     * Converts all children of the parent: nodes go through {@link #handle(Node)}
     * (and are walked recursively when not handled), text is written escaped.
     */
    void walkThrough(def parent) {
        parent.children().each { def nodeOrText ->
            if (nodeOrText instanceof Node) {
                if (!handle(nodeOrText as Node)) {
                    walkThrough(nodeOrText as Node)
                }
            } else {
                def text = nodeOrText as String
                if (written == 1 && text) {
                    // Workaround: the neko-html parser does not detect spaces between elements.
                    // The SPAN handler sets written = 1 to signal that a space may be missing here.
                    // NOTE(review): the check looks at the LAST character of the following text;
                    // the original examples talk about "no space before a ," - confirm the intent.
                    int lastChar = text.charAt(text.length() - 1)
                    if (INTERPUNKTION.indexOf(lastChar) < 0) writeRaw(' ')
                }
                write(text)
                written = text
            }
        }
    }

    /** Writes the node as header: its trimmed text when it has any, otherwise its converted children. */
    protected void writeHeader(int level, def node) {
        if (node.text()) {
            writeHeader(level) {
                write(node.text().trim())
            }
        } else {
            writeHeader(level) {
                walkThrough(node)
            }
        }
    }

    /**
     * Writes a header surrounded by blank lines.
     *
     * @param level     header level relative to the page
     * @param processor writes the header text
     * @param ref       optional anchor id, appended as {#ref} when a real header was written
     */
    protected void writeHeader(int level, Closure processor, String ref = null) {
        assertBlankLine()
        final int headerLevel = effectiveLevel(level)
        boolean isHeader = writeHeaderBeginTags(headerLevel)
        processor()
        writeHeaderEndTags(headerLevel)
        if (ref && isHeader) writeRaw(" {#${ref}}")
        assertBlankLine()
    }

    /** @return the header level to use, shifted by the page depth when docHierarchy is enabled */
    protected int effectiveLevel(int level) {
        return docHierarchy ? depth + (titleRootPage ? level : level - 1) : level
    }

    /** Writes as many line breaks as needed so that the output ends with a blank line. */
    protected void assertBlankLine() {
        if (!blankLine) {
            if (!newLine) writeln()
            writeln()
        }
    }

    protected boolean writeHeaderBeginTags(int level) {
        return writeHeaderTags(level)
    }

    protected boolean writeHeaderEndTags(int level) {
        return writeHeaderTags(level)
    }

    /**
     * Writes level times '#', or "__" when headers are not possible (not in Default
     * mode or deeper than maxHeaderDepth).
     *
     * @return true when real header tags were written
     */
    protected boolean writeHeaderTags(int level) {
        if (mode != Default || level > maxHeaderDepth) {
            writeRaw("__")
            return false
        } else {
            level.times { writeRaw('#') }
            return true
        }
    }

    /**
     * Converts a single node.
     *
     * @return true when the node (including its children) has been handled completely,
     *         false when the caller still has to walk through the children
     */
    protected boolean handle(Node node) {
        switch (node.name()) {
            case "H1":
                writeHeader(2, node)
                return true
            case "H2":
                writeHeader(3, node)
                return true
            case "H3":
                writeHeader(4, node)
                return true
            case "H4":
                writeHeader(5, node)
                return true
            case "H5":
                writeHeader(6, node)
                return true
            case "H6":
                writeHeader(7, node)
                return true
            case "P":
                if (mode != Mode.Table && listIndent == 0) {
                    assertBlankLine()
                }
                walkThrough(node)
                if (mode != Mode.Table) writeln()
                return true
            case "A":
                def href = node.attributes()["href"]
                if (href) {
                    writeRaw("[")
                    walkThrough(node)
                    writeRaw("]")
                    writeRaw("(")
                    writeRaw(href as String)
                    writeRaw(")")
                } // else --> anchor not yet supported
                return true
            case "AC:LINK":
                return handleLink(node)
            case "AC:STRUCTURED-MACRO":
                return handleStructuredMacro(node)
            case "AC:PARAMETER":
                return true // skip
            case "BLOCKQUOTE":
                if (mode != Mode.Table) {
                    walkAsBlockQuote(node)
                    return true
                }
                break
            case "AC:RICH-TEXT-BODY":
                if (mode == Default) {
                    String panelText = intoString {
                        withMode(Panel) {
                            walkThrough(node)
                        }
                    }
                    writeBufferedBlock(panelText)
                    return true
                }
                break
            case "PRE":
                if (mode == Default) {
                    String quoted = intoString {
                        walkAsBlockQuote(node)
                    }
                    writeBufferedBlock(quoted)
                    return true
                }
                break
            case "CODE":
            case "AC:PLAIN-TEXT-BODY": // codeblock
                // Problem: a codeblock inside a table is currently not supported,
                // see http://comments.gmane.org/gmane.text.pandoc/5170
                if (mode != Default) {
                    log("WARN code block nested in $mode currently not supported: " + node.text())
                    write(node.text())
                } else {
                    withMode(CodeBlock) {
                        writeRaw("\n\n~~~~~~~\n")
                        write(node.text())
                        writeRaw("\n~~~~~~~\n")
                    }
                }
                return true
            case "UL":
                withList {
                    itemNumber = null
                    walkThrough(node)
                }
                return true
            case "OL":
                withList {
                    itemNumber = 1
                    walkThrough(node)
                }
                return true
            case "LI":
                assertBlankLine()
                ((listIndent - 1) * 2).times { writeRaw(' ') }
                if (itemNumber != null) {
                    write("${itemNumber}. ")
                    itemNumber = itemNumber + 1
                } else {
                    writeRaw("+ ")
                }
                break // only the bullet is written here; the caller walks the item content
            case "TABLE":
                // we use pipe_tables. Pandoc says: "The cells of pipe tables cannot contain block
                // elements like paragraphs and lists, and cannot span multiple lines."
                // see http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#extension-pipe_tables
                // At least multiple lines are possible, with a \\ at the end of the line.
                // But paragraphs and list corrupt the table!
                if (mode == Mode.Table) { // nested tables not supported by pandoc
                    String nestedTable = intoString {
                        walkTable(node)
                    }
                    log("WARN nested table not supported: $nestedTable")
                    writeRaw("{table}" + nestedTable.replace("|", ",").replace('\n', ';') + "{/table}")
                } else {
                    walkTable(node)
                }
                return true
            case "TBODY": // ignore
                break
            case "TR":
                table.rows << new Row()
                walkThrough(node)
                writeRaw("|\n")
                if (table.rows.size() == 1) {
                    // the separator line follows the first row of the table
                    table.row.renderSeparator(this)
                }
                return true
            case "TD":
            case "TH":
                writeRaw("|")
                table.row.cells << new Cell(node)
                table.row.cell.render(this)
                return true
            case "BR":
                writeln()
                return true
            case "SPAN": // ignore
                walkThrough(node)
                if (written instanceof String) {
                    /* char(160) = non-breaking space */
                    if (!written.endsWith(' ') && !written.endsWith("\u00A0")) {
                        written = 1 // space maybe to be written
                    }
                }
                return true
            case "EM":
            case "I": // italic = emphasis
            case "U": // underline: not yet supported. using italic
                String emphasized = writeMovedSpaces(intoString { walkThrough(node) })
                writeRaw("_")
                writeRaw(emphasized)
                writeRaw("_ ")
                return true
            case "STRONG":
            case "B":
                if (strong) {
                    // avoid duplication of ** because **** would not work,
                    // this can happen when strong markup is nested
                    return false
                }
                strong = true
                String bold = writeMovedSpaces(intoString { walkThrough(node) })
                writeRaw("**")
                writeRaw(bold)
                writeRaw("**")
                strong = false
                return true
            case "AC:IMAGE":
                return handleImage(node)
            case "S": // strikeout
                writeRaw("~~")
                String struck = intoString {
                    walkThrough(node)
                }
                writeRaw(struck.trim())
                writeRaw("~~")
                return true
            case "HR":
                writeRaw("\n---\n")
                return true
            case "AC:EMOTICON":
                def icon = node.attributes()["ac:name"]
                String replacement = EMOTICONS[icon]
                write(replacement ?: "($icon)".toString())
                return true
            case "DIV":
                break // ignore
            case "AC:MACRO": // e.g. plantUML
                def macro = node.attributes()['ac:name']
                switch (macro) {
                    case "plantuml":
                        /* ac:title: not yet tested if tag name correct */
                        plantUML(node, (node.attributes()['ac:title']) as String)
                        break
                    default:
                        log("Unknown macro tag ${node.name()} = ${macro}")
                }
                return true
            case "COL":
            case "COLGROUP":
            case "CITE":
            default:
                log("Unhandled tag ${node.name()} = ${node.text()}")
        }
        return false
    }

    /**
     * Converts an AC:LINK node. Links to pages in scope become links to the page
     * anchor; any other link is written as emphasized text.
     *
     * @return always true
     */
    protected boolean handleLink(Node node) {
        String linkText = null
        String linkUrl = null
        def child = getFirstChildNamed(node, "RI:PAGE")
        if (child) {
            linkUrl = child.attributes()["ri:content-title"]
            if (pageIdCache.get(linkUrl)) {
                linkText = linkUrl
                linkUrl = "PAGE_" + pageIdCache.get(linkUrl)
            } else {
                log("Link out of scope to page: \'$linkUrl\'")
                linkText = linkUrl
                linkUrl = null
            }
        }
        child = getFirstChildNamed(node, "AC:LINK-BODY")
        if (child) {
            linkText = child.text()
        } else {
            child = getFirstChildNamed(node, "AC:PLAIN-TEXT-LINK-BODY")
            if (child) {
                linkText = child.text()
            }
        }
        if (!linkText) {
            child = getFirstChildNamed(node, "RI:ATTACHMENT")
            if (child) {
                linkText = child.attributes()["ri:filename"]
            }
        }
        if (linkUrl) {
            writeRaw("[")
            write(linkText ?: linkUrl)
            writeRaw("](#")
            writeRaw(linkUrl)
            writeRaw(")")
        } else if (linkText) {
            writeRaw("_")
            write(linkText)
            writeRaw("_ ")
        }
        return true
    }

    /**
     * Converts an AC:STRUCTURED-MACRO node (code, noformat, panel, info, warning).
     *
     * @return true when the macro was written completely; false when the caller has
     *         to walk through the children (e.g. the plain text body of a code macro)
     */
    protected boolean handleStructuredMacro(Node node) {
        def macroName = node.attributes().get('ac:name')
        List parameters = getChildrenNamed(node, "AC:PARAMETER")
        Node titleNode = getParameterWithAttribute(parameters, "ac:name", "title")
        String title = null
        if (titleNode) {
            title = titleNode.text().trim()
        }
        switch (macroName) {
            case "code":
                if (runPlantUml && title && (title.equalsIgnoreCase(PLANT_UML) ||
                        StringUtils.startsWithIgnoreCase(title, PLANTUML_PREFIX1) ||
                        StringUtils.startsWithIgnoreCase(title, PLANTUML_PREFIX2))) {
                    Node umlNode = getFirstChildNamed(node, "AC:PLAIN-TEXT-BODY")
                    if (umlNode) {
                        String imageTitle = null
                        if (StringUtils.startsWithIgnoreCase(title, PLANTUML_PREFIX1)) {
                            imageTitle = title.substring(PLANTUML_PREFIX1.length())
                        } else if (StringUtils.startsWithIgnoreCase(title, PLANTUML_PREFIX2)) {
                            imageTitle = title.substring(PLANTUML_PREFIX2.length())
                        }
                        plantUML(umlNode, imageTitle)
                        return true
                    }
                }
                // deliberate fall-through: an ordinary code macro is treated like noformat
            case "noformat":
            case "panel":
            case "info":
            case "warning":
                if (title) {
                    assertBlankLine()
                    writeRaw("**")
                    if (macroName == "info") write("(i) ")
                    else if (macroName == "warning") write("(!) ")
                    write(title)
                    writeRaw("**")
                    writeln()
                }
                if (mode != Mode.Table && macroName != "code" && macroName != "noformat") {
                    walkAsBlockQuote(node)
                    return true
                }
                break
            case "section": // table of contents
            default:
                log("WARN: '$macroName' structured-macro not supported")
        }
        return false
    }

    /**
     * Converts an AC:IMAGE node. Attached images are downloaded and referenced by
     * their local path, images given by URL are referenced directly.
     *
     * @return always true
     */
    protected boolean handleImage(Node node) {
        String title = node.attributes()["ac:title"]
        String url = null
        def child = getFirstChildNamed(node, "RI:ATTACHMENT")
        if (child) { // attached image
            String fileName = child.attributes()["ri:filename"]
            title = title ?: fileName
            Node page = getFirstChildNamed(child, "RI:PAGE")
            def pageId
            if (page) {
                String ownerTitle = page.attributes()["ri:content-title"]
                String ownerSpace = page.attributes()["ri:space-key"]
                pageId = ownerTitle ? queryPageIdByTitle(ownerTitle, ownerSpace) : currentPage.id
                if (!pageId) pageId = currentPage.id // fall back to the current page
            } else {
                pageId = currentPage.id
            }
            def attachments = pageId ? getAttachments(pageId) : null
            def attachment = findAttachmentTitled(attachments, fileName)
            if (!attachment) {
                log("WARN: Cannot find attachment $fileName")
            } else {
                url = downloadedFile(attachment).path
            }
        } else {
            child = getFirstChildNamed(node, "RI:URL")
            if (child) { // image by URL
                url = child.attributes()["ri:value"]
            }
        }
        assertBlankLine()
        writeRaw("![")
        write(title ?: url)
        writeRaw("](")
        write(url)
        writeRaw(")\n")
        return true
    }

    /** Walks through the node in BlockQuote mode, one quote level deeper than the current one. */
    protected void walkAsBlockQuote(Node node) {
        // computed before the mode switches: a nested quote does not need another blank line
        def needBlankLine = (mode != BlockQuote)
        withMode(BlockQuote) {
            blockIndent++
            if (needBlankLine) assertBlankLine()
            walkThrough(node)
            blockIndent--
        }
    }

    /** Walks through a TABLE node in Table mode with a fresh Table, surrounded by blank lines. */
    protected void walkTable(Node node) {
        assertBlankLine()
        withMode(Mode.Table) {
            def oldTable = table
            table = new Table()
            walkThrough(node)
            table = oldTable
        }
        assertBlankLine()
    }

    /** Writes text that has been rendered into a buffer as a block of its own; empty text is skipped. */
    protected void writeBufferedBlock(String text) {
        if (text) {
            if (!newLine) writeln()
            writeRaw(text)
            assertBlankLine()
        }
    }

    Node getParameterWithAttribute(List parameters, String attributeName, String attributeValue) {
        return parameters.find({ it.attributes()[attributeName] == attributeValue })
    }

    /**
     * Writes the leading spaces of the markdown directly, because a space must not
     * directly follow an opening ** or _.
     *
     * @return the markdown without its leading spaces
     */
    protected String writeMovedSpaces(String markdown) {
        int idx = 0
        while (markdown.length() > idx && markdown.charAt(idx) == ' ') {
            idx++
            writeRaw(" ")
        }
        if (idx > 0) {
            markdown = markdown.substring(idx)
        }
        markdown
    }

    /**
     * Renders the PlantUML source of the node into a PNG inside downloadFolder (when
     * runPlantUml is set) and writes an image reference to it. A source starting with
     * "!include " is replaced by the content of the named attachment ("page^attachment"
     * addresses an attachment of another page).
     */
    protected void plantUML(Node node, String title = null) {
        def text = node.text()
        imageCounter++
        File img = new File(downloadFolder, "puml${currentPage.id}_${imageCounter}.png")
        if (runPlantUml) {
            if (text.trim().startsWith("!include ")) {
                log("Looking for plantUML-!include for ${img.path} with $text")
                def pumlAttachment = text.trim().substring("!include ".length())
                def page
                def idx = pumlAttachment.indexOf("^")
                if (idx >= 0) {
                    page = pumlAttachment.substring(0, idx)
                    pumlAttachment = pumlAttachment.substring(idx + 1)
                } else {
                    page = null
                }
                def attachments
                if (page) {
                    def pageId = queryPageIdByTitle(page)
                    if (pageId) {
                        attachments = getAttachments(pageId)
                    }
                } else {
                    attachments = getAttachments(currentPage.id)
                }
                def attachment
                if (attachments) {
                    attachment = findAttachmentTitled(attachments, pumlAttachment)
                }
                if (attachment) {
                    File pumlFile = downloadedFile(attachment)
                    if (pumlFile) {
                        text = FileUtils.readFileToString(pumlFile)
                    }
                }
            }
            if (!text.contains("@startuml") && !text.contains("@startdot")) text = "@startuml\n" + text
            if (!text.contains("@enduml") && !text.contains("@enddot")) text += "\n@enduml"
            log("Running PlantUml on ${img.path} with \n$text")
            def reader = new SourceStringReader(text)
            // the folder may not exist yet when no attachment has been downloaded before
            if (!downloadFolder.exists()) downloadFolder.mkdirs()
            FileOutputStream file = new FileOutputStream(img)
            try {
                reader.generateImage(file)
            } finally {
                file.close()
            }
        } else {
            writeRaw("\n")
        }
        assertBlankLine()
        writeRaw("![")
        write(title ?: img.name)
        writeRaw("](")
        write(img.path)
        writeRaw(")\n")
    }

    /**
     * @return the id of the page with the given title (optionally restricted to a
     *         space), taken from the cache or queried from the server; null when not found
     */
    protected String queryPageIdByTitle(String title, String spaceKey = null) {
        def pageId = pageIdCache.get(title)
        if (pageId) return pageId
        def queryInput = openInput(
                wikiServerUrl +
                        (spaceKey ? QUERY_PAGE_BY_TITLE_AND_SPACE(title, spaceKey) : QUERY_PAGE_BY_TITLE(title)),
                spaceKey ? '$' + spaceKey + '$' + title + ".title.query" : title + ".title.query")
        def page
        try {
            def json = new JsonSlurper().parse(queryInput)
            page = json.results.find { it.title == title }
        } finally {
            close(queryInput)
        }
        if (page) {
            pageId = page.id
            pageIdCache.put(title, pageId)
        }
        return pageId
    }

    /** @return the (cached) attachment listing of the page as parsed JSON */
    protected def getAttachments(def pageId) {
        def attachments = attachmentCache[pageId]
        if (attachments == null) {
            def url = wikiServerUrl + GET_CHILD_ATTACHMENTS(pageId)
            def stream = openInput(url, pageId + ".attachments")
            try {
                attachments = new JsonSlurper().parse(stream)
                attachmentCache[pageId] = attachments
            } finally {
                close(stream)
            }
        }
        return attachments
    }

    /** @return the local copy of the attachment; it is downloaded unless the file already exists */
    protected File downloadedFile(def attachment) {
        if (!downloadFolder.exists()) downloadFolder.mkdirs()
        File targetFile = new File(downloadFolder, attachment.id + "_" + attachment.title)
        if (!targetFile.exists()) { // speed up - use existing file
            def downloadUrl = wikiServerUrl + attachment._links.download
            log("Downloading '${targetFile.name}' from '$downloadUrl'")
            def stream = openStream(downloadUrl)
            try {
                FileUtils.copyInputStreamToFile(stream, targetFile)
            } finally {
                stream.close()
            }
        } else {
            log("Found downloaded file ${targetFile.name}")
        }
        return targetFile
    }

    protected Node getFirstChildNamed(Node node, String name) {
        return node.children().find { child -> (child instanceof Node && name == child.name()) }
    }

    protected List getChildrenNamed(Node node, String name) {
        return node.children().findAll { child -> (child instanceof Node && name == child.name()) }
    }

    protected def findAttachmentTitled(def attachments, String title) {
        if (attachments == null) return null
        return attachments.results.find { it.title == title }
    }

    /**
     * Opens the given URL, using basic auth when userPassword is set.
     *
     * @param urlString the URL to request
     * @param cache     optional cache name; the response is then stored as
     *                  ".&lt;cache&gt;.json" in downloadFolder and served from there next time
     * @return an InputStream, or the URL itself when neither cache nor userPassword is used
     */
    protected def openInput(String urlString, String cache = null) {
        URL url = new URL(urlString)
        File cacheFile = cache ? new File(downloadFolder, "." + cache + ".json") : null
        if (cacheFile?.exists()) {
            log("Found cached file $cacheFile.name")
            return new FileInputStream(cacheFile)
        }
        log("Requesting $urlString")
        def stream
        if (userPassword) {
            def conn = url.openConnection()
            String basicAuth = "Basic " + DatatypeConverter.printBase64Binary(userPassword.getBytes())
            conn.setRequestProperty("Authorization", basicAuth)
            if (!cache) return conn.inputStream
            stream = conn.inputStream
        } else {
            if (!cache) return url
            stream = url.openStream()
        }
        try {
            FileUtils.copyInputStreamToFile(stream, cacheFile)
        } finally {
            stream.close()
        }
        return new FileInputStream(cacheFile)
    }

    /** Like openInput without cache, but always returns a stream. */
    protected InputStream openStream(String urlString) {
        def stream = openInput(urlString)
        if (stream instanceof URL) stream = stream.openStream()
        return stream
    }

    /**
     * Ends the current line according to the mode (backslash continuation inside
     * tables, prefix for panels and block quotes) and updates newLine/blankLine.
     */
    void writeln() {
        boolean newLineBefore = newLine
        if (mode == Mode.Table) {
            out.println('\\')
            newLine = true
        } else {
            out.println()
            newLine = true
            if (mode == Panel) {
                writeRaw("\n| ")
            } else if (mode == BlockQuote) {
                writeRaw("\n")
                blockIndent.times { writeRaw("> ") }
            }
        }
        blankLine = newLine && newLineBefore
    }

    /** Prints the text to standard out, indented by depth, when verbose is set. */
    void log(String text) {
        if (verbose) {
            (depth * 2).times { print(" ") }
            println(text)
        }
    }

    /** Writes the text after escaping/transforming it for the current mode; empty text is ignored. */
    void write(String text) {
        if (text) {
            writeRaw(transform(text))
        }
    }

    /** Derives newLine/blankLine from the end of the text just written, depending on the mode. */
    protected void computeLineStatus(String text) {
        if (mode == Panel) {
            // NOTE(review): writeln() emits "\n| " in Panel mode while "\n " is tested here - confirm.
            newLine = text.endsWith("\n ")
            blankLine = text.endsWith("\n \n ")
        } else if (mode == BlockQuote) {
            def nl = "\n " + blockQuotePrefix()
            newLine = text.endsWith(nl)
            blankLine = text.endsWith(nl + nl)
        } else if (mode == Mode.Table) {
            text = text.replace('\n', '\\\n')
            newLine = text.endsWith('\\\n')
            blankLine = text.endsWith('\\\n\\\n')
        } else {
            newLine = text.endsWith('\n')
            blankLine = text.endsWith('\n\n')
        }
    }

    /** @return "> " repeated blockIndent times */
    protected String blockQuotePrefix() {
        StringBuffer buf = new StringBuffer(2 * blockIndent)
        blockIndent.times { buf.append("> ") }
        return buf.toString()
    }

    protected static final String[] SEARCH = ['_', '$', '*', '\\', '<', '#', "^[", "*", "`", "{", "}", "[", "]", ">", "#", "+", "-", ".", "!"] as String[]
    protected static final String[] REPLACE = ['\\_', '\\$', '\\*', '\\\\', '\\<', '\\#', "^\\[", "\\*", "\\`", "\\{", "\\}", "\\[", "\\]", "\\>", "\\#", "\\+", "\\-", "\\.", "\\!"] as String[]

    /**
     * Escapes markdown control characters (not inside code blocks or numbered lists)
     * and adapts line breaks to the current mode.
     */
    protected String transform(String text) {
        // escape unwanted footnotes etc.
        if (mode != CodeBlock && itemNumber == null) { // CodeBlock: do not replace most of the things
            text = StringUtils.replaceEach(text, SEARCH, REPLACE)
        }
        if (mode == Panel) {
            text = text.replace("\n", "\n ")
        } else if (mode == BlockQuote) {
            text = text.replace("\n", "\n " + blockQuotePrefix())
        } else if (mode == Mode.Table) {
            text = text.replace('\n', '\\\n')
        }
        return text
    }

    /** Prints the text unmodified and updates the line status; empty text is ignored. */
    void writeRaw(String text) {
        if (text) {
            out.print(text)
            computeLineStatus(text)
        }
    }

    /** Runs the processor one list level deeper; the item numbering of the outer list is restored afterwards. */
    protected void withList(Closure processor) {
        listIndent++
        def oldItemNumber = itemNumber
        try {
            processor()
        } finally {
            itemNumber = oldItemNumber
            listIndent--
        }
        assertBlankLine()
    }

    /** Runs the processor in the given mode; the previous mode is always restored. */
    protected void withMode(final Mode mode, Closure processor) {
        def old = this.mode
        this.mode = mode
        try {
            processor()
        } finally {
            this.mode = old
        }
    }

    /**
     * Closes the input and the output. The standard streams are only flushed, never
     * closed, so that messages logged after the conversion are still printed.
     */
    void close() {
        close(input)
        if (out instanceof Closeable) {
            if (out.is(System.out) || out.is(System.err)) {
                out.flush()
            } else {
                out.close()
            }
        }
    }

    /** Closes the given object when it is Closeable; anything else (URL, File, null) is ignored. */
    void close(def inputThing) {
        if (inputThing instanceof Closeable) inputThing.close()
    }
}