/*
 * eu.henkelmann.actuarius.LineParsers.scala
 *
 * Actuarius is a Markdown Processor written in Scala using parser combinators.
 * (Listing taken from the actuarius_2.10 artifact page; Maven / Gradle / Ivy.)
 */
package eu.henkelmann.actuarius

import scala.util.parsing.input.{Position, Reader}
import java.util.StringTokenizer
import scala.collection.mutable.{HashMap, ArrayBuffer, ListBuffer}


/**
 * Base type of every line token produced while tokenizing markdown.
 * A line is split in two parts: the prefix, i.e. the leading characters that
 * identify the kind of line, and the payload, i.e. everything after the prefix.
 *
 * @param prefix  the leading part of the line that indicates the line type
 * @param payload the content of the line following the prefix
 */
sealed abstract class MarkdownLine(val prefix: String, val payload: String) {

    /**
     * Builds a line without a prefix: the prefix is the empty String and the
     * given text becomes the payload.
     */
    def this(c: String) = this("", c)

    /**
     * The complete line as it was read, i.e. the prefix directly followed by
     * the payload.
     */
    def fullLine: String = s"$prefix$payload"
}

/** A chunk of verbatim xml.
 * Strictly speaking this is not a single line: one chunk may hold several
 * lines of xml. It is still handled as a line token because it is produced
 * while parsing with a line scope.
 * The prefix is empty, the whole content is the payload.
 */
case class XmlChunk(content: String) extends MarkdownLine("", content)
/** The underline that follows the text of a setext style header.
 * The prefix is empty, the whole underline is the payload; the header level
 * is supplied by whoever creates the token.
 */
case class SetExtHeaderLine(content: String, headerLevel: Int) extends MarkdownLine("", content)

/**
 * A header line in atx style.
 * The prefix holds the leading hashes, the payload the rest of the line.
 * Offers the cleaned up header text and the header level derived from the
 * prefix.
 */
case class AtxHeaderLine(pre: String, pay: String) extends MarkdownLine(pre, pay) {

    /** The payload without surrounding whitespace (including the newline) and
     * without trailing hashes:
     * "  foo ##  \n" => "foo"
     */
    def trimHashes(): String = {
        val trimmed = payload.trim
        // Index of the last character that is not a hash; -1 when the trimmed
        // payload is empty or consists of hashes only, giving the empty String.
        val lastNonHash = trimmed.lastIndexWhere(_ != '#')
        trimmed.substring(0, lastNonHash + 1).trim
    }

    /** The header level, i.e. the number of characters of the prefix. */
    def headerLevel: Int = prefix.length
}
/** A line that contains nothing but whitespace.
 * The prefix is empty, the whole line is the payload.
 */
case class EmptyLine(content: String) extends MarkdownLine("", content)
/** A line that represents a horizontal ruler.
 * The prefix is empty, the whole line is the payload.
 */
case class RulerLine(content: String) extends MarkdownLine("", content)
/** A line that belongs to a block quote (starts with "> ").
 * The quote marker is the prefix, the quoted text the payload.
 */
case class BlockQuoteLine(pre: String, pay: String) extends MarkdownLine(pre, pay)
/** The first line of an unordered list item (starts with e.g. "   *  ").
 * The indentation and the bullet form the prefix, the item text the payload.
 */
case class UItemStartLine(pre: String, pay: String) extends MarkdownLine(pre, pay)
/** The first line of an ordered list item (starts with "   [NUMBER].  ").
 * The indentation, the number and the dot form the prefix, the item text the
 * payload.
 */
case class OItemStartLine(pre: String, pay: String) extends MarkdownLine(pre, pay)
/** A line of verbatim code or a line continuing a list item.
 * The indentation is the prefix, the remaining text the payload.
 */
case class CodeLine(pre: String, pay: String) extends MarkdownLine(pre, pay)
/** The opening line of a fenced code block that carries a language token:
 * the backtick fence is the prefix, the language token (possibly followed by
 * whitespace) the payload.
 */
case class ExtendedFencedCode(pre: String, pay: String) extends MarkdownLine(pre, pay) {

    /** The language token with surrounding whitespace removed. */
    def languageFormat: String = pay.trim
}
/** A plain fence line of a fenced code block: three backticks followed by
 * optional whitespace.
 * Note that the prefix of the resulting line is empty; the fence text passed
 * as `pre` is stored as the payload.
 */
case class FencedCode(pre: String) extends MarkdownLine("", pre)
/** A line that is none of the more specific line types.
 * The prefix is empty, the whole line is the payload.
 */
case class OtherLine(content: String) extends MarkdownLine("", content)


/** A link (or url) definition that can be referenced by its id.
 *
 * @param id    the id under which the definition is referenced
 * @param url   the target of the link
 * @param title the title of the link, if one was given
 */
case class LinkDefinition(id: String, url: String, title: Option[String])

/** Intermediate result of parsing a link definition: the id and the url are
 * already known, the title is still missing.
 */
case class LinkDefinitionStart(id: String, url: String) {

    /** Completes this stub to a full link definition using the given title. */
    def toLinkDefinition(title: Option[String]): LinkDefinition = LinkDefinition(id, url, title)
}

/**
 * A Reader over MarkdownLines that additionally carries the map of link
 * definitions collected during line parsing, so that the map can be referenced
 * during block parsing.
 * In effect this adds the map to the state of the parser monad.
 *
 * @param lines     the lines that have not been consumed yet
 * @param lookup    the link definitions, keyed by a String
 * @param lineCount the number reported as line of the current position
 */
case class MarkdownLineReader private (lines: Seq[MarkdownLine],
                                       lookup: Map[String, LinkDefinition],
                                       lineCount: Int)
        extends Reader[MarkdownLine] {

    /** Artificial line that marks EOF.
     * As the object is private no other code can refer to it, so every line
     * parser fails on it.
     */
    private object EofLine extends MarkdownLine("\nEOF\n")

    /** Creates a reader for the given lines and link definitions, positioned at line 1. */
    def this(ls: Seq[MarkdownLine], lu: Map[String, LinkDefinition]) = this(ls, lu, 1)

    /** Creates a reader for the given lines with an empty map of link definitions. */
    def this(ls: Seq[MarkdownLine]) = this (ls, Map())

    /** True if no lines are left. */
    def atEnd: Boolean = lines.isEmpty

    /** The current line, or the EOF marker line if no lines are left. */
    def first: MarkdownLine = lines.headOption.getOrElse(EofLine)

    /** A reader for the remaining lines with the line count increased by one;
     * this reader itself if no lines are left.
     */
    def rest: MarkdownLineReader =
        if (atEnd) this
        else new MarkdownLineReader(lines.tail, lookup, lineCount + 1)

    /** The current position: the line count as line, always column 1. */
    def pos: Position = new Position {
        def line: Int   = lineCount
        def column: Int = 1
        protected def lineContents: String = first.fullLine
    }
}

/**
 * Tokenizes single lines.
 * The kind of a markdown line is decided by the way the line begins.
 * The resulting line tokens are later grouped into blocks by the BlockParsers.
 */
trait LineParsers extends InlineParsers {

    /////////////////////////////////
    // Link definition pre-parsing //
    /////////////////////////////////

    /** The beginning of a link definition: up to three spaces of indentation,
     * the id in square brackets, then "]:" followed by `ows`.
     * The resulting id is trimmed and converted to lower case.
     */
    def linkDefinitionId:Parser[String] = {
        val opening:Parser[String] = """ {0,3}\[""".r
        opening ~> markdownText(Set(']'), true) <~ ("]:" ~ ows) ^^ { id => id.trim.toLowerCase }
    }

    /** The url of a link definition: either enclosed in angle brackets (the
     * result is trimmed) or given as plain text up to a space or tab.
     */
    def linkDefinitionUrl:Parser[String] = {
        def bracketedUrl = elem('<') ~> markdownText(Set('>'), true) <~ '>' ^^ { url => url.mkString.trim }
        def plainUrl     = markdownText(Set(' ','\t'), true) ^^ { url => url.mkString }
        bracketedUrl | plainUrl
    }

    /** The title of a link definition: text on a single line enclosed in
     * double quotes, single quotes or parentheses, surrounded by `ows`.
     * The enclosing characters are not part of the result.
     */
    def linkDefinitionTitle:Parser[String] = {
        val doubleQuoted:Parser[String]  = """\"[^\n]*["]""".r
        val singleQuoted:Parser[String]  = """\'[^\n]*\'""".r
        val parenthesized:Parser[String] = """\([^\n]*\)""".r
        // the first and the last character are the delimiters matched by the regexes above
        ows ~> (doubleQuoted | singleQuoted | parenthesized) <~ ows ^^ { delimited =>
            delimited.substring(1, delimited.length - 1)
        }
    }

    /** A link definition that is stripped from the output later on.
     * This is either a complete link definition on one line or the first line
     * of a link definition spanning two lines (the title is missing then).
     */
    def linkDefinitionStart:Parser[(LinkDefinitionStart, Option[String])] =
        linkDefinitionId ~ linkDefinitionUrl ~ opt(linkDefinitionTitle) ^^ {
            case id ~ url ~ title => (LinkDefinitionStart(id, url), title)
        }


    //////////////////////////////////////////
    // Lines for XML Block tokenizing       //
    //////////////////////////////////////////

    /** The first line of an xml block: a line beginning with '<' followed by
     * an xml name. The lookahead consumes nothing, the result is the line as
     * returned by `rest`.
     */
    def xmlBlockStartLine:Parser[String] = guard('<' ~ xmlName) ~> rest

    /** The last line of an xml block: a line beginning with an xml end tag.
     */
    def xmlBlockEndLine:Parser[String] = guard(xmlEndTag) ~> rest

    /** Any line that does not begin with an xml end tag.
     */
    def notXmlBlockEndLine:Parser[String] = not(xmlEndTag) ~> rest


    //////////////////////////////
    // Markdown line tokenizing //
    //////////////////////////////

    /** The underline of a setext style level 1 header: =====
     * (optionally followed by spaces or tabs)
     */
    val setextHeader1:Parser[SetExtHeaderLine] =
        """=+([ \t]*)$""".r ^^ { underline => SetExtHeaderLine(underline, 1) }

    /** The underline of a setext style level 2 header: -----
     * (optionally followed by spaces or tabs)
     */
    val setextHeader2:Parser[SetExtHeaderLine] =
        """((\-)+)([ \t]*)$""".r ^^ { underline => SetExtHeaderLine(underline, 2) }

    /** An atx style header: ### header ###
     * The leading hashes become the prefix, the rest of the line the payload.
     */
    val atxHeader:Parser[AtxHeaderLine] = """#+""".r ~ rest ^^ {
        case hashes ~ text => AtxHeaderLine(hashes, text)
    }

    /** A horizontal rule: up to three spaces, then at least three dashes or at
     * least three asterisks, each optionally followed by spaces or tabs.
     */
    val ruler:Parser[MarkdownLine] =
        """ {0,3}(((-[ \t]*){3,})|((\*[ \t]*){3,}))$""".r ^^ { text => RulerLine(text) }

    /** The start or continuation of a block quote: up to three spaces, a '>'
     * and an optional space form the prefix, the rest of the line the payload.
     */
    val blockquoteLine:Parser[BlockQuoteLine] = """ {0,3}\>( )?""".r ~ rest ^^ {
        case marker ~ text => BlockQuoteLine(marker, text)
    }


    /** The first line of an unordered list item.
     * The prefix consists of up to three spaces, one of the bullets '*', '+'
     * or '-', a space and any further whitespace.
     */
    val uItemStartLine:Parser[UItemStartLine] = (""" {0,3}[\*\+-] [\t\v ]*""".r) ~ rest ^^ {
        case bullet ~ text => UItemStartLine(bullet, text)
    }


    /** The first line of an ordered list item.
     * The prefix consists of up to three spaces, a number, a dot, a space and
     * any further whitespace.
     */
    val oItemStartLine:Parser[OItemStartLine] = (""" {0,3}[0-9]+\. [\t\v ]*""".r) ~ rest ^^ {
        case number ~ text => OItemStartLine(number, text)
    }

    /** An empty line, i.e. the empty string or a line consisting only of
     * spaces and tabs.
     */
    val emptyLine:Parser[MarkdownLine] = """([ \t]*)$""".r ^^ { blanks => EmptyLine(blanks) }

    /** A code example line: any line beginning with four spaces or a tab.
     */
    val codeLine:Parser[CodeLine] = ("    " | "\t") ~ rest ^^ {
        case indentation ~ code => CodeLine(indentation, code)
    }

    /**
     * A fence line of a fenced code block (may be the start or the end of the
     * block): up to three spaces, at least three backticks, optional whitespace.
     */
    val fencedCodeLine:Parser[FencedCode] =
        """ {0,3}\`{3,}[\t\v ]*""".r ^^ { fence => FencedCode(fence) }

    /** The start of a fenced code block with a language token: a fence as
     * accepted by fencedCodeLine directly followed by a language token of word
     * characters and optional whitespace.
     * The full fence line becomes the prefix, the language token the payload.
     */
    val extendedFencedCodeLine:Parser[ExtendedFencedCode] = fencedCodeLine ~ """\w+[\t\v ]*""".r ^^ {
        case fence ~ language => ExtendedFencedCode(fence.fullLine, language)
    }

    /** Accepts any line. Only used after all other line parsers have failed,
     * so that line tokenizing never fails and we do not lose any lines on the
     * way.
     */
    val otherLine:Parser[OtherLine] = rest ^^ { text => OtherLine(text) }

    ///////////////////////////////////////////////////////////////
    // combined parsers for faster tokenizing based on lookahead //
    ///////////////////////////////////////////////////////////////

    /** Tries a setext header level 2 first, then a ruler, then the start of an
     * unordered list item.
     */
    val setext2OrRulerOrUItem:Parser[MarkdownLine] = setextHeader2 | ruler | uItemStartLine

    /** Tries a ruler first, then the start of an unordered list item.
     */
    val rulerOrUItem:Parser[MarkdownLine] = ruler | uItemStartLine

    /** Tries an empty line first, then a code line.
     */
    val emptyOrCode:Parser[MarkdownLine] = emptyLine | codeLine

    /** Tries a fence with a language token first, then a plain fence.
     */
    val fencedCodeStartOrEnd:Parser[MarkdownLine] = extendedFencedCodeLine | fencedCodeLine
}