
// laika.rst.InlineParsers.scala (Maven / Gradle / Ivy)
/*
* Copyright 2013-2016 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package laika.rst
import laika.ast._
import laika.bundle.{SpanParser, SpanParserBuilder}
import laika.collection.TransitionalCollectionOps._
import laika.parse.markup.RecursiveSpanParsers
import laika.parse.text.TextParsers._
import laika.parse.text.{DelimitedText, DelimiterOptions}
import laika.parse.uri.{AutoLinkParsers, URIParsers}
import laika.parse.{Failure, Parser, Success}
import laika.rst.BaseParsers._
import laika.rst.ast.{InterpretedText, ReferenceName, SubstitutionReference}
/** Provides all inline parsers for reStructuredText.
*
* Inline parsers deal with markup within a block of text, such as a
* link or emphasized text. They are used in the second phase of parsing,
* after the block parsers have cut the document into a (potentially nested)
* block structure.
*
* @author Jens Halm
*/
object InlineParsers {
/** Parses an escaped character.
 *
 *  An escaped space produces the empty string, which removes it from the
 *  output as reStructuredText requires. Any other single character is
 *  produced unchanged.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#escaping-mechanism]].
 */
val escapedChar: Parser[String] = {
  val escapedSpace = " " ^^^ ""
  val anySingleChar = any.take(1)
  escapedSpace | anySingleChar
}
/** Lookup table from a character to the set of characters treated as its matching counterpart.
 *
 *  The table is built by grouping the (character, counterpart) tuples below by their
 *  first element, so a character that is listed more than once maps to all of its
 *  counterparts.
 *
 *  It is consulted by `afterStartMarkup`, which rejects start markup that is immediately
 *  followed by a counterpart of the character preceding the markup.
 */
private val pairs: Map[Char, Set[Char]] = List(/* Ps/Pe pairs */
'('->')', '['->']', '{'->'}', '<'->'>', '"'->'"', '\''->'\'',
'\u0f3a'->'\u0f3b', '\u0f3c'->'\u0f3d', '\u169b'->'\u169c', '\u2045'->'\u2046',
'\u207d'->'\u207e', '\u208d'->'\u208e', '\u2329'->'\u232a', '\u2768'->'\u2769', '\u276a'->'\u276b',
'\u276c'->'\u276d', '\u276e'->'\u276f', '\u2770'->'\u2771', '\u2772'->'\u2773', '\u2774'->'\u2775',
'\u27c5'->'\u27c6', '\u27e6'->'\u27e7', '\u27e8'->'\u27e9', '\u27ea'->'\u27eb', '\u27ec'->'\u27ed',
'\u27ee'->'\u27ef', '\u2983'->'\u2984', '\u2985'->'\u2986', '\u2987'->'\u2988', '\u2989'->'\u298a',
'\u298b'->'\u298c', '\u298d'->'\u298e', '\u298f'->'\u2990', '\u2991'->'\u2992', '\u2993'->'\u2994',
'\u2995'->'\u2996', '\u2997'->'\u2998', '\u29d8'->'\u29d9', '\u29da'->'\u29db', '\u29fc'->'\u29fd',
'\u2e22'->'\u2e23', '\u2e24'->'\u2e25', '\u2e26'->'\u2e27', '\u2e28'->'\u2e29', '\u3008'->'\u3009',
'\u300a'->'\u300b', '\u300c'->'\u300d', '\u300e'->'\u300f', '\u3010'->'\u3011', '\u3014'->'\u3015',
'\u3016'->'\u3017', '\u3018'->'\u3019', '\u301a'->'\u301b', '\u301d'->'\u301e', '\ufd3e'->'\ufd3f',
'\ufe17'->'\ufe18', '\ufe35'->'\ufe36', '\ufe37'->'\ufe38', '\ufe39'->'\ufe3a', '\ufe3b'->'\ufe3c',
'\ufe3d'->'\ufe3e', '\ufe3f'->'\ufe40', '\ufe41'->'\ufe42', '\ufe43'->'\ufe44', '\ufe47'->'\ufe48',
'\ufe59'->'\ufe5a', '\ufe5b'->'\ufe5c', '\ufe5d'->'\ufe5e', '\uff08'->'\uff09', '\uff3b'->'\uff3d',
'\uff5b'->'\uff5d', '\uff5f'->'\uff60', '\uff62'->'\uff63',
/* Pi/Pf pairs */
'\u00ab'->'\u00bb', '\u2018'->'\u2019', '\u201c'->'\u201d', '\u2039'->'\u203a', '\u2e02'->'\u2e03',
'\u2e04'->'\u2e05', '\u2e09'->'\u2e0a', '\u2e0c'->'\u2e0d', '\u2e1c'->'\u2e1d', '\u2e20'->'\u2e21',
/* Pi/Pf pairs reverse */
'\u00bb'->'\u00ab', '\u2019'->'\u2018', '\u201d'->'\u201c', '\u203a'->'\u2039', '\u2e03'->'\u2e02',
'\u2e05'->'\u2e04', '\u2e0a'->'\u2e09', '\u2e0d'->'\u2e0c', '\u2e1d'->'\u2e1c', '\u2e21'->'\u2e20',
/* pairs added explicitly in the reStructuredText ref impl */
'\u301d'->'\u301f', '\u201a'->'\u201b', '\u201e'->'\u201f', '\u201b'->'\u201a', '\u201f'->'\u201e',
/* additional pairing of open/close quotes for different typographic conventions in different languages */
'\u00bb'->'\u00bb', '\u2018'->'\u201a', '\u2019'->'\u2019', '\u201a'->'\u2018', '\u201a'->'\u2019',
'\u201c'->'\u201e', '\u201e'->'\u201c', '\u201e'->'\u201d', '\u201d'->'\u201d', '\u203a'->'\u203a')
.groupBy(_._1).mapValuesStrict(_.map(_._2).toSet)
/** Explicitly listed characters accepted in front of start markup, matched as exactly one character. */
private val startChars = anyOf(' ', '-', ':', '/', '\'', '"', '<', '(', '[', '{', '\n').take(1)

/** Character categories (values of `Character.getType`) accepted in front of start markup. */
private val startCategories = Set[Int](
  Character.DASH_PUNCTUATION,
  Character.OTHER_PUNCTUATION,
  Character.START_PUNCTUATION,
  Character.INITIAL_QUOTE_PUNCTUATION,
  Character.FINAL_QUOTE_PUNCTUATION
)

/** Explicitly listed characters accepted right after end markup, matched as exactly one character. */
private val endChars = anyOf(
  ' ', '-', '.', ',', ':', ';', '!', '?', '\\', '/', '\'', '"', '>', ')', ']', '}', '\u201a', '\u201e'
).take(1)

/** Character categories (values of `Character.getType`) accepted right after end markup. */
private val endCategories = Set[Int](
  Character.DASH_PUNCTUATION,
  Character.OTHER_PUNCTUATION,
  Character.END_PUNCTUATION,
  Character.INITIAL_QUOTE_PUNCTUATION,
  Character.FINAL_QUOTE_PUNCTUATION
)
/** Parses the markup at the start of an inline element according to the
 *  reStructuredText markup recognition rules.
 *
 *  The parser first inspects the input behind the current position: either the
 *  character two positions back satisfies `beforeStartMarkup`, or the position one
 *  back is the start of the input, in which case a space is used as the preceding
 *  character. That character is handed to `afterStartMarkup` together with the
 *  `start` parser. Finally the `end` parser must not succeed right away.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules]].
 *
 *  @param start the parser that recognizes the markup at the start of an inline element
 *  @param end the parser that recognizes the markup at the end of an inline element, needed to verify
 *             the start sequence is not immediately followed by an end sequence, as empty elements
 *             are not allowed
 *  @return a parser without a useful result, as it is only needed to verify it succeeds
 */
def markupStart (start: Parser[Any], end: Parser[String]): Parser[Any] = {
  val precedingChar = lookBehind(2, beforeStartMarkup) | lookBehind(1, atStart ^^^ ' ')
  val validStart = precedingChar >> afterStartMarkup(start)
  validStart ~ not(end) // not(end) == rule 6
}
/** Parses the start of an inline element that has no specific start markup,
 *  according to the reStructuredText markup recognition rules.
 *
 *  Equivalent to the two-argument overload with a start parser that always
 *  succeeds without consuming input.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules]].
 *
 *  @param end the parser that recognizes the markup at the end of an inline element, needed to verify
 *             the start sequence is not immediately followed by an end sequence, as empty elements
 *             are not allowed
 *  @return a parser without a useful result, as it is only needed to verify it succeeds
 */
def markupStart (end: Parser[String]): Parser[Any] = {
  val noStartMarkup = success(())
  markupStart(noStartMarkup, end)
}
/** Parses the end markup of an inline element according to the reStructuredText
 *  markup recognition rules.
 *
 *  The `end` parser runs first; the markup end conditions are then verified
 *  relative to the length of the markup it produced.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules]].
 *
 *  @param end the parser that recognizes the markup at the end of an inline element
 *  @return a parser that produces the same result as the parser passed as an argument
 */
def markupEnd (end: Parser[String]): Parser[String] =
  end >> { markup =>
    // the Int overload checks the character in front of the markup and the input right after it
    markupEnd(markup.length) ^^^ markup
  }
/** Creates a delimited-text parser for the specified end delimiter, using the
 *  reStructuredText markup end conditions as the post condition of the delimiter.
 *
 *  @param end the literal end delimiter
 *  @return the delimited-text parser, open for further configuration through `DelimiterOptions`
 */
def delimitedByMarkupEnd (end: String): DelimitedText[String] with DelimiterOptions =
  delimitedBy(end, markupEnd(end.length))
/** Creates a delimited-text parser for the specified end delimiter, using the
 *  reStructuredText markup end conditions followed by an additional, caller-supplied
 *  condition as the post condition of the delimiter.
 *
 *  @param end the literal end delimiter
 *  @param postCondition an extra parser that is sequenced after the markup end conditions
 *  @return the delimited-text parser, open for further configuration through `DelimiterOptions`
 */
def delimitedByMarkupEnd (end: String, postCondition: Parser[Any]): DelimitedText[String] with DelimiterOptions = {
  val markupEndThenExtra = markupEnd(end.length) ~ postCondition
  delimitedBy(end, markupEndThenExtra)
}
/** Verifies the reStructuredText markup end conditions for an end delimiter that
 *  has already been consumed.
 *
 *  Two checks are sequenced: the character in front of the delimiter must satisfy
 *  `beforeEndMarkup` (rule 3), and the upcoming input must be either the end of the
 *  line or a character accepted by `afterEndMarkup` (rule 4).
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules]].
 *
 *  @param delimLength the length of the end delimiter that has already been consumed from the input
 *  @return a parser that succeeds without consuming any input when the rst markup end conditions are met
 */
def markupEnd (delimLength: Int): Parser[Any] = {
  val charBeforeDelimiter = lookBehind(delimLength + 1, beforeEndMarkup)
  val inputAfterDelimiter = lookAhead(eol | afterEndMarkup)
  charBeforeDelimiter ~ inputAfterDelimiter
}
/** Inline markup recognition rules 2 and 5.
 *
 *  Runs the `start` parser and then looks ahead at the next character, which must
 *  be neither a space, nor a newline, nor one of the counterparts registered in
 *  `pairs` for the character preceding the markup.
 *
 *  @param start the parser for the start markup itself
 *  @param before the character found in front of the start markup
 */
private def afterStartMarkup (start: Parser[Any])(before: Char): Parser[Any ~ String] = {
  val counterparts = pairs.getOrElse(before, Set())
  val forbiddenNext = (counterparts + ' ' + '\n').toList
  val nextCharAllowed = lookAhead(anyBut(forbiddenNext:_*).take(1))
  start ~ nextCharAllowed
}
/** Inline markup recognition rule 3: exactly one character that is neither a space nor a newline.
 */
private val beforeEndMarkup: Parser[String] = anyBut(' ', '\n').take(1)
/** Inline markup recognition rule 1: exactly one character that is either listed in
 *  `startChars` or belongs to one of the `startCategories`; the result is that character.
 */
private val beforeStartMarkup: Parser[Char] = {
  val charInStartCategory = anyWhile(char => startCategories(Character.getType(char))).take(1)
  (startChars | charInStartCategory) ^^ { matched => matched.charAt(0) }
}
/** Inline markup recognition rule 4: exactly one character that is either listed in
 *  `endChars` or belongs to one of the `endCategories`.
 */
private val afterEndMarkup: Parser[Any] = {
  val charInEndCategory = anyWhile(char => endCategories(Character.getType(char))).take(1)
  endChars | charInEndCategory
}
/** Parses a span of emphasized text, delimited by a single `*` on each side.
 *
 *  The start condition rejects a position where the character two positions back
 *  is an asterisk, and the end delimiter is only accepted when it is not followed
 *  by another asterisk. Registered with low precedence.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#emphasis]]
 */
lazy val em: SpanParserBuilder = SpanParser.forStartChar('*').recursive { implicit recParsers =>
  val notAfterAsterisk = not(lookBehind(2, '*'))
  val notBeforeAsterisk = not('*')
  span(notAfterAsterisk, "*", notBeforeAsterisk) ^^ { content => Emphasized(content) }
}.withLowPrecedence
/** Parses a span of text with strong emphasis.
 *
 *  The parser is registered for the start character `*`, expects a further `*`
 *  as its start markup and `**` as the end delimiter.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#strong-emphasis]]
 */
lazy val strong: SpanParserBuilder = SpanParser.forStartChar('*').recursive { implicit recParsers =>
  val content = span('*', "**")
  content ^^ { spans => Strong(spans) }
}
/** Parses the content of an inline element with the specified start markup and end
 *  delimiter, processing escapes in the delimited text, and wraps the resulting
 *  string in a single `Text` span.
 *
 *  @param start the parser for the start markup
 *  @param end the literal end delimiter
 */
private def span (start: Parser[Any], end: String)(implicit recParsers: RecursiveSpanParsers): Parser[List[Span]] = {
  val content = recParsers.escapedText(delimitedByMarkupEnd(end))
  (markupStart(start, end) ~> content) ^^ { text => List(Text(text)) }
}
/** Parses the content of an inline element with the specified start markup and end
 *  delimiter, processing escapes in the delimited text, and wraps the resulting
 *  string in a single `Text` span. The end delimiter is subject to an additional
 *  post condition.
 *
 *  @param start the parser for the start markup
 *  @param end the literal end delimiter
 *  @param postCondition an extra condition sequenced after the markup end conditions
 */
private def span (start: Parser[Any], end: String, postCondition: Parser[Any])(implicit recParsers: RecursiveSpanParsers): Parser[List[Span]] = {
  val content = recParsers.escapedText(delimitedByMarkupEnd(end, postCondition))
  (markupStart(start, end) ~> content) ^^ { text => List(Text(text)) }
}
/** Parses an inline literal element, enclosed in double back ticks.
 *
 *  The delimited text is taken as is (no escape processing is applied here)
 *  and wrapped in a `Literal` span.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-literals]].
 */
lazy val inlineLiteral: SpanParserBuilder = SpanParser.forStartChar('`').standalone {
  val opening = markupStart('`', "``")
  val literalText = delimitedByMarkupEnd("``")
  (opening ~> literalText) ^^ { text => Literal(text) }
}
/** Renders a footnote label as the reference markup it stands for.
 *
 *  @param label the footnote label to render
 *  @return the label enclosed in `[` and `]_`, prefixed with `#` for autonumber
 *          labels and consisting of `*` only for autosymbol labels
 */
private def toSource (label: FootnoteLabel): String = label match {
  case Autonumber            => "[#]_"
  case Autosymbol            => "[*]_"
  case AutonumberLabel(name) => s"[#$name]_"
  case NumericLabel(number)  => s"[$number]_"
}
/** Parses a footnote reference: a footnote label terminated by `]_`.
 *
 *  The result carries the parsed label together with its source markup as
 *  rendered by `toSource`.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#footnote-references]].
 */
lazy val footnoteRef: SpanParserBuilder = SpanParser.forStartChar('[').standalone {
  val enclosedLabel = markupStart("]_") ~> footnoteLabel <~ markupEnd("]_")
  enclosedLabel ^^ { label => FootnoteReference(label, toSource(label)) }
}
/** Parses a citation reference: a simple reference name terminated by `]_`.
 *
 *  The result carries the name together with its source markup.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#citation-references]].
 */
lazy val citationRef: SpanParserBuilder = SpanParser.forStartChar('[').standalone {
  val enclosedName = markupStart("]_") ~> simpleRefName <~ markupEnd("]_")
  enclosedName ^^ { name => CitationReference(name, s"[$name]_") }
}
/** Parses a substitution reference.
 *
 *  After the reference name, three endings are tried in this order:
 *  `|__` produces a link reference with an empty id, `|_` produces a link
 *  reference that uses the name as its id, and a plain `|` produces the
 *  substitution reference itself. In both link cases the substitution
 *  reference becomes the content of the link.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#substitution-references]].
 */
lazy val substitutionRef: SpanParserBuilder = SpanParser.forStartChar('|').standalone {
  val name = markupStart("|") ~> simpleRefName
  name >> { ref =>
    val anonymousLink: Parser[Span] =
      markupEnd("|__") ^^ { _ => LinkReference(List(SubstitutionReference(ref)), "", s"|$ref|__") }
    val namedLink: Parser[Span] =
      markupEnd("|_") ^^ { _ => LinkReference(List(SubstitutionReference(ref)), ref, s"|$ref|_") }
    val plainSubstitution: Parser[Span] =
      markupEnd("|") ^^ { _ => SubstitutionReference(ref) }
    // the longest ending has to be tried first
    anonymousLink | namedLink | plainSubstitution
  }
}
/** Parses an inline internal link target.
 *
 *  The non-empty, escape-processed text between the back ticks is turned into a
 *  `ReferenceName`. The result is a `Text` span showing the original name, with the
 *  normalized name as its id and the style `target`.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-internal-targets]]
 */
lazy val internalTarget: SpanParserBuilder = SpanParser.forStartChar('_').recursive { recParsers =>
  val targetName = recParsers.escapedText(delimitedBy('`').nonEmpty) ^^ ReferenceName
  (markupStart('`', "`") ~> targetName <~ markupEnd(1)) ^^ { name =>
    Text(name.original, Id(name.normalized) + Styles("target"))
  }
}
/** Parses an interpreted text element with the role name as a prefix.
 *
 *  The role name is followed by a colon and the non-empty, escape-processed text
 *  enclosed in back ticks.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#interpreted-text]]
 */
lazy val interpretedTextWithRolePrefix: SpanParserBuilder = SpanParser.forStartChar(':').recursive { recParsers =>
  val roleName = markupStart(":") ~> simpleRefName
  val content = ":`" ~> recParsers.escapedText(delimitedBy('`').nonEmpty) <~ markupEnd(1)
  (roleName ~ content) ^^ { case role ~ text => InterpretedText(role, text, s":$role:`$text`") }
}
/** Parses an interpreted text element with the role name as an optional suffix.
 *
 *  The non-empty, escape-processed text in back ticks may be followed by a role
 *  name enclosed in colons. Registered with low precedence.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#interpreted-text]]
 *
 *  @param defaultTextRole the role to use when the text is not followed by a role name
 */
def interpretedTextWithRoleSuffix (defaultTextRole: String): SpanParserBuilder =
  SpanParser.forStartChar('`').recursive { recParsers =>
    val content = markupStart("`") ~> recParsers.escapedText(delimitedBy('`').nonEmpty) <~ markupEnd(1)
    val roleSuffix = opt(":" ~> simpleRefName <~ markupEnd(":"))
    (content ~ roleSuffix) ^^ { case text ~ role =>
      val suffixSource = role.fold("")(name => ":" + name + ":")
      InterpretedText(role.getOrElse(defaultTextRole), text, s"`$text`" + suffixSource)
    }
  }.withLowPrecedence
/** Parses a phrase link reference (enclosed in back ticks).
 *
 *  The link text may be followed by a URL in angle brackets, from which all
 *  spaces and newlines are removed. The reference ends with either `` `_ `` (named)
 *  or `` `__ `` (anonymous). The four combinations produce:
 *
 *  - URL, named: an external link followed by an external link definition
 *  - URL, anonymous: an external link only
 *  - no URL, named: a link reference with the normalized name as its id
 *  - no URL, anonymous: a link reference with an empty id
 *
 *  When a URL is present and the link text is empty, the URL is used as the text.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-references]]
 */
lazy val phraseLinkRef: SpanParserBuilder = SpanParser.forStartChar('`').recursive { recParsers =>

  def textOrUrl (text: String, target: String) = if (text.isEmpty) target else text

  val explicitUrl = '<' ~> delimitedBy('>') ^^ { _.replaceAll("[ \n]+", "") }
  val linkText = recParsers.escapedText(delimitedBy('`','<').keepDelimiter) ^^ ReferenceName
  val isNamed = markupEnd("`__") ^^^ false | markupEnd("`_") ^^^ true

  markupStart("`") ~> linkText ~ opt(explicitUrl) ~ isNamed ^^ {
    case name ~ Some(target) ~ true =>
      val link = ExternalLink(List(Text(textOrUrl(name.original, target))), target)
      val definition = ExternalLinkDefinition(textOrUrl(name.normalized, target), target)
      SpanSequence(List(link, definition))
    case name ~ Some(target) ~ false =>
      ExternalLink(List(Text(textOrUrl(name.original, target))), target)
    case name ~ None ~ true =>
      LinkReference(List(Text(name.original)), name.normalized, s"`${name.original}`_")
    case name ~ None ~ false =>
      LinkReference(List(Text(name.original)), "", s"`${name.original}`__")
  }
}
/** Parses a simple link reference.
 *
 *  The parser is registered for the character `_`. An optional second underscore
 *  is consumed first, giving `__` or `_` as the markup. The reference name is then
 *  read from the reversed input, starting behind the markup. A single underscore
 *  yields a link reference with the normalized name as its id, a double underscore
 *  one with an empty id. Registered with low precedence.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-references]]
 */
lazy val simpleLinkRef: SpanParserBuilder = SpanParser.forStartChar('_').standalone {
  val trailingMarkup = markupEnd('_' ^^^ "__" | success("_"))
  trailingMarkup >> { markup =>
    reverse(markup.length, simpleRefName <~ reverseMarkupStart) ^^ { refName =>
      val reference = markup match {
        case "_"  => LinkReference(List(Text(refName)), ReferenceName(refName).normalized, s"${refName}_")
        case "__" => LinkReference(List(Text(refName)), "", s"${refName}__")
      }
      // the fallback text is the markup itself in both cases
      Reverse(refName.length, reference, Text(markup))
    }
  }
}.withLowPrecedence
/** Runs a parser against the reversed input and restores the direction of its result.
 *
 *  The input is reversed and advanced by `offset` before `p` is applied. On success
 *  the result string is reversed. In both outcomes the original input is handed on
 *  unchanged, so this parser never consumes anything.
 *
 *  @param offset the number of characters to skip in the reversed input
 *  @param p the parser to apply to the reversed input
 */
private def reverse (offset: Int, p: => Parser[String]): Parser[String] = Parser { in =>
  val reversedInput = in.reverse.consume(offset)
  p.parse(reversedInput) match {
    case Success(result, _) => Success(result.reverse, in)
    case Failure(msg, _)    => Failure(msg, in)
  }
}
/** Start-markup check for parsers that run on reversed input: looks ahead for either
 *  the end of the input or a character accepted by `beforeStartMarkup`.
 */
private lazy val reverseMarkupStart: Parser[Any] = {
  val endOfInputOrStartChar = eof | beforeStartMarkup
  lookAhead(endOfInputOrStartChar)
}
/** Turns the (start, separator, end) parts of a link candidate into an external link span,
 *  after stripping punctuation from the outer edges.
 *
 *  Leading characters of the start part and trailing characters of the end part are
 *  dropped while they belong to the respective punctuation set. The link text is the
 *  concatenation of the trimmed parts. When the separator is `@` and the text does not
 *  already start with `mailto:`, that scheme is prepended to the link target.
 *
 *  NOTE(review): no caller of this method is visible in this file; the same character
 *  sets are also passed to `AutoLinkParsers` — confirm whether this is still needed.
 *
 *  @param p the parser producing the three parts of the link candidate
 */
private def trim (p: Parser[(String,String,String)]): Parser[Span] = p >> { res => Parser { in =>
  val leadingPunctuation = Set('-',':','/','\'','(','{')
  val trailingPunctuation = Set('-',':','/','\'',')','}','.',',',';','!','?')
  val (start, sep, end) = res
  val startTrimmed = start.dropWhile(leadingPunctuation)
  val endTrimmed = end.reverse.dropWhile(trailingPunctuation).reverse
  val uri = startTrimmed + sep + endTrimmed
  val uriWithScheme = if (sep == "@" && !uri.startsWith("mailto:")) "mailto:"+uri else uri
  // NOTE(review): this offset is zero or negative; presumably it moves the input back
  // over the trailing characters that were trimmed — confirm against `consume`
  val nextIn = in.consume(endTrimmed.length - end.length)
  val link = ExternalLink(List(Text(uri)), uriWithScheme)
  Success(Reverse(startTrimmed.length, link, Text(sep+endTrimmed)), nextIn)
}}
/** The auto-link parsers configured for reStructuredText, backing `uri` and `email`.
 *
 *  NOTE(review): the two character sets are presumably the punctuation stripped from
 *  the start and the end of a link candidate — confirm against the constructor of
 *  `AutoLinkParsers`.
 */
private val autoLinks = {
  val leadingPunctuation = Set('-',':','/','\'','(','{')
  val trailingPunctuation = Set('-',':','/','\'',')','}','.',',',';','!','?')
  new AutoLinkParsers(reverseMarkupStart, afterEndMarkup, leadingPunctuation, trailingPunctuation)
}
/** Parses a standalone HTTP or HTTPS hyperlink (with no surrounding markup).
 *
 *  Delegates to the `http` parser of the `AutoLinkParsers` instance configured
 *  in this object.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#standalone-hyperlinks]]
 */
lazy val uri: SpanParserBuilder = autoLinks.http
/** Parses a standalone email address (with no surrounding markup).
 *
 *  Delegates to the `email` parser of the `AutoLinkParsers` instance configured
 *  in this object.
 *
 *  See [[http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#standalone-hyperlinks]]
 */
lazy val email: SpanParserBuilder = autoLinks.email
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy