org.sweble.wikitext.lazy.parser.Content.rats Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of swc-parser-lazy Show documentation
A parser for MediaWiki's Wikitext.
There is a newer version: 3.1.9
/**
 * Copyright 2011 The Open Source Research Group,
 *                University of Erlangen-Nürnberg
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 *
 * Content
 * -------
 *
 *   Overview:
 *     - First see if it's something special (a link, a magic word, ...)
 *     - If not, it's just text
 *
 *   Closer look:
 *     - Special things always have their unique prefix.
 *     - If such a prefix is encountered in the text, the rule for the
 *       corresponding thing MUST match. Either because we really found
 *       something special, or because the rule just consumed the prefix as 
 *       plain text to let the parser continue with its work.
 *
 *   Even closer look:
 *     - This "must match" requirement helps us escape the content if a prefix
 *       of a closing element is encountered in the text and this prefix is also
 *       the prefix of one of these special things.
 *
 * Paragraph
 * ---------
 *
 *   Grammar:
 *     - ( '\n' Space* &'\n' )+
 *
 *   AST node:
 *     Name        : Paragraph
 *     Extends     : ContentNode
 *     Constructor : content
 *     NodeType    : org.sweble.wikitext.lazy.AstNodeTypes.NT_PARAGRAPH
 *
 */

module org.sweble.wikitext.lazy.parser.Content;

import org.sweble.wikitext.lazy.utils.ContentUtils;
import org.sweble.wikitext.lazy.utils.Roundtrip;
import org.sweble.wikitext.lazy.utils.XmlReference;

import org.sweble.wikitext.lazy.parser.ExternalLink;
import org.sweble.wikitext.lazy.parser.InternalLink;
import org.sweble.wikitext.lazy.parser.MagicWord;
import org.sweble.wikitext.lazy.parser.ParserEntity;
import org.sweble.wikitext.lazy.parser.Signature;
import org.sweble.wikitext.lazy.parser.Ticks;
import org.sweble.wikitext.lazy.parser.Url;
import org.sweble.wikitext.lazy.parser.HorizontalRule;
import org.sweble.wikitext.lazy.parser.List;
import org.sweble.wikitext.lazy.parser.SemiPre;
import org.sweble.wikitext.lazy.parser.Section;
import org.sweble.wikitext.lazy.parser.Table;
import org.sweble.wikitext.lazy.parser.TableHeader;
import org.sweble.wikitext.lazy.parser.TableCell;
import org.sweble.wikitext.lazy.parser.XmlElement;
import org.sweble.wikitext.lazy.parser.Whitespace;




// -- Block content ------------------------------------------------------------

/* BolBlockContent = ( LineStartProd / Paragraph )*
 *
 * Zero or more Block items, collected into a single NodeList.
 */

noinline transient NodeList BolBlockContent =
  blocks:Block*
  {
    yyValue = new NodeList(blocks);
  }
;

/* BlockContent = Paragraph ( LineStartProd / Paragraph )*
 *
 * One mandatory Paragraph followed by any number of Block items. The result
 * is a NodeList built from the paragraph and the trailing blocks.
 */

noinline transient NodeList BlockContent =
  first:Paragraph rest:Block*
  {
    yyValue = new NodeList(first, rest);
  }
;

// A single block item: a LineStartProd or, if that does not match, a
// Paragraph. Nothing is matched at a position where BlockStopper matches.
private inline AstNode Block =
  !BlockStopper ( LineStartProd / Paragraph )
;

// Void production used inside negative predicates to end block content. It
// matches when BlockStopperNextSection or BlockStopperNextTableElement
// matches. Neither is defined in this module (presumably they come from the
// imported Section and Table grammars -- verify).
private inline void BlockStopper =
    BlockStopperNextSection
  / BlockStopperNextTableElement
;

// Dispatch for productions that are recognized by what a line starts with.
// Every alternative first checks with a lookahead (consuming nothing) that the
// characteristic prefix follows pExtSpaceStar / pTpStar, and only then hands
// over to the named production. Alternatives are tried top to bottom.
private noinline transient AstNode LineStartProd =
    &( pExtSpaceStar "{|"  ) Table
  / &( pTpStar       "---" ) HorizontalRule
  / &( pTpStar       [*#:] ) List
  / &( pTpStar       [ ]   ) SemiPre
  / &( pTpStar       [=]   ) Sections
;




// -- Paragraph --[ State Aware Memoization ]-----------------------------------

// State-aware entry point for paragraphs. ParagraphMemoized yields a
// StateAwareResult; the parse result for the current parser context is looked
// up in it and, if absent, produced by ParagraphTransient and stored.
noinline transient AstNode Paragraph =
 ^{
    // Fetch the memoized wrapper for this input position.
    StateAwareResult r = (StateAwareResult) pParagraphMemoized(yyBase);
    final LazyParserContext context = getContext();
    Result yyResult = r.getResult(context);
    // Nothing recorded for this context yet: parse now and remember the result.
    if (yyResult == null)
      yyResult = r.setResult(context, pParagraphTransient(yyBase));
    // NOTE(review): returnTrue(...) presumably always yields true and exists
    // only so that code generated after this action is not rejected as
    // unreachable -- confirm against its definition.
    if (returnTrue(r))
      return yyResult;
  }
;

// Memoized companion of Paragraph. It runs ParagraphTransient once and wraps
// the outcome, together with the current parser context, in a
// StateAwareResult labelled "Paragraph".
noinline memoized AstNode ParagraphMemoized =
 ^{
    Result yyResult = new StateAwareResult("Paragraph", getContext(), pParagraphTransient(yyBase));
    // NOTE(review): returnTrue(...) presumably always yields true and exists
    // only so that code generated after this action is not rejected as
    // unreachable -- confirm against its definition.
    if (returnTrue(yyResult))
      return yyResult;
  }
;




// -- Paragraph ----------------------------------------------------------------

/* Paragraph = PreParaWs ParagraphText PostParaWs?
 *           / PreParaWs
 *           / ParagraphText PostParaWs?
 *
 * The paragraph rule proper (not memoized). Paragraph text, together with
 * any trailing whitespace, is enclosed in an intermediate PARAGRAPH
 * open/close tag pair. Leading whitespace stays in front of the opening tag.
 * Leading whitespace without any text is returned unchanged.
 */

noinline transient AstNode ParagraphTransient =
    // Leading whitespace followed by text, unless a ParagraphStopper is next.
    leadWs:PreParaWs !ParagraphStopper text:ParagraphText trailWs:PostParaWs?
    {
      final NodeList nodes = new NodeList();
      nodes.add(leadWs);
      nodes.add(IntermediateTags.PARAGRAPH.createOpen(false));
      nodes.add(text);
      nodes.add(trailWs);
      nodes.add(IntermediateTags.PARAGRAPH.createClose(false));
      yyValue = nodes;
    }
    // Only whitespace.
  / PreParaWs
    // Text without leading whitespace.
  / text:ParagraphText trailWs:PostParaWs?
    {
      final NodeList nodes = new NodeList();
      nodes.add(IntermediateTags.PARAGRAPH.createOpen(false));
      nodes.add(text);
      nodes.add(trailWs);
      nodes.add(IntermediateTags.PARAGRAPH.createClose(false));
      yyValue = nodes;
    }
;

// Void production used inside negative predicates to keep paragraph rules from
// running into the next block: it matches wherever a BlockStopper or a
// LineStartProd matches.
private inline void ParagraphStopper =
    BlockStopper
  / LineStartProd
;

/* PreParaWs = S* EOL ( S* EOL )*
 *
 * Whitespace in front of a paragraph: one line consisting of optional spaces
 * and a line terminator, followed by further such lines (PreParaWsMore).
 * Everything matched is wrapped in one Whitespace node.
 */

private noinline transient Whitespace PreParaWs =
  spaces:pExtSpaceStar eol:pEol more:PreParaWsMore*
  {
    NodeList matched = new NodeList(spaces, new Text(eol), new NodeList(more));
    yyValue = new Whitespace(matched, true);
  }
;

// One additional whitespace-only line for PreParaWs: optional spaces and a
// line terminator, accepted only where no ParagraphStopper matches.
private inline NodeList PreParaWsMore =
  !ParagraphStopper spaces:pExtSpaceStar eol:pEol
  {
    yyValue = new NodeList(spaces, new Text(eol));
  }
;

/* ParagraphText = Content+ ( EOL Content+ )*
 *
 * The text of a paragraph: a first run of inline content followed by any
 * number of continuation lines (ParagraphTextMore).
 */

private noinline transient AstNode ParagraphText =
  first:InlineContentPlus rest:ParagraphTextMore*
  {
    yyValue = new NodeList(first, rest);
  }
;

// A continuation line of paragraph text: a line terminator, then (unless a
// ParagraphStopper matches) optional spaces and more inline content. The line
// terminator is stored as a Whitespace node in front of the rest.
private inline AstNode ParagraphTextMore =
  eol:pEol !ParagraphStopper spaces:pExtSpaceStar content:InlineContentPlus
  {
    Whitespace lineBreak = new Whitespace(new NodeList(new Text(eol)), true);
    yyValue = new NodeList(lineBreak, spaces, content);
  }
;

/* PostParaWs = EOL ( S* EOL )*
 *
 * Whitespace after a paragraph: the terminator of the last text line plus any
 * following whitespace-only lines (PostParaWsMore), wrapped in one Whitespace
 * node.
 */

private noinline transient Whitespace PostParaWs =
  eol:pEol more:PostParaWsMore*
  {
    NodeList matched = new NodeList(new Text(eol), new NodeList(more));
    yyValue = new Whitespace(matched, true);
  }
;

// One additional whitespace-only line for PostParaWs: optional spaces and a
// line terminator, accepted only where no ParagraphStopper matches.
private inline NodeList PostParaWsMore =
  !ParagraphStopper spaces:pExtSpaceStar eol:pEol
  {
    yyValue = new NodeList(spaces, new Text(eol));
  }
;




// -- For strange things like internal link titles -----------------------------

/* InlineBlockContent = InlineTextBlock? ( LineStartProd / InlineTextBlock )*
 *
 * An optional leading InlineTextBlock (not tried where a BlockStopper
 * matches) followed by any number of InlineBlock items.
 */

noinline transient NodeList InlineBlockContent =
  first:( !BlockStopper InlineTextBlock )? rest:InlineBlock*
  {
    yyValue = new NodeList(first, new NodeList(rest));
  }
;

// A single item of InlineBlockContent: a LineStartProd or, if that does not
// match, an InlineTextBlock. Nothing is matched where BlockStopper matches.
private inline AstNode InlineBlock =
  !BlockStopper ( LineStartProd / InlineTextBlock )
;

/* InlineTextBlock = Content+ ( EOL Content* )*
 *                 /          ( EOL Content* )+
 *
 * Either inline content followed by any number of continuation lines, or at
 * least one continuation line on its own.
 */

private transient AstNode InlineTextBlock =
    first:InlineContentPlus rest:InlineTextBlockMore*
    {
      yyValue = new NodeList(first, rest);
    }
  / rest:InlineTextBlockMore+
    {
      yyValue = new NodeList(rest);
    }
;

// A continuation line of an InlineTextBlock: a line terminator, optionally
// followed by inline content. The content is only tried where no
// ParagraphStopper matches.
private inline AstNode InlineTextBlockMore =
  eol:pEol content:( !ParagraphStopper InlineContentPlus )?
  {
    yyValue = new NodeList(new Text(eol), content);
  }
;




// -- Inline Content -----------------------------------------------------------

// Zero or more inline content items, collected into a NodeList.
transient NodeList InlineContentStar =
  items:InlineContentChoice*
  {
    yyValue = new NodeList(items);
  }
;

// One or more inline content items, collected into a NodeList.
transient NodeList InlineContentPlus =
  items:InlineContentChoice+
  {
    yyValue = new NodeList(items);
  }
;

/* TextPlus will stop on ContentAtoms and ContentStoppers.
 *
 * ContentAtoms will never fail if there is a ContentAtom prefix. If it's not a
 * real ContentAtom or the respective ContentAtom is not allowed in a given
 * context the production responsible for the prefix will consume a part of the
 * prefix and return a Text node containing only the consumed prefix. That's
 * why ContentAtoms will never fail if there is a ContentAtom prefix.
 *
 * Eventually this production will only fail, if it encounters a
 * ContentStopper, that is a syntactic element which is important to the
 * surrounding production and must therefore not be consumed as part of the
 * content.
 *
 * Plain URLs are quite the hack (for performance reasons; they don't have a
 * "proper prefix") and have to be treated specially (see Url grammar for
 * details). The same goes for internal links and their prefix.
 *
 * Alternatives, in the order they are tried:
 *   1. text ':' path, accepted as a plain external link if
 *      PLAIN_EXTERNAL_LINK is accepted and isProtocol(text, path) holds
 *   2. text immediately followed by "[[" and an InternalLink
 *   3. plain text
 *   4. a content atom (not at an InlineContentStopper)
 *   5. a text stopper prefix, consumed as plain text (not at an
 *      InlineContentStopper)
 */

private transient AstNode InlineContentChoice =
    // 1. Plain URL: the text in front of the ':' is the protocol candidate.
    text:TextPlus ':' &{ accept(ParserAtoms.PLAIN_EXTERNAL_LINK) } path:UrlPathString &{ isProtocol(text, path) }
    {
      yyValue = makeExternalLink(text, path);
    }
    // 2. Text directly in front of "[[".
  / text:TextPlus &"[[" !InlineContentStopper link:InternalLink
    {
      // Only a real internal link gets the text attached via addLinkPrefix;
      // any other node is returned next to the text.
      yyValue = link.isNodeType(AstNodeTypes.NT_INTERNAL_LINK) ?
          addLinkPrefix(text, (InternalLink) link) :
          new NodeList(text, link);
    }
    // 3. Plain text.
  / TextPlus
    // 4. A content atom.
  / !InlineContentStopper InlineContentAtom
    // 5. Fallback: the stopper prefix itself becomes a Text node.
  / !InlineContentStopper text:TextStopperPrefix
    {
      yyValue = new Text(text);
    }
;


/* The predicates before each ContentAtom are only there to simplify debugging.
 * And sort of to remind us which production is responsible for which prefix.
 *
 * Alternatives are tried top to bottom, so "[[" is offered to InternalLink
 * before the shorter "[" is offered to ExternalLink.
 */

private inline AstNode InlineContentAtom =
    &"~~"     Signature
  / &"[["     InternalLink
  / &"["      ExternalLink
  / &"''"     Ticks
  / &"__"     MagicWord
  / &"<"      XmlElement
  / &"&"      XmlReference
  / &"\uE000" ParserEntity
;

// Void production used inside negative predicates to end inline content. It
// matches at slEol or wherever one of the InlineContentStopper* productions
// matches. Those are not defined in this module (presumably they come from the
// imported link, section and table grammars -- verify).
private inline void InlineContentStopper =
    slEol
  / InlineContentStopperExternalLink
  / InlineContentStopperInternalLink
  / InlineContentStopperHeading
  / InlineContentStopperTableHeader
  / InlineContentStopperTableCell
;

/* Plain text is everything that is not a ContentAtom. (Semantically a Text
 * node is a ContentAtom as well, but it is not part of that production.)
 * Plain text also has to stop at so-called TextStoppers: syntactic elements
 * that matter to the surrounding production and therefore must not be
 * swallowed as text, so that the surrounding production can still see and
 * consume them.
 *
 * TextPlus wraps the characters matched by TextPlusStr in a Text node.
 */

private Text TextPlus =
  chars:TextPlusStr
  {
    yyValue = new Text(chars);
  }
;

// One or more characters, none of which is the start of a TextStopperPrefix.
private transient String TextPlusStr = ( !TextStopperPrefix _ )+ ;

// Everything plain text must stop in front of: stop characters that matter to
// a surrounding production, and the prefixes of content atoms. The
// alternatives form an ordered choice, so each literal is listed only once.
// ":" serves two purposes and carries both rationales.
private inline String TextStopperPrefix =

  // -- stop characters --

    pSlEol          // block level end (lists, semi pre)
                    // heading abort
                    // external link title abort
                    // table element start
                    // table cell/header inline element end
  / "="             // heading end
  / "|"             // internal link parameter end
                    // table cell inline element end
  / "!"             // table header inline element end
  / "]"             // external and internal link title end
  / ":"             // Definition list definition (in the pair TERM:DEF)
                    // plain url (content atom prefix)

  // -- content atom prefix --

  / "~~"            // signature
  / "["             // internal or external link
  / "''"            // bold or italic
  / "__"            // magic word
  / "<"             // xml tag and comment
  / "&"             // xml entity
  / "\uE000"        // parser entity
;




// -- End of file --------------------------------------------------------------