All Downloads are FREE. Search and download functionalities are using the official Maven repository.

kr.motd.maven.sphinx.dist.CommonMark.blocks.py Maven / Gradle / Ivy

There is a newer version: 2.10.0
Show newest version
from __future__ import absolute_import, unicode_literals

import re
from importlib import import_module
from CommonMark import common
from CommonMark.common import unescape_string
from CommonMark.inlines import InlineParser
from CommonMark.node import Node
from CommonMark.utils import to_camel_case


# Indentation (in columns) at which a line counts as indented code.
CODE_INDENT = 4

# Start conditions for the seven kinds of HTML block described by the
# CommonMark spec, indexed by HTML block type (1-7).  Index 0 is a
# placeholder so that list indices line up with the spec's numbering.
reHtmlBlockOpen = [
    re.compile(r'.'),  # dummy for 0
    # 1: <script>, <pre> or <style>
    re.compile(r'^<(?:script|pre|style)(?:\s|>|$)', re.IGNORECASE),
    # 2: HTML comment
    re.compile(r'^<!--'),
    # 3: processing instruction
    re.compile(r'^<[?]'),
    # 4: declaration such as <!DOCTYPE ...>
    re.compile(r'^<![A-Z]'),
    # 5: CDATA section
    re.compile(r'^<!\[CDATA\['),
    # 6: opening or closing tag of a block-level element
    re.compile(
        r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|'
        r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|'
        r'fieldset|figcaption|figure|footer|form|frame|frameset|h[1-6]|'
        r'head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|'
        r'meta|nav|noframes|ol|optgroup|option|p|param|section|source|'
        r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'
        r'(?:\s|[/]?[>]|$)',
        re.IGNORECASE),
    # 7: any other complete open or closing tag alone on its line.
    # NOTE(review): assumes CommonMark.common exposes OPENTAG and CLOSETAG
    # as regex source strings -- confirm against that module.
    re.compile(
        '^(?:' + common.OPENTAG + '|' + common.CLOSETAG + ')\\s*$',
        re.IGNORECASE),
]

# End conditions for HTML block types 1-5, indexed like reHtmlBlockOpen.
# Types 6 and 7 have no entry: they are ended by a blank line instead
# (see HtmlBlock.continue_).
reHtmlBlockClose = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'<\/(?:script|pre|style)>', re.IGNORECASE),
    re.compile(r'-->'),
    re.compile(r'\?>'),
    re.compile(r'>'),
    re.compile(r'\]\]>'),
]
# A whole line made of three or more '*', '_' or '-' characters (all the
# same kind), each optionally followed by spaces or tabs.
reThematicBreak = re.compile(
    r'^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$')
# First character of any line that might open a block construct; used by
# Parser.incorporate_line to skip the block-start checks on plain text.
reMaybeSpecial = re.compile(r'^[#`~*+_=<>0-9-]')
# Any character that is not a space, tab, form feed, vertical tab, CR or LF.
reNonSpace = re.compile(r'[^ \t\f\v\r\n]')
# Bullet list marker: a single '*', '+' or '-'.
reBulletListMarker = re.compile(r'^[*+-]')
# Ordered list marker: 1-9 digits (group 1) then '.' or ')' (group 2).
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])')
# ATX heading opener: 1-6 '#' followed by spaces/tabs or end of line.
reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)')
# Opening code fence: 3+ backticks or 3+ tildes, with no further occurrence
# of the same fence character later on the line.
reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}(?!.*~)')
# Closing code fence: 3+ backticks or tildes followed only by spaces.
reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)')
# Setext heading underline: a run of '=' or of '-' followed only by spaces.
reSetextHeadingLine = re.compile(r'^(?:=+|-+) *$')
# Any of the three line-ending conventions.
reLineEnding = re.compile(r'\r\n|\n|\r')


def is_blank(s):
    """Return True when ``s`` has no character matched by ``reNonSpace``.

    An empty string counts as blank.
    """
    return not reNonSpace.search(s)


def is_space_or_tab(s):
    """Return True if ``s`` is exactly one space or one tab character."""
    return s in (' ', '\t')


def peek(ln, pos):
    """Return the character of ``ln`` at index ``pos``.

    Returns None instead of raising when ``pos`` is at or past the end
    of the line.
    """
    return ln[pos] if pos < len(ln) else None


def ends_with_blank_line(block):
    """Report whether ``block`` ends with a blank line.

    Lists and list items are followed down through their last child, so
    a trailing blank line inside a nested sublist is detected as well.
    Any other block type is judged on its own ``last_line_blank`` flag.
    """
    node = block
    while node:
        if node.last_line_blank:
            return True
        if node.t not in ('list', 'item'):
            # Only lists and items are descended into.
            return False
        node = node.last_child

    return False


def parse_list_marker(parser, container):
    """Try to read a list marker at the parser's next non-space position.

    On success the parser is advanced past the marker and the spaces that
    belong to it, and a dict describing the marker is returned (keys:
    type, tight, bullet_char, start, delimiter, padding, marker_offset).
    Returns None, leaving the parser untouched, when there is no marker.
    """
    line = parser.current_line
    rest = line[parser.next_nonspace:]
    data = {
        'type': None,
        'tight': True,  # lists are tight by default
        'bullet_char': None,
        'start': None,
        'delimiter': None,
        'padding': None,
        'marker_offset': parser.indent,
    }
    bullet = re.match(reBulletListMarker, rest)
    ordered = re.match(reOrderedListMarker, rest)
    if bullet:
        marker = bullet
        data['type'] = 'bullet'
        data['bullet_char'] = marker.group()[0]
    elif ordered and (container.t != 'paragraph' or
                      ordered.group(1) == '1'):
        # an ordered list may only interrupt a paragraph if it starts at 1
        marker = ordered
        data['type'] = 'ordered'
        data['start'] = int(marker.group(1))
        data['delimiter'] = marker.group(2)
    else:
        return None

    marker_len = len(marker.group())
    after_marker = parser.next_nonspace + marker_len

    # the marker must be followed by whitespace or the end of the line
    nextc = peek(line, after_marker)
    if nextc is not None and nextc != '\t' and nextc != ' ':
        return None

    # a list item that interrupts a paragraph must not start blank
    if container.t == 'paragraph' and \
       re.search(reNonSpace, line[after_marker:]) is None:
        return None

    # It is a marker: consume it, then measure the spaces that follow.
    parser.advance_next_nonspace()  # to start of marker
    parser.advance_offset(marker_len, True)  # to end of marker
    spaces_start_col = parser.column
    spaces_start_offset = parser.offset
    while True:
        parser.advance_offset(1, True)
        nextc = peek(parser.current_line, parser.offset)
        if not (parser.column - spaces_start_col < 5 and
                is_space_or_tab(nextc)):
            break
    blank_item = peek(parser.current_line, parser.offset) is None
    spaces_after_marker = parser.column - spaces_start_col
    if 1 <= spaces_after_marker < 5 and not blank_item:
        data['padding'] = marker_len + spaces_after_marker
    else:
        # Too many spaces (the content is indented code), none at all,
        # or an empty item: rewind and count a single space of padding.
        data['padding'] = marker_len + 1
        parser.column = spaces_start_col
        parser.offset = spaces_start_offset
        if is_space_or_tab(peek(parser.current_line, parser.offset)):
            parser.advance_offset(1, True)

    return data


def lists_match(list_data, item_data):
    """
    Decide whether an item belongs to an existing list: both must agree
    on type, delimiter and bullet character.  Missing keys compare as
    None.  Used when agglomerating list items into lists.
    """
    return all(
        list_data.get(key) == item_data.get(key)
        for key in ('type', 'delimiter', 'bullet_char'))


class Block(object):
    """Base for the per-block-type behaviour classes.

    Every hook here is a no-op returning None; subclasses override them.
    """

    # Whether the block takes raw text lines; subclasses set True/False.
    accepts_lines = None

    @staticmethod
    def continue_(parser=None, container=None):
        """Check whether the current line continues ``container``."""
        return None

    @staticmethod
    def finalize(parser=None, block=None):
        """Post-process ``block`` when it is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Tell whether a child of node type ``t`` is allowed."""
        return None


class Document(Block):
    """Behaviour of the root document node."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """The document matches every line (returns 0)."""
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when the document is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Anything except a bare list item may sit in the document."""
        return not t == 'item'


class List(Block):
    """Behaviour of list nodes."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """A list matches every line (returns 0)."""
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Mark the list as loose if blank lines separate its contents.

        ``block.list_data['tight']`` is set to False when a non-final
        item ends with a blank line, or when a child of any item ends
        with a blank line and is followed by a sibling or by another
        item.  Otherwise the flag is left as it was.
        """
        item = block.first_child
        while item:
            following_item = item.nxt
            # a non-final list item ending with a blank line
            if ends_with_blank_line(item) and following_item:
                block.list_data['tight'] = False
                return
            # blank lines between the children of this item
            child = item.first_child
            while child:
                if ends_with_blank_line(child) and \
                   (following_item or child.nxt):
                    block.list_data['tight'] = False
                    return
                child = child.nxt
            item = following_item

    @staticmethod
    def can_contain(t):
        """A list holds list items and nothing else."""
        return t == 'item'


class BlockQuote(Block):
    """Behaviour of block quote nodes."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Continue the quote if the line starts with a '>' marker.

        On a match the marker and one optional following space or tab
        are consumed and 0 is returned; otherwise 1 is returned.
        """
        line = parser.current_line
        if parser.indented or peek(line, parser.next_nonspace) != '>':
            return 1
        parser.advance_next_nonspace()
        parser.advance_offset(1, False)
        if is_space_or_tab(peek(line, parser.offset)):
            parser.advance_offset(1, True)
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a block quote is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Anything except a bare list item may sit in a block quote."""
        return not t == 'item'


class Item(Block):
    """Behaviour of list item nodes."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Continue the item on blank or sufficiently indented lines.

        Returns 0 when the line belongs to the item and 1 when it does
        not.  A blank line directly after an empty item ends the item.
        """
        if parser.blank:
            if container.first_child is None:
                # Blank line after empty list item
                return 1
            parser.advance_next_nonspace()
            return 0

        # Column at which the item's content starts.
        content_indent = (container.list_data['marker_offset'] +
                          container.list_data['padding'])
        if parser.indent < content_indent:
            return 1
        parser.advance_offset(content_indent, True)
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a list item is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Anything except a directly nested item may sit in an item."""
        return not t == 'item'


class Heading(Block):
    """Behaviour of heading nodes."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Never continue: a heading cannot contain more than one line."""
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a heading is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Headings hold no child blocks."""
        return False


class ThematicBreak(Block):
    """Behaviour of thematic break nodes."""

    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Never continue: a thematic break is always a single line."""
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a thematic break is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Thematic breaks hold no child blocks."""
        return False


class CodeBlock(Block):
    """Behaviour of fenced and indented code block nodes."""

    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """Decide whether the current line belongs to the code block.

        Returns 0 if the line is part of the block, 1 if it is not, and
        2 if the line was a closing fence (the block has been finalized
        and the rest of the line needs no processing).
        """
        ln = parser.current_line

        if not container.is_fenced:
            # indented code
            if parser.indent >= CODE_INDENT:
                parser.advance_offset(CODE_INDENT, True)
            elif parser.blank:
                parser.advance_next_nonspace()
            else:
                return 1
            return 0

        # fenced code: look for a closing fence first
        start = parser.next_nonspace
        closing = None
        if parser.indent <= 3 and start < len(ln) and \
           ln[start] == container.fence_char:
            closing = re.match(reClosingCodeFence, ln[start:])
        if closing and len(closing.group()) >= container.fence_length:
            # closing fence - we're at end of line, so we can return
            parser.finalize(container, parser.line_number)
            return 2

        # ordinary content line: drop up to fence_offset leading spaces
        remaining = container.fence_offset
        while remaining > 0 and is_space_or_tab(peek(ln, parser.offset)):
            parser.advance_offset(1, True)
            remaining -= 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Move the collected text into ``info`` / ``literal``.

        For a fenced block the first collected line is the info string
        and the rest is the literal.  For an indented block, trailing
        blank lines are collapsed into a single newline.
        """
        if block.is_fenced:
            # first line becomes info string
            first_line, rest = block.string_content.split('\n', 1)
            block.info = unescape_string(first_line.strip())
            block.literal = rest
        else:
            # indented
            block.literal = re.sub(r'(\n *)+$', '\n', block.string_content)

        block.string_content = None

    @staticmethod
    def can_contain(t):
        """Code blocks hold no child blocks."""
        return False


class HtmlBlock(Block):
    """Behaviour of raw HTML block nodes."""

    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """End HTML blocks of type 6 or 7 at a blank line.

        Returns 1 in that case and 0 for every other line.
        """
        if parser.blank and container.html_block_type in (6, 7):
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Store the collected text as ``literal``, minus trailing blank
        lines."""
        block.literal = re.sub(r'(\n *)+$', '', block.string_content)
        # allow GC
        block.string_content = None

    @staticmethod
    def can_contain(t):
        """HTML blocks hold no child blocks."""
        return False


class Paragraph(Block):
    """Behaviour of paragraph nodes."""

    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """A paragraph continues until a blank line (1 = stop, 0 = go on)."""
        if parser.blank:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Strip leading link reference definitions from the paragraph.

        Definitions are handed to the inline parser, which records them
        in ``parser.refmap``.  If definitions were found and nothing but
        whitespace is left, the paragraph is removed from the tree.
        """
        stripped_any = False

        # try parsing the beginning as link reference definitions:
        while peek(block.string_content, 0) == '[':
            consumed = parser.inline_parser.parseReference(
                block.string_content, parser.refmap)
            if not consumed:
                break
            block.string_content = block.string_content[consumed:]
            stripped_any = True

        if stripped_any and is_blank(block.string_content):
            block.unlink()

    @staticmethod
    def can_contain(t):
        """Paragraphs hold no child blocks."""
        return False


class BlockStarts(object):
    """Recognizers for the start of each kind of block.

    Each method inspects the parser's current line and, on a match,
    consumes the opening syntax and adds the new block to the tree.

    Return values:
    0 = no match
    1 = matched container, keep going
    2 = matched leaf, no more block starts
    """
    # Method names in the order in which they are tried.
    METHODS = [
        'block_quote',
        'atx_heading',
        'fenced_code_block',
        'html_block',
        'setext_heading',
        'thematic_break',
        'list_item',
        'indented_code_block',
    ]

    @staticmethod
    def block_quote(parser, container=None):
        """Open a block quote at a '>' marker."""
        line = parser.current_line
        if parser.indented or peek(line, parser.next_nonspace) != '>':
            return 0

        parser.advance_next_nonspace()
        parser.advance_offset(1, False)
        # optional following space
        if is_space_or_tab(peek(line, parser.offset)):
            parser.advance_offset(1, True)
        parser.close_unmatched_blocks()
        parser.add_child('block_quote', parser.next_nonspace)
        return 1

    @staticmethod
    def atx_heading(parser, container=None):
        """Open a heading introduced by 1-6 '#' characters."""
        if parser.indented:
            return 0
        marker = re.match(reATXHeadingMarker,
                          parser.current_line[parser.next_nonspace:])
        if not marker:
            return 0

        marker_text = marker.group()
        parser.advance_next_nonspace()
        parser.advance_offset(len(marker_text), False)
        parser.close_unmatched_blocks()
        heading = parser.add_child('heading', parser.next_nonspace)
        # number of #s
        heading.level = len(marker_text.strip())
        # remove trailing ###s:
        text = parser.current_line[parser.offset:]
        text = re.sub(r'^ *#+ *$', '', text)
        heading.string_content = re.sub(r' +#+ *$', '', text)
        parser.advance_offset(
            len(parser.current_line) - parser.offset, False)
        return 2

    @staticmethod
    def fenced_code_block(parser, container=None):
        """Open a fenced code block at a run of backticks or tildes."""
        if parser.indented:
            return 0
        fence = re.match(reCodeFence,
                         parser.current_line[parser.next_nonspace:])
        if not fence:
            return 0

        fence_text = fence.group()
        parser.close_unmatched_blocks()
        code = parser.add_child('code_block', parser.next_nonspace)
        code.is_fenced = True
        code.fence_length = len(fence_text)
        code.fence_char = fence_text[0]
        # indentation of the fence, read before the parser advances
        code.fence_offset = parser.indent
        parser.advance_next_nonspace()
        parser.advance_offset(len(fence_text), False)
        return 2

    @staticmethod
    def html_block(parser, container=None):
        """Open an HTML block if the line meets one of the start
        conditions in ``reHtmlBlockOpen`` (types 1-7)."""
        if parser.indented or \
           peek(parser.current_line, parser.next_nonspace) != '<':
            return 0

        s = parser.current_line[parser.next_nonspace:]
        for block_type in range(1, 8):
            if not re.search(reHtmlBlockOpen[block_type], s):
                continue
            if block_type >= 7 and container.t == 'paragraph':
                # type 7 is not allowed to start inside a paragraph
                continue
            parser.close_unmatched_blocks()
            # We don't adjust parser.offset;
            # spaces are part of the HTML block:
            html = parser.add_child('html_block', parser.offset)
            html.html_block_type = block_type
            return 2
        return 0

    @staticmethod
    def setext_heading(parser, container=None):
        """Turn the current paragraph into a heading when the line is a
        setext underline ('=' gives level 1, '-' gives level 2)."""
        if parser.indented or container.t != 'paragraph':
            return 0
        underline = re.match(reSetextHeadingLine,
                             parser.current_line[parser.next_nonspace:])
        if not underline:
            return 0

        parser.close_unmatched_blocks()
        heading = Node('heading', container.sourcepos)
        if underline.group()[0] == '=':
            heading.level = 1
        else:
            heading.level = 2
        heading.string_content = container.string_content
        # swap the paragraph for the heading in the tree
        container.insert_after(heading)
        container.unlink()
        parser.tip = heading
        parser.advance_offset(
            len(parser.current_line) - parser.offset, False)
        return 2

    @staticmethod
    def thematic_break(parser, container=None):
        """Add a thematic break for a line matching ``reThematicBreak``."""
        if parser.indented:
            return 0
        if not re.search(reThematicBreak,
                         parser.current_line[parser.next_nonspace:]):
            return 0

        parser.close_unmatched_blocks()
        parser.add_child('thematic_break', parser.next_nonspace)
        parser.advance_offset(
            len(parser.current_line) - parser.offset, False)
        return 2

    @staticmethod
    def list_item(parser, container=None):
        """Open a list item, plus a new enclosing list when needed."""
        if parser.indented and container.t != 'list':
            return 0
        data = parse_list_marker(parser, container)
        if not data:
            return 0

        parser.close_unmatched_blocks()

        # add the list if needed
        if parser.tip.t != 'list' or \
           not lists_match(container.list_data, data):
            new_list = parser.add_child('list', parser.next_nonspace)
            new_list.list_data = data

        # add the list item
        new_item = parser.add_child('item', parser.next_nonspace)
        new_item.list_data = data
        return 1

    @staticmethod
    def indented_code_block(parser, container=None):
        """Open an indented code block, unless the tip is a paragraph
        or the line is blank."""
        if not parser.indented or \
           parser.tip.t == 'paragraph' or \
           parser.blank:
            return 0

        # indented code
        parser.advance_offset(CODE_INDENT, True)
        parser.close_unmatched_blocks()
        parser.add_child('code_block', parser.offset)
        return 2


class Parser(object):
    """Line-oriented block parser that builds a document tree of Nodes.

    Feed text to :meth:`parse`; it splits the input into lines, runs
    :meth:`incorporate_line` on each, finalizes all open blocks and then
    parses inline content of paragraphs and headings.
    """

    def __init__(self, options=None):
        """Create a parser.

        ``options`` is passed to the InlineParser and kept on the
        instance.  It defaults to a fresh empty dict per parser, so that
        parsers never share one mutable default object.
        """
        if options is None:
            options = {}
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        # innermost open block
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        # offset is an index into current_line; column is the matching
        # visual column with tabs expanded to 4-column tab stops
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    @staticmethod
    def _block_class(node_type):
        """Return the behaviour class in CommonMark.blocks for a node
        type, e.g. 'block_quote' -> BlockQuote."""
        return getattr(import_module('CommonMark.blocks'),
                       to_camel_case(node_type))

    def add_line(self):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        ``offset`` is the 0-based position used for the new block's
        starting column.  Returns the new node, which becomes the tip."""
        block_class = self._block_class(self.tip.t)
        while not block_class.can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)
            block_class = self._block_class(self.tip.t)

        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
            self.all_closed = True

    def find_next_nonspace(self):
        """Locate the next character that is not a space or tab.

        Starting from the current offset/column, sets ``next_nonspace``,
        ``next_nonspace_column``, ``indent``, ``indented`` and ``blank``.
        The parser position itself is not moved."""
        current_line = self.current_line
        i = self.offset
        cols = self.column

        try:
            c = current_line[i]
        except IndexError:
            c = ''
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                # a tab advances to the next multiple-of-4 column
                cols += (4 - (cols % 4))
            else:
                break

            try:
                c = current_line[i]
            except IndexError:
                c = ''

        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        """Jump to the position found by the last find_next_nonspace()."""
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

    def advance_offset(self, count, columns):
        """Move forward through the current line.

        When ``columns`` is true, ``count`` is measured in columns: a tab
        may be consumed only in part, in which case ``offset`` stays on
        the tab and ``partially_consumed_tab`` is set.  Otherwise
        ``count`` is measured in characters.  Stops at the end of the
        line."""
        current_line = self.current_line
        try:
            c = current_line[self.offset]
        except IndexError:
            c = None
        while count > 0 and c is not None:
            if c == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1
            try:
                c = current_line[self.offset]
            except IndexError:
                c = None

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True

        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1

        # replace NUL characters for security
        if '\0' in ln:
            ln = ln.replace('\0', '\uFFFD')

        self.current_line = ln

        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching block.
        # Set all_matched to false if not all containers match.
        last_child = container.last_child
        while last_child and last_child.is_open:
            container = last_child

            self.find_next_nonspace()
            rv = self._block_class(container.t).continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError('returned illegal value, must be 0, 1, or 2')

            if not all_matched:
                # back up to last matching block
                container = container.parent
                break

            last_child = container.last_child

        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container

        block_class = self._block_class(container.t)
        matched_leaf = container.t != 'paragraph' and block_class.accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()

            # this is a little performance optimization:
            if not self.indented and \
               not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break

            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1

            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break

        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
           self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True

            t = container.t

            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists.  We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))

            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent

            if self._block_class(t).accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                   container.html_block_type >= 1 and \
                   container.html_block_type <= 5 and \
                   re.search(
                       reHtmlBlockClose[container.html_block_type],
                       self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()

        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block."""
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]
        self._block_class(block.t).finalize(self, block)

        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            # only paragraphs and headings carry inline content; they are
            # handled when the walker leaves the node
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for i in range(length):
            self.incorporate_line(lines[i])
        # close every block that is still open, innermost first
        while (self.tip):
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc




© 2015 - 2024 Weber Informatics LLC | Privacy Policy