lib-python.2.7.test.test_htmlparser.py Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jython-standalone Show documentation

Jython is an implementation of the high-level, dynamic, object-oriented language Python written in 100% Pure Java, and seamlessly integrated with the Java platform. It thus allows you to run Python on any Java platform.

The newest version!

"""Tests for HTMLParser.py."""

import HTMLParser
import pprint
import unittest
from test import test_support


class EventCollector(HTMLParser.HTMLParser):

    def __init__(self):
        self.events = []
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        L = []
        prevtype = None
        for event in self.events:
            type = event[0]
            if type == prevtype == "data":
                L[-1] = ("data", L[-1][1] + event[1])
            else:
                L.append(event)
            prevtype = type
        self.events = L
        return L

    # structure markup

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, data):
        self.append(("decl", data))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))


class EventCollectorExtra(EventCollector):

    def handle_starttag(self, tag, attrs):
        EventCollector.handle_starttag(self, tag, attrs)
        self.append(("starttag_text", self.get_starttag_text()))


class TestCaseBase(unittest.TestCase):

    def _run_check(self, source, expected_events, collector=EventCollector):
        parser = collector()
        for s in source:
            parser.feed(s)
        parser.close()
        events = parser.get_events()
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def _run_check_extra(self, source, events):
        self._run_check(source, events, EventCollectorExtra)

    def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse)


class HTMLParserTestCase(TestCaseBase):

    def test_processing_instruction_only(self):
        self._run_check("", [
            ("pi", "processing instruction"),
            ])
        self._run_check("", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        self._run_check("""

&entity; 

sample
text
“


""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-><", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_illegal_declarations(self):
        self._run_check('',
                        [('comment', 'spacer type="block" height="25"')])

    def test_starttag_end_boundary(self):
        self._run_check("""""", [("starttag", "a", [("b", "<")])])
        self._run_check("""""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check(["'>"], output)
        self._run_check([""], output)
        self._run_check([""], output)

        output = [("comment", "abc")]
        self._run_check(["", ""], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check([""], output)
        self._run_check(["", ""], output)

    def test_starttag_junk_chars(self):
        self._run_check("", [])
        self._run_check("", [('comment', '$')])
        self._run_check("", [('data', '", [('endtag', 'a'", [('data', "" % dtd,
                            [('decl', 'DOCTYPE ' + dtd)])

    def test_slashes_in_starttag(self):
        self._run_check('', [('startendtag', 'a', [('foo', 'var')])])
        html = ('')
        expected = [(
            'startendtag', 'img',
            [('width', '902'), ('height', '250px'),
             ('src', '/sites/default/files/images/homepage/foo.jpg'),
             ('*what', None), ('am', None), ('i', None),
             ('doing', None), ('here*', None)]
        )]
        self._run_check(html, expected)
        html = (''
                '')
        expected = [
            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
        ]
        self._run_check(html, expected)
        #see issue #14538
        html = (''
                '')
        expected = [
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
        ]
        self._run_check(html, expected)

    def test_declaration_junk_chars(self):
        self._run_check("", [('decl', 'DOCTYPE foo $ ')])

    def test_startendtag(self):
        self._run_check("", [
            ("startendtag", "p", []),
            ])
        self._run_check("
", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_invalid_end_tags(self):
        # A collection of broken end tags. 
 is used as separator.
        # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
        # and #13993
        html = ('



'
                '


')
        expected = [('starttag', 'br', []),
                    # < is part of the name, / is discarded, p is an attribute
                    ('endtag', 'label<'),
                    ('starttag', 'br', []),
                    # text and attributes are discarded
                    ('endtag', 'div'),
                    ('starttag', 'br', []),
                    # comment because the first char after  is ignored
                    ('starttag', 'br', [])]
        self._run_check(html, expected)

    def test_broken_invalid_end_tag(self):
        # This is technically wrong (the "> shouldn't be included in the 'data')
        # but is probably not worth fixing it (in addition to all the cases of
        # the previous test, it would require a full attribute parsing).
        # see #13993
        html = 'This confuses the parser'
        expected = [('starttag', 'b', []),
                    ('data', 'This'),
                    ('endtag', 'b'),
                    ('data', '"> confuses the parser')]
        self._run_check(html, expected)

    def test_get_starttag_text(self):
        s = """"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        contents = [
            ' ¬-an-entity-ref;',
            "",
            '  ',
            'foo = "";',
            'foo = "";',
            'foo = <\n/script> ',
            '',
            ('\n//<\\/s\'+\'cript>\');\n//]]>'),
            '\n\n',
            'foo = "";',
            u'',
            # these two should be invalid according to the HTML 5 spec,
            # section 8.1.2.2
            #'foo = ',
            #'foo = ',
        ]
        elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
        for content in contents:
            for element in elements:
                element_lower = element.lower()
                s = u'<{element}>{content}'.format(element=element,
                                                               content=content)
                self._run_check(s, [("starttag", element_lower, []),
                                    ("data", content),
                                    ("endtag", element_lower)])

    def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """ ¬-an-entity-ref;
                   
 & 
                  ''   !"""
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'