# lib-python.2.5.test.test_sgmllib.py, as shipped in the jython-standalone
# artifact (Maven / Gradle / Ivy).
#
# Jython is an implementation of the high-level, dynamic, object-oriented
# language Python written in 100% Pure Java, and seamlessly integrated with
# the Java platform. It thus allows you to run Python on any Java platform.
import htmlentitydefs
import pprint
import re
import sgmllib
import unittest
from test import test_support
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records every handler callback as an event tuple.

    Each event is a tuple whose first item names the kind of callback
    ("starttag", "endtag", "data", "comment", ...) and whose remaining
    items are the arguments the parser passed to that callback.
    """

    def __init__(self):
        # The event sink is created before the base initializer runs,
        # keeping the original statement order.
        self.events = []
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces self.events.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        self.events = merged
        # Keep the shortcut bound to the list that is now self.events;
        # otherwise later callbacks would append to the discarded list.
        self.append = merged.append
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, decl):
        self.append(("decl", decl))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))
class CDATAEventCollector(EventCollector):
    """Event collector with a dedicated start handler for <cdata> tags."""

    def start_cdata(self, attrs):
        # Record the start tag, then ask the parser to treat what follows
        # as literal text (see sgmllib.SGMLParser.setliteral).
        self.append(("starttag", "cdata", attrs))
        self.setliteral()
class HTMLEntityCollector(EventCollector):
    """Event collector that also records calls to the convert_* helpers."""

    # Matches "&name" or "&#digits" / "&#xhex", each with an optional
    # trailing ";".  The "&#" prefix of the second alternative is required:
    # without it the pattern would match bare digits and "x..." runs
    # anywhere in an attribute value.
    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
                                   '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        self.append(("charref", "convert", name))
        # Hexadecimal references ("x...") are recorded but not converted;
        # the method then falls through and returns None.
        if name[0] != "x":
            return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        self.append(("codepoint", "convert", codepoint))
        # NOTE(review): the base-class result is not returned, so this
        # override always yields None.  Kept as-is: test_convert_overrides
        # lists no "data" event after the codepoint conversion.
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        self.append(("entityref", "convert", name))
        return EventCollector.convert_entityref(self, name)

    # These record that they were called, then pass the call along
    # to the default implementation so that its actions can be
    # recorded.

    def handle_charref(self, data):
        self.append(("charref", data))
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        self.append(("entityref", data))
        sgmllib.SGMLParser.handle_entityref(self, data)
class SGMLParserTestCase(unittest.TestCase):
collector = EventCollector
def get_events(self, source):
    """Feed *source* to a fresh collector and return its events.

    *source* is iterated and every item is fed separately, so a list of
    strings exercises chunked input while a plain string is fed one
    character at a time.  Any exception raised by the parser propagates
    unchanged.
    """
    parser = self.collector()
    for chunk in source:
        parser.feed(chunk)
    parser.close()
    return parser.get_events()
def check_events(self, source, expected_events):
    """Parse *source* and fail the test unless the events match.

    The failure message contains both the expected and the received
    event lists, pretty-printed.  Exceptions raised while parsing
    propagate unchanged.
    """
    events = self.get_events(source)
    if events != expected_events:
        self.fail("received events did not match expected events\n"
                  "Expected:\n" + pprint.pformat(expected_events) +
                  "\nReceived:\n" + pprint.pformat(events))
def check_parse_error(self, source):
    """Fail the test unless parsing *source* raises SGMLParseError.

    *source* is fed to a plain EventCollector in a single call.  When no
    error is raised, the failure message shows the events that were
    received instead.
    """
    parser = EventCollector()
    try:
        parser.feed(source)
        parser.close()
    except sgmllib.SGMLParseError:
        pass
    else:
        self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                  % (source, pprint.pformat(parser.get_events())))
def test_doctype_decl_internal(self):
inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
%paramEntity;
]"""
self.check_events(["" % inside], [
("decl", inside),
])
def test_doctype_decl_external(self):
inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
self.check_events("" % inside, [
("decl", inside),
])
def test_underscore_in_attrname(self):
# SF bug #436621
"""Make sure attribute names with underscores are accepted"""
self.check_events("", [
("starttag", "a", [("has_under", "has_under"),
("_under", "_under")]),
])
def test_underscore_in_tagname(self):
# SF bug #436621
"""Make sure tag names with underscores are accepted"""
self.check_events(" ", [
("starttag", "has_under", []),
("endtag", "has_under"),
])
def test_quotes_in_unquoted_attrs(self):
# SF bug #436621
"""Be sure quotes in unquoted attributes are made part of the value"""
self.check_events("", [
("starttag", "a", [("href", "foo'bar\"baz")]),
])
def test_xhtml_empty_tag(self):
"""Handling of XHTML-style empty start tags"""
self.check_events("
text", [
("starttag", "br", []),
("data", "text"),
("starttag", "i", []),
("endtag", "i"),
])
def test_processing_instruction_only(self):
self.check_events("", [
("pi", "processing instruction"),
])
def test_bad_nesting(self):
self.check_events("", [
("starttag", "a", []),
("starttag", "b", []),
("endtag", "a"),
("endtag", "b"),
])
def test_bare_ampersands(self):
self.check_events("this text & contains & ampersands &", [
("data", "this text & contains & ampersands &"),
])
def test_bare_pointy_brackets(self):
self.check_events("this < text > contains < bare>pointy< brackets", [
("data", "this < text > contains < bare>pointy< brackets"),
])
def test_attr_syntax(self):
output = [
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
]
self.check_events("""""", output)
self.check_events("""""", output)
self.check_events("""""", output)
self.check_events("""""", output)
def test_attr_values(self):
self.check_events("""""",
[("starttag", "a", [("b", "xxx\n\txxx"),
("c", "yyy\t\nyyy"),
("d", "\txyz\n")])
])
self.check_events("""""", [
("starttag", "a", [("b", ""), ("c", "")]),
])
# URL construction stuff from RFC 1808:
safe = "$-_.+"
extra = "!*'(),"
reserved = ";/?:@&="
url = "http://example.com:8080/path/to/file?%s%s%s" % (
safe, extra, reserved)
self.check_events("""""" % url, [
("starttag", "e", [("a", url)]),
])
# Regression test for SF patch #669683.
self.check_events("", [
("starttag", "e", [("a", "rgb(1,2,3)")]),
])
def test_attr_values_entities(self):
"""Substitution of entities and charrefs in attribute values"""
# SF bug #1452246
self.check_events("""""",
[("starttag", "a", [("b", "<"),
("c", "<>"),
("d", "<->"),
("e", "< "),
("f", "&xxx;"),
("g", " !"),
("h", "Ǵ"),
("i", "x?a=b&c=d;"),
("j", "*"),
("k", "*"),
])])
def test_convert_overrides(self):
    # This checks that the character and entity reference
    # conversion helpers are called at the documented times.  No
    # attempt is made to really change what the parser accepts.
    #
    self.collector = HTMLEntityCollector
    self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                       '&foobar;&#42;'), [
        ('entityref', 'convert', 'ldquo'),
        ('charref', 'convert', 'x201d'),
        # Neither reference is converted, so the value keeps them as-is.
        ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
        ('data', 'foo'),
        ('endtag', 'a'),
        ('entityref', 'foobar'),
        ('entityref', 'convert', 'foobar'),
        ('charref', '42'),
        ('charref', 'convert', '42'),
        ('codepoint', 'convert', 42),
        ])
def test_attr_funky_names(self):
self.check_events("""""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
])
def test_attr_value_ip6_url(self):
# http://www.python.org/sf/853506
self.check_events((""
""), [
("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
])
def test_illegal_declarations(self):
s = 'abcdef'
self.check_events(s, [
("data", "abc"),
("unknown decl", 'spacer type="block" height="25"'),
("data", "def"),
])
def test_weird_starttags(self):
self.check_events("", [
("starttag", "a", []),
("starttag", "a", []),
])
self.check_events("", [
("endtag", "a"),
("starttag", "a", []),
])
def test_declaration_junk_chars(self):
self.check_parse_error("")
def test_get_starttag_text(self):
s = """"""
self.check_events(s, [
("starttag", "foobar", [("one", "1"), ("two", "2")]),
])
def test_cdata_content(self):
    # Inside <cdata> (see CDATAEventCollector.start_cdata) comments,
    # references and tags are expected to come back as plain data;
    # inside any other element they are parsed as usual.
    s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
         "<notcdata> <!-- comment --> </notcdata>")
    self.collector = CDATAEventCollector
    self.check_events(s, [
        ("starttag", "cdata", []),
        ("data", " <!-- not a comment --> &not-an-entity-ref; "),
        ("endtag", "cdata"),
        ("starttag", "notcdata", []),
        ("data", " "),
        ("comment", " comment "),
        ("data", " "),
        ("endtag", "notcdata"),
        ])
    s = """<cdata> <notcdata> <!-- comment --> </notcdata> </cdata>"""
    self.check_events(s, [
        ("starttag", "cdata", []),
        ("data", " <notcdata> <!-- comment --> </notcdata> "),
        ("endtag", "cdata"),
        ])
def test_illegal_declarations(self):
s = 'abcdef'
self.check_events(s, [
("data", "abc"),
("unknown decl", 'spacer type="block" height="25"'),
("data", "def"),
])
def test_enumerated_attr_type(self):
s = "]>"
self.check_events(s, [
('decl', 'DOCTYPE doc []'),
])
def test_read_chunks(self):
    # SF bug #1541697, this caused sgml parser to hang
    # Just verify this code doesn't cause a hang.
    CHUNK = 1024  # increasing this to 8212 makes the problem go away

    f = open(test_support.findfile('sgml_input.html'))
    # try/finally rather than "with" so the input file is always closed
    # without requiring a newer Python than the rest of this file.
    try:
        fp = sgmllib.SGMLParser()
        while 1:
            data = f.read(CHUNK)
            fp.feed(data)
            # A short (or empty) read means the end of the file.
            if len(data) != CHUNK:
                break
    finally:
        f.close()
# XXX These tests have been disabled by prefixing their names with
# an underscore. The first two exercise outstanding bugs in the
# sgmllib module, and the third exhibits questionable behavior
# that needs to be carefully considered before changing it.
def _test_starttag_end_boundary(self):
self.check_events("", [("starttag", "a", [("b", "<")])])
self.check_events("", [("starttag", "a", [("b", ">")])])
def _test_buffer_artefacts(self):
output = [("starttag", "a", [("b", "<")])]
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
output = [("starttag", "a", [("b", ">")])]
self.check_events([""], output)
self.check_events(["'>"], output)
self.check_events(["'>"], output)
self.check_events(["'>"], output)
self.check_events([""], output)
self.check_events([""], output)
output = [("comment", "abc")]
self.check_events(["", ""], output)
self.check_events(["<", "!--abc-->"], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events([""], output)
self.check_events(["", ""], output)
def _test_starttag_junk_chars(self):
self.check_parse_error("<")
self.check_parse_error("<>")
self.check_parse_error("$>")
self.check_parse_error("")
self.check_parse_error("")
self.check_parse_error("")
self.check_parse_error("'")
self.check_parse_error("