org.w3c.tidy.ParserImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jtidy Show documentation
Show all versions of jtidy Show documentation
JTidy is a Java port of HTML Tidy, an HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be
used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the
document being processed, which effectively lets you use JTidy as a DOM parser for real-world HTML.
/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett
* Andy Quick (translation to Java)
* Gary L Peskin (Java development)
* Sami Lempinen (release management)
* Fabrizio Giustina
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy;
/**
* HTML Parser implementation.
* @author Dave Raggett [email protected]
* @author Andy Quick [email protected] (translation to Java)
* @author Fabrizio Giustina
* @version $Revision$ ($Author$)
*/
public final class ParserImpl
{
/**
 * Shared parser for the html element; an instance of {@link ParseHTML}.
 */
public static final Parser HTML = new ParseHTML();
/**
 * Shared parser for the head element; an instance of {@link ParseHead}.
 */
public static final Parser HEAD = new ParseHead();
/**
 * Shared parser for the title element; an instance of {@link ParseTitle}.
 */
public static final Parser TITLE = new ParseTitle();
/**
 * Shared parser for script content; an instance of {@link ParseScript}.
 */
public static final Parser SCRIPT = new ParseScript();
/**
 * Shared parser for the body element; an instance of {@code ParseBody}.
 */
public static final Parser BODY = new ParseBody();
/**
 * Shared parser for the frameset element; an instance of {@code ParseFrameSet}.
 */
public static final Parser FRAMESET = new ParseFrameSet();
/**
 * Shared parser for inline elements; an instance of {@code ParseInline}.
 */
public static final Parser INLINE = new ParseInline();
/**
 * Shared parser for lists; an instance of {@code ParseList}.
 */
public static final Parser LIST = new ParseList();
/**
 * Shared parser for definition lists; an instance of {@code ParseDefList}.
 */
public static final Parser DEFLIST = new ParseDefList();
/**
 * Shared parser for the pre element; an instance of {@code ParsePre}.
 */
public static final Parser PRE = new ParsePre();
/**
 * Shared parser for block elements; an instance of {@code ParseBlock}.
 */
public static final Parser BLOCK = new ParseBlock();
/**
 * Shared parser for the table element; an instance of {@code ParseTableTag}.
 */
public static final Parser TABLETAG = new ParseTableTag();
/**
 * Shared parser for the colgroup element; an instance of {@code ParseColGroup}.
 */
public static final Parser COLGROUP = new ParseColGroup();
/**
 * Shared parser for table row groups; an instance of {@code ParseRowGroup}.
 */
public static final Parser ROWGROUP = new ParseRowGroup();
/**
 * Shared parser for table rows; an instance of {@code ParseRow}.
 */
public static final Parser ROW = new ParseRow();
/**
 * Shared parser for the noframes element; an instance of {@code ParseNoFrames}.
 */
public static final Parser NOFRAMES = new ParseNoFrames();
/**
 * Shared parser for the select element; an instance of {@code ParseSelect}.
 */
public static final Parser SELECT = new ParseSelect();
/**
 * Shared parser for text content; an instance of {@code ParseText}.
 */
public static final Parser TEXT = new ParseText();
/**
 * Shared parser for empty elements; an instance of {@code ParseEmpty}.
 */
public static final Parser EMPTY = new ParseEmpty();
/**
 * Shared parser for the optgroup element; an instance of {@code ParseOptGroup}.
 */
public static final Parser OPTGROUP = new ParseOptGroup();
/**
 * Private constructor: ParserImpl is used through its static members and should not be instantiated.
 */
private ParserImpl()
{
// intentionally empty
}
/**
 * Parses the content of an element by handing it to the parser registered for its tag.
 * <p>
 * Before dispatching, the lexer's whitespace state is adjusted: <code>waswhite</code> is cleared for tags whose
 * content model contains <code>CM_EMPTY</code>; for any other tag that is not <code>CM_INLINE</code>,
 * <code>insertspace</code> is cleared. Tags without a parser are left alone. A <code>START_END_TAG</code> node is
 * passed to <code>Node.trimEmptyElement</code> instead of being parsed.
 * </p>
 * @param lexer the Lexer supplying tokens and holding the whitespace flags
 * @param node the element node whose content is to be parsed; its <code>tag</code> must not be null
 * @param mode the lexer mode forwarded to the tag's parser
 */
protected static void parseTag(Lexer lexer, Node node, short mode)
{
    // Whitespace bookkeeping, originally introduced as a fix by GLP 2000-12-21
    // (relevant for tags such as base, link, meta, isindex, hr, area).
    final boolean emptyElement = (node.tag.model & Dict.CM_EMPTY) != 0;
    if (emptyElement)
    {
        lexer.waswhite = false;
    }
    else
    {
        final boolean inlineElement = (node.tag.model & Dict.CM_INLINE) != 0;
        if (!inlineElement)
        {
            lexer.insertspace = false;
        }
    }

    // nothing more to do for tags that have no parser
    if (node.tag.getParser() != null)
    {
        if (node.type == Node.START_END_TAG)
        {
            // <tag/> has no content to parse
            Node.trimEmptyElement(lexer, node);
        }
        else
        {
            node.tag.getParser().parse(lexer, node, mode);
        }
    }
}
/**
 * Relocates a node that was found outside the head into the document's head element. Normally called during
 * parsing.
 * <p>
 * The node is first detached from its current position. Start tags (and start-end tags) are reported with
 * <code>TAG_NOT_ALLOWED_IN</code>, appended to the first head child of the enclosing html element and, if their
 * tag has a parser, parsed in <code>IGNORE_WHITESPACE</code> mode. Any other kind of node is reported with
 * <code>DISCARDING_UNEXPECTED</code> and stays detached.
 * </p>
 * @param lexer the Lexer to use
 * @param element starting point for the upward search for the html element; it must have an html ancestor (or be
 * the html element itself)
 * @param node the node to move
 */
protected static void moveToHead(Lexer lexer, Node element, Node node)
{
    node.removeNode(); // make sure that node is isolated
    TagTable tt = lexer.configuration.tt;

    boolean startsElement = node.type == Node.START_TAG || node.type == Node.START_END_TAG;
    if (!startsElement)
    {
        lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
        return;
    }

    lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);

    // climb up to the html element
    Node ancestor = element;
    while (ancestor.tag != tt.tagHtml)
    {
        ancestor = ancestor.parent;
    }

    // locate the head among the children of html and append the node to it
    Node head = ancestor.content;
    while (head != null && head.tag != tt.tagHead)
    {
        head = head.next;
    }
    if (head != null)
    {
        head.insertNodeAtEnd(node);
    }

    if (node.tag.getParser() != null)
    {
        parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
    }
}
/**
 * Detaches the given node from its current position and appends it as the last child of the body element found
 * from the lexer's root node.
 * @param lexer Lexer providing the document root and the tag table
 * @param node Node to move
 */
static void moveNodeToBody(Lexer lexer, Node node)
{
    node.removeNode();
    lexer.root.findBody(lexer.configuration.tt).insertNodeAtEnd(node);
}
/**
 * Parser for the HTML element: makes sure the html node gets a head (real or inferred), then either a body or, for
 * frameset documents, a frameset whose stray content is collected in a noframes element.
 */
public static class ParseHTML implements Parser
{
/**
 * Parses the children of the html element.
 * @param lexer the Lexer supplying tokens (always read with <code>IGNORE_WHITESPACE</code> here)
 * @param html the html node that receives head, body or frameset
 * @param mode the mode forwarded to the head, body, frameset and noframes parsers
 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
 */
public void parse(Lexer lexer, Node html, short mode)
{
Node node, head;
// first frameset element seen, or null while the document is not a frameset document
Node frameset = null;
// noframes element that collects body-like content of a frameset document
Node noframes = null;
lexer.configuration.xmlTags = false;
lexer.seenEndBody = false;
TagTable tt = lexer.configuration.tt;
// Phase 1: look for the head element, inferring one if something else shows up first
while (true)
{
node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
if (node == null)
{
// end of input before any head: infer one
node = lexer.inferredTag("head");
break;
}
if (node.tag == tt.tagHead)
{
break;
}
if (node.tag == html.tag && node.type == Node.END_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
// deal with comments etc.
if (Node.insertMisc(html, node))
{
continue;
}
// anything else belongs after the (missing) head: push it back and infer the head
lexer.ungetToken();
node = lexer.inferredTag("head");
break;
}
head = node;
html.insertNodeAtEnd(head);
HEAD.parse(lexer, head, mode);
// Phase 2: look for body or frameset
while (true)
{
node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
if (node == null)
{
if (frameset == null)
{
// implied body
node = lexer.inferredTag("body");
html.insertNodeAtEnd(node);
BODY.parse(lexer, node, mode);
}
return;
}
// robustly handle html tags
if (node.tag == html.tag)
{
// html tokens are never inserted; in a non-frameset document anything but a start tag is reported,
// otherwise an end tag only records that </html> has been seen
if (node.type != Node.START_TAG && frameset == null)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
}
else if (node.type == Node.END_TAG)
{
lexer.seenEndHtml = true;
}
continue;
}
// deal with comments etc.
if (Node.insertMisc(html, node))
{
continue;
}
// if frameset document coerce <body> to <noframes>
if (node.tag == tt.tagBody)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
{
// push the body token back so that it is read again while parsing the noframes element
lexer.ungetToken();
if (noframes == null)
{
noframes = lexer.inferredTag("noframes");
frameset.insertNodeAtEnd(noframes);
lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
}
parseTag(lexer, noframes, mode);
continue;
}
// NOTE(review): presumably rules out the frameset document types - confirm against Lexer.constrainVersion
lexer.constrainVersion(~Dict.VERS_FRAMESET);
break; // to parse body
}
// flag an error if we see more than one frameset
if (node.tag == tt.tagFrameset)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
{
lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
}
else
{
frameset = node;
}
// a duplicate frameset is still inserted and parsed; "frameset" keeps pointing at the first one
html.insertNodeAtEnd(node);
parseTag(lexer, node, mode);
// see if it includes a noframes element so that we can merge subsequent noframes elements
for (node = frameset.content; node != null; node = node.next)
{
if (node.tag == tt.tagNoframes)
{
noframes = node;
}
}
continue;
}
// if not a frameset document coerce <noframes> to <body>
if (node.tag == tt.tagNoframes)
{
if (node.type != Node.START_TAG)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset == null)
{
// no frameset seen: drop the noframes token and continue with an inferred body
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
node = lexer.inferredTag("body");
break;
}
if (noframes == null)
{
noframes = node;
frameset.insertNodeAtEnd(noframes);
}
// later noframes elements are merged into the first one by parsing into it again
parseTag(lexer, noframes, mode);
continue;
}
if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
{
// elements whose content model includes CM_HEAD are moved into the head
if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
{
moveToHead(lexer, html, node);
continue;
}
// #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
if (frameset != null && node.tag == tt.tagFrame)
{
lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
}
// any other token is content: push it back for the noframes or body parser
lexer.ungetToken();
// insert other content into noframes element
if (frameset != null)
{
if (noframes == null)
{
noframes = lexer.inferredTag("noframes");
frameset.insertNodeAtEnd(noframes);
}
else
{
lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
}
lexer.constrainVersion(Dict.VERS_FRAMESET);
parseTag(lexer, noframes, mode);
continue;
}
// not a frameset document: the content starts an implied body
node = lexer.inferredTag("body");
lexer.constrainVersion(~Dict.VERS_FRAMESET);
break;
}
// node must be body
html.insertNodeAtEnd(node);
parseTag(lexer, node, mode);
lexer.seenEndHtml = true;
}
}
/**
 * Parser for the HEAD element. Collects the children of head until the head is closed explicitly, or implicitly by
 * text or by an element whose content model does not include <code>CM_HEAD</code>.
 */
public static class ParseHead implements Parser
{

    /**
     * Parses the content of the head element.
     * <p>
     * Parsing stops at the head end tag (the head is then marked closed), at end of input, at a text node, or at
     * an element that is not allowed in head; in the latter two cases the token is pushed back for the caller.
     * More than one title or base element is reported with <code>TOO_MANY_ELEMENTS</code>, but the extra element
     * is still inserted. Children are always parsed in <code>IGNORE_WHITESPACE</code> mode.
     * </p>
     * @param lexer the Lexer supplying tokens
     * @param head the head node receiving the children
     * @param mode not used by this parser
     * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
     */
    public void parse(Lexer lexer, Node head, short mode)
    {
        TagTable tt = lexer.configuration.tt;
        int titleCount = 0;
        int baseCount = 0;

        for (Node token = lexer.getToken(Lexer.IGNORE_WHITESPACE); token != null; token = lexer
            .getToken(Lexer.IGNORE_WHITESPACE))
        {
            // explicit </head>
            if (token.type == Node.END_TAG && token.tag == head.tag)
            {
                head.closed = true;
                break;
            }

            // text implicitly ends the head
            if (token.type == Node.TEXT_NODE)
            {
                lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
                lexer.ungetToken();
                break;
            }

            // comments and similar nodes
            if (Node.insertMisc(head, token))
            {
                continue;
            }

            if (token.type == Node.DOCTYPE_TAG)
            {
                Node.insertDocType(lexer, head, token);
                continue;
            }

            // unknown tags are dropped
            if (token.tag == null)
            {
                lexer.report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // an element that does not belong in head implicitly ends the head
            if (!TidyUtils.toBoolean(token.tag.model & Dict.CM_HEAD))
            {
                // #545067 Implicit closing of head broken - warn only for XHTML input
                if (lexer.isvoyager)
                {
                    lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
                }
                lexer.ungetToken();
                break;
            }

            // what is left and does not open an element (e.g. a stray end tag) is discarded
            if (token.type != Node.START_TAG && token.type != Node.START_END_TAG)
            {
                lexer.report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            if (token.tag == tt.tagTitle)
            {
                if (++titleCount > 1)
                {
                    lexer.report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            }
            else if (token.tag == tt.tagBase)
            {
                if (++baseCount > 1)
                {
                    lexer.report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            }
            else if (token.tag == tt.tagNoscript)
            {
                // reported, but still inserted below
                lexer.report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
            }

            head.insertNodeAtEnd(token);
            parseTag(lexer, token, Lexer.IGNORE_WHITESPACE);
        }
    }
}
/**
 * Parser for the TITLE element. Collects the text of the title and closes it at its end tag, at a second title
 * start tag (treated as a mistyped end tag) or at the first token that cannot be part of a title.
 */
public static class ParseTitle implements Parser
{

    /**
     * Parses the content of the title element.
     * <p>
     * Text nodes are appended to the title (leading space of the first one is trimmed, empty ones are dropped),
     * unknown tags are discarded with a warning, and any other token is pushed back and reported with
     * <code>MISSING_ENDTAG_BEFORE</code>. Reaching the end of input is reported with
     * <code>MISSING_ENDTAG_FOR</code>. Tokens are read in <code>MIXED_CONTENT</code> mode.
     * </p>
     * @param lexer the Lexer supplying tokens
     * @param title the title node receiving the content
     * @param mode not used by this parser
     * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
     */
    public void parse(Lexer lexer, Node title, short mode)
    {
        Node node;
        while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null)
        {
            // [438658] : Missing / in title endtag makes 2 titles
            if (node.tag == title.tag && node.type == Node.START_TAG)
            {
                lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG);
                // Coerce the token and let it be handled as the end tag right below. Merely changing the type
                // and fetching the next token would drop the coerced token and leave the title open.
                node.type = Node.END_TAG;
            }

            if (node.tag == title.tag && node.type == Node.END_TAG)
            {
                title.closed = true;
                Node.trimSpaces(lexer, title);
                return;
            }

            if (node.type == Node.TEXT_NODE)
            {
                // only called for 1st child
                if (title.content == null)
                {
                    Node.trimInitialSpace(lexer, title, node);
                }

                // skip text nodes that are empty (possibly as a result of the trimming above)
                if (node.start >= node.end)
                {
                    continue;
                }

                title.insertNodeAtEnd(node);
                continue;
            }

            // deal with comments etc.
            if (Node.insertMisc(title, node))
            {
                continue;
            }

            // discard unknown tags
            if (node.tag == null)
            {
                lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            // pushback unexpected tokens
            lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE);
            lexer.ungetToken();
            Node.trimSpaces(lexer, title);
            return;
        }

        // end of input reached without an end tag; node is null at this point
        lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR);
    }
}
/**
* Parser for SCRIPT.
*/
public static class ParseScript implements Parser
{
/**
* @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
*/
public void parse(Lexer lexer, Node script, short mode) {
Node node = lexer.getCDATA(script);
if (node != null) {
script.insertNodeAtEnd(node);
} else {
/* handle e.g. a document like "